@@ -1,5 +1,5 @@
 /*
- *  kernel/sched.c
+ *  kernel/sched/core.c
  *
  *  Kernel scheduler and related syscalls
  *
@@ -56,7 +56,6 @@
 #include <linux/percpu.h>
 #include <linux/proc_fs.h>
 #include <linux/seq_file.h>
-#include <linux/stop_machine.h>
 #include <linux/sysctl.h>
 #include <linux/syscalls.h>
 #include <linux/times.h>
@@ -75,129 +74,17 @@
 #include <asm/tlb.h>
 #include <asm/irq_regs.h>
-#include <asm/mutex.h>
 #ifdef CONFIG_PARAVIRT
 #include <asm/paravirt.h>
 #endif
 
-#include "sched_cpupri.h"
-#include "workqueue_sched.h"
-#include "sched_autogroup.h"
+#include "sched.h"
+#include "../workqueue_sched.h"
 
 #define CREATE_TRACE_POINTS
 #include <trace/events/sched.h>
 
-/*
- * Convert user-nice values [ -20 ... 0 ... 19 ]
- * to static priority [ MAX_RT_PRIO..MAX_PRIO-1 ],
- * and back.
- */
-#define NICE_TO_PRIO(nice)	(MAX_RT_PRIO + (nice) + 20)
-#define PRIO_TO_NICE(prio)	((prio) - MAX_RT_PRIO - 20)
-#define TASK_NICE(p)		PRIO_TO_NICE((p)->static_prio)
-
-/*
- * 'User priority' is the nice value converted to something we
- * can work with better when scaling various scheduler parameters,
- * it's a [ 0 ... 39 ] range.
- */
-#define USER_PRIO(p)		((p)-MAX_RT_PRIO)
-#define TASK_USER_PRIO(p)	USER_PRIO((p)->static_prio)
-#define MAX_USER_PRIO		(USER_PRIO(MAX_PRIO))
-
-/*
- * Helpers for converting nanosecond timing to jiffy resolution
- */
-#define NS_TO_JIFFIES(TIME)	((unsigned long)(TIME) / (NSEC_PER_SEC / HZ))
-
-#define NICE_0_LOAD		SCHED_LOAD_SCALE
-#define NICE_0_SHIFT		SCHED_LOAD_SHIFT
-
-/*
- * These are the 'tuning knobs' of the scheduler:
- *
- * default timeslice is 100 msecs (used only for SCHED_RR tasks).
- * Timeslices get refilled after they expire.
- */
-#define DEF_TIMESLICE		(100 * HZ / 1000)
-
-/*
- * single value that denotes runtime == period, ie unlimited time.
- */
-#define RUNTIME_INF	((u64)~0ULL)
-
-static inline int rt_policy(int policy)
-{
-	if (policy == SCHED_FIFO || policy == SCHED_RR)
-		return 1;
-	return 0;
-}
-
-static inline int task_has_rt_policy(struct task_struct *p)
-{
-	return rt_policy(p->policy);
-}
-
-/*
- * This is the priority-queue data structure of the RT scheduling class:
- */
-struct rt_prio_array {
-	DECLARE_BITMAP(bitmap, MAX_RT_PRIO+1); /* include 1 bit for delimiter */
-	struct list_head queue[MAX_RT_PRIO];
-};
-
-struct rt_bandwidth {
-	/* nests inside the rq lock: */
-	raw_spinlock_t		rt_runtime_lock;
-	ktime_t			rt_period;
-	u64			rt_runtime;
-	struct hrtimer		rt_period_timer;
-};
-
-static struct rt_bandwidth def_rt_bandwidth;
-
-static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun);
-
-static enum hrtimer_restart sched_rt_period_timer(struct hrtimer *timer)
-{
-	struct rt_bandwidth *rt_b =
-		container_of(timer, struct rt_bandwidth, rt_period_timer);
-	ktime_t now;
-	int overrun;
-	int idle = 0;
-
-	for (;;) {
-		now = hrtimer_cb_get_time(timer);
-		overrun = hrtimer_forward(timer, now, rt_b->rt_period);
-
-		if (!overrun)
-			break;
-
-		idle = do_sched_rt_period_timer(rt_b, overrun);
-	}
-
-	return idle ? HRTIMER_NORESTART : HRTIMER_RESTART;
-}
-
-static
-void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime)
-{
-	rt_b->rt_period = ns_to_ktime(period);
-	rt_b->rt_runtime = runtime;
-
-	raw_spin_lock_init(&rt_b->rt_runtime_lock);
-
-	hrtimer_init(&rt_b->rt_period_timer,
-			CLOCK_MONOTONIC, HRTIMER_MODE_REL);
-	rt_b->rt_period_timer.function = sched_rt_period_timer;
-}
-
-static inline int rt_bandwidth_enabled(void)
-{
-	return sysctl_sched_rt_runtime >= 0;
-}
-
-static void start_bandwidth_timer(struct hrtimer *period_timer, ktime_t period)
+void start_bandwidth_timer(struct hrtimer *period_timer, ktime_t period)
 {
 	unsigned long delta;
 	ktime_t soft, hard, now;
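The NICE_TO_PRIO()/PRIO_TO_NICE()/USER_PRIO() macros deleted above are a plain offset mapping that this patch relocates into the new kernel/sched/sched.h header. The standalone sketch below reproduces that arithmetic in userspace so the ranges are easy to see; the MAX_RT_PRIO == 100 / MAX_PRIO == 140 values are the ones this kernel defines in include/linux/sched.h, and the program itself is only an illustration, not kernel code.

#include <stdio.h>

#define MAX_RT_PRIO		100			/* as in include/linux/sched.h */
#define MAX_PRIO		(MAX_RT_PRIO + 40)
#define NICE_TO_PRIO(nice)	(MAX_RT_PRIO + (nice) + 20)
#define PRIO_TO_NICE(prio)	((prio) - MAX_RT_PRIO - 20)
#define USER_PRIO(p)		((p) - MAX_RT_PRIO)

int main(void)
{
	int nice;

	/* nice -20..19 maps onto static_prio 100..139 and user prio 0..39 */
	for (nice = -20; nice <= 19; nice++) {
		int prio = NICE_TO_PRIO(nice);

		printf("nice %3d -> static_prio %3d -> user prio %2d -> nice %3d\n",
		       nice, prio, USER_PRIO(prio), PRIO_TO_NICE(prio));
	}
	return 0;
}

Running it simply prints the round trip for every nice level, which is all the macros are meant to guarantee.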
@@ -217,580 +104,12 @@ static void start_bandwidth_timer(struct hrtimer *period_timer, ktime_t period)
 	}
 }
 
-static void start_rt_bandwidth(struct rt_bandwidth *rt_b)
-{
-	if (!rt_bandwidth_enabled() || rt_b->rt_runtime == RUNTIME_INF)
-		return;
-
-	if (hrtimer_active(&rt_b->rt_period_timer))
-		return;
-
-	raw_spin_lock(&rt_b->rt_runtime_lock);
-	start_bandwidth_timer(&rt_b->rt_period_timer, rt_b->rt_period);
-	raw_spin_unlock(&rt_b->rt_runtime_lock);
-}
-
-#ifdef CONFIG_RT_GROUP_SCHED
-static void destroy_rt_bandwidth(struct rt_bandwidth *rt_b)
-{
-	hrtimer_cancel(&rt_b->rt_period_timer);
-}
-#endif
-
-/*
- * sched_domains_mutex serializes calls to init_sched_domains,
- * detach_destroy_domains and partition_sched_domains.
- */
-static DEFINE_MUTEX(sched_domains_mutex);
-
-#ifdef CONFIG_CGROUP_SCHED
-
-#include <linux/cgroup.h>
-
-struct cfs_rq;
-
-static LIST_HEAD(task_groups);
-
-struct cfs_bandwidth {
-#ifdef CONFIG_CFS_BANDWIDTH
-	raw_spinlock_t lock;
-	ktime_t period;
-	u64 quota, runtime;
-	s64 hierarchal_quota;
-	u64 runtime_expires;
-
-	int idle, timer_active;
-	struct hrtimer period_timer, slack_timer;
-	struct list_head throttled_cfs_rq;
-
-	/* statistics */
-	int nr_periods, nr_throttled;
-	u64 throttled_time;
-#endif
-};
-
-/* task group related information */
-struct task_group {
-	struct cgroup_subsys_state css;
-
-#ifdef CONFIG_FAIR_GROUP_SCHED
-	/* schedulable entities of this group on each cpu */
-	struct sched_entity **se;
-	/* runqueue "owned" by this group on each cpu */
-	struct cfs_rq **cfs_rq;
-	unsigned long shares;
-
-	atomic_t load_weight;
-#endif
-
-#ifdef CONFIG_RT_GROUP_SCHED
-	struct sched_rt_entity **rt_se;
-	struct rt_rq **rt_rq;
-
-	struct rt_bandwidth rt_bandwidth;
-#endif
-
-	struct rcu_head rcu;
-	struct list_head list;
-
-	struct task_group *parent;
-	struct list_head siblings;
-	struct list_head children;
-
-#ifdef CONFIG_SCHED_AUTOGROUP
-	struct autogroup *autogroup;
-#endif
-
-	struct cfs_bandwidth cfs_bandwidth;
-};
-
-/* task_group_lock serializes the addition/removal of task groups */
-static DEFINE_SPINLOCK(task_group_lock);
-
-#ifdef CONFIG_FAIR_GROUP_SCHED
-
-# define ROOT_TASK_GROUP_LOAD	NICE_0_LOAD
-
-/*
- * A weight of 0 or 1 can cause arithmetics problems.
- * A weight of a cfs_rq is the sum of weights of which entities
- * are queued on this cfs_rq, so a weight of a entity should not be
- * too large, so as the shares value of a task group.
- * (The default weight is 1024 - so there's no practical
- *  limitation from this.)
- */
-#define MIN_SHARES	(1UL <<  1)
-#define MAX_SHARES	(1UL << 18)
-
-static int root_task_group_load = ROOT_TASK_GROUP_LOAD;
-#endif
-
-/* Default task group.
- *	Every task in system belong to this group at bootup.
- */
-struct task_group root_task_group;
-
-#endif	/* CONFIG_CGROUP_SCHED */
-
-/* CFS-related fields in a runqueue */
-struct cfs_rq {
-	struct load_weight load;
-	unsigned long nr_running, h_nr_running;
-
-	u64 exec_clock;
-	u64 min_vruntime;
-#ifndef CONFIG_64BIT
-	u64 min_vruntime_copy;
-#endif
-
-	struct rb_root tasks_timeline;
-	struct rb_node *rb_leftmost;
-
-	struct list_head tasks;
-	struct list_head *balance_iterator;
-
-	/*
-	 * 'curr' points to currently running entity on this cfs_rq.
-	 * It is set to NULL otherwise (i.e when none are currently running).
-	 */
-	struct sched_entity *curr, *next, *last, *skip;
-
-#ifdef CONFIG_SCHED_DEBUG
-	unsigned int nr_spread_over;
-#endif
-
-#ifdef CONFIG_FAIR_GROUP_SCHED
-	struct rq *rq;	/* cpu runqueue to which this cfs_rq is attached */
-
-	/*
-	 * leaf cfs_rqs are those that hold tasks (lowest schedulable entity in
-	 * a hierarchy). Non-leaf lrqs hold other higher schedulable entities
-	 * (like users, containers etc.)
-	 *
-	 * leaf_cfs_rq_list ties together list of leaf cfs_rq's in a cpu. This
-	 * list is used during load balance.
-	 */
-	int on_list;
-	struct list_head leaf_cfs_rq_list;
-	struct task_group *tg;	/* group that "owns" this runqueue */
-
-#ifdef CONFIG_SMP
-	/*
-	 * the part of load.weight contributed by tasks
-	 */
-	unsigned long task_weight;
-
-	/*
-	 *   h_load = weight * f(tg)
-	 *
-	 * Where f(tg) is the recursive weight fraction assigned to
-	 * this group.
-	 */
-	unsigned long h_load;
-
-	/*
-	 * Maintaining per-cpu shares distribution for group scheduling
-	 *
-	 * load_stamp is the last time we updated the load average
-	 * load_last is the last time we updated the load average and saw load
-	 * load_unacc_exec_time is currently unaccounted execution time
-	 */
-	u64 load_avg;
-	u64 load_period;
-	u64 load_stamp, load_last, load_unacc_exec_time;
-
-	unsigned long load_contribution;
-#endif
-#ifdef CONFIG_CFS_BANDWIDTH
-	int runtime_enabled;
-	u64 runtime_expires;
-	s64 runtime_remaining;
-
-	u64 throttled_timestamp;
-	int throttled, throttle_count;
-	struct list_head throttled_list;
-#endif
-#endif
-};
-
-#ifdef CONFIG_FAIR_GROUP_SCHED
-#ifdef CONFIG_CFS_BANDWIDTH
-static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg)
-{
-	return &tg->cfs_bandwidth;
-}
-
-static inline u64 default_cfs_period(void);
-static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun);
-static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b);
-
-static enum hrtimer_restart sched_cfs_slack_timer(struct hrtimer *timer)
-{
-	struct cfs_bandwidth *cfs_b =
-		container_of(timer, struct cfs_bandwidth, slack_timer);
-	do_sched_cfs_slack_timer(cfs_b);
-
-	return HRTIMER_NORESTART;
-}
-
-static enum hrtimer_restart sched_cfs_period_timer(struct hrtimer *timer)
-{
-	struct cfs_bandwidth *cfs_b =
-		container_of(timer, struct cfs_bandwidth, period_timer);
-	ktime_t now;
-	int overrun;
-	int idle = 0;
-
-	for (;;) {
-		now = hrtimer_cb_get_time(timer);
-		overrun = hrtimer_forward(timer, now, cfs_b->period);
-
-		if (!overrun)
-			break;
-
-		idle = do_sched_cfs_period_timer(cfs_b, overrun);
-	}
-
-	return idle ? HRTIMER_NORESTART : HRTIMER_RESTART;
-}
-
-static void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
-{
-	raw_spin_lock_init(&cfs_b->lock);
-	cfs_b->runtime = 0;
-	cfs_b->quota = RUNTIME_INF;
-	cfs_b->period = ns_to_ktime(default_cfs_period());
-
-	INIT_LIST_HEAD(&cfs_b->throttled_cfs_rq);
-	hrtimer_init(&cfs_b->period_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
-	cfs_b->period_timer.function = sched_cfs_period_timer;
-	hrtimer_init(&cfs_b->slack_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
-	cfs_b->slack_timer.function = sched_cfs_slack_timer;
-}
-
-static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq)
-{
-	cfs_rq->runtime_enabled = 0;
-	INIT_LIST_HEAD(&cfs_rq->throttled_list);
-}
-
-/* requires cfs_b->lock, may release to reprogram timer */
-static void __start_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
-{
-	/*
-	 * The timer may be active because we're trying to set a new bandwidth
-	 * period or because we're racing with the tear-down path
-	 * (timer_active==0 becomes visible before the hrtimer call-back
-	 * terminates).  In either case we ensure that it's re-programmed
-	 */
-	while (unlikely(hrtimer_active(&cfs_b->period_timer))) {
-		raw_spin_unlock(&cfs_b->lock);
-		/* ensure cfs_b->lock is available while we wait */
-		hrtimer_cancel(&cfs_b->period_timer);
-
-		raw_spin_lock(&cfs_b->lock);
-		/* if someone else restarted the timer then we're done */
-		if (cfs_b->timer_active)
-			return;
-	}
-
-	cfs_b->timer_active = 1;
-	start_bandwidth_timer(&cfs_b->period_timer, cfs_b->period);
-}
-
-static void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
-{
-	hrtimer_cancel(&cfs_b->period_timer);
-	hrtimer_cancel(&cfs_b->slack_timer);
-}
-#else
-static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
-static void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b) {}
-static void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b) {}
-
-static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg)
-{
-	return NULL;
-}
-#endif /* CONFIG_CFS_BANDWIDTH */
-#endif /* CONFIG_FAIR_GROUP_SCHED */
-
-/* Real-Time classes' related field in a runqueue: */
-struct rt_rq {
-	struct rt_prio_array active;
-	unsigned long rt_nr_running;
-#if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED
-	struct {
-		int curr; /* highest queued rt task prio */
-#ifdef CONFIG_SMP
-		int next; /* next highest */
-#endif
-	} highest_prio;
-#endif
-#ifdef CONFIG_SMP
-	unsigned long rt_nr_migratory;
-	unsigned long rt_nr_total;
-	int overloaded;
-	struct plist_head pushable_tasks;
-#endif
-	int rt_throttled;
-	u64 rt_time;
-	u64 rt_runtime;
-	/* Nests inside the rq lock: */
-	raw_spinlock_t rt_runtime_lock;
-
-#ifdef CONFIG_RT_GROUP_SCHED
-	unsigned long rt_nr_boosted;
-
-	struct rq *rq;
-	struct list_head leaf_rt_rq_list;
-	struct task_group *tg;
-#endif
-};
-
-#ifdef CONFIG_SMP
-
-/*
- * We add the notion of a root-domain which will be used to define per-domain
- * variables. Each exclusive cpuset essentially defines an island domain by
- * fully partitioning the member cpus from any other cpuset. Whenever a new
- * exclusive cpuset is created, we also create and attach a new root-domain
- * object.
- *
- */
-struct root_domain {
-	atomic_t refcount;
-	atomic_t rto_count;
-	struct rcu_head rcu;
-	cpumask_var_t span;
-	cpumask_var_t online;
-
-	/*
-	 * The "RT overload" flag: it gets set if a CPU has more than
-	 * one runnable RT task.
-	 */
-	cpumask_var_t rto_mask;
-	struct cpupri cpupri;
-};
-
-/*
- * By default the system creates a single root-domain with all cpus as
- * members (mimicking the global state we have today).
- */
-static struct root_domain def_root_domain;
-
-#endif /* CONFIG_SMP */
-
-/*
- * This is the main, per-CPU runqueue data structure.
- *
- * Locking rule: those places that want to lock multiple runqueues
- * (such as the load balancing or the thread migration code), lock
- * acquire operations must be ordered by ascending &runqueue.
- */
-struct rq {
-	/* runqueue lock: */
-	raw_spinlock_t lock;
-
-	/*
-	 * nr_running and cpu_load should be in the same cacheline because
-	 * remote CPUs use both these fields when doing load calculation.
-	 */
-	unsigned long nr_running;
-	#define CPU_LOAD_IDX_MAX 5
-	unsigned long cpu_load[CPU_LOAD_IDX_MAX];
-	unsigned long last_load_update_tick;
-#ifdef CONFIG_NO_HZ
-	u64 nohz_stamp;
-	unsigned char nohz_balance_kick;
-#endif
-	int skip_clock_update;
-
-	/* capture load from *all* tasks on this cpu: */
-	struct load_weight load;
-	unsigned long nr_load_updates;
-	u64 nr_switches;
-
-	struct cfs_rq cfs;
-	struct rt_rq rt;
-
-#ifdef CONFIG_FAIR_GROUP_SCHED
-	/* list of leaf cfs_rq on this cpu: */
-	struct list_head leaf_cfs_rq_list;
-#endif
-#ifdef CONFIG_RT_GROUP_SCHED
-	struct list_head leaf_rt_rq_list;
-#endif
-
-	/*
-	 * This is part of a global counter where only the total sum
-	 * over all CPUs matters. A task can increase this counter on
-	 * one CPU and if it got migrated afterwards it may decrease
-	 * it on another CPU. Always updated under the runqueue lock:
-	 */
-	unsigned long nr_uninterruptible;
-
-	struct task_struct *curr, *idle, *stop;
-	unsigned long next_balance;
-	struct mm_struct *prev_mm;
-
-	u64 clock;
-	u64 clock_task;
-
-	atomic_t nr_iowait;
-
-#ifdef CONFIG_SMP
-	struct root_domain *rd;
-	struct sched_domain *sd;
-
-	unsigned long cpu_power;
-
-	unsigned char idle_balance;
-	/* For active balancing */
-	int post_schedule;
-	int active_balance;
-	int push_cpu;
-	struct cpu_stop_work active_balance_work;
-	/* cpu of this runqueue: */
-	int cpu;
-	int online;
-
-	u64 rt_avg;
-	u64 age_stamp;
-	u64 idle_stamp;
-	u64 avg_idle;
-#endif
-
-#ifdef CONFIG_IRQ_TIME_ACCOUNTING
-	u64 prev_irq_time;
-#endif
-#ifdef CONFIG_PARAVIRT
-	u64 prev_steal_time;
-#endif
-#ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING
-	u64 prev_steal_time_rq;
-#endif
-
-	/* calc_load related fields */
-	unsigned long calc_load_update;
-	long calc_load_active;
-
-#ifdef CONFIG_SCHED_HRTICK
-#ifdef CONFIG_SMP
-	int hrtick_csd_pending;
-	struct call_single_data hrtick_csd;
-#endif
-	struct hrtimer hrtick_timer;
-#endif
-
-#ifdef CONFIG_SCHEDSTATS
-	/* latency stats */
-	struct sched_info rq_sched_info;
-	unsigned long long rq_cpu_time;
-	/* could above be rq->cfs_rq.exec_clock + rq->rt_rq.rt_runtime ? */
-
-	/* sys_sched_yield() stats */
-	unsigned int yld_count;
-
-	/* schedule() stats */
-	unsigned int sched_switch;
-	unsigned int sched_count;
-	unsigned int sched_goidle;
-
-	/* try_to_wake_up() stats */
-	unsigned int ttwu_count;
-	unsigned int ttwu_local;
-#endif
-
-#ifdef CONFIG_SMP
-	struct llist_head wake_list;
-#endif
-};
-
-static DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
-
-
-static void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags);
-
-static inline int cpu_of(struct rq *rq)
-{
-#ifdef CONFIG_SMP
-	return rq->cpu;
-#else
-	return 0;
-#endif
-}
-
-#define rcu_dereference_check_sched_domain(p) \
-	rcu_dereference_check((p), \
-			      lockdep_is_held(&sched_domains_mutex))
-
-/*
- * The domain tree (rq->sd) is protected by RCU's quiescent state transition.
- * See detach_destroy_domains: synchronize_sched for details.
- *
- * The domain tree of any CPU may only be accessed from within
- * preempt-disabled sections.
- */
-#define for_each_domain(cpu, __sd) \
-	for (__sd = rcu_dereference_check_sched_domain(cpu_rq(cpu)->sd); __sd; __sd = __sd->parent)
-
-#define cpu_rq(cpu)		(&per_cpu(runqueues, (cpu)))
-#define this_rq()		(&__get_cpu_var(runqueues))
-#define task_rq(p)		cpu_rq(task_cpu(p))
-#define cpu_curr(cpu)		(cpu_rq(cpu)->curr)
-#define raw_rq()		(&__raw_get_cpu_var(runqueues))
-
-#ifdef CONFIG_CGROUP_SCHED
-
-/*
- * Return the group to which this tasks belongs.
- *
- * We use task_subsys_state_check() and extend the RCU verification with
- * pi->lock and rq->lock because cpu_cgroup_attach() holds those locks for each
- * task it moves into the cgroup. Therefore by holding either of those locks,
- * we pin the task to the current cgroup.
- */
-static inline struct task_group *task_group(struct task_struct *p)
-{
-	struct task_group *tg;
-	struct cgroup_subsys_state *css;
-
-	css = task_subsys_state_check(p, cpu_cgroup_subsys_id,
-			lockdep_is_held(&p->pi_lock) ||
-			lockdep_is_held(&task_rq(p)->lock));
-	tg = container_of(css, struct task_group, css);
-
-	return autogroup_task_group(p, tg);
-}
-
-/* Change a task's cfs_rq and parent entity if it moves across CPUs/groups */
-static inline void set_task_rq(struct task_struct *p, unsigned int cpu)
-{
-#ifdef CONFIG_FAIR_GROUP_SCHED
-	p->se.cfs_rq = task_group(p)->cfs_rq[cpu];
-	p->se.parent = task_group(p)->se[cpu];
-#endif
-
-#ifdef CONFIG_RT_GROUP_SCHED
-	p->rt.rt_rq  = task_group(p)->rt_rq[cpu];
-	p->rt.parent = task_group(p)->rt_se[cpu];
-#endif
-}
-
-#else /* CONFIG_CGROUP_SCHED */
-
-static inline void set_task_rq(struct task_struct *p, unsigned int cpu) { }
-static inline struct task_group *task_group(struct task_struct *p)
-{
-	return NULL;
-}
-
-#endif /* CONFIG_CGROUP_SCHED */
+DEFINE_MUTEX(sched_domains_mutex);
+DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
 
 static void update_rq_clock_task(struct rq *rq, s64 delta);
 
-static void update_rq_clock(struct rq *rq)
+void update_rq_clock(struct rq *rq)
 {
 	s64 delta;
 
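Both period timers being moved out of this file (sched_rt_period_timer() and sched_cfs_period_timer() above) follow the same hrtimer catch-up idiom: keep forwarding the timer by whole periods until its expiry lands in the future, and account each batch of missed periods as an overrun. The userspace model below only imitates that control flow with plain integers; forward_timer() and all the constants are invented for the illustration and are not the kernel hrtimer API.

#include <stdio.h>

/* next expiry time of our pretend timer, in nanoseconds */
static long long expires = 1000000000LL;

/*
 * Stand-in for hrtimer_forward(): push the expiry past "now" in whole
 * periods and report how many periods were skipped (the overrun count).
 */
static int forward_timer(long long now, long long period)
{
	int overrun = 0;

	while (expires <= now) {
		expires += period;
		overrun++;
	}
	return overrun;
}

int main(void)
{
	const long long period = 1000000000LL;	/* 1 s period */
	const long long now = 3500000000LL;	/* the callback fired late, at 3.5 s */
	int overrun;

	/* same shape as the kernel loop: account overruns, stop when none remain */
	while ((overrun = forward_timer(now, period)) != 0)
		printf("accounting %d overrun period(s), next expiry at %lld ns\n",
		       overrun, expires);

	return 0;
}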
@@ -802,45 +121,15 @@ static void update_rq_clock(struct rq *rq)
 	update_rq_clock_task(rq, delta);
 }
 
-/*
- * Tunables that become constants when CONFIG_SCHED_DEBUG is off:
- */
-#ifdef CONFIG_SCHED_DEBUG
-# define const_debug __read_mostly
-#else
-# define const_debug static const
-#endif
-
-/**
- * runqueue_is_locked - Returns true if the current cpu runqueue is locked
- * @cpu: the processor in question.
- *
- * This interface allows printk to be called with the runqueue lock
- * held and know whether or not it is OK to wake up the klogd.
- */
-int runqueue_is_locked(int cpu)
-{
-	return raw_spin_is_locked(&cpu_rq(cpu)->lock);
-}
-
 /*
  * Debugging: various feature bits
  */
 
-#define SCHED_FEAT(name, enabled)	\
-	__SCHED_FEAT_##name ,
-
-enum {
-#include "sched_features.h"
-};
-
-#undef SCHED_FEAT
-
 #define SCHED_FEAT(name, enabled)	\
 	(1UL << __SCHED_FEAT_##name) * enabled |
 
 const_debug unsigned int sysctl_sched_features =
-#include "sched_features.h"
+#include "features.h"
 	0;
 
 #undef SCHED_FEAT
@@ -850,7 +139,7 @@ const_debug unsigned int sysctl_sched_features =
 	#name ,
 
 static __read_mostly char *sched_feat_names[] = {
-#include "sched_features.h"
+#include "features.h"
 	NULL
 };
 
@@ -860,7 +149,7 @@ static int sched_feat_show(struct seq_file *m, void *v)
 {
 	int i;
 
-	for (i = 0; sched_feat_names[i]; i++) {
+	for (i = 0; i < __SCHED_FEAT_NR; i++) {
 		if (!(sysctl_sched_features & (1UL << i)))
 			seq_puts(m, "NO_");
 		seq_printf(m, "%s ", sched_feat_names[i]);
@@ -870,6 +159,36 @@ static int sched_feat_show(struct seq_file *m, void *v)
 	return 0;
 }
 
+#ifdef HAVE_JUMP_LABEL
+
+#define jump_label_key__true  jump_label_key_enabled
+#define jump_label_key__false jump_label_key_disabled
+
+#define SCHED_FEAT(name, enabled)	\
+	jump_label_key__##enabled ,
+
+struct jump_label_key sched_feat_keys[__SCHED_FEAT_NR] = {
+#include "features.h"
+};
+
+#undef SCHED_FEAT
+
+static void sched_feat_disable(int i)
+{
+	if (jump_label_enabled(&sched_feat_keys[i]))
+		jump_label_dec(&sched_feat_keys[i]);
+}
+
+static void sched_feat_enable(int i)
+{
+	if (!jump_label_enabled(&sched_feat_keys[i]))
+		jump_label_inc(&sched_feat_keys[i]);
+}
+#else
+static void sched_feat_disable(int i) { };
+static void sched_feat_enable(int i) { };
+#endif /* HAVE_JUMP_LABEL */
+
 static ssize_t
 sched_feat_write(struct file *filp, const char __user *ubuf,
 		size_t cnt, loff_t *ppos)
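The SCHED_FEAT() expansions touched here — the bit-index enum (relocated by this patch alongside the other shared definitions), the sysctl_sched_features default bitmask, the sched_feat_names[] table, and the new jump-label key array — all re-include the same feature list with a different definition of the macro. Below is a self-contained userspace sketch of that x-macro pattern; the FEATURES() list macro stands in for "features.h", and the three feature names are only examples.

#include <stdio.h>

#define FEATURES(F)			\
	F(GENTLE_FAIR_SLEEPERS, 1)	\
	F(START_DEBIT, 1)		\
	F(HRTICK, 0)

/* first expansion: an enum assigning each feature a bit index */
#define SCHED_FEAT(name, enabled) __SCHED_FEAT_##name,
enum { FEATURES(SCHED_FEAT) __SCHED_FEAT_NR };
#undef SCHED_FEAT

/* second expansion: OR together the bits of the features enabled by default */
#define SCHED_FEAT(name, enabled) (1UL << __SCHED_FEAT_##name) * enabled |
static unsigned long sysctl_sched_features = FEATURES(SCHED_FEAT) 0;
#undef SCHED_FEAT

/* third expansion: a name table indexed by the same enum */
#define SCHED_FEAT(name, enabled) #name,
static const char *sched_feat_names[] = { FEATURES(SCHED_FEAT) };
#undef SCHED_FEAT

int main(void)
{
	int i;

	/* same output shape as sched_feat_show(): "NO_" marks a cleared bit */
	for (i = 0; i < __SCHED_FEAT_NR; i++)
		printf("%s%s\n",
		       (sysctl_sched_features & (1UL << i)) ? "" : "NO_",
		       sched_feat_names[i]);
	return 0;
}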
@@ -893,17 +212,20 @@ sched_feat_write(struct file *filp, const char __user *ubuf,
 		cmp += 3;
 	}
 
-	for (i = 0; sched_feat_names[i]; i++) {
+	for (i = 0; i < __SCHED_FEAT_NR; i++) {
 		if (strcmp(cmp, sched_feat_names[i]) == 0) {
-			if (neg)
+			if (neg) {
 				sysctl_sched_features &= ~(1UL << i);
-			else
+				sched_feat_disable(i);
+			} else {
 				sysctl_sched_features |= (1UL << i);
+				sched_feat_enable(i);
+			}
 			break;
 		}
 	}
 
-	if (!sched_feat_names[i])
+	if (i == __SCHED_FEAT_NR)
 		return -EINVAL;
 
 	*ppos += cnt;
@@ -932,10 +254,7 @@ static __init int sched_init_debug(void)
 	return 0;
 }
 late_initcall(sched_init_debug);
-
-#endif
-
-#define sched_feat(x) (sysctl_sched_features & (1UL << __SCHED_FEAT_##x))
+#endif /* CONFIG_SCHED_DEBUG */
 
 /*
  * Number of tasks to iterate in a single balance run.
@@ -957,7 +276,7 @@ const_debug unsigned int sysctl_sched_time_avg = MSEC_PER_SEC;
  */
 unsigned int sysctl_sched_rt_period = 1000000;
 
-static __read_mostly int scheduler_running;
+__read_mostly int scheduler_running;
 
 /*
  * part of the period that we allow rt tasks to run in us.
@@ -965,112 +284,7 @@ static __read_mostly int scheduler_running;
  */
 int sysctl_sched_rt_runtime = 950000;
 
-static inline u64 global_rt_period(void)
-{
-	return (u64)sysctl_sched_rt_period * NSEC_PER_USEC;
-}
-
-static inline u64 global_rt_runtime(void)
-{
-	if (sysctl_sched_rt_runtime < 0)
-		return RUNTIME_INF;
-
-	return (u64)sysctl_sched_rt_runtime * NSEC_PER_USEC;
-}
-
-#ifndef prepare_arch_switch
-# define prepare_arch_switch(next)	do { } while (0)
-#endif
-#ifndef finish_arch_switch
-# define finish_arch_switch(prev)	do { } while (0)
-#endif
-
-static inline int task_current(struct rq *rq, struct task_struct *p)
-{
-	return rq->curr == p;
-}
-
-static inline int task_running(struct rq *rq, struct task_struct *p)
-{
-#ifdef CONFIG_SMP
-	return p->on_cpu;
-#else
-	return task_current(rq, p);
-#endif
-}
-
-#ifndef __ARCH_WANT_UNLOCKED_CTXSW
-static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next)
-{
-#ifdef CONFIG_SMP
-	/*
-	 * We can optimise this out completely for !SMP, because the
-	 * SMP rebalancing from interrupt is the only thing that cares
-	 * here.
-	 */
-	next->on_cpu = 1;
-#endif
-}
-
-static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
-{
-#ifdef CONFIG_SMP
-	/*
-	 * After ->on_cpu is cleared, the task can be moved to a different CPU.
-	 * We must ensure this doesn't happen until the switch is completely
-	 * finished.
-	 */
-	smp_wmb();
-	prev->on_cpu = 0;
-#endif
-#ifdef CONFIG_DEBUG_SPINLOCK
-	/* this is a valid case when another task releases the spinlock */
-	rq->lock.owner = current;
-#endif
-	/*
-	 * If we are tracking spinlock dependencies then we have to
-	 * fix up the runqueue lock - which gets 'carried over' from
-	 * prev into current:
-	 */
-	spin_acquire(&rq->lock.dep_map, 0, 0, _THIS_IP_);
-
-	raw_spin_unlock_irq(&rq->lock);
-}
-
-#else /* __ARCH_WANT_UNLOCKED_CTXSW */
-static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next)
-{
-#ifdef CONFIG_SMP
-	/*
-	 * We can optimise this out completely for !SMP, because the
-	 * SMP rebalancing from interrupt is the only thing that cares
-	 * here.
-	 */
-	next->on_cpu = 1;
-#endif
-#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW
-	raw_spin_unlock_irq(&rq->lock);
-#else
-	raw_spin_unlock(&rq->lock);
-#endif
-}
-
-static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
-{
-#ifdef CONFIG_SMP
-	/*
-	 * After ->on_cpu is cleared, the task can be moved to a different CPU.
-	 * We must ensure this doesn't happen until the switch is completely
-	 * finished.
-	 */
-	smp_wmb();
-	prev->on_cpu = 0;
-#endif
-#ifndef __ARCH_WANT_INTERRUPTS_ON_CTXSW
-	local_irq_enable();
-#endif
-}
-#endif /* __ARCH_WANT_UNLOCKED_CTXSW */
 
 /*
  * __task_rq_lock - lock the rq @p resides on.
@@ -1153,20 +367,6 @@ static struct rq *this_rq_lock(void)
  *  rq->lock.
  */
 
-/*
- * Use hrtick when:
- *  - enabled by features
- *  - hrtimer is actually high res
- */
-static inline int hrtick_enabled(struct rq *rq)
-{
-	if (!sched_feat(HRTICK))
-		return 0;
-	if (!cpu_active(cpu_of(rq)))
-		return 0;
-	return hrtimer_is_hres_active(&rq->hrtick_timer);
-}
-
 static void hrtick_clear(struct rq *rq)
 {
 	if (hrtimer_active(&rq->hrtick_timer))
@@ -1210,7 +410,7 @@ static void __hrtick_start(void *arg)
  *
  * called with rq->lock held and irqs disabled
  */
-static void hrtick_start(struct rq *rq, u64 delay)
+void hrtick_start(struct rq *rq, u64 delay)
 {
 	struct hrtimer *timer = &rq->hrtick_timer;
 	ktime_t time = ktime_add_ns(timer->base->get_time(), delay);
@@ -1254,7 +454,7 @@ static __init void init_hrtick(void)
  *
  * called with rq->lock held and irqs disabled
  */
-static void hrtick_start(struct rq *rq, u64 delay)
+void hrtick_start(struct rq *rq, u64 delay)
 {
 	__hrtimer_start_range_ns(&rq->hrtick_timer, ns_to_ktime(delay), 0,
 			HRTIMER_MODE_REL_PINNED, 0);
@@ -1305,7 +505,7 @@ static inline void init_hrtick(void)
 #define tsk_is_polling(t) test_tsk_thread_flag(t, TIF_POLLING_NRFLAG)
 #endif
 
-static void resched_task(struct task_struct *p)
+void resched_task(struct task_struct *p)
 {
 	int cpu;
 
@@ -1326,7 +526,7 @@ static void resched_task(struct task_struct *p)
 	smp_send_reschedule(cpu);
 }
 
-static void resched_cpu(int cpu)
+void resched_cpu(int cpu)
 {
 	struct rq *rq = cpu_rq(cpu);
 	unsigned long flags;
@@ -1388,245 +588,71 @@ void wake_up_idle_cpu(int cpu)
 	 * to idle and has not yet set rq->curr to idle then it will
 	 * be serialized on the timer wheel base lock and take the new
 	 * timer into account automatically.
-	 */
-	if (rq->curr != rq->idle)
-		return;
-
-	/*
-	 * We can set TIF_RESCHED on the idle task of the other CPU
-	 * lockless. The worst case is that the other CPU runs the
-	 * idle task through an additional NOOP schedule()
-	 */
-	set_tsk_need_resched(rq->idle);
-
-	/* NEED_RESCHED must be visible before we test polling */
-	smp_mb();
-	if (!tsk_is_polling(rq->idle))
-		smp_send_reschedule(cpu);
-}
-
-static inline bool got_nohz_idle_kick(void)
-{
-	return idle_cpu(smp_processor_id()) && this_rq()->nohz_balance_kick;
-}
-
-#else /* CONFIG_NO_HZ */
-
-static inline bool got_nohz_idle_kick(void)
-{
-	return false;
-}
-
-#endif /* CONFIG_NO_HZ */
-
-static u64 sched_avg_period(void)
-{
-	return (u64)sysctl_sched_time_avg * NSEC_PER_MSEC / 2;
-}
-
-static void sched_avg_update(struct rq *rq)
-{
-	s64 period = sched_avg_period();
-
-	while ((s64)(rq->clock - rq->age_stamp) > period) {
-		/*
-		 * Inline assembly required to prevent the compiler
-		 * optimising this loop into a divmod call.
-		 * See __iter_div_u64_rem() for another example of this.
-		 */
-		asm("" : "+rm" (rq->age_stamp));
-		rq->age_stamp += period;
-		rq->rt_avg /= 2;
-	}
-}
-
-static void sched_rt_avg_update(struct rq *rq, u64 rt_delta)
-{
-	rq->rt_avg += rt_delta;
-	sched_avg_update(rq);
-}
-
-#else /* !CONFIG_SMP */
-static void resched_task(struct task_struct *p)
-{
-	assert_raw_spin_locked(&task_rq(p)->lock);
-	set_tsk_need_resched(p);
-}
-
-static void sched_rt_avg_update(struct rq *rq, u64 rt_delta)
-{
-}
-
-static void sched_avg_update(struct rq *rq)
-{
-}
-#endif /* CONFIG_SMP */
-
-#if BITS_PER_LONG == 32
-# define WMULT_CONST	(~0UL)
-#else
-# define WMULT_CONST	(1UL << 32)
-#endif
-
-#define WMULT_SHIFT	32
-
-/*
- * Shift right and round:
- */
-#define SRR(x, y) (((x) + (1UL << ((y) - 1))) >> (y))
-
-/*
- * delta *= weight / lw
- */
-static unsigned long
-calc_delta_mine(unsigned long delta_exec, unsigned long weight,
-		struct load_weight *lw)
-{
-	u64 tmp;
-
-	/*
-	 * weight can be less than 2^SCHED_LOAD_RESOLUTION for task group sched
-	 * entities since MIN_SHARES = 2. Treat weight as 1 if less than
-	 * 2^SCHED_LOAD_RESOLUTION.
-	 */
-	if (likely(weight > (1UL << SCHED_LOAD_RESOLUTION)))
-		tmp = (u64)delta_exec * scale_load_down(weight);
-	else
-		tmp = (u64)delta_exec;
-
-	if (!lw->inv_weight) {
-		unsigned long w = scale_load_down(lw->weight);
-
-		if (BITS_PER_LONG > 32 && unlikely(w >= WMULT_CONST))
-			lw->inv_weight = 1;
-		else if (unlikely(!w))
-			lw->inv_weight = WMULT_CONST;
-		else
-			lw->inv_weight = WMULT_CONST / w;
-	}
+	 */
+	if (rq->curr != rq->idle)
+		return;
 
 	/*
-	 * Check whether we'd overflow the 64-bit multiplication:
+	 * We can set TIF_RESCHED on the idle task of the other CPU
+	 * lockless. The worst case is that the other CPU runs the
+	 * idle task through an additional NOOP schedule()
 	 */
-	if (unlikely(tmp > WMULT_CONST))
-		tmp = SRR(SRR(tmp, WMULT_SHIFT/2) * lw->inv_weight,
-			WMULT_SHIFT/2);
-	else
-		tmp = SRR(tmp * lw->inv_weight, WMULT_SHIFT);
+	set_tsk_need_resched(rq->idle);
 
-	return (unsigned long)min(tmp, (u64)(unsigned long)LONG_MAX);
+	/* NEED_RESCHED must be visible before we test polling */
+	smp_mb();
+	if (!tsk_is_polling(rq->idle))
+		smp_send_reschedule(cpu);
 }
 
-static inline void update_load_add(struct load_weight *lw, unsigned long inc)
+static inline bool got_nohz_idle_kick(void)
 {
-	lw->weight += inc;
-	lw->inv_weight = 0;
+	int cpu = smp_processor_id();
+	return idle_cpu(cpu) && test_bit(NOHZ_BALANCE_KICK, nohz_flags(cpu));
 }
 
-static inline void update_load_sub(struct load_weight *lw, unsigned long dec)
-{
-	lw->weight -= dec;
-	lw->inv_weight = 0;
-}
+#else /* CONFIG_NO_HZ */
 
-static inline void update_load_set(struct load_weight *lw, unsigned long w)
+static inline bool got_nohz_idle_kick(void)
 {
-	lw->weight = w;
-	lw->inv_weight = 0;
+	return false;
 }
 
-/*
- * To aid in avoiding the subversion of "niceness" due to uneven distribution
- * of tasks with abnormal "nice" values across CPUs the contribution that
- * each task makes to its run queue's load is weighted according to its
- * scheduling class and "nice" value. For SCHED_NORMAL tasks this is just a
- * scaled version of the new time slice allocation that they receive on time
- * slice expiry etc.
- */
-
-#define WEIGHT_IDLEPRIO                3
-#define WMULT_IDLEPRIO         1431655765
-
-/*
- * Nice levels are multiplicative, with a gentle 10% change for every
- * nice level changed. I.e. when a CPU-bound task goes from nice 0 to
- * nice 1, it will get ~10% less CPU time than another CPU-bound task
- * that remained on nice 0.
- *
- * The "10% effect" is relative and cumulative: from _any_ nice level,
- * if you go up 1 level, it's -10% CPU usage, if you go down 1 level
- * it's +10% CPU usage. (to achieve that we use a multiplier of 1.25.
- * If a task goes up by ~10% and another task goes down by ~10% then
- * the relative distance between them is ~25%.)
- */
-static const int prio_to_weight[40] = {
- /* -20 */     88761,     71755,     56483,     46273,     36291,
- /* -15 */     29154,     23254,     18705,     14949,     11916,
- /* -10 */      9548,      7620,      6100,      4904,      3906,
- /*  -5 */      3121,      2501,      1991,      1586,      1277,
- /*   0 */      1024,       820,       655,       526,       423,
- /*   5 */       335,       272,       215,       172,       137,
- /*  10 */       110,        87,        70,        56,        45,
- /*  15 */        36,        29,        23,        18,        15,
-};
-
-/*
- * Inverse (2^32/x) values of the prio_to_weight[] array, precalculated.
- *
- * In cases where the weight does not change often, we can use the
- * precalculated inverse to speed up arithmetics by turning divisions
- * into multiplications:
- */
-static const u32 prio_to_wmult[40] = {
- /* -20 */     48388,     59856,     76040,     92818,    118348,
- /* -15 */    147320,    184698,    229616,    287308,    360437,
- /* -10 */    449829,    563644,    704093,    875809,   1099582,
- /*  -5 */   1376151,   1717300,   2157191,   2708050,   3363326,
- /*   0 */   4194304,   5237765,   6557202,   8165337,  10153587,
- /*   5 */  12820798,  15790321,  19976592,  24970740,  31350126,
- /*  10 */  39045157,  49367440,  61356676,  76695844,  95443717,
- /*  15 */ 119304647, 148102320, 186737708, 238609294, 286331153,
-};
-
-/* Time spent by the tasks of the cpu accounting group executing in ... */
-enum cpuacct_stat_index {
-	CPUACCT_STAT_USER,	/* ... user mode */
-	CPUACCT_STAT_SYSTEM,	/* ... kernel mode */
-
-	CPUACCT_STAT_NSTATS,
-};
-
-#ifdef CONFIG_CGROUP_CPUACCT
-static void cpuacct_charge(struct task_struct *tsk, u64 cputime);
-static void cpuacct_update_stats(struct task_struct *tsk,
-		enum cpuacct_stat_index idx, cputime_t val);
-#else
-static inline void cpuacct_charge(struct task_struct *tsk, u64 cputime) {}
-static inline void cpuacct_update_stats(struct task_struct *tsk,
-		enum cpuacct_stat_index idx, cputime_t val) {}
-#endif
+#endif /* CONFIG_NO_HZ */
 
-static inline void inc_cpu_load(struct rq *rq, unsigned long load)
+void sched_avg_update(struct rq *rq)
 {
-	update_load_add(&rq->load, load);
+	s64 period = sched_avg_period();
+
+	while ((s64)(rq->clock - rq->age_stamp) > period) {
+		/*
+		 * Inline assembly required to prevent the compiler
+		 * optimising this loop into a divmod call.
+		 * See __iter_div_u64_rem() for another example of this.
+		 */
+		asm("" : "+rm" (rq->age_stamp));
+		rq->age_stamp += period;
+		rq->rt_avg /= 2;
+	}
 }
 
-static inline void dec_cpu_load(struct rq *rq, unsigned long load)
+#else /* !CONFIG_SMP */
+void resched_task(struct task_struct *p)
 {
-	update_load_sub(&rq->load, load);
+	assert_raw_spin_locked(&task_rq(p)->lock);
+	set_tsk_need_resched(p);
 }
+#endif /* CONFIG_SMP */
 
 #if defined(CONFIG_RT_GROUP_SCHED) || (defined(CONFIG_FAIR_GROUP_SCHED) && \
 			(defined(CONFIG_SMP) || defined(CONFIG_CFS_BANDWIDTH)))
-typedef int (*tg_visitor)(struct task_group *, void *);
-
 /*
  * Iterate task_group tree rooted at *from, calling @down when first entering a
  * node and @up when leaving it for the final time.
  *
  * Caller must hold rcu_lock or sufficient equivalent.
  */
-static int walk_tg_tree_from(struct task_group *from,
+int walk_tg_tree_from(struct task_group *from,
 			     tg_visitor down, tg_visitor up, void *data)
 {
 	struct task_group *parent, *child;
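The prio_to_weight[] table and its "10% effect" comment move out of this file unchanged, and the numbers themselves encode that rule. The small standalone calculation below (weights copied from the table, everything else purely illustrative) shows where the ~1.25 step, the ~10% per-task swing and the ~25% relative distance come from.

#include <stdio.h>

/* weights copied from prio_to_weight[] above */
static const double nice0 = 1024.0;	/* nice  0 */
static const double nice1 = 820.0;	/* nice +1 */

int main(void)
{
	double total = nice0 + nice1;
	double share0 = 100.0 * nice0 / total;	/* CPU share of the nice 0 task */
	double share1 = 100.0 * nice1 / total;	/* CPU share of the nice +1 task */

	printf("step multiplier 1024/820       = %.3f (~1.25)\n", nice0 / nice1);
	printf("nice 0 task share              = %.1f%%\n", share0);
	printf("nice 1 task share              = %.1f%%\n", share1);
	printf("swing per task vs. 50/50 split = %.1f%%\n",
	       100.0 * (share0 - 50.0) / 50.0);
	printf("relative distance between them = %.1f%%\n",
	       100.0 * (share0 - share1) / share1);
	return 0;
}

With two CPU-bound tasks one nice level apart this prints roughly 55.5% vs. 44.5%, i.e. about a 10% move each relative to an even split and about 25% between the two tasks, matching the comment.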
@@ -1657,270 +683,13 @@ out:
|
|
return ret;
|
|
return ret;
|
|
}
|
|
}
|
|
|
|
|
|
-/*
|
|
|
|
- * Iterate the full tree, calling @down when first entering a node and @up when
|
|
|
|
- * leaving it for the final time.
|
|
|
|
- *
|
|
|
|
- * Caller must hold rcu_lock or sufficient equivalent.
|
|
|
|
- */
|
|
|
|
-
|
|
|
|
-static inline int walk_tg_tree(tg_visitor down, tg_visitor up, void *data)
|
|
|
|
-{
|
|
|
|
- return walk_tg_tree_from(&root_task_group, down, up, data);
|
|
|
|
-}
|
|
|
|
-
|
|
|
|
-static int tg_nop(struct task_group *tg, void *data)
|
|
|
|
-{
|
|
|
|
- return 0;
|
|
|
|
-}
|
|
|
|
-#endif
|
|
|
|
-
|
|
|
|
-#ifdef CONFIG_SMP
|
|
|
|
-/* Used instead of source_load when we know the type == 0 */
|
|
|
|
-static unsigned long weighted_cpuload(const int cpu)
|
|
|
|
-{
|
|
|
|
- return cpu_rq(cpu)->load.weight;
|
|
|
|
-}
|
|
|
|
-
|
|
|
|
-/*
|
|
|
|
- * Return a low guess at the load of a migration-source cpu weighted
|
|
|
|
- * according to the scheduling class and "nice" value.
|
|
|
|
- *
|
|
|
|
- * We want to under-estimate the load of migration sources, to
|
|
|
|
- * balance conservatively.
|
|
|
|
- */
|
|
|
|
-static unsigned long source_load(int cpu, int type)
|
|
|
|
-{
|
|
|
|
- struct rq *rq = cpu_rq(cpu);
|
|
|
|
- unsigned long total = weighted_cpuload(cpu);
|
|
|
|
-
|
|
|
|
- if (type == 0 || !sched_feat(LB_BIAS))
|
|
|
|
- return total;
|
|
|
|
-
|
|
|
|
- return min(rq->cpu_load[type-1], total);
|
|
|
|
-}
|
|
|
|
-
|
|
|
|
-/*
|
|
|
|
- * Return a high guess at the load of a migration-target cpu weighted
|
|
|
|
- * according to the scheduling class and "nice" value.
|
|
|
|
- */
|
|
|
|
-static unsigned long target_load(int cpu, int type)
|
|
|
|
-{
|
|
|
|
- struct rq *rq = cpu_rq(cpu);
|
|
|
|
- unsigned long total = weighted_cpuload(cpu);
|
|
|
|
-
|
|
|
|
- if (type == 0 || !sched_feat(LB_BIAS))
|
|
|
|
- return total;
|
|
|
|
-
|
|
|
|
- return max(rq->cpu_load[type-1], total);
|
|
|
|
-}
|
|
|
|
-
|
|
|
|
-static unsigned long power_of(int cpu)
|
|
|
|
-{
|
|
|
|
- return cpu_rq(cpu)->cpu_power;
|
|
|
|
-}
|
|
|
|
-
|
|
|
|
-static int task_hot(struct task_struct *p, u64 now, struct sched_domain *sd);
|
|
|
|
-
|
|
|
|
-static unsigned long cpu_avg_load_per_task(int cpu)
|
|
|
|
|
|
+int tg_nop(struct task_group *tg, void *data)
|
|
{
|
|
{
|
|
- struct rq *rq = cpu_rq(cpu);
|
|
|
|
- unsigned long nr_running = ACCESS_ONCE(rq->nr_running);
|
|
|
|
-
|
|
|
|
- if (nr_running)
|
|
|
|
- return rq->load.weight / nr_running;
|
|
|
|
-
|
|
|
|
return 0;
|
|
return 0;
|
|
}
|
|
}
|
|
-
|
|
|
|
-#ifdef CONFIG_PREEMPT
|
|
|
|
-
|
|
|
|
-static void double_rq_lock(struct rq *rq1, struct rq *rq2);
|
|
|
|
-
|
|
|
|
-/*
|
|
|
|
- * fair double_lock_balance: Safely acquires both rq->locks in a fair
|
|
|
|
- * way at the expense of forcing extra atomic operations in all
|
|
|
|
- * invocations. This assures that the double_lock is acquired using the
|
|
|
|
- * same underlying policy as the spinlock_t on this architecture, which
|
|
|
|
- * reduces latency compared to the unfair variant below. However, it
|
|
|
|
- * also adds more overhead and therefore may reduce throughput.
|
|
|
|
- */
|
|
|
|
-static inline int _double_lock_balance(struct rq *this_rq, struct rq *busiest)
|
|
|
|
- __releases(this_rq->lock)
|
|
|
|
- __acquires(busiest->lock)
|
|
|
|
- __acquires(this_rq->lock)
|
|
|
|
-{
|
|
|
|
- raw_spin_unlock(&this_rq->lock);
|
|
|
|
- double_rq_lock(this_rq, busiest);
|
|
|
|
-
|
|
|
|
- return 1;
|
|
|
|
-}
|
|
|
|
-
|
|
|
|
-#else
|
|
|
|
-/*
|
|
|
|
- * Unfair double_lock_balance: Optimizes throughput at the expense of
|
|
|
|
- * latency by eliminating extra atomic operations when the locks are
|
|
|
|
- * already in proper order on entry. This favors lower cpu-ids and will
|
|
|
|
- * grant the double lock to lower cpus over higher ids under contention,
|
|
|
|
- * regardless of entry order into the function.
|
|
|
|
- */
|
|
|
|
-static int _double_lock_balance(struct rq *this_rq, struct rq *busiest)
|
|
|
|
- __releases(this_rq->lock)
|
|
|
|
- __acquires(busiest->lock)
|
|
|
|
- __acquires(this_rq->lock)
|
|
|
|
-{
|
|
|
|
- int ret = 0;
|
|
|
|
-
|
|
|
|
- if (unlikely(!raw_spin_trylock(&busiest->lock))) {
|
|
|
|
- if (busiest < this_rq) {
|
|
|
|
- raw_spin_unlock(&this_rq->lock);
|
|
|
|
- raw_spin_lock(&busiest->lock);
|
|
|
|
- raw_spin_lock_nested(&this_rq->lock,
|
|
|
|
- SINGLE_DEPTH_NESTING);
|
|
|
|
- ret = 1;
|
|
|
|
- } else
|
|
|
|
- raw_spin_lock_nested(&busiest->lock,
|
|
|
|
- SINGLE_DEPTH_NESTING);
|
|
|
|
- }
|
|
|
|
- return ret;
|
|
|
|
-}
|
|
|
|
-
|
|
|
|
-#endif /* CONFIG_PREEMPT */
|
|
|
|
-
|
|
|
|
-/*
|
|
|
|
- * double_lock_balance - lock the busiest runqueue, this_rq is locked already.
|
|
|
|
- */
|
|
|
|
-static int double_lock_balance(struct rq *this_rq, struct rq *busiest)
|
|
|
|
-{
|
|
|
|
- if (unlikely(!irqs_disabled())) {
|
|
|
|
- /* printk() doesn't work good under rq->lock */
|
|
|
|
- raw_spin_unlock(&this_rq->lock);
|
|
|
|
- BUG_ON(1);
|
|
|
|
- }
|
|
|
|
-
|
|
|
|
- return _double_lock_balance(this_rq, busiest);
|
|
|
|
-}
|
|
|
|
-
|
|
|
|
-static inline void double_unlock_balance(struct rq *this_rq, struct rq *busiest)
|
|
|
|
- __releases(busiest->lock)
|
|
|
|
-{
|
|
|
|
- raw_spin_unlock(&busiest->lock);
|
|
|
|
- lock_set_subclass(&this_rq->lock.dep_map, 0, _RET_IP_);
|
|
|
|
-}
|
|
|
|
-
|
|
|
|
-/*
|
|
|
|
- * double_rq_lock - safely lock two runqueues
|
|
|
|
- *
|
|
|
|
- * Note this does not disable interrupts like task_rq_lock,
- * you need to do so manually before calling.
- */
-static void double_rq_lock(struct rq *rq1, struct rq *rq2)
-	__acquires(rq1->lock)
-	__acquires(rq2->lock)
-{
-	BUG_ON(!irqs_disabled());
-	if (rq1 == rq2) {
-		raw_spin_lock(&rq1->lock);
-		__acquire(rq2->lock);	/* Fake it out ;) */
-	} else {
-		if (rq1 < rq2) {
-			raw_spin_lock(&rq1->lock);
-			raw_spin_lock_nested(&rq2->lock, SINGLE_DEPTH_NESTING);
-		} else {
-			raw_spin_lock(&rq2->lock);
-			raw_spin_lock_nested(&rq1->lock, SINGLE_DEPTH_NESTING);
-		}
-	}
-}
-
-/*
- * double_rq_unlock - safely unlock two runqueues
- *
- * Note this does not restore interrupts like task_rq_unlock,
- * you need to do so manually after calling.
- */
-static void double_rq_unlock(struct rq *rq1, struct rq *rq2)
-	__releases(rq1->lock)
-	__releases(rq2->lock)
-{
-	raw_spin_unlock(&rq1->lock);
-	if (rq1 != rq2)
-		raw_spin_unlock(&rq2->lock);
-	else
-		__release(rq2->lock);
-}
-
-#else /* CONFIG_SMP */
-
-/*
- * double_rq_lock - safely lock two runqueues
- *
- * Note this does not disable interrupts like task_rq_lock,
- * you need to do so manually before calling.
- */
-static void double_rq_lock(struct rq *rq1, struct rq *rq2)
-	__acquires(rq1->lock)
-	__acquires(rq2->lock)
-{
-	BUG_ON(!irqs_disabled());
-	BUG_ON(rq1 != rq2);
-	raw_spin_lock(&rq1->lock);
-	__acquire(rq2->lock);	/* Fake it out ;) */
-}
-
-/*
- * double_rq_unlock - safely unlock two runqueues
- *
- * Note this does not restore interrupts like task_rq_unlock,
- * you need to do so manually after calling.
- */
-static void double_rq_unlock(struct rq *rq1, struct rq *rq2)
-	__releases(rq1->lock)
-	__releases(rq2->lock)
-{
-	BUG_ON(rq1 != rq2);
-	raw_spin_unlock(&rq1->lock);
-	__release(rq2->lock);
-}
-
#endif

-static void calc_load_account_idle(struct rq *this_rq);
-static void update_sysctl(void);
-static int get_update_sysctl_factor(void);
-static void update_cpu_load(struct rq *this_rq);
-
-static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu)
-{
-	set_task_rq(p, cpu);
-#ifdef CONFIG_SMP
-	/*
-	 * After ->cpu is set up to a new value, task_rq_lock(p, ...) can be
-	 * successfully executed on another CPU. We must ensure that updates of
-	 * per-task data have been completed by this moment.
-	 */
-	smp_wmb();
-	task_thread_info(p)->cpu = cpu;
-#endif
-}
-
-static const struct sched_class rt_sched_class;
-
-#define sched_class_highest (&stop_sched_class)
-#define for_each_class(class) \
-	for (class = sched_class_highest; class; class = class->next)
-
-#include "sched_stats.h"
-
-static void inc_nr_running(struct rq *rq)
-{
-	rq->nr_running++;
-}
-
-static void dec_nr_running(struct rq *rq)
-{
-	rq->nr_running--;
-}
+void update_cpu_load(struct rq *this_rq);

static void set_load_weight(struct task_struct *p)
{
@@ -1957,7 +726,7 @@ static void dequeue_task(struct rq *rq, struct task_struct *p, int flags)
/*
 * activate_task - move a task to the runqueue.
 */
-static void activate_task(struct rq *rq, struct task_struct *p, int flags)
+void activate_task(struct rq *rq, struct task_struct *p, int flags)
{
	if (task_contributes_to_load(p))
		rq->nr_uninterruptible--;
@@ -1968,7 +737,7 @@ static void activate_task(struct rq *rq, struct task_struct *p, int flags)
/*
 * deactivate_task - remove a task from the runqueue.
 */
-static void deactivate_task(struct rq *rq, struct task_struct *p, int flags)
+void deactivate_task(struct rq *rq, struct task_struct *p, int flags)
{
	if (task_contributes_to_load(p))
		rq->nr_uninterruptible++;
@@ -2159,14 +928,14 @@ static void update_rq_clock_task(struct rq *rq, s64 delta)
#ifdef CONFIG_IRQ_TIME_ACCOUNTING
static int irqtime_account_hi_update(void)
{
-	struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
+	u64 *cpustat = kcpustat_this_cpu->cpustat;
	unsigned long flags;
	u64 latest_ns;
	int ret = 0;

	local_irq_save(flags);
	latest_ns = this_cpu_read(cpu_hardirq_time);
-	if (nsecs_to_cputime64(latest_ns) > cpustat->irq)
+	if (nsecs_to_cputime64(latest_ns) > cpustat[CPUTIME_IRQ])
		ret = 1;
	local_irq_restore(flags);
	return ret;
@@ -2174,14 +943,14 @@ static int irqtime_account_hi_update(void)

static int irqtime_account_si_update(void)
{
-	struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
+	u64 *cpustat = kcpustat_this_cpu->cpustat;
	unsigned long flags;
	u64 latest_ns;
	int ret = 0;

	local_irq_save(flags);
	latest_ns = this_cpu_read(cpu_softirq_time);
-	if (nsecs_to_cputime64(latest_ns) > cpustat->softirq)
+	if (nsecs_to_cputime64(latest_ns) > cpustat[CPUTIME_SOFTIRQ])
		ret = 1;
	local_irq_restore(flags);
	return ret;
@@ -2193,15 +962,6 @@ static int irqtime_account_si_update(void)

#endif

-#include "sched_idletask.c"
-#include "sched_fair.c"
-#include "sched_rt.c"
-#include "sched_autogroup.c"
-#include "sched_stoptask.c"
-#ifdef CONFIG_SCHED_DEBUG
-# include "sched_debug.c"
-#endif
-
void sched_set_stop_task(int cpu, struct task_struct *stop)
{
	struct sched_param param = { .sched_priority = MAX_RT_PRIO - 1 };
@@ -2299,7 +1059,7 @@ static inline void check_class_changed(struct rq *rq, struct task_struct *p,
		p->sched_class->prio_changed(rq, p, oldprio);
}

-static void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags)
+void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags)
{
	const struct sched_class *class;

@@ -2325,38 +1085,6 @@ static void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags)
}

#ifdef CONFIG_SMP
-/*
- * Is this task likely cache-hot:
- */
-static int
-task_hot(struct task_struct *p, u64 now, struct sched_domain *sd)
-{
-	s64 delta;
-
-	if (p->sched_class != &fair_sched_class)
-		return 0;
-
-	if (unlikely(p->policy == SCHED_IDLE))
-		return 0;
-
-	/*
-	 * Buddy candidates are cache hot:
-	 */
-	if (sched_feat(CACHE_HOT_BUDDY) && this_rq()->nr_running &&
-			(&p->se == cfs_rq_of(&p->se)->next ||
-			 &p->se == cfs_rq_of(&p->se)->last))
-		return 1;
-
-	if (sysctl_sched_migration_cost == -1)
-		return 1;
-	if (sysctl_sched_migration_cost == 0)
-		return 0;
-
-	delta = now - p->se.exec_start;
-
-	return delta < (s64)sysctl_sched_migration_cost;
-}
-
void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
{
#ifdef CONFIG_SCHED_DEBUG
@@ -3439,7 +2167,7 @@ calc_load(unsigned long load, unsigned long exp, unsigned long active)
 */
static atomic_long_t calc_load_tasks_idle;

-static void calc_load_account_idle(struct rq *this_rq)
+void calc_load_account_idle(struct rq *this_rq)
{
	long delta;

@@ -3583,7 +2311,7 @@ static void calc_global_nohz(unsigned long ticks)
 */
}
#else
-static void calc_load_account_idle(struct rq *this_rq)
+void calc_load_account_idle(struct rq *this_rq)
{
}

@@ -3726,7 +2454,7 @@ decay_load_missed(unsigned long load, unsigned long missed_updates, int idx)
 * scheduler tick (TICK_NSEC). With tickless idle this will not be called
 * every tick. We fix it up based on jiffies.
 */
-static void update_cpu_load(struct rq *this_rq)
+void update_cpu_load(struct rq *this_rq)
{
	unsigned long this_load = this_rq->load.weight;
	unsigned long curr_jiffies = jiffies;
@@ -3804,8 +2532,10 @@ unlock:
#endif

DEFINE_PER_CPU(struct kernel_stat, kstat);
+DEFINE_PER_CPU(struct kernel_cpustat, kernel_cpustat);

EXPORT_PER_CPU_SYMBOL(kstat);
+EXPORT_PER_CPU_SYMBOL(kernel_cpustat);

/*
 * Return any ns on the sched_clock that have not yet been accounted in
@@ -3858,6 +2588,42 @@ unsigned long long task_sched_runtime(struct task_struct *p)
	return ns;
}

+#ifdef CONFIG_CGROUP_CPUACCT
+struct cgroup_subsys cpuacct_subsys;
+struct cpuacct root_cpuacct;
+#endif
+
+static inline void task_group_account_field(struct task_struct *p, int index,
+					    u64 tmp)
+{
+#ifdef CONFIG_CGROUP_CPUACCT
+	struct kernel_cpustat *kcpustat;
+	struct cpuacct *ca;
+#endif
+	/*
+	 * Since all updates are sure to touch the root cgroup, we
+	 * get ourselves ahead and touch it first. If the root cgroup
+	 * is the only cgroup, then nothing else should be necessary.
+	 *
+	 */
+	__get_cpu_var(kernel_cpustat).cpustat[index] += tmp;
+
+#ifdef CONFIG_CGROUP_CPUACCT
+	if (unlikely(!cpuacct_subsys.active))
+		return;
+
+	rcu_read_lock();
+	ca = task_ca(p);
+	while (ca && (ca != &root_cpuacct)) {
+		kcpustat = this_cpu_ptr(ca->cpustat);
+		kcpustat->cpustat[index] += tmp;
+		ca = parent_ca(ca);
+	}
+	rcu_read_unlock();
+#endif
+}
+
+
/*
 * Account user cpu time to a process.
 * @p: the process that the cpu time gets accounted to
@@ -3867,20 +2633,18 @@ unsigned long long task_sched_runtime(struct task_struct *p)
void account_user_time(struct task_struct *p, cputime_t cputime,
		       cputime_t cputime_scaled)
{
-	struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
+	int index;

	/* Add user time to process. */
	p->utime += cputime;
	p->utimescaled += cputime_scaled;
	account_group_user_time(p, cputime);

+	index = (TASK_NICE(p) > 0) ? CPUTIME_NICE : CPUTIME_USER;
+
	/* Add user time to cpustat. */
-	if (TASK_NICE(p) > 0)
-		cpustat->nice += (__force cputime64_t) cputime;
-	else
-		cpustat->user += (__force cputime64_t) cputime;
+	task_group_account_field(p, index, (__force u64) cputime);

-	cpuacct_update_stats(p, CPUACCT_STAT_USER, cputime);
	/* Account for user time used */
	acct_update_integrals(p);
}
@@ -3894,7 +2658,7 @@ void account_user_time(struct task_struct *p, cputime_t cputime,
static void account_guest_time(struct task_struct *p, cputime_t cputime,
			       cputime_t cputime_scaled)
{
-	struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
+	u64 *cpustat = kcpustat_this_cpu->cpustat;

	/* Add guest time to process. */
	p->utime += cputime;
@@ -3904,11 +2668,11 @@ static void account_guest_time(struct task_struct *p, cputime_t cputime,

	/* Add guest time to cpustat. */
	if (TASK_NICE(p) > 0) {
-		cpustat->nice += (__force cputime64_t) cputime;
-		cpustat->guest_nice += (__force cputime64_t) cputime;
+		cpustat[CPUTIME_NICE] += (__force u64) cputime;
+		cpustat[CPUTIME_GUEST_NICE] += (__force u64) cputime;
	} else {
-		cpustat->user += (__force cputime64_t) cputime;
-		cpustat->guest += (__force cputime64_t) cputime;
+		cpustat[CPUTIME_USER] += (__force u64) cputime;
+		cpustat[CPUTIME_GUEST] += (__force u64) cputime;
	}
}

@@ -3921,7 +2685,7 @@ static void account_guest_time(struct task_struct *p, cputime_t cputime,
 */
static inline
void __account_system_time(struct task_struct *p, cputime_t cputime,
-			cputime_t cputime_scaled, cputime64_t *target_cputime64)
+			cputime_t cputime_scaled, int index)
{
	/* Add system time to process. */
	p->stime += cputime;
@@ -3929,8 +2693,7 @@ void __account_system_time(struct task_struct *p, cputime_t cputime,
	account_group_system_time(p, cputime);

	/* Add system time to cpustat. */
-	*target_cputime64 += (__force cputime64_t) cputime;
-	cpuacct_update_stats(p, CPUACCT_STAT_SYSTEM, cputime);
+	task_group_account_field(p, index, (__force u64) cputime);

	/* Account for system time used */
	acct_update_integrals(p);
@@ -3946,8 +2709,7 @@ void __account_system_time(struct task_struct *p, cputime_t cputime,
void account_system_time(struct task_struct *p, int hardirq_offset,
			 cputime_t cputime, cputime_t cputime_scaled)
{
-	struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
-	cputime64_t *target_cputime64;
+	int index;

	if ((p->flags & PF_VCPU) && (irq_count() - hardirq_offset == 0)) {
		account_guest_time(p, cputime, cputime_scaled);
@@ -3955,13 +2717,13 @@ void account_system_time(struct task_struct *p, int hardirq_offset,
	}

	if (hardirq_count() - hardirq_offset)
-		target_cputime64 = &cpustat->irq;
+		index = CPUTIME_IRQ;
	else if (in_serving_softirq())
-		target_cputime64 = &cpustat->softirq;
+		index = CPUTIME_SOFTIRQ;
	else
-		target_cputime64 = &cpustat->system;
+		index = CPUTIME_SYSTEM;

-	__account_system_time(p, cputime, cputime_scaled, target_cputime64);
+	__account_system_time(p, cputime, cputime_scaled, index);
}

/*
@@ -3970,9 +2732,9 @@ void account_system_time(struct task_struct *p, int hardirq_offset,
 */
void account_steal_time(cputime_t cputime)
{
-	struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
+	u64 *cpustat = kcpustat_this_cpu->cpustat;

-	cpustat->steal += (__force cputime64_t) cputime;
+	cpustat[CPUTIME_STEAL] += (__force u64) cputime;
}

/*
@@ -3981,13 +2743,13 @@ void account_steal_time(cputime_t cputime)
 */
void account_idle_time(cputime_t cputime)
{
-	struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
+	u64 *cpustat = kcpustat_this_cpu->cpustat;
	struct rq *rq = this_rq();

	if (atomic_read(&rq->nr_iowait) > 0)
-		cpustat->iowait += (__force cputime64_t) cputime;
+		cpustat[CPUTIME_IOWAIT] += (__force u64) cputime;
	else
-		cpustat->idle += (__force cputime64_t) cputime;
+		cpustat[CPUTIME_IDLE] += (__force u64) cputime;
}

static __always_inline bool steal_account_process_tick(void)
@@ -4037,15 +2799,15 @@ static void irqtime_account_process_tick(struct task_struct *p, int user_tick,
					 struct rq *rq)
{
	cputime_t one_jiffy_scaled = cputime_to_scaled(cputime_one_jiffy);
-	struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
+	u64 *cpustat = kcpustat_this_cpu->cpustat;

	if (steal_account_process_tick())
		return;

	if (irqtime_account_hi_update()) {
-		cpustat->irq += (__force cputime64_t) cputime_one_jiffy;
+		cpustat[CPUTIME_IRQ] += (__force u64) cputime_one_jiffy;
	} else if (irqtime_account_si_update()) {
-		cpustat->softirq += (__force cputime64_t) cputime_one_jiffy;
+		cpustat[CPUTIME_SOFTIRQ] += (__force u64) cputime_one_jiffy;
	} else if (this_cpu_ksoftirqd() == p) {
		/*
		 * ksoftirqd time do not get accounted in cpu_softirq_time.
@@ -4053,7 +2815,7 @@ static void irqtime_account_process_tick(struct task_struct *p, int user_tick,
		 * Also, p->stime needs to be updated for ksoftirqd.
		 */
		__account_system_time(p, cputime_one_jiffy, one_jiffy_scaled,
-					&cpustat->softirq);
+					CPUTIME_SOFTIRQ);
	} else if (user_tick) {
		account_user_time(p, cputime_one_jiffy, one_jiffy_scaled);
	} else if (p == rq->idle) {
@@ -4062,7 +2824,7 @@ static void irqtime_account_process_tick(struct task_struct *p, int user_tick,
		account_guest_time(p, cputime_one_jiffy, one_jiffy_scaled);
	} else {
		__account_system_time(p, cputime_one_jiffy, one_jiffy_scaled,
-					&cpustat->system);
+					CPUTIME_SYSTEM);
	}
}

@@ -5841,6 +4603,13 @@ again:
 */
		if (preempt && rq != p_rq)
			resched_task(p_rq->curr);
+	} else {
+		/*
+		 * We might have set it in task_yield_fair(), but are
+		 * not going to schedule(), so don't want to skip
+		 * the next update.
+		 */
+		rq->skip_clock_update = 0;
	}

out:
@@ -6008,7 +4777,7 @@ void sched_show_task(struct task_struct *p)
	free = stack_not_used(p);
#endif
	printk(KERN_CONT "%5lu %5d %6d 0x%08lx\n", free,
-		task_pid_nr(p), task_pid_nr(p->real_parent),
+		task_pid_nr(p), task_pid_nr(rcu_dereference(p->real_parent)),
		(unsigned long)task_thread_info(p)->flags);

	show_stack(p, NULL);
@@ -6094,64 +4863,17 @@ void __cpuinit init_idle(struct task_struct *idle, int cpu)
#endif
	raw_spin_unlock_irqrestore(&rq->lock, flags);

-	/* Set the preempt count _outside_ the spinlocks! */
-	task_thread_info(idle)->preempt_count = 0;
-
-	/*
-	 * The idle tasks have their own, simple scheduling class:
-	 */
-	idle->sched_class = &idle_sched_class;
-	ftrace_graph_init_idle_task(idle, cpu);
-#if defined(CONFIG_SMP)
-	sprintf(idle->comm, "%s/%d", INIT_TASK_COMM, cpu);
-#endif
-}
-
-/*
- * Increase the granularity value when there are more CPUs,
- * because with more CPUs the 'effective latency' as visible
- * to users decreases. But the relationship is not linear,
- * so pick a second-best guess by going with the log2 of the
- * number of CPUs.
- *
- * This idea comes from the SD scheduler of Con Kolivas:
- */
-static int get_update_sysctl_factor(void)
-{
-	unsigned int cpus = min_t(int, num_online_cpus(), 8);
-	unsigned int factor;
-
-	switch (sysctl_sched_tunable_scaling) {
-	case SCHED_TUNABLESCALING_NONE:
-		factor = 1;
-		break;
-	case SCHED_TUNABLESCALING_LINEAR:
-		factor = cpus;
-		break;
-	case SCHED_TUNABLESCALING_LOG:
-	default:
-		factor = 1 + ilog2(cpus);
-		break;
-	}
-
-	return factor;
-}
-
-static void update_sysctl(void)
-{
-	unsigned int factor = get_update_sysctl_factor();
-
-#define SET_SYSCTL(name) \
-	(sysctl_##name = (factor) * normalized_sysctl_##name)
-	SET_SYSCTL(sched_min_granularity);
-	SET_SYSCTL(sched_latency);
-	SET_SYSCTL(sched_wakeup_granularity);
-#undef SET_SYSCTL
-}
+	/* Set the preempt count _outside_ the spinlocks! */
+	task_thread_info(idle)->preempt_count = 0;

-static inline void sched_init_granularity(void)
-{
-	update_sysctl();
+	/*
+	 * The idle tasks have their own, simple scheduling class:
+	 */
+	idle->sched_class = &idle_sched_class;
+	ftrace_graph_init_idle_task(idle, cpu);
+#if defined(CONFIG_SMP)
+	sprintf(idle->comm, "%s/%d", INIT_TASK_COMM, cpu);
+#endif
}

#ifdef CONFIG_SMP
@@ -6340,30 +5062,6 @@ static void calc_global_load_remove(struct rq *rq)
	rq->calc_load_active = 0;
}

-#ifdef CONFIG_CFS_BANDWIDTH
-static void unthrottle_offline_cfs_rqs(struct rq *rq)
-{
-	struct cfs_rq *cfs_rq;
-
-	for_each_leaf_cfs_rq(rq, cfs_rq) {
-		struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
-
-		if (!cfs_rq->runtime_enabled)
-			continue;
-
-		/*
-		 * clock_task is not advancing so we just need to make sure
-		 * there's some valid quota amount
-		 */
-		cfs_rq->runtime_remaining = cfs_b->quota;
-		if (cfs_rq_throttled(cfs_rq))
-			unthrottle_cfs_rq(cfs_rq);
-	}
-}
-#else
-static void unthrottle_offline_cfs_rqs(struct rq *rq) {}
-#endif
-
/*
 * Migrate all tasks from the rq, sleeping tasks will be migrated by
 * try_to_wake_up()->select_task_rq().
@@ -6969,6 +5667,12 @@ out:
	return -ENOMEM;
}

+/*
+ * By default the system creates a single root-domain with all cpus as
+ * members (mimicking the global state we have today).
+ */
+struct root_domain def_root_domain;
+
static void init_defrootdomain(void)
{
	init_rootdomain(&def_root_domain);
@@ -7237,7 +5941,7 @@ build_overlap_sched_groups(struct sched_domain *sd, int cpu)
			continue;

		sg = kzalloc_node(sizeof(struct sched_group) + cpumask_size(),
-				GFP_KERNEL, cpu_to_node(i));
+				GFP_KERNEL, cpu_to_node(cpu));

		if (!sg)
			goto fail;
@@ -7375,6 +6079,12 @@ static void init_sched_groups_power(int cpu, struct sched_domain *sd)
		return;

	update_group_power(sd, cpu);
+	atomic_set(&sg->sgp->nr_busy_cpus, sg->group_weight);
+}
+
+int __weak arch_sd_sibling_asym_packing(void)
+{
+       return 0*SD_ASYM_PACKING;
}

/*
@@ -8012,29 +6722,6 @@ static int cpuset_cpu_inactive(struct notifier_block *nfb, unsigned long action,
	}
}

-static int update_runtime(struct notifier_block *nfb,
-				unsigned long action, void *hcpu)
-{
-	int cpu = (int)(long)hcpu;
-
-	switch (action) {
-	case CPU_DOWN_PREPARE:
-	case CPU_DOWN_PREPARE_FROZEN:
-		disable_runtime(cpu_rq(cpu));
-		return NOTIFY_OK;
-
-	case CPU_DOWN_FAILED:
-	case CPU_DOWN_FAILED_FROZEN:
-	case CPU_ONLINE:
-	case CPU_ONLINE_FROZEN:
-		enable_runtime(cpu_rq(cpu));
-		return NOTIFY_OK;
-
-	default:
-		return NOTIFY_DONE;
-	}
-}
-
void __init sched_init_smp(void)
{
	cpumask_var_t non_isolated_cpus;
@@ -8083,104 +6770,11 @@ int in_sched_functions(unsigned long addr)
		&& addr < (unsigned long)__sched_text_end);
}

-static void init_cfs_rq(struct cfs_rq *cfs_rq)
-{
-	cfs_rq->tasks_timeline = RB_ROOT;
-	INIT_LIST_HEAD(&cfs_rq->tasks);
-	cfs_rq->min_vruntime = (u64)(-(1LL << 20));
-#ifndef CONFIG_64BIT
-	cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime;
-#endif
-}
-
-static void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq)
-{
-	struct rt_prio_array *array;
-	int i;
-
-	array = &rt_rq->active;
-	for (i = 0; i < MAX_RT_PRIO; i++) {
-		INIT_LIST_HEAD(array->queue + i);
-		__clear_bit(i, array->bitmap);
-	}
-	/* delimiter for bitsearch: */
-	__set_bit(MAX_RT_PRIO, array->bitmap);
-
-#if defined CONFIG_SMP
-	rt_rq->highest_prio.curr = MAX_RT_PRIO;
-	rt_rq->highest_prio.next = MAX_RT_PRIO;
-	rt_rq->rt_nr_migratory = 0;
-	rt_rq->overloaded = 0;
-	plist_head_init(&rt_rq->pushable_tasks);
-#endif
-
-	rt_rq->rt_time = 0;
-	rt_rq->rt_throttled = 0;
-	rt_rq->rt_runtime = 0;
-	raw_spin_lock_init(&rt_rq->rt_runtime_lock);
-}
-
-#ifdef CONFIG_FAIR_GROUP_SCHED
-static void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq,
-				struct sched_entity *se, int cpu,
-				struct sched_entity *parent)
-{
-	struct rq *rq = cpu_rq(cpu);
-
-	cfs_rq->tg = tg;
-	cfs_rq->rq = rq;
-#ifdef CONFIG_SMP
-	/* allow initial update_cfs_load() to truncate */
-	cfs_rq->load_stamp = 1;
-#endif
-	init_cfs_rq_runtime(cfs_rq);
-
-	tg->cfs_rq[cpu] = cfs_rq;
-	tg->se[cpu] = se;
-
-	/* se could be NULL for root_task_group */
-	if (!se)
-		return;
-
-	if (!parent)
-		se->cfs_rq = &rq->cfs;
-	else
-		se->cfs_rq = parent->my_q;
-
-	se->my_q = cfs_rq;
-	update_load_set(&se->load, 0);
-	se->parent = parent;
-}
+#ifdef CONFIG_CGROUP_SCHED
+struct task_group root_task_group;
#endif

-#ifdef CONFIG_RT_GROUP_SCHED
-static void init_tg_rt_entry(struct task_group *tg, struct rt_rq *rt_rq,
-		struct sched_rt_entity *rt_se, int cpu,
-		struct sched_rt_entity *parent)
-{
-	struct rq *rq = cpu_rq(cpu);
-
-	rt_rq->highest_prio.curr = MAX_RT_PRIO;
-	rt_rq->rt_nr_boosted = 0;
-	rt_rq->rq = rq;
-	rt_rq->tg = tg;
-
-	tg->rt_rq[cpu] = rt_rq;
-	tg->rt_se[cpu] = rt_se;
-
-	if (!rt_se)
-		return;
-
-	if (!parent)
-		rt_se->rt_rq = &rq->rt;
-	else
-		rt_se->rt_rq = parent->my_q;
-
-	rt_se->my_q = rt_rq;
-	rt_se->parent = parent;
-	INIT_LIST_HEAD(&rt_se->run_list);
-}
-#endif
+DECLARE_PER_CPU(cpumask_var_t, load_balance_tmpmask);

void __init sched_init(void)
{
@@ -8238,9 +6832,17 @@ void __init sched_init(void)
#ifdef CONFIG_CGROUP_SCHED
	list_add(&root_task_group.list, &task_groups);
	INIT_LIST_HEAD(&root_task_group.children);
+	INIT_LIST_HEAD(&root_task_group.siblings);
	autogroup_init(&init_task);
+
#endif /* CONFIG_CGROUP_SCHED */

+#ifdef CONFIG_CGROUP_CPUACCT
+	root_cpuacct.cpustat = &kernel_cpustat;
+	root_cpuacct.cpuusage = alloc_percpu(u64);
+	/* Too early, not expected to fail */
+	BUG_ON(!root_cpuacct.cpuusage);
+#endif
	for_each_possible_cpu(i) {
		struct rq *rq;

@@ -8252,7 +6854,7 @@ void __init sched_init(void)
		init_cfs_rq(&rq->cfs);
		init_rt_rq(&rq->rt, rq);
#ifdef CONFIG_FAIR_GROUP_SCHED
-		root_task_group.shares = root_task_group_load;
+		root_task_group.shares = ROOT_TASK_GROUP_LOAD;
		INIT_LIST_HEAD(&rq->leaf_cfs_rq_list);
		/*
		 * How much cpu bandwidth does root_task_group get?
@@ -8302,7 +6904,7 @@ void __init sched_init(void)
		rq->avg_idle = 2*sysctl_sched_migration_cost;
		rq_attach_root(rq, &def_root_domain);
#ifdef CONFIG_NO_HZ
-		rq->nohz_balance_kick = 0;
+		rq->nohz_flags = 0;
#endif
#endif
		init_rq_hrtick(rq);
@@ -8315,10 +6917,6 @@ void __init sched_init(void)
	INIT_HLIST_HEAD(&init_task.preempt_notifiers);
#endif

-#ifdef CONFIG_SMP
-	open_softirq(SCHED_SOFTIRQ, run_rebalance_domains);
-#endif
-
#ifdef CONFIG_RT_MUTEXES
	plist_head_init(&init_task.pi_waiters);
#endif
@@ -8346,17 +6944,11 @@ void __init sched_init(void)

#ifdef CONFIG_SMP
	zalloc_cpumask_var(&sched_domains_tmpmask, GFP_NOWAIT);
-#ifdef CONFIG_NO_HZ
-	zalloc_cpumask_var(&nohz.idle_cpus_mask, GFP_NOWAIT);
-	alloc_cpumask_var(&nohz.grp_idle_mask, GFP_NOWAIT);
-	atomic_set(&nohz.load_balancer, nr_cpu_ids);
-	atomic_set(&nohz.first_pick_cpu, nr_cpu_ids);
-	atomic_set(&nohz.second_pick_cpu, nr_cpu_ids);
-#endif
	/* May be allocated at isolcpus cmdline parse time */
	if (cpu_isolated_map == NULL)
		zalloc_cpumask_var(&cpu_isolated_map, GFP_NOWAIT);
-#endif /* SMP */
+#endif
+	init_sched_fair_class();

	scheduler_running = 1;
}
@@ -8508,169 +7100,14 @@ void set_curr_task(int cpu, struct task_struct *p)

#endif

-#ifdef CONFIG_FAIR_GROUP_SCHED
-static void free_fair_sched_group(struct task_group *tg)
-{
-	int i;
-
-	destroy_cfs_bandwidth(tg_cfs_bandwidth(tg));
-
-	for_each_possible_cpu(i) {
-		if (tg->cfs_rq)
-			kfree(tg->cfs_rq[i]);
-		if (tg->se)
-			kfree(tg->se[i]);
-	}
-
-	kfree(tg->cfs_rq);
-	kfree(tg->se);
-}
-
-static
-int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
-{
-	struct cfs_rq *cfs_rq;
-	struct sched_entity *se;
-	int i;
-
-	tg->cfs_rq = kzalloc(sizeof(cfs_rq) * nr_cpu_ids, GFP_KERNEL);
-	if (!tg->cfs_rq)
-		goto err;
-	tg->se = kzalloc(sizeof(se) * nr_cpu_ids, GFP_KERNEL);
-	if (!tg->se)
-		goto err;
-
-	tg->shares = NICE_0_LOAD;
-
-	init_cfs_bandwidth(tg_cfs_bandwidth(tg));
-
-	for_each_possible_cpu(i) {
-		cfs_rq = kzalloc_node(sizeof(struct cfs_rq),
-				      GFP_KERNEL, cpu_to_node(i));
-		if (!cfs_rq)
-			goto err;
-
-		se = kzalloc_node(sizeof(struct sched_entity),
-				  GFP_KERNEL, cpu_to_node(i));
-		if (!se)
-			goto err_free_rq;
-
-		init_cfs_rq(cfs_rq);
-		init_tg_cfs_entry(tg, cfs_rq, se, i, parent->se[i]);
-	}
-
-	return 1;
-
-err_free_rq:
-	kfree(cfs_rq);
-err:
-	return 0;
-}
-
-static inline void unregister_fair_sched_group(struct task_group *tg, int cpu)
-{
-	struct rq *rq = cpu_rq(cpu);
-	unsigned long flags;
-
-	/*
-	 * Only empty task groups can be destroyed; so we can speculatively
-	 * check on_list without danger of it being re-added.
-	 */
-	if (!tg->cfs_rq[cpu]->on_list)
-		return;
-
-	raw_spin_lock_irqsave(&rq->lock, flags);
-	list_del_leaf_cfs_rq(tg->cfs_rq[cpu]);
-	raw_spin_unlock_irqrestore(&rq->lock, flags);
-}
-#else /* !CONFIG_FAIR_GROUP_SCHED */
-static inline void free_fair_sched_group(struct task_group *tg)
-{
-}
-
-static inline
-int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
-{
-	return 1;
-}
-
-static inline void unregister_fair_sched_group(struct task_group *tg, int cpu)
-{
-}
-#endif /* CONFIG_FAIR_GROUP_SCHED */
-
#ifdef CONFIG_RT_GROUP_SCHED
-static void free_rt_sched_group(struct task_group *tg)
-{
-	int i;
-
-	if (tg->rt_se)
-		destroy_rt_bandwidth(&tg->rt_bandwidth);
-
-	for_each_possible_cpu(i) {
-		if (tg->rt_rq)
-			kfree(tg->rt_rq[i]);
-		if (tg->rt_se)
-			kfree(tg->rt_se[i]);
-	}
-
-	kfree(tg->rt_rq);
-	kfree(tg->rt_se);
-}
-
-static
-int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent)
-{
-	struct rt_rq *rt_rq;
-	struct sched_rt_entity *rt_se;
-	int i;
-
-	tg->rt_rq = kzalloc(sizeof(rt_rq) * nr_cpu_ids, GFP_KERNEL);
-	if (!tg->rt_rq)
-		goto err;
-	tg->rt_se = kzalloc(sizeof(rt_se) * nr_cpu_ids, GFP_KERNEL);
-	if (!tg->rt_se)
-		goto err;
-
-	init_rt_bandwidth(&tg->rt_bandwidth,
-			ktime_to_ns(def_rt_bandwidth.rt_period), 0);
-
-	for_each_possible_cpu(i) {
-		rt_rq = kzalloc_node(sizeof(struct rt_rq),
-				     GFP_KERNEL, cpu_to_node(i));
-		if (!rt_rq)
-			goto err;
-
-		rt_se = kzalloc_node(sizeof(struct sched_rt_entity),
-				     GFP_KERNEL, cpu_to_node(i));
-		if (!rt_se)
-			goto err_free_rq;
-
-		init_rt_rq(rt_rq, cpu_rq(i));
-		rt_rq->rt_runtime = tg->rt_bandwidth.rt_runtime;
-		init_tg_rt_entry(tg, rt_rq, rt_se, i, parent->rt_se[i]);
-	}
-
-	return 1;
-
-err_free_rq:
-	kfree(rt_rq);
-err:
-	return 0;
-}
#else /* !CONFIG_RT_GROUP_SCHED */
-static inline void free_rt_sched_group(struct task_group *tg)
-{
-}
-
-static inline
-int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent)
-{
-	return 1;
-}
#endif /* CONFIG_RT_GROUP_SCHED */

#ifdef CONFIG_CGROUP_SCHED
+/* task_group_lock serializes the addition/removal of task groups */
+static DEFINE_SPINLOCK(task_group_lock);
+
static void free_sched_group(struct task_group *tg)
{
	free_fair_sched_group(tg);
@@ -8776,47 +7213,6 @@ void sched_move_task(struct task_struct *tsk)
#endif /* CONFIG_CGROUP_SCHED */

#ifdef CONFIG_FAIR_GROUP_SCHED
-static DEFINE_MUTEX(shares_mutex);
-
-int sched_group_set_shares(struct task_group *tg, unsigned long shares)
-{
-	int i;
-	unsigned long flags;
-
-	/*
-	 * We can't change the weight of the root cgroup.
-	 */
-	if (!tg->se[0])
-		return -EINVAL;
-
-	shares = clamp(shares, scale_load(MIN_SHARES), scale_load(MAX_SHARES));
-
-	mutex_lock(&shares_mutex);
-	if (tg->shares == shares)
-		goto done;
-
-	tg->shares = shares;
-	for_each_possible_cpu(i) {
-		struct rq *rq = cpu_rq(i);
-		struct sched_entity *se;
-
-		se = tg->se[i];
-		/* Propagate contribution to hierarchy */
-		raw_spin_lock_irqsave(&rq->lock, flags);
-		for_each_sched_entity(se)
-			update_cfs_shares(group_cfs_rq(se));
-		raw_spin_unlock_irqrestore(&rq->lock, flags);
-	}
-
-done:
-	mutex_unlock(&shares_mutex);
-	return 0;
-}
-
-unsigned long sched_group_shares(struct task_group *tg)
-{
-	return tg->shares;
-}
#endif

#if defined(CONFIG_RT_GROUP_SCHED) || defined(CONFIG_CFS_BANDWIDTH)
@@ -8841,7 +7237,7 @@ static inline int tg_has_rt_tasks(struct task_group *tg)
	struct task_struct *g, *p;

	do_each_thread(g, p) {
-		if (rt_task(p) && rt_rq_of_se(&p->rt)->tg == tg)
+		if (rt_task(p) && task_rq(p)->rt.tg == tg)
			return 1;
	} while_each_thread(g, p);

@@ -9192,8 +7588,8 @@ static int __cfs_schedulable(struct task_group *tg, u64 period, u64 runtime);

static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota)
{
-	int i, ret = 0, runtime_enabled;
-	struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(tg);
+	int i, ret = 0, runtime_enabled, runtime_was_enabled;
+	struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth;

	if (tg == &root_task_group)
		return -EINVAL;
@@ -9220,6 +7616,8 @@ static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota)
		goto out_unlock;

	runtime_enabled = quota != RUNTIME_INF;
+	runtime_was_enabled = cfs_b->quota != RUNTIME_INF;
+	account_cfs_bandwidth_used(runtime_enabled, runtime_was_enabled);
	raw_spin_lock_irq(&cfs_b->lock);
	cfs_b->period = ns_to_ktime(period);
	cfs_b->quota = quota;
@@ -9235,13 +7633,13 @@ static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota)

	for_each_possible_cpu(i) {
		struct cfs_rq *cfs_rq = tg->cfs_rq[i];
-		struct rq *rq = rq_of(cfs_rq);
+		struct rq *rq = cfs_rq->rq;

		raw_spin_lock_irq(&rq->lock);
		cfs_rq->runtime_enabled = runtime_enabled;
		cfs_rq->runtime_remaining = 0;

-		if (cfs_rq_throttled(cfs_rq))
+		if (cfs_rq->throttled)
			unthrottle_cfs_rq(cfs_rq);
		raw_spin_unlock_irq(&rq->lock);
	}
@@ -9255,7 +7653,7 @@ int tg_set_cfs_quota(struct task_group *tg, long cfs_quota_us)
{
	u64 quota, period;

-	period = ktime_to_ns(tg_cfs_bandwidth(tg)->period);
+	period = ktime_to_ns(tg->cfs_bandwidth.period);
	if (cfs_quota_us < 0)
		quota = RUNTIME_INF;
	else
@@ -9268,10 +7666,10 @@ long tg_get_cfs_quota(struct task_group *tg)
{
	u64 quota_us;

-	if (tg_cfs_bandwidth(tg)->quota == RUNTIME_INF)
+	if (tg->cfs_bandwidth.quota == RUNTIME_INF)
		return -1;

-	quota_us = tg_cfs_bandwidth(tg)->quota;
+	quota_us = tg->cfs_bandwidth.quota;
	do_div(quota_us, NSEC_PER_USEC);

	return quota_us;
@@ -9282,7 +7680,7 @@ int tg_set_cfs_period(struct task_group *tg, long cfs_period_us)
	u64 quota, period;

	period = (u64)cfs_period_us * NSEC_PER_USEC;
-	quota = tg_cfs_bandwidth(tg)->quota;
+	quota = tg->cfs_bandwidth.quota;

	if (period <= 0)
		return -EINVAL;
@@ -9294,7 +7692,7 @@ long tg_get_cfs_period(struct task_group *tg)
{
	u64 cfs_period_us;

-	cfs_period_us = ktime_to_ns(tg_cfs_bandwidth(tg)->period);
+	cfs_period_us = ktime_to_ns(tg->cfs_bandwidth.period);
	do_div(cfs_period_us, NSEC_PER_USEC);

	return cfs_period_us;
@@ -9354,13 +7752,13 @@ static u64 normalize_cfs_quota(struct task_group *tg,
static int tg_cfs_schedulable_down(struct task_group *tg, void *data)
{
	struct cfs_schedulable_data *d = data;
-	struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(tg);
+	struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth;
	s64 quota = 0, parent_quota = -1;

	if (!tg->parent) {
		quota = RUNTIME_INF;
	} else {
-		struct cfs_bandwidth *parent_b = tg_cfs_bandwidth(tg->parent);
+		struct cfs_bandwidth *parent_b = &tg->parent->cfs_bandwidth;

		quota = normalize_cfs_quota(tg, d);
		parent_quota = parent_b->hierarchal_quota;
@@ -9404,7 +7802,7 @@ static int cpu_stats_show(struct cgroup *cgrp, struct cftype *cft,
		struct cgroup_map_cb *cb)
{
	struct task_group *tg = cgroup_tg(cgrp);
-	struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(tg);
+	struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth;

	cb->fill(cb, "nr_periods", cfs_b->nr_periods);
	cb->fill(cb, "nr_throttled", cfs_b->nr_throttled);
@@ -9505,38 +7903,16 @@ struct cgroup_subsys cpu_cgroup_subsys = {
 * (balbir@in.ibm.com).
 */

-/* track cpu usage of a group of tasks and its child groups */
-struct cpuacct {
-	struct cgroup_subsys_state css;
-	/* cpuusage holds pointer to a u64-type object on every cpu */
-	u64 __percpu *cpuusage;
-	struct percpu_counter cpustat[CPUACCT_STAT_NSTATS];
-	struct cpuacct *parent;
-};
-
-struct cgroup_subsys cpuacct_subsys;
-
-/* return cpu accounting group corresponding to this container */
-static inline struct cpuacct *cgroup_ca(struct cgroup *cgrp)
-{
-	return container_of(cgroup_subsys_state(cgrp, cpuacct_subsys_id),
-			    struct cpuacct, css);
-}
-
-/* return cpu accounting group to which this task belongs */
-static inline struct cpuacct *task_ca(struct task_struct *tsk)
-{
-	return container_of(task_subsys_state(tsk, cpuacct_subsys_id),
-			    struct cpuacct, css);
-}
-
/* create a new cpu accounting group */
static struct cgroup_subsys_state *cpuacct_create(
	struct cgroup_subsys *ss, struct cgroup *cgrp)
{
-	struct cpuacct *ca = kzalloc(sizeof(*ca), GFP_KERNEL);
-	int i;
+	struct cpuacct *ca;
+
+	if (!cgrp->parent)
+		return &root_cpuacct.css;
+
+	ca = kzalloc(sizeof(*ca), GFP_KERNEL);
	if (!ca)
		goto out;

@@ -9544,18 +7920,13 @@ static struct cgroup_subsys_state *cpuacct_create(
	if (!ca->cpuusage)
		goto out_free_ca;

-	for (i = 0; i < CPUACCT_STAT_NSTATS; i++)
-		if (percpu_counter_init(&ca->cpustat[i], 0))
-			goto out_free_counters;
-
-	if (cgrp->parent)
-		ca->parent = cgroup_ca(cgrp->parent);
+	ca->cpustat = alloc_percpu(struct kernel_cpustat);
+	if (!ca->cpustat)
+		goto out_free_cpuusage;

	return &ca->css;

-out_free_counters:
-	while (--i >= 0)
-		percpu_counter_destroy(&ca->cpustat[i]);
+out_free_cpuusage:
	free_percpu(ca->cpuusage);
out_free_ca:
	kfree(ca);
@@ -9568,10 +7939,8 @@ static void
cpuacct_destroy(struct cgroup_subsys *ss, struct cgroup *cgrp)
{
	struct cpuacct *ca = cgroup_ca(cgrp);
-	int i;

-	for (i = 0; i < CPUACCT_STAT_NSTATS; i++)
-		percpu_counter_destroy(&ca->cpustat[i]);
+	free_percpu(ca->cpustat);
	free_percpu(ca->cpuusage);
	kfree(ca);
}
@@ -9664,16 +8033,31 @@ static const char *cpuacct_stat_desc[] = {
};

static int cpuacct_stats_show(struct cgroup *cgrp, struct cftype *cft,
-		struct cgroup_map_cb *cb)
+			      struct cgroup_map_cb *cb)
{
	struct cpuacct *ca = cgroup_ca(cgrp);
-	int i;
+	int cpu;
+	s64 val = 0;
+
+	for_each_online_cpu(cpu) {
+		struct kernel_cpustat *kcpustat = per_cpu_ptr(ca->cpustat, cpu);
+		val += kcpustat->cpustat[CPUTIME_USER];
+		val += kcpustat->cpustat[CPUTIME_NICE];
+	}
+	val = cputime64_to_clock_t(val);
+	cb->fill(cb, cpuacct_stat_desc[CPUACCT_STAT_USER], val);

-	for (i = 0; i < CPUACCT_STAT_NSTATS; i++) {
-		s64 val = percpu_counter_read(&ca->cpustat[i]);
-		val = cputime64_to_clock_t(val);
-		cb->fill(cb, cpuacct_stat_desc[i], val);
+	val = 0;
+	for_each_online_cpu(cpu) {
+		struct kernel_cpustat *kcpustat = per_cpu_ptr(ca->cpustat, cpu);
+		val += kcpustat->cpustat[CPUTIME_SYSTEM];
+		val += kcpustat->cpustat[CPUTIME_IRQ];
+		val += kcpustat->cpustat[CPUTIME_SOFTIRQ];
	}
+
+	val = cputime64_to_clock_t(val);
+	cb->fill(cb, cpuacct_stat_desc[CPUACCT_STAT_SYSTEM], val);
+
	return 0;
}

@@ -9703,7 +8087,7 @@ static int cpuacct_populate(struct cgroup_subsys *ss, struct cgroup *cgrp)
 *
 * called with rq->lock held.
 */
-static void cpuacct_charge(struct task_struct *tsk, u64 cputime)
+void cpuacct_charge(struct task_struct *tsk, u64 cputime)
{
	struct cpuacct *ca;
	int cpu;
@@ -9717,7 +8101,7 @@ static void cpuacct_charge(struct task_struct *tsk, u64 cputime)

	ca = task_ca(tsk);

-	for (; ca; ca = ca->parent) {
+	for (; ca; ca = parent_ca(ca)) {
		u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu);
		*cpuusage += cputime;
	}
@@ -9725,46 +8109,6 @@ static void cpuacct_charge(struct task_struct *tsk, u64 cputime)
	rcu_read_unlock();
}

-/*
- * When CONFIG_VIRT_CPU_ACCOUNTING is enabled one jiffy can be very large
- * in cputime_t units. As a result, cpuacct_update_stats calls
- * percpu_counter_add with values large enough to always overflow the
- * per cpu batch limit causing bad SMP scalability.
- *
- * To fix this we scale percpu_counter_batch by cputime_one_jiffy so we
- * batch the same amount of time with CONFIG_VIRT_CPU_ACCOUNTING disabled
- * and enabled. We cap it at INT_MAX which is the largest allowed batch value.
- */
-#ifdef CONFIG_SMP
-#define CPUACCT_BATCH \
-	min_t(long, percpu_counter_batch * cputime_one_jiffy, INT_MAX)
-#else
-#define CPUACCT_BATCH	0
-#endif
-
-/*
- * Charge the system/user time to the task's accounting group.
- */
-static void cpuacct_update_stats(struct task_struct *tsk,
-		enum cpuacct_stat_index idx, cputime_t val)
-{
-	struct cpuacct *ca;
-	int batch = CPUACCT_BATCH;
-
-	if (unlikely(!cpuacct_subsys.active))
-		return;
-
-	rcu_read_lock();
-	ca = task_ca(tsk);
-
-	do {
-		__percpu_counter_add(&ca->cpustat[idx],
-				(__force s64) val, batch);
-		ca = ca->parent;
-	} while (ca);
-	rcu_read_unlock();
-}
-
struct cgroup_subsys cpuacct_subsys = {
	.name		= "cpuacct",
	.create		= cpuacct_create,