8 năm trước cách đây · af79ad2b1f
--- a/Documentation/scheduler/sched-deadline.txt
+++ b/Documentation/scheduler/sched-deadline.txt
@@ -16,6 +16,7 @@ CONTENTS
 
				    4.1 System-wide settings
			
 
				    4.2 Task interface
			
 
				    4.3 Default behavior
			
 
				+   4.4 Behavior of sched_yield()
			
 
				  5. Tasks CPU affinity
			
 
				    5.1 SCHED_DEADLINE and cpusets HOWTO
			
 
				  6. Future plans
			
@@ -426,6 +427,23 @@ CONTENTS
 
				  Finally, notice that in order not to jeopardize the admission control a
			
 
				  -deadline task cannot fork.
			
 
				 
			
 
				+
			
 
				+4.4 Behavior of sched_yield()
			
 
				+-----------------------------
			
 
				+
			
 
				+ When a SCHED_DEADLINE task calls sched_yield(), it gives up its
			
 
				+ remaining runtime and is immediately throttled, until the next
			
 
				+ period, when its runtime will be replenished (a special flag
			
 
				+ dl_yielded is set and used to correctly handle throttling and runtime
			
 
				+ replenishment after a call to sched_yield()).
			
 
				+
			
 
				+ This behavior of sched_yield() allows the task to wake up exactly at
			
 
				+ the beginning of the next period. Also, this may be useful in the
			
 
				+ future with bandwidth reclaiming mechanisms, where sched_yield() will
			
 
				+ make the leftover runtime available for reclamation by other
			
 
				+ SCHED_DEADLINE tasks.
			
 
				+
			
 
				+
			
 
				 5. Tasks CPU affinity
			
 
				 =====================
			
 
				 
			
--- a/arch/ia64/kernel/mca.c
+++ b/arch/ia64/kernel/mca.c
@@ -986,7 +986,7 @@ ia64_mca_modify_original_stack(struct pt_regs *regs,
 
				 	int cpu = smp_processor_id();
			
 
				 
			
 
				 	previous_current = curr_task(cpu);
			
 
				-	set_curr_task(cpu, current);
			
 
				+	ia64_set_curr_task(cpu, current);
			
 
				 	if ((p = strchr(current->comm, ' ')))
			
 
				 		*p = '\0';
			
 
				 
			
@@ -1360,14 +1360,14 @@ ia64_mca_handler(struct pt_regs *regs, struct switch_stack *sw,
 
				 				cpumask_clear_cpu(i, &mca_cpu);	/* wake next cpu */
			
 
				 				while (monarch_cpu != -1)
			
 
				 					cpu_relax();	/* spin until last cpu leaves */
			
 
				-				set_curr_task(cpu, previous_current);
			
 
				+				ia64_set_curr_task(cpu, previous_current);
			
 
				 				ia64_mc_info.imi_rendez_checkin[cpu]
			
 
				 						= IA64_MCA_RENDEZ_CHECKIN_NOTDONE;
			
 
				 				return;
			
 
				 			}
			
 
				 		}
			
 
				 	}
			
 
				-	set_curr_task(cpu, previous_current);
			
 
				+	ia64_set_curr_task(cpu, previous_current);
			
 
				 	ia64_mc_info.imi_rendez_checkin[cpu] = IA64_MCA_RENDEZ_CHECKIN_NOTDONE;
			
 
				 	monarch_cpu = -1;	/* This frees the slaves and previous monarchs */
			
 
				 }
			
@@ -1729,7 +1729,7 @@ ia64_init_handler(struct pt_regs *regs, struct switch_stack *sw,
 
				 		NOTIFY_INIT(DIE_INIT_SLAVE_LEAVE, regs, (long)&nd, 1);
			
 
				 
			
 
				 		mprintk("Slave on cpu %d returning to normal service.\n", cpu);
			
 
				-		set_curr_task(cpu, previous_current);
			
 
				+		ia64_set_curr_task(cpu, previous_current);
			
 
				 		ia64_mc_info.imi_rendez_checkin[cpu] = IA64_MCA_RENDEZ_CHECKIN_NOTDONE;
			
 
				 		atomic_dec(&slaves);
			
 
				 		return;
			
@@ -1756,7 +1756,7 @@ ia64_init_handler(struct pt_regs *regs, struct switch_stack *sw,
 
				 
			
 
				 	mprintk("\nINIT dump complete.  Monarch on cpu %d returning to normal service.\n", cpu);
			
 
				 	atomic_dec(&monarchs);
			
 
				-	set_curr_task(cpu, previous_current);
			
 
				+	ia64_set_curr_task(cpu, previous_current);
			
 
				 	monarch_cpu = -1;
			
 
				 	return;
			
 
				 }
			
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -471,7 +471,7 @@ static bool match_die(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o)
 
				 	return false;
			
 
				 }
			
 
				 
			
 
				-static struct sched_domain_topology_level numa_inside_package_topology[] = {
			
 
				+static struct sched_domain_topology_level x86_numa_in_package_topology[] = {
			
 
				 #ifdef CONFIG_SCHED_SMT
			
 
				 	{ cpu_smt_mask, cpu_smt_flags, SD_INIT_NAME(SMT) },
			
 
				 #endif
			
@@ -480,22 +480,23 @@ static struct sched_domain_topology_level numa_inside_package_topology[] = {
 
				 #endif
			
 
				 	{ NULL, },
			
 
				 };
			
 
				+
			
 
				+static struct sched_domain_topology_level x86_topology[] = {
			
 
				+#ifdef CONFIG_SCHED_SMT
			
 
				+	{ cpu_smt_mask, cpu_smt_flags, SD_INIT_NAME(SMT) },
			
 
				+#endif
			
 
				+#ifdef CONFIG_SCHED_MC
			
 
				+	{ cpu_coregroup_mask, cpu_core_flags, SD_INIT_NAME(MC) },
			
 
				+#endif
			
 
				+	{ cpu_cpu_mask, SD_INIT_NAME(DIE) },
			
 
				+	{ NULL, },
			
 
				+};
			
 
				+
			
 
				 /*
			
 
				- * set_sched_topology() sets the topology internal to a CPU.  The
			
 
				- * NUMA topologies are layered on top of it to build the full
			
 
				- * system topology.
			
 
				- *
			
 
				- * If NUMA nodes are observed to occur within a CPU package, this
			
 
				- * function should be called.  It forces the sched domain code to
			
 
				- * only use the SMT level for the CPU portion of the topology.
			
 
				- * This essentially falls back to relying on NUMA information
			
 
				- * from the SRAT table to describe the entire system topology
			
 
				- * (except for hyperthreads).
			
 
				+ * Set if a package/die has multiple NUMA nodes inside.
			
 
				+ * AMD Magny-Cours and Intel Cluster-on-Die have this.
			
 
				  */
			
 
				-static void primarily_use_numa_for_topology(void)
			
 
				-{
			
 
				-	set_sched_topology(numa_inside_package_topology);
			
 
				-}
			
 
				+static bool x86_has_numa_in_package;
			
 
				 
			
 
				 void set_cpu_sibling_map(int cpu)
			
 
				 {
			
@@ -558,7 +559,7 @@ void set_cpu_sibling_map(int cpu)
 
				 				c->booted_cores = cpu_data(i).booted_cores;
			
 
				 		}
			
 
				 		if (match_die(c, o) && !topology_same_node(c, o))
			
 
				-			primarily_use_numa_for_topology();
			
 
				+			x86_has_numa_in_package = true;
			
 
				 	}
			
 
				 
			
 
				 	threads = cpumask_weight(topology_sibling_cpumask(cpu));
			
@@ -1304,6 +1305,16 @@ void __init native_smp_prepare_cpus(unsigned int max_cpus)
 
				 		zalloc_cpumask_var(&per_cpu(cpu_core_map, i), GFP_KERNEL);
			
 
				 		zalloc_cpumask_var(&per_cpu(cpu_llc_shared_map, i), GFP_KERNEL);
			
 
				 	}
			
 
				+
			
 
				+	/*
			
 
				+	 * Set 'default' x86 topology, this matches default_topology() in that
			
 
				+	 * it has NUMA nodes as a topology level. See also
			
 
				+	 * native_smp_cpus_done().
			
 
				+	 *
			
 
				+	 * Must be done before set_cpu_sibling_map() is run.
			
 
				+	 */
			
 
				+	set_sched_topology(x86_topology);
			
 
				+
			
 
				 	set_cpu_sibling_map(0);
			
 
				 
			
 
				 	switch (smp_sanity_check(max_cpus)) {
			
@@ -1370,6 +1381,9 @@ void __init native_smp_cpus_done(unsigned int max_cpus)
 
				 {
			
 
				 	pr_debug("Boot done\n");
			
 
				 
			
 
				+	if (x86_has_numa_in_package)
			
 
				+		set_sched_topology(x86_numa_in_package_topology);
			
 
				+
			
 
				 	nmi_selftest();
			
 
				 	impress_friends();
			
 
				 	setup_ioapic_dest();
			
--- a/include/linux/kernel.h
+++ b/include/linux/kernel.h
@@ -259,17 +259,14 @@ static inline void might_fault(void) { }
 
				 extern struct atomic_notifier_head panic_notifier_list;
			
 
				 extern long (*panic_blink)(int state);
			
 
				 __printf(1, 2)
			
 
				-void panic(const char *fmt, ...)
			
 
				-	__noreturn __cold;
			
 
				+void panic(const char *fmt, ...) __noreturn __cold;
			
 
				 void nmi_panic(struct pt_regs *regs, const char *msg);
			
 
				 extern void oops_enter(void);
			
 
				 extern void oops_exit(void);
			
 
				 void print_oops_end_marker(void);
			
 
				 extern int oops_may_print(void);
			
 
				-void do_exit(long error_code)
			
 
				-	__noreturn;
			
 
				-void complete_and_exit(struct completion *, long)
			
 
				-	__noreturn;
			
 
				+void do_exit(long error_code) __noreturn;
			
 
				+void complete_and_exit(struct completion *, long) __noreturn;
			
 
				 
			
 
				 /* Internal, do not use. */
			
 
				 int __must_check _kstrtoul(const char *s, unsigned int base, unsigned long *res);
			
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -448,6 +448,8 @@ static inline void io_schedule(void)
 
				 	io_schedule_timeout(MAX_SCHEDULE_TIMEOUT);
			
 
				 }
			
 
				 
			
 
				+void __noreturn do_task_dead(void);
			
 
				+
			
 
				 struct nsproxy;
			
 
				 struct user_namespace;
			
 
				 
			
@@ -1022,7 +1024,8 @@ extern void wake_up_q(struct wake_q_head *head);
 
				 #define SD_BALANCE_FORK		0x0008	/* Balance on fork, clone */
			
 
				 #define SD_BALANCE_WAKE		0x0010  /* Balance on wakeup */
			
 
				 #define SD_WAKE_AFFINE		0x0020	/* Wake task to waking CPU */
			
 
				-#define SD_SHARE_CPUCAPACITY	0x0080	/* Domain members share cpu power */
			
 
				+#define SD_ASYM_CPUCAPACITY	0x0040  /* Groups have different max cpu capacities */
			
 
				+#define SD_SHARE_CPUCAPACITY	0x0080	/* Domain members share cpu capacity */
			
 
				 #define SD_SHARE_POWERDOMAIN	0x0100	/* Domain members share power domain */
			
 
				 #define SD_SHARE_PKG_RESOURCES	0x0200	/* Domain members share cpu pkg resources */
			
 
				 #define SD_SERIALIZE		0x0400	/* Only a single load balancing instance */
			
@@ -1064,6 +1067,12 @@ extern int sched_domain_level_max;
 
				 
			
 
				 struct sched_group;
			
 
				 
			
 
				+struct sched_domain_shared {
			
 
				+	atomic_t	ref;
			
 
				+	atomic_t	nr_busy_cpus;
			
 
				+	int		has_idle_cores;
			
 
				+};
			
 
				+
			
 
				 struct sched_domain {
			
 
				 	/* These fields must be setup */
			
 
				 	struct sched_domain *parent;	/* top domain must be null terminated */
			
@@ -1094,6 +1103,8 @@ struct sched_domain {
 
				 	u64 max_newidle_lb_cost;
			
 
				 	unsigned long next_decay_max_lb_cost;
			
 
				 
			
 
				+	u64 avg_scan_cost;		/* select_idle_sibling */
			
 
				+
			
 
				 #ifdef CONFIG_SCHEDSTATS
			
 
				 	/* load_balance() stats */
			
 
				 	unsigned int lb_count[CPU_MAX_IDLE_TYPES];
			
@@ -1132,6 +1143,7 @@ struct sched_domain {
 
				 		void *private;		/* used during construction */
			
 
				 		struct rcu_head rcu;	/* used during destruction */
			
 
				 	};
			
 
				+	struct sched_domain_shared *shared;
			
 
				 
			
 
				 	unsigned int span_weight;
			
 
				 	/*
			
@@ -1165,6 +1177,7 @@ typedef int (*sched_domain_flags_f)(void);
 
				 
			
 
				 struct sd_data {
			
 
				 	struct sched_domain **__percpu sd;
			
 
				+	struct sched_domain_shared **__percpu sds;
			
 
				 	struct sched_group **__percpu sg;
			
 
				 	struct sched_group_capacity **__percpu sgc;
			
 
				 };
			
@@ -2568,7 +2581,7 @@ static inline bool is_idle_task(const struct task_struct *p)
 
				 	return p->pid == 0;
			
 
				 }
			
 
				 extern struct task_struct *curr_task(int cpu);
			
 
				-extern void set_curr_task(int cpu, struct task_struct *p);
			
 
				+extern void ia64_set_curr_task(int cpu, struct task_struct *p);
			
 
				 
			
 
				 void yield(void);
			
 
				 
			
@@ -3206,7 +3219,11 @@ static inline int signal_pending_state(long state, struct task_struct *p)
 
				  * cond_resched_lock() will drop the spinlock before scheduling,
			
 
				  * cond_resched_softirq() will enable bhs before scheduling.
			
 
				  */
			
 
				+#ifndef CONFIG_PREEMPT
			
 
				 extern int _cond_resched(void);
			
 
				+#else
			
 
				+static inline int _cond_resched(void) { return 0; }
			
 
				+#endif
			
 
				 
			
 
				 #define cond_resched() ({			\
			
 
				 	___might_sleep(__FILE__, __LINE__, 0);	\
			
@@ -3236,6 +3253,15 @@ static inline void cond_resched_rcu(void)
 
				 #endif
			
 
				 }
			
 
				 
			
 
				+static inline unsigned long get_preempt_disable_ip(struct task_struct *p)
			
 
				+{
			
 
				+#ifdef CONFIG_DEBUG_PREEMPT
			
 
				+	return p->preempt_disable_ip;
			
 
				+#else
			
 
				+	return 0;
			
 
				+#endif
			
 
				+}
			
 
				+
			
 
				 /*
			
 
				  * Does a critical section need to be broken due to another
			
 
				  * task waiting?: (technically does not depend on CONFIG_PREEMPT,
			
--- a/include/linux/u64_stats_sync.h
+++ b/include/linux/u64_stats_sync.h
@@ -103,31 +103,42 @@ static inline void u64_stats_update_end_raw(struct u64_stats_sync *syncp)
 
				 #endif
			
 
				 }
			
 
				 
			
 
				-static inline unsigned int u64_stats_fetch_begin(const struct u64_stats_sync *syncp)
			
 
				+static inline unsigned int __u64_stats_fetch_begin(const struct u64_stats_sync *syncp)
			
 
				 {
			
 
				 #if BITS_PER_LONG==32 && defined(CONFIG_SMP)
			
 
				 	return read_seqcount_begin(&syncp->seq);
			
 
				 #else
			
 
				-#if BITS_PER_LONG==32
			
 
				-	preempt_disable();
			
 
				-#endif
			
 
				 	return 0;
			
 
				 #endif
			
 
				 }
			
 
				 
			
 
				-static inline bool u64_stats_fetch_retry(const struct u64_stats_sync *syncp,
			
 
				+static inline unsigned int u64_stats_fetch_begin(const struct u64_stats_sync *syncp)
			
 
				+{
			
 
				+#if BITS_PER_LONG==32 && !defined(CONFIG_SMP)
			
 
				+	preempt_disable();
			
 
				+#endif
			
 
				+	return __u64_stats_fetch_begin(syncp);
			
 
				+}
			
 
				+
			
 
				+static inline bool __u64_stats_fetch_retry(const struct u64_stats_sync *syncp,
			
 
				 					 unsigned int start)
			
 
				 {
			
 
				 #if BITS_PER_LONG==32 && defined(CONFIG_SMP)
			
 
				 	return read_seqcount_retry(&syncp->seq, start);
			
 
				 #else
			
 
				-#if BITS_PER_LONG==32
			
 
				-	preempt_enable();
			
 
				-#endif
			
 
				 	return false;
			
 
				 #endif
			
 
				 }
			
 
				 
			
 
				+static inline bool u64_stats_fetch_retry(const struct u64_stats_sync *syncp,
			
 
				+					 unsigned int start)
			
 
				+{
			
 
				+#if BITS_PER_LONG==32 && !defined(CONFIG_SMP)
			
 
				+	preempt_enable();
			
 
				+#endif
			
 
				+	return __u64_stats_fetch_retry(syncp, start);
			
 
				+}
			
 
				+
			
 
				 /*
			
 
				  * In case irq handlers can update u64 counters, readers can use following helpers
			
 
				  * - SMP 32bit arches use seqcount protection, irq safe.
			
@@ -136,27 +147,19 @@ static inline bool u64_stats_fetch_retry(const struct u64_stats_sync *syncp,
 
				  */
			
 
				 static inline unsigned int u64_stats_fetch_begin_irq(const struct u64_stats_sync *syncp)
			
 
				 {
			
 
				-#if BITS_PER_LONG==32 && defined(CONFIG_SMP)
			
 
				-	return read_seqcount_begin(&syncp->seq);
			
 
				-#else
			
 
				-#if BITS_PER_LONG==32
			
 
				+#if BITS_PER_LONG==32 && !defined(CONFIG_SMP)
			
 
				 	local_irq_disable();
			
 
				 #endif
			
 
				-	return 0;
			
 
				-#endif
			
 
				+	return __u64_stats_fetch_begin(syncp);
			
 
				 }
			
 
				 
			
 
				 static inline bool u64_stats_fetch_retry_irq(const struct u64_stats_sync *syncp,
			
 
				-					 unsigned int start)
			
 
				+					     unsigned int start)
			
 
				 {
			
 
				-#if BITS_PER_LONG==32 && defined(CONFIG_SMP)
			
 
				-	return read_seqcount_retry(&syncp->seq, start);
			
 
				-#else
			
 
				-#if BITS_PER_LONG==32
			
 
				+#if BITS_PER_LONG==32 && !defined(CONFIG_SMP)
			
 
				 	local_irq_enable();
			
 
				 #endif
			
 
				-	return false;
			
 
				-#endif
			
 
				+	return __u64_stats_fetch_retry(syncp, start);
			
 
				 }
			
 
				 
			
 
				 #endif /* _LINUX_U64_STATS_SYNC_H */
			
--- a/include/linux/wait.h
+++ b/include/linux/wait.h
@@ -248,6 +248,8 @@ wait_queue_head_t *bit_waitqueue(void *, int);
 
				 	(!__builtin_constant_p(state) ||				\
			
 
				 		state == TASK_INTERRUPTIBLE || state == TASK_KILLABLE)	\
			
 
				 
			
 
				+extern void init_wait_entry(wait_queue_t *__wait, int flags);
			
 
				+
			
 
				 /*
			
 
				  * The below macro ___wait_event() has an explicit shadow of the __ret
			
 
				  * variable when used from the wait_event_*() macros.
			
@@ -266,12 +268,7 @@ wait_queue_head_t *bit_waitqueue(void *, int);
 
				 	wait_queue_t __wait;						\
			
 
				 	long __ret = ret;	/* explicit shadow */			\
			
 
				 									\
			
 
				-	INIT_LIST_HEAD(&__wait.task_list);				\
			
 
				-	if (exclusive)							\
			
 
				-		__wait.flags = WQ_FLAG_EXCLUSIVE;			\
			
 
				-	else								\
			
 
				-		__wait.flags = 0;					\
			
 
				-									\
			
 
				+	init_wait_entry(&__wait, exclusive ? WQ_FLAG_EXCLUSIVE : 0);	\
			
 
				 	for (;;) {							\
			
 
				 		long __int = prepare_to_wait_event(&wq, &__wait, state);\
			
 
				 									\
			
@@ -280,12 +277,7 @@ wait_queue_head_t *bit_waitqueue(void *, int);
 
				 									\
			
 
				 		if (___wait_is_interruptible(state) && __int) {		\
			
 
				 			__ret = __int;					\
			
 
				-			if (exclusive) {				\
			
 
				-				abort_exclusive_wait(&wq, &__wait,	\
			
 
				-						     state, NULL);	\
			
 
				-				goto __out;				\
			
 
				-			}						\
			
 
				-			break;						\
			
 
				+			goto __out;					\
			
 
				 		}							\
			
 
				 									\
			
 
				 		cmd;							\
			
@@ -989,7 +981,6 @@ void prepare_to_wait(wait_queue_head_t *q, wait_queue_t *wait, int state);
 
				 void prepare_to_wait_exclusive(wait_queue_head_t *q, wait_queue_t *wait, int state);
			
 
				 long prepare_to_wait_event(wait_queue_head_t *q, wait_queue_t *wait, int state);
			
 
				 void finish_wait(wait_queue_head_t *q, wait_queue_t *wait);
			
 
				-void abort_exclusive_wait(wait_queue_head_t *q, wait_queue_t *wait, unsigned int mode, void *key);
			
 
				 long wait_woken(wait_queue_t *wait, unsigned mode, long timeout);
			
 
				 int woken_wake_function(wait_queue_t *wait, unsigned mode, int sync, void *key);
			
 
				 int autoremove_wake_function(wait_queue_t *wait, unsigned mode, int sync, void *key);
			
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -725,7 +725,7 @@ static void check_stack_usage(void)
 
				 static inline void check_stack_usage(void) {}
			
 
				 #endif
			
 
				 
			
 
				-void do_exit(long code)
			
 
				+void __noreturn do_exit(long code)
			
 
				 {
			
 
				 	struct task_struct *tsk = current;
			
 
				 	int group_dead;
			
@@ -882,29 +882,7 @@ void do_exit(long code)
 
				 	exit_rcu();
			
 
				 	TASKS_RCU(__srcu_read_unlock(&tasks_rcu_exit_srcu, tasks_rcu_i));
			
 
				 
			
 
				-	/*
			
 
				-	 * The setting of TASK_RUNNING by try_to_wake_up() may be delayed
			
 
				-	 * when the following two conditions become true.
			
 
				-	 *   - There is race condition of mmap_sem (It is acquired by
			
 
				-	 *     exit_mm()), and
			
 
				-	 *   - SMI occurs before setting TASK_RUNINNG.
			
 
				-	 *     (or hypervisor of virtual machine switches to other guest)
			
 
				-	 *  As a result, we may become TASK_RUNNING after becoming TASK_DEAD
			
 
				-	 *
			
 
				-	 * To avoid it, we have to wait for releasing tsk->pi_lock which
			
 
				-	 * is held by try_to_wake_up()
			
 
				-	 */
			
 
				-	smp_mb();
			
 
				-	raw_spin_unlock_wait(&tsk->pi_lock);
			
 
				-
			
 
				-	/* causes final put_task_struct in finish_task_switch(). */
			
 
				-	tsk->state = TASK_DEAD;
			
 
				-	tsk->flags |= PF_NOFREEZE;	/* tell freezer to ignore us */
			
 
				-	schedule();
			
 
				-	BUG();
			
 
				-	/* Avoid "noreturn function does return".  */
			
 
				-	for (;;)
			
 
				-		cpu_relax();	/* For when BUG is null */
			
 
				+	do_task_dead();
			
 
				 }
			
 
				 EXPORT_SYMBOL_GPL(do_exit);
			
 
				 
			
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -1070,8 +1070,12 @@ static int migration_cpu_stop(void *data)
 
				 	 * holding rq->lock, if p->on_rq == 0 it cannot get enqueued because
			
 
				 	 * we're holding p->pi_lock.
			
 
				 	 */
			
 
				-	if (task_rq(p) == rq && task_on_rq_queued(p))
			
 
				-		rq = __migrate_task(rq, p, arg->dest_cpu);
			
 
				+	if (task_rq(p) == rq) {
			
 
				+		if (task_on_rq_queued(p))
			
 
				+			rq = __migrate_task(rq, p, arg->dest_cpu);
			
 
				+		else
			
 
				+			p->wake_cpu = arg->dest_cpu;
			
 
				+	}
			
 
				 	raw_spin_unlock(&rq->lock);
			
 
				 	raw_spin_unlock(&p->pi_lock);
			
 
				 
			
@@ -1112,10 +1116,10 @@ void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
 
				 
			
 
				 	p->sched_class->set_cpus_allowed(p, new_mask);
			
 
				 
			
 
				-	if (running)
			
 
				-		p->sched_class->set_curr_task(rq);
			
 
				 	if (queued)
			
 
				 		enqueue_task(rq, p, ENQUEUE_RESTORE);
			
 
				+	if (running)
			
 
				+		set_curr_task(rq, p);
			
 
				 }
			
 
				 
			
 
				 /*
			
@@ -1272,7 +1276,7 @@ static void __migrate_swap_task(struct task_struct *p, int cpu)
 
				 		/*
			
 
				 		 * Task isn't running anymore; make it appear like we migrated
			
 
				 		 * it before it went to sleep. This means on wakeup we make the
			
 
				-		 * previous cpu our targer instead of where it really is.
			
 
				+		 * previous cpu our target instead of where it really is.
			
 
				 		 */
			
 
				 		p->wake_cpu = cpu;
			
 
				 	}
			
@@ -1636,23 +1640,25 @@ static inline int __set_cpus_allowed_ptr(struct task_struct *p,
 
				 static void
			
 
				 ttwu_stat(struct task_struct *p, int cpu, int wake_flags)
			
 
				 {
			
 
				-#ifdef CONFIG_SCHEDSTATS
			
 
				-	struct rq *rq = this_rq();
			
 
				+	struct rq *rq;
			
 
				 
			
 
				-#ifdef CONFIG_SMP
			
 
				-	int this_cpu = smp_processor_id();
			
 
				+	if (!schedstat_enabled())
			
 
				+		return;
			
 
				+
			
 
				+	rq = this_rq();
			
 
				 
			
 
				-	if (cpu == this_cpu) {
			
 
				-		schedstat_inc(rq, ttwu_local);
			
 
				-		schedstat_inc(p, se.statistics.nr_wakeups_local);
			
 
				+#ifdef CONFIG_SMP
			
 
				+	if (cpu == rq->cpu) {
			
 
				+		schedstat_inc(rq->ttwu_local);
			
 
				+		schedstat_inc(p->se.statistics.nr_wakeups_local);
			
 
				 	} else {
			
 
				 		struct sched_domain *sd;
			
 
				 
			
 
				-		schedstat_inc(p, se.statistics.nr_wakeups_remote);
			
 
				+		schedstat_inc(p->se.statistics.nr_wakeups_remote);
			
 
				 		rcu_read_lock();
			
 
				-		for_each_domain(this_cpu, sd) {
			
 
				+		for_each_domain(rq->cpu, sd) {
			
 
				 			if (cpumask_test_cpu(cpu, sched_domain_span(sd))) {
			
 
				-				schedstat_inc(sd, ttwu_wake_remote);
			
 
				+				schedstat_inc(sd->ttwu_wake_remote);
			
 
				 				break;
			
 
				 			}
			
 
				 		}
			
@@ -1660,17 +1666,14 @@ ttwu_stat(struct task_struct *p, int cpu, int wake_flags)
 
				 	}
			
 
				 
			
 
				 	if (wake_flags & WF_MIGRATED)
			
 
				-		schedstat_inc(p, se.statistics.nr_wakeups_migrate);
			
 
				-
			
 
				+		schedstat_inc(p->se.statistics.nr_wakeups_migrate);
			
 
				 #endif /* CONFIG_SMP */
			
 
				 
			
 
				-	schedstat_inc(rq, ttwu_count);
			
 
				-	schedstat_inc(p, se.statistics.nr_wakeups);
			
 
				+	schedstat_inc(rq->ttwu_count);
			
 
				+	schedstat_inc(p->se.statistics.nr_wakeups);
			
 
				 
			
 
				 	if (wake_flags & WF_SYNC)
			
 
				-		schedstat_inc(p, se.statistics.nr_wakeups_sync);
			
 
				-
			
 
				-#endif /* CONFIG_SCHEDSTATS */
			
 
				+		schedstat_inc(p->se.statistics.nr_wakeups_sync);
			
 
				 }
			
 
				 
			
 
				 static inline void ttwu_activate(struct rq *rq, struct task_struct *p, int en_flags)
			
@@ -2091,8 +2094,7 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
 
				 
			
 
				 	ttwu_queue(p, cpu, wake_flags);
			
 
				 stat:
			
 
				-	if (schedstat_enabled())
			
 
				-		ttwu_stat(p, cpu, wake_flags);
			
 
				+	ttwu_stat(p, cpu, wake_flags);
			
 
				 out:
			
 
				 	raw_spin_unlock_irqrestore(&p->pi_lock, flags);
			
 
				 
			
@@ -2102,6 +2104,7 @@ out:
 
				 /**
			
 
				  * try_to_wake_up_local - try to wake up a local task with rq lock held
			
 
				  * @p: the thread to be awakened
			
 
				+ * @cookie: context's cookie for pinning
			
 
				  *
			
 
				  * Put @p on the run-queue if it's not already there. The caller must
			
 
				  * ensure that this_rq() is locked, @p is bound to this_rq() and not
			
@@ -2140,8 +2143,7 @@ static void try_to_wake_up_local(struct task_struct *p, struct pin_cookie cookie
 
				 		ttwu_activate(rq, p, ENQUEUE_WAKEUP);
			
 
				 
			
 
				 	ttwu_do_wakeup(rq, p, 0, cookie);
			
 
				-	if (schedstat_enabled())
			
 
				-		ttwu_stat(p, smp_processor_id(), 0);
			
 
				+	ttwu_stat(p, smp_processor_id(), 0);
			
 
				 out:
			
 
				 	raw_spin_unlock(&p->pi_lock);
			
 
				 }
			
@@ -3199,6 +3201,9 @@ static inline void preempt_latency_stop(int val) { }
 
				  */
			
 
				 static noinline void __schedule_bug(struct task_struct *prev)
			
 
				 {
			
 
				+	/* Save this before calling printk(), since that will clobber it */
			
 
				+	unsigned long preempt_disable_ip = get_preempt_disable_ip(current);
			
 
				+
			
 
				 	if (oops_in_progress)
			
 
				 		return;
			
 
				 
			
@@ -3209,13 +3214,12 @@ static noinline void __schedule_bug(struct task_struct *prev)
 
				 	print_modules();
			
 
				 	if (irqs_disabled())
			
 
				 		print_irqtrace_events(prev);
			
 
				-#ifdef CONFIG_DEBUG_PREEMPT
			
 
				-	if (in_atomic_preempt_off()) {
			
 
				+	if (IS_ENABLED(CONFIG_DEBUG_PREEMPT)
			
 
				+	    && in_atomic_preempt_off()) {
			
 
				 		pr_err("Preemption disabled at:");
			
 
				-		print_ip_sym(current->preempt_disable_ip);
			
 
				+		print_ip_sym(preempt_disable_ip);
			
 
				 		pr_cont("\n");
			
 
				 	}
			
 
				-#endif
			
 
				 	if (panic_on_warn)
			
 
				 		panic("scheduling while atomic\n");
			
 
				 
			
@@ -3241,7 +3245,7 @@ static inline void schedule_debug(struct task_struct *prev)
 
				 
			
 
				 	profile_hit(SCHED_PROFILING, __builtin_return_address(0));
			
 
				 
			
 
				-	schedstat_inc(this_rq(), sched_count);
			
 
				+	schedstat_inc(this_rq()->sched_count);
			
 
				 }
			
 
				 
			
 
				 /*
			
@@ -3334,17 +3338,6 @@ static void __sched notrace __schedule(bool preempt)
 
				 	rq = cpu_rq(cpu);
			
 
				 	prev = rq->curr;
			
 
				 
			
 
				-	/*
			
 
				-	 * do_exit() calls schedule() with preemption disabled as an exception;
			
 
				-	 * however we must fix that up, otherwise the next task will see an
			
 
				-	 * inconsistent (higher) preempt count.
			
 
				-	 *
			
 
				-	 * It also avoids the below schedule_debug() test from complaining
			
 
				-	 * about this.
			
 
				-	 */
			
 
				-	if (unlikely(prev->state == TASK_DEAD))
			
 
				-		preempt_enable_no_resched_notrace();
			
 
				-
			
 
				 	schedule_debug(prev);
			
 
				 
			
 
				 	if (sched_feat(HRTICK))
			
@@ -3412,6 +3405,33 @@ static void __sched notrace __schedule(bool preempt)
 
				 }
			
 
				 STACK_FRAME_NON_STANDARD(__schedule); /* switch_to() */
			
 
				 
			
 
				+void __noreturn do_task_dead(void)
			
 
				+{
			
 
				+	/*
			
 
				+	 * The setting of TASK_RUNNING by try_to_wake_up() may be delayed
			
 
				+	 * when the following two conditions become true.
			
 
				+	 *   - There is race condition of mmap_sem (It is acquired by
			
 
				+	 *     exit_mm()), and
			
 
				+	 *   - SMI occurs before setting TASK_RUNNING.
			
 
				+	 *     (or hypervisor of virtual machine switches to other guest)
			
 
				+	 *  As a result, we may become TASK_RUNNING after becoming TASK_DEAD
			
 
				+	 *
			
 
				+	 * To avoid it, we have to wait for releasing tsk->pi_lock which
			
 
				+	 * is held by try_to_wake_up()
			
 
				+	 */
			
 
				+	smp_mb();
			
 
				+	raw_spin_unlock_wait(&current->pi_lock);
			
 
				+
			
 
				+	/* causes final put_task_struct in finish_task_switch(). */
			
 
				+	__set_current_state(TASK_DEAD);
			
 
				+	current->flags |= PF_NOFREEZE;	/* tell freezer to ignore us */
			
 
				+	__schedule(false);
			
 
				+	BUG();
			
 
				+	/* Avoid "noreturn function does return".  */
			
 
				+	for (;;)
			
 
				+		cpu_relax();	/* For when BUG is null */
			
 
				+}
			
 
				+
			
 
				 static inline void sched_submit_work(struct task_struct *tsk)
			
 
				 {
			
 
				 	if (!tsk->state || tsk_is_pi_blocked(tsk))
			
@@ -3694,10 +3714,10 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
 
				 
			
 
				 	p->prio = prio;
			
 
				 
			
 
				-	if (running)
			
 
				-		p->sched_class->set_curr_task(rq);
			
 
				 	if (queued)
			
 
				 		enqueue_task(rq, p, queue_flag);
			
 
				+	if (running)
			
 
				+		set_curr_task(rq, p);
			
 
				 
			
 
				 	check_class_changed(rq, p, prev_class, oldprio);
			
 
				 out_unlock:
			
@@ -3711,7 +3731,8 @@ out_unlock:
 
				 
			
 
				 void set_user_nice(struct task_struct *p, long nice)
			
 
				 {
			
 
				-	int old_prio, delta, queued;
			
 
				+	bool queued, running;
			
 
				+	int old_prio, delta;
			
 
				 	struct rq_flags rf;
			
 
				 	struct rq *rq;
			
 
				 
			
@@ -3733,8 +3754,11 @@ void set_user_nice(struct task_struct *p, long nice)
 
				 		goto out_unlock;
			
 
				 	}
			
 
				 	queued = task_on_rq_queued(p);
			
 
				+	running = task_current(rq, p);
			
 
				 	if (queued)
			
 
				 		dequeue_task(rq, p, DEQUEUE_SAVE);
			
 
				+	if (running)
			
 
				+		put_prev_task(rq, p);
			
 
				 
			
 
				 	p->static_prio = NICE_TO_PRIO(nice);
			
 
				 	set_load_weight(p);
			
@@ -3751,6 +3775,8 @@ void set_user_nice(struct task_struct *p, long nice)
 
				 		if (delta < 0 || (delta > 0 && task_running(rq, p)))
			
 
				 			resched_curr(rq);
			
 
				 	}
			
 
				+	if (running)
			
 
				+		set_curr_task(rq, p);
			
 
				 out_unlock:
			
 
				 	task_rq_unlock(rq, p, &rf);
			
 
				 }
			
@@ -4250,8 +4276,6 @@ change:
 
				 	prev_class = p->sched_class;
			
 
				 	__setscheduler(rq, p, attr, pi);
			
 
				 
			
 
				-	if (running)
			
 
				-		p->sched_class->set_curr_task(rq);
			
 
				 	if (queued) {
			
 
				 		/*
			
 
				 		 * We enqueue to tail when the priority of a task is
			
@@ -4262,6 +4286,8 @@ change:
 
				 
			
 
				 		enqueue_task(rq, p, queue_flags);
			
 
				 	}
			
 
				+	if (running)
			
 
				+		set_curr_task(rq, p);
			
 
				 
			
 
				 	check_class_changed(rq, p, prev_class, oldprio);
			
 
				 	preempt_disable(); /* avoid rq from going away on us */
			
@@ -4853,7 +4879,7 @@ SYSCALL_DEFINE0(sched_yield)
 
				 {
			
 
				 	struct rq *rq = this_rq_lock();
			
 
				 
			
 
				-	schedstat_inc(rq, yld_count);
			
 
				+	schedstat_inc(rq->yld_count);
			
 
				 	current->sched_class->yield_task(rq);
			
 
				 
			
 
				 	/*
			
@@ -4870,6 +4896,7 @@ SYSCALL_DEFINE0(sched_yield)
 
				 	return 0;
			
 
				 }
			
 
				 
			
 
				+#ifndef CONFIG_PREEMPT
			
 
				 int __sched _cond_resched(void)
			
 
				 {
			
 
				 	if (should_resched(0)) {
			
@@ -4879,6 +4906,7 @@ int __sched _cond_resched(void)
 
				 	return 0;
			
 
				 }
			
 
				 EXPORT_SYMBOL(_cond_resched);
			
 
				+#endif
			
 
				 
			
 
				 /*
			
 
				  * __cond_resched_lock() - if a reschedule is pending, drop the given lock,
			
@@ -5004,7 +5032,7 @@ again:
 
				 
			
 
				 	yielded = curr->sched_class->yield_to_task(rq, p, preempt);
			
 
				 	if (yielded) {
			
 
				-		schedstat_inc(rq, yld_count);
			
 
				+		schedstat_inc(rq->yld_count);
			
 
				 		/*
			
 
				 		 * Make p's CPU reschedule; pick_next_entity takes care of
			
 
				 		 * fairness.
			
@@ -5424,10 +5452,10 @@ void sched_setnuma(struct task_struct *p, int nid)
 
				 
			
 
				 	p->numa_preferred_nid = nid;
			
 
				 
			
 
				-	if (running)
			
 
				-		p->sched_class->set_curr_task(rq);
			
 
				 	if (queued)
			
 
				 		enqueue_task(rq, p, ENQUEUE_RESTORE);
			
 
				+	if (running)
			
 
				+		set_curr_task(rq, p);
			
 
				 	task_rq_unlock(rq, p, &rf);
			
 
				 }
			
 
				 #endif /* CONFIG_NUMA_BALANCING */
			
@@ -5724,6 +5752,8 @@ static void sched_domain_debug(struct sched_domain *sd, int cpu)
 
				 	}
			
 
				 }
			
 
				 #else /* !CONFIG_SCHED_DEBUG */
			
 
				+
			
 
				+# define sched_debug_enabled 0
			
 
				 # define sched_domain_debug(sd, cpu) do { } while (0)
			
 
				 static inline bool sched_debug(void)
			
 
				 {
			
@@ -5742,6 +5772,7 @@ static int sd_degenerate(struct sched_domain *sd)
 
				 			 SD_BALANCE_FORK |
			
 
				 			 SD_BALANCE_EXEC |
			
 
				 			 SD_SHARE_CPUCAPACITY |
			
 
				+			 SD_ASYM_CPUCAPACITY |
			
 
				 			 SD_SHARE_PKG_RESOURCES |
			
 
				 			 SD_SHARE_POWERDOMAIN)) {
			
 
				 		if (sd->groups != sd->groups->next)
			
@@ -5772,6 +5803,7 @@ sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent)
 
				 				SD_BALANCE_NEWIDLE |
			
 
				 				SD_BALANCE_FORK |
			
 
				 				SD_BALANCE_EXEC |
			
 
				+				SD_ASYM_CPUCAPACITY |
			
 
				 				SD_SHARE_CPUCAPACITY |
			
 
				 				SD_SHARE_PKG_RESOURCES |
			
 
				 				SD_PREFER_SIBLING |
			
@@ -5916,10 +5948,8 @@ static void free_sched_groups(struct sched_group *sg, int free_sgc)
 
				 	} while (sg != first);
			
 
				 }
			
 
				 
			
 
				-static void free_sched_domain(struct rcu_head *rcu)
			
 
				+static void destroy_sched_domain(struct sched_domain *sd)
			
 
				 {
			
 
				-	struct sched_domain *sd = container_of(rcu, struct sched_domain, rcu);
			
 
				-
			
 
				 	/*
			
 
				 	 * If its an overlapping domain it has private groups, iterate and
			
 
				 	 * nuke them all.
			
@@ -5930,18 +5960,26 @@ static void free_sched_domain(struct rcu_head *rcu)
 
				 		kfree(sd->groups->sgc);
			
 
				 		kfree(sd->groups);
			
 
				 	}
			
 
				+	if (sd->shared && atomic_dec_and_test(&sd->shared->ref))
			
 
				+		kfree(sd->shared);
			
 
				 	kfree(sd);
			
 
				 }
			
 
				 
			
 
				-static void destroy_sched_domain(struct sched_domain *sd, int cpu)
			
 
				+static void destroy_sched_domains_rcu(struct rcu_head *rcu)
			
 
				 {
			
 
				-	call_rcu(&sd->rcu, free_sched_domain);
			
 
				+	struct sched_domain *sd = container_of(rcu, struct sched_domain, rcu);
			
 
				+
			
 
				+	while (sd) {
			
 
				+		struct sched_domain *parent = sd->parent;
			
 
				+		destroy_sched_domain(sd);
			
 
				+		sd = parent;
			
 
				+	}
			
 
				 }
			
 
				 
			
 
				-static void destroy_sched_domains(struct sched_domain *sd, int cpu)
			
 
				+static void destroy_sched_domains(struct sched_domain *sd)
			
 
				 {
			
 
				-	for (; sd; sd = sd->parent)
			
 
				-		destroy_sched_domain(sd, cpu);
			
 
				+	if (sd)
			
 
				+		call_rcu(&sd->rcu, destroy_sched_domains_rcu);
			
 
				 }
			
 
				 
			
 
				 /*
			
@@ -5956,14 +5994,14 @@ static void destroy_sched_domains(struct sched_domain *sd, int cpu)
 
				 DEFINE_PER_CPU(struct sched_domain *, sd_llc);
			
 
				 DEFINE_PER_CPU(int, sd_llc_size);
			
 
				 DEFINE_PER_CPU(int, sd_llc_id);
			
 
				+DEFINE_PER_CPU(struct sched_domain_shared *, sd_llc_shared);
			
 
				 DEFINE_PER_CPU(struct sched_domain *, sd_numa);
			
 
				-DEFINE_PER_CPU(struct sched_domain *, sd_busy);
			
 
				 DEFINE_PER_CPU(struct sched_domain *, sd_asym);
			
 
				 
			
 
				 static void update_top_cache_domain(int cpu)
			
 
				 {
			
 
				+	struct sched_domain_shared *sds = NULL;
			
 
				 	struct sched_domain *sd;
			
 
				-	struct sched_domain *busy_sd = NULL;
			
 
				 	int id = cpu;
			
 
				 	int size = 1;
			
 
				 
			
@@ -5971,13 +6009,13 @@ static void update_top_cache_domain(int cpu)
 
				 	if (sd) {
			
 
				 		id = cpumask_first(sched_domain_span(sd));
			
 
				 		size = cpumask_weight(sched_domain_span(sd));
			
 
				-		busy_sd = sd->parent; /* sd_busy */
			
 
				+		sds = sd->shared;
			
 
				 	}
			
 
				-	rcu_assign_pointer(per_cpu(sd_busy, cpu), busy_sd);
			
 
				 
			
 
				 	rcu_assign_pointer(per_cpu(sd_llc, cpu), sd);
			
 
				 	per_cpu(sd_llc_size, cpu) = size;
			
 
				 	per_cpu(sd_llc_id, cpu) = id;
			
 
				+	rcu_assign_pointer(per_cpu(sd_llc_shared, cpu), sds);
			
 
				 
			
 
				 	sd = lowest_flag_domain(cpu, SD_NUMA);
			
 
				 	rcu_assign_pointer(per_cpu(sd_numa, cpu), sd);
			
@@ -6013,7 +6051,7 @@ cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu)
 
				 			 */
			
 
				 			if (parent->flags & SD_PREFER_SIBLING)
			
 
				 				tmp->flags |= SD_PREFER_SIBLING;
			
 
				-			destroy_sched_domain(parent, cpu);
			
 
				+			destroy_sched_domain(parent);
			
 
				 		} else
			
 
				 			tmp = tmp->parent;
			
 
				 	}
			
@@ -6021,7 +6059,7 @@ cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu)
 
				 	if (sd && sd_degenerate(sd)) {
			
 
				 		tmp = sd;
			
 
				 		sd = sd->parent;
			
 
				-		destroy_sched_domain(tmp, cpu);
			
 
				+		destroy_sched_domain(tmp);
			
 
				 		if (sd)
			
 
				 			sd->child = NULL;
			
 
				 	}
			
@@ -6031,7 +6069,7 @@ cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu)
 
				 	rq_attach_root(rq, rd);
			
 
				 	tmp = rq->sd;
			
 
				 	rcu_assign_pointer(rq->sd, sd);
			
 
				-	destroy_sched_domains(tmp, cpu);
			
 
				+	destroy_sched_domains(tmp);
			
 
				 
			
 
				 	update_top_cache_domain(cpu);
			
 
				 }
			
@@ -6274,7 +6312,6 @@ static void init_sched_groups_capacity(int cpu, struct sched_domain *sd)
 
				 		return;
			
 
				 
			
 
				 	update_group_capacity(sd, cpu);
			
 
				-	atomic_set(&sg->sgc->nr_busy_cpus, sg->group_weight);
			
 
				 }
			
 
				 
			
 
				 /*
			
@@ -6362,6 +6399,9 @@ static void claim_allocations(int cpu, struct sched_domain *sd)
 
				 	WARN_ON_ONCE(*per_cpu_ptr(sdd->sd, cpu) != sd);
			
 
				 	*per_cpu_ptr(sdd->sd, cpu) = NULL;
			
 
				 
			
 
				+	if (atomic_read(&(*per_cpu_ptr(sdd->sds, cpu))->ref))
			
 
				+		*per_cpu_ptr(sdd->sds, cpu) = NULL;
			
 
				+
			
 
				 	if (atomic_read(&(*per_cpu_ptr(sdd->sg, cpu))->ref))
			
 
				 		*per_cpu_ptr(sdd->sg, cpu) = NULL;
			
 
				 
			
@@ -6381,26 +6421,37 @@ static int sched_domains_curr_level;
 
				 /*
			
 
				  * SD_flags allowed in topology descriptions.
			
 
				  *
			
 
				- * SD_SHARE_CPUCAPACITY      - describes SMT topologies
			
 
				- * SD_SHARE_PKG_RESOURCES - describes shared caches
			
 
				- * SD_NUMA                - describes NUMA topologies
			
 
				- * SD_SHARE_POWERDOMAIN   - describes shared power domain
			
 
				+ * These flags are purely descriptive of the topology and do not prescribe
			
 
				+ * behaviour. Behaviour is artificial and mapped in the below sd_init()
			
 
				+ * function:
			
 
				+ *
			
 
				+ *   SD_SHARE_CPUCAPACITY   - describes SMT topologies
			
 
				+ *   SD_SHARE_PKG_RESOURCES - describes shared caches
			
 
				+ *   SD_NUMA                - describes NUMA topologies
			
 
				+ *   SD_SHARE_POWERDOMAIN   - describes shared power domain
			
 
				+ *   SD_ASYM_CPUCAPACITY    - describes mixed capacity topologies
			
 
				+ *
			
 
				+ * Odd one out, which besides describing a topology quirk also
			
 
				+ * prescribes the desired behaviour that goes along with it:
			
 
				  *
			
 
				- * Odd one out:
			
 
				- * SD_ASYM_PACKING        - describes SMT quirks
			
 
				+ *   SD_ASYM_PACKING        - describes SMT quirks
			
 
				  */
			
 
				 #define TOPOLOGY_SD_FLAGS		\
			
 
				 	(SD_SHARE_CPUCAPACITY |		\
			
 
				 	 SD_SHARE_PKG_RESOURCES |	\
			
 
				 	 SD_NUMA |			\
			
 
				 	 SD_ASYM_PACKING |		\
			
 
				+	 SD_ASYM_CPUCAPACITY |		\
			
 
				 	 SD_SHARE_POWERDOMAIN)
			
 
				 
			
 
				 static struct sched_domain *
			
 
				-sd_init(struct sched_domain_topology_level *tl, int cpu)
			
 
				+sd_init(struct sched_domain_topology_level *tl,
			
 
				+	const struct cpumask *cpu_map,
			
 
				+	struct sched_domain *child, int cpu)
			
 
				 {
			
 
				-	struct sched_domain *sd = *per_cpu_ptr(tl->data.sd, cpu);
			
 
				-	int sd_weight, sd_flags = 0;
			
 
				+	struct sd_data *sdd = &tl->data;
			
 
				+	struct sched_domain *sd = *per_cpu_ptr(sdd->sd, cpu);
			
 
				+	int sd_id, sd_weight, sd_flags = 0;
			
 
				 
			
 
				 #ifdef CONFIG_NUMA
			
 
				 	/*
			
@@ -6449,15 +6500,26 @@ sd_init(struct sched_domain_topology_level *tl, int cpu)
 
				 		.smt_gain		= 0,
			
 
				 		.max_newidle_lb_cost	= 0,
			
 
				 		.next_decay_max_lb_cost	= jiffies,
			
 
				+		.child			= child,
			
 
				 #ifdef CONFIG_SCHED_DEBUG
			
 
				 		.name			= tl->name,
			
 
				 #endif
			
 
				 	};
			
 
				 
			
 
				+	cpumask_and(sched_domain_span(sd), cpu_map, tl->mask(cpu));
			
 
				+	sd_id = cpumask_first(sched_domain_span(sd));
			
 
				+
			
 
				 	/*
			
 
				 	 * Convert topological properties into behaviour.
			
 
				 	 */
			
 
				 
			
 
				+	if (sd->flags & SD_ASYM_CPUCAPACITY) {
			
 
				+		struct sched_domain *t = sd;
			
 
				+
			
 
				+		for_each_lower_domain(t)
			
 
				+			t->flags |= SD_BALANCE_WAKE;
			
 
				+	}
			
 
				+
			
 
				 	if (sd->flags & SD_SHARE_CPUCAPACITY) {
			
 
				 		sd->flags |= SD_PREFER_SIBLING;
			
 
				 		sd->imbalance_pct = 110;
			
@@ -6489,7 +6551,17 @@ sd_init(struct sched_domain_topology_level *tl, int cpu)
 
				 		sd->idle_idx = 1;
			
 
				 	}
			
 
				 
			
 
				-	sd->private = &tl->data;
			
 
				+	/*
			
 
				+	 * For all levels sharing cache; connect a sched_domain_shared
			
 
				+	 * instance.
			
 
				+	 */
			
 
				+	if (sd->flags & SD_SHARE_PKG_RESOURCES) {
			
 
				+		sd->shared = *per_cpu_ptr(sdd->sds, sd_id);
			
 
				+		atomic_inc(&sd->shared->ref);
			
 
				+		atomic_set(&sd->shared->nr_busy_cpus, sd_weight);
			
 
				+	}
			
 
				+
			
 
				+	sd->private = sdd;
			
 
				 
			
 
				 	return sd;
			
 
				 }
			
@@ -6516,6 +6588,9 @@ static struct sched_domain_topology_level *sched_domain_topology =
 
				 
			
 
				 void set_sched_topology(struct sched_domain_topology_level *tl)
			
 
				 {
			
 
				+	if (WARN_ON_ONCE(sched_smp_initialized))
			
 
				+		return;
			
 
				+
			
 
				 	sched_domain_topology = tl;
			
 
				 }
			
 
				 
			
@@ -6796,6 +6871,10 @@ static int __sdt_alloc(const struct cpumask *cpu_map)
 
				 		if (!sdd->sd)
			
 
				 			return -ENOMEM;
			
 
				 
			
 
				+		sdd->sds = alloc_percpu(struct sched_domain_shared *);
			
 
				+		if (!sdd->sds)
			
 
				+			return -ENOMEM;
			
 
				+
			
 
				 		sdd->sg = alloc_percpu(struct sched_group *);
			
 
				 		if (!sdd->sg)
			
 
				 			return -ENOMEM;
			
@@ -6806,6 +6885,7 @@ static int __sdt_alloc(const struct cpumask *cpu_map)
 
				 
			
 
				 		for_each_cpu(j, cpu_map) {
			
 
				 			struct sched_domain *sd;
			
 
				+			struct sched_domain_shared *sds;
			
 
				 			struct sched_group *sg;
			
 
				 			struct sched_group_capacity *sgc;
			
 
				 
			
@@ -6816,6 +6896,13 @@ static int __sdt_alloc(const struct cpumask *cpu_map)
 
				 
			
 
				 			*per_cpu_ptr(sdd->sd, j) = sd;
			
 
				 
			
 
				+			sds = kzalloc_node(sizeof(struct sched_domain_shared),
			
 
				+					GFP_KERNEL, cpu_to_node(j));
			
 
				+			if (!sds)
			
 
				+				return -ENOMEM;
			
 
				+
			
 
				+			*per_cpu_ptr(sdd->sds, j) = sds;
			
 
				+
			
 
				 			sg = kzalloc_node(sizeof(struct sched_group) + cpumask_size(),
			
 
				 					GFP_KERNEL, cpu_to_node(j));
			
 
				 			if (!sg)
			
@@ -6855,6 +6942,8 @@ static void __sdt_free(const struct cpumask *cpu_map)
 
				 				kfree(*per_cpu_ptr(sdd->sd, j));
			
 
				 			}
			
 
				 
			
 
				+			if (sdd->sds)
			
 
				+				kfree(*per_cpu_ptr(sdd->sds, j));
			
 
				 			if (sdd->sg)
			
 
				 				kfree(*per_cpu_ptr(sdd->sg, j));
			
 
				 			if (sdd->sgc)
			
@@ -6862,6 +6951,8 @@ static void __sdt_free(const struct cpumask *cpu_map)
 
				 		}
			
 
				 		free_percpu(sdd->sd);
			
 
				 		sdd->sd = NULL;
			
 
				+		free_percpu(sdd->sds);
			
 
				+		sdd->sds = NULL;
			
 
				 		free_percpu(sdd->sg);
			
 
				 		sdd->sg = NULL;
			
 
				 		free_percpu(sdd->sgc);
			
@@ -6873,16 +6964,12 @@ struct sched_domain *build_sched_domain(struct sched_domain_topology_level *tl,
 
				 		const struct cpumask *cpu_map, struct sched_domain_attr *attr,
			
 
				 		struct sched_domain *child, int cpu)
			
 
				 {
			
 
				-	struct sched_domain *sd = sd_init(tl, cpu);
			
 
				-	if (!sd)
			
 
				-		return child;
			
 
				+	struct sched_domain *sd = sd_init(tl, cpu_map, child, cpu);
			
 
				 
			
 
				-	cpumask_and(sched_domain_span(sd), cpu_map, tl->mask(cpu));
			
 
				 	if (child) {
			
 
				 		sd->level = child->level + 1;
			
 
				 		sched_domain_level_max = max(sched_domain_level_max, sd->level);
			
 
				 		child->parent = sd;
			
 
				-		sd->child = child;
			
 
				 
			
 
				 		if (!cpumask_subset(sched_domain_span(child),
			
 
				 				    sched_domain_span(sd))) {
			
@@ -6913,6 +7000,7 @@ static int build_sched_domains(const struct cpumask *cpu_map,
 
				 	enum s_alloc alloc_state;
			
 
				 	struct sched_domain *sd;
			
 
				 	struct s_data d;
			
 
				+	struct rq *rq = NULL;
			
 
				 	int i, ret = -ENOMEM;
			
 
				 
			
 
				 	alloc_state = __visit_domain_allocation_hell(&d, cpu_map);
			
@@ -6963,11 +7051,22 @@ static int build_sched_domains(const struct cpumask *cpu_map,
 
				 	/* Attach the domains */
			
 
				 	rcu_read_lock();
			
 
				 	for_each_cpu(i, cpu_map) {
			
 
				+		rq = cpu_rq(i);
			
 
				 		sd = *per_cpu_ptr(d.sd, i);
			
 
				+
			
 
				+		/* Use READ_ONCE()/WRITE_ONCE() to avoid load/store tearing: */
			
 
				+		if (rq->cpu_capacity_orig > READ_ONCE(d.rd->max_cpu_capacity))
			
 
				+			WRITE_ONCE(d.rd->max_cpu_capacity, rq->cpu_capacity_orig);
			
 
				+
			
 
				 		cpu_attach_domain(sd, d.rd, i);
			
 
				 	}
			
 
				 	rcu_read_unlock();
			
 
				 
			
 
				+	if (rq && sched_debug_enabled) {
			
 
				+		pr_info("span: %*pbl (max cpu_capacity = %lu)\n",
			
 
				+			cpumask_pr_args(cpu_map), rq->rd->max_cpu_capacity);
			
 
				+	}
			
 
				+
			
 
				 	ret = 0;
			
 
				 error:
			
 
				 	__free_domain_allocs(&d, alloc_state, cpu_map);
			
@@ -7326,6 +7425,22 @@ int sched_cpu_dying(unsigned int cpu)
 
				 }
			
 
				 #endif
			
 
				 
			
 
				+#ifdef CONFIG_SCHED_SMT
			
 
				+DEFINE_STATIC_KEY_FALSE(sched_smt_present);
			
 
				+
			
 
				+static void sched_init_smt(void)
			
 
				+{
			
 
				+	/*
			
 
				+	 * We've enumerated all CPUs and will assume that if any CPU
			
 
				+	 * has SMT siblings, CPU0 will too.
			
 
				+	 */
			
 
				+	if (cpumask_weight(cpu_smt_mask(0)) > 1)
			
 
				+		static_branch_enable(&sched_smt_present);
			
 
				+}
			
 
				+#else
			
 
				+static inline void sched_init_smt(void) { }
			
 
				+#endif
			
 
				+
			
 
				 void __init sched_init_smp(void)
			
 
				 {
			
 
				 	cpumask_var_t non_isolated_cpus;
			
@@ -7355,6 +7470,9 @@ void __init sched_init_smp(void)
 
				 
			
 
				 	init_sched_rt_class();
			
 
				 	init_sched_dl_class();
			
 
				+
			
 
				+	sched_init_smt();
			
 
				+
			
 
				 	sched_smp_initialized = true;
			
 
				 }
			
 
				 
			
@@ -7392,6 +7510,7 @@ static struct kmem_cache *task_group_cache __read_mostly;
 
				 #endif
			
 
				 
			
 
				 DECLARE_PER_CPU(cpumask_var_t, load_balance_mask);
			
 
				+DECLARE_PER_CPU(cpumask_var_t, select_idle_mask);
			
 
				 
			
 
				 void __init sched_init(void)
			
 
				 {
			
@@ -7428,6 +7547,8 @@ void __init sched_init(void)
 
				 	for_each_possible_cpu(i) {
			
 
				 		per_cpu(load_balance_mask, i) = (cpumask_var_t)kzalloc_node(
			
 
				 			cpumask_size(), GFP_KERNEL, cpu_to_node(i));
			
 
				+		per_cpu(select_idle_mask, i) = (cpumask_var_t)kzalloc_node(
			
 
				+			cpumask_size(), GFP_KERNEL, cpu_to_node(i));
			
 
				 	}
			
 
				 #endif /* CONFIG_CPUMASK_OFFSTACK */
			
 
				 
			
@@ -7530,21 +7651,12 @@ void __init sched_init(void)
 
				 
			
 
				 	set_load_weight(&init_task);
			
 
				 
			
 
				-#ifdef CONFIG_PREEMPT_NOTIFIERS
			
 
				-	INIT_HLIST_HEAD(&init_task.preempt_notifiers);
			
 
				-#endif
			
 
				-
			
 
				 	/*
			
 
				 	 * The boot idle thread does lazy MMU switching as well:
			
 
				 	 */
			
 
				 	atomic_inc(&init_mm.mm_count);
			
 
				 	enter_lazy_tlb(&init_mm, current);
			
 
				 
			
 
				-	/*
			
 
				-	 * During early bootup we pretend to be a normal task:
			
 
				-	 */
			
 
				-	current->sched_class = &fair_sched_class;
			
 
				-
			
 
				 	/*
			
 
				 	 * Make us the idle thread. Technically, schedule() should not be
			
 
				 	 * called from this thread, however somewhere below it might be,
			
@@ -7599,6 +7711,7 @@ EXPORT_SYMBOL(__might_sleep);
 
				 void ___might_sleep(const char *file, int line, int preempt_offset)
			
 
				 {
			
 
				 	static unsigned long prev_jiffy;	/* ratelimiting */
			
 
				+	unsigned long preempt_disable_ip;
			
 
				 
			
 
				 	rcu_sleep_check(); /* WARN_ON_ONCE() by default, no rate limit reqd. */
			
 
				 	if ((preempt_count_equals(preempt_offset) && !irqs_disabled() &&
			
@@ -7609,6 +7722,9 @@ void ___might_sleep(const char *file, int line, int preempt_offset)
 
				 		return;
			
 
				 	prev_jiffy = jiffies;
			
 
				 
			
 
				+	/* Save this before calling printk(), since that will clobber it */
			
 
				+	preempt_disable_ip = get_preempt_disable_ip(current);
			
 
				+
			
 
				 	printk(KERN_ERR
			
 
				 		"BUG: sleeping function called from invalid context at %s:%d\n",
			
 
				 			file, line);
			
@@ -7623,14 +7739,14 @@ void ___might_sleep(const char *file, int line, int preempt_offset)
 
				 	debug_show_held_locks(current);
			
 
				 	if (irqs_disabled())
			
 
				 		print_irqtrace_events(current);
			
 
				-#ifdef CONFIG_DEBUG_PREEMPT
			
 
				-	if (!preempt_count_equals(preempt_offset)) {
			
 
				+	if (IS_ENABLED(CONFIG_DEBUG_PREEMPT)
			
 
				+	    && !preempt_count_equals(preempt_offset)) {
			
 
				 		pr_err("Preemption disabled at:");
			
 
				-		print_ip_sym(current->preempt_disable_ip);
			
 
				+		print_ip_sym(preempt_disable_ip);
			
 
				 		pr_cont("\n");
			
 
				 	}
			
 
				-#endif
			
 
				 	dump_stack();
			
 
				+	add_taint(TAINT_WARN, LOCKDEP_STILL_OK);
			
 
				 }
			
 
				 EXPORT_SYMBOL(___might_sleep);
			
 
				 #endif
			
@@ -7651,12 +7767,10 @@ void normalize_rt_tasks(void)
 
				 		if (p->flags & PF_KTHREAD)
			
 
				 			continue;
			
 
				 
			
 
				-		p->se.exec_start		= 0;
			
 
				-#ifdef CONFIG_SCHEDSTATS
			
 
				-		p->se.statistics.wait_start	= 0;
			
 
				-		p->se.statistics.sleep_start	= 0;
			
 
				-		p->se.statistics.block_start	= 0;
			
 
				-#endif
			
 
				+		p->se.exec_start = 0;
			
 
				+		schedstat_set(p->se.statistics.wait_start,  0);
			
 
				+		schedstat_set(p->se.statistics.sleep_start, 0);
			
 
				+		schedstat_set(p->se.statistics.block_start, 0);
			
 
				 
			
 
				 		if (!dl_task(p) && !rt_task(p)) {
			
 
				 			/*
			
@@ -7717,7 +7831,7 @@ struct task_struct *curr_task(int cpu)
 
				  *
			
 
				  * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED!
			
 
				  */
			
 
				-void set_curr_task(int cpu, struct task_struct *p)
			
 
				+void ia64_set_curr_task(int cpu, struct task_struct *p)
			
 
				 {
			
 
				 	cpu_curr(cpu) = p;
			
 
				 }
			
@@ -7848,10 +7962,10 @@ void sched_move_task(struct task_struct *tsk)
 
				 
			
 
				 	sched_change_group(tsk, TASK_MOVE_GROUP);
			
 
				 
			
 
				-	if (unlikely(running))
			
 
				-		tsk->sched_class->set_curr_task(rq);
			
 
				 	if (queued)
			
 
				 		enqueue_task(rq, tsk, ENQUEUE_RESTORE | ENQUEUE_MOVE);
			
 
				+	if (unlikely(running))
			
 
				+		set_curr_task(rq, tsk);
			
 
				 
			
 
				 	task_rq_unlock(rq, tsk, &rf);
			
 
				 }
			
--- a/kernel/sched/cpudeadline.c
+++ b/kernel/sched/cpudeadline.c
@@ -31,56 +31,81 @@ static inline int right_child(int i)
 
				 	return (i << 1) + 2;
			
 
				 }
			
 
				 
			
 
				-static void cpudl_exchange(struct cpudl *cp, int a, int b)
			
 
				+static void cpudl_heapify_down(struct cpudl *cp, int idx)
			
 
				 {
			
 
				-	int cpu_a = cp->elements[a].cpu, cpu_b = cp->elements[b].cpu;
			
 
				+	int l, r, largest;
			
 
				 
			
 
				-	swap(cp->elements[a].cpu, cp->elements[b].cpu);
			
 
				-	swap(cp->elements[a].dl , cp->elements[b].dl );
			
 
				+	int orig_cpu = cp->elements[idx].cpu;
			
 
				+	u64 orig_dl = cp->elements[idx].dl;
			
 
				 
			
 
				-	swap(cp->elements[cpu_a].idx, cp->elements[cpu_b].idx);
			
 
				-}
			
 
				-
			
 
				-static void cpudl_heapify(struct cpudl *cp, int idx)
			
 
				-{
			
 
				-	int l, r, largest;
			
 
				+	if (left_child(idx) >= cp->size)
			
 
				+		return;
			
 
				 
			
 
				 	/* adapted from lib/prio_heap.c */
			
 
				 	while(1) {
			
 
				+		u64 largest_dl;
			
 
				 		l = left_child(idx);
			
 
				 		r = right_child(idx);
			
 
				 		largest = idx;
			
 
				+		largest_dl = orig_dl;
			
 
				 
			
 
				-		if ((l < cp->size) && dl_time_before(cp->elements[idx].dl,
			
 
				-							cp->elements[l].dl))
			
 
				+		if ((l < cp->size) && dl_time_before(orig_dl,
			
 
				+						cp->elements[l].dl)) {
			
 
				 			largest = l;
			
 
				-		if ((r < cp->size) && dl_time_before(cp->elements[largest].dl,
			
 
				-							cp->elements[r].dl))
			
 
				+			largest_dl = cp->elements[l].dl;
			
 
				+		}
			
 
				+		if ((r < cp->size) && dl_time_before(largest_dl,
			
 
				+						cp->elements[r].dl))
			
 
				 			largest = r;
			
 
				+
			
 
				 		if (largest == idx)
			
 
				 			break;
			
 
				 
			
 
				-		/* Push idx down the heap one level and bump one up */
			
 
				-		cpudl_exchange(cp, largest, idx);
			
 
				+		/* pull largest child onto idx */
			
 
				+		cp->elements[idx].cpu = cp->elements[largest].cpu;
			
 
				+		cp->elements[idx].dl = cp->elements[largest].dl;
			
 
				+		cp->elements[cp->elements[idx].cpu].idx = idx;
			
 
				 		idx = largest;
			
 
				 	}
			
 
				+	/* actual push down of saved original values orig_* */
			
 
				+	cp->elements[idx].cpu = orig_cpu;
			
 
				+	cp->elements[idx].dl = orig_dl;
			
 
				+	cp->elements[cp->elements[idx].cpu].idx = idx;
			
 
				 }
			
 
				 
			
 
				-static void cpudl_change_key(struct cpudl *cp, int idx, u64 new_dl)
			
 
				+static void cpudl_heapify_up(struct cpudl *cp, int idx)
			
 
				 {
			
 
				-	WARN_ON(idx == IDX_INVALID || !cpu_present(idx));
			
 
				+	int p;
			
 
				 
			
 
				-	if (dl_time_before(new_dl, cp->elements[idx].dl)) {
			
 
				-		cp->elements[idx].dl = new_dl;
			
 
				-		cpudl_heapify(cp, idx);
			
 
				-	} else {
			
 
				-		cp->elements[idx].dl = new_dl;
			
 
				-		while (idx > 0 && dl_time_before(cp->elements[parent(idx)].dl,
			
 
				-					cp->elements[idx].dl)) {
			
 
				-			cpudl_exchange(cp, idx, parent(idx));
			
 
				-			idx = parent(idx);
			
 
				-		}
			
 
				-	}
			
 
				+	int orig_cpu = cp->elements[idx].cpu;
			
 
				+	u64 orig_dl = cp->elements[idx].dl;
			
 
				+
			
 
				+	if (idx == 0)
			
 
				+		return;
			
 
				+
			
 
				+	do {
			
 
				+		p = parent(idx);
			
 
				+		if (dl_time_before(orig_dl, cp->elements[p].dl))
			
 
				+			break;
			
 
				+		/* pull parent onto idx */
			
 
				+		cp->elements[idx].cpu = cp->elements[p].cpu;
			
 
				+		cp->elements[idx].dl = cp->elements[p].dl;
			
 
				+		cp->elements[cp->elements[idx].cpu].idx = idx;
			
 
				+		idx = p;
			
 
				+	} while (idx != 0);
			
 
				+	/* actual push up of saved original values orig_* */
			
 
				+	cp->elements[idx].cpu = orig_cpu;
			
 
				+	cp->elements[idx].dl = orig_dl;
			
 
				+	cp->elements[cp->elements[idx].cpu].idx = idx;
			
 
				+}
			
 
				+
			
 
				+static void cpudl_heapify(struct cpudl *cp, int idx)
			
 
				+{
			
 
				+	if (idx > 0 && dl_time_before(cp->elements[parent(idx)].dl,
			
 
				+				cp->elements[idx].dl))
			
 
				+		cpudl_heapify_up(cp, idx);
			
 
				+	else
			
 
				+		cpudl_heapify_down(cp, idx);
			
 
				 }
			
 
				 
			
 
				 static inline int cpudl_maximum(struct cpudl *cp)
			
@@ -120,16 +145,15 @@ out:
 
				 }
			
 
				 
			
 
				 /*
			
 
				- * cpudl_set - update the cpudl max-heap
			
 
				+ * cpudl_clear - remove a cpu from the cpudl max-heap
			
 
				  * @cp: the cpudl max-heap context
			
 
				  * @cpu: the target cpu
			
 
				- * @dl: the new earliest deadline for this cpu
			
 
				  *
			
 
				  * Notes: assumes cpu_rq(cpu)->lock is locked
			
 
				  *
			
 
				  * Returns: (void)
			
 
				  */
			
 
				-void cpudl_set(struct cpudl *cp, int cpu, u64 dl, int is_valid)
			
 
				+void cpudl_clear(struct cpudl *cp, int cpu)
			
 
				 {
			
 
				 	int old_idx, new_cpu;
			
 
				 	unsigned long flags;
			
@@ -137,47 +161,60 @@ void cpudl_set(struct cpudl *cp, int cpu, u64 dl, int is_valid)
 
				 	WARN_ON(!cpu_present(cpu));
			
 
				 
			
 
				 	raw_spin_lock_irqsave(&cp->lock, flags);
			
 
				+
			
 
				 	old_idx = cp->elements[cpu].idx;
			
 
				-	if (!is_valid) {
			
 
				-		/* remove item */
			
 
				-		if (old_idx == IDX_INVALID) {
			
 
				-			/*
			
 
				-			 * Nothing to remove if old_idx was invalid.
			
 
				-			 * This could happen if a rq_offline_dl is
			
 
				-			 * called for a CPU without -dl tasks running.
			
 
				-			 */
			
 
				-			goto out;
			
 
				-		}
			
 
				+	if (old_idx == IDX_INVALID) {
			
 
				+		/*
			
 
				+		 * Nothing to remove if old_idx was invalid.
			
 
				+		 * This could happen if a rq_offline_dl is
			
 
				+		 * called for a CPU without -dl tasks running.
			
 
				+		 */
			
 
				+	} else {
			
 
				 		new_cpu = cp->elements[cp->size - 1].cpu;
			
 
				 		cp->elements[old_idx].dl = cp->elements[cp->size - 1].dl;
			
 
				 		cp->elements[old_idx].cpu = new_cpu;
			
 
				 		cp->size--;
			
 
				 		cp->elements[new_cpu].idx = old_idx;
			
 
				 		cp->elements[cpu].idx = IDX_INVALID;
			
 
				-		while (old_idx > 0 && dl_time_before(
			
 
				-				cp->elements[parent(old_idx)].dl,
			
 
				-				cp->elements[old_idx].dl)) {
			
 
				-			cpudl_exchange(cp, old_idx, parent(old_idx));
			
 
				-			old_idx = parent(old_idx);
			
 
				-		}
			
 
				-		cpumask_set_cpu(cpu, cp->free_cpus);
			
 
				-                cpudl_heapify(cp, old_idx);
			
 
				+		cpudl_heapify(cp, old_idx);
			
 
				 
			
 
				-		goto out;
			
 
				+		cpumask_set_cpu(cpu, cp->free_cpus);
			
 
				 	}
			
 
				+	raw_spin_unlock_irqrestore(&cp->lock, flags);
			
 
				+}
			
 
				+
			
 
				+/*
			
 
				+ * cpudl_set - update the cpudl max-heap
			
 
				+ * @cp: the cpudl max-heap context
			
 
				+ * @cpu: the target cpu
			
 
				+ * @dl: the new earliest deadline for this cpu
			
 
				+ *
			
 
				+ * Notes: assumes cpu_rq(cpu)->lock is locked
			
 
				+ *
			
 
				+ * Returns: (void)
			
 
				+ */
			
 
				+void cpudl_set(struct cpudl *cp, int cpu, u64 dl)
			
 
				+{
			
 
				+	int old_idx;
			
 
				+	unsigned long flags;
			
 
				 
			
 
				+	WARN_ON(!cpu_present(cpu));
			
 
				+
			
 
				+	raw_spin_lock_irqsave(&cp->lock, flags);
			
 
				+
			
 
				+	old_idx = cp->elements[cpu].idx;
			
 
				 	if (old_idx == IDX_INVALID) {
			
 
				-		cp->size++;
			
 
				-		cp->elements[cp->size - 1].dl = dl;
			
 
				-		cp->elements[cp->size - 1].cpu = cpu;
			
 
				-		cp->elements[cpu].idx = cp->size - 1;
			
 
				-		cpudl_change_key(cp, cp->size - 1, dl);
			
 
				+		int new_idx = cp->size++;
			
 
				+		cp->elements[new_idx].dl = dl;
			
 
				+		cp->elements[new_idx].cpu = cpu;
			
 
				+		cp->elements[cpu].idx = new_idx;
			
 
				+		cpudl_heapify_up(cp, new_idx);
			
 
				 		cpumask_clear_cpu(cpu, cp->free_cpus);
			
 
				 	} else {
			
 
				-		cpudl_change_key(cp, old_idx, dl);
			
 
				+		cp->elements[old_idx].dl = dl;
			
 
				+		cpudl_heapify(cp, old_idx);
			
 
				 	}
			
 
				 
			
 
				-out:
			
 
				 	raw_spin_unlock_irqrestore(&cp->lock, flags);
			
 
				 }
			
 
				 
			
--- a/kernel/sched/cpudeadline.h
+++ b/kernel/sched/cpudeadline.h
@@ -23,7 +23,8 @@ struct cpudl {
 
				 #ifdef CONFIG_SMP
			
 
				 int cpudl_find(struct cpudl *cp, struct task_struct *p,
			
 
				 	       struct cpumask *later_mask);
			
 
				-void cpudl_set(struct cpudl *cp, int cpu, u64 dl, int is_valid);
			
 
				+void cpudl_set(struct cpudl *cp, int cpu, u64 dl);
			
 
				+void cpudl_clear(struct cpudl *cp, int cpu);
			
 
				 int cpudl_init(struct cpudl *cp);
			
 
				 void cpudl_set_freecpu(struct cpudl *cp, int cpu);
			
 
				 void cpudl_clear_freecpu(struct cpudl *cp, int cpu);
			
--- a/kernel/sched/cputime.c
+++ b/kernel/sched/cputime.c
@@ -23,10 +23,8 @@
 
				  * task when irq is in progress while we read rq->clock. That is a worthy
			
 
				  * compromise in place of having locks on each irq in account_system_time.
			
 
				  */
			
 
				-DEFINE_PER_CPU(u64, cpu_hardirq_time);
			
 
				-DEFINE_PER_CPU(u64, cpu_softirq_time);
			
 
				+DEFINE_PER_CPU(struct irqtime, cpu_irqtime);
			
 
				 
			
 
				-static DEFINE_PER_CPU(u64, irq_start_time);
			
 
				 static int sched_clock_irqtime;
			
 
				 
			
 
				 void enable_sched_clock_irqtime(void)
			
@@ -39,16 +37,13 @@ void disable_sched_clock_irqtime(void)
 
				 	sched_clock_irqtime = 0;
			
 
				 }
			
 
				 
			
 
				-#ifndef CONFIG_64BIT
			
 
				-DEFINE_PER_CPU(seqcount_t, irq_time_seq);
			
 
				-#endif /* CONFIG_64BIT */
			
 
				-
			
 
				 /*
			
 
				  * Called before incrementing preempt_count on {soft,}irq_enter
			
 
				  * and before decrementing preempt_count on {soft,}irq_exit.
			
 
				  */
			
 
				 void irqtime_account_irq(struct task_struct *curr)
			
 
				 {
			
 
				+	struct irqtime *irqtime = this_cpu_ptr(&cpu_irqtime);
			
 
				 	s64 delta;
			
 
				 	int cpu;
			
 
				 
			
@@ -56,10 +51,10 @@ void irqtime_account_irq(struct task_struct *curr)
 
				 		return;
			
 
				 
			
 
				 	cpu = smp_processor_id();
			
 
				-	delta = sched_clock_cpu(cpu) - __this_cpu_read(irq_start_time);
			
 
				-	__this_cpu_add(irq_start_time, delta);
			
 
				+	delta = sched_clock_cpu(cpu) - irqtime->irq_start_time;
			
 
				+	irqtime->irq_start_time += delta;
			
 
				 
			
 
				-	irq_time_write_begin();
			
 
				+	u64_stats_update_begin(&irqtime->sync);
			
 
				 	/*
			
 
				 	 * We do not account for softirq time from ksoftirqd here.
			
 
				 	 * We want to continue accounting softirq time to ksoftirqd thread
			
@@ -67,42 +62,36 @@ void irqtime_account_irq(struct task_struct *curr)
 
				 	 * that do not consume any time, but still wants to run.
			
 
				 	 */
			
 
				 	if (hardirq_count())
			
 
				-		__this_cpu_add(cpu_hardirq_time, delta);
			
 
				+		irqtime->hardirq_time += delta;
			
 
				 	else if (in_serving_softirq() && curr != this_cpu_ksoftirqd())
			
 
				-		__this_cpu_add(cpu_softirq_time, delta);
			
 
				+		irqtime->softirq_time += delta;
			
 
				 
			
 
				-	irq_time_write_end();
			
 
				+	u64_stats_update_end(&irqtime->sync);
			
 
				 }
			
 
				 EXPORT_SYMBOL_GPL(irqtime_account_irq);
			
 
				 
			
 
				-static cputime_t irqtime_account_hi_update(cputime_t maxtime)
			
 
				+static cputime_t irqtime_account_update(u64 irqtime, int idx, cputime_t maxtime)
			
 
				 {
			
 
				 	u64 *cpustat = kcpustat_this_cpu->cpustat;
			
 
				-	unsigned long flags;
			
 
				 	cputime_t irq_cputime;
			
 
				 
			
 
				-	local_irq_save(flags);
			
 
				-	irq_cputime = nsecs_to_cputime64(this_cpu_read(cpu_hardirq_time)) -
			
 
				-		      cpustat[CPUTIME_IRQ];
			
 
				+	irq_cputime = nsecs_to_cputime64(irqtime) - cpustat[idx];
			
 
				 	irq_cputime = min(irq_cputime, maxtime);
			
 
				-	cpustat[CPUTIME_IRQ] += irq_cputime;
			
 
				-	local_irq_restore(flags);
			
 
				+	cpustat[idx] += irq_cputime;
			
 
				+
			
 
				 	return irq_cputime;
			
 
				 }
			
 
				 
			
 
				-static cputime_t irqtime_account_si_update(cputime_t maxtime)
			
 
				+static cputime_t irqtime_account_hi_update(cputime_t maxtime)
			
 
				 {
			
 
				-	u64 *cpustat = kcpustat_this_cpu->cpustat;
			
 
				-	unsigned long flags;
			
 
				-	cputime_t softirq_cputime;
			
 
				+	return irqtime_account_update(__this_cpu_read(cpu_irqtime.hardirq_time),
			
 
				+				      CPUTIME_IRQ, maxtime);
			
 
				+}
			
 
				 
			
 
				-	local_irq_save(flags);
			
 
				-	softirq_cputime = nsecs_to_cputime64(this_cpu_read(cpu_softirq_time)) -
			
 
				-			  cpustat[CPUTIME_SOFTIRQ];
			
 
				-	softirq_cputime = min(softirq_cputime, maxtime);
			
 
				-	cpustat[CPUTIME_SOFTIRQ] += softirq_cputime;
			
 
				-	local_irq_restore(flags);
			
 
				-	return softirq_cputime;
			
 
				+static cputime_t irqtime_account_si_update(cputime_t maxtime)
			
 
				+{
			
 
				+	return irqtime_account_update(__this_cpu_read(cpu_irqtime.softirq_time),
			
 
				+				      CPUTIME_SOFTIRQ, maxtime);
			
 
				 }
			
 
				 
			
 
				 #else /* CONFIG_IRQ_TIME_ACCOUNTING */
			
@@ -295,6 +284,9 @@ static inline cputime_t account_other_time(cputime_t max)
 
				 {
			
 
				 	cputime_t accounted;
			
 
				 
			
 
				+	/* Shall be converted to a lockdep-enabled lightweight check */
			
 
				+	WARN_ON_ONCE(!irqs_disabled());
			
 
				+
			
 
				 	accounted = steal_account_process_time(max);
			
 
				 
			
 
				 	if (accounted < max)
			
@@ -306,6 +298,26 @@ static inline cputime_t account_other_time(cputime_t max)
 
				 	return accounted;
			
 
				 }
			
 
				 
			
 
				+#ifdef CONFIG_64BIT
			
 
				+static inline u64 read_sum_exec_runtime(struct task_struct *t)
			
 
				+{
			
 
				+	return t->se.sum_exec_runtime;
			
 
				+}
			
 
				+#else
			
 
				+static u64 read_sum_exec_runtime(struct task_struct *t)
			
 
				+{
			
 
				+	u64 ns;
			
 
				+	struct rq_flags rf;
			
 
				+	struct rq *rq;
			
 
				+
			
 
				+	rq = task_rq_lock(t, &rf);
			
 
				+	ns = t->se.sum_exec_runtime;
			
 
				+	task_rq_unlock(rq, t, &rf);
			
 
				+
			
 
				+	return ns;
			
 
				+}
			
 
				+#endif
			
 
				+
			
 
				 /*
			
 
				  * Accumulate raw cputime values of dead tasks (sig->[us]time) and live
			
 
				  * tasks (sum on group iteration) belonging to @tsk's group.
			
@@ -318,6 +330,17 @@ void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times)
 
				 	unsigned int seq, nextseq;
			
 
				 	unsigned long flags;
			
 
				 
			
 
				+	/*
			
 
				+	 * Update current task runtime to account pending time since last
			
 
				+	 * scheduler action or thread_group_cputime() call. This thread group
			
 
				+	 * might have other running tasks on different CPUs, but updating
			
 
				+	 * their runtime can affect syscall performance, so we skip accounting
			
 
				+	 * those pending times and rely only on values updated on tick or
			
 
				+	 * other scheduler action.
			
 
				+	 */
			
 
				+	if (same_thread_group(current, tsk))
			
 
				+		(void) task_sched_runtime(current);
			
 
				+
			
 
				 	rcu_read_lock();
			
 
				 	/* Attempt a lockless read on the first round. */
			
 
				 	nextseq = 0;
			
@@ -332,7 +355,7 @@ void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times)
 
				 			task_cputime(t, &utime, &stime);
			
 
				 			times->utime += utime;
			
 
				 			times->stime += stime;
			
 
				-			times->sum_exec_runtime += task_sched_runtime(t);
			
 
				+			times->sum_exec_runtime += read_sum_exec_runtime(t);
			
 
				 		}
			
 
				 		/* If lockless access failed, take the lock. */
			
 
				 		nextseq = 1;
			
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -243,10 +243,8 @@ static struct rq *find_lock_later_rq(struct task_struct *task, struct rq *rq);
 
				 static struct rq *dl_task_offline_migration(struct rq *rq, struct task_struct *p)
			
 
				 {
			
 
				 	struct rq *later_rq = NULL;
			
 
				-	bool fallback = false;
			
 
				 
			
 
				 	later_rq = find_lock_later_rq(p, rq);
			
 
				-
			
 
				 	if (!later_rq) {
			
 
				 		int cpu;
			
 
				 
			
@@ -254,7 +252,6 @@ static struct rq *dl_task_offline_migration(struct rq *rq, struct task_struct *p
 
				 		 * If we cannot preempt any rq, fall back to pick any
			
 
				 		 * online cpu.
			
 
				 		 */
			
 
				-		fallback = true;
			
 
				 		cpu = cpumask_any_and(cpu_active_mask, tsk_cpus_allowed(p));
			
 
				 		if (cpu >= nr_cpu_ids) {
			
 
				 			/*
			
@@ -274,16 +271,7 @@ static struct rq *dl_task_offline_migration(struct rq *rq, struct task_struct *p
 
				 		double_lock_balance(rq, later_rq);
			
 
				 	}
			
 
				 
			
 
				-	/*
			
 
				-	 * By now the task is replenished and enqueued; migrate it.
			
 
				-	 */
			
 
				-	deactivate_task(rq, p, 0);
			
 
				 	set_task_cpu(p, later_rq->cpu);
			
 
				-	activate_task(later_rq, p, 0);
			
 
				-
			
 
				-	if (!fallback)
			
 
				-		resched_curr(later_rq);
			
 
				-
			
 
				 	double_unlock_balance(later_rq, rq);
			
 
				 
			
 
				 	return later_rq;
			
@@ -346,12 +334,12 @@ static void check_preempt_curr_dl(struct rq *rq, struct task_struct *p,
 
				  * one, and to (try to!) reconcile itself with its own scheduling
			
 
				  * parameters.
			
 
				  */
			
 
				-static inline void setup_new_dl_entity(struct sched_dl_entity *dl_se,
			
 
				-				       struct sched_dl_entity *pi_se)
			
 
				+static inline void setup_new_dl_entity(struct sched_dl_entity *dl_se)
			
 
				 {
			
 
				 	struct dl_rq *dl_rq = dl_rq_of_se(dl_se);
			
 
				 	struct rq *rq = rq_of_dl_rq(dl_rq);
			
 
				 
			
 
				+	WARN_ON(dl_se->dl_boosted);
			
 
				 	WARN_ON(dl_time_before(rq_clock(rq), dl_se->deadline));
			
 
				 
			
 
				 	/*
			
@@ -367,8 +355,8 @@ static inline void setup_new_dl_entity(struct sched_dl_entity *dl_se,
 
				 	 * future; in fact, we must consider execution overheads (time
			
 
				 	 * spent on hardirq context, etc.).
			
 
				 	 */
			
 
				-	dl_se->deadline = rq_clock(rq) + pi_se->dl_deadline;
			
 
				-	dl_se->runtime = pi_se->dl_runtime;
			
 
				+	dl_se->deadline = rq_clock(rq) + dl_se->dl_deadline;
			
 
				+	dl_se->runtime = dl_se->dl_runtime;
			
 
				 }
			
 
				 
			
 
				 /*
			
@@ -641,29 +629,31 @@ static enum hrtimer_restart dl_task_timer(struct hrtimer *timer)
 
				 		goto unlock;
			
 
				 	}
			
 
				 
			
 
				-	enqueue_task_dl(rq, p, ENQUEUE_REPLENISH);
			
 
				-	if (dl_task(rq->curr))
			
 
				-		check_preempt_curr_dl(rq, p, 0);
			
 
				-	else
			
 
				-		resched_curr(rq);
			
 
				-
			
 
				 #ifdef CONFIG_SMP
			
 
				-	/*
			
 
				-	 * Perform balancing operations here; after the replenishments.  We
			
 
				-	 * cannot drop rq->lock before this, otherwise the assertion in
			
 
				-	 * start_dl_timer() about not missing updates is not true.
			
 
				-	 *
			
 
				-	 * If we find that the rq the task was on is no longer available, we
			
 
				-	 * need to select a new rq.
			
 
				-	 *
			
 
				-	 * XXX figure out if select_task_rq_dl() deals with offline cpus.
			
 
				-	 */
			
 
				 	if (unlikely(!rq->online)) {
			
 
				+		/*
			
 
				+		 * If the runqueue is no longer available, migrate the
			
 
				+		 * task elsewhere. This necessarily changes rq.
			
 
				+		 */
			
 
				 		lockdep_unpin_lock(&rq->lock, rf.cookie);
			
 
				 		rq = dl_task_offline_migration(rq, p);
			
 
				 		rf.cookie = lockdep_pin_lock(&rq->lock);
			
 
				+
			
 
				+		/*
			
 
				+		 * Now that the task has been migrated to the new RQ and we
			
 
				+		 * have that locked, proceed as normal and enqueue the task
			
 
				+		 * there.
			
 
				+		 */
			
 
				 	}
			
 
				+#endif
			
 
				+
			
 
				+	enqueue_task_dl(rq, p, ENQUEUE_REPLENISH);
			
 
				+	if (dl_task(rq->curr))
			
 
				+		check_preempt_curr_dl(rq, p, 0);
			
 
				+	else
			
 
				+		resched_curr(rq);
			
 
				 
			
 
				+#ifdef CONFIG_SMP
			
 
				 	/*
			
 
				 	 * Queueing this task back might have overloaded rq, check if we need
			
 
				 	 * to kick someone away.
			
@@ -797,7 +787,7 @@ static void inc_dl_deadline(struct dl_rq *dl_rq, u64 deadline)
 
				 	if (dl_rq->earliest_dl.curr == 0 ||
			
 
				 	    dl_time_before(deadline, dl_rq->earliest_dl.curr)) {
			
 
				 		dl_rq->earliest_dl.curr = deadline;
			
 
				-		cpudl_set(&rq->rd->cpudl, rq->cpu, deadline, 1);
			
 
				+		cpudl_set(&rq->rd->cpudl, rq->cpu, deadline);
			
 
				 	}
			
 
				 }
			
 
				 
			
@@ -812,14 +802,14 @@ static void dec_dl_deadline(struct dl_rq *dl_rq, u64 deadline)
 
				 	if (!dl_rq->dl_nr_running) {
			
 
				 		dl_rq->earliest_dl.curr = 0;
			
 
				 		dl_rq->earliest_dl.next = 0;
			
 
				-		cpudl_set(&rq->rd->cpudl, rq->cpu, 0, 0);
			
 
				+		cpudl_clear(&rq->rd->cpudl, rq->cpu);
			
 
				 	} else {
			
 
				 		struct rb_node *leftmost = dl_rq->rb_leftmost;
			
 
				 		struct sched_dl_entity *entry;
			
 
				 
			
 
				 		entry = rb_entry(leftmost, struct sched_dl_entity, rb_node);
			
 
				 		dl_rq->earliest_dl.curr = entry->deadline;
			
 
				-		cpudl_set(&rq->rd->cpudl, rq->cpu, entry->deadline, 1);
			
 
				+		cpudl_set(&rq->rd->cpudl, rq->cpu, entry->deadline);
			
 
				 	}
			
 
				 }
			
 
				 
			
@@ -1670,7 +1660,7 @@ static void rq_online_dl(struct rq *rq)
 
				 
			
 
				 	cpudl_set_freecpu(&rq->rd->cpudl, rq->cpu);
			
 
				 	if (rq->dl.dl_nr_running > 0)
			
 
				-		cpudl_set(&rq->rd->cpudl, rq->cpu, rq->dl.earliest_dl.curr, 1);
			
 
				+		cpudl_set(&rq->rd->cpudl, rq->cpu, rq->dl.earliest_dl.curr);
			
 
				 }
			
 
				 
			
 
				 /* Assumes rq->lock is held */
			
@@ -1679,7 +1669,7 @@ static void rq_offline_dl(struct rq *rq)
 
				 	if (rq->dl.overloaded)
			
 
				 		dl_clear_overload(rq);
			
 
				 
			
 
				-	cpudl_set(&rq->rd->cpudl, rq->cpu, 0, 0);
			
 
				+	cpudl_clear(&rq->rd->cpudl, rq->cpu);
			
 
				 	cpudl_clear_freecpu(&rq->rd->cpudl, rq->cpu);
			
 
				 }
			
 
				 
			
@@ -1722,10 +1712,20 @@ static void switched_from_dl(struct rq *rq, struct task_struct *p)
 
				  */
			
 
				 static void switched_to_dl(struct rq *rq, struct task_struct *p)
			
 
				 {
			
 
				+
			
 
				+	/* If p is not queued we will update its parameters at next wakeup. */
			
 
				+	if (!task_on_rq_queued(p))
			
 
				+		return;
			
 
				+
			
 
				+	/*
			
 
				+	 * If p is boosted we already updated its params in
			
 
				+	 * rt_mutex_setprio()->enqueue_task(..., ENQUEUE_REPLENISH),
			
 
				+	 * p's deadline being now already after rq_clock(rq).
			
 
				+	 */
			
 
				 	if (dl_time_before(p->dl.deadline, rq_clock(rq)))
			
 
				-		setup_new_dl_entity(&p->dl, &p->dl);
			
 
				+		setup_new_dl_entity(&p->dl);
			
 
				 
			
 
				-	if (task_on_rq_queued(p) && rq->curr != p) {
			
 
				+	if (rq->curr != p) {
			
 
				 #ifdef CONFIG_SMP
			
 
				 		if (tsk_nr_cpus_allowed(p) > 1 && rq->dl.overloaded)
			
 
				 			queue_push_tasks(rq);
			
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -369,8 +369,12 @@ static void print_cfs_group_stats(struct seq_file *m, int cpu, struct task_group
 
				 
			
 
				 #define P(F) \
			
 
				 	SEQ_printf(m, "  .%-30s: %lld\n", #F, (long long)F)
			
 
				+#define P_SCHEDSTAT(F) \
			
 
				+	SEQ_printf(m, "  .%-30s: %lld\n", #F, (long long)schedstat_val(F))
			
 
				 #define PN(F) \
			
 
				 	SEQ_printf(m, "  .%-30s: %lld.%06ld\n", #F, SPLIT_NS((long long)F))
			
 
				+#define PN_SCHEDSTAT(F) \
			
 
				+	SEQ_printf(m, "  .%-30s: %lld.%06ld\n", #F, SPLIT_NS((long long)schedstat_val(F)))
			
 
				 
			
 
				 	if (!se)
			
 
				 		return;
			
@@ -378,26 +382,27 @@ static void print_cfs_group_stats(struct seq_file *m, int cpu, struct task_group
 
				 	PN(se->exec_start);
			
 
				 	PN(se->vruntime);
			
 
				 	PN(se->sum_exec_runtime);
			
 
				-#ifdef CONFIG_SCHEDSTATS
			
 
				 	if (schedstat_enabled()) {
			
 
				-		PN(se->statistics.wait_start);
			
 
				-		PN(se->statistics.sleep_start);
			
 
				-		PN(se->statistics.block_start);
			
 
				-		PN(se->statistics.sleep_max);
			
 
				-		PN(se->statistics.block_max);
			
 
				-		PN(se->statistics.exec_max);
			
 
				-		PN(se->statistics.slice_max);
			
 
				-		PN(se->statistics.wait_max);
			
 
				-		PN(se->statistics.wait_sum);
			
 
				-		P(se->statistics.wait_count);
			
 
				+		PN_SCHEDSTAT(se->statistics.wait_start);
			
 
				+		PN_SCHEDSTAT(se->statistics.sleep_start);
			
 
				+		PN_SCHEDSTAT(se->statistics.block_start);
			
 
				+		PN_SCHEDSTAT(se->statistics.sleep_max);
			
 
				+		PN_SCHEDSTAT(se->statistics.block_max);
			
 
				+		PN_SCHEDSTAT(se->statistics.exec_max);
			
 
				+		PN_SCHEDSTAT(se->statistics.slice_max);
			
 
				+		PN_SCHEDSTAT(se->statistics.wait_max);
			
 
				+		PN_SCHEDSTAT(se->statistics.wait_sum);
			
 
				+		P_SCHEDSTAT(se->statistics.wait_count);
			
 
				 	}
			
 
				-#endif
			
 
				 	P(se->load.weight);
			
 
				 #ifdef CONFIG_SMP
			
 
				 	P(se->avg.load_avg);
			
 
				 	P(se->avg.util_avg);
			
 
				 #endif
			
 
				+
			
 
				+#undef PN_SCHEDSTAT
			
 
				 #undef PN
			
 
				+#undef P_SCHEDSTAT
			
 
				 #undef P
			
 
				 }
			
 
				 #endif
			
@@ -429,9 +434,9 @@ print_task(struct seq_file *m, struct rq *rq, struct task_struct *p)
 
				 		p->prio);
			
 
				 
			
 
				 	SEQ_printf(m, "%9Ld.%06ld %9Ld.%06ld %9Ld.%06ld",
			
 
				-		SPLIT_NS(schedstat_val(p, se.statistics.wait_sum)),
			
 
				+		SPLIT_NS(schedstat_val_or_zero(p->se.statistics.wait_sum)),
			
 
				 		SPLIT_NS(p->se.sum_exec_runtime),
			
 
				-		SPLIT_NS(schedstat_val(p, se.statistics.sum_sleep_runtime)));
			
 
				+		SPLIT_NS(schedstat_val_or_zero(p->se.statistics.sum_sleep_runtime)));
			
 
				 
			
 
				 #ifdef CONFIG_NUMA_BALANCING
			
 
				 	SEQ_printf(m, " %d %d", task_node(p), task_numa_group_id(p));
			
@@ -626,9 +631,7 @@ do {									\
 
				 #undef P64
			
 
				 #endif
			
 
				 
			
 
				-#ifdef CONFIG_SCHEDSTATS
			
 
				-#define P(n) SEQ_printf(m, "  .%-30s: %d\n", #n, rq->n);
			
 
				-
			
 
				+#define P(n) SEQ_printf(m, "  .%-30s: %d\n", #n, schedstat_val(rq->n));
			
 
				 	if (schedstat_enabled()) {
			
 
				 		P(yld_count);
			
 
				 		P(sched_count);
			
@@ -636,9 +639,8 @@ do {									\
 
				 		P(ttwu_count);
			
 
				 		P(ttwu_local);
			
 
				 	}
			
 
				-
			
 
				 #undef P
			
 
				-#endif
			
 
				+
			
 
				 	spin_lock_irqsave(&sched_debug_lock, flags);
			
 
				 	print_cfs_stats(m, cpu);
			
 
				 	print_rt_stats(m, cpu);
			
@@ -868,10 +870,14 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m)
 
				 	SEQ_printf(m, "%-45s:%21Ld\n", #F, (long long)F)
			
 
				 #define P(F) \
			
 
				 	SEQ_printf(m, "%-45s:%21Ld\n", #F, (long long)p->F)
			
 
				+#define P_SCHEDSTAT(F) \
			
 
				+	SEQ_printf(m, "%-45s:%21Ld\n", #F, (long long)schedstat_val(p->F))
			
 
				 #define __PN(F) \
			
 
				 	SEQ_printf(m, "%-45s:%14Ld.%06ld\n", #F, SPLIT_NS((long long)F))
			
 
				 #define PN(F) \
			
 
				 	SEQ_printf(m, "%-45s:%14Ld.%06ld\n", #F, SPLIT_NS((long long)p->F))
			
 
				+#define PN_SCHEDSTAT(F) \
			
 
				+	SEQ_printf(m, "%-45s:%14Ld.%06ld\n", #F, SPLIT_NS((long long)schedstat_val(p->F)))
			
 
				 
			
 
				 	PN(se.exec_start);
			
 
				 	PN(se.vruntime);
			
@@ -881,37 +887,36 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m)
 
				 
			
 
				 	P(se.nr_migrations);
			
 
				 
			
 
				-#ifdef CONFIG_SCHEDSTATS
			
 
				 	if (schedstat_enabled()) {
			
 
				 		u64 avg_atom, avg_per_cpu;
			
 
				 
			
 
				-		PN(se.statistics.sum_sleep_runtime);
			
 
				-		PN(se.statistics.wait_start);
			
 
				-		PN(se.statistics.sleep_start);
			
 
				-		PN(se.statistics.block_start);
			
 
				-		PN(se.statistics.sleep_max);
			
 
				-		PN(se.statistics.block_max);
			
 
				-		PN(se.statistics.exec_max);
			
 
				-		PN(se.statistics.slice_max);
			
 
				-		PN(se.statistics.wait_max);
			
 
				-		PN(se.statistics.wait_sum);
			
 
				-		P(se.statistics.wait_count);
			
 
				-		PN(se.statistics.iowait_sum);
			
 
				-		P(se.statistics.iowait_count);
			
 
				-		P(se.statistics.nr_migrations_cold);
			
 
				-		P(se.statistics.nr_failed_migrations_affine);
			
 
				-		P(se.statistics.nr_failed_migrations_running);
			
 
				-		P(se.statistics.nr_failed_migrations_hot);
			
 
				-		P(se.statistics.nr_forced_migrations);
			
 
				-		P(se.statistics.nr_wakeups);
			
 
				-		P(se.statistics.nr_wakeups_sync);
			
 
				-		P(se.statistics.nr_wakeups_migrate);
			
 
				-		P(se.statistics.nr_wakeups_local);
			
 
				-		P(se.statistics.nr_wakeups_remote);
			
 
				-		P(se.statistics.nr_wakeups_affine);
			
 
				-		P(se.statistics.nr_wakeups_affine_attempts);
			
 
				-		P(se.statistics.nr_wakeups_passive);
			
 
				-		P(se.statistics.nr_wakeups_idle);
			
 
				+		PN_SCHEDSTAT(se.statistics.sum_sleep_runtime);
			
 
				+		PN_SCHEDSTAT(se.statistics.wait_start);
			
 
				+		PN_SCHEDSTAT(se.statistics.sleep_start);
			
 
				+		PN_SCHEDSTAT(se.statistics.block_start);
			
 
				+		PN_SCHEDSTAT(se.statistics.sleep_max);
			
 
				+		PN_SCHEDSTAT(se.statistics.block_max);
			
 
				+		PN_SCHEDSTAT(se.statistics.exec_max);
			
 
				+		PN_SCHEDSTAT(se.statistics.slice_max);
			
 
				+		PN_SCHEDSTAT(se.statistics.wait_max);
			
 
				+		PN_SCHEDSTAT(se.statistics.wait_sum);
			
 
				+		P_SCHEDSTAT(se.statistics.wait_count);
			
 
				+		PN_SCHEDSTAT(se.statistics.iowait_sum);
			
 
				+		P_SCHEDSTAT(se.statistics.iowait_count);
			
 
				+		P_SCHEDSTAT(se.statistics.nr_migrations_cold);
			
 
				+		P_SCHEDSTAT(se.statistics.nr_failed_migrations_affine);
			
 
				+		P_SCHEDSTAT(se.statistics.nr_failed_migrations_running);
			
 
				+		P_SCHEDSTAT(se.statistics.nr_failed_migrations_hot);
			
 
				+		P_SCHEDSTAT(se.statistics.nr_forced_migrations);
			
 
				+		P_SCHEDSTAT(se.statistics.nr_wakeups);
			
 
				+		P_SCHEDSTAT(se.statistics.nr_wakeups_sync);
			
 
				+		P_SCHEDSTAT(se.statistics.nr_wakeups_migrate);
			
 
				+		P_SCHEDSTAT(se.statistics.nr_wakeups_local);
			
 
				+		P_SCHEDSTAT(se.statistics.nr_wakeups_remote);
			
 
				+		P_SCHEDSTAT(se.statistics.nr_wakeups_affine);
			
 
				+		P_SCHEDSTAT(se.statistics.nr_wakeups_affine_attempts);
			
 
				+		P_SCHEDSTAT(se.statistics.nr_wakeups_passive);
			
 
				+		P_SCHEDSTAT(se.statistics.nr_wakeups_idle);
			
 
				 
			
 
				 		avg_atom = p->se.sum_exec_runtime;
			
 
				 		if (nr_switches)
			
@@ -930,7 +935,7 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m)
 
				 		__PN(avg_atom);
			
 
				 		__PN(avg_per_cpu);
			
 
				 	}
			
 
				-#endif
			
 
				+
			
 
				 	__P(nr_switches);
			
 
				 	SEQ_printf(m, "%-45s:%21Ld\n",
			
 
				 		   "nr_voluntary_switches", (long long)p->nvcsw);
			
@@ -947,8 +952,10 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m)
 
				 #endif
			
 
				 	P(policy);
			
 
				 	P(prio);
			
 
				+#undef PN_SCHEDSTAT
			
 
				 #undef PN
			
 
				 #undef __PN
			
 
				+#undef P_SCHEDSTAT
			
 
				 #undef P
			
 
				 #undef __P
			
 
				 
			
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -114,6 +114,12 @@ unsigned int __read_mostly sysctl_sched_shares_window = 10000000UL;
 
				 unsigned int sysctl_sched_cfs_bandwidth_slice = 5000UL;
			
 
				 #endif
			
 
				 
			
 
				+/*
			
 
				+ * The margin used when comparing utilization with CPU capacity:
			
 
				+ * util * 1024 < capacity * margin
			
 
				+ */
			
 
				+unsigned int capacity_margin = 1280; /* ~20% */
			
 
				+
			
 
				 static inline void update_load_add(struct load_weight *lw, unsigned long inc)
			
 
				 {
			
 
				 	lw->weight += inc;
			
@@ -256,9 +262,7 @@ static inline struct rq *rq_of(struct cfs_rq *cfs_rq)
 
				 
			
 
				 static inline struct task_struct *task_of(struct sched_entity *se)
			
 
				 {
			
 
				-#ifdef CONFIG_SCHED_DEBUG
			
 
				-	WARN_ON_ONCE(!entity_is_task(se));
			
 
				-#endif
			
 
				+	SCHED_WARN_ON(!entity_is_task(se));
			
 
				 	return container_of(se, struct task_struct, se);
			
 
				 }
			
 
				 
			
@@ -456,17 +460,23 @@ static inline int entity_before(struct sched_entity *a,
 
				 
			
 
				 static void update_min_vruntime(struct cfs_rq *cfs_rq)
			
 
				 {
			
 
				+	struct sched_entity *curr = cfs_rq->curr;
			
 
				+
			
 
				 	u64 vruntime = cfs_rq->min_vruntime;
			
 
				 
			
 
				-	if (cfs_rq->curr)
			
 
				-		vruntime = cfs_rq->curr->vruntime;
			
 
				+	if (curr) {
			
 
				+		if (curr->on_rq)
			
 
				+			vruntime = curr->vruntime;
			
 
				+		else
			
 
				+			curr = NULL;
			
 
				+	}
			
 
				 
			
 
				 	if (cfs_rq->rb_leftmost) {
			
 
				 		struct sched_entity *se = rb_entry(cfs_rq->rb_leftmost,
			
 
				 						   struct sched_entity,
			
 
				 						   run_node);
			
 
				 
			
 
				-		if (!cfs_rq->curr)
			
 
				+		if (!curr)
			
 
				 			vruntime = se->vruntime;
			
 
				 		else
			
 
				 			vruntime = min_vruntime(vruntime, se->vruntime);
			
@@ -656,7 +666,7 @@ static u64 sched_vslice(struct cfs_rq *cfs_rq, struct sched_entity *se)
 
				 }
			
 
				 
			
 
				 #ifdef CONFIG_SMP
			
 
				-static int select_idle_sibling(struct task_struct *p, int cpu);
			
 
				+static int select_idle_sibling(struct task_struct *p, int prev_cpu, int cpu);
			
 
				 static unsigned long task_h_load(struct task_struct *p);
			
 
				 
			
 
				 /*
			
@@ -726,7 +736,6 @@ void post_init_entity_util_avg(struct sched_entity *se)
 
				 	struct sched_avg *sa = &se->avg;
			
 
				 	long cap = (long)(SCHED_CAPACITY_SCALE - cfs_rq->avg.util_avg) / 2;
			
 
				 	u64 now = cfs_rq_clock_task(cfs_rq);
			
 
				-	int tg_update;
			
 
				 
			
 
				 	if (cap > 0) {
			
 
				 		if (cfs_rq->avg.util_avg != 0) {
			
@@ -759,10 +768,9 @@ void post_init_entity_util_avg(struct sched_entity *se)
 
				 		}
			
 
				 	}
			
 
				 
			
 
				-	tg_update = update_cfs_rq_load_avg(now, cfs_rq, false);
			
 
				+	update_cfs_rq_load_avg(now, cfs_rq, false);
			
 
				 	attach_entity_load_avg(cfs_rq, se);
			
 
				-	if (tg_update)
			
 
				-		update_tg_load_avg(cfs_rq, false);
			
 
				+	update_tg_load_avg(cfs_rq, false);
			
 
				 }
			
 
				 
			
 
				 #else /* !CONFIG_SMP */
			
@@ -799,7 +807,7 @@ static void update_curr(struct cfs_rq *cfs_rq)
 
				 		      max(delta_exec, curr->statistics.exec_max));
			
 
				 
			
 
				 	curr->sum_exec_runtime += delta_exec;
			
 
				-	schedstat_add(cfs_rq, exec_clock, delta_exec);
			
 
				+	schedstat_add(cfs_rq->exec_clock, delta_exec);
			
 
				 
			
 
				 	curr->vruntime += calc_delta_fair(delta_exec, curr);
			
 
				 	update_min_vruntime(cfs_rq);
			
@@ -820,26 +828,34 @@ static void update_curr_fair(struct rq *rq)
 
				 	update_curr(cfs_rq_of(&rq->curr->se));
			
 
				 }
			
 
				 
			
 
				-#ifdef CONFIG_SCHEDSTATS
			
 
				 static inline void
			
 
				 update_stats_wait_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
			
 
				 {
			
 
				-	u64 wait_start = rq_clock(rq_of(cfs_rq));
			
 
				+	u64 wait_start, prev_wait_start;
			
 
				+
			
 
				+	if (!schedstat_enabled())
			
 
				+		return;
			
 
				+
			
 
				+	wait_start = rq_clock(rq_of(cfs_rq));
			
 
				+	prev_wait_start = schedstat_val(se->statistics.wait_start);
			
 
				 
			
 
				 	if (entity_is_task(se) && task_on_rq_migrating(task_of(se)) &&
			
 
				-	    likely(wait_start > se->statistics.wait_start))
			
 
				-		wait_start -= se->statistics.wait_start;
			
 
				+	    likely(wait_start > prev_wait_start))
			
 
				+		wait_start -= prev_wait_start;
			
 
				 
			
 
				-	se->statistics.wait_start = wait_start;
			
 
				+	schedstat_set(se->statistics.wait_start, wait_start);
			
 
				 }
			
 
				 
			
 
				-static void
			
 
				+static inline void
			
 
				 update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se)
			
 
				 {
			
 
				 	struct task_struct *p;
			
 
				 	u64 delta;
			
 
				 
			
 
				-	delta = rq_clock(rq_of(cfs_rq)) - se->statistics.wait_start;
			
 
				+	if (!schedstat_enabled())
			
 
				+		return;
			
 
				+
			
 
				+	delta = rq_clock(rq_of(cfs_rq)) - schedstat_val(se->statistics.wait_start);
			
 
				 
			
 
				 	if (entity_is_task(se)) {
			
 
				 		p = task_of(se);
			
@@ -849,35 +865,114 @@ update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se)
 
				 			 * time stamp can be adjusted to accumulate wait time
			
 
				 			 * prior to migration.
			
 
				 			 */
			
 
				-			se->statistics.wait_start = delta;
			
 
				+			schedstat_set(se->statistics.wait_start, delta);
			
 
				 			return;
			
 
				 		}
			
 
				 		trace_sched_stat_wait(p, delta);
			
 
				 	}
			
 
				 
			
 
				-	se->statistics.wait_max = max(se->statistics.wait_max, delta);
			
 
				-	se->statistics.wait_count++;
			
 
				-	se->statistics.wait_sum += delta;
			
 
				-	se->statistics.wait_start = 0;
			
 
				+	schedstat_set(se->statistics.wait_max,
			
 
				+		      max(schedstat_val(se->statistics.wait_max), delta));
			
 
				+	schedstat_inc(se->statistics.wait_count);
			
 
				+	schedstat_add(se->statistics.wait_sum, delta);
			
 
				+	schedstat_set(se->statistics.wait_start, 0);
			
 
				+}
			
 
				+
			
 
				+static inline void
			
 
				+update_stats_enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)
			
 
				+{
			
 
				+	struct task_struct *tsk = NULL;
			
 
				+	u64 sleep_start, block_start;
			
 
				+
			
 
				+	if (!schedstat_enabled())
			
 
				+		return;
			
 
				+
			
 
				+	sleep_start = schedstat_val(se->statistics.sleep_start);
			
 
				+	block_start = schedstat_val(se->statistics.block_start);
			
 
				+
			
 
				+	if (entity_is_task(se))
			
 
				+		tsk = task_of(se);
			
 
				+
			
 
				+	if (sleep_start) {
			
 
				+		u64 delta = rq_clock(rq_of(cfs_rq)) - sleep_start;
			
 
				+
			
 
				+		if ((s64)delta < 0)
			
 
				+			delta = 0;
			
 
				+
			
 
				+		if (unlikely(delta > schedstat_val(se->statistics.sleep_max)))
			
 
				+			schedstat_set(se->statistics.sleep_max, delta);
			
 
				+
			
 
				+		schedstat_set(se->statistics.sleep_start, 0);
			
 
				+		schedstat_add(se->statistics.sum_sleep_runtime, delta);
			
 
				+
			
 
				+		if (tsk) {
			
 
				+			account_scheduler_latency(tsk, delta >> 10, 1);
			
 
				+			trace_sched_stat_sleep(tsk, delta);
			
 
				+		}
			
 
				+	}
			
 
				+	if (block_start) {
			
 
				+		u64 delta = rq_clock(rq_of(cfs_rq)) - block_start;
			
 
				+
			
 
				+		if ((s64)delta < 0)
			
 
				+			delta = 0;
			
 
				+
			
 
				+		if (unlikely(delta > schedstat_val(se->statistics.block_max)))
			
 
				+			schedstat_set(se->statistics.block_max, delta);
			
 
				+
			
 
				+		schedstat_set(se->statistics.block_start, 0);
			
 
				+		schedstat_add(se->statistics.sum_sleep_runtime, delta);
			
 
				+
			
 
				+		if (tsk) {
			
 
				+			if (tsk->in_iowait) {
			
 
				+				schedstat_add(se->statistics.iowait_sum, delta);
			
 
				+				schedstat_inc(se->statistics.iowait_count);
			
 
				+				trace_sched_stat_iowait(tsk, delta);
			
 
				+			}
			
 
				+
			
 
				+			trace_sched_stat_blocked(tsk, delta);
			
 
				+
			
 
				+			/*
			
 
				+			 * Blocking time is in units of nanosecs, so shift by
			
 
				+			 * 20 to get a milliseconds-range estimation of the
			
 
				+			 * amount of time that the task spent sleeping:
			
 
				+			 */
			
 
				+			if (unlikely(prof_on == SLEEP_PROFILING)) {
			
 
				+				profile_hits(SLEEP_PROFILING,
			
 
				+						(void *)get_wchan(tsk),
			
 
				+						delta >> 20);
			
 
				+			}
			
 
				+			account_scheduler_latency(tsk, delta >> 10, 0);
			
 
				+		}
			
 
				+	}
			
 
				 }
			
 
				 
			
 
				 /*
			
 
				  * Task is being enqueued - update stats:
			
 
				  */
			
 
				 static inline void
			
 
				-update_stats_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
			
 
				+update_stats_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
			
 
				 {
			
 
				+	if (!schedstat_enabled())
			
 
				+		return;
			
 
				+
			
 
				 	/*
			
 
				 	 * Are we enqueueing a waiting task? (for current tasks
			
 
				 	 * a dequeue/enqueue event is a NOP)
			
 
				 	 */
			
 
				 	if (se != cfs_rq->curr)
			
 
				 		update_stats_wait_start(cfs_rq, se);
			
 
				+
			
 
				+	if (flags & ENQUEUE_WAKEUP)
			
 
				+		update_stats_enqueue_sleeper(cfs_rq, se);
			
 
				 }
			
 
				 
			
 
				 static inline void
			
 
				 update_stats_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
			
 
				 {
			
 
				+
			
 
				+	if (!schedstat_enabled())
			
 
				+		return;
			
 
				+
			
 
				 	/*
			
 
				 	 * Mark the end of the wait period if dequeueing a
			
 
				 	 * waiting task:
			
@@ -885,40 +980,18 @@ update_stats_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
 
				 	if (se != cfs_rq->curr)
			
 
				 		update_stats_wait_end(cfs_rq, se);
			
 
				 
			
 
				-	if (flags & DEQUEUE_SLEEP) {
			
 
				-		if (entity_is_task(se)) {
			
 
				-			struct task_struct *tsk = task_of(se);
			
 
				+	if ((flags & DEQUEUE_SLEEP) && entity_is_task(se)) {
			
 
				+		struct task_struct *tsk = task_of(se);
			
 
				 
			
 
				-			if (tsk->state & TASK_INTERRUPTIBLE)
			
 
				-				se->statistics.sleep_start = rq_clock(rq_of(cfs_rq));
			
 
				-			if (tsk->state & TASK_UNINTERRUPTIBLE)
			
 
				-				se->statistics.block_start = rq_clock(rq_of(cfs_rq));
			
 
				-		}
			
 
				+		if (tsk->state & TASK_INTERRUPTIBLE)
			
 
				+			schedstat_set(se->statistics.sleep_start,
			
 
				+				      rq_clock(rq_of(cfs_rq)));
			
 
				+		if (tsk->state & TASK_UNINTERRUPTIBLE)
			
 
				+			schedstat_set(se->statistics.block_start,
			
 
				+				      rq_clock(rq_of(cfs_rq)));
			
 
				 	}
			
 
				-
			
 
				-}
			
 
				-#else
			
 
				-static inline void
			
 
				-update_stats_wait_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
			
 
				-{
			
 
				-}
			
 
				-
			
 
				-static inline void
			
 
				-update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se)
			
 
				-{
			
 
				-}
			
 
				-
			
 
				-static inline void
			
 
				-update_stats_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
			
 
				-{
			
 
				 }
			
 
				 
			
 
				-static inline void
			
 
				-update_stats_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
			
 
				-{
			
 
				-}
			
 
				-#endif
			
 
				-
			
 
				 /*
			
 
				  * We are picking a new current task - update its stats:
			
 
				  */
			
@@ -1513,8 +1586,16 @@ balance:
 
				 	 * One idle CPU per node is evaluated for a task numa move.
			
 
				 	 * Call select_idle_sibling to maybe find a better one.
			
 
				 	 */
			
 
				-	if (!cur)
			
 
				-		env->dst_cpu = select_idle_sibling(env->p, env->dst_cpu);
			
 
				+	if (!cur) {
			
 
				+		/*
			
 
				+		 * select_idle_sibling() uses a per-cpu cpumask that
			
 
				+		 * can be used from IRQ context.
			
 
				+		 */
			
 
				+		local_irq_disable();
			
 
				+		env->dst_cpu = select_idle_sibling(env->p, env->src_cpu,
			
 
				+						   env->dst_cpu);
			
 
				+		local_irq_enable();
			
 
				+	}
			
 
				 
			
 
				 assign:
			
 
				 	task_numa_assign(env, cur, imp);
			
@@ -2292,7 +2373,7 @@ void task_numa_work(struct callback_head *work)
 
				 	unsigned long nr_pte_updates = 0;
			
 
				 	long pages, virtpages;
			
 
				 
			
 
				-	WARN_ON_ONCE(p != container_of(work, struct task_struct, numa_work));
			
 
				+	SCHED_WARN_ON(p != container_of(work, struct task_struct, numa_work));
			
 
				 
			
 
				 	work->next = work; /* protect against double add */
			
 
				 	/*
			
@@ -2803,9 +2884,21 @@ __update_load_avg(u64 now, int cpu, struct sched_avg *sa,
 
				 }
			
 
				 
			
 
				 #ifdef CONFIG_FAIR_GROUP_SCHED
			
 
				-/*
			
 
				- * Updating tg's load_avg is necessary before update_cfs_share (which is done)
			
 
				- * and effective_load (which is not done because it is too costly).
			
 
				+/**
			
 
				+ * update_tg_load_avg - update the tg's load avg
			
 
				+ * @cfs_rq: the cfs_rq whose avg changed
			
 
				+ * @force: update regardless of how small the difference
			
 
				+ *
			
 
				+ * This function 'ensures': tg->load_avg := \Sum tg->cfs_rq[]->avg.load.
			
 
				+ * However, because tg->load_avg is a global value there are performance
			
 
				+ * considerations.
			
 
				+ *
			
 
				+ * In order to avoid having to look at the other cfs_rq's, we use a
			
 
				+ * differential update where we store the last value we propagated. This in
			
 
				+ * turn allows skipping updates if the differential is 'small'.
			
 
				+ *
			
 
				+ * Updating tg's load_avg is necessary before update_cfs_share() (which is
			
 
				+ * done) and effective_load() (which is not done because it is too costly).
			
 
				  */
			
 
				 static inline void update_tg_load_avg(struct cfs_rq *cfs_rq, int force)
			
 
				 {
			
@@ -2925,10 +3018,10 @@ static inline void cfs_rq_util_change(struct cfs_rq *cfs_rq)
 
				  *
			
 
				  * cfs_rq->avg is used for task_h_load() and update_cfs_share() for example.
			
 
				  *
			
 
				- * Returns true if the load decayed or we removed utilization. It is expected
			
 
				- * that one calls update_tg_load_avg() on this condition, but after you've
			
 
				- * modified the cfs_rq avg (attach/detach), such that we propagate the new
			
 
				- * avg up.
			
 
				+ * Returns true if the load decayed or we removed load.
			
 
				+ *
			
 
				+ * Since both these conditions indicate a changed cfs_rq->avg.load we should
			
 
				+ * call update_tg_load_avg() when this function returns true.
			
 
				  */
			
 
				 static inline int
			
 
				 update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq, bool update_freq)
			
@@ -3174,68 +3267,6 @@ static inline int idle_balance(struct rq *rq)
 
				 
			
 
				 #endif /* CONFIG_SMP */
			
 
				 
			
 
				-static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)
			
 
				-{
			
 
				-#ifdef CONFIG_SCHEDSTATS
			
 
				-	struct task_struct *tsk = NULL;
			
 
				-
			
 
				-	if (entity_is_task(se))
			
 
				-		tsk = task_of(se);
			
 
				-
			
 
				-	if (se->statistics.sleep_start) {
			
 
				-		u64 delta = rq_clock(rq_of(cfs_rq)) - se->statistics.sleep_start;
			
 
				-
			
 
				-		if ((s64)delta < 0)
			
 
				-			delta = 0;
			
 
				-
			
 
				-		if (unlikely(delta > se->statistics.sleep_max))
			
 
				-			se->statistics.sleep_max = delta;
			
 
				-
			
 
				-		se->statistics.sleep_start = 0;
			
 
				-		se->statistics.sum_sleep_runtime += delta;
			
 
				-
			
 
				-		if (tsk) {
			
 
				-			account_scheduler_latency(tsk, delta >> 10, 1);
			
 
				-			trace_sched_stat_sleep(tsk, delta);
			
 
				-		}
			
 
				-	}
			
 
				-	if (se->statistics.block_start) {
			
 
				-		u64 delta = rq_clock(rq_of(cfs_rq)) - se->statistics.block_start;
			
 
				-
			
 
				-		if ((s64)delta < 0)
			
 
				-			delta = 0;
			
 
				-
			
 
				-		if (unlikely(delta > se->statistics.block_max))
			
 
				-			se->statistics.block_max = delta;
			
 
				-
			
 
				-		se->statistics.block_start = 0;
			
 
				-		se->statistics.sum_sleep_runtime += delta;
			
 
				-
			
 
				-		if (tsk) {
			
 
				-			if (tsk->in_iowait) {
			
 
				-				se->statistics.iowait_sum += delta;
			
 
				-				se->statistics.iowait_count++;
			
 
				-				trace_sched_stat_iowait(tsk, delta);
			
 
				-			}
			
 
				-
			
 
				-			trace_sched_stat_blocked(tsk, delta);
			
 
				-
			
 
				-			/*
			
 
				-			 * Blocking time is in units of nanosecs, so shift by
			
 
				-			 * 20 to get a milliseconds-range estimation of the
			
 
				-			 * amount of time that the task spent sleeping:
			
 
				-			 */
			
 
				-			if (unlikely(prof_on == SLEEP_PROFILING)) {
			
 
				-				profile_hits(SLEEP_PROFILING,
			
 
				-						(void *)get_wchan(tsk),
			
 
				-						delta >> 20);
			
 
				-			}
			
 
				-			account_scheduler_latency(tsk, delta >> 10, 0);
			
 
				-		}
			
 
				-	}
			
 
				-#endif
			
 
				-}
			
 
				-
			
 
				 static void check_spread(struct cfs_rq *cfs_rq, struct sched_entity *se)
			
 
				 {
			
 
				 #ifdef CONFIG_SCHED_DEBUG
			
@@ -3245,7 +3276,7 @@ static void check_spread(struct cfs_rq *cfs_rq, struct sched_entity *se)
 
				 		d = -d;
			
 
				 
			
 
				 	if (d > 3*sysctl_sched_latency)
			
 
				-		schedstat_inc(cfs_rq, nr_spread_over);
			
 
				+		schedstat_inc(cfs_rq->nr_spread_over);
			
 
				 #endif
			
 
				 }
			
 
				 
			
@@ -3362,17 +3393,12 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
 
				 	account_entity_enqueue(cfs_rq, se);
			
 
				 	update_cfs_shares(cfs_rq);
			
 
				 
			
 
				-	if (flags & ENQUEUE_WAKEUP) {
			
 
				+	if (flags & ENQUEUE_WAKEUP)
			
 
				 		place_entity(cfs_rq, se, 0);
			
 
				-		if (schedstat_enabled())
			
 
				-			enqueue_sleeper(cfs_rq, se);
			
 
				-	}
			
 
				 
			
 
				 	check_schedstat_required();
			
 
				-	if (schedstat_enabled()) {
			
 
				-		update_stats_enqueue(cfs_rq, se);
			
 
				-		check_spread(cfs_rq, se);
			
 
				-	}
			
 
				+	update_stats_enqueue(cfs_rq, se, flags);
			
 
				+	check_spread(cfs_rq, se);
			
 
				 	if (!curr)
			
 
				 		__enqueue_entity(cfs_rq, se);
			
 
				 	se->on_rq = 1;
			
@@ -3439,8 +3465,7 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
 
				 	update_curr(cfs_rq);
			
 
				 	dequeue_entity_load_avg(cfs_rq, se);
			
 
				 
			
 
				-	if (schedstat_enabled())
			
 
				-		update_stats_dequeue(cfs_rq, se, flags);
			
 
				+	update_stats_dequeue(cfs_rq, se, flags);
			
 
				 
			
 
				 	clear_buddies(cfs_rq, se);
			
 
				 
			
@@ -3450,9 +3475,10 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
 
				 	account_entity_dequeue(cfs_rq, se);
			
 
				 
			
 
				 	/*
			
 
				-	 * Normalize the entity after updating the min_vruntime because the
			
 
				-	 * update can refer to the ->curr item and we need to reflect this
			
 
				-	 * movement in our normalized position.
			
 
				+	 * Normalize after update_curr(); which will also have moved
			
 
				+	 * min_vruntime if @se is the one holding it back. But before doing
			
 
				+	 * update_min_vruntime() again, which will discount @se's position and
			
 
				+	 * can move min_vruntime forward still more.
			
 
				 	 */
			
 
				 	if (!(flags & DEQUEUE_SLEEP))
			
 
				 		se->vruntime -= cfs_rq->min_vruntime;
			
@@ -3460,8 +3486,16 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
 
				 	/* return excess runtime on last dequeue */
			
 
				 	return_cfs_rq_runtime(cfs_rq);
			
 
				 
			
 
				-	update_min_vruntime(cfs_rq);
			
 
				 	update_cfs_shares(cfs_rq);
			
 
				+
			
 
				+	/*
			
 
				+	 * Now advance min_vruntime if @se was the entity holding it back,
			
 
				+	 * except when: DEQUEUE_SAVE && !DEQUEUE_MOVE, in this case we'll be
			
 
				+	 * put back on, and if we advance min_vruntime, we'll be placed back
			
 
				+	 * further than we started -- ie. we'll be penalized.
			
 
				+	 *
			
 
				+	 * So: skip the update only when the masked flags are exactly
			
 
				+	 * DEQUEUE_SAVE (SAVE set, MOVE clear); every other combination
			
 
				+	 * must advance min_vruntime.
			
 
				+	 */
			
 
				+	if ((flags & (DEQUEUE_SAVE | DEQUEUE_MOVE)) != DEQUEUE_SAVE)
			
 
				+		update_min_vruntime(cfs_rq);
			
 
				 }
			
 
				 
			
 
				 /*
			
@@ -3514,25 +3548,25 @@ set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
 
				 		 * a CPU. So account for the time it spent waiting on the
			
 
				 		 * runqueue.
			
 
				 		 */
			
 
				-		if (schedstat_enabled())
			
 
				-			update_stats_wait_end(cfs_rq, se);
			
 
				+		update_stats_wait_end(cfs_rq, se);
			
 
				 		__dequeue_entity(cfs_rq, se);
			
 
				 		update_load_avg(se, 1);
			
 
				 	}
			
 
				 
			
 
				 	update_stats_curr_start(cfs_rq, se);
			
 
				 	cfs_rq->curr = se;
			
 
				-#ifdef CONFIG_SCHEDSTATS
			
 
				+
			
 
				 	/*
			
 
				 	 * Track our maximum slice length, if the CPU's load is at
			
 
				 	 * least twice that of our own weight (i.e. dont track it
			
 
				 	 * when there are only lesser-weight tasks around):
			
 
				 	 */
			
 
				 	if (schedstat_enabled() && rq_of(cfs_rq)->load.weight >= 2*se->load.weight) {
			
 
				-		se->statistics.slice_max = max(se->statistics.slice_max,
			
 
				-			se->sum_exec_runtime - se->prev_sum_exec_runtime);
			
 
				+		schedstat_set(se->statistics.slice_max,
			
 
				+			max((u64)schedstat_val(se->statistics.slice_max),
			
 
				+			    se->sum_exec_runtime - se->prev_sum_exec_runtime));
			
 
				 	}
			
 
				-#endif
			
 
				+
			
 
				 	se->prev_sum_exec_runtime = se->sum_exec_runtime;
			
 
				 }
			
 
				 
			
@@ -3611,13 +3645,10 @@ static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev)
 
				 	/* throttle cfs_rqs exceeding runtime */
			
 
				 	check_cfs_rq_runtime(cfs_rq);
			
 
				 
			
 
				-	if (schedstat_enabled()) {
			
 
				-		check_spread(cfs_rq, prev);
			
 
				-		if (prev->on_rq)
			
 
				-			update_stats_wait_start(cfs_rq, prev);
			
 
				-	}
			
 
				+	check_spread(cfs_rq, prev);
			
 
				 
			
 
				 	if (prev->on_rq) {
			
 
				+		update_stats_wait_start(cfs_rq, prev);
			
 
				 		/* Put 'current' back into the tree. */
			
 
				 		__enqueue_entity(cfs_rq, prev);
			
 
				 		/* in !on_rq case, update occurred at dequeue */
			
@@ -4447,9 +4478,9 @@ static void hrtick_start_fair(struct rq *rq, struct task_struct *p)
 
				 	struct sched_entity *se = &p->se;
			
 
				 	struct cfs_rq *cfs_rq = cfs_rq_of(se);
			
 
				 
			
 
				-	WARN_ON(task_rq(p) != rq);
			
 
				+	SCHED_WARN_ON(task_rq(p) != rq);
			
 
				 
			
 
				-	if (cfs_rq->nr_running > 1) {
			
 
				+	if (rq->cfs.h_nr_running > 1) {
			
 
				 		u64 slice = sched_slice(cfs_rq, se);
			
 
				 		u64 ran = se->sum_exec_runtime - se->prev_sum_exec_runtime;
			
 
				 		s64 delta = slice - ran;
			
@@ -4604,6 +4635,11 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
 
				 }
			
 
				 
			
 
				 #ifdef CONFIG_SMP
			
 
				+
			
 
				+/* Working cpumask for: load_balance, load_balance_newidle. */
			
 
				+DEFINE_PER_CPU(cpumask_var_t, load_balance_mask);
			
 
				+DEFINE_PER_CPU(cpumask_var_t, select_idle_mask);
			
 
				+
			
 
				 #ifdef CONFIG_NO_HZ_COMMON
			
 
				 /*
			
 
				  * per rq 'load' arrray crap; XXX kill this.
			
@@ -5005,9 +5041,9 @@ static long effective_load(struct task_group *tg, int cpu, long wl, long wg)
 
				 		 * wl = S * s'_i; see (2)
			
 
				 		 */
			
 
				 		if (W > 0 && w < W)
			
 
				-			wl = (w * (long)tg->shares) / W;
			
 
				+			wl = (w * (long)scale_load_down(tg->shares)) / W;
			
 
				 		else
			
 
				-			wl = tg->shares;
			
 
				+			wl = scale_load_down(tg->shares);
			
 
				 
			
 
				 		/*
			
 
				 		 * Per the above, wl is the new se->load.weight value; since
			
@@ -5090,18 +5126,18 @@ static int wake_wide(struct task_struct *p)
 
				 	return 1;
			
 
				 }
			
 
				 
			
 
				-static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync)
			
 
				+static int wake_affine(struct sched_domain *sd, struct task_struct *p,
			
 
				+		       int prev_cpu, int sync)
			
 
				 {
			
 
				 	s64 this_load, load;
			
 
				 	s64 this_eff_load, prev_eff_load;
			
 
				-	int idx, this_cpu, prev_cpu;
			
 
				+	int idx, this_cpu;
			
 
				 	struct task_group *tg;
			
 
				 	unsigned long weight;
			
 
				 	int balanced;
			
 
				 
			
 
				 	idx	  = sd->wake_idx;
			
 
				 	this_cpu  = smp_processor_id();
			
 
				-	prev_cpu  = task_cpu(p);
			
 
				 	load	  = source_load(prev_cpu, idx);
			
 
				 	this_load = target_load(this_cpu, idx);
			
 
				 
			
@@ -5145,13 +5181,13 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync)
 
				 
			
 
				 	balanced = this_eff_load <= prev_eff_load;
			
 
				 
			
 
				-	schedstat_inc(p, se.statistics.nr_wakeups_affine_attempts);
			
 
				+	schedstat_inc(p->se.statistics.nr_wakeups_affine_attempts);
			
 
				 
			
 
				 	if (!balanced)
			
 
				 		return 0;
			
 
				 
			
 
				-	schedstat_inc(sd, ttwu_move_affine);
			
 
				-	schedstat_inc(p, se.statistics.nr_wakeups_affine);
			
 
				+	schedstat_inc(sd->ttwu_move_affine);
			
 
				+	schedstat_inc(p->se.statistics.nr_wakeups_affine);
			
 
				 
			
 
				 	return 1;
			
 
				 }
			
@@ -5227,6 +5263,10 @@ find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
 
				 	int shallowest_idle_cpu = -1;
			
 
				 	int i;
			
 
				 
			
 
				+	/* Check if we have any choice: */
			
 
				+	if (group->group_weight == 1)
			
 
				+		return cpumask_first(sched_group_cpus(group));
			
 
				+
			
 
				 	/* Traverse only the allowed CPUs */
			
 
				 	for_each_cpu_and(i, sched_group_cpus(group), tsk_cpus_allowed(p)) {
			
 
				 		if (idle_cpu(i)) {
			
@@ -5264,64 +5304,237 @@ find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
 
				 }
			
 
				 
			
 
				 /*
			
 
				- * Try and locate an idle CPU in the sched_domain.
			
 
				+ * Implement a for_each_cpu() variant that starts the scan at a given cpu
			
 
				+ * (@start), and wraps around.
			
 
				+ *
			
 
				+ * This is used to scan for idle CPUs; such that not all CPUs looking for an
			
 
				+ * idle CPU find the same CPU. The down-side is that tasks tend to cycle
			
 
				+ * through the LLC domain.
			
 
				+ *
			
 
				+ * Especially tbench is found sensitive to this.
			
 
				+ *
			
 
				+ * cpumask_next_wrap() returns the next set bit in @mask after @n. While
			
 
				+ * *@wrapped is 0, running off the end of the mask sets *@wrapped to 1 and
			
 
				+ * restarts the search from bit 0. Once *@wrapped is set, finding a bit at or
			
 
				+ * beyond @start (or no bit at all) ends the walk: nr_cpumask_bits is returned.
			
 
				+ */
			
 
				+
			
 
				+static int cpumask_next_wrap(int n, const struct cpumask *mask, int start, int *wrapped)
			
 
				+{
			
 
				+	int next;
			
 
				+
			
 
				+again:
			
 
				+	next = find_next_bit(cpumask_bits(mask), nr_cpumask_bits, n+1);
			
 
				+
			
 
				+	if (*wrapped) {
			
 
				+		if (next >= start)	/* back at (or past) the starting point: done */
			
 
				+			return nr_cpumask_bits;
			
 
				+	} else {
			
 
				+		if (next >= nr_cpumask_bits) {
			
 
				+			*wrapped = 1;
			
 
				+			n = -1;	/* so that n+1 restarts the search at bit 0 */
			
 
				+			goto again;
			
 
				+		}
			
 
				+	}
			
 
				+
			
 
				+	return next;
			
 
				+}
			
 
				+
			
 
				+#define for_each_cpu_wrap(cpu, mask, start, wrap)				\
			
 
				+	for ((wrap) = 0, (cpu) = (start)-1;					\
			
 
				+		(cpu) = cpumask_next_wrap((cpu), (mask), (start), &(wrap)),	\
			
 
				+		(cpu) < nr_cpumask_bits; )
			
 
				+
			
 
				+#ifdef CONFIG_SCHED_SMT
			
 
				+
			
 
				+static inline void set_idle_cores(int cpu, int val)
			
 
				+{
			
 
				+	struct sched_domain_shared *sds;
			
 
				+
			
 
				+	sds = rcu_dereference(per_cpu(sd_llc_shared, cpu));	/* @cpu's sd_llc_shared; may be NULL */
			
 
				+	if (sds)
			
 
				+		WRITE_ONCE(sds->has_idle_cores, val);	/* store @val as the has_idle_cores hint */
			
 
				+}
			
 
				+
			
 
				+static inline bool test_idle_cores(int cpu, bool def)
			
 
				+{
			
 
				+	struct sched_domain_shared *sds;
			
 
				+
			
 
				+	sds = rcu_dereference(per_cpu(sd_llc_shared, cpu));	/* @cpu's sd_llc_shared; may be NULL */
			
 
				+	if (sds)
			
 
				+		return READ_ONCE(sds->has_idle_cores);
			
 
				+
			
 
				+	return def;	/* no shared state for @cpu: answer with the caller's default */
			
 
				+}
			
 
				+
			
 
				+/*
			
 
				+ * Scans the local SMT mask to see if the entire core is idle, and records this
			
 
				+ * information in sd_llc_shared->has_idle_cores.
			
 
				+ *
			
 
				+ * Since SMT siblings share all cache levels, inspecting this limited remote
			
 
				+ * state should be fairly cheap.
			
 
				+ *
			
 
				+ * Nothing is done when has_idle_cores already reads true, or when the CPU has
			
 
				+ * no sd_llc_shared (test_idle_cores() then returns its 'true' default).
			
 
				+ * Otherwise every SMT sibling other than @rq's own CPU must be idle before the
			
 
				+ * flag is set. @rq's own CPU is not tested here.
			
 
				+ * NOTE(review): presumably the caller only invokes this while that CPU is
			
 
				+ * itself going idle -- confirm at the call site.
			
 
				  */
			
 
				-static int select_idle_sibling(struct task_struct *p, int target)
			
 
				+void __update_idle_core(struct rq *rq)
			
 
				+{
			
 
				+	int core = cpu_of(rq);
			
 
				+	int cpu;
			
 
				+
			
 
				+	rcu_read_lock();
			
 
				+	if (test_idle_cores(core, true))	/* already flagged, or no shared state */
			
 
				+		goto unlock;
			
 
				+
			
 
				+	for_each_cpu(cpu, cpu_smt_mask(core)) {
			
 
				+		if (cpu == core)
			
 
				+			continue;
			
 
				+
			
 
				+		if (!idle_cpu(cpu))	/* one busy sibling: the core is not idle */
			
 
				+			goto unlock;
			
 
				+	}
			
 
				+
			
 
				+	set_idle_cores(core, 1);
			
 
				+unlock:
			
 
				+	rcu_read_unlock();
			
 
				+}
			
 
				+
			
 
				+/*
			
 
				+ * Scan the entire LLC domain for idle cores; this dynamically switches off if
			
 
				+ * there are no idle cores left in the system; tracked through
			
 
				+ * sd_llc->shared->has_idle_cores and enabled through update_idle_core() above.
			
 
				+ *
			
 
				+ * The candidates are the CPUs of @sd that @p is allowed on, walked from
			
 
				+ * @target with wrap-around. Returns the first candidate whose SMT siblings
			
 
				+ * are all idle, or -1 when SMT is not present, the has_idle_cores hint is
			
 
				+ * clear, or no such core is found (in which case the hint is cleared).
			
 
				+ */
			
 
				+static int select_idle_core(struct task_struct *p, struct sched_domain *sd, int target)
			
 
				+{
			
 
				+	struct cpumask *cpus = this_cpu_cpumask_var_ptr(select_idle_mask);
			
 
				+	int core, cpu, wrap;
			
 
				+
			
 
				+	if (!static_branch_likely(&sched_smt_present))
			
 
				+		return -1;
			
 
				+
			
 
				+	if (!test_idle_cores(target, false))
			
 
				+		return -1;
			
 
				+
			
 
				+	cpumask_and(cpus, sched_domain_span(sd), tsk_cpus_allowed(p));
			
 
				+
			
 
				+	for_each_cpu_wrap(core, cpus, target, wrap) {
			
 
				+		bool idle = true;
			
 
				+
			
 
				+		for_each_cpu(cpu, cpu_smt_mask(core)) {
			
 
				+			cpumask_clear_cpu(cpu, cpus);	/* drop siblings so each core is examined once */
			
 
				+			if (!idle_cpu(cpu))
			
 
				+				idle = false;	/* keep looping: remaining siblings still need clearing */
			
 
				+		}
			
 
				+
			
 
				+		if (idle)
			
 
				+			return core;
			
 
				+	}
			
 
				+
			
 
				+	/*
			
 
				+	 * Failed to find an idle core; stop looking for one.
			
 
				+	 */
			
 
				+	set_idle_cores(target, 0);
			
 
				+
			
 
				+	return -1;
			
 
				+}
			
 
				+
			
 
				+/*
			
 
				+ * Scan the local SMT mask for idle CPUs.
			
 
				+ *
			
 
				+ * Returns the first idle CPU among @target's SMT siblings (cpu_smt_mask(),
			
 
				+ * which is iterated without skipping @target) that @p is allowed to run on,
			
 
				+ * or -1 if there is none or SMT is not present. @sd is not used here.
			
 
				+ */
			
 
				+static int select_idle_smt(struct task_struct *p, struct sched_domain *sd, int target)
			
 
				+{
			
 
				+	int cpu;
			
 
				+
			
 
				+	if (!static_branch_likely(&sched_smt_present))
			
 
				+		return -1;
			
 
				+
			
 
				+	for_each_cpu(cpu, cpu_smt_mask(target)) {
			
 
				+		if (!cpumask_test_cpu(cpu, tsk_cpus_allowed(p)))	/* respect @p's affinity */
			
 
				+			continue;
			
 
				+		if (idle_cpu(cpu))
			
 
				+			return cpu;
			
 
				+	}
			
 
				+
			
 
				+	return -1;
			
 
				+}
			
 
				+
			
 
				+#else /* CONFIG_SCHED_SMT */
			
 
				+
			
 
				+static inline int select_idle_core(struct task_struct *p, struct sched_domain *sd, int target)
			
 
				+{
			
 
				+	return -1;
			
 
				+}
			
 
				+
			
 
				+static inline int select_idle_smt(struct task_struct *p, struct sched_domain *sd, int target)
			
 
				+{
			
 
				+	return -1;
			
 
				+}
			
 
				+
			
 
				+#endif /* CONFIG_SCHED_SMT */
			
 
				+
			
 
				+/*
			
 
				+ * Scan the LLC domain for idle CPUs; this is dynamically regulated by
			
 
				+ * comparing the average scan cost (tracked in sd->avg_scan_cost) against the
			
 
				+ * average idle time for this rq (as found in rq->avg_idle).
			
 
				+ *
			
 
				+ * Returns the first idle CPU of @sd that @p is allowed on (walking from
			
 
				+ * @target with wrap-around), nr_cpumask_bits when the walk finds none, or -1
			
 
				+ * when the scan is skipped (no local LLC domain, or scanning is deemed too
			
 
				+ * expensive). Callers must therefore range-check the result.
			
 
				+ */
			
 
				+static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, int target)
			
 
				+{
			
 
				+	struct sched_domain *this_sd;
			
 
				+	u64 avg_idle = this_rq()->avg_idle;
			
 
				+	u64 avg_cost;
			
 
				+	u64 time, cost;
			
 
				+	s64 delta;
			
 
				+	int cpu, wrap;
			
 
				+
			
 
				+	/*
			
 
				+	 * sd_llc can be NULL (it is NULL-checked for @target in
			
 
				+	 * select_idle_sibling()); the local CPU's pointer needs the same
			
 
				+	 * care before avg_scan_cost is read or updated through it.
			
 
				+	 */
			
 
				+	this_sd = rcu_dereference(*this_cpu_ptr(&sd_llc));
			
 
				+	if (!this_sd)
			
 
				+		return -1;
			
 
				+
			
 
				+	avg_cost = this_sd->avg_scan_cost;
			
 
				+
			
 
				+	/*
			
 
				+	 * Due to large variance we need a large fuzz factor; hackbench in
			
 
				+	 * particular is sensitive here.
			
 
				+	 */
			
 
				+	if ((avg_idle / 512) < avg_cost)
			
 
				+		return -1;
			
 
				+
			
 
				+	time = local_clock();
			
 
				+
			
 
				+	for_each_cpu_wrap(cpu, sched_domain_span(sd), target, wrap) {
			
 
				+		if (!cpumask_test_cpu(cpu, tsk_cpus_allowed(p)))
			
 
				+			continue;
			
 
				+		if (idle_cpu(cpu))
			
 
				+			break;
			
 
				+	}
			
 
				+
			
 
				+	/* Fold this scan's duration into the running average (1/8 weight). */
			
 
				+	time = local_clock() - time;
			
 
				+	cost = this_sd->avg_scan_cost;
			
 
				+	delta = (s64)(time - cost) / 8;
			
 
				+	this_sd->avg_scan_cost += delta;
			
 
				+
			
 
				+	return cpu;
			
 
				+}
			
 
				+
			
 
				+/*
			
 
				+ * Try and locate an idle core/thread in the LLC cache domain.
			
 
				+ */
			
 
				+static int select_idle_sibling(struct task_struct *p, int prev, int target)
			
 
				 {
			
 
				 	struct sched_domain *sd;
			
 
				-	struct sched_group *sg;
			
 
				-	int i = task_cpu(p);
			
 
				+	int i;
			
 
				 
			
 
				 	if (idle_cpu(target))
			
 
				 		return target;
			
 
				 
			
 
				 	/*
			
 
				-	 * If the prevous cpu is cache affine and idle, don't be stupid.
			
 
				+	 * If the previous cpu is cache affine and idle, don't be stupid.
			
 
				 	 */
			
 
				-	if (i != target && cpus_share_cache(i, target) && idle_cpu(i))
			
 
				-		return i;
			
 
				+	if (prev != target && cpus_share_cache(prev, target) && idle_cpu(prev))
			
 
				+		return prev;
			
 
				 
			
 
				-	/*
			
 
				-	 * Otherwise, iterate the domains and find an eligible idle cpu.
			
 
				-	 *
			
 
				-	 * A completely idle sched group at higher domains is more
			
 
				-	 * desirable than an idle group at a lower level, because lower
			
 
				-	 * domains have smaller groups and usually share hardware
			
 
				-	 * resources which causes tasks to contend on them, e.g. x86
			
 
				-	 * hyperthread siblings in the lowest domain (SMT) can contend
			
 
				-	 * on the shared cpu pipeline.
			
 
				-	 *
			
 
				-	 * However, while we prefer idle groups at higher domains
			
 
				-	 * finding an idle cpu at the lowest domain is still better than
			
 
				-	 * returning 'target', which we've already established, isn't
			
 
				-	 * idle.
			
 
				-	 */
			
 
				 	sd = rcu_dereference(per_cpu(sd_llc, target));
			
 
				-	for_each_lower_domain(sd) {
			
 
				-		sg = sd->groups;
			
 
				-		do {
			
 
				-			if (!cpumask_intersects(sched_group_cpus(sg),
			
 
				-						tsk_cpus_allowed(p)))
			
 
				-				goto next;
			
 
				-
			
 
				-			/* Ensure the entire group is idle */
			
 
				-			for_each_cpu(i, sched_group_cpus(sg)) {
			
 
				-				if (i == target || !idle_cpu(i))
			
 
				-					goto next;
			
 
				-			}
			
 
				+	if (!sd)
			
 
				+		return target;
			
 
				+
			
 
				+	i = select_idle_core(p, sd, target);
			
 
				+	if ((unsigned)i < nr_cpumask_bits)
			
 
				+		return i;
			
 
				+
			
 
				+	i = select_idle_cpu(p, sd, target);
			
 
				+	if ((unsigned)i < nr_cpumask_bits)
			
 
				+		return i;
			
 
				+
			
 
				+	i = select_idle_smt(p, sd, target);
			
 
				+	if ((unsigned)i < nr_cpumask_bits)
			
 
				+		return i;
			
 
				 
			
 
				-			/*
			
 
				-			 * It doesn't matter which cpu we pick, the
			
 
				-			 * whole group is idle.
			
 
				-			 */
			
 
				-			target = cpumask_first_and(sched_group_cpus(sg),
			
 
				-					tsk_cpus_allowed(p));
			
 
				-			goto done;
			
 
				-next:
			
 
				-			sg = sg->next;
			
 
				-		} while (sg != sd->groups);
			
 
				-	}
			
 
				-done:
			
 
				 	return target;
			
 
				 }
			
 
				 
			
@@ -5359,6 +5572,32 @@ static int cpu_util(int cpu)
 
				 	return (util >= capacity) ? capacity : util;
			
 
				 }
			
 
				 
			
 
				+static inline int task_util(struct task_struct *p)
			
 
				+{
			
 
				+	return p->se.avg.util_avg;	/* utilization average kept in @p's sched entity */
			
 
				+}
			
 
				+
			
 
				+/*
			
 
				+ * Disable WAKE_AFFINE in the case where task @p doesn't fit in the
			
 
				+ * capacity of either the waking CPU @cpu or the previous CPU @prev_cpu.
			
 
				+ *
			
 
				+ * In that case WAKE_AFFINE doesn't make sense and we'll let
			
 
				+ * BALANCE_WAKE sort things out.
			
 
				+ *
			
 
				+ * Returns 0 when the smaller of the two CPUs' original capacities is within
			
 
				+ * 1/8th of cpu_rq(@cpu)->rd->max_cpu_capacity; otherwise returns non-zero iff
			
 
				+ * task_util(@p) * capacity_margin exceeds that smaller capacity * 1024.
			
 
				+ * NOTE(review): capacity_margin is defined outside this hunk; presumably a
			
 
				+ * fixed-point factor relative to 1024 -- confirm at its definition.
			
 
				+ */
			
 
				+static int wake_cap(struct task_struct *p, int cpu, int prev_cpu)
			
 
				+{
			
 
				+	long min_cap, max_cap;
			
 
				+
			
 
				+	min_cap = min(capacity_orig_of(prev_cpu), capacity_orig_of(cpu));
			
 
				+	max_cap = cpu_rq(cpu)->rd->max_cpu_capacity;
			
 
				+
			
 
				+	/* Minimum capacity is close to max, no need to abort wake_affine */
			
 
				+	if (max_cap - min_cap < max_cap >> 3)
			
 
				+		return 0;
			
 
				+
			
 
				+	return min_cap * 1024 < task_util(p) * capacity_margin;
			
 
				+}
			
 
				+
			
 
				 /*
			
 
				  * select_task_rq_fair: Select target runqueue for the waking task in domains
			
 
				  * that have the 'sd_flag' flag set. In practice, this is SD_BALANCE_WAKE,
			
@@ -5382,7 +5621,8 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f
 
				 
			
 
				 	if (sd_flag & SD_BALANCE_WAKE) {
			
 
				 		record_wakee(p);
			
 
				-		want_affine = !wake_wide(p) && cpumask_test_cpu(cpu, tsk_cpus_allowed(p));
			
 
				+		want_affine = !wake_wide(p) && !wake_cap(p, cpu, prev_cpu)
			
 
				+			      && cpumask_test_cpu(cpu, tsk_cpus_allowed(p));
			
 
				 	}
			
 
				 
			
 
				 	rcu_read_lock();
			
@@ -5408,13 +5648,13 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f
 
				 
			
 
				 	if (affine_sd) {
			
 
				 		sd = NULL; /* Prefer wake_affine over balance flags */
			
 
				-		if (cpu != prev_cpu && wake_affine(affine_sd, p, sync))
			
 
				+		if (cpu != prev_cpu && wake_affine(affine_sd, p, prev_cpu, sync))
			
 
				 			new_cpu = cpu;
			
 
				 	}
			
 
				 
			
 
				 	if (!sd) {
			
 
				 		if (sd_flag & SD_BALANCE_WAKE) /* XXX always ? */
			
 
				-			new_cpu = select_idle_sibling(p, new_cpu);
			
 
				+			new_cpu = select_idle_sibling(p, prev_cpu, new_cpu);
			
 
				 
			
 
				 	} else while (sd) {
			
 
				 		struct sched_group *group;
			
@@ -5938,7 +6178,7 @@ static bool yield_to_task_fair(struct rq *rq, struct task_struct *p, bool preemp
 
				  *
			
 
				  * The adjacency matrix of the resulting graph is given by:
			
 
				  *
			
 
				- *             log_2 n     
			
 
				+ *             log_2 n
			
 
				  *   A_i,j = \Union     (i % 2^k == 0) && i / 2^(k+1) == j / 2^(k+1)  (6)
			
 
				  *             k = 0
			
 
				  *
			
@@ -5984,7 +6224,7 @@ static bool yield_to_task_fair(struct rq *rq, struct task_struct *p, bool preemp
 
				  *
			
 
				  * [XXX write more on how we solve this.. _after_ merging pjt's patches that
			
 
				  *      rewrite all of this once again.]
			
 
				- */ 
			
 
				+ */
			
 
				 
			
 
				 static unsigned long __read_mostly max_load_balance_interval = HZ/10;
			
 
				 
			
@@ -6132,7 +6372,7 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env)
 
				 	if (!cpumask_test_cpu(env->dst_cpu, tsk_cpus_allowed(p))) {
			
 
				 		int cpu;
			
 
				 
			
 
				-		schedstat_inc(p, se.statistics.nr_failed_migrations_affine);
			
 
				+		schedstat_inc(p->se.statistics.nr_failed_migrations_affine);
			
 
				 
			
 
				 		env->flags |= LBF_SOME_PINNED;
			
 
				 
			
@@ -6163,7 +6403,7 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env)
 
				 	env->flags &= ~LBF_ALL_PINNED;
			
 
				 
			
 
				 	if (task_running(env->src_rq, p)) {
			
 
				-		schedstat_inc(p, se.statistics.nr_failed_migrations_running);
			
 
				+		schedstat_inc(p->se.statistics.nr_failed_migrations_running);
			
 
				 		return 0;
			
 
				 	}
			
 
				 
			
@@ -6180,13 +6420,13 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env)
 
				 	if (tsk_cache_hot <= 0 ||
			
 
				 	    env->sd->nr_balance_failed > env->sd->cache_nice_tries) {
			
 
				 		if (tsk_cache_hot == 1) {
			
 
				-			schedstat_inc(env->sd, lb_hot_gained[env->idle]);
			
 
				-			schedstat_inc(p, se.statistics.nr_forced_migrations);
			
 
				+			schedstat_inc(env->sd->lb_hot_gained[env->idle]);
			
 
				+			schedstat_inc(p->se.statistics.nr_forced_migrations);
			
 
				 		}
			
 
				 		return 1;
			
 
				 	}
			
 
				 
			
 
				-	schedstat_inc(p, se.statistics.nr_failed_migrations_hot);
			
 
				+	schedstat_inc(p->se.statistics.nr_failed_migrations_hot);
			
 
				 	return 0;
			
 
				 }
			
 
				 
			
@@ -6226,7 +6466,7 @@ static struct task_struct *detach_one_task(struct lb_env *env)
 
				 		 * so we can safely collect stats here rather than
			
 
				 		 * inside detach_tasks().
			
 
				 		 */
			
 
				-		schedstat_inc(env->sd, lb_gained[env->idle]);
			
 
				+		schedstat_inc(env->sd->lb_gained[env->idle]);
			
 
				 		return p;
			
 
				 	}
			
 
				 	return NULL;
			
@@ -6318,7 +6558,7 @@ next:
 
				 	 * so we can safely collect detach_one_task() stats here rather
			
 
				 	 * than inside detach_one_task().
			
 
				 	 */
			
 
				-	schedstat_add(env->sd, lb_gained[env->idle], detached);
			
 
				+	schedstat_add(env->sd->lb_gained[env->idle], detached);
			
 
				 
			
 
				 	return detached;
			
 
				 }
			
@@ -6646,7 +6886,7 @@ void update_group_capacity(struct sched_domain *sd, int cpu)
 
				 		/*
			
 
				 		 * !SD_OVERLAP domains can assume that child groups
			
 
				 		 * span the current group.
			
 
				-		 */ 
			
 
				+		 */
			
 
				 
			
 
				 		group = child->groups;
			
 
				 		do {
			
@@ -7146,7 +7386,7 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s
 
				 		load_above_capacity = busiest->sum_nr_running * SCHED_CAPACITY_SCALE;
			
 
				 		if (load_above_capacity > busiest->group_capacity) {
			
 
				 			load_above_capacity -= busiest->group_capacity;
			
 
				-			load_above_capacity *= NICE_0_LOAD;
			
 
				+			load_above_capacity *= scale_load_down(NICE_0_LOAD);
			
 
				 			load_above_capacity /= busiest->group_capacity;
			
 
				 		} else
			
 
				 			load_above_capacity = ~0UL;
			
@@ -7353,9 +7593,6 @@ static struct rq *find_busiest_queue(struct lb_env *env,
 
				  */
			
 
				 #define MAX_PINNED_INTERVAL	512
			
 
				 
			
 
				-/* Working cpumask for load_balance and load_balance_newidle. */
			
 
				-DEFINE_PER_CPU(cpumask_var_t, load_balance_mask);
			
 
				-
			
 
				 static int need_active_balance(struct lb_env *env)
			
 
				 {
			
 
				 	struct sched_domain *sd = env->sd;
			
@@ -7459,7 +7696,7 @@ static int load_balance(int this_cpu, struct rq *this_rq,
 
				 
			
 
				 	cpumask_copy(cpus, cpu_active_mask);
			
 
				 
			
 
				-	schedstat_inc(sd, lb_count[idle]);
			
 
				+	schedstat_inc(sd->lb_count[idle]);
			
 
				 
			
 
				 redo:
			
 
				 	if (!should_we_balance(&env)) {
			
@@ -7469,19 +7706,19 @@ redo:
 
				 
			
 
				 	group = find_busiest_group(&env);
			
 
				 	if (!group) {
			
 
				-		schedstat_inc(sd, lb_nobusyg[idle]);
			
 
				+		schedstat_inc(sd->lb_nobusyg[idle]);
			
 
				 		goto out_balanced;
			
 
				 	}
			
 
				 
			
 
				 	busiest = find_busiest_queue(&env, group);
			
 
				 	if (!busiest) {
			
 
				-		schedstat_inc(sd, lb_nobusyq[idle]);
			
 
				+		schedstat_inc(sd->lb_nobusyq[idle]);
			
 
				 		goto out_balanced;
			
 
				 	}
			
 
				 
			
 
				 	BUG_ON(busiest == env.dst_rq);
			
 
				 
			
 
				-	schedstat_add(sd, lb_imbalance[idle], env.imbalance);
			
 
				+	schedstat_add(sd->lb_imbalance[idle], env.imbalance);
			
 
				 
			
 
				 	env.src_cpu = busiest->cpu;
			
 
				 	env.src_rq = busiest;
			
@@ -7588,7 +7825,7 @@ more_balance:
 
				 	}
			
 
				 
			
 
				 	if (!ld_moved) {
			
 
				-		schedstat_inc(sd, lb_failed[idle]);
			
 
				+		schedstat_inc(sd->lb_failed[idle]);
			
 
				 		/*
			
 
				 		 * Increment the failure counter only on periodic balance.
			
 
				 		 * We do not want newidle balance, which can be very
			
@@ -7671,7 +7908,7 @@ out_all_pinned:
 
				 	 * we can't migrate them. Let the imbalance flag set so parent level
			
 
				 	 * can try to migrate them.
			
 
				 	 */
			
 
				-	schedstat_inc(sd, lb_balanced[idle]);
			
 
				+	schedstat_inc(sd->lb_balanced[idle]);
			
 
				 
			
 
				 	sd->nr_balance_failed = 0;
			
 
				 
			
@@ -7703,11 +7940,12 @@ get_sd_balance_interval(struct sched_domain *sd, int cpu_busy)
 
				 }
			
 
				 
			
 
				 static inline void
			
 
				-update_next_balance(struct sched_domain *sd, int cpu_busy, unsigned long *next_balance)
			
 
				+update_next_balance(struct sched_domain *sd, unsigned long *next_balance)
			
 
				 {
			
 
				 	unsigned long interval, next;
			
 
				 
			
 
				-	interval = get_sd_balance_interval(sd, cpu_busy);
			
 
				+	/* used by idle balance, so cpu_busy = 0 */
			
 
				+	interval = get_sd_balance_interval(sd, 0);
			
 
				 	next = sd->last_balance + interval;
			
 
				 
			
 
				 	if (time_after(*next_balance, next))
			
@@ -7737,7 +7975,7 @@ static int idle_balance(struct rq *this_rq)
 
				 		rcu_read_lock();
			
 
				 		sd = rcu_dereference_check_sched_domain(this_rq->sd);
			
 
				 		if (sd)
			
 
				-			update_next_balance(sd, 0, &next_balance);
			
 
				+			update_next_balance(sd, &next_balance);
			
 
				 		rcu_read_unlock();
			
 
				 
			
 
				 		goto out;
			
@@ -7755,7 +7993,7 @@ static int idle_balance(struct rq *this_rq)
 
				 			continue;
			
 
				 
			
 
				 		if (this_rq->avg_idle < curr_cost + sd->max_newidle_lb_cost) {
			
 
				-			update_next_balance(sd, 0, &next_balance);
			
 
				+			update_next_balance(sd, &next_balance);
			
 
				 			break;
			
 
				 		}
			
 
				 
			
@@ -7773,7 +8011,7 @@ static int idle_balance(struct rq *this_rq)
 
				 			curr_cost += domain_cost;
			
 
				 		}
			
 
				 
			
 
				-		update_next_balance(sd, 0, &next_balance);
			
 
				+		update_next_balance(sd, &next_balance);
			
 
				 
			
 
				 		/*
			
 
				 		 * Stop searching for tasks to pull if there are
			
@@ -7863,15 +8101,15 @@ static int active_load_balance_cpu_stop(void *data)
 
				 			.idle		= CPU_IDLE,
			
 
				 		};
			
 
				 
			
 
				-		schedstat_inc(sd, alb_count);
			
 
				+		schedstat_inc(sd->alb_count);
			
 
				 
			
 
				 		p = detach_one_task(&env);
			
 
				 		if (p) {
			
 
				-			schedstat_inc(sd, alb_pushed);
			
 
				+			schedstat_inc(sd->alb_pushed);
			
 
				 			/* Active balancing done, reset the failure counter. */
			
 
				 			sd->nr_balance_failed = 0;
			
 
				 		} else {
			
 
				-			schedstat_inc(sd, alb_failed);
			
 
				+			schedstat_inc(sd->alb_failed);
			
 
				 		}
			
 
				 	}
			
 
				 	rcu_read_unlock();
			
@@ -7963,13 +8201,13 @@ static inline void set_cpu_sd_state_busy(void)
 
				 	int cpu = smp_processor_id();
			
 
				 
			
 
				 	rcu_read_lock();
			
 
				-	sd = rcu_dereference(per_cpu(sd_busy, cpu));
			
 
				+	sd = rcu_dereference(per_cpu(sd_llc, cpu));
			
 
				 
			
 
				 	if (!sd || !sd->nohz_idle)
			
 
				 		goto unlock;
			
 
				 	sd->nohz_idle = 0;
			
 
				 
			
 
				-	atomic_inc(&sd->groups->sgc->nr_busy_cpus);
			
 
				+	atomic_inc(&sd->shared->nr_busy_cpus);
			
 
				 unlock:
			
 
				 	rcu_read_unlock();
			
 
				 }
			
@@ -7980,13 +8218,13 @@ void set_cpu_sd_state_idle(void)
 
				 	int cpu = smp_processor_id();
			
 
				 
			
 
				 	rcu_read_lock();
			
 
				-	sd = rcu_dereference(per_cpu(sd_busy, cpu));
			
 
				+	sd = rcu_dereference(per_cpu(sd_llc, cpu));
			
 
				 
			
 
				 	if (!sd || sd->nohz_idle)
			
 
				 		goto unlock;
			
 
				 	sd->nohz_idle = 1;
			
 
				 
			
 
				-	atomic_dec(&sd->groups->sgc->nr_busy_cpus);
			
 
				+	atomic_dec(&sd->shared->nr_busy_cpus);
			
 
				 unlock:
			
 
				 	rcu_read_unlock();
			
 
				 }
			
@@ -8213,8 +8451,8 @@ end:
 
				 static inline bool nohz_kick_needed(struct rq *rq)
			
 
				 {
			
 
				 	unsigned long now = jiffies;
			
 
				+	struct sched_domain_shared *sds;
			
 
				 	struct sched_domain *sd;
			
 
				-	struct sched_group_capacity *sgc;
			
 
				 	int nr_busy, cpu = rq->cpu;
			
 
				 	bool kick = false;
			
 
				 
			
@@ -8242,11 +8480,13 @@ static inline bool nohz_kick_needed(struct rq *rq)
 
				 		return true;
			
 
				 
			
 
				 	rcu_read_lock();
			
 
				-	sd = rcu_dereference(per_cpu(sd_busy, cpu));
			
 
				-	if (sd) {
			
 
				-		sgc = sd->groups->sgc;
			
 
				-		nr_busy = atomic_read(&sgc->nr_busy_cpus);
			
 
				-
			
 
				+	sds = rcu_dereference(per_cpu(sd_llc_shared, cpu));
			
 
				+	if (sds) {
			
 
				+		/*
			
 
				+		 * XXX: write a coherent comment on why we do this.
			
 
				+		 * See also: http://lkml.kernel.org/r/20111202010832.602203411@sbsiddha-desk.sc.intel.com
			
 
				+		 */
			
 
				+		nr_busy = atomic_read(&sds->nr_busy_cpus);
			
 
				 		if (nr_busy > 1) {
			
 
				 			kick = true;
			
 
				 			goto unlock;
			
@@ -8440,7 +8680,6 @@ static void detach_task_cfs_rq(struct task_struct *p)
 
				 	struct sched_entity *se = &p->se;
			
 
				 	struct cfs_rq *cfs_rq = cfs_rq_of(se);
			
 
				 	u64 now = cfs_rq_clock_task(cfs_rq);
			
 
				-	int tg_update;
			
 
				 
			
 
				 	if (!vruntime_normalized(p)) {
			
 
				 		/*
			
@@ -8452,10 +8691,9 @@ static void detach_task_cfs_rq(struct task_struct *p)
 
				 	}
			
 
				 
			
 
				 	/* Catch up with the cfs_rq and remove our load when we leave */
			
 
				-	tg_update = update_cfs_rq_load_avg(now, cfs_rq, false);
			
 
				+	update_cfs_rq_load_avg(now, cfs_rq, false);
			
 
				 	detach_entity_load_avg(cfs_rq, se);
			
 
				-	if (tg_update)
			
 
				-		update_tg_load_avg(cfs_rq, false);
			
 
				+	update_tg_load_avg(cfs_rq, false);
			
 
				 }
			
 
				 
			
 
				 static void attach_task_cfs_rq(struct task_struct *p)
			
@@ -8463,7 +8701,6 @@ static void attach_task_cfs_rq(struct task_struct *p)
 
				 	struct sched_entity *se = &p->se;
			
 
				 	struct cfs_rq *cfs_rq = cfs_rq_of(se);
			
 
				 	u64 now = cfs_rq_clock_task(cfs_rq);
			
 
				-	int tg_update;
			
 
				 
			
 
				 #ifdef CONFIG_FAIR_GROUP_SCHED
			
 
				 	/*
			
@@ -8474,10 +8711,9 @@ static void attach_task_cfs_rq(struct task_struct *p)
 
				 #endif
			
 
				 
			
 
				 	/* Synchronize task with its cfs_rq */
			
 
				-	tg_update = update_cfs_rq_load_avg(now, cfs_rq, false);
			
 
				+	update_cfs_rq_load_avg(now, cfs_rq, false);
			
 
				 	attach_entity_load_avg(cfs_rq, se);
			
 
				-	if (tg_update)
			
 
				-		update_tg_load_avg(cfs_rq, false);
			
 
				+	update_tg_load_avg(cfs_rq, false);
			
 
				 
			
 
				 	if (!vruntime_normalized(p))
			
 
				 		se->vruntime += cfs_rq->min_vruntime;
			
--- a/kernel/sched/idle_task.c
+++ b/kernel/sched/idle_task.c
@@ -27,8 +27,8 @@ static struct task_struct *
 
				 pick_next_task_idle(struct rq *rq, struct task_struct *prev, struct pin_cookie cookie)
			
 
				 {
			
 
				 	put_prev_task(rq, prev);
			
 
				-
			
 
				-	schedstat_inc(rq, sched_goidle);
			
 
				+	update_idle_core(rq);
			
 
				+	schedstat_inc(rq->sched_goidle);
			
 
				 	return rq->idle;
			
 
				 }
			
 
				 
			
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -2,6 +2,7 @@
 
				 #include <linux/sched.h>
			
 
				 #include <linux/sched/sysctl.h>
			
 
				 #include <linux/sched/rt.h>
			
 
				+#include <linux/u64_stats_sync.h>
			
 
				 #include <linux/sched/deadline.h>
			
 
				 #include <linux/binfmts.h>
			
 
				 #include <linux/mutex.h>
			
@@ -15,6 +16,12 @@
 
				 #include "cpudeadline.h"
			
 
				 #include "cpuacct.h"
			
 
				 
			
 
				+#ifdef CONFIG_SCHED_DEBUG
			
 
				+#define SCHED_WARN_ON(x)	WARN_ONCE(x, #x)
			
 
				+#else
			
 
				+#define SCHED_WARN_ON(x)	((void)(x))
			
 
				+#endif
			
 
				+
			
 
				 struct rq;
			
 
				 struct cpuidle_state;
			
 
				 
			
@@ -565,6 +572,8 @@ struct root_domain {
 
				 	 */
			
 
				 	cpumask_var_t rto_mask;
			
 
				 	struct cpupri cpupri;
			
 
				+
			
 
				+	unsigned long max_cpu_capacity;
			
 
				 };
			
 
				 
			
 
				 extern struct root_domain def_root_domain;
			
@@ -597,7 +606,6 @@ struct rq {
 
				 #ifdef CONFIG_SMP
			
 
				 	unsigned long last_load_update_tick;
			
 
				 #endif /* CONFIG_SMP */
			
 
				-	u64 nohz_stamp;
			
 
				 	unsigned long nohz_flags;
			
 
				 #endif /* CONFIG_NO_HZ_COMMON */
			
 
				 #ifdef CONFIG_NO_HZ_FULL
			
@@ -723,6 +731,23 @@ static inline int cpu_of(struct rq *rq)
 
				 #endif
			
 
				 }
			
 
				 
			
 
				+
			
 
				+#ifdef CONFIG_SCHED_SMT
			
 
				+
			
 
				+extern struct static_key_false sched_smt_present;
			
 
				+
			
 
				+extern void __update_idle_core(struct rq *rq);
			
 
				+
			
 
				+static inline void update_idle_core(struct rq *rq)
			
 
				+{
			
 
				+	if (static_branch_unlikely(&sched_smt_present))
			
 
				+		__update_idle_core(rq);
			
 
				+}
			
 
				+
			
 
				+#else
			
 
				+static inline void update_idle_core(struct rq *rq) { }
			
 
				+#endif
			
 
				+
			
 
				 DECLARE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
			
 
				 
			
 
				 #define cpu_rq(cpu)		(&per_cpu(runqueues, (cpu)))
			
@@ -857,8 +882,8 @@ static inline struct sched_domain *lowest_flag_domain(int cpu, int flag)
 
				 DECLARE_PER_CPU(struct sched_domain *, sd_llc);
			
 
				 DECLARE_PER_CPU(int, sd_llc_size);
			
 
				 DECLARE_PER_CPU(int, sd_llc_id);
			
 
				+DECLARE_PER_CPU(struct sched_domain_shared *, sd_llc_shared);
			
 
				 DECLARE_PER_CPU(struct sched_domain *, sd_numa);
			
 
				-DECLARE_PER_CPU(struct sched_domain *, sd_busy);
			
 
				 DECLARE_PER_CPU(struct sched_domain *, sd_asym);
			
 
				 
			
 
				 struct sched_group_capacity {
			
@@ -870,10 +895,6 @@ struct sched_group_capacity {
 
				 	unsigned int capacity;
			
 
				 	unsigned long next_update;
			
 
				 	int imbalance; /* XXX unrelated to capacity but shared group state */
			
 
				-	/*
			
 
				-	 * Number of busy cpus in this group.
			
 
				-	 */
			
 
				-	atomic_t nr_busy_cpus;
			
 
				 
			
 
				 	unsigned long cpumask[0]; /* iteration mask */
			
 
				 };
			
@@ -1260,6 +1281,11 @@ static inline void put_prev_task(struct rq *rq, struct task_struct *prev)
 
				 	prev->sched_class->put_prev_task(rq, prev);
			
 
				 }
			
 
				 
			
 
				+static inline void set_curr_task(struct rq *rq, struct task_struct *curr)
			
 
				+{
			
 
				+	curr->sched_class->set_curr_task(rq);
			
 
				+}
			
 
				+
			
 
				 #define sched_class_highest (&stop_sched_class)
			
 
				 #define for_each_class(class) \
			
 
				    for (class = sched_class_highest; class; class = class->next)
			
@@ -1290,7 +1316,7 @@ static inline void idle_set_state(struct rq *rq,
 
				 
			
 
				 static inline struct cpuidle_state *idle_get_state(struct rq *rq)
			
 
				 {
			
 
				-	WARN_ON(!rcu_read_lock_held());
			
 
				+	SCHED_WARN_ON(!rcu_read_lock_held());
			
 
				 	return rq->idle_state;
			
 
				 }
			
 
				 #else
			
@@ -1710,52 +1736,28 @@ static inline void nohz_balance_exit_idle(unsigned int cpu) { }
 
				 #endif
			
 
				 
			
 
				 #ifdef CONFIG_IRQ_TIME_ACCOUNTING
			
 
				+struct irqtime {
			
 
				+	u64			hardirq_time;
			
 
				+	u64			softirq_time;
			
 
				+	u64			irq_start_time;
			
 
				+	struct u64_stats_sync	sync;
			
 
				+};
			
 
				 
			
 
				-DECLARE_PER_CPU(u64, cpu_hardirq_time);
			
 
				-DECLARE_PER_CPU(u64, cpu_softirq_time);
			
 
				-
			
 
				-#ifndef CONFIG_64BIT
			
 
				-DECLARE_PER_CPU(seqcount_t, irq_time_seq);
			
 
				-
			
 
				-static inline void irq_time_write_begin(void)
			
 
				-{
			
 
				-	__this_cpu_inc(irq_time_seq.sequence);
			
 
				-	smp_wmb();
			
 
				-}
			
 
				-
			
 
				-static inline void irq_time_write_end(void)
			
 
				-{
			
 
				-	smp_wmb();
			
 
				-	__this_cpu_inc(irq_time_seq.sequence);
			
 
				-}
			
 
				+DECLARE_PER_CPU(struct irqtime, cpu_irqtime);
			
 
				 
			
 
				 static inline u64 irq_time_read(int cpu)
			
 
				 {
			
 
				-	u64 irq_time;
			
 
				-	unsigned seq;
			
 
				+	struct irqtime *irqtime = &per_cpu(cpu_irqtime, cpu);
			
 
				+	unsigned int seq;
			
 
				+	u64 total;
			
 
				 
			
 
				 	do {
			
 
				-		seq = read_seqcount_begin(&per_cpu(irq_time_seq, cpu));
			
 
				-		irq_time = per_cpu(cpu_softirq_time, cpu) +
			
 
				-			   per_cpu(cpu_hardirq_time, cpu);
			
 
				-	} while (read_seqcount_retry(&per_cpu(irq_time_seq, cpu), seq));
			
 
				-
			
 
				-	return irq_time;
			
 
				-}
			
 
				-#else /* CONFIG_64BIT */
			
 
				-static inline void irq_time_write_begin(void)
			
 
				-{
			
 
				-}
			
 
				+		seq = __u64_stats_fetch_begin(&irqtime->sync);
			
 
				+		total = irqtime->softirq_time + irqtime->hardirq_time;
			
 
				+	} while (__u64_stats_fetch_retry(&irqtime->sync, seq));
			
 
				 
			
 
				-static inline void irq_time_write_end(void)
			
 
				-{
			
 
				-}
			
 
				-
			
 
				-static inline u64 irq_time_read(int cpu)
			
 
				-{
			
 
				-	return per_cpu(cpu_softirq_time, cpu) + per_cpu(cpu_hardirq_time, cpu);
			
 
				+	return total;
			
 
				 }
			
 
				-#endif /* CONFIG_64BIT */
			
 
				 #endif /* CONFIG_IRQ_TIME_ACCOUNTING */
			
 
				 
			
 
				 #ifdef CONFIG_CPU_FREQ
			
--- a/kernel/sched/stats.h
+++ b/kernel/sched/stats.h
@@ -29,11 +29,12 @@ rq_sched_info_dequeued(struct rq *rq, unsigned long long delta)
 
				 	if (rq)
			
 
				 		rq->rq_sched_info.run_delay += delta;
			
 
				 }
			
 
				-# define schedstat_enabled()		static_branch_unlikely(&sched_schedstats)
			
 
				-# define schedstat_inc(rq, field)	do { if (schedstat_enabled()) { (rq)->field++; } } while (0)
			
 
				-# define schedstat_add(rq, field, amt)	do { if (schedstat_enabled()) { (rq)->field += (amt); } } while (0)
			
 
				-# define schedstat_set(var, val)	do { if (schedstat_enabled()) { var = (val); } } while (0)
			
 
				-# define schedstat_val(rq, field)	((schedstat_enabled()) ? (rq)->field : 0)
			
 
				+#define schedstat_enabled()		static_branch_unlikely(&sched_schedstats)
			
 
				+#define schedstat_inc(var)		do { if (schedstat_enabled()) { var++; } } while (0)
			
 
				+#define schedstat_add(var, amt)		do { if (schedstat_enabled()) { var += (amt); } } while (0)
			
 
				+#define schedstat_set(var, val)		do { if (schedstat_enabled()) { var = (val); } } while (0)
			
 
				+#define schedstat_val(var)		(var)
			
 
				+#define schedstat_val_or_zero(var)	((schedstat_enabled()) ? (var) : 0)
			
 
				 
			
 
				 #else /* !CONFIG_SCHEDSTATS */
			
 
				 static inline void
			
@@ -45,12 +46,13 @@ rq_sched_info_dequeued(struct rq *rq, unsigned long long delta)
 
				 static inline void
			
 
				 rq_sched_info_depart(struct rq *rq, unsigned long long delta)
			
 
				 {}
			
 
				-# define schedstat_enabled()		0
			
 
				-# define schedstat_inc(rq, field)	do { } while (0)
			
 
				-# define schedstat_add(rq, field, amt)	do { } while (0)
			
 
				-# define schedstat_set(var, val)	do { } while (0)
			
 
				-# define schedstat_val(rq, field)	0
			
 
				-#endif
			
 
				+#define schedstat_enabled()		0
			
 
				+#define schedstat_inc(var)		do { } while (0)
			
 
				+#define schedstat_add(var, amt)		do { } while (0)
			
 
				+#define schedstat_set(var, val)		do { } while (0)
			
 
				+#define schedstat_val(var)		0
			
 
				+#define schedstat_val_or_zero(var)	0
			
 
				+#endif /* CONFIG_SCHEDSTATS */
			
 
				 
			
 
				 #ifdef CONFIG_SCHED_INFO
			
 
				 static inline void sched_info_reset_dequeued(struct task_struct *t)
			
--- a/kernel/sched/wait.c
+++ b/kernel/sched/wait.c
@@ -196,27 +196,48 @@ prepare_to_wait_exclusive(wait_queue_head_t *q, wait_queue_t *wait, int state)
 
				 }
			
 
				 EXPORT_SYMBOL(prepare_to_wait_exclusive);
			
 
				 
			
 
				-long prepare_to_wait_event(wait_queue_head_t *q, wait_queue_t *wait, int state)
			
 
				+void init_wait_entry(wait_queue_t *wait, int flags)
			
 
				 {
			
 
				-	unsigned long flags;
			
 
				-
			
 
				-	if (signal_pending_state(state, current))
			
 
				-		return -ERESTARTSYS;
			
 
				-
			
 
				+	wait->flags = flags;
			
 
				 	wait->private = current;
			
 
				 	wait->func = autoremove_wake_function;
			
 
				+	INIT_LIST_HEAD(&wait->task_list);
			
 
				+}
			
 
				+EXPORT_SYMBOL(init_wait_entry);
			
 
				+
			
 
				+long prepare_to_wait_event(wait_queue_head_t *q, wait_queue_t *wait, int state)
			
 
				+{
			
 
				+	unsigned long flags;
			
 
				+	long ret = 0;
			
 
				 
			
 
				 	spin_lock_irqsave(&q->lock, flags);
			
 
				-	if (list_empty(&wait->task_list)) {
			
 
				-		if (wait->flags & WQ_FLAG_EXCLUSIVE)
			
 
				-			__add_wait_queue_tail(q, wait);
			
 
				-		else
			
 
				-			__add_wait_queue(q, wait);
			
 
				+	if (unlikely(signal_pending_state(state, current))) {
			
 
				+		/*
			
 
				+		 * Exclusive waiter must not fail if it was selected by wakeup,
			
 
				+		 * it should "consume" the condition we were waiting for.
			
 
				+		 *
			
 
				+		 * The caller will recheck the condition and return success if
			
 
				+		 * we were already woken up, we can not miss the event because
			
 
				+		 * wakeup locks/unlocks the same q->lock.
			
 
				+		 *
			
 
				+		 * But we need to ensure that set-condition + wakeup after that
			
 
				+		 * can't see us, it should wake up another exclusive waiter if
			
 
				+		 * we fail.
			
 
				+		 */
			
 
				+		list_del_init(&wait->task_list);
			
 
				+		ret = -ERESTARTSYS;
			
 
				+	} else {
			
 
				+		if (list_empty(&wait->task_list)) {
			
 
				+			if (wait->flags & WQ_FLAG_EXCLUSIVE)
			
 
				+				__add_wait_queue_tail(q, wait);
			
 
				+			else
			
 
				+				__add_wait_queue(q, wait);
			
 
				+		}
			
 
				+		set_current_state(state);
			
 
				 	}
			
 
				-	set_current_state(state);
			
 
				 	spin_unlock_irqrestore(&q->lock, flags);
			
 
				 
			
 
				-	return 0;
			
 
				+	return ret;
			
 
				 }
			
 
				 EXPORT_SYMBOL(prepare_to_wait_event);
			
 
				 
			
@@ -255,39 +276,6 @@ void finish_wait(wait_queue_head_t *q, wait_queue_t *wait)
 
				 }
			
 
				 EXPORT_SYMBOL(finish_wait);
			
 
				 
			
 
				-/**
			
 
				- * abort_exclusive_wait - abort exclusive waiting in a queue
			
 
				- * @q: waitqueue waited on
			
 
				- * @wait: wait descriptor
			
 
				- * @mode: runstate of the waiter to be woken
			
 
				- * @key: key to identify a wait bit queue or %NULL
			
 
				- *
			
 
				- * Sets current thread back to running state and removes
			
 
				- * the wait descriptor from the given waitqueue if still
			
 
				- * queued.
			
 
				- *
			
 
				- * Wakes up the next waiter if the caller is concurrently
			
 
				- * woken up through the queue.
			
 
				- *
			
 
				- * This prevents waiter starvation where an exclusive waiter
			
 
				- * aborts and is woken up concurrently and no one wakes up
			
 
				- * the next waiter.
			
 
				- */
			
 
				-void abort_exclusive_wait(wait_queue_head_t *q, wait_queue_t *wait,
			
 
				-			unsigned int mode, void *key)
			
 
				-{
			
 
				-	unsigned long flags;
			
 
				-
			
 
				-	__set_current_state(TASK_RUNNING);
			
 
				-	spin_lock_irqsave(&q->lock, flags);
			
 
				-	if (!list_empty(&wait->task_list))
			
 
				-		list_del_init(&wait->task_list);
			
 
				-	else if (waitqueue_active(q))
			
 
				-		__wake_up_locked_key(q, mode, key);
			
 
				-	spin_unlock_irqrestore(&q->lock, flags);
			
 
				-}
			
 
				-EXPORT_SYMBOL(abort_exclusive_wait);
			
 
				-
			
 
				 int autoremove_wake_function(wait_queue_t *wait, unsigned mode, int sync, void *key)
			
 
				 {
			
 
				 	int ret = default_wake_function(wait, mode, sync, key);
			
@@ -425,20 +413,29 @@ int __sched
 
				 __wait_on_bit_lock(wait_queue_head_t *wq, struct wait_bit_queue *q,
			
 
				 			wait_bit_action_f *action, unsigned mode)
			
 
				 {
			
 
				-	do {
			
 
				-		int ret;
			
 
				+	int ret = 0;
			
 
				 
			
 
				+	for (;;) {
			
 
				 		prepare_to_wait_exclusive(wq, &q->wait, mode);
			
 
				-		if (!test_bit(q->key.bit_nr, q->key.flags))
			
 
				-			continue;
			
 
				-		ret = action(&q->key, mode);
			
 
				-		if (!ret)
			
 
				-			continue;
			
 
				-		abort_exclusive_wait(wq, &q->wait, mode, &q->key);
			
 
				-		return ret;
			
 
				-	} while (test_and_set_bit(q->key.bit_nr, q->key.flags));
			
 
				-	finish_wait(wq, &q->wait);
			
 
				-	return 0;
			
 
				+		if (test_bit(q->key.bit_nr, q->key.flags)) {
			
 
				+			ret = action(&q->key, mode);
			
 
				+			/*
			
 
				+			 * See the comment in prepare_to_wait_event().
			
 
				+			 * finish_wait() does not necessarily take wq->lock,
			
 
				+			 * but test_and_set_bit() implies mb() which pairs with
			
 
				+			 * smp_mb__after_atomic() before wake_up_page().
			
 
				+			 */
			
 
				+			if (ret)
			
 
				+				finish_wait(wq, &q->wait);
			
 
				+		}
			
 
				+		if (!test_and_set_bit(q->key.bit_nr, q->key.flags)) {
			
 
				+			if (!ret)
			
 
				+				finish_wait(wq, &q->wait);
			
 
				+			return 0;
			
 
				+		} else if (ret) {
			
 
				+			return ret;
			
 
				+		}
			
 
				+	}
			
 
				 }
			
 
				 EXPORT_SYMBOL(__wait_on_bit_lock);
			
 
				 
			
--- a/kernel/smpboot.c
+++ b/kernel/smpboot.c
@@ -122,12 +122,12 @@ static int smpboot_thread_fn(void *data)
 
				 
			
 
				 		if (kthread_should_park()) {
			
 
				 			__set_current_state(TASK_RUNNING);
			
 
				-			preempt_enable();
			
 
				 			if (ht->park && td->status == HP_THREAD_ACTIVE) {
			
 
				 				BUG_ON(td->cpu != smp_processor_id());
			
 
				 				ht->park(td->cpu);
			
 
				 				td->status = HP_THREAD_PARKED;
			
 
				 			}
			
 
				+			preempt_enable();
			
 
				 			kthread_parkme();
			
 
				 			/* We might have been woken for stop */
			
 
				 			continue;
			
--- a/kernel/stop_machine.c
+++ b/kernel/stop_machine.c
@@ -121,6 +121,11 @@ int stop_one_cpu(unsigned int cpu, cpu_stop_fn_t fn, void *arg)
 
				 	cpu_stop_init_done(&done, 1);
			
 
				 	if (!cpu_stop_queue_work(cpu, &work))
			
 
				 		return -ENOENT;
			
 
				+	/*
			
 
				+	 * In case @cpu == smp_processor_id() we can avoid a sleep+wakeup
			
 
				+	 * cycle by doing a preemption:
			
 
				+	 */
			
 
				+	cond_resched();
			
 
				 	wait_for_completion(&done.completion);
			
 
				 	return done.ret;
			
 
				 }
			
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -1165,7 +1165,7 @@ int do_huge_pmd_numa_page(struct fault_env *fe, pmd_t pmd)
 
				 	}
			
 
				 
			
 
				 	/* See similar comment in do_numa_page for explanation */
			
 
				-	if (!(vma->vm_flags & VM_WRITE))
			
 
				+	if (!pmd_write(pmd))
			
 
				 		flags |= TNF_NO_GROUP;
			
 
				 
			
 
				 	/*
			
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -3395,7 +3395,7 @@ static int do_numa_page(struct fault_env *fe, pte_t pte)
 
				 	 * pte_dirty has unpredictable behaviour between PTE scan updates,
			
 
				 	 * background writeback, dirty balancing and application behaviour.
			
 
				 	 */
			
 
				-	if (!(vma->vm_flags & VM_WRITE))
			
 
				+	if (!pte_write(pte))
			
 
				 		flags |= TNF_NO_GROUP;
			
 
				 
			
 
				 	/*
			
--- a/tools/objtool/builtin-check.c
+++ b/tools/objtool/builtin-check.c
@@ -175,6 +175,7 @@ static int __dead_end_function(struct objtool_file *file, struct symbol *func,
 
				 		"__stack_chk_fail",
			
 
				 		"panic",
			
 
				 		"do_exit",
			
 
				+		"do_task_dead",
			
 
				 		"__module_put_and_exit",
			
 
				 		"complete_and_exit",
			
 
				 		"kvm_spurious_fault",