|
@@ -283,9 +283,6 @@ static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp)
|
|
|
return grp->my_q;
|
|
|
}
|
|
|
|
|
|
-static void update_cfs_rq_blocked_load(struct cfs_rq *cfs_rq,
|
|
|
- int force_update);
|
|
|
-
|
|
|
static inline void list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq)
|
|
|
{
|
|
|
if (!cfs_rq->on_list) {
|
|
@@ -305,8 +302,6 @@ static inline void list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq)
|
|
|
}
|
|
|
|
|
|
cfs_rq->on_list = 1;
|
|
|
- /* We should have no load, but we need to update last_decay. */
|
|
|
- update_cfs_rq_blocked_load(cfs_rq, 0);
|
|
|
}
|
|
|
}
|
|
|
|
|
@@ -616,15 +611,10 @@ static inline u64 calc_delta_fair(u64 delta, struct sched_entity *se)
|
|
|
*/
|
|
|
static u64 __sched_period(unsigned long nr_running)
|
|
|
{
|
|
|
- u64 period = sysctl_sched_latency;
|
|
|
- unsigned long nr_latency = sched_nr_latency;
|
|
|
-
|
|
|
- if (unlikely(nr_running > nr_latency)) {
|
|
|
- period = sysctl_sched_min_granularity;
|
|
|
- period *= nr_running;
|
|
|
- }
|
|
|
-
|
|
|
- return period;
|
|
|
+ if (unlikely(nr_running > sched_nr_latency))
|
|
|
+ return nr_running * sysctl_sched_min_granularity;
|
|
|
+ else
|
|
|
+ return sysctl_sched_latency;
|
|
|
}
|
|
|
|
|
|
/*
|
|
@@ -669,22 +659,37 @@ static u64 sched_vslice(struct cfs_rq *cfs_rq, struct sched_entity *se)
|
|
|
static int select_idle_sibling(struct task_struct *p, int cpu);
|
|
|
static unsigned long task_h_load(struct task_struct *p);
|
|
|
|
|
|
-static inline void __update_task_entity_contrib(struct sched_entity *se);
|
|
|
-static inline void __update_task_entity_utilization(struct sched_entity *se);
|
|
|
+/*
|
|
|
+ * We choose a half-life close to 1 scheduling period.
|
|
|
+ * Note: The tables below are dependent on this value.
|
|
|
+ */
|
|
|
+#define LOAD_AVG_PERIOD 32
|
|
|
+#define LOAD_AVG_MAX 47742 /* maximum possible load avg */
|
|
|
+#define LOAD_AVG_MAX_N 345 /* number of full periods to produce LOAD_AVG_MAX */
|
|
|
|
|
|
-/* Give new task start runnable values to heavy its load in infant time */
|
|
|
-void init_task_runnable_average(struct task_struct *p)
|
|
|
+/* Give a new sched_entity start runnable values to make its load heavy in its infant time */
|
|
|
+void init_entity_runnable_average(struct sched_entity *se)
|
|
|
{
|
|
|
- u32 slice;
|
|
|
+ struct sched_avg *sa = &se->avg;
|
|
|
|
|
|
- slice = sched_slice(task_cfs_rq(p), &p->se) >> 10;
|
|
|
- p->se.avg.runnable_avg_sum = p->se.avg.running_avg_sum = slice;
|
|
|
- p->se.avg.avg_period = slice;
|
|
|
- __update_task_entity_contrib(&p->se);
|
|
|
- __update_task_entity_utilization(&p->se);
|
|
|
+ sa->last_update_time = 0;
|
|
|
+ /*
|
|
|
+	 * sched_avg's period_contrib should be strictly less than 1024, so
|
|
|
+ * we give it 1023 to make sure it is almost a period (1024us), and
|
|
|
+	 * will definitely be updated (after enqueue).
|
|
|
+ */
|
|
|
+ sa->period_contrib = 1023;
|
|
|
+ sa->load_avg = scale_load_down(se->load.weight);
|
|
|
+ sa->load_sum = sa->load_avg * LOAD_AVG_MAX;
|
|
|
+ sa->util_avg = scale_load_down(SCHED_LOAD_SCALE);
|
|
|
+ sa->util_sum = LOAD_AVG_MAX;
|
|
|
+	/* when this task is enqueued, it will contribute to its cfs_rq's load_avg */
|
|
|
}
|
|
|
+
|
|
|
+static inline unsigned long cfs_rq_runnable_load_avg(struct cfs_rq *cfs_rq);
|
|
|
+static inline unsigned long cfs_rq_load_avg(struct cfs_rq *cfs_rq);
|
|
|
#else
|
|
|
-void init_task_runnable_average(struct task_struct *p)
|
|
|
+void init_entity_runnable_average(struct sched_entity *se)
|
|
|
{
|
|
|
}
|
|
|
#endif
|
|
@@ -1415,8 +1420,9 @@ static bool numa_has_capacity(struct task_numa_env *env)
|
|
|
* --------------------- vs ---------------------
|
|
|
* src->compute_capacity dst->compute_capacity
|
|
|
*/
|
|
|
- if (src->load * dst->compute_capacity >
|
|
|
- dst->load * src->compute_capacity)
|
|
|
+ if (src->load * dst->compute_capacity * env->imbalance_pct >
|
|
|
+
|
|
|
+ dst->load * src->compute_capacity * 100)
|
|
|
return true;
|
|
|
|
|
|
return false;
|
|
@@ -1702,8 +1708,8 @@ static u64 numa_get_avg_runtime(struct task_struct *p, u64 *period)
|
|
|
delta = runtime - p->last_sum_exec_runtime;
|
|
|
*period = now - p->last_task_numa_placement;
|
|
|
} else {
|
|
|
- delta = p->se.avg.runnable_avg_sum;
|
|
|
- *period = p->se.avg.avg_period;
|
|
|
+ delta = p->se.avg.load_sum / p->se.load.weight;
|
|
|
+ *period = LOAD_AVG_MAX;
|
|
|
}
|
|
|
|
|
|
p->last_sum_exec_runtime = runtime;
|
|
@@ -2351,13 +2357,13 @@ static inline long calc_tg_weight(struct task_group *tg, struct cfs_rq *cfs_rq)
|
|
|
long tg_weight;
|
|
|
|
|
|
/*
|
|
|
- * Use this CPU's actual weight instead of the last load_contribution
|
|
|
- * to gain a more accurate current total weight. See
|
|
|
- * update_cfs_rq_load_contribution().
|
|
|
+ * Use this CPU's real-time load instead of the last load contribution
|
|
|
+	 * as the updating of the contribution is delayed, and we will use
|
|
|
+ * the real-time load to calc the share. See update_tg_load_avg().
|
|
|
*/
|
|
|
tg_weight = atomic_long_read(&tg->load_avg);
|
|
|
- tg_weight -= cfs_rq->tg_load_contrib;
|
|
|
- tg_weight += cfs_rq->load.weight;
|
|
|
+ tg_weight -= cfs_rq->tg_load_avg_contrib;
|
|
|
+ tg_weight += cfs_rq_load_avg(cfs_rq);
|
|
|
|
|
|
return tg_weight;
|
|
|
}
|
|
@@ -2367,7 +2373,7 @@ static long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg)
|
|
|
long tg_weight, load, shares;
|
|
|
|
|
|
tg_weight = calc_tg_weight(tg, cfs_rq);
|
|
|
- load = cfs_rq->load.weight;
|
|
|
+ load = cfs_rq_load_avg(cfs_rq);
|
|
|
|
|
|
shares = (tg->shares * load);
|
|
|
if (tg_weight)
|
|
@@ -2429,14 +2435,6 @@ static inline void update_cfs_shares(struct cfs_rq *cfs_rq)
|
|
|
#endif /* CONFIG_FAIR_GROUP_SCHED */
|
|
|
|
|
|
#ifdef CONFIG_SMP
|
|
|
-/*
|
|
|
- * We choose a half-life close to 1 scheduling period.
|
|
|
- * Note: The tables below are dependent on this value.
|
|
|
- */
|
|
|
-#define LOAD_AVG_PERIOD 32
|
|
|
-#define LOAD_AVG_MAX 47742 /* maximum possible load avg */
|
|
|
-#define LOAD_AVG_MAX_N 345 /* number of full periods to produce LOAD_MAX_AVG */
|
|
|
-
|
|
|
/* Precomputed fixed inverse multiplies for multiplication by y^n */
|
|
|
static const u32 runnable_avg_yN_inv[] = {
|
|
|
0xffffffff, 0xfa83b2da, 0xf5257d14, 0xefe4b99a, 0xeac0c6e6, 0xe5b906e6,
|
|
@@ -2485,9 +2483,8 @@ static __always_inline u64 decay_load(u64 val, u64 n)
|
|
|
local_n %= LOAD_AVG_PERIOD;
|
|
|
}
|
|
|
|
|
|
- val *= runnable_avg_yN_inv[local_n];
|
|
|
- /* We don't use SRR here since we always want to round down. */
|
|
|
- return val >> 32;
|
|
|
+ val = mul_u64_u32_shr(val, runnable_avg_yN_inv[local_n], 32);
|
|
|
+ return val;
|
|
|
}
|
|
|
|
|
|
/*
|
|
@@ -2546,23 +2543,22 @@ static u32 __compute_runnable_contrib(u64 n)
|
|
|
* load_avg = u_0` + y*(u_0 + u_1*y + u_2*y^2 + ... )
|
|
|
* = u_0 + u_1*y + u_2*y^2 + ... [re-labeling u_i --> u_{i+1}]
|
|
|
*/
|
|
|
-static __always_inline int __update_entity_runnable_avg(u64 now, int cpu,
|
|
|
- struct sched_avg *sa,
|
|
|
- int runnable,
|
|
|
- int running)
|
|
|
+static __always_inline int
|
|
|
+__update_load_avg(u64 now, int cpu, struct sched_avg *sa,
|
|
|
+ unsigned long weight, int running, struct cfs_rq *cfs_rq)
|
|
|
{
|
|
|
u64 delta, periods;
|
|
|
- u32 runnable_contrib;
|
|
|
+ u32 contrib;
|
|
|
int delta_w, decayed = 0;
|
|
|
unsigned long scale_freq = arch_scale_freq_capacity(NULL, cpu);
|
|
|
|
|
|
- delta = now - sa->last_runnable_update;
|
|
|
+ delta = now - sa->last_update_time;
|
|
|
/*
|
|
|
* This should only happen when time goes backwards, which it
|
|
|
* unfortunately does during sched clock init when we swap over to TSC.
|
|
|
*/
|
|
|
if ((s64)delta < 0) {
|
|
|
- sa->last_runnable_update = now;
|
|
|
+ sa->last_update_time = now;
|
|
|
return 0;
|
|
|
}
|
|
|
|
|
@@ -2573,26 +2569,29 @@ static __always_inline int __update_entity_runnable_avg(u64 now, int cpu,
|
|
|
delta >>= 10;
|
|
|
if (!delta)
|
|
|
return 0;
|
|
|
- sa->last_runnable_update = now;
|
|
|
+ sa->last_update_time = now;
|
|
|
|
|
|
/* delta_w is the amount already accumulated against our next period */
|
|
|
- delta_w = sa->avg_period % 1024;
|
|
|
+ delta_w = sa->period_contrib;
|
|
|
if (delta + delta_w >= 1024) {
|
|
|
- /* period roll-over */
|
|
|
decayed = 1;
|
|
|
|
|
|
+		/* the contribution to the next period starts over; we don't know it yet */
|
|
|
+ sa->period_contrib = 0;
|
|
|
+
|
|
|
/*
|
|
|
* Now that we know we're crossing a period boundary, figure
|
|
|
* out how much from delta we need to complete the current
|
|
|
* period and accrue it.
|
|
|
*/
|
|
|
delta_w = 1024 - delta_w;
|
|
|
- if (runnable)
|
|
|
- sa->runnable_avg_sum += delta_w;
|
|
|
+ if (weight) {
|
|
|
+ sa->load_sum += weight * delta_w;
|
|
|
+ if (cfs_rq)
|
|
|
+ cfs_rq->runnable_load_sum += weight * delta_w;
|
|
|
+ }
|
|
|
if (running)
|
|
|
- sa->running_avg_sum += delta_w * scale_freq
|
|
|
- >> SCHED_CAPACITY_SHIFT;
|
|
|
- sa->avg_period += delta_w;
|
|
|
+ sa->util_sum += delta_w * scale_freq >> SCHED_CAPACITY_SHIFT;
|
|
|
|
|
|
delta -= delta_w;
|
|
|
|
|
@@ -2600,341 +2599,186 @@ static __always_inline int __update_entity_runnable_avg(u64 now, int cpu,
|
|
|
periods = delta / 1024;
|
|
|
delta %= 1024;
|
|
|
|
|
|
- sa->runnable_avg_sum = decay_load(sa->runnable_avg_sum,
|
|
|
- periods + 1);
|
|
|
- sa->running_avg_sum = decay_load(sa->running_avg_sum,
|
|
|
- periods + 1);
|
|
|
- sa->avg_period = decay_load(sa->avg_period,
|
|
|
- periods + 1);
|
|
|
+ sa->load_sum = decay_load(sa->load_sum, periods + 1);
|
|
|
+ if (cfs_rq) {
|
|
|
+ cfs_rq->runnable_load_sum =
|
|
|
+ decay_load(cfs_rq->runnable_load_sum, periods + 1);
|
|
|
+ }
|
|
|
+ sa->util_sum = decay_load((u64)(sa->util_sum), periods + 1);
|
|
|
|
|
|
/* Efficiently calculate \sum (1..n_period) 1024*y^i */
|
|
|
- runnable_contrib = __compute_runnable_contrib(periods);
|
|
|
- if (runnable)
|
|
|
- sa->runnable_avg_sum += runnable_contrib;
|
|
|
+ contrib = __compute_runnable_contrib(periods);
|
|
|
+ if (weight) {
|
|
|
+ sa->load_sum += weight * contrib;
|
|
|
+ if (cfs_rq)
|
|
|
+ cfs_rq->runnable_load_sum += weight * contrib;
|
|
|
+ }
|
|
|
if (running)
|
|
|
- sa->running_avg_sum += runnable_contrib * scale_freq
|
|
|
- >> SCHED_CAPACITY_SHIFT;
|
|
|
- sa->avg_period += runnable_contrib;
|
|
|
+ sa->util_sum += contrib * scale_freq >> SCHED_CAPACITY_SHIFT;
|
|
|
}
|
|
|
|
|
|
/* Remainder of delta accrued against u_0` */
|
|
|
- if (runnable)
|
|
|
- sa->runnable_avg_sum += delta;
|
|
|
+ if (weight) {
|
|
|
+ sa->load_sum += weight * delta;
|
|
|
+ if (cfs_rq)
|
|
|
+ cfs_rq->runnable_load_sum += weight * delta;
|
|
|
+ }
|
|
|
if (running)
|
|
|
- sa->running_avg_sum += delta * scale_freq
|
|
|
- >> SCHED_CAPACITY_SHIFT;
|
|
|
- sa->avg_period += delta;
|
|
|
-
|
|
|
- return decayed;
|
|
|
-}
|
|
|
+ sa->util_sum += delta * scale_freq >> SCHED_CAPACITY_SHIFT;
|
|
|
|
|
|
-/* Synchronize an entity's decay with its parenting cfs_rq.*/
|
|
|
-static inline u64 __synchronize_entity_decay(struct sched_entity *se)
|
|
|
-{
|
|
|
- struct cfs_rq *cfs_rq = cfs_rq_of(se);
|
|
|
- u64 decays = atomic64_read(&cfs_rq->decay_counter);
|
|
|
-
|
|
|
- decays -= se->avg.decay_count;
|
|
|
- se->avg.decay_count = 0;
|
|
|
- if (!decays)
|
|
|
- return 0;
|
|
|
+ sa->period_contrib += delta;
|
|
|
|
|
|
- se->avg.load_avg_contrib = decay_load(se->avg.load_avg_contrib, decays);
|
|
|
- se->avg.utilization_avg_contrib =
|
|
|
- decay_load(se->avg.utilization_avg_contrib, decays);
|
|
|
+ if (decayed) {
|
|
|
+ sa->load_avg = div_u64(sa->load_sum, LOAD_AVG_MAX);
|
|
|
+ if (cfs_rq) {
|
|
|
+ cfs_rq->runnable_load_avg =
|
|
|
+ div_u64(cfs_rq->runnable_load_sum, LOAD_AVG_MAX);
|
|
|
+ }
|
|
|
+ sa->util_avg = (sa->util_sum << SCHED_LOAD_SHIFT) / LOAD_AVG_MAX;
|
|
|
+ }
|
|
|
|
|
|
- return decays;
|
|
|
+ return decayed;
|
|
|
}
|
|
|
|
|
|
#ifdef CONFIG_FAIR_GROUP_SCHED
|
|
|
-static inline void __update_cfs_rq_tg_load_contrib(struct cfs_rq *cfs_rq,
|
|
|
- int force_update)
|
|
|
-{
|
|
|
- struct task_group *tg = cfs_rq->tg;
|
|
|
- long tg_contrib;
|
|
|
-
|
|
|
- tg_contrib = cfs_rq->runnable_load_avg + cfs_rq->blocked_load_avg;
|
|
|
- tg_contrib -= cfs_rq->tg_load_contrib;
|
|
|
-
|
|
|
- if (!tg_contrib)
|
|
|
- return;
|
|
|
-
|
|
|
- if (force_update || abs(tg_contrib) > cfs_rq->tg_load_contrib / 8) {
|
|
|
- atomic_long_add(tg_contrib, &tg->load_avg);
|
|
|
- cfs_rq->tg_load_contrib += tg_contrib;
|
|
|
- }
|
|
|
-}
|
|
|
-
|
|
|
/*
|
|
|
- * Aggregate cfs_rq runnable averages into an equivalent task_group
|
|
|
- * representation for computing load contributions.
|
|
|
+ * Updating tg's load_avg is necessary before update_cfs_share (which is done)
|
|
|
+ * and effective_load (which is not done because it is too costly).
|
|
|
*/
|
|
|
-static inline void __update_tg_runnable_avg(struct sched_avg *sa,
|
|
|
- struct cfs_rq *cfs_rq)
|
|
|
+static inline void update_tg_load_avg(struct cfs_rq *cfs_rq, int force)
|
|
|
{
|
|
|
- struct task_group *tg = cfs_rq->tg;
|
|
|
- long contrib;
|
|
|
+ long delta = cfs_rq->avg.load_avg - cfs_rq->tg_load_avg_contrib;
|
|
|
|
|
|
- /* The fraction of a cpu used by this cfs_rq */
|
|
|
- contrib = div_u64((u64)sa->runnable_avg_sum << NICE_0_SHIFT,
|
|
|
- sa->avg_period + 1);
|
|
|
- contrib -= cfs_rq->tg_runnable_contrib;
|
|
|
-
|
|
|
- if (abs(contrib) > cfs_rq->tg_runnable_contrib / 64) {
|
|
|
- atomic_add(contrib, &tg->runnable_avg);
|
|
|
- cfs_rq->tg_runnable_contrib += contrib;
|
|
|
+ if (force || abs(delta) > cfs_rq->tg_load_avg_contrib / 64) {
|
|
|
+ atomic_long_add(delta, &cfs_rq->tg->load_avg);
|
|
|
+ cfs_rq->tg_load_avg_contrib = cfs_rq->avg.load_avg;
|
|
|
}
|
|
|
}
|
|
|
|
|
|
-static inline void __update_group_entity_contrib(struct sched_entity *se)
|
|
|
-{
|
|
|
- struct cfs_rq *cfs_rq = group_cfs_rq(se);
|
|
|
- struct task_group *tg = cfs_rq->tg;
|
|
|
- int runnable_avg;
|
|
|
-
|
|
|
- u64 contrib;
|
|
|
-
|
|
|
- contrib = cfs_rq->tg_load_contrib * tg->shares;
|
|
|
- se->avg.load_avg_contrib = div_u64(contrib,
|
|
|
- atomic_long_read(&tg->load_avg) + 1);
|
|
|
-
|
|
|
- /*
|
|
|
- * For group entities we need to compute a correction term in the case
|
|
|
- * that they are consuming <1 cpu so that we would contribute the same
|
|
|
- * load as a task of equal weight.
|
|
|
- *
|
|
|
- * Explicitly co-ordinating this measurement would be expensive, but
|
|
|
- * fortunately the sum of each cpus contribution forms a usable
|
|
|
- * lower-bound on the true value.
|
|
|
- *
|
|
|
- * Consider the aggregate of 2 contributions. Either they are disjoint
|
|
|
- * (and the sum represents true value) or they are disjoint and we are
|
|
|
- * understating by the aggregate of their overlap.
|
|
|
- *
|
|
|
- * Extending this to N cpus, for a given overlap, the maximum amount we
|
|
|
- * understand is then n_i(n_i+1)/2 * w_i where n_i is the number of
|
|
|
- * cpus that overlap for this interval and w_i is the interval width.
|
|
|
- *
|
|
|
- * On a small machine; the first term is well-bounded which bounds the
|
|
|
- * total error since w_i is a subset of the period. Whereas on a
|
|
|
- * larger machine, while this first term can be larger, if w_i is the
|
|
|
- * of consequential size guaranteed to see n_i*w_i quickly converge to
|
|
|
- * our upper bound of 1-cpu.
|
|
|
- */
|
|
|
- runnable_avg = atomic_read(&tg->runnable_avg);
|
|
|
- if (runnable_avg < NICE_0_LOAD) {
|
|
|
- se->avg.load_avg_contrib *= runnable_avg;
|
|
|
- se->avg.load_avg_contrib >>= NICE_0_SHIFT;
|
|
|
- }
|
|
|
-}
|
|
|
-
|
|
|
-static inline void update_rq_runnable_avg(struct rq *rq, int runnable)
|
|
|
-{
|
|
|
- __update_entity_runnable_avg(rq_clock_task(rq), cpu_of(rq), &rq->avg,
|
|
|
- runnable, runnable);
|
|
|
- __update_tg_runnable_avg(&rq->avg, &rq->cfs);
|
|
|
-}
|
|
|
#else /* CONFIG_FAIR_GROUP_SCHED */
|
|
|
-static inline void __update_cfs_rq_tg_load_contrib(struct cfs_rq *cfs_rq,
|
|
|
- int force_update) {}
|
|
|
-static inline void __update_tg_runnable_avg(struct sched_avg *sa,
|
|
|
- struct cfs_rq *cfs_rq) {}
|
|
|
-static inline void __update_group_entity_contrib(struct sched_entity *se) {}
|
|
|
-static inline void update_rq_runnable_avg(struct rq *rq, int runnable) {}
|
|
|
+static inline void update_tg_load_avg(struct cfs_rq *cfs_rq, int force) {}
|
|
|
#endif /* CONFIG_FAIR_GROUP_SCHED */
|
|
|
|
|
|
-static inline void __update_task_entity_contrib(struct sched_entity *se)
|
|
|
-{
|
|
|
- u32 contrib;
|
|
|
-
|
|
|
- /* avoid overflowing a 32-bit type w/ SCHED_LOAD_SCALE */
|
|
|
- contrib = se->avg.runnable_avg_sum * scale_load_down(se->load.weight);
|
|
|
- contrib /= (se->avg.avg_period + 1);
|
|
|
- se->avg.load_avg_contrib = scale_load(contrib);
|
|
|
-}
|
|
|
+static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq);
|
|
|
|
|
|
-/* Compute the current contribution to load_avg by se, return any delta */
|
|
|
-static long __update_entity_load_avg_contrib(struct sched_entity *se)
|
|
|
+/* Group cfs_rq's load_avg is used for task_h_load and update_cfs_share */
|
|
|
+static inline int update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq)
|
|
|
{
|
|
|
- long old_contrib = se->avg.load_avg_contrib;
|
|
|
+ int decayed;
|
|
|
+ struct sched_avg *sa = &cfs_rq->avg;
|
|
|
|
|
|
- if (entity_is_task(se)) {
|
|
|
- __update_task_entity_contrib(se);
|
|
|
- } else {
|
|
|
- __update_tg_runnable_avg(&se->avg, group_cfs_rq(se));
|
|
|
- __update_group_entity_contrib(se);
|
|
|
+ if (atomic_long_read(&cfs_rq->removed_load_avg)) {
|
|
|
+ long r = atomic_long_xchg(&cfs_rq->removed_load_avg, 0);
|
|
|
+ sa->load_avg = max_t(long, sa->load_avg - r, 0);
|
|
|
+ sa->load_sum = max_t(s64, sa->load_sum - r * LOAD_AVG_MAX, 0);
|
|
|
}
|
|
|
|
|
|
- return se->avg.load_avg_contrib - old_contrib;
|
|
|
-}
|
|
|
-
|
|
|
-
|
|
|
-static inline void __update_task_entity_utilization(struct sched_entity *se)
|
|
|
-{
|
|
|
- u32 contrib;
|
|
|
+ if (atomic_long_read(&cfs_rq->removed_util_avg)) {
|
|
|
+ long r = atomic_long_xchg(&cfs_rq->removed_util_avg, 0);
|
|
|
+ sa->util_avg = max_t(long, sa->util_avg - r, 0);
|
|
|
+ sa->util_sum = max_t(s32, sa->util_sum -
|
|
|
+ ((r * LOAD_AVG_MAX) >> SCHED_LOAD_SHIFT), 0);
|
|
|
+ }
|
|
|
|
|
|
- /* avoid overflowing a 32-bit type w/ SCHED_LOAD_SCALE */
|
|
|
- contrib = se->avg.running_avg_sum * scale_load_down(SCHED_LOAD_SCALE);
|
|
|
- contrib /= (se->avg.avg_period + 1);
|
|
|
- se->avg.utilization_avg_contrib = scale_load(contrib);
|
|
|
-}
|
|
|
+ decayed = __update_load_avg(now, cpu_of(rq_of(cfs_rq)), sa,
|
|
|
+ scale_load_down(cfs_rq->load.weight), cfs_rq->curr != NULL, cfs_rq);
|
|
|
|
|
|
-static long __update_entity_utilization_avg_contrib(struct sched_entity *se)
|
|
|
-{
|
|
|
- long old_contrib = se->avg.utilization_avg_contrib;
|
|
|
-
|
|
|
- if (entity_is_task(se))
|
|
|
- __update_task_entity_utilization(se);
|
|
|
- else
|
|
|
- se->avg.utilization_avg_contrib =
|
|
|
- group_cfs_rq(se)->utilization_load_avg;
|
|
|
+#ifndef CONFIG_64BIT
|
|
|
+ smp_wmb();
|
|
|
+ cfs_rq->load_last_update_time_copy = sa->last_update_time;
|
|
|
+#endif
|
|
|
|
|
|
- return se->avg.utilization_avg_contrib - old_contrib;
|
|
|
+ return decayed;
|
|
|
}
|
|
|
|
|
|
-static inline void subtract_blocked_load_contrib(struct cfs_rq *cfs_rq,
|
|
|
- long load_contrib)
|
|
|
-{
|
|
|
- if (likely(load_contrib < cfs_rq->blocked_load_avg))
|
|
|
- cfs_rq->blocked_load_avg -= load_contrib;
|
|
|
- else
|
|
|
- cfs_rq->blocked_load_avg = 0;
|
|
|
-}
|
|
|
-
|
|
|
-static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq);
|
|
|
-
|
|
|
-/* Update a sched_entity's runnable average */
|
|
|
-static inline void update_entity_load_avg(struct sched_entity *se,
|
|
|
- int update_cfs_rq)
|
|
|
+/* Update task and its cfs_rq load average */
|
|
|
+static inline void update_load_avg(struct sched_entity *se, int update_tg)
|
|
|
{
|
|
|
struct cfs_rq *cfs_rq = cfs_rq_of(se);
|
|
|
- long contrib_delta, utilization_delta;
|
|
|
int cpu = cpu_of(rq_of(cfs_rq));
|
|
|
- u64 now;
|
|
|
+ u64 now = cfs_rq_clock_task(cfs_rq);
|
|
|
|
|
|
/*
|
|
|
- * For a group entity we need to use their owned cfs_rq_clock_task() in
|
|
|
- * case they are the parent of a throttled hierarchy.
|
|
|
+ * Track task load average for carrying it to new CPU after migrated, and
|
|
|
+ * track group sched_entity load average for task_h_load calc in migration
|
|
|
*/
|
|
|
- if (entity_is_task(se))
|
|
|
- now = cfs_rq_clock_task(cfs_rq);
|
|
|
- else
|
|
|
- now = cfs_rq_clock_task(group_cfs_rq(se));
|
|
|
-
|
|
|
- if (!__update_entity_runnable_avg(now, cpu, &se->avg, se->on_rq,
|
|
|
- cfs_rq->curr == se))
|
|
|
- return;
|
|
|
-
|
|
|
- contrib_delta = __update_entity_load_avg_contrib(se);
|
|
|
- utilization_delta = __update_entity_utilization_avg_contrib(se);
|
|
|
-
|
|
|
- if (!update_cfs_rq)
|
|
|
- return;
|
|
|
+ __update_load_avg(now, cpu, &se->avg,
|
|
|
+ se->on_rq * scale_load_down(se->load.weight), cfs_rq->curr == se, NULL);
|
|
|
|
|
|
- if (se->on_rq) {
|
|
|
- cfs_rq->runnable_load_avg += contrib_delta;
|
|
|
- cfs_rq->utilization_load_avg += utilization_delta;
|
|
|
- } else {
|
|
|
- subtract_blocked_load_contrib(cfs_rq, -contrib_delta);
|
|
|
- }
|
|
|
+ if (update_cfs_rq_load_avg(now, cfs_rq) && update_tg)
|
|
|
+ update_tg_load_avg(cfs_rq, 0);
|
|
|
}
|
|
|
|
|
|
-/*
|
|
|
- * Decay the load contributed by all blocked children and account this so that
|
|
|
- * their contribution may appropriately discounted when they wake up.
|
|
|
- */
|
|
|
-static void update_cfs_rq_blocked_load(struct cfs_rq *cfs_rq, int force_update)
|
|
|
+/* Add the load generated by se into cfs_rq's load average */
|
|
|
+static inline void
|
|
|
+enqueue_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
|
|
|
{
|
|
|
- u64 now = cfs_rq_clock_task(cfs_rq) >> 20;
|
|
|
- u64 decays;
|
|
|
-
|
|
|
- decays = now - cfs_rq->last_decay;
|
|
|
- if (!decays && !force_update)
|
|
|
- return;
|
|
|
+ struct sched_avg *sa = &se->avg;
|
|
|
+ u64 now = cfs_rq_clock_task(cfs_rq);
|
|
|
+ int migrated = 0, decayed;
|
|
|
|
|
|
- if (atomic_long_read(&cfs_rq->removed_load)) {
|
|
|
- unsigned long removed_load;
|
|
|
- removed_load = atomic_long_xchg(&cfs_rq->removed_load, 0);
|
|
|
- subtract_blocked_load_contrib(cfs_rq, removed_load);
|
|
|
+ if (sa->last_update_time == 0) {
|
|
|
+ sa->last_update_time = now;
|
|
|
+ migrated = 1;
|
|
|
}
|
|
|
+ else {
|
|
|
+ __update_load_avg(now, cpu_of(rq_of(cfs_rq)), sa,
|
|
|
+ se->on_rq * scale_load_down(se->load.weight),
|
|
|
+ cfs_rq->curr == se, NULL);
|
|
|
+ }
|
|
|
+
|
|
|
+ decayed = update_cfs_rq_load_avg(now, cfs_rq);
|
|
|
|
|
|
- if (decays) {
|
|
|
- cfs_rq->blocked_load_avg = decay_load(cfs_rq->blocked_load_avg,
|
|
|
- decays);
|
|
|
- atomic64_add(decays, &cfs_rq->decay_counter);
|
|
|
- cfs_rq->last_decay = now;
|
|
|
+ cfs_rq->runnable_load_avg += sa->load_avg;
|
|
|
+ cfs_rq->runnable_load_sum += sa->load_sum;
|
|
|
+
|
|
|
+ if (migrated) {
|
|
|
+ cfs_rq->avg.load_avg += sa->load_avg;
|
|
|
+ cfs_rq->avg.load_sum += sa->load_sum;
|
|
|
+ cfs_rq->avg.util_avg += sa->util_avg;
|
|
|
+ cfs_rq->avg.util_sum += sa->util_sum;
|
|
|
}
|
|
|
|
|
|
- __update_cfs_rq_tg_load_contrib(cfs_rq, force_update);
|
|
|
+ if (decayed || migrated)
|
|
|
+ update_tg_load_avg(cfs_rq, 0);
|
|
|
}
|
|
|
|
|
|
-/* Add the load generated by se into cfs_rq's child load-average */
|
|
|
-static inline void enqueue_entity_load_avg(struct cfs_rq *cfs_rq,
|
|
|
- struct sched_entity *se,
|
|
|
- int wakeup)
|
|
|
+/* Remove the runnable load generated by se from cfs_rq's runnable load average */
|
|
|
+static inline void
|
|
|
+dequeue_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
|
|
|
{
|
|
|
- /*
|
|
|
- * We track migrations using entity decay_count <= 0, on a wake-up
|
|
|
- * migration we use a negative decay count to track the remote decays
|
|
|
- * accumulated while sleeping.
|
|
|
- *
|
|
|
- * Newly forked tasks are enqueued with se->avg.decay_count == 0, they
|
|
|
- * are seen by enqueue_entity_load_avg() as a migration with an already
|
|
|
- * constructed load_avg_contrib.
|
|
|
- */
|
|
|
- if (unlikely(se->avg.decay_count <= 0)) {
|
|
|
- se->avg.last_runnable_update = rq_clock_task(rq_of(cfs_rq));
|
|
|
- if (se->avg.decay_count) {
|
|
|
- /*
|
|
|
- * In a wake-up migration we have to approximate the
|
|
|
- * time sleeping. This is because we can't synchronize
|
|
|
- * clock_task between the two cpus, and it is not
|
|
|
- * guaranteed to be read-safe. Instead, we can
|
|
|
- * approximate this using our carried decays, which are
|
|
|
- * explicitly atomically readable.
|
|
|
- */
|
|
|
- se->avg.last_runnable_update -= (-se->avg.decay_count)
|
|
|
- << 20;
|
|
|
- update_entity_load_avg(se, 0);
|
|
|
- /* Indicate that we're now synchronized and on-rq */
|
|
|
- se->avg.decay_count = 0;
|
|
|
- }
|
|
|
- wakeup = 0;
|
|
|
- } else {
|
|
|
- __synchronize_entity_decay(se);
|
|
|
- }
|
|
|
+ update_load_avg(se, 1);
|
|
|
|
|
|
- /* migrated tasks did not contribute to our blocked load */
|
|
|
- if (wakeup) {
|
|
|
- subtract_blocked_load_contrib(cfs_rq, se->avg.load_avg_contrib);
|
|
|
- update_entity_load_avg(se, 0);
|
|
|
- }
|
|
|
-
|
|
|
- cfs_rq->runnable_load_avg += se->avg.load_avg_contrib;
|
|
|
- cfs_rq->utilization_load_avg += se->avg.utilization_avg_contrib;
|
|
|
- /* we force update consideration on load-balancer moves */
|
|
|
- update_cfs_rq_blocked_load(cfs_rq, !wakeup);
|
|
|
+ cfs_rq->runnable_load_avg =
|
|
|
+ max_t(long, cfs_rq->runnable_load_avg - se->avg.load_avg, 0);
|
|
|
+ cfs_rq->runnable_load_sum =
|
|
|
+ max_t(s64, cfs_rq->runnable_load_sum - se->avg.load_sum, 0);
|
|
|
}
|
|
|
|
|
|
/*
|
|
|
- * Remove se's load from this cfs_rq child load-average, if the entity is
|
|
|
- * transitioning to a blocked state we track its projected decay using
|
|
|
- * blocked_load_avg.
|
|
|
+ * Task first catches up with cfs_rq, and then subtract
|
|
|
+ * itself from the cfs_rq (task must be off the queue now).
|
|
|
*/
|
|
|
-static inline void dequeue_entity_load_avg(struct cfs_rq *cfs_rq,
|
|
|
- struct sched_entity *se,
|
|
|
- int sleep)
|
|
|
+void remove_entity_load_avg(struct sched_entity *se)
|
|
|
{
|
|
|
- update_entity_load_avg(se, 1);
|
|
|
- /* we force update consideration on load-balancer moves */
|
|
|
- update_cfs_rq_blocked_load(cfs_rq, !sleep);
|
|
|
+ struct cfs_rq *cfs_rq = cfs_rq_of(se);
|
|
|
+ u64 last_update_time;
|
|
|
+
|
|
|
+#ifndef CONFIG_64BIT
|
|
|
+ u64 last_update_time_copy;
|
|
|
|
|
|
- cfs_rq->runnable_load_avg -= se->avg.load_avg_contrib;
|
|
|
- cfs_rq->utilization_load_avg -= se->avg.utilization_avg_contrib;
|
|
|
- if (sleep) {
|
|
|
- cfs_rq->blocked_load_avg += se->avg.load_avg_contrib;
|
|
|
- se->avg.decay_count = atomic64_read(&cfs_rq->decay_counter);
|
|
|
- } /* migrations, e.g. sleep=0 leave decay_count == 0 */
|
|
|
+ do {
|
|
|
+ last_update_time_copy = cfs_rq->load_last_update_time_copy;
|
|
|
+ smp_rmb();
|
|
|
+ last_update_time = cfs_rq->avg.last_update_time;
|
|
|
+ } while (last_update_time != last_update_time_copy);
|
|
|
+#else
|
|
|
+ last_update_time = cfs_rq->avg.last_update_time;
|
|
|
+#endif
|
|
|
+
|
|
|
+ __update_load_avg(last_update_time, cpu_of(rq_of(cfs_rq)), &se->avg, 0, 0, NULL);
|
|
|
+ atomic_long_add(se->avg.load_avg, &cfs_rq->removed_load_avg);
|
|
|
+ atomic_long_add(se->avg.util_avg, &cfs_rq->removed_util_avg);
|
|
|
}
|
|
|
|
|
|
/*
|
|
@@ -2944,7 +2788,6 @@ static inline void dequeue_entity_load_avg(struct cfs_rq *cfs_rq,
|
|
|
*/
|
|
|
void idle_enter_fair(struct rq *this_rq)
|
|
|
{
|
|
|
- update_rq_runnable_avg(this_rq, 1);
|
|
|
}
|
|
|
|
|
|
/*
|
|
@@ -2954,24 +2797,28 @@ void idle_enter_fair(struct rq *this_rq)
|
|
|
*/
|
|
|
void idle_exit_fair(struct rq *this_rq)
|
|
|
{
|
|
|
- update_rq_runnable_avg(this_rq, 0);
|
|
|
+}
|
|
|
+
|
|
|
+static inline unsigned long cfs_rq_runnable_load_avg(struct cfs_rq *cfs_rq)
|
|
|
+{
|
|
|
+ return cfs_rq->runnable_load_avg;
|
|
|
+}
|
|
|
+
|
|
|
+static inline unsigned long cfs_rq_load_avg(struct cfs_rq *cfs_rq)
|
|
|
+{
|
|
|
+ return cfs_rq->avg.load_avg;
|
|
|
}
|
|
|
|
|
|
static int idle_balance(struct rq *this_rq);
|
|
|
|
|
|
#else /* CONFIG_SMP */
|
|
|
|
|
|
-static inline void update_entity_load_avg(struct sched_entity *se,
|
|
|
- int update_cfs_rq) {}
|
|
|
-static inline void update_rq_runnable_avg(struct rq *rq, int runnable) {}
|
|
|
-static inline void enqueue_entity_load_avg(struct cfs_rq *cfs_rq,
|
|
|
- struct sched_entity *se,
|
|
|
- int wakeup) {}
|
|
|
-static inline void dequeue_entity_load_avg(struct cfs_rq *cfs_rq,
|
|
|
- struct sched_entity *se,
|
|
|
- int sleep) {}
|
|
|
-static inline void update_cfs_rq_blocked_load(struct cfs_rq *cfs_rq,
|
|
|
- int force_update) {}
|
|
|
+static inline void update_load_avg(struct sched_entity *se, int update_tg) {}
|
|
|
+static inline void
|
|
|
+enqueue_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) {}
|
|
|
+static inline void
|
|
|
+dequeue_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) {}
|
|
|
+static inline void remove_entity_load_avg(struct sched_entity *se) {}
|
|
|
|
|
|
static inline int idle_balance(struct rq *rq)
|
|
|
{
|
|
@@ -3103,7 +2950,7 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
|
|
|
* Update run-time statistics of the 'current'.
|
|
|
*/
|
|
|
update_curr(cfs_rq);
|
|
|
- enqueue_entity_load_avg(cfs_rq, se, flags & ENQUEUE_WAKEUP);
|
|
|
+ enqueue_entity_load_avg(cfs_rq, se);
|
|
|
account_entity_enqueue(cfs_rq, se);
|
|
|
update_cfs_shares(cfs_rq);
|
|
|
|
|
@@ -3178,7 +3025,7 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
|
|
|
* Update run-time statistics of the 'current'.
|
|
|
*/
|
|
|
update_curr(cfs_rq);
|
|
|
- dequeue_entity_load_avg(cfs_rq, se, flags & DEQUEUE_SLEEP);
|
|
|
+ dequeue_entity_load_avg(cfs_rq, se);
|
|
|
|
|
|
update_stats_dequeue(cfs_rq, se);
|
|
|
if (flags & DEQUEUE_SLEEP) {
|
|
@@ -3268,7 +3115,7 @@ set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
|
|
|
*/
|
|
|
update_stats_wait_end(cfs_rq, se);
|
|
|
__dequeue_entity(cfs_rq, se);
|
|
|
- update_entity_load_avg(se, 1);
|
|
|
+ update_load_avg(se, 1);
|
|
|
}
|
|
|
|
|
|
update_stats_curr_start(cfs_rq, se);
|
|
@@ -3368,7 +3215,7 @@ static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev)
|
|
|
/* Put 'current' back into the tree. */
|
|
|
__enqueue_entity(cfs_rq, prev);
|
|
|
/* in !on_rq case, update occurred at dequeue */
|
|
|
- update_entity_load_avg(prev, 1);
|
|
|
+ update_load_avg(prev, 0);
|
|
|
}
|
|
|
cfs_rq->curr = NULL;
|
|
|
}
|
|
@@ -3384,8 +3231,7 @@ entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued)
|
|
|
/*
|
|
|
* Ensure that runnable average is periodically updated.
|
|
|
*/
|
|
|
- update_entity_load_avg(curr, 1);
|
|
|
- update_cfs_rq_blocked_load(cfs_rq, 1);
|
|
|
+ update_load_avg(curr, 1);
|
|
|
update_cfs_shares(cfs_rq);
|
|
|
|
|
|
#ifdef CONFIG_SCHED_HRTICK
|
|
@@ -4258,14 +4104,13 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
|
|
|
if (cfs_rq_throttled(cfs_rq))
|
|
|
break;
|
|
|
|
|
|
+ update_load_avg(se, 1);
|
|
|
update_cfs_shares(cfs_rq);
|
|
|
- update_entity_load_avg(se, 1);
|
|
|
}
|
|
|
|
|
|
- if (!se) {
|
|
|
- update_rq_runnable_avg(rq, rq->nr_running);
|
|
|
+ if (!se)
|
|
|
add_nr_running(rq, 1);
|
|
|
- }
|
|
|
+
|
|
|
hrtick_update(rq);
|
|
|
}
|
|
|
|
|
@@ -4319,14 +4164,13 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
|
|
|
if (cfs_rq_throttled(cfs_rq))
|
|
|
break;
|
|
|
|
|
|
+ update_load_avg(se, 1);
|
|
|
update_cfs_shares(cfs_rq);
|
|
|
- update_entity_load_avg(se, 1);
|
|
|
}
|
|
|
|
|
|
- if (!se) {
|
|
|
+ if (!se)
|
|
|
sub_nr_running(rq, 1);
|
|
|
- update_rq_runnable_avg(rq, 1);
|
|
|
- }
|
|
|
+
|
|
|
hrtick_update(rq);
|
|
|
}
|
|
|
|
|
@@ -4439,6 +4283,12 @@ static void __update_cpu_load(struct rq *this_rq, unsigned long this_load,
|
|
|
sched_avg_update(this_rq);
|
|
|
}
|
|
|
|
|
|
+/* Used instead of source_load when we know the type == 0 */
|
|
|
+static unsigned long weighted_cpuload(const int cpu)
|
|
|
+{
|
|
|
+ return cfs_rq_runnable_load_avg(&cpu_rq(cpu)->cfs);
|
|
|
+}
|
|
|
+
|
|
|
#ifdef CONFIG_NO_HZ_COMMON
|
|
|
/*
|
|
|
* There is no sane way to deal with nohz on smp when using jiffies because the
|
|
@@ -4460,7 +4310,7 @@ static void __update_cpu_load(struct rq *this_rq, unsigned long this_load,
|
|
|
static void update_idle_cpu_load(struct rq *this_rq)
|
|
|
{
|
|
|
unsigned long curr_jiffies = READ_ONCE(jiffies);
|
|
|
- unsigned long load = this_rq->cfs.runnable_load_avg;
|
|
|
+ unsigned long load = weighted_cpuload(cpu_of(this_rq));
|
|
|
unsigned long pending_updates;
|
|
|
|
|
|
/*
|
|
@@ -4506,7 +4356,7 @@ void update_cpu_load_nohz(void)
|
|
|
*/
|
|
|
void update_cpu_load_active(struct rq *this_rq)
|
|
|
{
|
|
|
- unsigned long load = this_rq->cfs.runnable_load_avg;
|
|
|
+ unsigned long load = weighted_cpuload(cpu_of(this_rq));
|
|
|
/*
|
|
|
* See the mess around update_idle_cpu_load() / update_cpu_load_nohz().
|
|
|
*/
|
|
@@ -4514,12 +4364,6 @@ void update_cpu_load_active(struct rq *this_rq)
|
|
|
__update_cpu_load(this_rq, load, 1);
|
|
|
}
|
|
|
|
|
|
-/* Used instead of source_load when we know the type == 0 */
|
|
|
-static unsigned long weighted_cpuload(const int cpu)
|
|
|
-{
|
|
|
- return cpu_rq(cpu)->cfs.runnable_load_avg;
|
|
|
-}
|
|
|
-
|
|
|
/*
|
|
|
* Return a low guess at the load of a migration-source cpu weighted
|
|
|
* according to the scheduling class and "nice" value.
|
|
@@ -4567,7 +4411,7 @@ static unsigned long cpu_avg_load_per_task(int cpu)
|
|
|
{
|
|
|
struct rq *rq = cpu_rq(cpu);
|
|
|
unsigned long nr_running = READ_ONCE(rq->cfs.h_nr_running);
|
|
|
- unsigned long load_avg = rq->cfs.runnable_load_avg;
|
|
|
+ unsigned long load_avg = weighted_cpuload(cpu);
|
|
|
|
|
|
if (nr_running)
|
|
|
return load_avg / nr_running;
|
|
@@ -4686,7 +4530,7 @@ static long effective_load(struct task_group *tg, int cpu, long wl, long wg)
|
|
|
/*
|
|
|
* w = rw_i + @wl
|
|
|
*/
|
|
|
- w = se->my_q->load.weight + wl;
|
|
|
+ w = cfs_rq_load_avg(se->my_q) + wl;
|
|
|
|
|
|
/*
|
|
|
* wl = S * s'_i; see (2)
|
|
@@ -4707,7 +4551,7 @@ static long effective_load(struct task_group *tg, int cpu, long wl, long wg)
|
|
|
/*
|
|
|
* wl = dw_i = S * (s'_i - s_i); see (3)
|
|
|
*/
|
|
|
- wl -= se->load.weight;
|
|
|
+ wl -= se->avg.load_avg;
|
|
|
|
|
|
/*
|
|
|
* Recursively apply this logic to all parent groups to compute
|
|
@@ -4730,26 +4574,29 @@ static long effective_load(struct task_group *tg, int cpu, long wl, long wg)
|
|
|
|
|
|
#endif
|
|
|
|
|
|
+/*
|
|
|
+ * Detect M:N waker/wakee relationships via a switching-frequency heuristic.
|
|
|
+ * A waker of many should wake a different task than the one last awakened
|
|
|
+ * at a frequency roughly N times higher than one of its wakees. In order
|
|
|
+ * to determine whether we should let the load spread vs consolidating to
|
|
|
+ * shared cache, we look for a minimum 'flip' frequency of llc_size in one
|
|
|
+ * partner, and a factor of llc_size higher frequency in the other. With
|
|
|
+ * both conditions met, we can be relatively sure that the relationship is
|
|
|
+ * non-monogamous, with partner count exceeding socket size. Waker/wakee
|
|
|
+ * being client/server, worker/dispatcher, interrupt source or whatever is
|
|
|
+ * irrelevant, spread criteria is apparent partner count exceeds socket size.
|
|
|
+ */
|
|
|
static int wake_wide(struct task_struct *p)
|
|
|
{
|
|
|
+ unsigned int master = current->wakee_flips;
|
|
|
+ unsigned int slave = p->wakee_flips;
|
|
|
int factor = this_cpu_read(sd_llc_size);
|
|
|
|
|
|
- /*
|
|
|
- * Yeah, it's the switching-frequency, could means many wakee or
|
|
|
- * rapidly switch, use factor here will just help to automatically
|
|
|
- * adjust the loose-degree, so bigger node will lead to more pull.
|
|
|
- */
|
|
|
- if (p->wakee_flips > factor) {
|
|
|
- /*
|
|
|
- * wakee is somewhat hot, it needs certain amount of cpu
|
|
|
- * resource, so if waker is far more hot, prefer to leave
|
|
|
- * it alone.
|
|
|
- */
|
|
|
- if (current->wakee_flips > (factor * p->wakee_flips))
|
|
|
- return 1;
|
|
|
- }
|
|
|
-
|
|
|
- return 0;
|
|
|
+ if (master < slave)
|
|
|
+ swap(master, slave);
|
|
|
+ if (slave < factor || master < slave * factor)
|
|
|
+ return 0;
|
|
|
+ return 1;
|
|
|
}
|
|
|
|
|
|
static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync)
|
|
@@ -4761,13 +4608,6 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync)
|
|
|
unsigned long weight;
|
|
|
int balanced;
|
|
|
|
|
|
- /*
|
|
|
- * If we wake multiple tasks be careful to not bounce
|
|
|
- * ourselves around too much.
|
|
|
- */
|
|
|
- if (wake_wide(p))
|
|
|
- return 0;
|
|
|
-
|
|
|
idx = sd->wake_idx;
|
|
|
this_cpu = smp_processor_id();
|
|
|
prev_cpu = task_cpu(p);
|
|
@@ -4781,14 +4621,14 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync)
|
|
|
*/
|
|
|
if (sync) {
|
|
|
tg = task_group(current);
|
|
|
- weight = current->se.load.weight;
|
|
|
+ weight = current->se.avg.load_avg;
|
|
|
|
|
|
this_load += effective_load(tg, this_cpu, -weight, -weight);
|
|
|
load += effective_load(tg, prev_cpu, 0, -weight);
|
|
|
}
|
|
|
|
|
|
tg = task_group(p);
|
|
|
- weight = p->se.load.weight;
|
|
|
+ weight = p->se.avg.load_avg;
|
|
|
|
|
|
/*
|
|
|
* In low-load situations, where prev_cpu is idle and this_cpu is idle
|
|
@@ -4981,12 +4821,12 @@ done:
|
|
|
* tasks. The unit of the return value must be the one of capacity so we can
|
|
|
* compare the usage with the capacity of the CPU that is available for CFS
|
|
|
* task (ie cpu_capacity).
|
|
|
- * cfs.utilization_load_avg is the sum of running time of runnable tasks on a
|
|
|
+ * cfs.avg.util_avg is the sum of running time of runnable tasks on a
|
|
|
* CPU. It represents the amount of utilization of a CPU in the range
|
|
|
* [0..SCHED_LOAD_SCALE]. The usage of a CPU can't be higher than the full
|
|
|
* capacity of the CPU because it's about the running time on this CPU.
|
|
|
- * Nevertheless, cfs.utilization_load_avg can be higher than SCHED_LOAD_SCALE
|
|
|
- * because of unfortunate rounding in avg_period and running_load_avg or just
|
|
|
+ * Nevertheless, cfs.avg.util_avg can be higher than SCHED_LOAD_SCALE
|
|
|
+ * because of unfortunate rounding in util_avg or just
|
|
|
* after migrating tasks until the average stabilizes with the new running
|
|
|
* time. So we need to check that the usage stays into the range
|
|
|
* [0..cpu_capacity_orig] and cap if necessary.
|
|
@@ -4995,7 +4835,7 @@ done:
|
|
|
*/
|
|
|
static int get_cpu_usage(int cpu)
|
|
|
{
|
|
|
- unsigned long usage = cpu_rq(cpu)->cfs.utilization_load_avg;
|
|
|
+ unsigned long usage = cpu_rq(cpu)->cfs.avg.util_avg;
|
|
|
unsigned long capacity = capacity_orig_of(cpu);
|
|
|
|
|
|
if (usage >= SCHED_LOAD_SCALE)
|
|
@@ -5021,17 +4861,17 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f
|
|
|
{
|
|
|
struct sched_domain *tmp, *affine_sd = NULL, *sd = NULL;
|
|
|
int cpu = smp_processor_id();
|
|
|
- int new_cpu = cpu;
|
|
|
+ int new_cpu = prev_cpu;
|
|
|
int want_affine = 0;
|
|
|
int sync = wake_flags & WF_SYNC;
|
|
|
|
|
|
if (sd_flag & SD_BALANCE_WAKE)
|
|
|
- want_affine = cpumask_test_cpu(cpu, tsk_cpus_allowed(p));
|
|
|
+ want_affine = !wake_wide(p) && cpumask_test_cpu(cpu, tsk_cpus_allowed(p));
|
|
|
|
|
|
rcu_read_lock();
|
|
|
for_each_domain(cpu, tmp) {
|
|
|
if (!(tmp->flags & SD_LOAD_BALANCE))
|
|
|
- continue;
|
|
|
+ break;
|
|
|
|
|
|
/*
|
|
|
* If both cpu and prev_cpu are part of this domain,
|
|
@@ -5045,17 +4885,21 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f
|
|
|
|
|
|
if (tmp->flags & sd_flag)
|
|
|
sd = tmp;
|
|
|
+ else if (!want_affine)
|
|
|
+ break;
|
|
|
}
|
|
|
|
|
|
- if (affine_sd && cpu != prev_cpu && wake_affine(affine_sd, p, sync))
|
|
|
- prev_cpu = cpu;
|
|
|
-
|
|
|
- if (sd_flag & SD_BALANCE_WAKE) {
|
|
|
- new_cpu = select_idle_sibling(p, prev_cpu);
|
|
|
- goto unlock;
|
|
|
+ if (affine_sd) {
|
|
|
+ sd = NULL; /* Prefer wake_affine over balance flags */
|
|
|
+ if (cpu != prev_cpu && wake_affine(affine_sd, p, sync))
|
|
|
+ new_cpu = cpu;
|
|
|
}
|
|
|
|
|
|
- while (sd) {
|
|
|
+ if (!sd) {
|
|
|
+ if (sd_flag & SD_BALANCE_WAKE) /* XXX always ? */
|
|
|
+ new_cpu = select_idle_sibling(p, new_cpu);
|
|
|
+
|
|
|
+ } else while (sd) {
|
|
|
struct sched_group *group;
|
|
|
int weight;
|
|
|
|
|
@@ -5089,7 +4933,6 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f
|
|
|
}
|
|
|
/* while loop will break here if sd == NULL */
|
|
|
}
|
|
|
-unlock:
|
|
|
rcu_read_unlock();
|
|
|
|
|
|
return new_cpu;
|
|
@@ -5101,26 +4944,27 @@ unlock:
|
|
|
* previous cpu. However, the caller only guarantees p->pi_lock is held; no
|
|
|
* other assumptions, including the state of rq->lock, should be made.
|
|
|
*/
|
|
|
-static void
|
|
|
-migrate_task_rq_fair(struct task_struct *p, int next_cpu)
|
|
|
+static void migrate_task_rq_fair(struct task_struct *p, int next_cpu)
|
|
|
{
|
|
|
- struct sched_entity *se = &p->se;
|
|
|
- struct cfs_rq *cfs_rq = cfs_rq_of(se);
|
|
|
-
|
|
|
/*
|
|
|
- * Load tracking: accumulate removed load so that it can be processed
|
|
|
- * when we next update owning cfs_rq under rq->lock. Tasks contribute
|
|
|
- * to blocked load iff they have a positive decay-count. It can never
|
|
|
- * be negative here since on-rq tasks have decay-count == 0.
|
|
|
+ * We are supposed to update the task to "current" time, then it's up to date
|
|
|
+ * and ready to go to new CPU/cfs_rq. But we have difficulty in getting
|
|
|
+ * what current time is, so simply throw away the out-of-date time. This
|
|
|
+ * will result in the wakee task is less decayed, but giving the wakee more
|
|
|
+ * load sounds not bad.
|
|
|
*/
|
|
|
- if (se->avg.decay_count) {
|
|
|
- se->avg.decay_count = -__synchronize_entity_decay(se);
|
|
|
- atomic_long_add(se->avg.load_avg_contrib,
|
|
|
- &cfs_rq->removed_load);
|
|
|
- }
|
|
|
+ remove_entity_load_avg(&p->se);
|
|
|
+
|
|
|
+ /* Tell new CPU we are migrated */
|
|
|
+ p->se.avg.last_update_time = 0;
|
|
|
|
|
|
/* We have migrated, no longer consider this task hot */
|
|
|
- se->exec_start = 0;
|
|
|
+ p->se.exec_start = 0;
|
|
|
+}
|
|
|
+
|
|
|
+static void task_dead_fair(struct task_struct *p)
|
|
|
+{
|
|
|
+ remove_entity_load_avg(&p->se);
|
|
|
}
|
|
|
#endif /* CONFIG_SMP */
|
|
|
|
|
@@ -5670,72 +5514,39 @@ static int task_hot(struct task_struct *p, struct lb_env *env)
|
|
|
|
|
|
#ifdef CONFIG_NUMA_BALANCING
|
|
|
/*
|
|
|
- * Returns true if the destination node is the preferred node.
|
|
|
- * Needs to match fbq_classify_rq(): if there is a runnable task
|
|
|
- * that is not on its preferred node, we should identify it.
|
|
|
+ * Returns 1, if task migration degrades locality
|
|
|
+ * Returns 0, if task migration improves locality i.e migration preferred.
|
|
|
+ * Returns -1, if task migration is not affected by locality.
|
|
|
*/
|
|
|
-static bool migrate_improves_locality(struct task_struct *p, struct lb_env *env)
|
|
|
+static int migrate_degrades_locality(struct task_struct *p, struct lb_env *env)
|
|
|
{
|
|
|
struct numa_group *numa_group = rcu_dereference(p->numa_group);
|
|
|
unsigned long src_faults, dst_faults;
|
|
|
int src_nid, dst_nid;
|
|
|
|
|
|
- if (!sched_feat(NUMA_FAVOUR_HIGHER) || !p->numa_faults ||
|
|
|
- !(env->sd->flags & SD_NUMA)) {
|
|
|
- return false;
|
|
|
- }
|
|
|
-
|
|
|
- src_nid = cpu_to_node(env->src_cpu);
|
|
|
- dst_nid = cpu_to_node(env->dst_cpu);
|
|
|
-
|
|
|
- if (src_nid == dst_nid)
|
|
|
- return false;
|
|
|
-
|
|
|
- /* Encourage migration to the preferred node. */
|
|
|
- if (dst_nid == p->numa_preferred_nid)
|
|
|
- return true;
|
|
|
-
|
|
|
- /* Migrating away from the preferred node is bad. */
|
|
|
- if (src_nid == p->numa_preferred_nid)
|
|
|
- return false;
|
|
|
-
|
|
|
- if (numa_group) {
|
|
|
- src_faults = group_faults(p, src_nid);
|
|
|
- dst_faults = group_faults(p, dst_nid);
|
|
|
- } else {
|
|
|
- src_faults = task_faults(p, src_nid);
|
|
|
- dst_faults = task_faults(p, dst_nid);
|
|
|
- }
|
|
|
-
|
|
|
- return dst_faults > src_faults;
|
|
|
-}
|
|
|
-
|
|
|
-
|
|
|
-static bool migrate_degrades_locality(struct task_struct *p, struct lb_env *env)
|
|
|
-{
|
|
|
- struct numa_group *numa_group = rcu_dereference(p->numa_group);
|
|
|
- unsigned long src_faults, dst_faults;
|
|
|
- int src_nid, dst_nid;
|
|
|
-
|
|
|
- if (!sched_feat(NUMA) || !sched_feat(NUMA_RESIST_LOWER))
|
|
|
- return false;
|
|
|
-
|
|
|
if (!p->numa_faults || !(env->sd->flags & SD_NUMA))
|
|
|
- return false;
|
|
|
+ return -1;
|
|
|
+
|
|
|
+ if (!sched_feat(NUMA))
|
|
|
+ return -1;
|
|
|
|
|
|
src_nid = cpu_to_node(env->src_cpu);
|
|
|
dst_nid = cpu_to_node(env->dst_cpu);
|
|
|
|
|
|
if (src_nid == dst_nid)
|
|
|
- return false;
|
|
|
+ return -1;
|
|
|
|
|
|
- /* Migrating away from the preferred node is bad. */
|
|
|
- if (src_nid == p->numa_preferred_nid)
|
|
|
- return true;
|
|
|
+ /* Migrating away from the preferred node is always bad. */
|
|
|
+ if (src_nid == p->numa_preferred_nid) {
|
|
|
+ if (env->src_rq->nr_running > env->src_rq->nr_preferred_running)
|
|
|
+ return 1;
|
|
|
+ else
|
|
|
+ return -1;
|
|
|
+ }
|
|
|
|
|
|
/* Encourage migration to the preferred node. */
|
|
|
if (dst_nid == p->numa_preferred_nid)
|
|
|
- return false;
|
|
|
+ return 0;
|
|
|
|
|
|
if (numa_group) {
|
|
|
src_faults = group_faults(p, src_nid);
|
|
@@ -5749,16 +5560,10 @@ static bool migrate_degrades_locality(struct task_struct *p, struct lb_env *env)
|
|
|
}
|
|
|
|
|
|
#else
|
|
|
-static inline bool migrate_improves_locality(struct task_struct *p,
|
|
|
+static inline int migrate_degrades_locality(struct task_struct *p,
|
|
|
struct lb_env *env)
|
|
|
{
|
|
|
- return false;
|
|
|
-}
|
|
|
-
|
|
|
-static inline bool migrate_degrades_locality(struct task_struct *p,
|
|
|
- struct lb_env *env)
|
|
|
-{
|
|
|
- return false;
|
|
|
+ return -1;
|
|
|
}
|
|
|
#endif
|
|
|
|
|
@@ -5768,7 +5573,7 @@ static inline bool migrate_degrades_locality(struct task_struct *p,
|
|
|
static
|
|
|
int can_migrate_task(struct task_struct *p, struct lb_env *env)
|
|
|
{
|
|
|
- int tsk_cache_hot = 0;
|
|
|
+ int tsk_cache_hot;
|
|
|
|
|
|
lockdep_assert_held(&env->src_rq->lock);
|
|
|
|
|
@@ -5826,13 +5631,13 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env)
|
|
|
* 2) task is cache cold, or
|
|
|
* 3) too many balance attempts have failed.
|
|
|
*/
|
|
|
- tsk_cache_hot = task_hot(p, env);
|
|
|
- if (!tsk_cache_hot)
|
|
|
- tsk_cache_hot = migrate_degrades_locality(p, env);
|
|
|
+ tsk_cache_hot = migrate_degrades_locality(p, env);
|
|
|
+ if (tsk_cache_hot == -1)
|
|
|
+ tsk_cache_hot = task_hot(p, env);
|
|
|
|
|
|
- if (migrate_improves_locality(p, env) || !tsk_cache_hot ||
|
|
|
+ if (tsk_cache_hot <= 0 ||
|
|
|
env->sd->nr_balance_failed > env->sd->cache_nice_tries) {
|
|
|
- if (tsk_cache_hot) {
|
|
|
+ if (tsk_cache_hot == 1) {
|
|
|
schedstat_inc(env->sd, lb_hot_gained[env->idle]);
|
|
|
schedstat_inc(p, se.statistics.nr_forced_migrations);
|
|
|
}
|
|
@@ -5906,6 +5711,13 @@ static int detach_tasks(struct lb_env *env)
|
|
|
return 0;
|
|
|
|
|
|
while (!list_empty(tasks)) {
|
|
|
+ /*
|
|
|
+ * We don't want to steal all, otherwise we may be treated likewise,
|
|
|
+ * which could at worst lead to a livelock crash.
|
|
|
+ */
|
|
|
+ if (env->idle != CPU_NOT_IDLE && env->src_rq->nr_running <= 1)
|
|
|
+ break;
|
|
|
+
|
|
|
p = list_first_entry(tasks, struct task_struct, se.group_node);
|
|
|
|
|
|
env->loop++;
|
|
@@ -6015,39 +5827,6 @@ static void attach_tasks(struct lb_env *env)
|
|
|
}
|
|
|
|
|
|
#ifdef CONFIG_FAIR_GROUP_SCHED
|
|
|
-/*
|
|
|
- * update tg->load_weight by folding this cpu's load_avg
|
|
|
- */
|
|
|
-static void __update_blocked_averages_cpu(struct task_group *tg, int cpu)
|
|
|
-{
|
|
|
- struct sched_entity *se = tg->se[cpu];
|
|
|
- struct cfs_rq *cfs_rq = tg->cfs_rq[cpu];
|
|
|
-
|
|
|
- /* throttled entities do not contribute to load */
|
|
|
- if (throttled_hierarchy(cfs_rq))
|
|
|
- return;
|
|
|
-
|
|
|
- update_cfs_rq_blocked_load(cfs_rq, 1);
|
|
|
-
|
|
|
- if (se) {
|
|
|
- update_entity_load_avg(se, 1);
|
|
|
- /*
|
|
|
- * We pivot on our runnable average having decayed to zero for
|
|
|
- * list removal. This generally implies that all our children
|
|
|
- * have also been removed (modulo rounding error or bandwidth
|
|
|
- * control); however, such cases are rare and we can fix these
|
|
|
- * at enqueue.
|
|
|
- *
|
|
|
- * TODO: fix up out-of-order children on enqueue.
|
|
|
- */
|
|
|
- if (!se->avg.runnable_avg_sum && !cfs_rq->nr_running)
|
|
|
- list_del_leaf_cfs_rq(cfs_rq);
|
|
|
- } else {
|
|
|
- struct rq *rq = rq_of(cfs_rq);
|
|
|
- update_rq_runnable_avg(rq, rq->nr_running);
|
|
|
- }
|
|
|
-}
|
|
|
-
|
|
|
static void update_blocked_averages(int cpu)
|
|
|
{
|
|
|
struct rq *rq = cpu_rq(cpu);
|
|
@@ -6056,19 +5835,19 @@ static void update_blocked_averages(int cpu)
|
|
|
|
|
|
raw_spin_lock_irqsave(&rq->lock, flags);
|
|
|
update_rq_clock(rq);
|
|
|
+
|
|
|
/*
|
|
|
* Iterates the task_group tree in a bottom up fashion, see
|
|
|
* list_add_leaf_cfs_rq() for details.
|
|
|
*/
|
|
|
for_each_leaf_cfs_rq(rq, cfs_rq) {
|
|
|
- /*
|
|
|
- * Note: We may want to consider periodically releasing
|
|
|
- * rq->lock about these updates so that creating many task
|
|
|
- * groups does not result in continually extending hold time.
|
|
|
- */
|
|
|
- __update_blocked_averages_cpu(cfs_rq->tg, rq->cpu);
|
|
|
- }
|
|
|
+ /* throttled entities do not contribute to load */
|
|
|
+ if (throttled_hierarchy(cfs_rq))
|
|
|
+ continue;
|
|
|
|
|
|
+ if (update_cfs_rq_load_avg(cfs_rq_clock_task(cfs_rq), cfs_rq))
|
|
|
+ update_tg_load_avg(cfs_rq, 0);
|
|
|
+ }
|
|
|
raw_spin_unlock_irqrestore(&rq->lock, flags);
|
|
|
}
|
|
|
|
|
@@ -6096,14 +5875,14 @@ static void update_cfs_rq_h_load(struct cfs_rq *cfs_rq)
|
|
|
}
|
|
|
|
|
|
if (!se) {
|
|
|
- cfs_rq->h_load = cfs_rq->runnable_load_avg;
|
|
|
+ cfs_rq->h_load = cfs_rq_load_avg(cfs_rq);
|
|
|
cfs_rq->last_h_load_update = now;
|
|
|
}
|
|
|
|
|
|
while ((se = cfs_rq->h_load_next) != NULL) {
|
|
|
load = cfs_rq->h_load;
|
|
|
- load = div64_ul(load * se->avg.load_avg_contrib,
|
|
|
- cfs_rq->runnable_load_avg + 1);
|
|
|
+ load = div64_ul(load * se->avg.load_avg,
|
|
|
+ cfs_rq_load_avg(cfs_rq) + 1);
|
|
|
cfs_rq = group_cfs_rq(se);
|
|
|
cfs_rq->h_load = load;
|
|
|
cfs_rq->last_h_load_update = now;
|
|
@@ -6115,17 +5894,25 @@ static unsigned long task_h_load(struct task_struct *p)
|
|
|
struct cfs_rq *cfs_rq = task_cfs_rq(p);
|
|
|
|
|
|
update_cfs_rq_h_load(cfs_rq);
|
|
|
- return div64_ul(p->se.avg.load_avg_contrib * cfs_rq->h_load,
|
|
|
- cfs_rq->runnable_load_avg + 1);
|
|
|
+ return div64_ul(p->se.avg.load_avg * cfs_rq->h_load,
|
|
|
+ cfs_rq_load_avg(cfs_rq) + 1);
|
|
|
}
|
|
|
#else
|
|
|
static inline void update_blocked_averages(int cpu)
|
|
|
{
|
|
|
+ struct rq *rq = cpu_rq(cpu);
|
|
|
+ struct cfs_rq *cfs_rq = &rq->cfs;
|
|
|
+ unsigned long flags;
|
|
|
+
|
|
|
+ raw_spin_lock_irqsave(&rq->lock, flags);
|
|
|
+ update_rq_clock(rq);
|
|
|
+ update_cfs_rq_load_avg(cfs_rq_clock_task(cfs_rq), cfs_rq);
|
|
|
+ raw_spin_unlock_irqrestore(&rq->lock, flags);
|
|
|
}
|
|
|
|
|
|
static unsigned long task_h_load(struct task_struct *p)
|
|
|
{
|
|
|
- return p->se.avg.load_avg_contrib;
|
|
|
+ return p->se.avg.load_avg;
|
|
|
}
|
|
|
#endif
|
|
|
|
|
@@ -8025,8 +7812,6 @@ static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued)
|
|
|
|
|
|
if (numabalancing_enabled)
|
|
|
task_tick_numa(rq, curr);
|
|
|
-
|
|
|
- update_rq_runnable_avg(rq, 1);
|
|
|
}
|
|
|
|
|
|
/*
|
|
@@ -8125,15 +7910,18 @@ static void switched_from_fair(struct rq *rq, struct task_struct *p)
|
|
|
}
|
|
|
|
|
|
#ifdef CONFIG_SMP
|
|
|
- /*
|
|
|
- * Remove our load from contribution when we leave sched_fair
|
|
|
- * and ensure we don't carry in an old decay_count if we
|
|
|
- * switch back.
|
|
|
- */
|
|
|
- if (se->avg.decay_count) {
|
|
|
- __synchronize_entity_decay(se);
|
|
|
- subtract_blocked_load_contrib(cfs_rq, se->avg.load_avg_contrib);
|
|
|
- }
|
|
|
+ /* Catch up with the cfs_rq and remove our load when we leave */
|
|
|
+ __update_load_avg(cfs_rq->avg.last_update_time, cpu_of(rq), &se->avg,
|
|
|
+ se->on_rq * scale_load_down(se->load.weight), cfs_rq->curr == se, NULL);
|
|
|
+
|
|
|
+ cfs_rq->avg.load_avg =
|
|
|
+ max_t(long, cfs_rq->avg.load_avg - se->avg.load_avg, 0);
|
|
|
+ cfs_rq->avg.load_sum =
|
|
|
+ max_t(s64, cfs_rq->avg.load_sum - se->avg.load_sum, 0);
|
|
|
+ cfs_rq->avg.util_avg =
|
|
|
+ max_t(long, cfs_rq->avg.util_avg - se->avg.util_avg, 0);
|
|
|
+ cfs_rq->avg.util_sum =
|
|
|
+ max_t(s32, cfs_rq->avg.util_sum - se->avg.util_sum, 0);
|
|
|
#endif
|
|
|
}
|
|
|
|
|
@@ -8142,16 +7930,31 @@ static void switched_from_fair(struct rq *rq, struct task_struct *p)
|
|
|
*/
|
|
|
static void switched_to_fair(struct rq *rq, struct task_struct *p)
|
|
|
{
|
|
|
-#ifdef CONFIG_FAIR_GROUP_SCHED
|
|
|
struct sched_entity *se = &p->se;
|
|
|
+
|
|
|
+#ifdef CONFIG_FAIR_GROUP_SCHED
|
|
|
/*
|
|
|
* Since the real-depth could have been changed (only FAIR
|
|
|
* class maintain depth value), reset depth properly.
|
|
|
*/
|
|
|
se->depth = se->parent ? se->parent->depth + 1 : 0;
|
|
|
#endif
|
|
|
- if (!task_on_rq_queued(p))
|
|
|
+
|
|
|
+ if (!task_on_rq_queued(p)) {
|
|
|
+
|
|
|
+ /*
|
|
|
+ * Ensure the task has a non-normalized vruntime when it is switched
|
|
|
+ * back to the fair class with !queued, so that enqueue_entity() at
|
|
|
+ * wake-up time will do the right thing.
|
|
|
+ *
|
|
|
+ * If it's queued, then the enqueue_entity(.flags=0) makes the task
|
|
|
+ * has non-normalized vruntime, if it's !queued, then it still has
|
|
|
+ * normalized vruntime.
|
|
|
+ */
|
|
|
+ if (p->state != TASK_RUNNING)
|
|
|
+ se->vruntime += cfs_rq_of(se)->min_vruntime;
|
|
|
return;
|
|
|
+ }
|
|
|
|
|
|
/*
|
|
|
* We were most likely switched from sched_rt, so
|
|
@@ -8190,8 +7993,8 @@ void init_cfs_rq(struct cfs_rq *cfs_rq)
|
|
|
cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime;
|
|
|
#endif
|
|
|
#ifdef CONFIG_SMP
|
|
|
- atomic64_set(&cfs_rq->decay_counter, 1);
|
|
|
- atomic_long_set(&cfs_rq->removed_load, 0);
|
|
|
+ atomic_long_set(&cfs_rq->removed_load_avg, 0);
|
|
|
+ atomic_long_set(&cfs_rq->removed_util_avg, 0);
|
|
|
#endif
|
|
|
}
|
|
|
|
|
@@ -8236,14 +8039,14 @@ static void task_move_group_fair(struct task_struct *p, int queued)
|
|
|
if (!queued) {
|
|
|
cfs_rq = cfs_rq_of(se);
|
|
|
se->vruntime += cfs_rq->min_vruntime;
|
|
|
+
|
|
|
#ifdef CONFIG_SMP
|
|
|
- /*
|
|
|
- * migrate_task_rq_fair() will have removed our previous
|
|
|
- * contribution, but we must synchronize for ongoing future
|
|
|
- * decay.
|
|
|
- */
|
|
|
- se->avg.decay_count = atomic64_read(&cfs_rq->decay_counter);
|
|
|
- cfs_rq->blocked_load_avg += se->avg.load_avg_contrib;
|
|
|
+ /* Virtually synchronize task with its new cfs_rq */
|
|
|
+ p->se.avg.last_update_time = cfs_rq->avg.last_update_time;
|
|
|
+ cfs_rq->avg.load_avg += p->se.avg.load_avg;
|
|
|
+ cfs_rq->avg.load_sum += p->se.avg.load_sum;
|
|
|
+ cfs_rq->avg.util_avg += p->se.avg.util_avg;
|
|
|
+ cfs_rq->avg.util_sum += p->se.avg.util_sum;
|
|
|
#endif
|
|
|
}
|
|
|
}
|
|
@@ -8257,8 +8060,11 @@ void free_fair_sched_group(struct task_group *tg)
|
|
|
for_each_possible_cpu(i) {
|
|
|
if (tg->cfs_rq)
|
|
|
kfree(tg->cfs_rq[i]);
|
|
|
- if (tg->se)
|
|
|
+ if (tg->se) {
|
|
|
+ if (tg->se[i])
|
|
|
+ remove_entity_load_avg(tg->se[i]);
|
|
|
kfree(tg->se[i]);
|
|
|
+ }
|
|
|
}
|
|
|
|
|
|
kfree(tg->cfs_rq);
|
|
@@ -8295,6 +8101,7 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
|
|
|
|
|
|
init_cfs_rq(cfs_rq);
|
|
|
init_tg_cfs_entry(tg, cfs_rq, se, i, parent->se[i]);
|
|
|
+ init_entity_runnable_average(se);
|
|
|
}
|
|
|
|
|
|
return 1;
|
|
@@ -8444,6 +8251,8 @@ const struct sched_class fair_sched_class = {
|
|
|
.rq_offline = rq_offline_fair,
|
|
|
|
|
|
.task_waking = task_waking_fair,
|
|
|
+ .task_dead = task_dead_fair,
|
|
|
+ .set_cpus_allowed = set_cpus_allowed_common,
|
|
|
#endif
|
|
|
|
|
|
.set_curr_task = set_curr_task_fair,
|