@@ -33,6 +33,7 @@
 #include <linux/mempolicy.h>
 #include <linux/migrate.h>
 #include <linux/task_work.h>
+#include <linux/sched/isolation.h>
 
 #include <trace/events/sched.h>
 
@@ -717,13 +718,8 @@ void init_entity_runnable_average(struct sched_entity *se)
 {
 	struct sched_avg *sa = &se->avg;
 
-	sa->last_update_time = 0;
-	/*
-	 * sched_avg's period_contrib should be strictly less then 1024, so
-	 * we give it 1023 to make sure it is almost a period (1024us), and
-	 * will definitely be update (after enqueue).
-	 */
-	sa->period_contrib = 1023;
+	memset(sa, 0, sizeof(*sa));
+
 	/*
 	 * Tasks are intialized with full load to be seen as heavy tasks until
 	 * they get a chance to stabilize to their real load level.
@@ -731,13 +727,10 @@ void init_entity_runnable_average(struct sched_entity *se)
 	 * nothing has been attached to the task group yet.
 	 */
 	if (entity_is_task(se))
-		sa->load_avg = scale_load_down(se->load.weight);
-	sa->load_sum = sa->load_avg * LOAD_AVG_MAX;
-	/*
-	 * At this point, util_avg won't be used in select_task_rq_fair anyway
-	 */
-	sa->util_avg = 0;
-	sa->util_sum = 0;
+		sa->runnable_load_avg = sa->load_avg = scale_load_down(se->load.weight);
+
+	se->runnable_weight = se->load.weight;
+
 	/* when this task enqueue'ed, it will contribute to its cfs_rq's load_avg */
 }
 
@@ -785,7 +778,6 @@ void post_init_entity_util_avg(struct sched_entity *se)
 		} else {
 			sa->util_avg = cap;
 		}
-		sa->util_sum = sa->util_avg * LOAD_AVG_MAX;
 	}
 
 	if (entity_is_task(se)) {
@@ -2026,7 +2018,7 @@ static u64 numa_get_avg_runtime(struct task_struct *p, u64 *period)
 		delta = runtime - p->last_sum_exec_runtime;
 		*period = now - p->last_task_numa_placement;
 	} else {
-		delta = p->se.avg.load_sum / p->se.load.weight;
+		delta = p->se.avg.load_sum;
 		*period = LOAD_AVG_MAX;
 	}
 
@@ -2693,18 +2685,226 @@ account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
 	cfs_rq->nr_running--;
 }
 
+/*
+ * Signed add and clamp on underflow.
+ *
+ * Explicitly do a load-store to ensure the intermediate value never hits
+ * memory. This allows lockless observations without ever seeing the negative
+ * values.
+ */
+#define add_positive(_ptr, _val) do {				\
+	typeof(_ptr) ptr = (_ptr);				\
+	typeof(_val) val = (_val);				\
+	typeof(*ptr) res, var = READ_ONCE(*ptr);		\
+								\
+	res = var + val;					\
+								\
+	if (val < 0 && res > var)				\
+		res = 0;					\
+								\
+	WRITE_ONCE(*ptr, res);					\
+} while (0)
+
+/*
+ * Unsigned subtract and clamp on underflow.
+ *
+ * Explicitly do a load-store to ensure the intermediate value never hits
+ * memory. This allows lockless observations without ever seeing the negative
+ * values.
+ */
+#define sub_positive(_ptr, _val) do {				\
+	typeof(_ptr) ptr = (_ptr);				\
+	typeof(*ptr) val = (_val);				\
+	typeof(*ptr) res, var = READ_ONCE(*ptr);		\
+	res = var - val;					\
+	if (res > var)						\
+		res = 0;					\
+	WRITE_ONCE(*ptr, res);					\
+} while (0)
+
+#ifdef CONFIG_SMP
|
|
|
+/*
|
|
|
+ * XXX we want to get rid of these helpers and use the full load resolution.
|
|
|
+ */
|
|
|
+static inline long se_weight(struct sched_entity *se)
|
|
|
+{
|
|
|
+ return scale_load_down(se->load.weight);
|
|
|
+}
|
|
|
+
|
|
|
+static inline long se_runnable(struct sched_entity *se)
|
|
|
+{
|
|
|
+ return scale_load_down(se->runnable_weight);
|
|
|
+}
|
|
|
+
|
|
|
+static inline void
|
|
|
+enqueue_runnable_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
|
|
|
+{
|
|
|
+ cfs_rq->runnable_weight += se->runnable_weight;
|
|
|
+
|
|
|
+ cfs_rq->avg.runnable_load_avg += se->avg.runnable_load_avg;
|
|
|
+ cfs_rq->avg.runnable_load_sum += se_runnable(se) * se->avg.runnable_load_sum;
|
|
|
+}
|
|
|
+
|
|
|
+static inline void
|
|
|
+dequeue_runnable_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
|
|
|
+{
|
|
|
+ cfs_rq->runnable_weight -= se->runnable_weight;
|
|
|
+
|
|
|
+ sub_positive(&cfs_rq->avg.runnable_load_avg, se->avg.runnable_load_avg);
|
|
|
+ sub_positive(&cfs_rq->avg.runnable_load_sum,
|
|
|
+ se_runnable(se) * se->avg.runnable_load_sum);
|
|
|
+}
|
|
|
+
|
|
|
+static inline void
|
|
|
+enqueue_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
|
|
|
+{
|
|
|
+ cfs_rq->avg.load_avg += se->avg.load_avg;
|
|
|
+ cfs_rq->avg.load_sum += se_weight(se) * se->avg.load_sum;
|
|
|
+}
|
|
|
+
|
|
|
+static inline void
|
|
|
+dequeue_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
|
|
|
+{
|
|
|
+ sub_positive(&cfs_rq->avg.load_avg, se->avg.load_avg);
|
|
|
+ sub_positive(&cfs_rq->avg.load_sum, se_weight(se) * se->avg.load_sum);
|
|
|
+}
|
|
|
+#else
|
|
|
+static inline void
|
|
|
+enqueue_runnable_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { }
|
|
|
+static inline void
|
|
|
+dequeue_runnable_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { }
|
|
|
+static inline void
|
|
|
+enqueue_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { }
|
|
|
+static inline void
|
|
|
+dequeue_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { }
|
|
|
+#endif
|
|
|
+
|
|
|
+static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se,
|
|
|
+ unsigned long weight, unsigned long runnable)
|
|
|
+{
|
|
|
+ if (se->on_rq) {
|
|
|
+ /* commit outstanding execution time */
|
|
|
+ if (cfs_rq->curr == se)
|
|
|
+ update_curr(cfs_rq);
|
|
|
+ account_entity_dequeue(cfs_rq, se);
|
|
|
+ dequeue_runnable_load_avg(cfs_rq, se);
|
|
|
+ }
|
|
|
+ dequeue_load_avg(cfs_rq, se);
|
|
|
+
|
|
|
+ se->runnable_weight = runnable;
|
|
|
+ update_load_set(&se->load, weight);
|
|
|
+
|
|
|
+#ifdef CONFIG_SMP
|
|
|
+ do {
|
|
|
+ u32 divider = LOAD_AVG_MAX - 1024 + se->avg.period_contrib;
|
|
|
+
|
|
|
+ se->avg.load_avg = div_u64(se_weight(se) * se->avg.load_sum, divider);
|
|
|
+ se->avg.runnable_load_avg =
|
|
|
+ div_u64(se_runnable(se) * se->avg.runnable_load_sum, divider);
|
|
|
+ } while (0);
|
|
|
+#endif
|
|
|
+
|
|
|
+ enqueue_load_avg(cfs_rq, se);
|
|
|
+ if (se->on_rq) {
|
|
|
+ account_entity_enqueue(cfs_rq, se);
|
|
|
+ enqueue_runnable_load_avg(cfs_rq, se);
|
|
|
+ }
|
|
|
+}
|
|
|
+
|
|
|
+void reweight_task(struct task_struct *p, int prio)
|
|
|
+{
|
|
|
+ struct sched_entity *se = &p->se;
|
|
|
+ struct cfs_rq *cfs_rq = cfs_rq_of(se);
|
|
|
+ struct load_weight *load = &se->load;
|
|
|
+ unsigned long weight = scale_load(sched_prio_to_weight[prio]);
|
|
|
+
|
|
|
+ reweight_entity(cfs_rq, se, weight, weight);
|
|
|
+ load->inv_weight = sched_prio_to_wmult[prio];
|
|
|
+}
|
|
|
+
|
|
|
 #ifdef CONFIG_FAIR_GROUP_SCHED
 # ifdef CONFIG_SMP
-static long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg)
+/*
+ * All this does is approximate the hierarchical proportion which includes that
+ * global sum we all love to hate.
+ *
+ * That is, the weight of a group entity, is the proportional share of the
+ * group weight based on the group runqueue weights. That is:
+ *
+ *                     tg->weight * grq->load.weight
+ *   ge->load.weight = -----------------------------               (1)
+ *                          \Sum grq->load.weight
+ *
+ * Now, because computing that sum is prohibitively expensive to compute (been
+ * there, done that) we approximate it with this average stuff. The average
+ * moves slower and therefore the approximation is cheaper and more stable.
+ *
+ * So instead of the above, we substitute:
+ *
+ *   grq->load.weight -> grq->avg.load_avg                         (2)
+ *
+ * which yields the following:
+ *
+ *                     tg->weight * grq->avg.load_avg
+ *   ge->load.weight = ------------------------------              (3)
+ *                              tg->load_avg
+ *
+ * Where: tg->load_avg ~= \Sum grq->avg.load_avg
+ *
+ * That is shares_avg, and it is right (given the approximation (2)).
+ *
+ * The problem with it is that because the average is slow -- it was designed
+ * to be exactly that of course -- this leads to transients in boundary
+ * conditions. In specific, the case where the group was idle and we start the
+ * one task. It takes time for our CPU's grq->avg.load_avg to build up,
+ * yielding bad latency etc..
+ *
+ * Now, in that special case (1) reduces to:
+ *
+ *                     tg->weight * grq->load.weight
+ *   ge->load.weight = ----------------------------- = tg->weight  (4)
+ *                         grp->load.weight
+ *
+ * That is, the sum collapses because all other CPUs are idle; the UP scenario.
+ *
+ * So what we do is modify our approximation (3) to approach (4) in the (near)
+ * UP case, like:
+ *
+ *   ge->load.weight =
+ *
+ *              tg->weight * grq->load.weight
+ *     ---------------------------------------------------         (5)
+ *     tg->load_avg - grq->avg.load_avg + grq->load.weight
+ *
+ * But because grq->load.weight can drop to 0, resulting in a divide by zero,
+ * we need to use grq->avg.load_avg as its lower bound, which then gives:
+ *
+ *
+ *                     tg->weight * grq->load.weight
+ *   ge->load.weight = -----------------------------               (6)
+ *                             tg_load_avg'
+ *
+ * Where:
+ *
+ *   tg_load_avg' = tg->load_avg - grq->avg.load_avg +
+ *                  max(grq->load.weight, grq->avg.load_avg)
+ *
+ * And that is shares_weight and is icky. In the (near) UP case it approaches
+ * (4) while in the normal case it approaches (3). It consistently
+ * overestimates the ge->load.weight and therefore:
+ *
+ *   \Sum ge->load.weight >= tg->weight
+ *
+ * hence icky!
+ */
+static long calc_group_shares(struct cfs_rq *cfs_rq)
 {
-	long tg_weight, load, shares;
+	long tg_weight, tg_shares, load, shares;
+	struct task_group *tg = cfs_rq->tg;
 
-	/*
-	 * This really should be: cfs_rq->avg.load_avg, but instead we use
-	 * cfs_rq->load.weight, which is its upper bound. This helps ramp up
-	 * the shares for small weight interactive tasks.
-	 */
-	load = scale_load_down(cfs_rq->load.weight);
+	tg_shares = READ_ONCE(tg->shares);
+
+	load = max(scale_load_down(cfs_rq->load.weight), cfs_rq->avg.load_avg);
 
 	tg_weight = atomic_long_read(&tg->load_avg);
 
@@ -2712,7 +2912,7 @@ static long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg)
 	tg_weight -= cfs_rq->tg_load_avg_contrib;
 	tg_weight += load;
 
-	shares = (tg->shares * load);
+	shares = (tg_shares * load);
 	if (tg_weight)
 		shares /= tg_weight;
 
@@ -2728,63 +2928,86 @@ static long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg)
 	 * case no task is runnable on a CPU MIN_SHARES=2 should be returned
 	 * instead of 0.
 	 */
-	if (shares < MIN_SHARES)
-		shares = MIN_SHARES;
-	if (shares > tg->shares)
-		shares = tg->shares;
-
-	return shares;
-}
-# else /* CONFIG_SMP */
-static inline long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg)
-{
-	return tg->shares;
+	return clamp_t(long, shares, MIN_SHARES, tg_shares);
 }
-# endif /* CONFIG_SMP */
 
-static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se,
-			    unsigned long weight)
+/*
+ * This calculates the effective runnable weight for a group entity based on
+ * the group entity weight calculated above.
+ *
+ * Because of the above approximation (2), our group entity weight is
+ * an load_avg based ratio (3). This means that it includes blocked load and
+ * does not represent the runnable weight.
+ *
+ * Approximate the group entity's runnable weight per ratio from the group
+ * runqueue:
+ *
+ *                                             grq->avg.runnable_load_avg
+ *   ge->runnable_weight = ge->load.weight * -------------------------- (7)
+ *                                               grq->avg.load_avg
+ *
+ * However, analogous to above, since the avg numbers are slow, this leads to
+ * transients in the from-idle case. Instead we use:
+ *
+ *   ge->runnable_weight = ge->load.weight *
+ *
+ *              max(grq->avg.runnable_load_avg, grq->runnable_weight)
+ *              ----------------------------------------------------- (8)
+ *                        max(grq->avg.load_avg, grq->load.weight)
+ *
+ * Where these max() serve both to use the 'instant' values to fix the slow
+ * from-idle and avoid the /0 on to-idle, similar to (6).
+ */
+static long calc_group_runnable(struct cfs_rq *cfs_rq, long shares)
{
|
|
|
- if (se->on_rq) {
|
|
|
- /* commit outstanding execution time */
|
|
|
- if (cfs_rq->curr == se)
|
|
|
- update_curr(cfs_rq);
|
|
|
- account_entity_dequeue(cfs_rq, se);
|
|
|
- }
|
|
|
+ long runnable, load_avg;
|
|
|
|
|
|
- update_load_set(&se->load, weight);
|
|
|
+ load_avg = max(cfs_rq->avg.load_avg,
|
|
|
+ scale_load_down(cfs_rq->load.weight));
|
|
|
|
|
|
- if (se->on_rq)
|
|
|
- account_entity_enqueue(cfs_rq, se);
|
|
|
+ runnable = max(cfs_rq->avg.runnable_load_avg,
|
|
|
+ scale_load_down(cfs_rq->runnable_weight));
|
|
|
+
|
|
|
+ runnable *= shares;
|
|
|
+ if (load_avg)
|
|
|
+ runnable /= load_avg;
|
|
|
+
|
|
|
+ return clamp_t(long, runnable, MIN_SHARES, shares);
|
|
|
}
|
|
|
+# endif /* CONFIG_SMP */
|
|
|
|
|
|
static inline int throttled_hierarchy(struct cfs_rq *cfs_rq);
|
|
|
|
|
|
-static void update_cfs_shares(struct sched_entity *se)
|
|
|
+/*
|
|
|
+ * Recomputes the group entity based on the current state of its group
|
|
|
+ * runqueue.
|
|
|
+ */
|
|
|
+static void update_cfs_group(struct sched_entity *se)
|
|
|
{
|
|
|
- struct cfs_rq *cfs_rq = group_cfs_rq(se);
|
|
|
- struct task_group *tg;
|
|
|
- long shares;
|
|
|
+ struct cfs_rq *gcfs_rq = group_cfs_rq(se);
|
|
|
+ long shares, runnable;
|
|
|
|
|
|
- if (!cfs_rq)
|
|
|
+ if (!gcfs_rq)
|
|
|
return;
|
|
|
|
|
|
- if (throttled_hierarchy(cfs_rq))
|
|
|
+ if (throttled_hierarchy(gcfs_rq))
|
|
|
return;
|
|
|
|
|
|
- tg = cfs_rq->tg;
|
|
|
-
|
|
|
#ifndef CONFIG_SMP
|
|
|
- if (likely(se->load.weight == tg->shares))
|
|
|
+ runnable = shares = READ_ONCE(gcfs_rq->tg->shares);
|
|
|
+
|
|
|
+ if (likely(se->load.weight == shares))
|
|
|
return;
|
|
|
+#else
|
|
|
+ shares = calc_group_shares(gcfs_rq);
|
|
|
+ runnable = calc_group_runnable(gcfs_rq, shares);
|
|
|
#endif
|
|
|
- shares = calc_cfs_shares(cfs_rq, tg);
|
|
|
|
|
|
- reweight_entity(cfs_rq_of(se), se, shares);
|
|
|
+ reweight_entity(cfs_rq_of(se), se, shares, runnable);
|
|
|
}
|
|
|
|
|
|
#else /* CONFIG_FAIR_GROUP_SCHED */
|
|
|
-static inline void update_cfs_shares(struct sched_entity *se)
|
|
|
+static inline void update_cfs_group(struct sched_entity *se)
|
|
|
{
|
|
|
}
|
|
|
#endif /* CONFIG_FAIR_GROUP_SCHED */
|
|
@@ -2893,7 +3116,7 @@ static u32 __accumulate_pelt_segments(u64 periods, u32 d1, u32 d3)
|
|
|
*/
|
|
|
static __always_inline u32
|
|
|
accumulate_sum(u64 delta, int cpu, struct sched_avg *sa,
|
|
|
- unsigned long weight, int running, struct cfs_rq *cfs_rq)
|
|
|
+ unsigned long load, unsigned long runnable, int running)
|
|
|
{
|
|
|
unsigned long scale_freq, scale_cpu;
|
|
|
u32 contrib = (u32)delta; /* p == 0 -> delta < 1024 */
|
|
@@ -2910,10 +3133,8 @@ accumulate_sum(u64 delta, int cpu, struct sched_avg *sa,
|
|
|
*/
|
|
|
if (periods) {
|
|
|
sa->load_sum = decay_load(sa->load_sum, periods);
|
|
|
- if (cfs_rq) {
|
|
|
- cfs_rq->runnable_load_sum =
|
|
|
- decay_load(cfs_rq->runnable_load_sum, periods);
|
|
|
- }
|
|
|
+ sa->runnable_load_sum =
|
|
|
+ decay_load(sa->runnable_load_sum, periods);
|
|
|
sa->util_sum = decay_load((u64)(sa->util_sum), periods);
|
|
|
|
|
|
/*
|
|
@@ -2926,11 +3147,10 @@ accumulate_sum(u64 delta, int cpu, struct sched_avg *sa,
|
|
|
sa->period_contrib = delta;
|
|
|
|
|
|
contrib = cap_scale(contrib, scale_freq);
|
|
|
- if (weight) {
|
|
|
- sa->load_sum += weight * contrib;
|
|
|
- if (cfs_rq)
|
|
|
- cfs_rq->runnable_load_sum += weight * contrib;
|
|
|
- }
|
|
|
+ if (load)
|
|
|
+ sa->load_sum += load * contrib;
|
|
|
+ if (runnable)
|
|
|
+ sa->runnable_load_sum += runnable * contrib;
|
|
|
if (running)
|
|
|
sa->util_sum += contrib * scale_cpu;
|
|
|
|
|
@@ -2966,8 +3186,8 @@ accumulate_sum(u64 delta, int cpu, struct sched_avg *sa,
|
|
|
* = u_0 + u_1*y + u_2*y^2 + ... [re-labeling u_i --> u_{i+1}]
|
|
|
*/
|
|
|
static __always_inline int
|
|
|
-___update_load_avg(u64 now, int cpu, struct sched_avg *sa,
|
|
|
- unsigned long weight, int running, struct cfs_rq *cfs_rq)
|
|
|
+___update_load_sum(u64 now, int cpu, struct sched_avg *sa,
|
|
|
+ unsigned long load, unsigned long runnable, int running)
|
|
|
{
|
|
|
u64 delta;
|
|
|
|
|
@@ -3000,8 +3220,8 @@ ___update_load_avg(u64 now, int cpu, struct sched_avg *sa,
|
|
|
* this happens during idle_balance() which calls
|
|
|
* update_blocked_averages()
|
|
|
*/
|
|
|
- if (!weight)
|
|
|
- running = 0;
|
|
|
+ if (!load)
|
|
|
+ runnable = running = 0;
|
|
|
|
|
|
/*
|
|
|
* Now we know we crossed measurement unit boundaries. The *_avg
|
|
@@ -3010,63 +3230,96 @@ ___update_load_avg(u64 now, int cpu, struct sched_avg *sa,
|
|
|
* Step 1: accumulate *_sum since last_update_time. If we haven't
|
|
|
* crossed period boundaries, finish.
|
|
|
*/
|
|
|
- if (!accumulate_sum(delta, cpu, sa, weight, running, cfs_rq))
|
|
|
+ if (!accumulate_sum(delta, cpu, sa, load, runnable, running))
|
|
|
return 0;
|
|
|
|
|
|
+ return 1;
|
|
|
+}
|
|
|
+
|
|
|
+static __always_inline void
|
|
|
+___update_load_avg(struct sched_avg *sa, unsigned long load, unsigned long runnable)
|
|
|
+{
|
|
|
+ u32 divider = LOAD_AVG_MAX - 1024 + sa->period_contrib;
|
|
|
+
|
|
|
/*
|
|
|
* Step 2: update *_avg.
|
|
|
*/
|
|
|
- sa->load_avg = div_u64(sa->load_sum, LOAD_AVG_MAX - 1024 + sa->period_contrib);
|
|
|
- if (cfs_rq) {
|
|
|
- cfs_rq->runnable_load_avg =
|
|
|
- div_u64(cfs_rq->runnable_load_sum, LOAD_AVG_MAX - 1024 + sa->period_contrib);
|
|
|
- }
|
|
|
- sa->util_avg = sa->util_sum / (LOAD_AVG_MAX - 1024 + sa->period_contrib);
|
|
|
-
|
|
|
- return 1;
|
|
|
+ sa->load_avg = div_u64(load * sa->load_sum, divider);
|
|
|
+ sa->runnable_load_avg = div_u64(runnable * sa->runnable_load_sum, divider);
|
|
|
+ sa->util_avg = sa->util_sum / divider;
|
|
|
}
|
|
|
|
|
|
+/*
+ * sched_entity:
+ *
+ *   task:
+ *     se_runnable() == se_weight()
+ *
+ *   group: [ see update_cfs_group() ]
+ *     se_weight()   = tg->weight * grq->load_avg / tg->load_avg
+ *     se_runnable() = se_weight(se) * grq->runnable_load_avg / grq->load_avg
+ *
+ *   load_sum := runnable_sum
+ *   load_avg = se_weight(se) * runnable_avg
+ *
+ *   runnable_load_sum := runnable_sum
+ *   runnable_load_avg = se_runnable(se) * runnable_avg
+ *
+ *   XXX collapse load_sum and runnable_load_sum
+ *
+ * cfs_rq:
+ *
+ *   load_sum = \Sum se_weight(se) * se->avg.load_sum
+ *   load_avg = \Sum se->avg.load_avg
+ *
+ *   runnable_load_sum = \Sum se_runnable(se) * se->avg.runnable_load_sum
+ *   runnable_load_avg = \Sum se->avg.runnable_load_avg
+ */
+
static int
|
|
|
__update_load_avg_blocked_se(u64 now, int cpu, struct sched_entity *se)
|
|
|
{
|
|
|
- return ___update_load_avg(now, cpu, &se->avg, 0, 0, NULL);
|
|
|
+ if (entity_is_task(se))
|
|
|
+ se->runnable_weight = se->load.weight;
|
|
|
+
|
|
|
+ if (___update_load_sum(now, cpu, &se->avg, 0, 0, 0)) {
|
|
|
+ ___update_load_avg(&se->avg, se_weight(se), se_runnable(se));
|
|
|
+ return 1;
|
|
|
+ }
|
|
|
+
|
|
|
+ return 0;
|
|
|
}
|
|
|
|
|
|
static int
|
|
|
__update_load_avg_se(u64 now, int cpu, struct cfs_rq *cfs_rq, struct sched_entity *se)
|
|
|
{
|
|
|
- return ___update_load_avg(now, cpu, &se->avg,
|
|
|
- se->on_rq * scale_load_down(se->load.weight),
|
|
|
- cfs_rq->curr == se, NULL);
|
|
|
+ if (entity_is_task(se))
|
|
|
+ se->runnable_weight = se->load.weight;
|
|
|
+
|
|
|
+ if (___update_load_sum(now, cpu, &se->avg, !!se->on_rq, !!se->on_rq,
|
|
|
+ cfs_rq->curr == se)) {
|
|
|
+
|
|
|
+ ___update_load_avg(&se->avg, se_weight(se), se_runnable(se));
|
|
|
+ return 1;
|
|
|
+ }
|
|
|
+
|
|
|
+ return 0;
|
|
|
}
|
|
|
|
|
|
static int
|
|
|
__update_load_avg_cfs_rq(u64 now, int cpu, struct cfs_rq *cfs_rq)
|
|
|
{
|
|
|
- return ___update_load_avg(now, cpu, &cfs_rq->avg,
|
|
|
- scale_load_down(cfs_rq->load.weight),
|
|
|
- cfs_rq->curr != NULL, cfs_rq);
|
|
|
-}
|
|
|
+ if (___update_load_sum(now, cpu, &cfs_rq->avg,
|
|
|
+ scale_load_down(cfs_rq->load.weight),
|
|
|
+ scale_load_down(cfs_rq->runnable_weight),
|
|
|
+ cfs_rq->curr != NULL)) {
|
|
|
|
|
|
-/*
|
|
|
- * Signed add and clamp on underflow.
|
|
|
- *
|
|
|
- * Explicitly do a load-store to ensure the intermediate value never hits
|
|
|
- * memory. This allows lockless observations without ever seeing the negative
|
|
|
- * values.
|
|
|
- */
|
|
|
-#define add_positive(_ptr, _val) do { \
|
|
|
- typeof(_ptr) ptr = (_ptr); \
|
|
|
- typeof(_val) val = (_val); \
|
|
|
- typeof(*ptr) res, var = READ_ONCE(*ptr); \
|
|
|
- \
|
|
|
- res = var + val; \
|
|
|
- \
|
|
|
- if (val < 0 && res > var) \
|
|
|
- res = 0; \
|
|
|
- \
|
|
|
- WRITE_ONCE(*ptr, res); \
|
|
|
-} while (0)
|
|
|
+ ___update_load_avg(&cfs_rq->avg, 1, 1);
|
|
|
+ return 1;
|
|
|
+ }
|
|
|
+
|
|
|
+ return 0;
|
|
|
+}
|
|
|
|
|
|
#ifdef CONFIG_FAIR_GROUP_SCHED
|
|
|
/**
|
|
@@ -3149,11 +3402,77 @@ void set_task_rq_fair(struct sched_entity *se,
|
|
|
se->avg.last_update_time = n_last_update_time;
|
|
|
}
|
|
|
|
|
|
-/* Take into account change of utilization of a child task group */
|
|
|
+
|
|
|
+/*
+ * When on migration a sched_entity joins/leaves the PELT hierarchy, we need to
+ * propagate its contribution. The key to this propagation is the invariant
+ * that for each group:
+ *
+ *   ge->avg == grq->avg                                           (1)
+ *
+ * _IFF_ we look at the pure running and runnable sums. Because they
+ * represent the very same entity, just at different points in the hierarchy.
+ *
+ *
+ * Per the above update_tg_cfs_util() is trivial (and still 'wrong') and
+ * simply copies the running sum over.
+ *
+ * However, update_tg_cfs_runnable() is more complex. So we have:
+ *
+ *   ge->avg.load_avg = ge->load.weight * ge->avg.runnable_avg     (2)
+ *
+ * And since, like util, the runnable part should be directly transferable,
+ * the following would _appear_ to be the straight forward approach:
+ *
+ *   grq->avg.load_avg = grq->load.weight * grq->avg.running_avg   (3)
+ *
+ * And per (1) we have:
+ *
+ *   ge->avg.running_avg == grq->avg.running_avg
+ *
+ * Which gives:
+ *
+ *                      ge->load.weight * grq->avg.load_avg
+ *   ge->avg.load_avg = -----------------------------------        (4)
+ *                               grq->load.weight
+ *
+ * Except that is wrong!
+ *
+ * Because while for entities historical weight is not important and we
+ * really only care about our future and therefore can consider a pure
+ * runnable sum, runqueues can NOT do this.
+ *
+ * We specifically want runqueues to have a load_avg that includes
+ * historical weights. Those represent the blocked load, the load we expect
+ * to (shortly) return to us. This only works by keeping the weights as
+ * integral part of the sum. We therefore cannot decompose as per (3).
+ *
+ * OK, so what then?
+ *
+ *
+ * Another way to look at things is:
+ *
+ *   grq->avg.load_avg = \Sum se->avg.load_avg
+ *
+ * Therefore, per (2):
+ *
+ *   grq->avg.load_avg = \Sum se->load.weight * se->avg.runnable_avg
+ *
+ * And the very thing we're propagating is a change in that sum (someone
+ * joined/left). So we can easily know the runnable change, which would be, per
+ * (2) the already tracked se->load_avg divided by the corresponding
+ * se->weight.
+ *
+ * Basically (4) but in differential form:
+ *
+ *   d(runnable_avg) += se->avg.load_avg / se->load.weight
+ *                                                                 (5)
+ *   ge->avg.load_avg += ge->load.weight * d(runnable_avg)
+ */
+
static inline void
|
|
|
-update_tg_cfs_util(struct cfs_rq *cfs_rq, struct sched_entity *se)
|
|
|
+update_tg_cfs_util(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cfs_rq *gcfs_rq)
|
|
|
{
|
|
|
- struct cfs_rq *gcfs_rq = group_cfs_rq(se);
|
|
|
long delta = gcfs_rq->avg.util_avg - se->avg.util_avg;
|
|
|
|
|
|
/* Nothing to update */
|
|
@@ -3169,102 +3488,65 @@ update_tg_cfs_util(struct cfs_rq *cfs_rq, struct sched_entity *se)
|
|
|
cfs_rq->avg.util_sum = cfs_rq->avg.util_avg * LOAD_AVG_MAX;
|
|
|
}
|
|
|
|
|
|
-/* Take into account change of load of a child task group */
|
|
|
static inline void
|
|
|
-update_tg_cfs_load(struct cfs_rq *cfs_rq, struct sched_entity *se)
|
|
|
+update_tg_cfs_runnable(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cfs_rq *gcfs_rq)
|
|
|
{
|
|
|
- struct cfs_rq *gcfs_rq = group_cfs_rq(se);
|
|
|
- long delta, load = gcfs_rq->avg.load_avg;
|
|
|
+ long runnable_sum = gcfs_rq->prop_runnable_sum;
|
|
|
+ long runnable_load_avg, load_avg;
|
|
|
+ s64 runnable_load_sum, load_sum;
|
|
|
|
|
|
- /*
|
|
|
- * If the load of group cfs_rq is null, the load of the
|
|
|
- * sched_entity will also be null so we can skip the formula
|
|
|
- */
|
|
|
- if (load) {
|
|
|
- long tg_load;
|
|
|
+ if (!runnable_sum)
|
|
|
+ return;
|
|
|
|
|
|
- /* Get tg's load and ensure tg_load > 0 */
|
|
|
- tg_load = atomic_long_read(&gcfs_rq->tg->load_avg) + 1;
|
|
|
+ gcfs_rq->prop_runnable_sum = 0;
|
|
|
|
|
|
- /* Ensure tg_load >= load and updated with current load*/
|
|
|
- tg_load -= gcfs_rq->tg_load_avg_contrib;
|
|
|
- tg_load += load;
|
|
|
+ load_sum = (s64)se_weight(se) * runnable_sum;
|
|
|
+ load_avg = div_s64(load_sum, LOAD_AVG_MAX);
|
|
|
|
|
|
- /*
|
|
|
- * We need to compute a correction term in the case that the
|
|
|
- * task group is consuming more CPU than a task of equal
|
|
|
- * weight. A task with a weight equals to tg->shares will have
|
|
|
- * a load less or equal to scale_load_down(tg->shares).
|
|
|
- * Similarly, the sched_entities that represent the task group
|
|
|
- * at parent level, can't have a load higher than
|
|
|
- * scale_load_down(tg->shares). And the Sum of sched_entities'
|
|
|
- * load must be <= scale_load_down(tg->shares).
|
|
|
- */
|
|
|
- if (tg_load > scale_load_down(gcfs_rq->tg->shares)) {
|
|
|
- /* scale gcfs_rq's load into tg's shares*/
|
|
|
- load *= scale_load_down(gcfs_rq->tg->shares);
|
|
|
- load /= tg_load;
|
|
|
- }
|
|
|
- }
|
|
|
+ add_positive(&se->avg.load_sum, runnable_sum);
|
|
|
+ add_positive(&se->avg.load_avg, load_avg);
|
|
|
|
|
|
- delta = load - se->avg.load_avg;
|
|
|
+ add_positive(&cfs_rq->avg.load_avg, load_avg);
|
|
|
+ add_positive(&cfs_rq->avg.load_sum, load_sum);
|
|
|
|
|
|
- /* Nothing to update */
|
|
|
- if (!delta)
|
|
|
- return;
|
|
|
-
|
|
|
- /* Set new sched_entity's load */
|
|
|
- se->avg.load_avg = load;
|
|
|
- se->avg.load_sum = se->avg.load_avg * LOAD_AVG_MAX;
|
|
|
+ runnable_load_sum = (s64)se_runnable(se) * runnable_sum;
|
|
|
+ runnable_load_avg = div_s64(runnable_load_sum, LOAD_AVG_MAX);
|
|
|
|
|
|
- /* Update parent cfs_rq load */
|
|
|
- add_positive(&cfs_rq->avg.load_avg, delta);
|
|
|
- cfs_rq->avg.load_sum = cfs_rq->avg.load_avg * LOAD_AVG_MAX;
|
|
|
+ add_positive(&se->avg.runnable_load_sum, runnable_sum);
|
|
|
+ add_positive(&se->avg.runnable_load_avg, runnable_load_avg);
|
|
|
|
|
|
- /*
|
|
|
- * If the sched_entity is already enqueued, we also have to update the
|
|
|
- * runnable load avg.
|
|
|
- */
|
|
|
if (se->on_rq) {
|
|
|
- /* Update parent cfs_rq runnable_load_avg */
|
|
|
- add_positive(&cfs_rq->runnable_load_avg, delta);
|
|
|
- cfs_rq->runnable_load_sum = cfs_rq->runnable_load_avg * LOAD_AVG_MAX;
|
|
|
+ add_positive(&cfs_rq->avg.runnable_load_avg, runnable_load_avg);
|
|
|
+ add_positive(&cfs_rq->avg.runnable_load_sum, runnable_load_sum);
|
|
|
}
|
|
|
}
|
|
|
|
|
|
-static inline void set_tg_cfs_propagate(struct cfs_rq *cfs_rq)
|
|
|
-{
|
|
|
- cfs_rq->propagate_avg = 1;
|
|
|
-}
|
|
|
-
|
|
|
-static inline int test_and_clear_tg_cfs_propagate(struct sched_entity *se)
|
|
|
+static inline void add_tg_cfs_propagate(struct cfs_rq *cfs_rq, long runnable_sum)
|
|
|
{
|
|
|
- struct cfs_rq *cfs_rq = group_cfs_rq(se);
|
|
|
-
|
|
|
- if (!cfs_rq->propagate_avg)
|
|
|
- return 0;
|
|
|
-
|
|
|
- cfs_rq->propagate_avg = 0;
|
|
|
- return 1;
|
|
|
+ cfs_rq->propagate = 1;
|
|
|
+ cfs_rq->prop_runnable_sum += runnable_sum;
|
|
|
}
|
|
|
|
|
|
/* Update task and its cfs_rq load average */
|
|
|
static inline int propagate_entity_load_avg(struct sched_entity *se)
|
|
|
{
|
|
|
- struct cfs_rq *cfs_rq;
|
|
|
+ struct cfs_rq *cfs_rq, *gcfs_rq;
|
|
|
|
|
|
if (entity_is_task(se))
|
|
|
return 0;
|
|
|
|
|
|
- if (!test_and_clear_tg_cfs_propagate(se))
|
|
|
+ gcfs_rq = group_cfs_rq(se);
|
|
|
+ if (!gcfs_rq->propagate)
|
|
|
return 0;
|
|
|
|
|
|
+ gcfs_rq->propagate = 0;
|
|
|
+
|
|
|
cfs_rq = cfs_rq_of(se);
|
|
|
|
|
|
- set_tg_cfs_propagate(cfs_rq);
|
|
|
+ add_tg_cfs_propagate(cfs_rq, gcfs_rq->prop_runnable_sum);
|
|
|
|
|
|
- update_tg_cfs_util(cfs_rq, se);
|
|
|
- update_tg_cfs_load(cfs_rq, se);
|
|
|
+ update_tg_cfs_util(cfs_rq, se, gcfs_rq);
|
|
|
+ update_tg_cfs_runnable(cfs_rq, se, gcfs_rq);
|
|
|
|
|
|
return 1;
|
|
|
}
|
|
@@ -3288,7 +3570,7 @@ static inline bool skip_blocked_update(struct sched_entity *se)
|
|
|
* If there is a pending propagation, we have to update the load and
|
|
|
* the utilization of the sched_entity:
|
|
|
*/
|
|
|
- if (gcfs_rq->propagate_avg)
|
|
|
+ if (gcfs_rq->propagate)
|
|
|
return false;
|
|
|
|
|
|
/*
|
|
@@ -3308,27 +3590,10 @@ static inline int propagate_entity_load_avg(struct sched_entity *se)
|
|
|
return 0;
|
|
|
}
|
|
|
|
|
|
-static inline void set_tg_cfs_propagate(struct cfs_rq *cfs_rq) {}
|
|
|
+static inline void add_tg_cfs_propagate(struct cfs_rq *cfs_rq, long runnable_sum) {}
|
|
|
|
|
|
#endif /* CONFIG_FAIR_GROUP_SCHED */
|
|
|
|
|
|
-/*
|
|
|
- * Unsigned subtract and clamp on underflow.
|
|
|
- *
|
|
|
- * Explicitly do a load-store to ensure the intermediate value never hits
|
|
|
- * memory. This allows lockless observations without ever seeing the negative
|
|
|
- * values.
|
|
|
- */
|
|
|
-#define sub_positive(_ptr, _val) do { \
|
|
|
- typeof(_ptr) ptr = (_ptr); \
|
|
|
- typeof(*ptr) val = (_val); \
|
|
|
- typeof(*ptr) res, var = READ_ONCE(*ptr); \
|
|
|
- res = var - val; \
|
|
|
- if (res > var) \
|
|
|
- res = 0; \
|
|
|
- WRITE_ONCE(*ptr, res); \
|
|
|
-} while (0)
|
|
|
-
|
|
|
/**
|
|
|
* update_cfs_rq_load_avg - update the cfs_rq's load/util averages
|
|
|
* @now: current time, as per cfs_rq_clock_task()
|
|
@@ -3348,65 +3613,45 @@ static inline void set_tg_cfs_propagate(struct cfs_rq *cfs_rq) {}
|
|
|
static inline int
|
|
|
update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq)
|
|
|
{
|
|
|
+ unsigned long removed_load = 0, removed_util = 0, removed_runnable_sum = 0;
|
|
|
struct sched_avg *sa = &cfs_rq->avg;
|
|
|
- int decayed, removed_load = 0, removed_util = 0;
|
|
|
+ int decayed = 0;
|
|
|
|
|
|
- if (atomic_long_read(&cfs_rq->removed_load_avg)) {
|
|
|
- s64 r = atomic_long_xchg(&cfs_rq->removed_load_avg, 0);
|
|
|
+ if (cfs_rq->removed.nr) {
|
|
|
+ unsigned long r;
|
|
|
+ u32 divider = LOAD_AVG_MAX - 1024 + sa->period_contrib;
|
|
|
+
|
|
|
+ raw_spin_lock(&cfs_rq->removed.lock);
|
|
|
+ swap(cfs_rq->removed.util_avg, removed_util);
|
|
|
+ swap(cfs_rq->removed.load_avg, removed_load);
|
|
|
+ swap(cfs_rq->removed.runnable_sum, removed_runnable_sum);
|
|
|
+ cfs_rq->removed.nr = 0;
|
|
|
+ raw_spin_unlock(&cfs_rq->removed.lock);
|
|
|
+
|
|
|
+ r = removed_load;
|
|
|
sub_positive(&sa->load_avg, r);
|
|
|
- sub_positive(&sa->load_sum, r * LOAD_AVG_MAX);
|
|
|
- removed_load = 1;
|
|
|
- set_tg_cfs_propagate(cfs_rq);
|
|
|
- }
|
|
|
+ sub_positive(&sa->load_sum, r * divider);
|
|
|
|
|
|
- if (atomic_long_read(&cfs_rq->removed_util_avg)) {
|
|
|
- long r = atomic_long_xchg(&cfs_rq->removed_util_avg, 0);
|
|
|
+ r = removed_util;
|
|
|
sub_positive(&sa->util_avg, r);
|
|
|
- sub_positive(&sa->util_sum, r * LOAD_AVG_MAX);
|
|
|
- removed_util = 1;
|
|
|
- set_tg_cfs_propagate(cfs_rq);
|
|
|
+ sub_positive(&sa->util_sum, r * divider);
|
|
|
+
|
|
|
+ add_tg_cfs_propagate(cfs_rq, -(long)removed_runnable_sum);
|
|
|
+
|
|
|
+ decayed = 1;
|
|
|
}
|
|
|
|
|
|
- decayed = __update_load_avg_cfs_rq(now, cpu_of(rq_of(cfs_rq)), cfs_rq);
|
|
|
+ decayed |= __update_load_avg_cfs_rq(now, cpu_of(rq_of(cfs_rq)), cfs_rq);
|
|
|
|
|
|
#ifndef CONFIG_64BIT
|
|
|
smp_wmb();
|
|
|
cfs_rq->load_last_update_time_copy = sa->last_update_time;
|
|
|
#endif
|
|
|
|
|
|
- if (decayed || removed_util)
|
|
|
+ if (decayed)
|
|
|
cfs_rq_util_change(cfs_rq);
|
|
|
|
|
|
- return decayed || removed_load;
|
|
|
-}
|
|
|
-
|
|
|
-/*
|
|
|
- * Optional action to be done while updating the load average
|
|
|
- */
|
|
|
-#define UPDATE_TG 0x1
|
|
|
-#define SKIP_AGE_LOAD 0x2
|
|
|
-
|
|
|
-/* Update task and its cfs_rq load average */
|
|
|
-static inline void update_load_avg(struct sched_entity *se, int flags)
|
|
|
-{
|
|
|
- struct cfs_rq *cfs_rq = cfs_rq_of(se);
|
|
|
- u64 now = cfs_rq_clock_task(cfs_rq);
|
|
|
- struct rq *rq = rq_of(cfs_rq);
|
|
|
- int cpu = cpu_of(rq);
|
|
|
- int decayed;
|
|
|
-
|
|
|
- /*
|
|
|
- * Track task load average for carrying it to new CPU after migrated, and
|
|
|
- * track group sched_entity load average for task_h_load calc in migration
|
|
|
- */
|
|
|
- if (se->avg.last_update_time && !(flags & SKIP_AGE_LOAD))
|
|
|
- __update_load_avg_se(now, cpu, cfs_rq, se);
|
|
|
-
|
|
|
- decayed = update_cfs_rq_load_avg(now, cfs_rq);
|
|
|
- decayed |= propagate_entity_load_avg(se);
|
|
|
-
|
|
|
- if (decayed && (flags & UPDATE_TG))
|
|
|
- update_tg_load_avg(cfs_rq, 0);
|
|
|
+ return decayed;
|
|
|
}
|
|
|
|
|
|
/**
|
|
@@ -3419,12 +3664,39 @@ static inline void update_load_avg(struct sched_entity *se, int flags)
|
|
|
*/
|
|
|
static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
|
|
|
{
|
|
|
+ u32 divider = LOAD_AVG_MAX - 1024 + cfs_rq->avg.period_contrib;
|
|
|
+
|
|
|
+ /*
|
|
|
+ * When we attach the @se to the @cfs_rq, we must align the decay
|
|
|
+ * window because without that, really weird and wonderful things can
|
|
|
+ * happen.
|
|
|
+ *
|
|
|
+ * XXX illustrate
|
|
|
+ */
|
|
|
se->avg.last_update_time = cfs_rq->avg.last_update_time;
|
|
|
- cfs_rq->avg.load_avg += se->avg.load_avg;
|
|
|
- cfs_rq->avg.load_sum += se->avg.load_sum;
|
|
|
+ se->avg.period_contrib = cfs_rq->avg.period_contrib;
|
|
|
+
|
|
|
+ /*
|
|
|
+ * Hell(o) Nasty stuff.. we need to recompute _sum based on the new
|
|
|
+ * period_contrib. This isn't strictly correct, but since we're
|
|
|
+ * entirely outside of the PELT hierarchy, nobody cares if we truncate
|
|
|
+ * _sum a little.
|
|
|
+ */
|
|
|
+ se->avg.util_sum = se->avg.util_avg * divider;
|
|
|
+
|
|
|
+ se->avg.load_sum = divider;
|
|
|
+ if (se_weight(se)) {
|
|
|
+ se->avg.load_sum =
|
|
|
+ div_u64(se->avg.load_avg * se->avg.load_sum, se_weight(se));
|
|
|
+ }
|
|
|
+
|
|
|
+ se->avg.runnable_load_sum = se->avg.load_sum;
|
|
|
+
|
|
|
+ enqueue_load_avg(cfs_rq, se);
|
|
|
cfs_rq->avg.util_avg += se->avg.util_avg;
|
|
|
cfs_rq->avg.util_sum += se->avg.util_sum;
|
|
|
- set_tg_cfs_propagate(cfs_rq);
|
|
|
+
|
|
|
+ add_tg_cfs_propagate(cfs_rq, se->avg.load_sum);
|
|
|
|
|
|
cfs_rq_util_change(cfs_rq);
|
|
|
}
|
|
@@ -3439,39 +3711,47 @@ static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s
|
|
|
*/
|
|
|
static void detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
|
|
|
{
|
|
|
-
|
|
|
- sub_positive(&cfs_rq->avg.load_avg, se->avg.load_avg);
|
|
|
- sub_positive(&cfs_rq->avg.load_sum, se->avg.load_sum);
|
|
|
+ dequeue_load_avg(cfs_rq, se);
|
|
|
sub_positive(&cfs_rq->avg.util_avg, se->avg.util_avg);
|
|
|
sub_positive(&cfs_rq->avg.util_sum, se->avg.util_sum);
|
|
|
- set_tg_cfs_propagate(cfs_rq);
|
|
|
+
|
|
|
+ add_tg_cfs_propagate(cfs_rq, -se->avg.load_sum);
|
|
|
|
|
|
cfs_rq_util_change(cfs_rq);
|
|
|
}
|
|
|
|
|
|
-/* Add the load generated by se into cfs_rq's load average */
|
|
|
-static inline void
|
|
|
-enqueue_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
|
|
|
+/*
|
|
|
+ * Optional action to be done while updating the load average
|
|
|
+ */
|
|
|
+#define UPDATE_TG 0x1
|
|
|
+#define SKIP_AGE_LOAD 0x2
|
|
|
+#define DO_ATTACH 0x4
|
|
|
+
|
|
|
+/* Update task and its cfs_rq load average */
|
|
|
+static inline void update_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
|
|
|
{
|
|
|
- struct sched_avg *sa = &se->avg;
|
|
|
+ u64 now = cfs_rq_clock_task(cfs_rq);
|
|
|
+ struct rq *rq = rq_of(cfs_rq);
|
|
|
+ int cpu = cpu_of(rq);
|
|
|
+ int decayed;
|
|
|
+
|
|
|
+ /*
|
|
|
+ * Track task load average for carrying it to new CPU after migrated, and
|
|
|
+ * track group sched_entity load average for task_h_load calc in migration
|
|
|
+ */
|
|
|
+ if (se->avg.last_update_time && !(flags & SKIP_AGE_LOAD))
|
|
|
+ __update_load_avg_se(now, cpu, cfs_rq, se);
|
|
|
|
|
|
- cfs_rq->runnable_load_avg += sa->load_avg;
|
|
|
- cfs_rq->runnable_load_sum += sa->load_sum;
|
|
|
+ decayed = update_cfs_rq_load_avg(now, cfs_rq);
|
|
|
+ decayed |= propagate_entity_load_avg(se);
|
|
|
+
|
|
|
+ if (!se->avg.last_update_time && (flags & DO_ATTACH)) {
|
|
|
|
|
|
- if (!sa->last_update_time) {
|
|
|
attach_entity_load_avg(cfs_rq, se);
|
|
|
update_tg_load_avg(cfs_rq, 0);
|
|
|
- }
|
|
|
-}
|
|
|
|
|
|
-/* Remove the runnable load generated by se from cfs_rq's runnable load average */
|
|
|
-static inline void
|
|
|
-dequeue_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
|
|
|
-{
|
|
|
- cfs_rq->runnable_load_avg =
|
|
|
- max_t(long, cfs_rq->runnable_load_avg - se->avg.load_avg, 0);
|
|
|
- cfs_rq->runnable_load_sum =
|
|
|
- max_t(s64, cfs_rq->runnable_load_sum - se->avg.load_sum, 0);
|
|
|
+ } else if (decayed && (flags & UPDATE_TG))
|
|
|
+ update_tg_load_avg(cfs_rq, 0);
|
|
|
}
|
|
|
|
|
|
#ifndef CONFIG_64BIT
|
|
@@ -3515,6 +3795,7 @@ void sync_entity_load_avg(struct sched_entity *se)
|
|
|
void remove_entity_load_avg(struct sched_entity *se)
|
|
|
{
|
|
|
struct cfs_rq *cfs_rq = cfs_rq_of(se);
|
|
|
+ unsigned long flags;
|
|
|
|
|
|
/*
|
|
|
* tasks cannot exit without having gone through wake_up_new_task() ->
|
|
@@ -3527,13 +3808,18 @@ void remove_entity_load_avg(struct sched_entity *se)
|
|
|
*/
|
|
|
|
|
|
sync_entity_load_avg(se);
|
|
|
- atomic_long_add(se->avg.load_avg, &cfs_rq->removed_load_avg);
|
|
|
- atomic_long_add(se->avg.util_avg, &cfs_rq->removed_util_avg);
|
|
|
+
|
|
|
+ raw_spin_lock_irqsave(&cfs_rq->removed.lock, flags);
|
|
|
+ ++cfs_rq->removed.nr;
|
|
|
+ cfs_rq->removed.util_avg += se->avg.util_avg;
|
|
|
+ cfs_rq->removed.load_avg += se->avg.load_avg;
|
|
|
+ cfs_rq->removed.runnable_sum += se->avg.load_sum; /* == runnable_sum */
|
|
|
+ raw_spin_unlock_irqrestore(&cfs_rq->removed.lock, flags);
|
|
|
}
|
|
|
|
|
|
static inline unsigned long cfs_rq_runnable_load_avg(struct cfs_rq *cfs_rq)
|
|
|
{
|
|
|
- return cfs_rq->runnable_load_avg;
|
|
|
+ return cfs_rq->avg.runnable_load_avg;
|
|
|
}
|
|
|
|
|
|
static inline unsigned long cfs_rq_load_avg(struct cfs_rq *cfs_rq)
|
|
@@ -3553,16 +3839,13 @@ update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq)
|
|
|
|
|
|
#define UPDATE_TG 0x0
|
|
|
#define SKIP_AGE_LOAD 0x0
|
|
|
+#define DO_ATTACH 0x0
|
|
|
|
|
|
-static inline void update_load_avg(struct sched_entity *se, int not_used1)
|
|
|
+static inline void update_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se, int not_used1)
|
|
|
{
|
|
|
- cfs_rq_util_change(cfs_rq_of(se));
|
|
|
+ cfs_rq_util_change(cfs_rq);
|
|
|
}
|
|
|
|
|
|
-static inline void
|
|
|
-enqueue_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) {}
|
|
|
-static inline void
|
|
|
-dequeue_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) {}
|
|
|
static inline void remove_entity_load_avg(struct sched_entity *se) {}
|
|
|
|
|
|
static inline void
|
|
@@ -3707,9 +3990,9 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
|
|
|
* its group cfs_rq
|
|
|
* - Add its new weight to cfs_rq->load.weight
|
|
|
*/
|
|
|
- update_load_avg(se, UPDATE_TG);
|
|
|
- enqueue_entity_load_avg(cfs_rq, se);
|
|
|
- update_cfs_shares(se);
|
|
|
+ update_load_avg(cfs_rq, se, UPDATE_TG | DO_ATTACH);
|
|
|
+ update_cfs_group(se);
|
|
|
+ enqueue_runnable_load_avg(cfs_rq, se);
|
|
|
account_entity_enqueue(cfs_rq, se);
|
|
|
|
|
|
if (flags & ENQUEUE_WAKEUP)
|
|
@@ -3791,8 +4074,8 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
|
|
|
* - For group entity, update its weight to reflect the new share
|
|
|
* of its group cfs_rq.
|
|
|
*/
|
|
|
- update_load_avg(se, UPDATE_TG);
|
|
|
- dequeue_entity_load_avg(cfs_rq, se);
|
|
|
+ update_load_avg(cfs_rq, se, UPDATE_TG);
|
|
|
+ dequeue_runnable_load_avg(cfs_rq, se);
|
|
|
|
|
|
update_stats_dequeue(cfs_rq, se, flags);
|
|
|
|
|
@@ -3815,7 +4098,7 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
|
|
|
/* return excess runtime on last dequeue */
|
|
|
return_cfs_rq_runtime(cfs_rq);
|
|
|
|
|
|
- update_cfs_shares(se);
|
|
|
+ update_cfs_group(se);
|
|
|
|
|
|
/*
|
|
|
* Now advance min_vruntime if @se was the entity holding it back,
|
|
@@ -3879,7 +4162,7 @@ set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
|
|
|
*/
|
|
|
update_stats_wait_end(cfs_rq, se);
|
|
|
__dequeue_entity(cfs_rq, se);
|
|
|
- update_load_avg(se, UPDATE_TG);
|
|
|
+ update_load_avg(cfs_rq, se, UPDATE_TG);
|
|
|
}
|
|
|
|
|
|
update_stats_curr_start(cfs_rq, se);
|
|
@@ -3981,7 +4264,7 @@ static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev)
|
|
|
/* Put 'current' back into the tree. */
|
|
|
__enqueue_entity(cfs_rq, prev);
|
|
|
/* in !on_rq case, update occurred at dequeue */
|
|
|
- update_load_avg(prev, 0);
|
|
|
+ update_load_avg(cfs_rq, prev, 0);
|
|
|
}
|
|
|
cfs_rq->curr = NULL;
|
|
|
}
|
|
@@ -3997,8 +4280,8 @@ entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued)
|
|
|
/*
|
|
|
* Ensure that runnable average is periodically updated.
|
|
|
*/
|
|
|
- update_load_avg(curr, UPDATE_TG);
|
|
|
- update_cfs_shares(curr);
|
|
|
+ update_load_avg(cfs_rq, curr, UPDATE_TG);
|
|
|
+ update_cfs_group(curr);
|
|
|
|
|
|
#ifdef CONFIG_SCHED_HRTICK
|
|
|
/*
|
|
@@ -4915,8 +5198,8 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
|
|
|
if (cfs_rq_throttled(cfs_rq))
|
|
|
break;
|
|
|
|
|
|
- update_load_avg(se, UPDATE_TG);
|
|
|
- update_cfs_shares(se);
|
|
|
+ update_load_avg(cfs_rq, se, UPDATE_TG);
|
|
|
+ update_cfs_group(se);
|
|
|
}
|
|
|
|
|
|
if (!se)
|
|
@@ -4974,8 +5257,8 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
|
|
|
if (cfs_rq_throttled(cfs_rq))
|
|
|
break;
|
|
|
|
|
|
- update_load_avg(se, UPDATE_TG);
|
|
|
- update_cfs_shares(se);
|
|
|
+ update_load_avg(cfs_rq, se, UPDATE_TG);
|
|
|
+ update_cfs_group(se);
|
|
|
}
|
|
|
|
|
|
if (!se)
|
|
@@ -5449,6 +5732,8 @@ static unsigned long capacity_spare_wake(int cpu, struct task_struct *p)
|
|
|
/*
|
|
|
* find_idlest_group finds and returns the least busy CPU group within the
|
|
|
* domain.
|
|
|
+ *
|
|
|
+ * Assumes p is allowed on at least one CPU in sd.
|
|
|
*/
|
|
|
static struct sched_group *
|
|
|
find_idlest_group(struct sched_domain *sd, struct task_struct *p,
|
|
@@ -5456,8 +5741,9 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p,
|
|
|
{
|
|
|
struct sched_group *idlest = NULL, *group = sd->groups;
|
|
|
struct sched_group *most_spare_sg = NULL;
|
|
|
- unsigned long min_runnable_load = ULONG_MAX, this_runnable_load = 0;
|
|
|
- unsigned long min_avg_load = ULONG_MAX, this_avg_load = 0;
|
|
|
+ unsigned long min_runnable_load = ULONG_MAX;
|
|
|
+ unsigned long this_runnable_load = ULONG_MAX;
|
|
|
+ unsigned long min_avg_load = ULONG_MAX, this_avg_load = ULONG_MAX;
|
|
|
unsigned long most_spare = 0, this_spare = 0;
|
|
|
int load_idx = sd->forkexec_idx;
|
|
|
int imbalance_scale = 100 + (sd->imbalance_pct-100)/2;
|
|
@@ -5578,10 +5864,10 @@ skip_spare:
|
|
|
}
|
|
|
|
|
|
/*
|
|
|
- * find_idlest_cpu - find the idlest cpu among the cpus in group.
|
|
|
+ * find_idlest_group_cpu - find the idlest cpu among the cpus in group.
|
|
|
*/
|
|
|
static int
|
|
|
-find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
|
|
|
+find_idlest_group_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
|
|
|
{
|
|
|
unsigned long load, min_load = ULONG_MAX;
|
|
|
unsigned int min_exit_latency = UINT_MAX;
|
|
@@ -5630,6 +5916,53 @@ find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
|
|
|
return shallowest_idle_cpu != -1 ? shallowest_idle_cpu : least_loaded_cpu;
|
|
|
}
|
|
|
|
|
|
+static inline int find_idlest_cpu(struct sched_domain *sd, struct task_struct *p,
|
|
|
+ int cpu, int prev_cpu, int sd_flag)
|
|
|
+{
|
|
|
+ int new_cpu = cpu;
|
|
|
+
|
|
|
+ if (!cpumask_intersects(sched_domain_span(sd), &p->cpus_allowed))
|
|
|
+ return prev_cpu;
|
|
|
+
|
|
|
+ while (sd) {
|
|
|
+ struct sched_group *group;
|
|
|
+ struct sched_domain *tmp;
|
|
|
+ int weight;
|
|
|
+
|
|
|
+ if (!(sd->flags & sd_flag)) {
|
|
|
+ sd = sd->child;
|
|
|
+ continue;
|
|
|
+ }
|
|
|
+
|
|
|
+ group = find_idlest_group(sd, p, cpu, sd_flag);
|
|
|
+ if (!group) {
|
|
|
+ sd = sd->child;
|
|
|
+ continue;
|
|
|
+ }
|
|
|
+
|
|
|
+ new_cpu = find_idlest_group_cpu(group, p, cpu);
|
|
|
+ if (new_cpu == cpu) {
|
|
|
+ /* Now try balancing at a lower domain level of cpu */
|
|
|
+ sd = sd->child;
|
|
|
+ continue;
|
|
|
+ }
|
|
|
+
|
|
|
+ /* Now try balancing at a lower domain level of new_cpu */
|
|
|
+ cpu = new_cpu;
|
|
|
+ weight = sd->span_weight;
|
|
|
+ sd = NULL;
|
|
|
+ for_each_domain(cpu, tmp) {
|
|
|
+ if (weight <= tmp->span_weight)
|
|
|
+ break;
|
|
|
+ if (tmp->flags & sd_flag)
|
|
|
+ sd = tmp;
|
|
|
+ }
|
|
|
+ /* while loop will break here if sd == NULL */
|
|
|
+ }
|
|
|
+
|
|
|
+ return new_cpu;
|
|
|
+}
|
|
|
+
|
|
|
#ifdef CONFIG_SCHED_SMT
|
|
|
|
|
|
static inline void set_idle_cores(int cpu, int val)
|
|
@@ -5982,50 +6315,30 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f
|
|
|
new_cpu = cpu;
|
|
|
}
|
|
|
|
|
|
+ if (sd && !(sd_flag & SD_BALANCE_FORK)) {
|
|
|
+ /*
|
|
|
+ * We're going to need the task's util for capacity_spare_wake
|
|
|
+ * in find_idlest_group. Sync it up to prev_cpu's
|
|
|
+ * last_update_time.
|
|
|
+ */
|
|
|
+ sync_entity_load_avg(&p->se);
|
|
|
+ }
|
|
|
+
|
|
|
if (!sd) {
|
|
|
- pick_cpu:
|
|
|
+pick_cpu:
|
|
|
if (sd_flag & SD_BALANCE_WAKE) /* XXX always ? */
|
|
|
new_cpu = select_idle_sibling(p, prev_cpu, new_cpu);
|
|
|
|
|
|
- } else while (sd) {
|
|
|
- struct sched_group *group;
|
|
|
- int weight;
|
|
|
-
|
|
|
- if (!(sd->flags & sd_flag)) {
|
|
|
- sd = sd->child;
|
|
|
- continue;
|
|
|
- }
|
|
|
-
|
|
|
- group = find_idlest_group(sd, p, cpu, sd_flag);
|
|
|
- if (!group) {
|
|
|
- sd = sd->child;
|
|
|
- continue;
|
|
|
- }
|
|
|
-
|
|
|
- new_cpu = find_idlest_cpu(group, p, cpu);
|
|
|
- if (new_cpu == -1 || new_cpu == cpu) {
|
|
|
- /* Now try balancing at a lower domain level of cpu */
|
|
|
- sd = sd->child;
|
|
|
- continue;
|
|
|
- }
|
|
|
-
|
|
|
- /* Now try balancing at a lower domain level of new_cpu */
|
|
|
- cpu = new_cpu;
|
|
|
- weight = sd->span_weight;
|
|
|
- sd = NULL;
|
|
|
- for_each_domain(cpu, tmp) {
|
|
|
- if (weight <= tmp->span_weight)
|
|
|
- break;
|
|
|
- if (tmp->flags & sd_flag)
|
|
|
- sd = tmp;
|
|
|
- }
|
|
|
- /* while loop will break here if sd == NULL */
|
|
|
+ } else {
|
|
|
+ new_cpu = find_idlest_cpu(sd, p, cpu, prev_cpu, sd_flag);
|
|
|
}
|
|
|
rcu_read_unlock();
|
|
|
|
|
|
return new_cpu;
|
|
|
}
|
|
|
|
|
|
+static void detach_entity_cfs_rq(struct sched_entity *se);
|
|
|
+
|
|
|
/*
|
|
|
* Called immediately before a task is migrated to a new cpu; task_cpu(p) and
|
|
|
* cfs_rq_of(p) references at time of call are still valid and identify the
|
|
@@ -6059,14 +6372,25 @@ static void migrate_task_rq_fair(struct task_struct *p)
|
|
|
se->vruntime -= min_vruntime;
|
|
|
}
|
|
|
|
|
|
- /*
|
|
|
- * We are supposed to update the task to "current" time, then its up to date
|
|
|
- * and ready to go to new CPU/cfs_rq. But we have difficulty in getting
|
|
|
- * what current time is, so simply throw away the out-of-date time. This
|
|
|
- * will result in the wakee task is less decayed, but giving the wakee more
|
|
|
- * load sounds not bad.
|
|
|
- */
|
|
|
- remove_entity_load_avg(&p->se);
|
|
|
+ if (p->on_rq == TASK_ON_RQ_MIGRATING) {
|
|
|
+ /*
|
|
|
+ * In case of TASK_ON_RQ_MIGRATING we in fact hold the 'old'
|
|
|
+ * rq->lock and can modify state directly.
|
|
|
+ */
|
|
|
+ lockdep_assert_held(&task_rq(p)->lock);
|
|
|
+ detach_entity_cfs_rq(&p->se);
|
|
|
+
|
|
|
+ } else {
|
|
|
+ /*
|
|
|
+ * We are supposed to update the task to "current" time, then
|
|
|
+ * its up to date and ready to go to new CPU/cfs_rq. But we
|
|
|
+ * have difficulty in getting what current time is, so simply
|
|
|
+ * throw away the out-of-date time. This will result in the
|
|
|
+ * wakee task is less decayed, but giving the wakee more load
|
|
|
+ * sounds not bad.
|
|
|
+ */
|
|
|
+ remove_entity_load_avg(&p->se);
|
|
|
+ }
|
|
|
|
|
|
/* Tell new CPU we are migrated */
|
|
|
p->se.avg.last_update_time = 0;
|
|
@@ -6334,10 +6658,7 @@ again:
|
|
|
set_next_entity(cfs_rq, se);
|
|
|
}
|
|
|
|
|
|
- if (hrtick_enabled(rq))
|
|
|
- hrtick_start_fair(rq, p);
|
|
|
-
|
|
|
- return p;
|
|
|
+ goto done;
|
|
|
simple:
|
|
|
#endif
|
|
|
|
|
@@ -6351,6 +6672,16 @@ simple:
|
|
|
|
|
|
p = task_of(se);
|
|
|
|
|
|
+done: __maybe_unused
|
|
|
+#ifdef CONFIG_SMP
|
|
|
+ /*
|
|
|
+ * Move the next running task to the front of
|
|
|
+ * the list, so our cfs_tasks list becomes MRU
|
|
|
+ * one.
|
|
|
+ */
|
|
|
+ list_move(&p->se.group_node, &rq->cfs_tasks);
|
|
|
+#endif
|
|
|
+
|
|
|
if (hrtick_enabled(rq))
|
|
|
hrtick_start_fair(rq, p);
|
|
|
|
|
@@ -6786,11 +7117,12 @@ static void detach_task(struct task_struct *p, struct lb_env *env)
|
|
|
*/
|
|
|
static struct task_struct *detach_one_task(struct lb_env *env)
|
|
|
{
|
|
|
- struct task_struct *p, *n;
|
|
|
+ struct task_struct *p;
|
|
|
|
|
|
lockdep_assert_held(&env->src_rq->lock);
|
|
|
|
|
|
- list_for_each_entry_safe(p, n, &env->src_rq->cfs_tasks, se.group_node) {
|
|
|
+ list_for_each_entry_reverse(p,
|
|
|
+ &env->src_rq->cfs_tasks, se.group_node) {
|
|
|
if (!can_migrate_task(p, env))
|
|
|
continue;
|
|
|
|
|
@@ -6836,7 +7168,7 @@ static int detach_tasks(struct lb_env *env)
|
|
|
if (env->idle != CPU_NOT_IDLE && env->src_rq->nr_running <= 1)
|
|
|
break;
|
|
|
|
|
|
- p = list_first_entry(tasks, struct task_struct, se.group_node);
|
|
|
+ p = list_last_entry(tasks, struct task_struct, se.group_node);
|
|
|
|
|
|
env->loop++;
|
|
|
/* We've more or less seen every task there is, call it quits */
|
|
@@ -6886,7 +7218,7 @@ static int detach_tasks(struct lb_env *env)
|
|
|
|
|
|
continue;
|
|
|
next:
|
|
|
- list_move_tail(&p->se.group_node, tasks);
|
|
|
+ list_move(&p->se.group_node, tasks);
|
|
|
}
|
|
|
|
|
|
/*
|
|
@@ -6962,7 +7294,7 @@ static inline bool cfs_rq_is_decayed(struct cfs_rq *cfs_rq)
|
|
|
if (cfs_rq->avg.util_sum)
|
|
|
return false;
|
|
|
|
|
|
- if (cfs_rq->runnable_load_sum)
|
|
|
+ if (cfs_rq->avg.runnable_load_sum)
|
|
|
return false;
|
|
|
|
|
|
return true;
|
|
@@ -6994,7 +7326,7 @@ static void update_blocked_averages(int cpu)
|
|
|
/* Propagate pending load changes to the parent, if any: */
|
|
|
se = cfs_rq->tg->se[cpu];
|
|
|
if (se && !skip_blocked_update(se))
|
|
|
- update_load_avg(se, 0);
|
|
|
+ update_load_avg(cfs_rq_of(se), se, 0);
|
|
|
|
|
|
/*
|
|
|
* There can be a lot of idle CPU cgroups. Don't let fully
|
|
@@ -7875,8 +8207,11 @@ static struct sched_group *find_busiest_group(struct lb_env *env)
|
|
|
if (busiest->group_type == group_imbalanced)
|
|
|
goto force_balance;
|
|
|
|
|
|
- /* SD_BALANCE_NEWIDLE trumps SMP nice when underutilized */
|
|
|
- if (env->idle == CPU_NEWLY_IDLE && group_has_capacity(env, local) &&
|
|
|
+ /*
|
|
|
+ * When dst_cpu is idle, prevent SMP nice and/or asymmetric group
|
|
|
+ * capacities from resulting in underutilization due to avg_load.
|
|
|
+ */
|
|
|
+ if (env->idle != CPU_NOT_IDLE && group_has_capacity(env, local) &&
|
|
|
busiest->group_no_capacity)
|
|
|
goto force_balance;
|
|
|
|
|
@@ -8693,7 +9028,7 @@ void nohz_balance_enter_idle(int cpu)
|
|
|
return;
|
|
|
|
|
|
/* Spare idle load balancing on CPUs that don't want to be disturbed: */
|
|
|
- if (!is_housekeeping_cpu(cpu))
|
|
|
+ if (!housekeeping_cpu(cpu, HK_FLAG_SCHED))
|
|
|
return;
|
|
|
|
|
|
if (test_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)))
|
|
@@ -9158,7 +9493,7 @@ static void propagate_entity_cfs_rq(struct sched_entity *se)
|
|
|
if (cfs_rq_throttled(cfs_rq))
|
|
|
break;
|
|
|
|
|
|
- update_load_avg(se, UPDATE_TG);
|
|
|
+ update_load_avg(cfs_rq, se, UPDATE_TG);
|
|
|
}
|
|
|
}
|
|
|
#else
|
|
@@ -9170,7 +9505,7 @@ static void detach_entity_cfs_rq(struct sched_entity *se)
|
|
|
struct cfs_rq *cfs_rq = cfs_rq_of(se);
|
|
|
|
|
|
/* Catch up with the cfs_rq and remove our load when we leave */
|
|
|
- update_load_avg(se, 0);
|
|
|
+ update_load_avg(cfs_rq, se, 0);
|
|
|
detach_entity_load_avg(cfs_rq, se);
|
|
|
update_tg_load_avg(cfs_rq, false);
|
|
|
propagate_entity_cfs_rq(se);
|
|
@@ -9189,7 +9524,7 @@ static void attach_entity_cfs_rq(struct sched_entity *se)
|
|
|
#endif
|
|
|
|
|
|
/* Synchronize entity with its cfs_rq */
|
|
|
- update_load_avg(se, sched_feat(ATTACH_AGE_LOAD) ? 0 : SKIP_AGE_LOAD);
|
|
|
+ update_load_avg(cfs_rq, se, sched_feat(ATTACH_AGE_LOAD) ? 0 : SKIP_AGE_LOAD);
|
|
|
attach_entity_load_avg(cfs_rq, se);
|
|
|
update_tg_load_avg(cfs_rq, false);
|
|
|
propagate_entity_cfs_rq(se);
|
|
@@ -9271,11 +9606,7 @@ void init_cfs_rq(struct cfs_rq *cfs_rq)
|
|
|
cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime;
|
|
|
#endif
|
|
|
#ifdef CONFIG_SMP
|
|
|
-#ifdef CONFIG_FAIR_GROUP_SCHED
|
|
|
- cfs_rq->propagate_avg = 0;
|
|
|
-#endif
|
|
|
- atomic_long_set(&cfs_rq->removed_load_avg, 0);
|
|
|
- atomic_long_set(&cfs_rq->removed_util_avg, 0);
|
|
|
+ raw_spin_lock_init(&cfs_rq->removed.lock);
|
|
|
#endif
|
|
|
}
|
|
|
|
|
@@ -9473,8 +9804,8 @@ int sched_group_set_shares(struct task_group *tg, unsigned long shares)
|
|
|
rq_lock_irqsave(rq, &rf);
|
|
|
update_rq_clock(rq);
|
|
|
for_each_sched_entity(se) {
|
|
|
- update_load_avg(se, UPDATE_TG);
|
|
|
- update_cfs_shares(se);
|
|
|
+ update_load_avg(cfs_rq_of(se), se, UPDATE_TG);
|
|
|
+ update_cfs_group(se);
|
|
|
}
|
|
|
rq_unlock_irqrestore(rq, &rf);
|
|
|
}
|