@@ -37,7 +37,6 @@

 /*
  * Targeted preemption latency for CPU-bound tasks:
- * (default: 6ms * (1 + ilog(ncpus)), units: nanoseconds)
  *
  * NOTE: this latency value is not the same as the concept of
  * 'timeslice length' - timeslices in CFS are of variable length
@@ -46,31 +45,35 @@
  *
  * (to see the precise effective timeslice length of your workload,
  * run vmstat and monitor the context-switches (cs) field)
+ *
+ * (default: 6ms * (1 + ilog(ncpus)), units: nanoseconds)
  */
-unsigned int sysctl_sched_latency = 6000000ULL;
-unsigned int normalized_sysctl_sched_latency = 6000000ULL;
+unsigned int sysctl_sched_latency			= 6000000ULL;
+unsigned int normalized_sysctl_sched_latency		= 6000000ULL;

 /*
  * The initial- and re-scaling of tunables is configurable
- * (default SCHED_TUNABLESCALING_LOG = *(1+ilog(ncpus))
  *
  * Options are:
- * SCHED_TUNABLESCALING_NONE - unscaled, always *1
- * SCHED_TUNABLESCALING_LOG - scaled logarithmical, *1+ilog(ncpus)
- * SCHED_TUNABLESCALING_LINEAR - scaled linear, *ncpus
+ *
+ *   SCHED_TUNABLESCALING_NONE   - unscaled, always *1
+ *   SCHED_TUNABLESCALING_LOG    - scaled logarithmically, *1+ilog(ncpus)
+ *   SCHED_TUNABLESCALING_LINEAR - scaled linearly, *ncpus
+ *
+ * (default: SCHED_TUNABLESCALING_LOG = *(1+ilog(ncpus)))
  */
-enum sched_tunable_scaling sysctl_sched_tunable_scaling
-	= SCHED_TUNABLESCALING_LOG;
+enum sched_tunable_scaling sysctl_sched_tunable_scaling = SCHED_TUNABLESCALING_LOG;
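/*
 * Illustrative userspace sketch (not part of the patch): how the default
 * SCHED_TUNABLESCALING_LOG mode above turns the normalized values into
 * effective ones.  ilog() stands in for the kernel's ilog2(); the kernel
 * additionally caps the CPU count it feeds into this factor.
 */
#include <stdio.h>

static unsigned int ilog(unsigned int n)	/* integer log2, n >= 1 */
{
	unsigned int l = 0;

	while (n >>= 1)
		l++;
	return l;
}

static unsigned long long scale_log(unsigned long long norm, unsigned int ncpus)
{
	return norm * (1 + ilog(ncpus));	/* SCHED_TUNABLESCALING_LOG */
}

int main(void)
{
	/* 6ms normalized latency on an 8-CPU machine: 6ms * (1 + 3) = 24ms */
	printf("latency: %llu ns\n", scale_log(6000000ULL, 8));
	/* 0.75ms minimum granularity scales the same way: 3ms */
	printf("min_granularity: %llu ns\n", scale_log(750000ULL, 8));
	return 0;
}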

 /*
  * Minimal preemption granularity for CPU-bound tasks:
+ *
  * (default: 0.75 msec * (1 + ilog(ncpus)), units: nanoseconds)
  */
-unsigned int sysctl_sched_min_granularity = 750000ULL;
-unsigned int normalized_sysctl_sched_min_granularity = 750000ULL;
+unsigned int sysctl_sched_min_granularity		= 750000ULL;
+unsigned int normalized_sysctl_sched_min_granularity	= 750000ULL;

 /*
- * is kept at sysctl_sched_latency / sysctl_sched_min_granularity
+ * This value is kept at sysctl_sched_latency/sysctl_sched_min_granularity
  */
 static unsigned int sched_nr_latency = 8;

@@ -82,23 +85,27 @@ unsigned int sysctl_sched_child_runs_first __read_mostly;

 /*
  * SCHED_OTHER wake-up granularity.
- * (default: 1 msec * (1 + ilog(ncpus)), units: nanoseconds)
  *
  * This option delays the preemption effects of decoupled workloads
  * and reduces their over-scheduling. Synchronous workloads will still
  * have immediate wakeup/sleep latencies.
+ *
+ * (default: 1 msec * (1 + ilog(ncpus)), units: nanoseconds)
  */
-unsigned int sysctl_sched_wakeup_granularity = 1000000UL;
-unsigned int normalized_sysctl_sched_wakeup_granularity = 1000000UL;
+unsigned int sysctl_sched_wakeup_granularity		= 1000000UL;
+unsigned int normalized_sysctl_sched_wakeup_granularity	= 1000000UL;

-const_debug unsigned int sysctl_sched_migration_cost = 500000UL;
+const_debug unsigned int sysctl_sched_migration_cost	= 500000UL;

+#ifdef CONFIG_SMP
 /*
- * The exponential sliding window over which load is averaged for shares
- * distribution.
- * (default: 10msec)
+ * For asym packing, by default the lower numbered cpu has higher priority.
  */
-unsigned int __read_mostly sysctl_sched_shares_window = 10000000UL;
+int __weak arch_asym_cpu_priority(int cpu)
+{
+	return -cpu;
+}
+#endif
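/*
 * Illustrative sketch (not kernel code): with the default priority above
 * (minus the CPU number), a "higher priority wins" comparison like the
 * sched_asym_prefer() helper used later in this patch still favours
 * lower-numbered CPUs, so the old ITMT-less packing behaviour is preserved.
 */
#include <stdio.h>
#include <stdbool.h>

static int arch_asym_cpu_priority(int cpu)
{
	return -cpu;		/* default: lower CPU number, higher priority */
}

static bool sched_asym_prefer(int a, int b)
{
	return arch_asym_cpu_priority(a) > arch_asym_cpu_priority(b);
}

int main(void)
{
	/* CPU 0 is preferred over CPU 2, matching the old "dst_cpu < src_cpu" test */
	printf("prefer(0, 2) = %d\n", sched_asym_prefer(0, 2));	/* 1 */
	printf("prefer(2, 0) = %d\n", sched_asym_prefer(2, 0));	/* 0 */
	return 0;
}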

 #ifdef CONFIG_CFS_BANDWIDTH
 /*
@@ -109,16 +116,18 @@ unsigned int __read_mostly sysctl_sched_shares_window = 10000000UL;
  * to consumption or the quota being specified to be smaller than the slice)
  * we will always only issue the remaining available time.
  *
- * default: 5 msec, units: microseconds
- */
-unsigned int sysctl_sched_cfs_bandwidth_slice = 5000UL;
+ * (default: 5 msec, units: microseconds)
+ */
+unsigned int sysctl_sched_cfs_bandwidth_slice		= 5000UL;
 #endif

 /*
  * The margin used when comparing utilization with CPU capacity:
- * util * 1024 < capacity * margin
+ * util * margin < capacity * 1024
+ *
+ * (default: ~20%)
  */
-unsigned int capacity_margin = 1280; /* ~20% */
+unsigned int capacity_margin = 1280;
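/*
 * Quick arithmetic sketch (illustrative only): with capacity_margin = 1280
 * the test "util * margin < capacity * 1024" keeps roughly 20% headroom.
 * A CPU of capacity 1024 passes only while util <= 1024 * 1024 / 1280 = 819.
 */
#include <stdio.h>
#include <stdbool.h>

static unsigned int capacity_margin = 1280;

static bool fits(unsigned long util, unsigned long capacity)
{
	return util * capacity_margin < capacity * 1024;
}

int main(void)
{
	printf("%d\n", fits(819, 1024));	/* 1: 819 * 1280 = 1048320 < 1048576 */
	printf("%d\n", fits(820, 1024));	/* 0: 820 * 1280 = 1049600 >= 1048576 */
	return 0;
}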

 static inline void update_load_add(struct load_weight *lw, unsigned long inc)
 {
@@ -290,19 +299,59 @@ static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp)
 static inline void list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq)
 {
 	if (!cfs_rq->on_list) {
+		struct rq *rq = rq_of(cfs_rq);
+		int cpu = cpu_of(rq);
 		/*
 		 * Ensure we either appear before our parent (if already
 		 * enqueued) or force our parent to appear after us when it is
-		 * enqueued.  The fact that we always enqueue bottom-up
-		 * reduces this to two cases.
+		 * enqueued. The fact that we always enqueue bottom-up
+		 * reduces this to two cases and a special case for the root
+		 * cfs_rq. Furthermore, it also means that we will always reset
+		 * tmp_alone_branch either when the branch is connected
+		 * to a tree or when we reach the beginning of the tree.
 		 */
 		if (cfs_rq->tg->parent &&
-		    cfs_rq->tg->parent->cfs_rq[cpu_of(rq_of(cfs_rq))]->on_list) {
-			list_add_rcu(&cfs_rq->leaf_cfs_rq_list,
-				&rq_of(cfs_rq)->leaf_cfs_rq_list);
-		} else {
+		    cfs_rq->tg->parent->cfs_rq[cpu]->on_list) {
+			/*
+			 * If the parent is already on the list, we add the
+			 * child just before it. Thanks to the circular linked
+			 * property of the list, this puts the child at the
+			 * tail of the list that starts at the parent.
+			 */
 			list_add_tail_rcu(&cfs_rq->leaf_cfs_rq_list,
-				&rq_of(cfs_rq)->leaf_cfs_rq_list);
+				&(cfs_rq->tg->parent->cfs_rq[cpu]->leaf_cfs_rq_list));
+			/*
+			 * The branch is now connected to its tree so we can
+			 * reset tmp_alone_branch to the beginning of the
+			 * list.
+			 */
+			rq->tmp_alone_branch = &rq->leaf_cfs_rq_list;
+		} else if (!cfs_rq->tg->parent) {
+			/*
+			 * A cfs_rq without a parent should be put
+			 * at the tail of the list.
+			 */
+			list_add_tail_rcu(&cfs_rq->leaf_cfs_rq_list,
+				&rq->leaf_cfs_rq_list);
+			/*
+			 * We have reached the beginning of a tree, so we can
+			 * reset tmp_alone_branch to the beginning of the list.
+			 */
+			rq->tmp_alone_branch = &rq->leaf_cfs_rq_list;
+		} else {
+			/*
+			 * The parent has not been added yet, so we want to
+			 * make sure that it will be put after us.
+			 * tmp_alone_branch points to the beginning of the
+			 * branch where we will add the parent.
+			 */
+			list_add_rcu(&cfs_rq->leaf_cfs_rq_list,
+				rq->tmp_alone_branch);
+			/*
+			 * Update tmp_alone_branch to point to the new
+			 * beginning of the branch.
+			 */
+			rq->tmp_alone_branch = &cfs_rq->leaf_cfs_rq_list;
 		}

 		cfs_rq->on_list = 1;
@@ -708,9 +757,7 @@ void init_entity_runnable_average(struct sched_entity *se)
 }

 static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq);
-static int update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq, bool update_freq);
-static void update_tg_load_avg(struct cfs_rq *cfs_rq, int force);
-static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se);
+static void attach_entity_cfs_rq(struct sched_entity *se);

 /*
  * With new tasks being created, their initial util_avgs are extrapolated
@@ -742,7 +789,6 @@ void post_init_entity_util_avg(struct sched_entity *se)
 	struct cfs_rq *cfs_rq = cfs_rq_of(se);
 	struct sched_avg *sa = &se->avg;
 	long cap = (long)(SCHED_CAPACITY_SCALE - cfs_rq->avg.util_avg) / 2;
-	u64 now = cfs_rq_clock_task(cfs_rq);

 	if (cap > 0) {
 		if (cfs_rq->avg.util_avg != 0) {
@@ -770,14 +816,12 @@ void post_init_entity_util_avg(struct sched_entity *se)
 			 * such that the next switched_to_fair() has the
 			 * expected state.
 			 */
-			se->avg.last_update_time = now;
+			se->avg.last_update_time = cfs_rq_clock_task(cfs_rq);
 			return;
 		}
 	}

-	update_cfs_rq_load_avg(now, cfs_rq, false);
-	attach_entity_load_avg(cfs_rq, se);
-	update_tg_load_avg(cfs_rq, false);
+	attach_entity_cfs_rq(se);
 }

 #else /* !CONFIG_SMP */
@@ -2890,6 +2934,26 @@ __update_load_avg(u64 now, int cpu, struct sched_avg *sa,
 	return decayed;
 }

+/*
+ * Signed add and clamp on underflow.
+ *
+ * Explicitly do a load-store to ensure the intermediate value never hits
+ * memory. This allows lockless observations without ever seeing the negative
+ * values.
+ */
+#define add_positive(_ptr, _val) do {				\
+	typeof(_ptr) ptr = (_ptr);				\
+	typeof(_val) val = (_val);				\
+	typeof(*ptr) res, var = READ_ONCE(*ptr);		\
+								\
+	res = var + val;					\
+								\
+	if (val < 0 && res > var)				\
+		res = 0;					\
+								\
+	WRITE_ONCE(*ptr, res);					\
+} while (0)
+
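/*
 * Small userspace sketch (illustrative only, not kernel code) of what
 * add_positive() above guarantees: the sum is clamped at zero when a
 * negative delta would underflow the unsigned target, and only the clamped
 * value is ever stored back.  The kernel macro is type-generic via typeof()
 * and uses READ_ONCE()/WRITE_ONCE(); plain accesses are enough for the demo.
 */
#include <stdio.h>

#define add_positive(_ptr, _val) do {			\
	unsigned long *ptr = (_ptr);			\
	long val = (_val);				\
	unsigned long res, var = *ptr;			\
							\
	res = var + val;				\
							\
	if (val < 0 && res > var)			\
		res = 0;				\
							\
	*ptr = res;					\
} while (0)

int main(void)
{
	unsigned long load_avg = 100;

	add_positive(&load_avg, -40);	/* 60 */
	printf("%lu\n", load_avg);
	add_positive(&load_avg, -200);	/* would underflow: clamped to 0 */
	printf("%lu\n", load_avg);
	return 0;
}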
 #ifdef CONFIG_FAIR_GROUP_SCHED
 /**
  * update_tg_load_avg - update the tg's load avg
@@ -2969,8 +3033,138 @@ void set_task_rq_fair(struct sched_entity *se,
 		se->avg.last_update_time = n_last_update_time;
 	}
 }
+
+/* Take into account change of utilization of a child task group */
+static inline void
+update_tg_cfs_util(struct cfs_rq *cfs_rq, struct sched_entity *se)
+{
+	struct cfs_rq *gcfs_rq = group_cfs_rq(se);
+	long delta = gcfs_rq->avg.util_avg - se->avg.util_avg;
+
+	/* Nothing to update */
+	if (!delta)
+		return;
+
+	/* Set new sched_entity's utilization */
+	se->avg.util_avg = gcfs_rq->avg.util_avg;
+	se->avg.util_sum = se->avg.util_avg * LOAD_AVG_MAX;
+
+	/* Update parent cfs_rq utilization */
+	add_positive(&cfs_rq->avg.util_avg, delta);
+	cfs_rq->avg.util_sum = cfs_rq->avg.util_avg * LOAD_AVG_MAX;
+}
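/*
 * Worked example with made-up numbers (not from the patch): if a group
 * cfs_rq's util_avg has grown to 300 while the sched_entity representing it
 * at the parent level still carries 120, update_tg_cfs_util() copies 300
 * into the entity and adds the delta of 180 to the parent cfs_rq, so the
 * change becomes visible one level up without re-walking child entities.
 */
#include <stdio.h>

int main(void)
{
	long gcfs_util = 300;	/* group cfs_rq (child level) */
	long se_util   = 120;	/* entity representing the group at the parent */
	long cfs_util  = 500;	/* parent cfs_rq, still holding the stale 120 */

	long delta = gcfs_util - se_util;	/* 180 */

	se_util = gcfs_util;			/* entity mirrors its group */
	cfs_util += delta;			/* parent picks up the difference */

	printf("se=%ld parent=%ld\n", se_util, cfs_util);	/* se=300 parent=680 */
	return 0;
}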
+
+/* Take into account change of load of a child task group */
+static inline void
+update_tg_cfs_load(struct cfs_rq *cfs_rq, struct sched_entity *se)
+{
+	struct cfs_rq *gcfs_rq = group_cfs_rq(se);
+	long delta, load = gcfs_rq->avg.load_avg;
+
+	/*
+	 * If the load of the group cfs_rq is zero, the load of the
+	 * sched_entity will also be zero, so we can skip the formula.
+	 */
+	if (load) {
+		long tg_load;
+
+		/* Get tg's load and ensure tg_load > 0 */
+		tg_load = atomic_long_read(&gcfs_rq->tg->load_avg) + 1;
+
+		/* Ensure tg_load >= load and is updated with the current load */
+		tg_load -= gcfs_rq->tg_load_avg_contrib;
+		tg_load += load;
+
+		/*
+		 * We need to compute a correction term in the case that the
+		 * task group is consuming more CPU than a task of equal
+		 * weight. A task with a weight equal to tg->shares will have
+		 * a load less than or equal to scale_load_down(tg->shares).
+		 * Similarly, the sched_entities that represent the task group
+		 * at the parent level can't have a load higher than
+		 * scale_load_down(tg->shares), and the sum of sched_entities'
+		 * load must be <= scale_load_down(tg->shares).
+		 */
+		if (tg_load > scale_load_down(gcfs_rq->tg->shares)) {
+			/* scale gcfs_rq's load into tg's shares */
+			load *= scale_load_down(gcfs_rq->tg->shares);
+			load /= tg_load;
+		}
+	}
+
+	delta = load - se->avg.load_avg;
+
+	/* Nothing to update */
+	if (!delta)
+		return;
+
+	/* Set new sched_entity's load */
+	se->avg.load_avg = load;
+	se->avg.load_sum = se->avg.load_avg * LOAD_AVG_MAX;
+
+	/* Update parent cfs_rq load */
+	add_positive(&cfs_rq->avg.load_avg, delta);
+	cfs_rq->avg.load_sum = cfs_rq->avg.load_avg * LOAD_AVG_MAX;
+
+	/*
+	 * If the sched_entity is already enqueued, we also have to update the
+	 * runnable load avg.
+	 */
+	if (se->on_rq) {
+		/* Update parent cfs_rq runnable_load_avg */
+		add_positive(&cfs_rq->runnable_load_avg, delta);
+		cfs_rq->runnable_load_sum = cfs_rq->runnable_load_avg * LOAD_AVG_MAX;
+	}
+}
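/*
 * Worked example of the clamp above (illustrative numbers): a group with
 * tg->shares = 1024 whose cfs_rq on this CPU carries load_avg = 900, while
 * the whole tg has tg_load = 3000, may only contribute
 * 900 * 1024 / 3000 = 307 through its parent-level sched_entity, keeping
 * the sum of the group's per-CPU entities within scale_load_down(tg->shares).
 */
#include <stdio.h>

int main(void)
{
	long shares  = 1024;	/* scale_load_down(tg->shares) */
	long load    = 900;	/* this CPU's gcfs_rq->avg.load_avg */
	long tg_load = 3000;	/* tg-wide load, already >= load */

	if (tg_load > shares) {
		load *= shares;
		load /= tg_load;
	}
	printf("propagated load = %ld\n", load);	/* 307 */
	return 0;
}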
+
+static inline void set_tg_cfs_propagate(struct cfs_rq *cfs_rq)
+{
+	cfs_rq->propagate_avg = 1;
+}
+
+static inline int test_and_clear_tg_cfs_propagate(struct sched_entity *se)
+{
+	struct cfs_rq *cfs_rq = group_cfs_rq(se);
+
+	if (!cfs_rq->propagate_avg)
+		return 0;
+
+	cfs_rq->propagate_avg = 0;
+	return 1;
+}
+
+/* Update task and its cfs_rq load average */
+static inline int propagate_entity_load_avg(struct sched_entity *se)
+{
+	struct cfs_rq *cfs_rq;
+
+	if (entity_is_task(se))
+		return 0;
+
+	if (!test_and_clear_tg_cfs_propagate(se))
+		return 0;
+
+	cfs_rq = cfs_rq_of(se);
+
+	set_tg_cfs_propagate(cfs_rq);
+
+	update_tg_cfs_util(cfs_rq, se);
+	update_tg_cfs_load(cfs_rq, se);
+
+	return 1;
+}
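/*
 * Toy model (not kernel code) of how the propagate flag climbs the group
 * hierarchy: each update on a group entity consumes its child cfs_rq's flag,
 * folds the pending change into its own cfs_rq and flags it in turn, so a
 * change moves up one level per update until it reaches the root cfs_rq.
 */
#include <stdio.h>

#define LEVELS 3			/* 0 = root cfs_rq, 2 = deepest group */

static long load[LEVELS]      = { 400, 250, 100 };
static int  propagate[LEVELS] = { 0, 0, 0 };

/* A change happened on the deepest group: mark it pending */
static void child_changed(long delta)
{
	load[LEVELS - 1] += delta;
	propagate[LEVELS - 1] = 1;
}

/* Caricature of what update_load_avg() does for the entity of level 'lvl' */
static void update_level(int lvl, long delta)
{
	if (!propagate[lvl + 1])	/* test_and_clear on the child group */
		return;
	propagate[lvl + 1] = 0;
	load[lvl] += delta;		/* update_tg_cfs_load()/_util() */
	propagate[lvl] = 1;		/* set_tg_cfs_propagate() on our cfs_rq */
}

int main(void)
{
	child_changed(50);
	update_level(1, 50);		/* middle level picks it up */
	update_level(0, 50);		/* root picks it up on the next update */
	printf("root=%ld mid=%ld leaf=%ld\n", load[0], load[1], load[2]);
	return 0;
}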
+
 #else /* CONFIG_FAIR_GROUP_SCHED */
+
 static inline void update_tg_load_avg(struct cfs_rq *cfs_rq, int force) {}
+
+static inline int propagate_entity_load_avg(struct sched_entity *se)
+{
+	return 0;
+}
+
+static inline void set_tg_cfs_propagate(struct cfs_rq *cfs_rq) {}
+
 #endif /* CONFIG_FAIR_GROUP_SCHED */

 static inline void cfs_rq_util_change(struct cfs_rq *cfs_rq)
@@ -3041,6 +3235,7 @@ update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq, bool update_freq)
 		sub_positive(&sa->load_avg, r);
 		sub_positive(&sa->load_sum, r * LOAD_AVG_MAX);
 		removed_load = 1;
+		set_tg_cfs_propagate(cfs_rq);
 	}

 	if (atomic_long_read(&cfs_rq->removed_util_avg)) {
@@ -3048,6 +3243,7 @@ update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq, bool update_freq)
 		sub_positive(&sa->util_avg, r);
 		sub_positive(&sa->util_sum, r * LOAD_AVG_MAX);
 		removed_util = 1;
+		set_tg_cfs_propagate(cfs_rq);
 	}

 	decayed = __update_load_avg(now, cpu_of(rq_of(cfs_rq)), sa,
@@ -3064,23 +3260,35 @@ update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq, bool update_freq)
 	return decayed || removed_load;
 }

+/*
+ * Optional action to be done while updating the load average
+ */
+#define UPDATE_TG	0x1
+#define SKIP_AGE_LOAD	0x2
+
 /* Update task and its cfs_rq load average */
-static inline void update_load_avg(struct sched_entity *se, int update_tg)
+static inline void update_load_avg(struct sched_entity *se, int flags)
 {
 	struct cfs_rq *cfs_rq = cfs_rq_of(se);
 	u64 now = cfs_rq_clock_task(cfs_rq);
 	struct rq *rq = rq_of(cfs_rq);
 	int cpu = cpu_of(rq);
+	int decayed;

 	/*
 	 * Track task load average for carrying it to new CPU after migrated, and
 	 * track group sched_entity load average for task_h_load calc in migration
 	 */
-	__update_load_avg(now, cpu, &se->avg,
+	if (se->avg.last_update_time && !(flags & SKIP_AGE_LOAD)) {
+		__update_load_avg(now, cpu, &se->avg,
 			  se->on_rq * scale_load_down(se->load.weight),
 			  cfs_rq->curr == se, NULL);
+	}

-	if (update_cfs_rq_load_avg(now, cfs_rq, true) && update_tg)
+	decayed  = update_cfs_rq_load_avg(now, cfs_rq, true);
+	decayed |= propagate_entity_load_avg(se);
+
+	if (decayed && (flags & UPDATE_TG))
 		update_tg_load_avg(cfs_rq, 0);
 }

@@ -3094,31 +3302,12 @@ static inline void update_load_avg(struct sched_entity *se, int update_tg)
  */
 static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
 {
-	if (!sched_feat(ATTACH_AGE_LOAD))
-		goto skip_aging;
-
-	/*
-	 * If we got migrated (either between CPUs or between cgroups) we'll
-	 * have aged the average right before clearing @last_update_time.
-	 *
-	 * Or we're fresh through post_init_entity_util_avg().
-	 */
-	if (se->avg.last_update_time) {
-		__update_load_avg(cfs_rq->avg.last_update_time, cpu_of(rq_of(cfs_rq)),
-				  &se->avg, 0, 0, NULL);
-
-		/*
-		 * XXX: we could have just aged the entire load away if we've been
-		 * absent from the fair class for too long.
-		 */
-	}
-
-skip_aging:
 	se->avg.last_update_time = cfs_rq->avg.last_update_time;
 	cfs_rq->avg.load_avg += se->avg.load_avg;
 	cfs_rq->avg.load_sum += se->avg.load_sum;
 	cfs_rq->avg.util_avg += se->avg.util_avg;
 	cfs_rq->avg.util_sum += se->avg.util_sum;
+	set_tg_cfs_propagate(cfs_rq);

 	cfs_rq_util_change(cfs_rq);
 }
@@ -3133,14 +3322,12 @@ static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s
  */
 static void detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
 {
-	__update_load_avg(cfs_rq->avg.last_update_time, cpu_of(rq_of(cfs_rq)),
-			  &se->avg, se->on_rq * scale_load_down(se->load.weight),
-			  cfs_rq->curr == se, NULL);

 	sub_positive(&cfs_rq->avg.load_avg, se->avg.load_avg);
 	sub_positive(&cfs_rq->avg.load_sum, se->avg.load_sum);
 	sub_positive(&cfs_rq->avg.util_avg, se->avg.util_avg);
 	sub_positive(&cfs_rq->avg.util_sum, se->avg.util_sum);
+	set_tg_cfs_propagate(cfs_rq);

 	cfs_rq_util_change(cfs_rq);
 }
@@ -3150,34 +3337,20 @@ static inline void
 enqueue_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
 {
 	struct sched_avg *sa = &se->avg;
-	u64 now = cfs_rq_clock_task(cfs_rq);
-	int migrated, decayed;
-
-	migrated = !sa->last_update_time;
-	if (!migrated) {
-		__update_load_avg(now, cpu_of(rq_of(cfs_rq)), sa,
-			se->on_rq * scale_load_down(se->load.weight),
-			cfs_rq->curr == se, NULL);
-	}
-
-	decayed = update_cfs_rq_load_avg(now, cfs_rq, !migrated);

 	cfs_rq->runnable_load_avg += sa->load_avg;
 	cfs_rq->runnable_load_sum += sa->load_sum;

-	if (migrated)
+	if (!sa->last_update_time) {
 		attach_entity_load_avg(cfs_rq, se);
-
-	if (decayed || migrated)
 		update_tg_load_avg(cfs_rq, 0);
+	}
 }

 /* Remove the runnable load generated by se from cfs_rq's runnable load average */
 static inline void
 dequeue_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
 {
-	update_load_avg(se, 1);
-
 	cfs_rq->runnable_load_avg =
 		max_t(long, cfs_rq->runnable_load_avg - se->avg.load_avg, 0);
 	cfs_rq->runnable_load_sum =
@@ -3205,6 +3378,19 @@ static inline u64 cfs_rq_last_update_time(struct cfs_rq *cfs_rq)
 }
 #endif

+/*
+ * Synchronize entity load avg of dequeued entity without locking
+ * the previous rq.
+ */
+void sync_entity_load_avg(struct sched_entity *se)
+{
+	struct cfs_rq *cfs_rq = cfs_rq_of(se);
+	u64 last_update_time;
+
+	last_update_time = cfs_rq_last_update_time(cfs_rq);
+	__update_load_avg(last_update_time, cpu_of(rq_of(cfs_rq)), &se->avg, 0, 0, NULL);
+}
+
 /*
  * Task first catches up with cfs_rq, and then subtract
  * itself from the cfs_rq (task must be off the queue now).
@@ -3212,7 +3398,6 @@ static inline u64 cfs_rq_last_update_time(struct cfs_rq *cfs_rq)
 void remove_entity_load_avg(struct sched_entity *se)
 {
 	struct cfs_rq *cfs_rq = cfs_rq_of(se);
-	u64 last_update_time;

 	/*
 	 * tasks cannot exit without having gone through wake_up_new_task() ->
@@ -3224,9 +3409,7 @@ void remove_entity_load_avg(struct sched_entity *se)
 	 * calls this.
 	 */

-	last_update_time = cfs_rq_last_update_time(cfs_rq);
-
-	__update_load_avg(last_update_time, cpu_of(rq_of(cfs_rq)), &se->avg, 0, 0, NULL);
+	sync_entity_load_avg(se);
 	atomic_long_add(se->avg.load_avg, &cfs_rq->removed_load_avg);
 	atomic_long_add(se->avg.util_avg, &cfs_rq->removed_util_avg);
 }
@@ -3251,7 +3434,10 @@ update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq, bool update_freq)
 	return 0;
 }

-static inline void update_load_avg(struct sched_entity *se, int not_used)
+#define UPDATE_TG	0x0
+#define SKIP_AGE_LOAD	0x0
+
+static inline void update_load_avg(struct sched_entity *se, int not_used1)
 {
 	cpufreq_update_util(rq_of(cfs_rq_of(se)), 0);
 }
@@ -3396,6 +3582,7 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
 	if (renorm && !curr)
 		se->vruntime += cfs_rq->min_vruntime;

+	update_load_avg(se, UPDATE_TG);
 	enqueue_entity_load_avg(cfs_rq, se);
 	account_entity_enqueue(cfs_rq, se);
 	update_cfs_shares(cfs_rq);
@@ -3470,6 +3657,7 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
 	 * Update run-time statistics of the 'current'.
 	 */
 	update_curr(cfs_rq);
+	update_load_avg(se, UPDATE_TG);
 	dequeue_entity_load_avg(cfs_rq, se);

 	update_stats_dequeue(cfs_rq, se, flags);
@@ -3557,7 +3745,7 @@ set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
 		 */
 		update_stats_wait_end(cfs_rq, se);
 		__dequeue_entity(cfs_rq, se);
-		update_load_avg(se, 1);
+		update_load_avg(se, UPDATE_TG);
 	}

 	update_stats_curr_start(cfs_rq, se);
@@ -3675,7 +3863,7 @@ entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued)
 	/*
 	 * Ensure that runnable average is periodically updated.
 	 */
-	update_load_avg(curr, 1);
+	update_load_avg(curr, UPDATE_TG);
 	update_cfs_shares(cfs_rq);

 #ifdef CONFIG_SCHED_HRTICK
@@ -4572,7 +4760,7 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
 		if (cfs_rq_throttled(cfs_rq))
 			break;

-		update_load_avg(se, 1);
+		update_load_avg(se, UPDATE_TG);
 		update_cfs_shares(cfs_rq);
 	}

@@ -4631,7 +4819,7 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
 		if (cfs_rq_throttled(cfs_rq))
 			break;

-		update_load_avg(se, 1);
+		update_load_avg(se, UPDATE_TG);
 		update_cfs_shares(cfs_rq);
 	}

@@ -5199,6 +5387,14 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p,
 	return 1;
 }

+static inline int task_util(struct task_struct *p);
+static int cpu_util_wake(int cpu, struct task_struct *p);
+
+static unsigned long capacity_spare_wake(int cpu, struct task_struct *p)
+{
+	return capacity_orig_of(cpu) - cpu_util_wake(cpu, p);
+}
+
 /*
  * find_idlest_group finds and returns the least busy CPU group within the
  * domain.
@@ -5208,15 +5404,21 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p,
 		  int this_cpu, int sd_flag)
 {
 	struct sched_group *idlest = NULL, *group = sd->groups;
-	unsigned long min_load = ULONG_MAX, this_load = 0;
+	struct sched_group *most_spare_sg = NULL;
+	unsigned long min_runnable_load = ULONG_MAX, this_runnable_load = 0;
+	unsigned long min_avg_load = ULONG_MAX, this_avg_load = 0;
+	unsigned long most_spare = 0, this_spare = 0;
 	int load_idx = sd->forkexec_idx;
-	int imbalance = 100 + (sd->imbalance_pct-100)/2;
+	int imbalance_scale = 100 + (sd->imbalance_pct-100)/2;
+	unsigned long imbalance = scale_load_down(NICE_0_LOAD) *
+				(sd->imbalance_pct-100) / 100;

 	if (sd_flag & SD_BALANCE_WAKE)
 		load_idx = sd->wake_idx;

 	do {
-		unsigned long load, avg_load;
+		unsigned long load, avg_load, runnable_load;
+		unsigned long spare_cap, max_spare_cap;
 		int local_group;
 		int i;

@@ -5228,8 +5430,13 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p,
 		local_group = cpumask_test_cpu(this_cpu,
 					       sched_group_cpus(group));

-		/* Tally up the load of all CPUs in the group */
+		/*
+		 * Tally up the load of all CPUs in the group and find
+		 * the group containing the CPU with most spare capacity.
+		 */
 		avg_load = 0;
+		runnable_load = 0;
+		max_spare_cap = 0;

 		for_each_cpu(i, sched_group_cpus(group)) {
 			/* Bias balancing toward cpus of our domain */
@@ -5238,22 +5445,84 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p,
 			else
 				load = target_load(i, load_idx);

-			avg_load += load;
+			runnable_load += load;
+
+			avg_load += cfs_rq_load_avg(&cpu_rq(i)->cfs);
+
+			spare_cap = capacity_spare_wake(i, p);
+
+			if (spare_cap > max_spare_cap)
+				max_spare_cap = spare_cap;
 		}

 		/* Adjust by relative CPU capacity of the group */
-		avg_load = (avg_load * SCHED_CAPACITY_SCALE) / group->sgc->capacity;
+		avg_load = (avg_load * SCHED_CAPACITY_SCALE) /
+					group->sgc->capacity;
+		runnable_load = (runnable_load * SCHED_CAPACITY_SCALE) /
+					group->sgc->capacity;

 		if (local_group) {
-			this_load = avg_load;
-		} else if (avg_load < min_load) {
-			min_load = avg_load;
-			idlest = group;
+			this_runnable_load = runnable_load;
+			this_avg_load = avg_load;
+			this_spare = max_spare_cap;
+		} else {
+			if (min_runnable_load > (runnable_load + imbalance)) {
+				/*
+				 * The runnable load is significantly smaller
+				 * so we can pick this new cpu
+				 */
+				min_runnable_load = runnable_load;
+				min_avg_load = avg_load;
+				idlest = group;
+			} else if ((runnable_load < (min_runnable_load + imbalance)) &&
+				   (100*min_avg_load > imbalance_scale*avg_load)) {
+				/*
+				 * The runnable loads are close so take the
+				 * blocked load into account through avg_load.
+				 */
+				min_avg_load = avg_load;
+				idlest = group;
+			}
+
+			if (most_spare < max_spare_cap) {
+				most_spare = max_spare_cap;
+				most_spare_sg = group;
+			}
 		}
 	} while (group = group->next, group != sd->groups);

-	if (!idlest || 100*this_load < imbalance*min_load)
+	/*
+	 * The cross-over point between using spare capacity or least load
+	 * is too conservative for high utilization tasks on partially
+	 * utilized systems if we require spare_capacity > task_util(p),
+	 * so we allow for some task stuffing by using
+	 * spare_capacity > task_util(p)/2.
+	 *
+	 * Spare capacity can't be used for fork because the utilization has
+	 * not been set yet, we must first select a rq to compute the initial
+	 * utilization.
+	 */
+	if (sd_flag & SD_BALANCE_FORK)
+		goto skip_spare;
+
+	if (this_spare > task_util(p) / 2 &&
+	    imbalance_scale*this_spare > 100*most_spare)
+		return NULL;
+
+	if (most_spare > task_util(p) / 2)
+		return most_spare_sg;
+
+skip_spare:
+	if (!idlest)
+		return NULL;
+
+	if (min_runnable_load > (this_runnable_load + imbalance))
 		return NULL;
+
+	if ((this_runnable_load < (min_runnable_load + imbalance)) &&
+	    (100*this_avg_load < imbalance_scale*min_avg_load))
+		return NULL;
+
 	return idlest;
 }
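/*
 * Worked example of the new selection logic (illustrative numbers only,
 * treating scale_load_down(NICE_0_LOAD) as 1024): with imbalance_pct = 125
 * the absolute margin is imbalance = 1024 * (125 - 100) / 100 = 256 load
 * units, so a remote group only replaces the current choice outright when
 * its runnable load is more than 256 below the best seen so far; within
 * that band the blocked load (avg_load) breaks the tie, and the
 * spare-capacity path is preferred while a CPU has more than task_util(p)/2
 * capacity left.
 */
#include <stdio.h>

int main(void)
{
	unsigned long nice0_load = 1024, imbalance_pct = 125;
	unsigned long imbalance = nice0_load * (imbalance_pct - 100) / 100;

	unsigned long min_runnable_load = 2000;	/* best remote group so far */
	unsigned long runnable_load     = 1800;	/* candidate group */

	printf("imbalance = %lu\n", imbalance);			/* 256 */
	printf("clear winner? %d\n",
	       min_runnable_load > runnable_load + imbalance);	/* 0: 2000 is not > 2056 */
	return 0;
}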

@@ -5589,6 +5858,24 @@ static inline int task_util(struct task_struct *p)
 	return p->se.avg.util_avg;
 }

+/*
+ * cpu_util_wake: Compute cpu utilization with any contributions from
+ * the waking task p removed.
+ */
+static int cpu_util_wake(int cpu, struct task_struct *p)
+{
+	unsigned long util, capacity;
+
+	/* Task has no contribution or is new */
+	if (cpu != task_cpu(p) || !p->se.avg.last_update_time)
+		return cpu_util(cpu);
+
+	capacity = capacity_orig_of(cpu);
+	util = max_t(long, cpu_rq(cpu)->cfs.avg.util_avg - task_util(p), 0);
+
+	return (util >= capacity) ? capacity : util;
+}
+
 /*
  * Disable WAKE_AFFINE in the case where task @p doesn't fit in the
  * capacity of either the waking CPU @cpu or the previous CPU @prev_cpu.
@@ -5607,6 +5894,9 @@ static int wake_cap(struct task_struct *p, int cpu, int prev_cpu)
 	if (max_cap - min_cap < max_cap >> 3)
 		return 0;

+	/* Bring task utilization in sync with prev_cpu */
+	sync_entity_load_avg(&p->se);
+
 	return min_cap * 1024 < task_util(p) * capacity_margin;
 }
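/*
 * Quick check of the wake_cap() test above with made-up numbers: on a
 * big.LITTLE system with min_cap = 446 and capacity_margin = 1280, a task
 * counts as "too big" for the little CPUs once its util_avg exceeds
 * 446 * 1024 / 1280 = 356, i.e. roughly 80% of the smallest capacity.
 */
#include <stdio.h>

int main(void)
{
	unsigned long min_cap = 446, capacity_margin = 1280;
	unsigned long task_util = 400;

	/* 1: 446*1024 = 456704 < 400*1280 = 512000, so WAKE_AFFINE is disabled */
	printf("%d\n", min_cap * 1024 < task_util * capacity_margin);
	return 0;
}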

@@ -6641,6 +6931,10 @@ static void update_blocked_averages(int cpu)

 		if (update_cfs_rq_load_avg(cfs_rq_clock_task(cfs_rq), cfs_rq, true))
 			update_tg_load_avg(cfs_rq, 0);
+
+		/* Propagate pending load changes to the parent */
+		if (cfs_rq->tg->se[cpu])
+			update_load_avg(cfs_rq->tg->se[cpu], 0);
 	}
 	raw_spin_unlock_irqrestore(&rq->lock, flags);
 }
@@ -6845,13 +7139,14 @@ static void update_cpu_capacity(struct sched_domain *sd, int cpu)

 	cpu_rq(cpu)->cpu_capacity = capacity;
 	sdg->sgc->capacity = capacity;
+	sdg->sgc->min_capacity = capacity;
 }

 void update_group_capacity(struct sched_domain *sd, int cpu)
 {
 	struct sched_domain *child = sd->child;
 	struct sched_group *group, *sdg = sd->groups;
-	unsigned long capacity;
+	unsigned long capacity, min_capacity;
 	unsigned long interval;

 	interval = msecs_to_jiffies(sd->balance_interval);
@@ -6864,6 +7159,7 @@ void update_group_capacity(struct sched_domain *sd, int cpu)
 	}

 	capacity = 0;
+	min_capacity = ULONG_MAX;

 	if (child->flags & SD_OVERLAP) {
 		/*
@@ -6888,11 +7184,12 @@ void update_group_capacity(struct sched_domain *sd, int cpu)
 			 */
 			if (unlikely(!rq->sd)) {
 				capacity += capacity_of(cpu);
-				continue;
+			} else {
+				sgc = rq->sd->groups->sgc;
+				capacity += sgc->capacity;
 			}

-			sgc = rq->sd->groups->sgc;
-			capacity += sgc->capacity;
+			min_capacity = min(capacity, min_capacity);
 		}
 	} else {
 		/*
@@ -6902,12 +7199,16 @@ void update_group_capacity(struct sched_domain *sd, int cpu)

 		group = child->groups;
 		do {
-			capacity += group->sgc->capacity;
+			struct sched_group_capacity *sgc = group->sgc;
+
+			capacity += sgc->capacity;
+			min_capacity = min(sgc->min_capacity, min_capacity);
 			group = group->next;
 		} while (group != child->groups);
 	}

 	sdg->sgc->capacity = capacity;
+	sdg->sgc->min_capacity = min_capacity;
 }

 /*
@@ -6930,8 +7231,8 @@ check_cpu_capacity(struct rq *rq, struct sched_domain *sd)
  * cpumask covering 1 cpu of the first group and 3 cpus of the second group.
  * Something like:
  *
- * { 0 1 2 3 } { 4 5 6 7 }
- *     * * * *
+ *	{ 0 1 2 3 } { 4 5 6 7 }
+ *	        *     * * *
  *
  * If we were to balance group-wise we'd place two tasks in the first group and
 * two tasks in the second group. Clearly this is undesired as it will overload
@@ -7002,6 +7303,17 @@ group_is_overloaded(struct lb_env *env, struct sg_lb_stats *sgs)
 	return false;
 }

+/*
+ * group_smaller_cpu_capacity: Returns true if sched_group sg has smaller
+ * per-CPU capacity than sched_group ref.
+ */
+static inline bool
+group_smaller_cpu_capacity(struct sched_group *sg, struct sched_group *ref)
+{
+	return sg->sgc->min_capacity * capacity_margin <
+						ref->sgc->min_capacity * 1024;
+}
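/*
 * Quick arithmetic check (made-up big.LITTLE capacities): with
 * capacity_margin = 1280, a group whose smallest CPU has min_capacity = 446
 * counts as "smaller" than a group of 1024-capacity CPUs, because
 * 446 * 1280 = 570880 < 1024 * 1024 = 1048576; two equal groups do not.
 */
#include <stdio.h>

static unsigned int capacity_margin = 1280;

static int group_smaller(unsigned long sg_min, unsigned long ref_min)
{
	return sg_min * capacity_margin < ref_min * 1024;
}

int main(void)
{
	printf("%d\n", group_smaller(446, 1024));	/* 1 */
	printf("%d\n", group_smaller(1024, 1024));	/* 0: 1310720 >= 1048576 */
	return 0;
}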
+
 static inline enum
 group_type group_classify(struct sched_group *group,
 			  struct sg_lb_stats *sgs)
@@ -7105,6 +7417,20 @@ static bool update_sd_pick_busiest(struct lb_env *env,
 	if (sgs->avg_load <= busiest->avg_load)
 		return false;

+	if (!(env->sd->flags & SD_ASYM_CPUCAPACITY))
+		goto asym_packing;
+
+	/*
+	 * Candidate sg has no more than one task per CPU and
+	 * has higher per-CPU capacity. Migrating tasks to less
+	 * capable CPUs may harm throughput. Maximize throughput,
+	 * power/energy consequences are not considered.
+	 */
+	if (sgs->sum_nr_running <= sgs->group_weight &&
+	    group_smaller_cpu_capacity(sds->local, sg))
+		return false;
+
+asym_packing:
 	/* This is the busiest node in its class. */
 	if (!(env->sd->flags & SD_ASYM_PACKING))
 		return true;
@@ -7113,16 +7439,18 @@ static bool update_sd_pick_busiest(struct lb_env *env,
 	if (env->idle == CPU_NOT_IDLE)
 		return true;
 	/*
-	 * ASYM_PACKING needs to move all the work to the lowest
-	 * numbered CPUs in the group, therefore mark all groups
-	 * higher than ourself as busy.
+	 * ASYM_PACKING needs to move all the work to the highest
+	 * priority CPUs in the group, therefore mark all groups
+	 * of lower priority than ourselves as busy.
 	 */
-	if (sgs->sum_nr_running && env->dst_cpu < group_first_cpu(sg)) {
+	if (sgs->sum_nr_running &&
+	    sched_asym_prefer(env->dst_cpu, sg->asym_prefer_cpu)) {
 		if (!sds->busiest)
 			return true;

-		/* Prefer to move from highest possible cpu's work */
-		if (group_first_cpu(sds->busiest) < group_first_cpu(sg))
+		/* Prefer to move from the lowest priority cpu's work */
+		if (sched_asym_prefer(sds->busiest->asym_prefer_cpu,
+				      sg->asym_prefer_cpu))
 			return true;
 	}

@@ -7274,8 +7602,8 @@ static int check_asym_packing(struct lb_env *env, struct sd_lb_stats *sds)
 	if (!sds->busiest)
 		return 0;

-	busiest_cpu = group_first_cpu(sds->busiest);
-	if (env->dst_cpu > busiest_cpu)
+	busiest_cpu = sds->busiest->asym_prefer_cpu;
+	if (sched_asym_prefer(busiest_cpu, env->dst_cpu))
 		return 0;

 	env->imbalance = DIV_ROUND_CLOSEST(
@@ -7613,10 +7941,11 @@ static int need_active_balance(struct lb_env *env)

 		/*
 		 * ASYM_PACKING needs to force migrate tasks from busy but
-		 * higher numbered CPUs in order to pack all tasks in the
-		 * lowest numbered CPUs.
+		 * lower priority CPUs in order to pack all tasks in the
+		 * highest priority CPUs.
 		 */
-		if ((sd->flags & SD_ASYM_PACKING) && env->src_cpu > env->dst_cpu)
+		if ((sd->flags & SD_ASYM_PACKING) &&
+		    sched_asym_prefer(env->dst_cpu, env->src_cpu))
 			return 1;
 	}

@@ -8465,7 +8794,7 @@ static inline bool nohz_kick_needed(struct rq *rq)
 	unsigned long now = jiffies;
 	struct sched_domain_shared *sds;
 	struct sched_domain *sd;
-	int nr_busy, cpu = rq->cpu;
+	int nr_busy, i, cpu = rq->cpu;
 	bool kick = false;

 	if (unlikely(rq->idle_balance))
@@ -8516,12 +8845,18 @@ static inline bool nohz_kick_needed(struct rq *rq)
 	}

 	sd = rcu_dereference(per_cpu(sd_asym, cpu));
-	if (sd && (cpumask_first_and(nohz.idle_cpus_mask,
-				  sched_domain_span(sd)) < cpu)) {
-		kick = true;
-		goto unlock;
-	}
+	if (sd) {
+		for_each_cpu(i, sched_domain_span(sd)) {
+			if (i == cpu ||
+			    !cpumask_test_cpu(i, nohz.idle_cpus_mask))
+				continue;

+			if (sched_asym_prefer(i, cpu)) {
+				kick = true;
+				goto unlock;
+			}
+		}
+	}
 unlock:
 	rcu_read_unlock();
 	return kick;
@@ -8687,32 +9022,45 @@ static inline bool vruntime_normalized(struct task_struct *p)
 	return false;
 }

-static void detach_task_cfs_rq(struct task_struct *p)
+#ifdef CONFIG_FAIR_GROUP_SCHED
+/*
+ * Propagate the changes of the sched_entity across the tg tree to make them
+ * visible to the root.
+ */
+static void propagate_entity_cfs_rq(struct sched_entity *se)
 {
-	struct sched_entity *se = &p->se;
-	struct cfs_rq *cfs_rq = cfs_rq_of(se);
-	u64 now = cfs_rq_clock_task(cfs_rq);
+	struct cfs_rq *cfs_rq;

-	if (!vruntime_normalized(p)) {
-		/*
-		 * Fix up our vruntime so that the current sleep doesn't
-		 * cause 'unlimited' sleep bonus.
-		 */
-		place_entity(cfs_rq, se, 0);
-		se->vruntime -= cfs_rq->min_vruntime;
+	/* Start to propagate at parent */
+	se = se->parent;
+
+	for_each_sched_entity(se) {
+		cfs_rq = cfs_rq_of(se);
+
+		if (cfs_rq_throttled(cfs_rq))
+			break;
+
+		update_load_avg(se, UPDATE_TG);
 	}
+}
+#else
+static void propagate_entity_cfs_rq(struct sched_entity *se) { }
+#endif
+
+static void detach_entity_cfs_rq(struct sched_entity *se)
+{
+	struct cfs_rq *cfs_rq = cfs_rq_of(se);

 	/* Catch up with the cfs_rq and remove our load when we leave */
-	update_cfs_rq_load_avg(now, cfs_rq, false);
+	update_load_avg(se, 0);
 	detach_entity_load_avg(cfs_rq, se);
 	update_tg_load_avg(cfs_rq, false);
+	propagate_entity_cfs_rq(se);
 }

-static void attach_task_cfs_rq(struct task_struct *p)
+static void attach_entity_cfs_rq(struct sched_entity *se)
 {
-	struct sched_entity *se = &p->se;
 	struct cfs_rq *cfs_rq = cfs_rq_of(se);
-	u64 now = cfs_rq_clock_task(cfs_rq);

 #ifdef CONFIG_FAIR_GROUP_SCHED
 	/*
@@ -8722,10 +9070,36 @@ static void attach_task_cfs_rq(struct task_struct *p)
 	se->depth = se->parent ? se->parent->depth + 1 : 0;
 #endif

-	/* Synchronize task with its cfs_rq */
-	update_cfs_rq_load_avg(now, cfs_rq, false);
+	/* Synchronize entity with its cfs_rq */
+	update_load_avg(se, sched_feat(ATTACH_AGE_LOAD) ? 0 : SKIP_AGE_LOAD);
 	attach_entity_load_avg(cfs_rq, se);
 	update_tg_load_avg(cfs_rq, false);
+	propagate_entity_cfs_rq(se);
+}
+
+static void detach_task_cfs_rq(struct task_struct *p)
+{
+	struct sched_entity *se = &p->se;
+	struct cfs_rq *cfs_rq = cfs_rq_of(se);
+
+	if (!vruntime_normalized(p)) {
+		/*
+		 * Fix up our vruntime so that the current sleep doesn't
+		 * cause 'unlimited' sleep bonus.
+		 */
+		place_entity(cfs_rq, se, 0);
+		se->vruntime -= cfs_rq->min_vruntime;
+	}
+
+	detach_entity_cfs_rq(se);
+}
+
+static void attach_task_cfs_rq(struct task_struct *p)
+{
+	struct sched_entity *se = &p->se;
+	struct cfs_rq *cfs_rq = cfs_rq_of(se);
+
+	attach_entity_cfs_rq(se);

 	if (!vruntime_normalized(p))
 		se->vruntime += cfs_rq->min_vruntime;
@@ -8779,6 +9153,9 @@ void init_cfs_rq(struct cfs_rq *cfs_rq)
 	cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime;
 #endif
 #ifdef CONFIG_SMP
+#ifdef CONFIG_FAIR_GROUP_SCHED
+	cfs_rq->propagate_avg = 0;
+#endif
 	atomic_long_set(&cfs_rq->removed_load_avg, 0);
 	atomic_long_set(&cfs_rq->removed_util_avg, 0);
 #endif
@@ -8887,7 +9264,7 @@ void online_fair_sched_group(struct task_group *tg)
 		se = tg->se[i];

 		raw_spin_lock_irq(&rq->lock);
-		post_init_entity_util_avg(se);
+		attach_entity_cfs_rq(se);
 		sync_throttle(tg, i);
 		raw_spin_unlock_irq(&rq->lock);
 	}