Browse Source

Merge git://git.kernel.org/pub/scm/linux/kernel/git/mingo/linux-2.6-sched

* git://git.kernel.org/pub/scm/linux/kernel/git/mingo/linux-2.6-sched: (96 commits)
  sched: keep total / count stats in addition to the max for
  sched, futex: detach sched.h and futex.h
  sched: fix: don't take a mutex from interrupt context
  sched: print backtrace of running tasks too
  printk: use ktime_get()
  softlockup: fix signedness
  sched: latencytop support
  sched: fix goto retry in pick_next_task_rt()
  timers: don't #error on higher HZ values
  sched: monitor clock underflows in /proc/sched_debug
  sched: fix rq->clock warps on frequency changes
  sched: fix, always create kernel threads with normal priority
  debug: clean up kernel/profile.c
  sched: remove the !PREEMPT_BKL code
  sched: make PREEMPT_BKL the default
  debug: track and print last unloaded module in the oops trace
  debug: show being-loaded/being-unloaded indicator for modules
  sched: rt-watchdog: fix .rlim_max = RLIM_INFINITY
  sched: rt-group: reduce rescheduling
  hrtimer: unlock hrtimer_wakeup
  ...
Linus Torvalds 17 years ago
parent
commit
0008bf5440
81 changed files with 6262 additions and 1776 deletions
  1. 199 11
      Documentation/RCU/RTFP.txt
  2. 17 2
      Documentation/RCU/rcu.txt
  3. 5 6
      Documentation/RCU/torture.txt
  4. 6 5
      Documentation/cpu-hotplug.txt
  5. 0 11
      arch/arm/kernel/time.c
  6. 0 4
      arch/ia64/kernel/setup.c
  7. 0 27
      arch/ia64/kernel/time.c
  8. 0 11
      arch/ia64/sn/kernel/setup.c
  9. 5 5
      arch/mips/kernel/mips-mt-fpaff.c
  10. 4 4
      arch/powerpc/platforms/pseries/hotplug-cpu.c
  11. 4 4
      arch/powerpc/platforms/pseries/rtasd.c
  12. 4 4
      arch/x86/kernel/cpu/mtrr/main.c
  13. 3 3
      arch/x86/kernel/entry_64.S
  14. 8 8
      arch/x86/kernel/microcode.c
  15. 3 0
      arch/x86/kernel/signal_32.c
  16. 3 0
      arch/x86/kernel/signal_64.c
  17. 27 0
      arch/x86/kernel/stacktrace.c
  18. 4 4
      drivers/lguest/x86/core.c
  19. 2 2
      drivers/s390/char/sclp_config.c
  20. 0 1
      fs/Kconfig
  21. 78 0
      fs/proc/base.c
  22. 3 2
      include/asm-generic/resource.h
  23. 2 0
      include/asm-x86/thread_info_32.h
  24. 5 0
      include/asm-x86/thread_info_64.h
  25. 13 4
      include/linux/cpu.h
  26. 5 0
      include/linux/debug_locks.h
  27. 5 1
      include/linux/futex.h
  28. 1 5
      include/linux/hardirq.h
  29. 11 3
      include/linux/hrtimer.h
  30. 5 2
      include/linux/init_task.h
  31. 1 0
      include/linux/interrupt.h
  32. 6 0
      include/linux/jiffies.h
  33. 2 2
      include/linux/kernel.h
  34. 44 0
      include/linux/latencytop.h
  35. 1 3
      include/linux/notifier.h
  36. 164 0
      include/linux/rcuclassic.h
  37. 53 120
      include/linux/rcupdate.h
  38. 86 0
      include/linux/rcupreempt.h
  39. 99 0
      include/linux/rcupreempt_trace.h
  40. 77 6
      include/linux/sched.h
  41. 1 13
      include/linux/smp_lock.h
  42. 3 0
      include/linux/stacktrace.h
  43. 4 1
      include/linux/topology.h
  44. 28 0
      init/Kconfig
  45. 1 0
      init/main.c
  46. 2 0
      kernel/Kconfig.hz
  47. 6 7
      kernel/Kconfig.preempt
  48. 6 0
      kernel/Makefile
  49. 115 49
      kernel/cpu.c
  50. 7 7
      kernel/cpuset.c
  51. 11 0
      kernel/fork.c
  52. 142 114
      kernel/hrtimer.c
  53. 11 1
      kernel/kthread.c
  54. 239 0
      kernel/latencytop.c
  55. 11 1
      kernel/lockdep.c
  56. 21 6
      kernel/module.c
  57. 30 0
      kernel/posix-cpu-timers.c
  58. 41 16
      kernel/printk.c
  59. 49 50
      kernel/profile.c
  60. 575 0
      kernel/rcuclassic.c
  61. 35 541
      kernel/rcupdate.c
  62. 953 0
      kernel/rcupreempt.c
  63. 330 0
      kernel/rcupreempt_trace.c
  64. 3 3
      kernel/rcutorture.c
  65. 1014 370
      kernel/sched.c
  66. 5 0
      kernel/sched_debug.c
  67. 349 42
      kernel/sched_fair.c
  68. 41 1
      kernel/sched_idletask.c
  69. 1023 89
      kernel/sched_rt.c
  70. 105 11
      kernel/softlockup.c
  71. 2 2
      kernel/stop_machine.c
  72. 74 3
      kernel/sysctl.c
  73. 5 8
      kernel/time/tick-sched.c
  74. 2 1
      kernel/timer.c
  75. 20 27
      kernel/user.c
  76. 15 20
      kernel/workqueue.c
  77. 14 0
      lib/Kconfig.debug
  78. 0 123
      lib/kernel_lock.c
  79. 1 1
      mm/oom_kill.c
  80. 11 7
      mm/slab.c
  81. 2 2
      net/core/flow.c

+ 199 - 11
Documentation/RCU/RTFP.txt

@@ -9,8 +9,8 @@ The first thing resembling RCU was published in 1980, when Kung and Lehman
 [Kung80] recommended use of a garbage collector to defer destruction
 [Kung80] recommended use of a garbage collector to defer destruction
 of nodes in a parallel binary search tree in order to simplify its
 of nodes in a parallel binary search tree in order to simplify its
 implementation.  This works well in environments that have garbage
 implementation.  This works well in environments that have garbage
-collectors, but current production garbage collectors incur significant
-read-side overhead.
+collectors, but most production garbage collectors incur significant
+overhead.
 
 
 In 1982, Manber and Ladner [Manber82,Manber84] recommended deferring
 In 1982, Manber and Ladner [Manber82,Manber84] recommended deferring
 destruction until all threads running at that time have terminated, again
 destruction until all threads running at that time have terminated, again
@@ -99,16 +99,25 @@ locking, reduces contention, reduces memory latency for readers, and
 parallelizes pipeline stalls and memory latency for writers.  However,
 parallelizes pipeline stalls and memory latency for writers.  However,
 these techniques still impose significant read-side overhead in the
 these techniques still impose significant read-side overhead in the
 form of memory barriers.  Researchers at Sun worked along similar lines
 form of memory barriers.  Researchers at Sun worked along similar lines
-in the same timeframe [HerlihyLM02,HerlihyLMS03].  These techniques
-can be thought of as inside-out reference counts, where the count is
-represented by the number of hazard pointers referencing a given data
-structure (rather than the more conventional counter field within the
-data structure itself).
+in the same timeframe [HerlihyLM02].  These techniques can be thought
+of as inside-out reference counts, where the count is represented by the
+number of hazard pointers referencing a given data structure (rather than
+the more conventional counter field within the data structure itself).
+
+By the same token, RCU can be thought of as a "bulk reference count",
+where some form of reference counter covers all references by a given CPU
+or thread during a set timeframe.  This timeframe is related to, but
+not necessarily exactly the same as, an RCU grace period.  In classic
+RCU, the reference counter is the per-CPU bit in the "bitmask" field,
+and each such bit covers all references that might have been made by
+the corresponding CPU during the prior grace period.  Of course, RCU
+can be thought of in other terms as well.
 
 
 In 2003, the K42 group described how RCU could be used to create
 In 2003, the K42 group described how RCU could be used to create
-hot-pluggable implementations of operating-system functions.  Later that
-year saw a paper describing an RCU implementation of System V IPC
-[Arcangeli03], and an introduction to RCU in Linux Journal [McKenney03a].
+hot-pluggable implementations of operating-system functions [Appavoo03a].
+Later that year saw a paper describing an RCU implementation of System
+V IPC [Arcangeli03], and an introduction to RCU in Linux Journal
+[McKenney03a].
 
 
 2004 has seen a Linux-Journal article on use of RCU in dcache
 2004 has seen a Linux-Journal article on use of RCU in dcache
 [McKenney04a], a performance comparison of locking to RCU on several
 [McKenney04a], a performance comparison of locking to RCU on several
@@ -117,10 +126,19 @@ number of operating-system kernels [PaulEdwardMcKenneyPhD], a paper
 describing how to make RCU safe for soft-realtime applications [Sarma04c],
 describing how to make RCU safe for soft-realtime applications [Sarma04c],
 and a paper describing SELinux performance with RCU [JamesMorris04b].
 and a paper describing SELinux performance with RCU [JamesMorris04b].
 
 
-2005 has seen further adaptation of RCU to realtime use, permitting
+2005 brought further adaptation of RCU to realtime use, permitting
 preemption of RCU realtime critical sections [PaulMcKenney05a,
 preemption of RCU realtime critical sections [PaulMcKenney05a,
 PaulMcKenney05b].
 PaulMcKenney05b].
 
 
+2006 saw the first best-paper award for an RCU paper [ThomasEHart2006a],
+as well as further work on efficient implementations of preemptible
+RCU [PaulEMcKenney2006b], but priority-boosting of RCU read-side critical
+sections proved elusive.  An RCU implementation permitting general
+blocking in read-side critical sections appeared [PaulEMcKenney2006c],
+and Robert Olsson described an RCU-protected trie-hash combination
+[RobertOlsson2006a].
+
+
 Bibtex Entries
 Bibtex Entries
 
 
 @article{Kung80
 @article{Kung80
@@ -203,6 +221,41 @@ Bibtex Entries
 ,Address="New Orleans, LA"
 ,Address="New Orleans, LA"
 }
 }
 
 
+@conference{Pu95a,
+Author = "Calton Pu and Tito Autrey and Andrew Black and Charles Consel and
+Crispin Cowan and Jon Inouye and Lakshmi Kethana and Jonathan Walpole and
+Ke Zhang",
+Title = "Optimistic Incremental Specialization: Streamlining a Commercial
+Operating System",
+Booktitle = "15\textsuperscript{th} ACM Symposium on
+Operating Systems Principles (SOSP'95)",
+address = "Copper Mountain, CO",
+month="December",
+year="1995",
+pages="314-321",
+annotation="
+	Uses a replugger, but with a flag to signal when people are
+	using the resource at hand.  Only one reader at a time.
+"
+}
+
+@conference{Cowan96a,
+Author = "Crispin Cowan and Tito Autrey and Charles Krasic and
+Calton Pu and Jonathan Walpole",
+Title = "Fast Concurrent Dynamic Linking for an Adaptive Operating System",
+Booktitle = "International Conference on Configurable Distributed Systems
+(ICCDS'96)",
+address = "Annapolis, MD",
+month="May",
+year="1996",
+pages="108",
+isbn="0-8186-7395-8",
+annotation="
+	Uses a replugger, but with a counter to signal when people are
+	using the resource at hand.  Allows multiple readers.
+"
+}
+
 @techreport{Slingwine95
 @techreport{Slingwine95
 ,author="John D. Slingwine and Paul E. McKenney"
 ,author="John D. Slingwine and Paul E. McKenney"
 ,title="Apparatus and Method for Achieving Reduced Overhead Mutual
 ,title="Apparatus and Method for Achieving Reduced Overhead Mutual
@@ -312,6 +365,49 @@ Andrea Arcangeli and Andi Kleen and Orran Krieger and Rusty Russell"
 [Viewed June 23, 2004]"
 [Viewed June 23, 2004]"
 }
 }
 
 
+@conference{Michael02a
+,author="Maged M. Michael"
+,title="Safe Memory Reclamation for Dynamic Lock-Free Objects Using Atomic
+Reads and Writes"
+,Year="2002"
+,Month="August"
+,booktitle="{Proceedings of the 21\textsuperscript{st} Annual ACM
+Symposium on Principles of Distributed Computing}"
+,pages="21-30"
+,annotation="
+	Each thread keeps an array of pointers to items that it is
+	currently referencing.	Sort of an inside-out garbage collection
+	mechanism, but one that requires the accessing code to explicitly
+	state its needs.  Also requires read-side memory barriers on
+	most architectures.
+"
+}
+
+@conference{Michael02b
+,author="Maged M. Michael"
+,title="High Performance Dynamic Lock-Free Hash Tables and List-Based Sets"
+,Year="2002"
+,Month="August"
+,booktitle="{Proceedings of the 14\textsuperscript{th} Annual ACM
+Symposium on Parallel
+Algorithms and Architecture}"
+,pages="73-82"
+,annotation="
+	Like the title says...
+"
+}
+
+@InProceedings{HerlihyLM02
+,author={Maurice Herlihy and Victor Luchangco and Mark Moir}
+,title="The Repeat Offender Problem: A Mechanism for Supporting Dynamic-Sized,
+Lock-Free Data Structures"
+,booktitle={Proceedings of 16\textsuperscript{th} International
+Symposium on Distributed Computing}
+,year=2002
+,month="October"
+,pages="339-353"
+}
+
 @article{Appavoo03a
 @article{Appavoo03a
 ,author="J. Appavoo and K. Hui and C. A. N. Soules and R. W. Wisniewski and
 ,author="J. Appavoo and K. Hui and C. A. N. Soules and R. W. Wisniewski and
 D. M. {Da Silva} and O. Krieger and M. A. Auslander and D. J. Edelsohn and
 D. M. {Da Silva} and O. Krieger and M. A. Auslander and D. J. Edelsohn and
@@ -447,3 +543,95 @@ Oregon Health and Sciences University"
 	Realtime turns into making RCU yet more realtime friendly.
 	Realtime turns into making RCU yet more realtime friendly.
 "
 "
 }
 }
+
+@conference{ThomasEHart2006a
+,Author="Thomas E. Hart and Paul E. McKenney and Angela Demke Brown"
+,Title="Making Lockless Synchronization Fast: Performance Implications
+of Memory Reclamation"
+,Booktitle="20\textsuperscript{th} {IEEE} International Parallel and
+Distributed Processing Symposium"
+,month="April"
+,year="2006"
+,day="25-29"
+,address="Rhodes, Greece"
+,annotation={
+	Compares QSBR (AKA "classic RCU"), HPBR, EBR, and lock-free
+	reference counting.
+}
+}
+
+@Conference{PaulEMcKenney2006b
+,Author="Paul E. McKenney and Dipankar Sarma and Ingo Molnar and
+Suparna Bhattacharya"
+,Title="Extending RCU for Realtime and Embedded Workloads"
+,Booktitle="{Ottawa Linux Symposium}"
+,Month="July"
+,Year="2006"
+,pages="v2 123-138"
+,note="Available:
+\url{http://www.linuxsymposium.org/2006/view_abstract.php?content_key=184}
+\url{http://www.rdrop.com/users/paulmck/RCU/OLSrtRCU.2006.08.11a.pdf}
+[Viewed January 1, 2007]"
+,annotation="
+	Described how to improve the -rt implementation of realtime RCU.
+"
+}
+
+@unpublished{PaulEMcKenney2006c
+,Author="Paul E. McKenney"
+,Title="Sleepable {RCU}"
+,month="October"
+,day="9"
+,year="2006"
+,note="Available:
+\url{http://lwn.net/Articles/202847/}
+Revised:
+\url{http://www.rdrop.com/users/paulmck/RCU/srcu.2007.01.14a.pdf}
+[Viewed August 21, 2006]"
+,annotation="
+	LWN article introducing SRCU.
+"
+}
+
+@unpublished{RobertOlsson2006a
+,Author="Robert Olsson and Stefan Nilsson"
+,Title="{TRASH}: A dynamic {LC}-trie and hash data structure"
+,month="August"
+,day="18"
+,year="2006"
+,note="Available:
+\url{http://www.nada.kth.se/~snilsson/public/papers/trash/trash.pdf}
+[Viewed February 24, 2007]"
+,annotation="
+	RCU-protected dynamic trie-hash combination.
+"
+}
+
+@unpublished{ThomasEHart2007a
+,Author="Thomas E. Hart and Paul E. McKenney and Angela Demke Brown and Jonathan Walpole"
+,Title="Performance of memory reclamation for lockless synchronization"
+,journal="J. Parallel Distrib. Comput."
+,year="2007"
+,note="To appear in J. Parallel Distrib. Comput.
+       \url{doi=10.1016/j.jpdc.2007.04.010}"
+,annotation={
+	Compares QSBR (AKA "classic RCU"), HPBR, EBR, and lock-free
+	reference counting.  Journal version of ThomasEHart2006a.
+}
+}
+
+@unpublished{PaulEMcKenney2007QRCUspin
+,Author="Paul E. McKenney"
+,Title="Using Promela and Spin to verify parallel algorithms"
+,month="August"
+,day="1"
+,year="2007"
+,note="Available:
+\url{http://lwn.net/Articles/243851/}
+[Viewed September 8, 2007]"
+,annotation="
+	LWN article describing Promela and spin, and also using Oleg
+	Nesterov's QRCU as an example (with Paul McKenney's fastpath).
+"
+}
+

+ 17 - 2
Documentation/RCU/rcu.txt

@@ -36,6 +36,14 @@ o	How can the updater tell when a grace period has completed
 	executed in user mode, or executed in the idle loop, we can
 	executed in user mode, or executed in the idle loop, we can
 	safely free up that item.
 	safely free up that item.
 
 
+	Preemptible variants of RCU (CONFIG_PREEMPT_RCU) get the
+	same effect, but require that the readers manipulate CPU-local
+	counters.  These counters allow limited types of blocking
+	within RCU read-side critical sections.  SRCU also uses
+	CPU-local counters, and permits general blocking within
+	RCU read-side critical sections.  These two variants of
+	RCU detect grace periods by sampling these counters.
+
 o	If I am running on a uniprocessor kernel, which can only do one
 o	If I am running on a uniprocessor kernel, which can only do one
 	thing at a time, why should I wait for a grace period?
 	thing at a time, why should I wait for a grace period?
 
 
@@ -46,7 +54,10 @@ o	How can I see where RCU is currently used in the Linux kernel?
 	Search for "rcu_read_lock", "rcu_read_unlock", "call_rcu",
 	Search for "rcu_read_lock", "rcu_read_unlock", "call_rcu",
 	"rcu_read_lock_bh", "rcu_read_unlock_bh", "call_rcu_bh",
 	"rcu_read_lock_bh", "rcu_read_unlock_bh", "call_rcu_bh",
 	"srcu_read_lock", "srcu_read_unlock", "synchronize_rcu",
 	"srcu_read_lock", "srcu_read_unlock", "synchronize_rcu",
-	"synchronize_net", and "synchronize_srcu".
+	"synchronize_net", "synchronize_srcu", and the other RCU
+	primitives.  Or grab one of the cscope databases from:
+
+	http://www.rdrop.com/users/paulmck/RCU/linuxusage/rculocktab.html
 
 
 o	What guidelines should I follow when writing code that uses RCU?
 o	What guidelines should I follow when writing code that uses RCU?
 
 
@@ -67,7 +78,11 @@ o	I hear that RCU is patented?  What is with that?
 
 
 o	I hear that RCU needs work in order to support realtime kernels?
 o	I hear that RCU needs work in order to support realtime kernels?
 
 
-	Yes, work in progress.
+	This work is largely completed.  Realtime-friendly RCU can be
+	enabled via the CONFIG_PREEMPT_RCU kernel configuration parameter.
+	However, work is in progress for enabling priority boosting of
+	preempted RCU read-side critical sections.  This is needed if you
+	have CPU-bound realtime threads.
 
 
 o	Where can I find more information on RCU?
 o	Where can I find more information on RCU?
 
 

+ 5 - 6
Documentation/RCU/torture.txt

@@ -46,12 +46,13 @@ stat_interval	The number of seconds between output of torture
 
 
 shuffle_interval
 shuffle_interval
 		The number of seconds to keep the test threads affinitied
 		The number of seconds to keep the test threads affinitied
-		to a particular subset of the CPUs.  Used in conjunction
-		with test_no_idle_hz.
+		to a particular subset of the CPUs, defaults to 5 seconds.
+		Used in conjunction with test_no_idle_hz.
 
 
 test_no_idle_hz	Whether or not to test the ability of RCU to operate in
 test_no_idle_hz	Whether or not to test the ability of RCU to operate in
 		a kernel that disables the scheduling-clock interrupt to
 		a kernel that disables the scheduling-clock interrupt to
 		idle CPUs.  Boolean parameter, "1" to test, "0" otherwise.
 		idle CPUs.  Boolean parameter, "1" to test, "0" otherwise.
+		Defaults to omitting this test.
 
 
 torture_type	The type of RCU to test: "rcu" for the rcu_read_lock() API,
 torture_type	The type of RCU to test: "rcu" for the rcu_read_lock() API,
 		"rcu_sync" for rcu_read_lock() with synchronous reclamation,
 		"rcu_sync" for rcu_read_lock() with synchronous reclamation,
@@ -82,8 +83,6 @@ be evident.  ;-)
 
 
 The entries are as follows:
 The entries are as follows:
 
 
-o	"ggp": The number of counter flips (or batches) since boot.
-
 o	"rtc": The hexadecimal address of the structure currently visible
 o	"rtc": The hexadecimal address of the structure currently visible
 	to readers.
 	to readers.
 
 
@@ -117,8 +116,8 @@ o	"Reader Pipe": Histogram of "ages" of structures seen by readers.
 o	"Reader Batch": Another histogram of "ages" of structures seen
 o	"Reader Batch": Another histogram of "ages" of structures seen
 	by readers, but in terms of counter flips (or batches) rather
 	by readers, but in terms of counter flips (or batches) rather
 	than in terms of grace periods.  The legal number of non-zero
 	than in terms of grace periods.  The legal number of non-zero
-	entries is again two.  The reason for this separate view is
-	that it is easier to get the third entry to show up in the
+	entries is again two.  The reason for this separate view is that
+	it is sometimes easier to get the third entry to show up in the
 	"Reader Batch" list than in the "Reader Pipe" list.
 	"Reader Batch" list than in the "Reader Pipe" list.
 
 
 o	"Free-Block Circulation": Shows the number of torture structures
 o	"Free-Block Circulation": Shows the number of torture structures

+ 6 - 5
Documentation/cpu-hotplug.txt

@@ -109,12 +109,13 @@ Never use anything other than cpumask_t to represent bitmap of CPUs.
 	for_each_cpu_mask(x,mask) - Iterate over some random collection of cpu mask.
 	for_each_cpu_mask(x,mask) - Iterate over some random collection of cpu mask.
 
 
 	#include <linux/cpu.h>
 	#include <linux/cpu.h>
-	lock_cpu_hotplug() and unlock_cpu_hotplug():
+	get_online_cpus() and put_online_cpus():
 
 
-The above calls are used to inhibit cpu hotplug operations. While holding the
-cpucontrol mutex, cpu_online_map will not change. If you merely need to avoid
-cpus going away, you could also use preempt_disable() and preempt_enable()
-for those sections. Just remember the critical section cannot call any
+The above calls are used to inhibit cpu hotplug operations. While the
+cpu_hotplug.refcount is non-zero, the cpu_online_map will not change.
+If you merely need to avoid cpus going away, you could also use
+preempt_disable() and preempt_enable() for those sections.
+Just remember the critical section cannot call any
 function that can sleep or schedule this process away. The preempt_disable()
 function that can sleep or schedule this process away. The preempt_disable()
 will work as long as stop_machine_run() is used to take a cpu down.
 will work as long as stop_machine_run() is used to take a cpu down.
 
 

+ 0 - 11
arch/arm/kernel/time.c

@@ -79,17 +79,6 @@ static unsigned long dummy_gettimeoffset(void)
 }
 }
 #endif
 #endif
 
 
-/*
- * An implementation of printk_clock() independent from
- * sched_clock().  This avoids non-bootable kernels when
- * printk_clock is enabled.
- */
-unsigned long long printk_clock(void)
-{
-	return (unsigned long long)(jiffies - INITIAL_JIFFIES) *
-			(1000000000 / HZ);
-}
-
 static unsigned long next_rtc_update;
 static unsigned long next_rtc_update;
 
 
 /*
 /*

+ 0 - 4
arch/ia64/kernel/setup.c

@@ -71,8 +71,6 @@ unsigned long __per_cpu_offset[NR_CPUS];
 EXPORT_SYMBOL(__per_cpu_offset);
 EXPORT_SYMBOL(__per_cpu_offset);
 #endif
 #endif
 
 
-extern void ia64_setup_printk_clock(void);
-
 DEFINE_PER_CPU(struct cpuinfo_ia64, cpu_info);
 DEFINE_PER_CPU(struct cpuinfo_ia64, cpu_info);
 DEFINE_PER_CPU(unsigned long, local_per_cpu_offset);
 DEFINE_PER_CPU(unsigned long, local_per_cpu_offset);
 unsigned long ia64_cycles_per_usec;
 unsigned long ia64_cycles_per_usec;
@@ -507,8 +505,6 @@ setup_arch (char **cmdline_p)
 	/* process SAL system table: */
 	/* process SAL system table: */
 	ia64_sal_init(__va(efi.sal_systab));
 	ia64_sal_init(__va(efi.sal_systab));
 
 
-	ia64_setup_printk_clock();
-
 #ifdef CONFIG_SMP
 #ifdef CONFIG_SMP
 	cpu_physical_id(0) = hard_smp_processor_id();
 	cpu_physical_id(0) = hard_smp_processor_id();
 #endif
 #endif

+ 0 - 27
arch/ia64/kernel/time.c

@@ -344,33 +344,6 @@ udelay (unsigned long usecs)
 }
 }
 EXPORT_SYMBOL(udelay);
 EXPORT_SYMBOL(udelay);
 
 
-static unsigned long long ia64_itc_printk_clock(void)
-{
-	if (ia64_get_kr(IA64_KR_PER_CPU_DATA))
-		return sched_clock();
-	return 0;
-}
-
-static unsigned long long ia64_default_printk_clock(void)
-{
-	return (unsigned long long)(jiffies_64 - INITIAL_JIFFIES) *
-		(1000000000/HZ);
-}
-
-unsigned long long (*ia64_printk_clock)(void) = &ia64_default_printk_clock;
-
-unsigned long long printk_clock(void)
-{
-	return ia64_printk_clock();
-}
-
-void __init
-ia64_setup_printk_clock(void)
-{
-	if (!(sal_platform_features & IA64_SAL_PLATFORM_FEATURE_ITC_DRIFT))
-		ia64_printk_clock = ia64_itc_printk_clock;
-}
-
 /* IA64 doesn't cache the timezone */
 /* IA64 doesn't cache the timezone */
 void update_vsyscall_tz(void)
 void update_vsyscall_tz(void)
 {
 {

+ 0 - 11
arch/ia64/sn/kernel/setup.c

@@ -64,7 +64,6 @@ extern void sn_timer_init(void);
 extern unsigned long last_time_offset;
 extern unsigned long last_time_offset;
 extern void (*ia64_mark_idle) (int);
 extern void (*ia64_mark_idle) (int);
 extern void snidle(int);
 extern void snidle(int);
-extern unsigned long long (*ia64_printk_clock)(void);
 
 
 unsigned long sn_rtc_cycles_per_second;
 unsigned long sn_rtc_cycles_per_second;
 EXPORT_SYMBOL(sn_rtc_cycles_per_second);
 EXPORT_SYMBOL(sn_rtc_cycles_per_second);
@@ -360,14 +359,6 @@ sn_scan_pcdp(void)
 
 
 static unsigned long sn2_rtc_initial;
 static unsigned long sn2_rtc_initial;
 
 
-static unsigned long long ia64_sn2_printk_clock(void)
-{
-	unsigned long rtc_now = rtc_time();
-
-	return (rtc_now - sn2_rtc_initial) *
-		(1000000000 / sn_rtc_cycles_per_second);
-}
-
 /**
 /**
  * sn_setup - SN platform setup routine
  * sn_setup - SN platform setup routine
  * @cmdline_p: kernel command line
  * @cmdline_p: kernel command line
@@ -468,8 +459,6 @@ void __init sn_setup(char **cmdline_p)
 
 
 	platform_intr_list[ACPI_INTERRUPT_CPEI] = IA64_CPE_VECTOR;
 	platform_intr_list[ACPI_INTERRUPT_CPEI] = IA64_CPE_VECTOR;
 
 
-	ia64_printk_clock = ia64_sn2_printk_clock;
-
 	printk("SGI SAL version %x.%02x\n", version >> 8, version & 0x00FF);
 	printk("SGI SAL version %x.%02x\n", version >> 8, version & 0x00FF);
 
 
 	/*
 	/*

+ 5 - 5
arch/mips/kernel/mips-mt-fpaff.c

@@ -58,13 +58,13 @@ asmlinkage long mipsmt_sys_sched_setaffinity(pid_t pid, unsigned int len,
 	if (copy_from_user(&new_mask, user_mask_ptr, sizeof(new_mask)))
 	if (copy_from_user(&new_mask, user_mask_ptr, sizeof(new_mask)))
 		return -EFAULT;
 		return -EFAULT;
 
 
-	lock_cpu_hotplug();
+	get_online_cpus();
 	read_lock(&tasklist_lock);
 	read_lock(&tasklist_lock);
 
 
 	p = find_process_by_pid(pid);
 	p = find_process_by_pid(pid);
 	if (!p) {
 	if (!p) {
 		read_unlock(&tasklist_lock);
 		read_unlock(&tasklist_lock);
-		unlock_cpu_hotplug();
+		put_online_cpus();
 		return -ESRCH;
 		return -ESRCH;
 	}
 	}
 
 
@@ -106,7 +106,7 @@ asmlinkage long mipsmt_sys_sched_setaffinity(pid_t pid, unsigned int len,
 
 
 out_unlock:
 out_unlock:
 	put_task_struct(p);
 	put_task_struct(p);
-	unlock_cpu_hotplug();
+	put_online_cpus();
 	return retval;
 	return retval;
 }
 }
 
 
@@ -125,7 +125,7 @@ asmlinkage long mipsmt_sys_sched_getaffinity(pid_t pid, unsigned int len,
 	if (len < real_len)
 	if (len < real_len)
 		return -EINVAL;
 		return -EINVAL;
 
 
-	lock_cpu_hotplug();
+	get_online_cpus();
 	read_lock(&tasklist_lock);
 	read_lock(&tasklist_lock);
 
 
 	retval = -ESRCH;
 	retval = -ESRCH;
@@ -140,7 +140,7 @@ asmlinkage long mipsmt_sys_sched_getaffinity(pid_t pid, unsigned int len,
 
 
 out_unlock:
 out_unlock:
 	read_unlock(&tasklist_lock);
 	read_unlock(&tasklist_lock);
-	unlock_cpu_hotplug();
+	put_online_cpus();
 	if (retval)
 	if (retval)
 		return retval;
 		return retval;
 	if (copy_to_user(user_mask_ptr, &mask, real_len))
 	if (copy_to_user(user_mask_ptr, &mask, real_len))

+ 4 - 4
arch/powerpc/platforms/pseries/hotplug-cpu.c

@@ -153,7 +153,7 @@ static int pseries_add_processor(struct device_node *np)
 	for (i = 0; i < nthreads; i++)
 	for (i = 0; i < nthreads; i++)
 		cpu_set(i, tmp);
 		cpu_set(i, tmp);
 
 
-	lock_cpu_hotplug();
+	cpu_maps_update_begin();
 
 
 	BUG_ON(!cpus_subset(cpu_present_map, cpu_possible_map));
 	BUG_ON(!cpus_subset(cpu_present_map, cpu_possible_map));
 
 
@@ -190,7 +190,7 @@ static int pseries_add_processor(struct device_node *np)
 	}
 	}
 	err = 0;
 	err = 0;
 out_unlock:
 out_unlock:
-	unlock_cpu_hotplug();
+	cpu_maps_update_done();
 	return err;
 	return err;
 }
 }
 
 
@@ -211,7 +211,7 @@ static void pseries_remove_processor(struct device_node *np)
 
 
 	nthreads = len / sizeof(u32);
 	nthreads = len / sizeof(u32);
 
 
-	lock_cpu_hotplug();
+	cpu_maps_update_begin();
 	for (i = 0; i < nthreads; i++) {
 	for (i = 0; i < nthreads; i++) {
 		for_each_present_cpu(cpu) {
 		for_each_present_cpu(cpu) {
 			if (get_hard_smp_processor_id(cpu) != intserv[i])
 			if (get_hard_smp_processor_id(cpu) != intserv[i])
@@ -225,7 +225,7 @@ static void pseries_remove_processor(struct device_node *np)
 			printk(KERN_WARNING "Could not find cpu to remove "
 			printk(KERN_WARNING "Could not find cpu to remove "
 			       "with physical id 0x%x\n", intserv[i]);
 			       "with physical id 0x%x\n", intserv[i]);
 	}
 	}
-	unlock_cpu_hotplug();
+	cpu_maps_update_done();
 }
 }
 
 
 static int pseries_smp_notifier(struct notifier_block *nb,
 static int pseries_smp_notifier(struct notifier_block *nb,

+ 4 - 4
arch/powerpc/platforms/pseries/rtasd.c

@@ -382,7 +382,7 @@ static void do_event_scan_all_cpus(long delay)
 {
 {
 	int cpu;
 	int cpu;
 
 
-	lock_cpu_hotplug();
+	get_online_cpus();
 	cpu = first_cpu(cpu_online_map);
 	cpu = first_cpu(cpu_online_map);
 	for (;;) {
 	for (;;) {
 		set_cpus_allowed(current, cpumask_of_cpu(cpu));
 		set_cpus_allowed(current, cpumask_of_cpu(cpu));
@@ -390,15 +390,15 @@ static void do_event_scan_all_cpus(long delay)
 		set_cpus_allowed(current, CPU_MASK_ALL);
 		set_cpus_allowed(current, CPU_MASK_ALL);
 
 
 		/* Drop hotplug lock, and sleep for the specified delay */
 		/* Drop hotplug lock, and sleep for the specified delay */
-		unlock_cpu_hotplug();
+		put_online_cpus();
 		msleep_interruptible(delay);
 		msleep_interruptible(delay);
-		lock_cpu_hotplug();
+		get_online_cpus();
 
 
 		cpu = next_cpu(cpu, cpu_online_map);
 		cpu = next_cpu(cpu, cpu_online_map);
 		if (cpu == NR_CPUS)
 		if (cpu == NR_CPUS)
 			break;
 			break;
 	}
 	}
-	unlock_cpu_hotplug();
+	put_online_cpus();
 }
 }
 
 
 static int rtasd(void *unused)
 static int rtasd(void *unused)

+ 4 - 4
arch/x86/kernel/cpu/mtrr/main.c

@@ -349,7 +349,7 @@ int mtrr_add_page(unsigned long base, unsigned long size,
 	replace = -1;
 	replace = -1;
 
 
 	/* No CPU hotplug when we change MTRR entries */
 	/* No CPU hotplug when we change MTRR entries */
-	lock_cpu_hotplug();
+	get_online_cpus();
 	/*  Search for existing MTRR  */
 	/*  Search for existing MTRR  */
 	mutex_lock(&mtrr_mutex);
 	mutex_lock(&mtrr_mutex);
 	for (i = 0; i < num_var_ranges; ++i) {
 	for (i = 0; i < num_var_ranges; ++i) {
@@ -405,7 +405,7 @@ int mtrr_add_page(unsigned long base, unsigned long size,
 	error = i;
 	error = i;
  out:
  out:
 	mutex_unlock(&mtrr_mutex);
 	mutex_unlock(&mtrr_mutex);
-	unlock_cpu_hotplug();
+	put_online_cpus();
 	return error;
 	return error;
 }
 }
 
 
@@ -495,7 +495,7 @@ int mtrr_del_page(int reg, unsigned long base, unsigned long size)
 
 
 	max = num_var_ranges;
 	max = num_var_ranges;
 	/* No CPU hotplug when we change MTRR entries */
 	/* No CPU hotplug when we change MTRR entries */
-	lock_cpu_hotplug();
+	get_online_cpus();
 	mutex_lock(&mtrr_mutex);
 	mutex_lock(&mtrr_mutex);
 	if (reg < 0) {
 	if (reg < 0) {
 		/*  Search for existing MTRR  */
 		/*  Search for existing MTRR  */
@@ -536,7 +536,7 @@ int mtrr_del_page(int reg, unsigned long base, unsigned long size)
 	error = reg;
 	error = reg;
  out:
  out:
 	mutex_unlock(&mtrr_mutex);
 	mutex_unlock(&mtrr_mutex);
-	unlock_cpu_hotplug();
+	put_online_cpus();
 	return error;
 	return error;
 }
 }
 /**
 /**

+ 3 - 3
arch/x86/kernel/entry_64.S

@@ -283,7 +283,7 @@ sysret_careful:
 sysret_signal:
 sysret_signal:
 	TRACE_IRQS_ON
 	TRACE_IRQS_ON
 	sti
 	sti
-	testl $(_TIF_SIGPENDING|_TIF_SINGLESTEP|_TIF_MCE_NOTIFY),%edx
+	testl $_TIF_DO_NOTIFY_MASK,%edx
 	jz    1f
 	jz    1f
 
 
 	/* Really a signal */
 	/* Really a signal */
@@ -377,7 +377,7 @@ int_very_careful:
 	jmp int_restore_rest
 	jmp int_restore_rest
 	
 	
 int_signal:
 int_signal:
-	testl $(_TIF_SIGPENDING|_TIF_SINGLESTEP|_TIF_MCE_NOTIFY),%edx
+	testl $_TIF_DO_NOTIFY_MASK,%edx
 	jz 1f
 	jz 1f
 	movq %rsp,%rdi		# &ptregs -> arg1
 	movq %rsp,%rdi		# &ptregs -> arg1
 	xorl %esi,%esi		# oldset -> arg2
 	xorl %esi,%esi		# oldset -> arg2
@@ -603,7 +603,7 @@ retint_careful:
 	jmp retint_check
 	jmp retint_check
 	
 	
 retint_signal:
 retint_signal:
-	testl $(_TIF_SIGPENDING|_TIF_SINGLESTEP|_TIF_MCE_NOTIFY),%edx
+	testl $_TIF_DO_NOTIFY_MASK,%edx
 	jz    retint_swapgs
 	jz    retint_swapgs
 	TRACE_IRQS_ON
 	TRACE_IRQS_ON
 	sti
 	sti

+ 8 - 8
arch/x86/kernel/microcode.c

@@ -436,7 +436,7 @@ static ssize_t microcode_write (struct file *file, const char __user *buf, size_
 		return -EINVAL;
 		return -EINVAL;
 	}
 	}
 
 
-	lock_cpu_hotplug();
+	get_online_cpus();
 	mutex_lock(&microcode_mutex);
 	mutex_lock(&microcode_mutex);
 
 
 	user_buffer = (void __user *) buf;
 	user_buffer = (void __user *) buf;
@@ -447,7 +447,7 @@ static ssize_t microcode_write (struct file *file, const char __user *buf, size_
 		ret = (ssize_t)len;
 		ret = (ssize_t)len;
 
 
 	mutex_unlock(&microcode_mutex);
 	mutex_unlock(&microcode_mutex);
-	unlock_cpu_hotplug();
+	put_online_cpus();
 
 
 	return ret;
 	return ret;
 }
 }
@@ -658,14 +658,14 @@ static ssize_t reload_store(struct sys_device *dev, const char *buf, size_t sz)
 
 
 		old = current->cpus_allowed;
 		old = current->cpus_allowed;
 
 
-		lock_cpu_hotplug();
+		get_online_cpus();
 		set_cpus_allowed(current, cpumask_of_cpu(cpu));
 		set_cpus_allowed(current, cpumask_of_cpu(cpu));
 
 
 		mutex_lock(&microcode_mutex);
 		mutex_lock(&microcode_mutex);
 		if (uci->valid)
 		if (uci->valid)
 			err = cpu_request_microcode(cpu);
 			err = cpu_request_microcode(cpu);
 		mutex_unlock(&microcode_mutex);
 		mutex_unlock(&microcode_mutex);
-		unlock_cpu_hotplug();
+		put_online_cpus();
 		set_cpus_allowed(current, old);
 		set_cpus_allowed(current, old);
 	}
 	}
 	if (err)
 	if (err)
@@ -817,9 +817,9 @@ static int __init microcode_init (void)
 		return PTR_ERR(microcode_pdev);
 		return PTR_ERR(microcode_pdev);
 	}
 	}
 
 
-	lock_cpu_hotplug();
+	get_online_cpus();
 	error = sysdev_driver_register(&cpu_sysdev_class, &mc_sysdev_driver);
 	error = sysdev_driver_register(&cpu_sysdev_class, &mc_sysdev_driver);
-	unlock_cpu_hotplug();
+	put_online_cpus();
 	if (error) {
 	if (error) {
 		microcode_dev_exit();
 		microcode_dev_exit();
 		platform_device_unregister(microcode_pdev);
 		platform_device_unregister(microcode_pdev);
@@ -839,9 +839,9 @@ static void __exit microcode_exit (void)
 
 
 	unregister_hotcpu_notifier(&mc_cpu_notifier);
 	unregister_hotcpu_notifier(&mc_cpu_notifier);
 
 
-	lock_cpu_hotplug();
+	get_online_cpus();
 	sysdev_driver_unregister(&cpu_sysdev_class, &mc_sysdev_driver);
 	sysdev_driver_unregister(&cpu_sysdev_class, &mc_sysdev_driver);
-	unlock_cpu_hotplug();
+	put_online_cpus();
 
 
 	platform_device_unregister(microcode_pdev);
 	platform_device_unregister(microcode_pdev);
 }
 }

+ 3 - 0
arch/x86/kernel/signal_32.c

@@ -658,6 +658,9 @@ void do_notify_resume(struct pt_regs *regs, void *_unused,
 	/* deal with pending signal delivery */
 	/* deal with pending signal delivery */
 	if (thread_info_flags & (_TIF_SIGPENDING | _TIF_RESTORE_SIGMASK))
 	if (thread_info_flags & (_TIF_SIGPENDING | _TIF_RESTORE_SIGMASK))
 		do_signal(regs);
 		do_signal(regs);
+
+	if (thread_info_flags & _TIF_HRTICK_RESCHED)
+		hrtick_resched();
 	
 	
 	clear_thread_flag(TIF_IRET);
 	clear_thread_flag(TIF_IRET);
 }
 }

+ 3 - 0
arch/x86/kernel/signal_64.c

@@ -480,6 +480,9 @@ do_notify_resume(struct pt_regs *regs, void *unused, __u32 thread_info_flags)
 	/* deal with pending signal delivery */
 	/* deal with pending signal delivery */
 	if (thread_info_flags & (_TIF_SIGPENDING|_TIF_RESTORE_SIGMASK))
 	if (thread_info_flags & (_TIF_SIGPENDING|_TIF_RESTORE_SIGMASK))
 		do_signal(regs);
 		do_signal(regs);
+
+	if (thread_info_flags & _TIF_HRTICK_RESCHED)
+		hrtick_resched();
 }
 }
 
 
 void signal_fault(struct pt_regs *regs, void __user *frame, char *where)
 void signal_fault(struct pt_regs *regs, void __user *frame, char *where)

+ 27 - 0
arch/x86/kernel/stacktrace.c

@@ -33,6 +33,19 @@ static void save_stack_address(void *data, unsigned long addr)
 		trace->entries[trace->nr_entries++] = addr;
 		trace->entries[trace->nr_entries++] = addr;
 }
 }
 
 
+static void save_stack_address_nosched(void *data, unsigned long addr)
+{
+	struct stack_trace *trace = (struct stack_trace *)data;
+	if (in_sched_functions(addr))
+		return;
+	if (trace->skip > 0) {
+		trace->skip--;
+		return;
+	}
+	if (trace->nr_entries < trace->max_entries)
+		trace->entries[trace->nr_entries++] = addr;
+}
+
 static const struct stacktrace_ops save_stack_ops = {
 static const struct stacktrace_ops save_stack_ops = {
 	.warning = save_stack_warning,
 	.warning = save_stack_warning,
 	.warning_symbol = save_stack_warning_symbol,
 	.warning_symbol = save_stack_warning_symbol,
@@ -40,6 +53,13 @@ static const struct stacktrace_ops save_stack_ops = {
 	.address = save_stack_address,
 	.address = save_stack_address,
 };
 };
 
 
+static const struct stacktrace_ops save_stack_ops_nosched = {
+	.warning = save_stack_warning,
+	.warning_symbol = save_stack_warning_symbol,
+	.stack = save_stack_stack,
+	.address = save_stack_address_nosched,
+};
+
 /*
 /*
  * Save stack-backtrace addresses into a stack_trace buffer.
  * Save stack-backtrace addresses into a stack_trace buffer.
  */
  */
@@ -50,3 +70,10 @@ void save_stack_trace(struct stack_trace *trace)
 		trace->entries[trace->nr_entries++] = ULONG_MAX;
 		trace->entries[trace->nr_entries++] = ULONG_MAX;
 }
 }
 EXPORT_SYMBOL(save_stack_trace);
 EXPORT_SYMBOL(save_stack_trace);
+
+void save_stack_trace_tsk(struct task_struct *tsk, struct stack_trace *trace)
+{
+	dump_trace(tsk, NULL, NULL, &save_stack_ops_nosched, trace);
+	if (trace->nr_entries < trace->max_entries)
+		trace->entries[trace->nr_entries++] = ULONG_MAX;
+}

+ 4 - 4
drivers/lguest/x86/core.c

@@ -459,7 +459,7 @@ void __init lguest_arch_host_init(void)
 
 
 	/* We don't need the complexity of CPUs coming and going while we're
 	/* We don't need the complexity of CPUs coming and going while we're
 	 * doing this. */
 	 * doing this. */
-	lock_cpu_hotplug();
+	get_online_cpus();
 	if (cpu_has_pge) { /* We have a broader idea of "global". */
 	if (cpu_has_pge) { /* We have a broader idea of "global". */
 		/* Remember that this was originally set (for cleanup). */
 		/* Remember that this was originally set (for cleanup). */
 		cpu_had_pge = 1;
 		cpu_had_pge = 1;
@@ -469,20 +469,20 @@ void __init lguest_arch_host_init(void)
 		/* Turn off the feature in the global feature set. */
 		/* Turn off the feature in the global feature set. */
 		clear_bit(X86_FEATURE_PGE, boot_cpu_data.x86_capability);
 		clear_bit(X86_FEATURE_PGE, boot_cpu_data.x86_capability);
 	}
 	}
-	unlock_cpu_hotplug();
+	put_online_cpus();
 };
 };
 /*:*/
 /*:*/
 
 
 void __exit lguest_arch_host_fini(void)
 void __exit lguest_arch_host_fini(void)
 {
 {
 	/* If we had PGE before we started, turn it back on now. */
 	/* If we had PGE before we started, turn it back on now. */
-	lock_cpu_hotplug();
+	get_online_cpus();
 	if (cpu_had_pge) {
 	if (cpu_had_pge) {
 		set_bit(X86_FEATURE_PGE, boot_cpu_data.x86_capability);
 		set_bit(X86_FEATURE_PGE, boot_cpu_data.x86_capability);
 		/* adjust_pge's argument "1" means set PGE. */
 		/* adjust_pge's argument "1" means set PGE. */
 		on_each_cpu(adjust_pge, (void *)1, 0, 1);
 		on_each_cpu(adjust_pge, (void *)1, 0, 1);
 	}
 	}
-	unlock_cpu_hotplug();
+	put_online_cpus();
 }
 }
 
 
 
 

+ 2 - 2
drivers/s390/char/sclp_config.c

@@ -29,12 +29,12 @@ static void sclp_cpu_capability_notify(struct work_struct *work)
 	struct sys_device *sysdev;
 	struct sys_device *sysdev;
 
 
 	printk(KERN_WARNING TAG "cpu capability changed.\n");
 	printk(KERN_WARNING TAG "cpu capability changed.\n");
-	lock_cpu_hotplug();
+	get_online_cpus();
 	for_each_online_cpu(cpu) {
 	for_each_online_cpu(cpu) {
 		sysdev = get_cpu_sysdev(cpu);
 		sysdev = get_cpu_sysdev(cpu);
 		kobject_uevent(&sysdev->kobj, KOBJ_CHANGE);
 		kobject_uevent(&sysdev->kobj, KOBJ_CHANGE);
 	}
 	}
-	unlock_cpu_hotplug();
+	put_online_cpus();
 }
 }
 
 
 static void sclp_conf_receiver_fn(struct evbuf_header *evbuf)
 static void sclp_conf_receiver_fn(struct evbuf_header *evbuf)

+ 0 - 1
fs/Kconfig

@@ -2130,4 +2130,3 @@ source "fs/nls/Kconfig"
 source "fs/dlm/Kconfig"
 source "fs/dlm/Kconfig"
 
 
 endmenu
 endmenu
-

+ 78 - 0
fs/proc/base.c

@@ -310,6 +310,77 @@ static int proc_pid_schedstat(struct task_struct *task, char *buffer)
 }
 }
 #endif
 #endif
 
 
+#ifdef CONFIG_LATENCYTOP
+static int lstats_show_proc(struct seq_file *m, void *v)
+{
+	int i;
+	struct task_struct *task = m->private;
+	seq_puts(m, "Latency Top version : v0.1\n");
+
+	for (i = 0; i < 32; i++) {
+		if (task->latency_record[i].backtrace[0]) {
+			int q;
+			seq_printf(m, "%i %li %li ",
+				task->latency_record[i].count,
+				task->latency_record[i].time,
+				task->latency_record[i].max);
+			for (q = 0; q < LT_BACKTRACEDEPTH; q++) {
+				char sym[KSYM_NAME_LEN];
+				char *c;
+				if (!task->latency_record[i].backtrace[q])
+					break;
+				if (task->latency_record[i].backtrace[q] == ULONG_MAX)
+					break;
+				sprint_symbol(sym, task->latency_record[i].backtrace[q]);
+				c = strchr(sym, '+');
+				if (c)
+					*c = 0;
+				seq_printf(m, "%s ", sym);
+			}
+			seq_printf(m, "\n");
+		}
+
+	}
+	return 0;
+}
+
+static int lstats_open(struct inode *inode, struct file *file)
+{
+	int ret;
+	struct seq_file *m;
+	struct task_struct *task = get_proc_task(inode);
+
+	ret = single_open(file, lstats_show_proc, NULL);
+	if (!ret) {
+		m = file->private_data;
+		m->private = task;
+	}
+	return ret;
+}
+
+static ssize_t lstats_write(struct file *file, const char __user *buf,
+			    size_t count, loff_t *offs)
+{
+	struct seq_file *m;
+	struct task_struct *task;
+
+	m = file->private_data;
+	task = m->private;
+	clear_all_latency_tracing(task);
+
+	return count;
+}
+
+static const struct file_operations proc_lstats_operations = {
+	.open		= lstats_open,
+	.read		= seq_read,
+	.write		= lstats_write,
+	.llseek		= seq_lseek,
+	.release	= single_release,
+};
+
+#endif
+
 /* The badness from the OOM killer */
 /* The badness from the OOM killer */
 unsigned long badness(struct task_struct *p, unsigned long uptime);
 unsigned long badness(struct task_struct *p, unsigned long uptime);
 static int proc_oom_score(struct task_struct *task, char *buffer)
 static int proc_oom_score(struct task_struct *task, char *buffer)
@@ -1020,6 +1091,7 @@ static const struct file_operations proc_fault_inject_operations = {
 };
 };
 #endif
 #endif
 
 
+
 #ifdef CONFIG_SCHED_DEBUG
 #ifdef CONFIG_SCHED_DEBUG
 /*
 /*
  * Print out various scheduling related per-task fields:
  * Print out various scheduling related per-task fields:
@@ -2230,6 +2302,9 @@ static const struct pid_entry tgid_base_stuff[] = {
 #ifdef CONFIG_SCHEDSTATS
 #ifdef CONFIG_SCHEDSTATS
 	INF("schedstat",  S_IRUGO, pid_schedstat),
 	INF("schedstat",  S_IRUGO, pid_schedstat),
 #endif
 #endif
+#ifdef CONFIG_LATENCYTOP
+	REG("latency",  S_IRUGO, lstats),
+#endif
 #ifdef CONFIG_PROC_PID_CPUSET
 #ifdef CONFIG_PROC_PID_CPUSET
 	REG("cpuset",     S_IRUGO, cpuset),
 	REG("cpuset",     S_IRUGO, cpuset),
 #endif
 #endif
@@ -2555,6 +2630,9 @@ static const struct pid_entry tid_base_stuff[] = {
 #ifdef CONFIG_SCHEDSTATS
 #ifdef CONFIG_SCHEDSTATS
 	INF("schedstat", S_IRUGO, pid_schedstat),
 	INF("schedstat", S_IRUGO, pid_schedstat),
 #endif
 #endif
+#ifdef CONFIG_LATENCYTOP
+	REG("latency",  S_IRUGO, lstats),
+#endif
 #ifdef CONFIG_PROC_PID_CPUSET
 #ifdef CONFIG_PROC_PID_CPUSET
 	REG("cpuset",    S_IRUGO, cpuset),
 	REG("cpuset",    S_IRUGO, cpuset),
 #endif
 #endif

+ 3 - 2
include/asm-generic/resource.h

@@ -44,8 +44,8 @@
 #define RLIMIT_NICE		13	/* max nice prio allowed to raise to
 #define RLIMIT_NICE		13	/* max nice prio allowed to raise to
 					   0-39 for nice level 19 .. -20 */
 					   0-39 for nice level 19 .. -20 */
 #define RLIMIT_RTPRIO		14	/* maximum realtime priority */
 #define RLIMIT_RTPRIO		14	/* maximum realtime priority */
-
-#define RLIM_NLIMITS		15
+#define RLIMIT_RTTIME		15	/* timeout for RT tasks in us */
+#define RLIM_NLIMITS		16
 
 
 /*
 /*
  * SuS says limits have to be unsigned.
  * SuS says limits have to be unsigned.
@@ -86,6 +86,7 @@
 	[RLIMIT_MSGQUEUE]	= {   MQ_BYTES_MAX,   MQ_BYTES_MAX },	\
 	[RLIMIT_MSGQUEUE]	= {   MQ_BYTES_MAX,   MQ_BYTES_MAX },	\
 	[RLIMIT_NICE]		= { 0, 0 },				\
 	[RLIMIT_NICE]		= { 0, 0 },				\
 	[RLIMIT_RTPRIO]		= { 0, 0 },				\
 	[RLIMIT_RTPRIO]		= { 0, 0 },				\
+	[RLIMIT_RTTIME]		= {  RLIM_INFINITY,  RLIM_INFINITY },	\
 }
 }
 
 
 #endif	/* __KERNEL__ */
 #endif	/* __KERNEL__ */

+ 2 - 0
include/asm-x86/thread_info_32.h

@@ -132,6 +132,7 @@ static inline struct thread_info *current_thread_info(void)
 #define TIF_SYSCALL_AUDIT	6	/* syscall auditing active */
 #define TIF_SYSCALL_AUDIT	6	/* syscall auditing active */
 #define TIF_SECCOMP		7	/* secure computing */
 #define TIF_SECCOMP		7	/* secure computing */
 #define TIF_RESTORE_SIGMASK	8	/* restore signal mask in do_signal() */
 #define TIF_RESTORE_SIGMASK	8	/* restore signal mask in do_signal() */
+#define TIF_HRTICK_RESCHED	9	/* reprogram hrtick timer */
 #define TIF_MEMDIE		16
 #define TIF_MEMDIE		16
 #define TIF_DEBUG		17	/* uses debug registers */
 #define TIF_DEBUG		17	/* uses debug registers */
 #define TIF_IO_BITMAP		18	/* uses I/O bitmap */
 #define TIF_IO_BITMAP		18	/* uses I/O bitmap */
@@ -147,6 +148,7 @@ static inline struct thread_info *current_thread_info(void)
 #define _TIF_SYSCALL_AUDIT	(1<<TIF_SYSCALL_AUDIT)
 #define _TIF_SYSCALL_AUDIT	(1<<TIF_SYSCALL_AUDIT)
 #define _TIF_SECCOMP		(1<<TIF_SECCOMP)
 #define _TIF_SECCOMP		(1<<TIF_SECCOMP)
 #define _TIF_RESTORE_SIGMASK	(1<<TIF_RESTORE_SIGMASK)
 #define _TIF_RESTORE_SIGMASK	(1<<TIF_RESTORE_SIGMASK)
+#define _TIF_HRTICK_RESCHED	(1<<TIF_HRTICK_RESCHED)
 #define _TIF_DEBUG		(1<<TIF_DEBUG)
 #define _TIF_DEBUG		(1<<TIF_DEBUG)
 #define _TIF_IO_BITMAP		(1<<TIF_IO_BITMAP)
 #define _TIF_IO_BITMAP		(1<<TIF_IO_BITMAP)
 #define _TIF_FREEZE		(1<<TIF_FREEZE)
 #define _TIF_FREEZE		(1<<TIF_FREEZE)

+ 5 - 0
include/asm-x86/thread_info_64.h

@@ -115,6 +115,7 @@ static inline struct thread_info *stack_thread_info(void)
 #define TIF_SECCOMP		8	/* secure computing */
 #define TIF_SECCOMP		8	/* secure computing */
 #define TIF_RESTORE_SIGMASK	9	/* restore signal mask in do_signal */
 #define TIF_RESTORE_SIGMASK	9	/* restore signal mask in do_signal */
 #define TIF_MCE_NOTIFY		10	/* notify userspace of an MCE */
 #define TIF_MCE_NOTIFY		10	/* notify userspace of an MCE */
+#define TIF_HRTICK_RESCHED	11	/* reprogram hrtick timer */
 /* 16 free */
 /* 16 free */
 #define TIF_IA32		17	/* 32bit process */ 
 #define TIF_IA32		17	/* 32bit process */ 
 #define TIF_FORK		18	/* ret_from_fork */
 #define TIF_FORK		18	/* ret_from_fork */
@@ -133,6 +134,7 @@ static inline struct thread_info *stack_thread_info(void)
 #define _TIF_SECCOMP		(1<<TIF_SECCOMP)
 #define _TIF_SECCOMP		(1<<TIF_SECCOMP)
 #define _TIF_RESTORE_SIGMASK	(1<<TIF_RESTORE_SIGMASK)
 #define _TIF_RESTORE_SIGMASK	(1<<TIF_RESTORE_SIGMASK)
 #define _TIF_MCE_NOTIFY		(1<<TIF_MCE_NOTIFY)
 #define _TIF_MCE_NOTIFY		(1<<TIF_MCE_NOTIFY)
+#define _TIF_HRTICK_RESCHED	(1<<TIF_HRTICK_RESCHED)
 #define _TIF_IA32		(1<<TIF_IA32)
 #define _TIF_IA32		(1<<TIF_IA32)
 #define _TIF_FORK		(1<<TIF_FORK)
 #define _TIF_FORK		(1<<TIF_FORK)
 #define _TIF_ABI_PENDING	(1<<TIF_ABI_PENDING)
 #define _TIF_ABI_PENDING	(1<<TIF_ABI_PENDING)
@@ -146,6 +148,9 @@ static inline struct thread_info *stack_thread_info(void)
 /* work to do on any return to user space */
 /* work to do on any return to user space */
 #define _TIF_ALLWORK_MASK (0x0000FFFF & ~_TIF_SECCOMP)
 #define _TIF_ALLWORK_MASK (0x0000FFFF & ~_TIF_SECCOMP)
 
 
+#define _TIF_DO_NOTIFY_MASK \
+	(_TIF_SIGPENDING|_TIF_SINGLESTEP|_TIF_MCE_NOTIFY|_TIF_HRTICK_RESCHED)
+
 /* flags to check in __switch_to() */
 /* flags to check in __switch_to() */
 #define _TIF_WORK_CTXSW (_TIF_DEBUG|_TIF_IO_BITMAP)
 #define _TIF_WORK_CTXSW (_TIF_DEBUG|_TIF_IO_BITMAP)
 
 

+ 13 - 4
include/linux/cpu.h

@@ -71,18 +71,27 @@ static inline void unregister_cpu_notifier(struct notifier_block *nb)
 
 
 int cpu_up(unsigned int cpu);
 int cpu_up(unsigned int cpu);
 
 
+extern void cpu_hotplug_init(void);
+
 #else
 #else
 
 
 static inline int register_cpu_notifier(struct notifier_block *nb)
 static inline int register_cpu_notifier(struct notifier_block *nb)
 {
 {
 	return 0;
 	return 0;
 }
 }
+
 static inline void unregister_cpu_notifier(struct notifier_block *nb)
 static inline void unregister_cpu_notifier(struct notifier_block *nb)
 {
 {
 }
 }
 
 
+static inline void cpu_hotplug_init(void)
+{
+}
+
 #endif /* CONFIG_SMP */
 #endif /* CONFIG_SMP */
 extern struct sysdev_class cpu_sysdev_class;
 extern struct sysdev_class cpu_sysdev_class;
+extern void cpu_maps_update_begin(void);
+extern void cpu_maps_update_done(void);
 
 
 #ifdef CONFIG_HOTPLUG_CPU
 #ifdef CONFIG_HOTPLUG_CPU
 /* Stop CPUs going up and down. */
 /* Stop CPUs going up and down. */
@@ -97,8 +106,8 @@ static inline void cpuhotplug_mutex_unlock(struct mutex *cpu_hp_mutex)
 	mutex_unlock(cpu_hp_mutex);
 	mutex_unlock(cpu_hp_mutex);
 }
 }
 
 
-extern void lock_cpu_hotplug(void);
-extern void unlock_cpu_hotplug(void);
+extern void get_online_cpus(void);
+extern void put_online_cpus(void);
 #define hotcpu_notifier(fn, pri) {				\
 #define hotcpu_notifier(fn, pri) {				\
 	static struct notifier_block fn##_nb =			\
 	static struct notifier_block fn##_nb =			\
 		{ .notifier_call = fn, .priority = pri };	\
 		{ .notifier_call = fn, .priority = pri };	\
@@ -115,8 +124,8 @@ static inline void cpuhotplug_mutex_lock(struct mutex *cpu_hp_mutex)
 static inline void cpuhotplug_mutex_unlock(struct mutex *cpu_hp_mutex)
 static inline void cpuhotplug_mutex_unlock(struct mutex *cpu_hp_mutex)
 { }
 { }
 
 
-#define lock_cpu_hotplug()	do { } while (0)
-#define unlock_cpu_hotplug()	do { } while (0)
+#define get_online_cpus()	do { } while (0)
+#define put_online_cpus()	do { } while (0)
 #define hotcpu_notifier(fn, pri)	do { (void)(fn); } while (0)
 #define hotcpu_notifier(fn, pri)	do { (void)(fn); } while (0)
 /* These aren't inline functions due to a GCC bug. */
 /* These aren't inline functions due to a GCC bug. */
 #define register_hotcpu_notifier(nb)	({ (void)(nb); 0; })
 #define register_hotcpu_notifier(nb)	({ (void)(nb); 0; })

+ 5 - 0
include/linux/debug_locks.h

@@ -47,6 +47,7 @@ struct task_struct;
 
 
 #ifdef CONFIG_LOCKDEP
 #ifdef CONFIG_LOCKDEP
 extern void debug_show_all_locks(void);
 extern void debug_show_all_locks(void);
+extern void __debug_show_held_locks(struct task_struct *task);
 extern void debug_show_held_locks(struct task_struct *task);
 extern void debug_show_held_locks(struct task_struct *task);
 extern void debug_check_no_locks_freed(const void *from, unsigned long len);
 extern void debug_check_no_locks_freed(const void *from, unsigned long len);
 extern void debug_check_no_locks_held(struct task_struct *task);
 extern void debug_check_no_locks_held(struct task_struct *task);
@@ -55,6 +56,10 @@ static inline void debug_show_all_locks(void)
 {
 {
 }
 }
 
 
+static inline void __debug_show_held_locks(struct task_struct *task)
+{
+}
+
 static inline void debug_show_held_locks(struct task_struct *task)
 static inline void debug_show_held_locks(struct task_struct *task)
 {
 {
 }
 }

+ 5 - 1
include/linux/futex.h

@@ -1,8 +1,12 @@
 #ifndef _LINUX_FUTEX_H
 #ifndef _LINUX_FUTEX_H
 #define _LINUX_FUTEX_H
 #define _LINUX_FUTEX_H
 
 
-#include <linux/sched.h>
+#include <linux/compiler.h>
+#include <linux/types.h>
 
 
+struct inode;
+struct mm_struct;
+struct task_struct;
 union ktime;
 union ktime;
 
 
 /* Second argument to futex syscall */
 /* Second argument to futex syscall */

+ 1 - 5
include/linux/hardirq.h

@@ -72,11 +72,7 @@
 #define in_softirq()		(softirq_count())
 #define in_softirq()		(softirq_count())
 #define in_interrupt()		(irq_count())
 #define in_interrupt()		(irq_count())
 
 
-#if defined(CONFIG_PREEMPT) && !defined(CONFIG_PREEMPT_BKL)
-# define in_atomic()	((preempt_count() & ~PREEMPT_ACTIVE) != kernel_locked())
-#else
-# define in_atomic()	((preempt_count() & ~PREEMPT_ACTIVE) != 0)
-#endif
+#define in_atomic()		((preempt_count() & ~PREEMPT_ACTIVE) != 0)
 
 
 #ifdef CONFIG_PREEMPT
 #ifdef CONFIG_PREEMPT
 # define PREEMPT_CHECK_OFFSET 1
 # define PREEMPT_CHECK_OFFSET 1

+ 11 - 3
include/linux/hrtimer.h

@@ -115,10 +115,8 @@ struct hrtimer {
 	enum hrtimer_restart		(*function)(struct hrtimer *);
 	enum hrtimer_restart		(*function)(struct hrtimer *);
 	struct hrtimer_clock_base	*base;
 	struct hrtimer_clock_base	*base;
 	unsigned long			state;
 	unsigned long			state;
-#ifdef CONFIG_HIGH_RES_TIMERS
 	enum hrtimer_cb_mode		cb_mode;
 	enum hrtimer_cb_mode		cb_mode;
 	struct list_head		cb_entry;
 	struct list_head		cb_entry;
-#endif
 #ifdef CONFIG_TIMER_STATS
 #ifdef CONFIG_TIMER_STATS
 	void				*start_site;
 	void				*start_site;
 	char				start_comm[16];
 	char				start_comm[16];
@@ -194,10 +192,10 @@ struct hrtimer_cpu_base {
 	spinlock_t			lock;
 	spinlock_t			lock;
 	struct lock_class_key		lock_key;
 	struct lock_class_key		lock_key;
 	struct hrtimer_clock_base	clock_base[HRTIMER_MAX_CLOCK_BASES];
 	struct hrtimer_clock_base	clock_base[HRTIMER_MAX_CLOCK_BASES];
+	struct list_head		cb_pending;
 #ifdef CONFIG_HIGH_RES_TIMERS
 #ifdef CONFIG_HIGH_RES_TIMERS
 	ktime_t				expires_next;
 	ktime_t				expires_next;
 	int				hres_active;
 	int				hres_active;
-	struct list_head		cb_pending;
 	unsigned long			nr_events;
 	unsigned long			nr_events;
 #endif
 #endif
 };
 };
@@ -217,6 +215,11 @@ static inline ktime_t hrtimer_cb_get_time(struct hrtimer *timer)
 	return timer->base->get_time();
 	return timer->base->get_time();
 }
 }
 
 
+static inline int hrtimer_is_hres_active(struct hrtimer *timer)
+{
+	return timer->base->cpu_base->hres_active;
+}
+
 /*
 /*
  * The resolution of the clocks. The resolution value is returned in
  * The resolution of the clocks. The resolution value is returned in
  * the clock_getres() system call to give application programmers an
  * the clock_getres() system call to give application programmers an
@@ -248,6 +251,10 @@ static inline ktime_t hrtimer_cb_get_time(struct hrtimer *timer)
 	return timer->base->softirq_time;
 	return timer->base->softirq_time;
 }
 }
 
 
+static inline int hrtimer_is_hres_active(struct hrtimer *timer)
+{
+	return 0;
+}
 #endif
 #endif
 
 
 extern ktime_t ktime_get(void);
 extern ktime_t ktime_get(void);
@@ -310,6 +317,7 @@ extern void hrtimer_init_sleeper(struct hrtimer_sleeper *sl,
 
 
 /* Soft interrupt function to run the hrtimer queues: */
 /* Soft interrupt function to run the hrtimer queues: */
 extern void hrtimer_run_queues(void);
 extern void hrtimer_run_queues(void);
+extern void hrtimer_run_pending(void);
 
 
 /* Bootup initialization: */
 /* Bootup initialization: */
 extern void __init hrtimers_init(void);
 extern void __init hrtimers_init(void);

+ 5 - 2
include/linux/init_task.h

@@ -132,9 +132,12 @@ extern struct group_info init_groups;
 	.cpus_allowed	= CPU_MASK_ALL,					\
 	.cpus_allowed	= CPU_MASK_ALL,					\
 	.mm		= NULL,						\
 	.mm		= NULL,						\
 	.active_mm	= &init_mm,					\
 	.active_mm	= &init_mm,					\
-	.run_list	= LIST_HEAD_INIT(tsk.run_list),			\
+	.rt		= {						\
+		.run_list	= LIST_HEAD_INIT(tsk.rt.run_list),	\
+		.time_slice	= HZ, 					\
+		.nr_cpus_allowed = NR_CPUS,				\
+	},								\
 	.ioprio		= 0,						\
 	.ioprio		= 0,						\
-	.time_slice	= HZ,						\
 	.tasks		= LIST_HEAD_INIT(tsk.tasks),			\
 	.tasks		= LIST_HEAD_INIT(tsk.tasks),			\
 	.ptrace_children= LIST_HEAD_INIT(tsk.ptrace_children),		\
 	.ptrace_children= LIST_HEAD_INIT(tsk.ptrace_children),		\
 	.ptrace_list	= LIST_HEAD_INIT(tsk.ptrace_list),		\
 	.ptrace_list	= LIST_HEAD_INIT(tsk.ptrace_list),		\

+ 1 - 0
include/linux/interrupt.h

@@ -256,6 +256,7 @@ enum
 #ifdef CONFIG_HIGH_RES_TIMERS
 #ifdef CONFIG_HIGH_RES_TIMERS
 	HRTIMER_SOFTIRQ,
 	HRTIMER_SOFTIRQ,
 #endif
 #endif
+	RCU_SOFTIRQ, 	/* Preferable RCU should always be the last softirq */
 };
 };
 
 
 /* softirq mask and active fields moved to irq_cpustat_t in
 /* softirq mask and active fields moved to irq_cpustat_t in

+ 6 - 0
include/linux/jiffies.h

@@ -29,6 +29,12 @@
 # define SHIFT_HZ	9
 # define SHIFT_HZ	9
 #elif HZ >= 768 && HZ < 1536
 #elif HZ >= 768 && HZ < 1536
 # define SHIFT_HZ	10
 # define SHIFT_HZ	10
+#elif HZ >= 1536 && HZ < 3072
+# define SHIFT_HZ	11
+#elif HZ >= 3072 && HZ < 6144
+# define SHIFT_HZ	12
+#elif HZ >= 6144 && HZ < 12288
+# define SHIFT_HZ	13
 #else
 #else
 # error You lose.
 # error You lose.
 #endif
 #endif

+ 2 - 2
include/linux/kernel.h

@@ -105,8 +105,8 @@ struct user;
  * supposed to.
  * supposed to.
  */
  */
 #ifdef CONFIG_PREEMPT_VOLUNTARY
 #ifdef CONFIG_PREEMPT_VOLUNTARY
-extern int cond_resched(void);
-# define might_resched() cond_resched()
+extern int _cond_resched(void);
+# define might_resched() _cond_resched()
 #else
 #else
 # define might_resched() do { } while (0)
 # define might_resched() do { } while (0)
 #endif
 #endif

+ 44 - 0
include/linux/latencytop.h

@@ -0,0 +1,44 @@
+/*
+ * latencytop.h: Infrastructure for displaying latency
+ *
+ * (C) Copyright 2008 Intel Corporation
+ * Author: Arjan van de Ven <arjan@linux.intel.com>
+ *
+ */
+
+#ifndef _INCLUDE_GUARD_LATENCYTOP_H_
+#define _INCLUDE_GUARD_LATENCYTOP_H_
+
+#ifdef CONFIG_LATENCYTOP
+
+#define LT_SAVECOUNT		32
+#define LT_BACKTRACEDEPTH	12
+
+struct latency_record {
+	unsigned long	backtrace[LT_BACKTRACEDEPTH];
+	unsigned int	count;
+	unsigned long	time;
+	unsigned long	max;
+};
+
+
+struct task_struct;
+
+void account_scheduler_latency(struct task_struct *task, int usecs, int inter);
+
+void clear_all_latency_tracing(struct task_struct *p);
+
+#else
+
+static inline void
+account_scheduler_latency(struct task_struct *task, int usecs, int inter)
+{
+}
+
+static inline void clear_all_latency_tracing(struct task_struct *p)
+{
+}
+
+#endif
+
+#endif

+ 1 - 3
include/linux/notifier.h

@@ -207,9 +207,7 @@ static inline int notifier_to_errno(int ret)
 #define CPU_DOWN_PREPARE	0x0005 /* CPU (unsigned)v going down */
 #define CPU_DOWN_PREPARE	0x0005 /* CPU (unsigned)v going down */
 #define CPU_DOWN_FAILED		0x0006 /* CPU (unsigned)v NOT going down */
 #define CPU_DOWN_FAILED		0x0006 /* CPU (unsigned)v NOT going down */
 #define CPU_DEAD		0x0007 /* CPU (unsigned)v dead */
 #define CPU_DEAD		0x0007 /* CPU (unsigned)v dead */
-#define CPU_LOCK_ACQUIRE	0x0008 /* Acquire all hotcpu locks */
-#define CPU_LOCK_RELEASE	0x0009 /* Release all hotcpu locks */
-#define CPU_DYING		0x000A /* CPU (unsigned)v not running any task,
+#define CPU_DYING		0x0008 /* CPU (unsigned)v not running any task,
 				        * not handling interrupts, soon dead */
 				        * not handling interrupts, soon dead */
 
 
 /* Used for CPU hotplug events occuring while tasks are frozen due to a suspend
 /* Used for CPU hotplug events occuring while tasks are frozen due to a suspend

+ 164 - 0
include/linux/rcuclassic.h

@@ -0,0 +1,164 @@
+/*
+ * Read-Copy Update mechanism for mutual exclusion (classic version)
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+ *
+ * Copyright IBM Corporation, 2001
+ *
+ * Author: Dipankar Sarma <dipankar@in.ibm.com>
+ *
+ * Based on the original work by Paul McKenney <paulmck@us.ibm.com>
+ * and inputs from Rusty Russell, Andrea Arcangeli and Andi Kleen.
+ * Papers:
+ * http://www.rdrop.com/users/paulmck/paper/rclockpdcsproof.pdf
+ * http://lse.sourceforge.net/locking/rclock_OLS.2001.05.01c.sc.pdf (OLS2001)
+ *
+ * For detailed explanation of Read-Copy Update mechanism see -
+ * 		Documentation/RCU
+ *
+ */
+
+#ifndef __LINUX_RCUCLASSIC_H
+#define __LINUX_RCUCLASSIC_H
+
+#ifdef __KERNEL__
+
+#include <linux/cache.h>
+#include <linux/spinlock.h>
+#include <linux/threads.h>
+#include <linux/percpu.h>
+#include <linux/cpumask.h>
+#include <linux/seqlock.h>
+
+
+/* Global control variables for rcupdate callback mechanism. */
+struct rcu_ctrlblk {
+	long	cur;		/* Current batch number.                      */
+	long	completed;	/* Number of the last completed batch         */
+	int	next_pending;	/* Is the next batch already waiting?         */
+
+	int	signaled;
+
+	spinlock_t	lock	____cacheline_internodealigned_in_smp;
+	cpumask_t	cpumask; /* CPUs that need to switch in order    */
+				 /* for current batch to proceed.        */
+} ____cacheline_internodealigned_in_smp;
+
+/* Is batch a before batch b ? */
+static inline int rcu_batch_before(long a, long b)
+{
+	return (a - b) < 0;
+}
+
+/* Is batch a after batch b ? */
+static inline int rcu_batch_after(long a, long b)
+{
+	return (a - b) > 0;
+}
+
+/*
+ * Per-CPU data for Read-Copy UPdate.
+ * nxtlist - new callbacks are added here
+ * curlist - current batch for which quiescent cycle started if any
+ */
+struct rcu_data {
+	/* 1) quiescent state handling : */
+	long		quiescbatch;     /* Batch # for grace period */
+	int		passed_quiesc;	 /* User-mode/idle loop etc. */
+	int		qs_pending;	 /* core waits for quiesc state */
+
+	/* 2) batch handling */
+	long  	       	batch;           /* Batch # for current RCU batch */
+	struct rcu_head *nxtlist;
+	struct rcu_head **nxttail;
+	long            qlen; 	 	 /* # of queued callbacks */
+	struct rcu_head *curlist;
+	struct rcu_head **curtail;
+	struct rcu_head *donelist;
+	struct rcu_head **donetail;
+	long		blimit;		 /* Upper limit on a processed batch */
+	int cpu;
+	struct rcu_head barrier;
+};
+
+DECLARE_PER_CPU(struct rcu_data, rcu_data);
+DECLARE_PER_CPU(struct rcu_data, rcu_bh_data);
+
+/*
+ * Increment the quiescent state counter.
+ * The counter is a bit degenerated: We do not need to know
+ * how many quiescent states passed, just if there was at least
+ * one since the start of the grace period. Thus just a flag.
+ */
+static inline void rcu_qsctr_inc(int cpu)
+{
+	struct rcu_data *rdp = &per_cpu(rcu_data, cpu);
+	rdp->passed_quiesc = 1;
+}
+static inline void rcu_bh_qsctr_inc(int cpu)
+{
+	struct rcu_data *rdp = &per_cpu(rcu_bh_data, cpu);
+	rdp->passed_quiesc = 1;
+}
+
+extern int rcu_pending(int cpu);
+extern int rcu_needs_cpu(int cpu);
+
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+extern struct lockdep_map rcu_lock_map;
+# define rcu_read_acquire()	\
+			lock_acquire(&rcu_lock_map, 0, 0, 2, 1, _THIS_IP_)
+# define rcu_read_release()	lock_release(&rcu_lock_map, 1, _THIS_IP_)
+#else
+# define rcu_read_acquire()	do { } while (0)
+# define rcu_read_release()	do { } while (0)
+#endif
+
+#define __rcu_read_lock() \
+	do { \
+		preempt_disable(); \
+		__acquire(RCU); \
+		rcu_read_acquire(); \
+	} while (0)
+#define __rcu_read_unlock() \
+	do { \
+		rcu_read_release(); \
+		__release(RCU); \
+		preempt_enable(); \
+	} while (0)
+#define __rcu_read_lock_bh() \
+	do { \
+		local_bh_disable(); \
+		__acquire(RCU_BH); \
+		rcu_read_acquire(); \
+	} while (0)
+#define __rcu_read_unlock_bh() \
+	do { \
+		rcu_read_release(); \
+		__release(RCU_BH); \
+		local_bh_enable(); \
+	} while (0)
+
+#define __synchronize_sched() synchronize_rcu()
+
+extern void __rcu_init(void);
+extern void rcu_check_callbacks(int cpu, int user);
+extern void rcu_restart_cpu(int cpu);
+
+extern long rcu_batches_completed(void);
+extern long rcu_batches_completed_bh(void);
+
+#endif /* __KERNEL__ */
+#endif /* __LINUX_RCUCLASSIC_H */

+ 53 - 120
include/linux/rcupdate.h

@@ -15,7 +15,7 @@
  * along with this program; if not, write to the Free Software
  * along with this program; if not, write to the Free Software
  * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
  * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
  *
  *
- * Copyright (C) IBM Corporation, 2001
+ * Copyright IBM Corporation, 2001
  *
  *
  * Author: Dipankar Sarma <dipankar@in.ibm.com>
  * Author: Dipankar Sarma <dipankar@in.ibm.com>
  * 
  * 
@@ -53,96 +53,18 @@ struct rcu_head {
 	void (*func)(struct rcu_head *head);
 	void (*func)(struct rcu_head *head);
 };
 };
 
 
+#ifdef CONFIG_CLASSIC_RCU
+#include <linux/rcuclassic.h>
+#else /* #ifdef CONFIG_CLASSIC_RCU */
+#include <linux/rcupreempt.h>
+#endif /* #else #ifdef CONFIG_CLASSIC_RCU */
+
 #define RCU_HEAD_INIT 	{ .next = NULL, .func = NULL }
 #define RCU_HEAD_INIT 	{ .next = NULL, .func = NULL }
 #define RCU_HEAD(head) struct rcu_head head = RCU_HEAD_INIT
 #define RCU_HEAD(head) struct rcu_head head = RCU_HEAD_INIT
 #define INIT_RCU_HEAD(ptr) do { \
 #define INIT_RCU_HEAD(ptr) do { \
        (ptr)->next = NULL; (ptr)->func = NULL; \
        (ptr)->next = NULL; (ptr)->func = NULL; \
 } while (0)
 } while (0)
 
 
-
-
-/* Global control variables for rcupdate callback mechanism. */
-struct rcu_ctrlblk {
-	long	cur;		/* Current batch number.                      */
-	long	completed;	/* Number of the last completed batch         */
-	int	next_pending;	/* Is the next batch already waiting?         */
-
-	int	signaled;
-
-	spinlock_t	lock	____cacheline_internodealigned_in_smp;
-	cpumask_t	cpumask; /* CPUs that need to switch in order    */
-	                         /* for current batch to proceed.        */
-} ____cacheline_internodealigned_in_smp;
-
-/* Is batch a before batch b ? */
-static inline int rcu_batch_before(long a, long b)
-{
-        return (a - b) < 0;
-}
-
-/* Is batch a after batch b ? */
-static inline int rcu_batch_after(long a, long b)
-{
-        return (a - b) > 0;
-}
-
-/*
- * Per-CPU data for Read-Copy UPdate.
- * nxtlist - new callbacks are added here
- * curlist - current batch for which quiescent cycle started if any
- */
-struct rcu_data {
-	/* 1) quiescent state handling : */
-	long		quiescbatch;     /* Batch # for grace period */
-	int		passed_quiesc;	 /* User-mode/idle loop etc. */
-	int		qs_pending;	 /* core waits for quiesc state */
-
-	/* 2) batch handling */
-	long  	       	batch;           /* Batch # for current RCU batch */
-	struct rcu_head *nxtlist;
-	struct rcu_head **nxttail;
-	long            qlen; 	 	 /* # of queued callbacks */
-	struct rcu_head *curlist;
-	struct rcu_head **curtail;
-	struct rcu_head *donelist;
-	struct rcu_head **donetail;
-	long		blimit;		 /* Upper limit on a processed batch */
-	int cpu;
-	struct rcu_head barrier;
-};
-
-DECLARE_PER_CPU(struct rcu_data, rcu_data);
-DECLARE_PER_CPU(struct rcu_data, rcu_bh_data);
-
-/*
- * Increment the quiescent state counter.
- * The counter is a bit degenerated: We do not need to know
- * how many quiescent states passed, just if there was at least
- * one since the start of the grace period. Thus just a flag.
- */
-static inline void rcu_qsctr_inc(int cpu)
-{
-	struct rcu_data *rdp = &per_cpu(rcu_data, cpu);
-	rdp->passed_quiesc = 1;
-}
-static inline void rcu_bh_qsctr_inc(int cpu)
-{
-	struct rcu_data *rdp = &per_cpu(rcu_bh_data, cpu);
-	rdp->passed_quiesc = 1;
-}
-
-extern int rcu_pending(int cpu);
-extern int rcu_needs_cpu(int cpu);
-
-#ifdef CONFIG_DEBUG_LOCK_ALLOC
-extern struct lockdep_map rcu_lock_map;
-# define rcu_read_acquire()	lock_acquire(&rcu_lock_map, 0, 0, 2, 1, _THIS_IP_)
-# define rcu_read_release()	lock_release(&rcu_lock_map, 1, _THIS_IP_)
-#else
-# define rcu_read_acquire()	do { } while (0)
-# define rcu_read_release()	do { } while (0)
-#endif
-
 /**
 /**
  * rcu_read_lock - mark the beginning of an RCU read-side critical section.
  * rcu_read_lock - mark the beginning of an RCU read-side critical section.
  *
  *
@@ -172,24 +94,13 @@ extern struct lockdep_map rcu_lock_map;
  *
  *
  * It is illegal to block while in an RCU read-side critical section.
  * It is illegal to block while in an RCU read-side critical section.
  */
  */
-#define rcu_read_lock() \
-	do { \
-		preempt_disable(); \
-		__acquire(RCU); \
-		rcu_read_acquire(); \
-	} while(0)
+#define rcu_read_lock() __rcu_read_lock()
 
 
 /**
 /**
  * rcu_read_unlock - marks the end of an RCU read-side critical section.
  * rcu_read_unlock - marks the end of an RCU read-side critical section.
  *
  *
  * See rcu_read_lock() for more information.
  * See rcu_read_lock() for more information.
  */
  */
-#define rcu_read_unlock() \
-	do { \
-		rcu_read_release(); \
-		__release(RCU); \
-		preempt_enable(); \
-	} while(0)
 
 
 /*
 /*
  * So where is rcu_write_lock()?  It does not exist, as there is no
  * So where is rcu_write_lock()?  It does not exist, as there is no
@@ -200,6 +111,7 @@ extern struct lockdep_map rcu_lock_map;
  * used as well.  RCU does not care how the writers keep out of each
  * used as well.  RCU does not care how the writers keep out of each
  * others' way, as long as they do so.
  * others' way, as long as they do so.
  */
  */
+#define rcu_read_unlock() __rcu_read_unlock()
 
 
 /**
 /**
  * rcu_read_lock_bh - mark the beginning of a softirq-only RCU critical section
  * rcu_read_lock_bh - mark the beginning of a softirq-only RCU critical section
@@ -212,24 +124,14 @@ extern struct lockdep_map rcu_lock_map;
  * can use just rcu_read_lock().
  * can use just rcu_read_lock().
  *
  *
  */
  */
-#define rcu_read_lock_bh() \
-	do { \
-		local_bh_disable(); \
-		__acquire(RCU_BH); \
-		rcu_read_acquire(); \
-	} while(0)
+#define rcu_read_lock_bh() __rcu_read_lock_bh()
 
 
 /*
 /*
  * rcu_read_unlock_bh - marks the end of a softirq-only RCU critical section
  * rcu_read_unlock_bh - marks the end of a softirq-only RCU critical section
  *
  *
  * See rcu_read_lock_bh() for more information.
  * See rcu_read_lock_bh() for more information.
  */
  */
-#define rcu_read_unlock_bh() \
-	do { \
-		rcu_read_release(); \
-		__release(RCU_BH); \
-		local_bh_enable(); \
-	} while(0)
+#define rcu_read_unlock_bh() __rcu_read_unlock_bh()
 
 
 /*
 /*
  * Prevent the compiler from merging or refetching accesses.  The compiler
  * Prevent the compiler from merging or refetching accesses.  The compiler
@@ -293,21 +195,52 @@ extern struct lockdep_map rcu_lock_map;
  * In "classic RCU", these two guarantees happen to be one and
  * In "classic RCU", these two guarantees happen to be one and
  * the same, but can differ in realtime RCU implementations.
  * the same, but can differ in realtime RCU implementations.
  */
  */
-#define synchronize_sched() synchronize_rcu()
+#define synchronize_sched() __synchronize_sched()
 
 
-extern void rcu_init(void);
-extern void rcu_check_callbacks(int cpu, int user);
-extern void rcu_restart_cpu(int cpu);
-extern long rcu_batches_completed(void);
-extern long rcu_batches_completed_bh(void);
+/**
+ * call_rcu - Queue an RCU callback for invocation after a grace period.
+ * @head: structure to be used for queueing the RCU updates.
+ * @func: actual update function to be invoked after the grace period
+ *
+ * The update function will be invoked some time after a full grace
+ * period elapses, in other words after all currently executing RCU
+ * read-side critical sections have completed.  RCU read-side critical
+ * sections are delimited by rcu_read_lock() and rcu_read_unlock(),
+ * and may be nested.
+ */
+extern void call_rcu(struct rcu_head *head,
+			      void (*func)(struct rcu_head *head));
 
 
-/* Exported interfaces */
-extern void FASTCALL(call_rcu(struct rcu_head *head, 
-				void (*func)(struct rcu_head *head)));
-extern void FASTCALL(call_rcu_bh(struct rcu_head *head,
-				void (*func)(struct rcu_head *head)));
+/**
+ * call_rcu_bh - Queue an RCU callback for invocation after a quicker grace period.
+ * @head: structure to be used for queueing the RCU updates.
+ * @func: actual update function to be invoked after the grace period
+ *
+ * The update function will be invoked some time after a full grace
+ * period elapses, in other words after all currently executing RCU
+ * read-side critical sections have completed. call_rcu_bh() assumes
+ * that the read-side critical sections end on completion of a softirq
+ * handler. This means that read-side critical sections in process
+ * context must not be interrupted by softirqs. This interface is to be
+ * used when most of the read-side critical sections are in softirq context.
+ * RCU read-side critical sections are delimited by :
+ *  - rcu_read_lock() and  rcu_read_unlock(), if in interrupt context.
+ *  OR
+ *  - rcu_read_lock_bh() and rcu_read_unlock_bh(), if in process context.
+ *  These may be nested.
+ */
+extern void call_rcu_bh(struct rcu_head *head,
+			void (*func)(struct rcu_head *head));
+
+/* Exported common interfaces */
 extern void synchronize_rcu(void);
 extern void synchronize_rcu(void);
 extern void rcu_barrier(void);
 extern void rcu_barrier(void);
+extern long rcu_batches_completed(void);
+extern long rcu_batches_completed_bh(void);
+
+/* Internal to kernel */
+extern void rcu_init(void);
+extern int rcu_needs_cpu(int cpu);
 
 
 #endif /* __KERNEL__ */
 #endif /* __KERNEL__ */
 #endif /* __LINUX_RCUPDATE_H */
 #endif /* __LINUX_RCUPDATE_H */

+ 86 - 0
include/linux/rcupreempt.h

@@ -0,0 +1,86 @@
+/*
+ * Read-Copy Update mechanism for mutual exclusion (RT implementation)
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+ *
+ * Copyright (C) IBM Corporation, 2006
+ *
+ * Author:  Paul McKenney <paulmck@us.ibm.com>
+ *
+ * Based on the original work by Paul McKenney <paul.mckenney@us.ibm.com>
+ * and inputs from Rusty Russell, Andrea Arcangeli and Andi Kleen.
+ * Papers:
+ * http://www.rdrop.com/users/paulmck/paper/rclockpdcsproof.pdf
+ * http://lse.sourceforge.net/locking/rclock_OLS.2001.05.01c.sc.pdf (OLS2001)
+ *
+ * For detailed explanation of Read-Copy Update mechanism see -
+ * 		Documentation/RCU
+ *
+ */
+
+#ifndef __LINUX_RCUPREEMPT_H
+#define __LINUX_RCUPREEMPT_H
+
+#ifdef __KERNEL__
+
+#include <linux/cache.h>
+#include <linux/spinlock.h>
+#include <linux/threads.h>
+#include <linux/percpu.h>
+#include <linux/cpumask.h>
+#include <linux/seqlock.h>
+
+/*
+ * In this implementation the quiescent-state hooks expand to nothing,
+ * and call_rcu_bh() is a plain alias for call_rcu().
+ *
+ * NOTE(review): rcupdate.h includes this header before its own
+ * "extern void call_rcu_bh(...)" prototype, so that prototype is
+ * macro-expanded into a repeated declaration of call_rcu() -- confirm
+ * this is intended.
+ */
+#define rcu_qsctr_inc(cpu)
+#define rcu_bh_qsctr_inc(cpu)
+#define call_rcu_bh(head, rcu) call_rcu(head, rcu)
+
+extern void __rcu_read_lock(void);
+extern void __rcu_read_unlock(void);
+extern int rcu_pending(int cpu);
+extern int rcu_needs_cpu(int cpu);
+
+/*
+ * Softirq-disabling read-side markers: the lock form enters the RCU
+ * read-side section and then disables bottom halves; the unlock form
+ * undoes both in the reverse order.
+ *
+ * Each body is wrapped in do { } while (0) so that the macro expands to
+ * a single statement and stays valid in an unbraced if/else, matching
+ * the rcuclassic.h definitions of the same macros.
+ */
+#define __rcu_read_lock_bh() \
+	do { rcu_read_lock(); local_bh_disable(); } while (0)
+#define __rcu_read_unlock_bh() \
+	do { local_bh_enable(); rcu_read_unlock(); } while (0)
+
+extern void __synchronize_sched(void);
+
+extern void __rcu_init(void);
+extern void rcu_check_callbacks(int cpu, int user);
+extern void rcu_restart_cpu(int cpu);
+extern long rcu_batches_completed(void);
+
+/*
+ * Return the number of RCU batches processed thus far. Useful for debug
+ * and statistics. The _bh variant is identical to straight RCU.
+ */
+static inline long rcu_batches_completed_bh(void)
+{
+	return rcu_batches_completed();
+}
+
+#ifdef CONFIG_RCU_TRACE
+struct rcupreempt_trace;
+extern long *rcupreempt_flipctr(int cpu);
+extern long rcupreempt_data_completed(void);
+extern int rcupreempt_flip_flag(int cpu);
+extern int rcupreempt_mb_flag(int cpu);
+extern char *rcupreempt_try_flip_state_name(void);
+extern struct rcupreempt_trace *rcupreempt_trace_cpu(int cpu);
+#endif
+
+struct softirq_action;
+
+#endif /* __KERNEL__ */
+#endif /* __LINUX_RCUPREEMPT_H */

+ 99 - 0
include/linux/rcupreempt_trace.h

@@ -0,0 +1,99 @@
+/*
+ * Read-Copy Update mechanism for mutual exclusion (RT implementation)
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+ *
+ * Copyright (C) IBM Corporation, 2006
+ *
+ * Author:  Paul McKenney <paulmck@us.ibm.com>
+ *
+ * Based on the original work by Paul McKenney <paul.mckenney@us.ibm.com>
+ * and inputs from Rusty Russell, Andrea Arcangeli and Andi Kleen.
+ * Papers:
+ * http://www.rdrop.com/users/paulmck/paper/rclockpdcsproof.pdf
+ * http://lse.sourceforge.net/locking/rclock_OLS.2001.05.01c.sc.pdf (OLS2001)
+ *
+ * For detailed explanation of the Preemptible Read-Copy Update mechanism see -
+ * 		 http://lwn.net/Articles/253651/
+ */
+
+#ifndef __LINUX_RCUPREEMPT_TRACE_H
+#define __LINUX_RCUPREEMPT_TRACE_H
+
+#ifdef __KERNEL__
+#include <linux/types.h>
+#include <linux/kernel.h>
+
+#include <asm/atomic.h>
+
+/*
+ * PREEMPT_RCU data structures.
+ */
+
+struct rcupreempt_trace {
+	long		next_length;
+	long		next_add;
+	long		wait_length;
+	long		wait_add;
+	long		done_length;
+	long		done_add;
+	long		done_remove;
+	atomic_t	done_invoked;
+	long		rcu_check_callbacks;
+	atomic_t	rcu_try_flip_1;
+	atomic_t	rcu_try_flip_e1;
+	long		rcu_try_flip_i1;
+	long		rcu_try_flip_ie1;
+	long		rcu_try_flip_g1;
+	long		rcu_try_flip_a1;
+	long		rcu_try_flip_ae1;
+	long		rcu_try_flip_a2;
+	long		rcu_try_flip_z1;
+	long		rcu_try_flip_ze1;
+	long		rcu_try_flip_z2;
+	long		rcu_try_flip_m1;
+	long		rcu_try_flip_me1;
+	long		rcu_try_flip_m2;
+};
+
+/*
+ * RCU_TRACE(fn, arg) calls fn(arg) when CONFIG_RCU_TRACE is set and
+ * expands to nothing otherwise.  The enabled form already ends in a
+ * semicolon, so a use followed by ';' yields an extra empty statement.
+ */
+#ifdef CONFIG_RCU_TRACE
+#define RCU_TRACE(fn, arg) 	fn(arg);
+#else
+#define RCU_TRACE(fn, arg)
+#endif
+
+extern void rcupreempt_trace_move2done(struct rcupreempt_trace *trace);
+extern void rcupreempt_trace_move2wait(struct rcupreempt_trace *trace);
+extern void rcupreempt_trace_try_flip_1(struct rcupreempt_trace *trace);
+extern void rcupreempt_trace_try_flip_e1(struct rcupreempt_trace *trace);
+extern void rcupreempt_trace_try_flip_i1(struct rcupreempt_trace *trace);
+extern void rcupreempt_trace_try_flip_ie1(struct rcupreempt_trace *trace);
+extern void rcupreempt_trace_try_flip_g1(struct rcupreempt_trace *trace);
+extern void rcupreempt_trace_try_flip_a1(struct rcupreempt_trace *trace);
+extern void rcupreempt_trace_try_flip_ae1(struct rcupreempt_trace *trace);
+extern void rcupreempt_trace_try_flip_a2(struct rcupreempt_trace *trace);
+extern void rcupreempt_trace_try_flip_z1(struct rcupreempt_trace *trace);
+extern void rcupreempt_trace_try_flip_ze1(struct rcupreempt_trace *trace);
+extern void rcupreempt_trace_try_flip_z2(struct rcupreempt_trace *trace);
+extern void rcupreempt_trace_try_flip_m1(struct rcupreempt_trace *trace);
+extern void rcupreempt_trace_try_flip_me1(struct rcupreempt_trace *trace);
+extern void rcupreempt_trace_try_flip_m2(struct rcupreempt_trace *trace);
+extern void rcupreempt_trace_check_callbacks(struct rcupreempt_trace *trace);
+extern void rcupreempt_trace_done_remove(struct rcupreempt_trace *trace);
+extern void rcupreempt_trace_invoke(struct rcupreempt_trace *trace);
+extern void rcupreempt_trace_next_add(struct rcupreempt_trace *trace);
+
+#endif /* __KERNEL__ */
+#endif /* __LINUX_RCUPREEMPT_TRACE_H */

+ 77 - 6
include/linux/sched.h

@@ -78,7 +78,6 @@ struct sched_param {
 #include <linux/proportions.h>
 #include <linux/proportions.h>
 #include <linux/seccomp.h>
 #include <linux/seccomp.h>
 #include <linux/rcupdate.h>
 #include <linux/rcupdate.h>
-#include <linux/futex.h>
 #include <linux/rtmutex.h>
 #include <linux/rtmutex.h>
 
 
 #include <linux/time.h>
 #include <linux/time.h>
@@ -88,11 +87,13 @@ struct sched_param {
 #include <linux/hrtimer.h>
 #include <linux/hrtimer.h>
 #include <linux/task_io_accounting.h>
 #include <linux/task_io_accounting.h>
 #include <linux/kobject.h>
 #include <linux/kobject.h>
+#include <linux/latencytop.h>
 
 
 #include <asm/processor.h>
 #include <asm/processor.h>
 
 
 struct exec_domain;
 struct exec_domain;
 struct futex_pi_state;
 struct futex_pi_state;
+struct robust_list_head;
 struct bio;
 struct bio;
 
 
 /*
 /*
@@ -230,6 +231,8 @@ static inline int select_nohz_load_balancer(int cpu)
 }
 }
 #endif
 #endif
 
 
+extern unsigned long rt_needs_cpu(int cpu);
+
 /*
 /*
  * Only dump TASK_* tasks. (0 for all tasks)
  * Only dump TASK_* tasks. (0 for all tasks)
  */
  */
@@ -257,13 +260,19 @@ extern void trap_init(void);
 extern void account_process_tick(struct task_struct *task, int user);
 extern void account_process_tick(struct task_struct *task, int user);
 extern void update_process_times(int user);
 extern void update_process_times(int user);
 extern void scheduler_tick(void);
 extern void scheduler_tick(void);
+extern void hrtick_resched(void);
+
+extern void sched_show_task(struct task_struct *p);
 
 
 #ifdef CONFIG_DETECT_SOFTLOCKUP
 #ifdef CONFIG_DETECT_SOFTLOCKUP
 extern void softlockup_tick(void);
 extern void softlockup_tick(void);
 extern void spawn_softlockup_task(void);
 extern void spawn_softlockup_task(void);
 extern void touch_softlockup_watchdog(void);
 extern void touch_softlockup_watchdog(void);
 extern void touch_all_softlockup_watchdogs(void);
 extern void touch_all_softlockup_watchdogs(void);
-extern int softlockup_thresh;
+extern unsigned long  softlockup_thresh;
+extern unsigned long sysctl_hung_task_check_count;
+extern unsigned long sysctl_hung_task_timeout_secs;
+extern unsigned long sysctl_hung_task_warnings;
 #else
 #else
 static inline void softlockup_tick(void)
 static inline void softlockup_tick(void)
 {
 {
@@ -822,6 +831,7 @@ struct sched_class {
 	void (*enqueue_task) (struct rq *rq, struct task_struct *p, int wakeup);
 	void (*enqueue_task) (struct rq *rq, struct task_struct *p, int wakeup);
 	void (*dequeue_task) (struct rq *rq, struct task_struct *p, int sleep);
 	void (*dequeue_task) (struct rq *rq, struct task_struct *p, int sleep);
 	void (*yield_task) (struct rq *rq);
 	void (*yield_task) (struct rq *rq);
+	int  (*select_task_rq)(struct task_struct *p, int sync);
 
 
 	void (*check_preempt_curr) (struct rq *rq, struct task_struct *p);
 	void (*check_preempt_curr) (struct rq *rq, struct task_struct *p);
 
 
@@ -837,11 +847,25 @@ struct sched_class {
 	int (*move_one_task) (struct rq *this_rq, int this_cpu,
 	int (*move_one_task) (struct rq *this_rq, int this_cpu,
 			      struct rq *busiest, struct sched_domain *sd,
 			      struct rq *busiest, struct sched_domain *sd,
 			      enum cpu_idle_type idle);
 			      enum cpu_idle_type idle);
+	void (*pre_schedule) (struct rq *this_rq, struct task_struct *task);
+	void (*post_schedule) (struct rq *this_rq);
+	void (*task_wake_up) (struct rq *this_rq, struct task_struct *task);
 #endif
 #endif
 
 
 	void (*set_curr_task) (struct rq *rq);
 	void (*set_curr_task) (struct rq *rq);
-	void (*task_tick) (struct rq *rq, struct task_struct *p);
+	void (*task_tick) (struct rq *rq, struct task_struct *p, int queued);
 	void (*task_new) (struct rq *rq, struct task_struct *p);
 	void (*task_new) (struct rq *rq, struct task_struct *p);
+	void (*set_cpus_allowed)(struct task_struct *p, cpumask_t *newmask);
+
+	void (*join_domain)(struct rq *rq);
+	void (*leave_domain)(struct rq *rq);
+
+	void (*switched_from) (struct rq *this_rq, struct task_struct *task,
+			       int running);
+	void (*switched_to) (struct rq *this_rq, struct task_struct *task,
+			     int running);
+	void (*prio_changed) (struct rq *this_rq, struct task_struct *task,
+			     int oldprio, int running);
 };
 };
 
 
 struct load_weight {
 struct load_weight {
@@ -871,6 +895,8 @@ struct sched_entity {
 #ifdef CONFIG_SCHEDSTATS
 #ifdef CONFIG_SCHEDSTATS
 	u64			wait_start;
 	u64			wait_start;
 	u64			wait_max;
 	u64			wait_max;
+	u64			wait_count;
+	u64			wait_sum;
 
 
 	u64			sleep_start;
 	u64			sleep_start;
 	u64			sleep_max;
 	u64			sleep_max;
@@ -909,6 +935,21 @@ struct sched_entity {
 #endif
 #endif
 };
 };
 
 
+/*
+ * Scheduling state embedded in task_struct as the 'rt' member; it holds
+ * the run_list and time_slice fields.  The group-related pointers exist
+ * only when CONFIG_FAIR_GROUP_SCHED is set.
+ */
+struct sched_rt_entity {
+	struct list_head run_list;
+	unsigned int time_slice;
+	unsigned long timeout;
+	int nr_cpus_allowed;
+
+#ifdef CONFIG_FAIR_GROUP_SCHED
+	struct sched_rt_entity	*parent;
+	/* rq on which this entity is (to be) queued: */
+	struct rt_rq		*rt_rq;
+	/* rq "owned" by this entity/group: */
+	struct rt_rq		*my_q;
+#endif
+};
+
 struct task_struct {
 struct task_struct {
 	volatile long state;	/* -1 unrunnable, 0 runnable, >0 stopped */
 	volatile long state;	/* -1 unrunnable, 0 runnable, >0 stopped */
 	void *stack;
 	void *stack;
@@ -925,9 +966,9 @@ struct task_struct {
 #endif
 #endif
 
 
 	int prio, static_prio, normal_prio;
 	int prio, static_prio, normal_prio;
-	struct list_head run_list;
 	const struct sched_class *sched_class;
 	const struct sched_class *sched_class;
 	struct sched_entity se;
 	struct sched_entity se;
+	struct sched_rt_entity rt;
 
 
 #ifdef CONFIG_PREEMPT_NOTIFIERS
 #ifdef CONFIG_PREEMPT_NOTIFIERS
 	/* list of struct preempt_notifier: */
 	/* list of struct preempt_notifier: */
@@ -951,7 +992,11 @@ struct task_struct {
 
 
 	unsigned int policy;
 	unsigned int policy;
 	cpumask_t cpus_allowed;
 	cpumask_t cpus_allowed;
-	unsigned int time_slice;
+
+#ifdef CONFIG_PREEMPT_RCU
+	int rcu_read_lock_nesting;
+	int rcu_flipctr_idx;
+#endif /* #ifdef CONFIG_PREEMPT_RCU */
 
 
 #if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT)
 #if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT)
 	struct sched_info sched_info;
 	struct sched_info sched_info;
@@ -1041,6 +1086,11 @@ struct task_struct {
 /* ipc stuff */
 /* ipc stuff */
 	struct sysv_sem sysvsem;
 	struct sysv_sem sysvsem;
 #endif
 #endif
+#ifdef CONFIG_DETECT_SOFTLOCKUP
+/* hung task detection */
+	unsigned long last_switch_timestamp;
+	unsigned long last_switch_count;
+#endif
 /* CPU-specific state of this task */
 /* CPU-specific state of this task */
 	struct thread_struct thread;
 	struct thread_struct thread;
 /* filesystem information */
 /* filesystem information */
@@ -1173,6 +1223,10 @@ struct task_struct {
 	int make_it_fail;
 	int make_it_fail;
 #endif
 #endif
 	struct prop_local_single dirties;
 	struct prop_local_single dirties;
+#ifdef CONFIG_LATENCYTOP
+	int latency_record_count;
+	struct latency_record latency_record[LT_SAVECOUNT];
+#endif
 };
 };
 
 
 /*
 /*
@@ -1453,6 +1507,12 @@ extern unsigned int sysctl_sched_child_runs_first;
 extern unsigned int sysctl_sched_features;
 extern unsigned int sysctl_sched_features;
 extern unsigned int sysctl_sched_migration_cost;
 extern unsigned int sysctl_sched_migration_cost;
 extern unsigned int sysctl_sched_nr_migrate;
 extern unsigned int sysctl_sched_nr_migrate;
+extern unsigned int sysctl_sched_rt_period;
+extern unsigned int sysctl_sched_rt_ratio;
+#if defined(CONFIG_FAIR_GROUP_SCHED) && defined(CONFIG_SMP)
+extern unsigned int sysctl_sched_min_bal_int_shares;
+extern unsigned int sysctl_sched_max_bal_int_shares;
+#endif
 
 
 int sched_nr_latency_handler(struct ctl_table *table, int write,
 int sched_nr_latency_handler(struct ctl_table *table, int write,
 		struct file *file, void __user *buffer, size_t *length,
 		struct file *file, void __user *buffer, size_t *length,
@@ -1845,7 +1905,18 @@ static inline int need_resched(void)
  * cond_resched_lock() will drop the spinlock before scheduling,
  * cond_resched_lock() will drop the spinlock before scheduling,
  * cond_resched_softirq() will enable bhs before scheduling.
  * cond_resched_softirq() will enable bhs before scheduling.
  */
  */
-extern int cond_resched(void);
+/*
+ * With CONFIG_PREEMPT, cond_resched() is an inline stub that always
+ * returns 0; otherwise it forwards to the out-of-line _cond_resched()
+ * and returns that function's result.
+ */
+#ifdef CONFIG_PREEMPT
+static inline int cond_resched(void)
+{
+	return 0;
+}
+#else
+extern int _cond_resched(void);
+static inline int cond_resched(void)
+{
+	return _cond_resched();
+}
+#endif
 extern int cond_resched_lock(spinlock_t * lock);
 extern int cond_resched_lock(spinlock_t * lock);
 extern int cond_resched_softirq(void);
 extern int cond_resched_softirq(void);
 
 

+ 1 - 13
include/linux/smp_lock.h

@@ -17,22 +17,10 @@ extern void __lockfunc __release_kernel_lock(void);
 		__release_kernel_lock();	\
 		__release_kernel_lock();	\
 } while (0)
 } while (0)
 
 
-/*
- * Non-SMP kernels will never block on the kernel lock,
- * so we are better off returning a constant zero from
- * reacquire_kernel_lock() so that the compiler can see
- * it at compile-time.
- */
-#if defined(CONFIG_SMP) && !defined(CONFIG_PREEMPT_BKL)
-# define return_value_on_smp return
-#else
-# define return_value_on_smp
-#endif
-
 static inline int reacquire_kernel_lock(struct task_struct *task)
 static inline int reacquire_kernel_lock(struct task_struct *task)
 {
 {
 	if (unlikely(task->lock_depth >= 0))
 	if (unlikely(task->lock_depth >= 0))
-		return_value_on_smp __reacquire_kernel_lock();
+		return __reacquire_kernel_lock();
 	return 0;
 	return 0;
 }
 }
 
 

+ 3 - 0
include/linux/stacktrace.h

@@ -9,10 +9,13 @@ struct stack_trace {
 };
 };
 
 
 extern void save_stack_trace(struct stack_trace *trace);
 extern void save_stack_trace(struct stack_trace *trace);
+extern void save_stack_trace_tsk(struct task_struct *tsk,
+				struct stack_trace *trace);
 
 
 extern void print_stack_trace(struct stack_trace *trace, int spaces);
 extern void print_stack_trace(struct stack_trace *trace, int spaces);
 #else
 #else
 # define save_stack_trace(trace)			do { } while (0)
 # define save_stack_trace(trace)			do { } while (0)
+# define save_stack_trace_tsk(tsk, trace)		do { } while (0)
 # define print_stack_trace(trace, spaces)		do { } while (0)
 # define print_stack_trace(trace, spaces)		do { } while (0)
 #endif
 #endif
 
 

+ 4 - 1
include/linux/topology.h

@@ -5,7 +5,7 @@
  *
  *
  * Copyright (C) 2002, IBM Corp.
  * Copyright (C) 2002, IBM Corp.
  *
  *
- * All rights reserved.          
+ * All rights reserved.
  *
  *
  * This program is free software; you can redistribute it and/or modify
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License as published by
  * it under the terms of the GNU General Public License as published by
@@ -103,6 +103,7 @@
 	.forkexec_idx		= 0,			\
 	.forkexec_idx		= 0,			\
 	.flags			= SD_LOAD_BALANCE	\
 	.flags			= SD_LOAD_BALANCE	\
 				| SD_BALANCE_NEWIDLE	\
 				| SD_BALANCE_NEWIDLE	\
+				| SD_BALANCE_FORK	\
 				| SD_BALANCE_EXEC	\
 				| SD_BALANCE_EXEC	\
 				| SD_WAKE_AFFINE	\
 				| SD_WAKE_AFFINE	\
 				| SD_WAKE_IDLE		\
 				| SD_WAKE_IDLE		\
@@ -134,6 +135,7 @@
 	.forkexec_idx		= 1,			\
 	.forkexec_idx		= 1,			\
 	.flags			= SD_LOAD_BALANCE	\
 	.flags			= SD_LOAD_BALANCE	\
 				| SD_BALANCE_NEWIDLE	\
 				| SD_BALANCE_NEWIDLE	\
+				| SD_BALANCE_FORK	\
 				| SD_BALANCE_EXEC	\
 				| SD_BALANCE_EXEC	\
 				| SD_WAKE_AFFINE	\
 				| SD_WAKE_AFFINE	\
 				| SD_WAKE_IDLE		\
 				| SD_WAKE_IDLE		\
@@ -165,6 +167,7 @@
 	.forkexec_idx		= 1,			\
 	.forkexec_idx		= 1,			\
 	.flags			= SD_LOAD_BALANCE	\
 	.flags			= SD_LOAD_BALANCE	\
 				| SD_BALANCE_NEWIDLE	\
 				| SD_BALANCE_NEWIDLE	\
+				| SD_BALANCE_FORK	\
 				| SD_BALANCE_EXEC	\
 				| SD_BALANCE_EXEC	\
 				| SD_WAKE_AFFINE	\
 				| SD_WAKE_AFFINE	\
 				| BALANCE_FOR_PKG_POWER,\
 				| BALANCE_FOR_PKG_POWER,\

+ 28 - 0
init/Kconfig

@@ -763,3 +763,31 @@ source "block/Kconfig"
 
 
 config PREEMPT_NOTIFIERS
 config PREEMPT_NOTIFIERS
 	bool
 	bool
+
+choice
+	prompt "RCU implementation type:"
+	default CLASSIC_RCU
+
+config CLASSIC_RCU
+	bool "Classic RCU"
+	help
+	  This option selects the classic RCU implementation that is
+	  designed for best read-side performance on non-realtime
+	  systems.
+
+	  Say Y if you are unsure.
+
+config PREEMPT_RCU
+	bool "Preemptible RCU"
+	depends on PREEMPT
+	help
+	  This option reduces the latency of the kernel by making certain
+	  RCU sections preemptible. Normally RCU code is non-preemptible; if
+	  this option is selected, then read-only RCU sections become
+	  preemptible. This helps latency, but may expose bugs due to
+	  now-naive assumptions about each RCU read-side critical section
+	  remaining on a given CPU through its execution.
+
+	  Say N if you are unsure.
+
+endchoice

+ 1 - 0
init/main.c

@@ -607,6 +607,7 @@ asmlinkage void __init start_kernel(void)
 	vfs_caches_init_early();
 	vfs_caches_init_early();
 	cpuset_init_early();
 	cpuset_init_early();
 	mem_init();
 	mem_init();
+	cpu_hotplug_init();
 	kmem_cache_init();
 	kmem_cache_init();
 	setup_per_cpu_pageset();
 	setup_per_cpu_pageset();
 	numa_policy_init();
 	numa_policy_init();

+ 2 - 0
kernel/Kconfig.hz

@@ -54,3 +54,5 @@ config HZ
 	default 300 if HZ_300
 	default 300 if HZ_300
 	default 1000 if HZ_1000
 	default 1000 if HZ_1000
 
 
+config SCHED_HRTICK
+	def_bool HIGH_RES_TIMERS && X86

+ 6 - 7
kernel/Kconfig.preempt

@@ -52,14 +52,13 @@ config PREEMPT
 
 
 endchoice
 endchoice
 
 
-config PREEMPT_BKL
-	bool "Preempt The Big Kernel Lock"
-	depends on SMP || PREEMPT
+config RCU_TRACE
+	bool "Enable tracing for RCU - currently stats in debugfs"
+	select DEBUG_FS
 	default y
 	default y
 	help
 	help
-	  This option reduces the latency of the kernel by making the
-	  big kernel lock preemptible.
+	  This option provides tracing in RCU which presents stats
+	  in debugfs for debugging RCU implementation.
 
 
-	  Say Y here if you are building a kernel for a desktop system.
+	  Say Y here if you want to enable RCU tracing
 	  Say N if you are unsure.
 	  Say N if you are unsure.
-

+ 6 - 0
kernel/Makefile

@@ -52,11 +52,17 @@ obj-$(CONFIG_DETECT_SOFTLOCKUP) += softlockup.o
 obj-$(CONFIG_GENERIC_HARDIRQS) += irq/
 obj-$(CONFIG_GENERIC_HARDIRQS) += irq/
 obj-$(CONFIG_SECCOMP) += seccomp.o
 obj-$(CONFIG_SECCOMP) += seccomp.o
 obj-$(CONFIG_RCU_TORTURE_TEST) += rcutorture.o
 obj-$(CONFIG_RCU_TORTURE_TEST) += rcutorture.o
+obj-$(CONFIG_CLASSIC_RCU) += rcuclassic.o
+obj-$(CONFIG_PREEMPT_RCU) += rcupreempt.o
+ifeq ($(CONFIG_PREEMPT_RCU),y)
+obj-$(CONFIG_RCU_TRACE) += rcupreempt_trace.o
+endif
 obj-$(CONFIG_RELAY) += relay.o
 obj-$(CONFIG_RELAY) += relay.o
 obj-$(CONFIG_SYSCTL) += utsname_sysctl.o
 obj-$(CONFIG_SYSCTL) += utsname_sysctl.o
 obj-$(CONFIG_TASK_DELAY_ACCT) += delayacct.o
 obj-$(CONFIG_TASK_DELAY_ACCT) += delayacct.o
 obj-$(CONFIG_TASKSTATS) += taskstats.o tsacct.o
 obj-$(CONFIG_TASKSTATS) += taskstats.o tsacct.o
 obj-$(CONFIG_MARKERS) += marker.o
 obj-$(CONFIG_MARKERS) += marker.o
+obj-$(CONFIG_LATENCYTOP) += latencytop.o
 
 
 ifneq ($(CONFIG_SCHED_NO_NO_OMIT_FRAME_POINTER),y)
 ifneq ($(CONFIG_SCHED_NO_NO_OMIT_FRAME_POINTER),y)
 # According to Alan Modra <alan@linuxcare.com.au>, the -fno-omit-frame-pointer is
 # According to Alan Modra <alan@linuxcare.com.au>, the -fno-omit-frame-pointer is

+ 115 - 49
kernel/cpu.c

@@ -15,9 +15,8 @@
 #include <linux/stop_machine.h>
 #include <linux/stop_machine.h>
 #include <linux/mutex.h>
 #include <linux/mutex.h>
 
 
-/* This protects CPUs going up and down... */
+/* Serializes the updates to cpu_online_map, cpu_present_map */
 static DEFINE_MUTEX(cpu_add_remove_lock);
 static DEFINE_MUTEX(cpu_add_remove_lock);
-static DEFINE_MUTEX(cpu_bitmask_lock);
 
 
 static __cpuinitdata RAW_NOTIFIER_HEAD(cpu_chain);
 static __cpuinitdata RAW_NOTIFIER_HEAD(cpu_chain);
 
 
@@ -26,52 +25,123 @@ static __cpuinitdata RAW_NOTIFIER_HEAD(cpu_chain);
  */
  */
 static int cpu_hotplug_disabled;
 static int cpu_hotplug_disabled;
 
 
-#ifdef CONFIG_HOTPLUG_CPU
+static struct {
+	struct task_struct *active_writer;
+	struct mutex lock; /* Synchronizes accesses to refcount, */
+	/*
+	 * Also blocks the new readers during
+	 * an ongoing cpu hotplug operation.
+	 */
+	int refcount;
+	wait_queue_head_t writer_queue;
+} cpu_hotplug;
 
 
-/* Crappy recursive lock-takers in cpufreq! Complain loudly about idiots */
-static struct task_struct *recursive;
-static int recursive_depth;
+#define writer_exists() (cpu_hotplug.active_writer != NULL)
 
 
-void lock_cpu_hotplug(void)
+void __init cpu_hotplug_init(void)
 {
 {
-	struct task_struct *tsk = current;
-
-	if (tsk == recursive) {
-		static int warnings = 10;
-		if (warnings) {
-			printk(KERN_ERR "Lukewarm IQ detected in hotplug locking\n");
-			WARN_ON(1);
-			warnings--;
-		}
-		recursive_depth++;
+	cpu_hotplug.active_writer = NULL;
+	mutex_init(&cpu_hotplug.lock);
+	cpu_hotplug.refcount = 0;
+	init_waitqueue_head(&cpu_hotplug.writer_queue);
+}
+
+#ifdef CONFIG_HOTPLUG_CPU
+
+void get_online_cpus(void)
+{
+	might_sleep();
+	if (cpu_hotplug.active_writer == current)
 		return;
 		return;
-	}
-	mutex_lock(&cpu_bitmask_lock);
-	recursive = tsk;
+	mutex_lock(&cpu_hotplug.lock);
+	cpu_hotplug.refcount++;
+	mutex_unlock(&cpu_hotplug.lock);
+
 }
 }
-EXPORT_SYMBOL_GPL(lock_cpu_hotplug);
+EXPORT_SYMBOL_GPL(get_online_cpus);
 
 
-void unlock_cpu_hotplug(void)
+void put_online_cpus(void)
 {
 {
-	WARN_ON(recursive != current);
-	if (recursive_depth) {
-		recursive_depth--;
+	if (cpu_hotplug.active_writer == current)
 		return;
 		return;
-	}
-	recursive = NULL;
-	mutex_unlock(&cpu_bitmask_lock);
+	mutex_lock(&cpu_hotplug.lock);
+	cpu_hotplug.refcount--;
+
+	if (unlikely(writer_exists()) && !cpu_hotplug.refcount)
+		wake_up(&cpu_hotplug.writer_queue);
+
+	mutex_unlock(&cpu_hotplug.lock);
+
 }
 }
-EXPORT_SYMBOL_GPL(unlock_cpu_hotplug);
+EXPORT_SYMBOL_GPL(put_online_cpus);
 
 
 #endif	/* CONFIG_HOTPLUG_CPU */
 #endif	/* CONFIG_HOTPLUG_CPU */
 
 
+/*
+ * The following two APIs must be used when attempting
+ * to serialize the updates to cpu_online_map, cpu_present_map.
+ */
+void cpu_maps_update_begin(void)
+{
+	mutex_lock(&cpu_add_remove_lock);
+}
+
+void cpu_maps_update_done(void)
+{
+	mutex_unlock(&cpu_add_remove_lock);
+}
+
+/*
+ * This ensures that the hotplug operation can begin only when the
+ * refcount goes to zero.
+ *
+ * Note that during a cpu-hotplug operation, the new readers, if any,
+ * will be blocked by the cpu_hotplug.lock
+ *
+ * Since cpu_hotplug_begin is always called after invoking
+ * cpu_maps_update_begin, we can be sure that only one writer is active.
+ *
+ * Note that theoretically, there is a possibility of a livelock:
+ * - Refcount goes to zero, last reader wakes up the sleeping
+ *   writer.
+ * - Last reader unlocks the cpu_hotplug.lock.
+ * - A new reader arrives at this moment, bumps up the refcount.
+ * - The writer acquires the cpu_hotplug.lock, finds the refcount
+ *   non-zero, and goes to sleep again.
+ *
+ * However, this is very difficult to achieve in practice since
+ * get_online_cpus() is not an API which is called all that often.
+ *
+ */
+static void cpu_hotplug_begin(void)
+{
+	DECLARE_WAITQUEUE(wait, current);
+
+	mutex_lock(&cpu_hotplug.lock);
+
+	cpu_hotplug.active_writer = current;
+	add_wait_queue_exclusive(&cpu_hotplug.writer_queue, &wait);
+	while (cpu_hotplug.refcount) {
+		set_current_state(TASK_UNINTERRUPTIBLE);
+		mutex_unlock(&cpu_hotplug.lock);
+		schedule();
+		mutex_lock(&cpu_hotplug.lock);
+	}
+	remove_wait_queue_locked(&cpu_hotplug.writer_queue, &wait);
+}
+
+static void cpu_hotplug_done(void)
+{
+	cpu_hotplug.active_writer = NULL;
+	mutex_unlock(&cpu_hotplug.lock);
+}
 /* Need to know about CPUs going up/down? */
 /* Need to know about CPUs going up/down? */
 int __cpuinit register_cpu_notifier(struct notifier_block *nb)
 int __cpuinit register_cpu_notifier(struct notifier_block *nb)
 {
 {
 	int ret;
 	int ret;
-	mutex_lock(&cpu_add_remove_lock);
+	cpu_maps_update_begin();
 	ret = raw_notifier_chain_register(&cpu_chain, nb);
 	ret = raw_notifier_chain_register(&cpu_chain, nb);
-	mutex_unlock(&cpu_add_remove_lock);
+	cpu_maps_update_done();
 	return ret;
 	return ret;
 }
 }
 
 
@@ -81,9 +151,9 @@ EXPORT_SYMBOL(register_cpu_notifier);
 
 
 void unregister_cpu_notifier(struct notifier_block *nb)
 void unregister_cpu_notifier(struct notifier_block *nb)
 {
 {
-	mutex_lock(&cpu_add_remove_lock);
+	cpu_maps_update_begin();
 	raw_notifier_chain_unregister(&cpu_chain, nb);
 	raw_notifier_chain_unregister(&cpu_chain, nb);
-	mutex_unlock(&cpu_add_remove_lock);
+	cpu_maps_update_done();
 }
 }
 EXPORT_SYMBOL(unregister_cpu_notifier);
 EXPORT_SYMBOL(unregister_cpu_notifier);
 
 
@@ -147,7 +217,7 @@ static int _cpu_down(unsigned int cpu, int tasks_frozen)
 	if (!cpu_online(cpu))
 	if (!cpu_online(cpu))
 		return -EINVAL;
 		return -EINVAL;
 
 
-	raw_notifier_call_chain(&cpu_chain, CPU_LOCK_ACQUIRE, hcpu);
+	cpu_hotplug_begin();
 	err = __raw_notifier_call_chain(&cpu_chain, CPU_DOWN_PREPARE | mod,
 	err = __raw_notifier_call_chain(&cpu_chain, CPU_DOWN_PREPARE | mod,
 					hcpu, -1, &nr_calls);
 					hcpu, -1, &nr_calls);
 	if (err == NOTIFY_BAD) {
 	if (err == NOTIFY_BAD) {
@@ -166,9 +236,7 @@ static int _cpu_down(unsigned int cpu, int tasks_frozen)
 	cpu_clear(cpu, tmp);
 	cpu_clear(cpu, tmp);
 	set_cpus_allowed(current, tmp);
 	set_cpus_allowed(current, tmp);
 
 
-	mutex_lock(&cpu_bitmask_lock);
 	p = __stop_machine_run(take_cpu_down, &tcd_param, cpu);
 	p = __stop_machine_run(take_cpu_down, &tcd_param, cpu);
-	mutex_unlock(&cpu_bitmask_lock);
 
 
 	if (IS_ERR(p) || cpu_online(cpu)) {
 	if (IS_ERR(p) || cpu_online(cpu)) {
 		/* CPU didn't die: tell everyone.  Can't complain. */
 		/* CPU didn't die: tell everyone.  Can't complain. */
@@ -202,7 +270,7 @@ out_thread:
 out_allowed:
 out_allowed:
 	set_cpus_allowed(current, old_allowed);
 	set_cpus_allowed(current, old_allowed);
 out_release:
 out_release:
-	raw_notifier_call_chain(&cpu_chain, CPU_LOCK_RELEASE, hcpu);
+	cpu_hotplug_done();
 	return err;
 	return err;
 }
 }
 
 
@@ -210,13 +278,13 @@ int cpu_down(unsigned int cpu)
 {
 {
 	int err = 0;
 	int err = 0;
 
 
-	mutex_lock(&cpu_add_remove_lock);
+	cpu_maps_update_begin();
 	if (cpu_hotplug_disabled)
 	if (cpu_hotplug_disabled)
 		err = -EBUSY;
 		err = -EBUSY;
 	else
 	else
 		err = _cpu_down(cpu, 0);
 		err = _cpu_down(cpu, 0);
 
 
-	mutex_unlock(&cpu_add_remove_lock);
+	cpu_maps_update_done();
 	return err;
 	return err;
 }
 }
 #endif /*CONFIG_HOTPLUG_CPU*/
 #endif /*CONFIG_HOTPLUG_CPU*/
@@ -231,7 +299,7 @@ static int __cpuinit _cpu_up(unsigned int cpu, int tasks_frozen)
 	if (cpu_online(cpu) || !cpu_present(cpu))
 	if (cpu_online(cpu) || !cpu_present(cpu))
 		return -EINVAL;
 		return -EINVAL;
 
 
-	raw_notifier_call_chain(&cpu_chain, CPU_LOCK_ACQUIRE, hcpu);
+	cpu_hotplug_begin();
 	ret = __raw_notifier_call_chain(&cpu_chain, CPU_UP_PREPARE | mod, hcpu,
 	ret = __raw_notifier_call_chain(&cpu_chain, CPU_UP_PREPARE | mod, hcpu,
 							-1, &nr_calls);
 							-1, &nr_calls);
 	if (ret == NOTIFY_BAD) {
 	if (ret == NOTIFY_BAD) {
@@ -243,9 +311,7 @@ static int __cpuinit _cpu_up(unsigned int cpu, int tasks_frozen)
 	}
 	}
 
 
 	/* Arch-specific enabling code. */
 	/* Arch-specific enabling code. */
-	mutex_lock(&cpu_bitmask_lock);
 	ret = __cpu_up(cpu);
 	ret = __cpu_up(cpu);
-	mutex_unlock(&cpu_bitmask_lock);
 	if (ret != 0)
 	if (ret != 0)
 		goto out_notify;
 		goto out_notify;
 	BUG_ON(!cpu_online(cpu));
 	BUG_ON(!cpu_online(cpu));
@@ -257,7 +323,7 @@ out_notify:
 	if (ret != 0)
 	if (ret != 0)
 		__raw_notifier_call_chain(&cpu_chain,
 		__raw_notifier_call_chain(&cpu_chain,
 				CPU_UP_CANCELED | mod, hcpu, nr_calls, NULL);
 				CPU_UP_CANCELED | mod, hcpu, nr_calls, NULL);
-	raw_notifier_call_chain(&cpu_chain, CPU_LOCK_RELEASE, hcpu);
+	cpu_hotplug_done();
 
 
 	return ret;
 	return ret;
 }
 }
@@ -275,13 +341,13 @@ int __cpuinit cpu_up(unsigned int cpu)
 		return -EINVAL;
 		return -EINVAL;
 	}
 	}
 
 
-	mutex_lock(&cpu_add_remove_lock);
+	cpu_maps_update_begin();
 	if (cpu_hotplug_disabled)
 	if (cpu_hotplug_disabled)
 		err = -EBUSY;
 		err = -EBUSY;
 	else
 	else
 		err = _cpu_up(cpu, 0);
 		err = _cpu_up(cpu, 0);
 
 
-	mutex_unlock(&cpu_add_remove_lock);
+	cpu_maps_update_done();
 	return err;
 	return err;
 }
 }
 
 
@@ -292,7 +358,7 @@ int disable_nonboot_cpus(void)
 {
 {
 	int cpu, first_cpu, error = 0;
 	int cpu, first_cpu, error = 0;
 
 
-	mutex_lock(&cpu_add_remove_lock);
+	cpu_maps_update_begin();
 	first_cpu = first_cpu(cpu_online_map);
 	first_cpu = first_cpu(cpu_online_map);
 	/* We take down all of the non-boot CPUs in one shot to avoid races
 	/* We take down all of the non-boot CPUs in one shot to avoid races
 	 * with the userspace trying to use the CPU hotplug at the same time
 	 * with the userspace trying to use the CPU hotplug at the same time
@@ -319,7 +385,7 @@ int disable_nonboot_cpus(void)
 	} else {
 	} else {
 		printk(KERN_ERR "Non-boot CPUs are not disabled\n");
 		printk(KERN_ERR "Non-boot CPUs are not disabled\n");
 	}
 	}
-	mutex_unlock(&cpu_add_remove_lock);
+	cpu_maps_update_done();
 	return error;
 	return error;
 }
 }
 
 
@@ -328,7 +394,7 @@ void enable_nonboot_cpus(void)
 	int cpu, error;
 	int cpu, error;
 
 
 	/* Allow everyone to use the CPU hotplug again */
 	/* Allow everyone to use the CPU hotplug again */
-	mutex_lock(&cpu_add_remove_lock);
+	cpu_maps_update_begin();
 	cpu_hotplug_disabled = 0;
 	cpu_hotplug_disabled = 0;
 	if (cpus_empty(frozen_cpus))
 	if (cpus_empty(frozen_cpus))
 		goto out;
 		goto out;
@@ -344,6 +410,6 @@ void enable_nonboot_cpus(void)
 	}
 	}
 	cpus_clear(frozen_cpus);
 	cpus_clear(frozen_cpus);
 out:
 out:
-	mutex_unlock(&cpu_add_remove_lock);
+	cpu_maps_update_done();
 }
 }
 #endif /* CONFIG_PM_SLEEP_SMP */
 #endif /* CONFIG_PM_SLEEP_SMP */

+ 7 - 7
kernel/cpuset.c

@@ -537,10 +537,10 @@ static int cpusets_overlap(struct cpuset *a, struct cpuset *b)
  *
  *
  * Call with cgroup_mutex held.  May take callback_mutex during
  * Call with cgroup_mutex held.  May take callback_mutex during
  * call due to the kfifo_alloc() and kmalloc() calls.  May nest
  * call due to the kfifo_alloc() and kmalloc() calls.  May nest
- * a call to the lock_cpu_hotplug()/unlock_cpu_hotplug() pair.
+ * a call to the get_online_cpus()/put_online_cpus() pair.
  * Must not be called holding callback_mutex, because we must not
  * Must not be called holding callback_mutex, because we must not
- * call lock_cpu_hotplug() while holding callback_mutex.  Elsewhere
- * the kernel nests callback_mutex inside lock_cpu_hotplug() calls.
+ * call get_online_cpus() while holding callback_mutex.  Elsewhere
+ * the kernel nests callback_mutex inside get_online_cpus() calls.
  * So the reverse nesting would risk an ABBA deadlock.
  * So the reverse nesting would risk an ABBA deadlock.
  *
  *
  * The three key local variables below are:
  * The three key local variables below are:
@@ -691,9 +691,9 @@ restart:
 
 
 rebuild:
 rebuild:
 	/* Have scheduler rebuild sched domains */
 	/* Have scheduler rebuild sched domains */
-	lock_cpu_hotplug();
+	get_online_cpus();
 	partition_sched_domains(ndoms, doms);
 	partition_sched_domains(ndoms, doms);
-	unlock_cpu_hotplug();
+	put_online_cpus();
 
 
 done:
 done:
 	if (q && !IS_ERR(q))
 	if (q && !IS_ERR(q))
@@ -1617,10 +1617,10 @@ static struct cgroup_subsys_state *cpuset_create(
  *
  *
  * If the cpuset being removed has its flag 'sched_load_balance'
  * If the cpuset being removed has its flag 'sched_load_balance'
  * enabled, then simulate turning sched_load_balance off, which
  * enabled, then simulate turning sched_load_balance off, which
- * will call rebuild_sched_domains().  The lock_cpu_hotplug()
+ * will call rebuild_sched_domains().  The get_online_cpus()
  * call in rebuild_sched_domains() must not be made while holding
  * call in rebuild_sched_domains() must not be made while holding
  * callback_mutex.  Elsewhere the kernel nests callback_mutex inside
  * callback_mutex.  Elsewhere the kernel nests callback_mutex inside
- * lock_cpu_hotplug() calls.  So the reverse nesting would risk an
+ * get_online_cpus() calls.  So the reverse nesting would risk an
  * ABBA deadlock.
  * ABBA deadlock.
  */
  */
 
 

+ 11 - 0
kernel/fork.c

@@ -1045,6 +1045,10 @@ static struct task_struct *copy_process(unsigned long clone_flags,
 	copy_flags(clone_flags, p);
 	copy_flags(clone_flags, p);
 	INIT_LIST_HEAD(&p->children);
 	INIT_LIST_HEAD(&p->children);
 	INIT_LIST_HEAD(&p->sibling);
 	INIT_LIST_HEAD(&p->sibling);
+#ifdef CONFIG_PREEMPT_RCU
+	p->rcu_read_lock_nesting = 0;
+	p->rcu_flipctr_idx = 0;
+#endif /* #ifdef CONFIG_PREEMPT_RCU */
 	p->vfork_done = NULL;
 	p->vfork_done = NULL;
 	spin_lock_init(&p->alloc_lock);
 	spin_lock_init(&p->alloc_lock);
 
 
@@ -1059,6 +1063,11 @@ static struct task_struct *copy_process(unsigned long clone_flags,
 	p->prev_utime = cputime_zero;
 	p->prev_utime = cputime_zero;
 	p->prev_stime = cputime_zero;
 	p->prev_stime = cputime_zero;
 
 
+#ifdef CONFIG_DETECT_SOFTLOCKUP
+	p->last_switch_count = 0;
+	p->last_switch_timestamp = 0;
+#endif
+
 #ifdef CONFIG_TASK_XACCT
 #ifdef CONFIG_TASK_XACCT
 	p->rchar = 0;		/* I/O counter: bytes read */
 	p->rchar = 0;		/* I/O counter: bytes read */
 	p->wchar = 0;		/* I/O counter: bytes written */
 	p->wchar = 0;		/* I/O counter: bytes written */
@@ -1196,6 +1205,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
 #ifdef TIF_SYSCALL_EMU
 #ifdef TIF_SYSCALL_EMU
 	clear_tsk_thread_flag(p, TIF_SYSCALL_EMU);
 	clear_tsk_thread_flag(p, TIF_SYSCALL_EMU);
 #endif
 #endif
+	clear_all_latency_tracing(p);
 
 
 	/* Our parent execution domain becomes current domain
 	/* Our parent execution domain becomes current domain
 	   These must match for thread signalling to apply */
 	   These must match for thread signalling to apply */
@@ -1237,6 +1247,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
 	 * parent's CPU). This avoids alot of nasty races.
 	 * parent's CPU). This avoids alot of nasty races.
 	 */
 	 */
 	p->cpus_allowed = current->cpus_allowed;
 	p->cpus_allowed = current->cpus_allowed;
+	p->rt.nr_cpus_allowed = current->rt.nr_cpus_allowed;
 	if (unlikely(!cpu_isset(task_cpu(p), p->cpus_allowed) ||
 	if (unlikely(!cpu_isset(task_cpu(p), p->cpus_allowed) ||
 			!cpu_online(task_cpu(p))))
 			!cpu_online(task_cpu(p))))
 		set_task_cpu(p, smp_processor_id());
 		set_task_cpu(p, smp_processor_id());

+ 142 - 114
kernel/hrtimer.c

@@ -325,6 +325,22 @@ unsigned long ktime_divns(const ktime_t kt, s64 div)
 }
 }
 #endif /* BITS_PER_LONG >= 64 */
 #endif /* BITS_PER_LONG >= 64 */
 
 
+/*
+ * Check, whether the timer is on the callback pending list
+ */
+static inline int hrtimer_cb_pending(const struct hrtimer *timer)
+{
+	return timer->state & HRTIMER_STATE_PENDING;
+}
+
+/*
+ * Remove a timer from the callback pending list
+ */
+static inline void hrtimer_remove_cb_pending(struct hrtimer *timer)
+{
+	list_del_init(&timer->cb_entry);
+}
+
 /* High resolution timer related functions */
 /* High resolution timer related functions */
 #ifdef CONFIG_HIGH_RES_TIMERS
 #ifdef CONFIG_HIGH_RES_TIMERS
 
 
@@ -493,22 +509,6 @@ void hres_timers_resume(void)
 	retrigger_next_event(NULL);
 	retrigger_next_event(NULL);
 }
 }
 
 
-/*
- * Check, whether the timer is on the callback pending list
- */
-static inline int hrtimer_cb_pending(const struct hrtimer *timer)
-{
-	return timer->state & HRTIMER_STATE_PENDING;
-}
-
-/*
- * Remove a timer from the callback pending list
- */
-static inline void hrtimer_remove_cb_pending(struct hrtimer *timer)
-{
-	list_del_init(&timer->cb_entry);
-}
-
 /*
 /*
  * Initialize the high resolution related parts of cpu_base
  * Initialize the high resolution related parts of cpu_base
  */
  */
@@ -516,7 +516,6 @@ static inline void hrtimer_init_hres(struct hrtimer_cpu_base *base)
 {
 {
 	base->expires_next.tv64 = KTIME_MAX;
 	base->expires_next.tv64 = KTIME_MAX;
 	base->hres_active = 0;
 	base->hres_active = 0;
-	INIT_LIST_HEAD(&base->cb_pending);
 }
 }
 
 
 /*
 /*
@@ -524,7 +523,6 @@ static inline void hrtimer_init_hres(struct hrtimer_cpu_base *base)
  */
  */
 static inline void hrtimer_init_timer_hres(struct hrtimer *timer)
 static inline void hrtimer_init_timer_hres(struct hrtimer *timer)
 {
 {
-	INIT_LIST_HEAD(&timer->cb_entry);
 }
 }
 
 
 /*
 /*
@@ -618,10 +616,13 @@ static inline int hrtimer_enqueue_reprogram(struct hrtimer *timer,
 {
 {
 	return 0;
 	return 0;
 }
 }
-static inline int hrtimer_cb_pending(struct hrtimer *timer) { return 0; }
-static inline void hrtimer_remove_cb_pending(struct hrtimer *timer) { }
 static inline void hrtimer_init_hres(struct hrtimer_cpu_base *base) { }
 static inline void hrtimer_init_hres(struct hrtimer_cpu_base *base) { }
 static inline void hrtimer_init_timer_hres(struct hrtimer *timer) { }
 static inline void hrtimer_init_timer_hres(struct hrtimer *timer) { }
+static inline int hrtimer_reprogram(struct hrtimer *timer,
+				    struct hrtimer_clock_base *base)
+{
+	return 0;
+}
 
 
 #endif /* CONFIG_HIGH_RES_TIMERS */
 #endif /* CONFIG_HIGH_RES_TIMERS */
 
 
@@ -1001,6 +1002,7 @@ void hrtimer_init(struct hrtimer *timer, clockid_t clock_id,
 		clock_id = CLOCK_MONOTONIC;
 		clock_id = CLOCK_MONOTONIC;
 
 
 	timer->base = &cpu_base->clock_base[clock_id];
 	timer->base = &cpu_base->clock_base[clock_id];
+	INIT_LIST_HEAD(&timer->cb_entry);
 	hrtimer_init_timer_hres(timer);
 	hrtimer_init_timer_hres(timer);
 
 
 #ifdef CONFIG_TIMER_STATS
 #ifdef CONFIG_TIMER_STATS
@@ -1030,6 +1032,85 @@ int hrtimer_get_res(const clockid_t which_clock, struct timespec *tp)
 }
 }
 EXPORT_SYMBOL_GPL(hrtimer_get_res);
 EXPORT_SYMBOL_GPL(hrtimer_get_res);
 
 
+static void run_hrtimer_pending(struct hrtimer_cpu_base *cpu_base)
+{
+	spin_lock_irq(&cpu_base->lock);
+
+	while (!list_empty(&cpu_base->cb_pending)) {
+		enum hrtimer_restart (*fn)(struct hrtimer *);
+		struct hrtimer *timer;
+		int restart;
+
+		timer = list_entry(cpu_base->cb_pending.next,
+				   struct hrtimer, cb_entry);
+
+		timer_stats_account_hrtimer(timer);
+
+		fn = timer->function;
+		__remove_hrtimer(timer, timer->base, HRTIMER_STATE_CALLBACK, 0);
+		spin_unlock_irq(&cpu_base->lock);
+
+		restart = fn(timer);
+
+		spin_lock_irq(&cpu_base->lock);
+
+		timer->state &= ~HRTIMER_STATE_CALLBACK;
+		if (restart == HRTIMER_RESTART) {
+			BUG_ON(hrtimer_active(timer));
+			/*
+			 * Enqueue the timer, allow reprogramming of the event
+			 * device
+			 */
+			enqueue_hrtimer(timer, timer->base, 1);
+		} else if (hrtimer_active(timer)) {
+			/*
+			 * If the timer was rearmed on another CPU, reprogram
+			 * the event device.
+			 */
+			if (timer->base->first == &timer->node)
+				hrtimer_reprogram(timer, timer->base);
+		}
+	}
+	spin_unlock_irq(&cpu_base->lock);
+}
+
+static void __run_hrtimer(struct hrtimer *timer)
+{
+	struct hrtimer_clock_base *base = timer->base;
+	struct hrtimer_cpu_base *cpu_base = base->cpu_base;
+	enum hrtimer_restart (*fn)(struct hrtimer *);
+	int restart;
+
+	__remove_hrtimer(timer, base, HRTIMER_STATE_CALLBACK, 0);
+	timer_stats_account_hrtimer(timer);
+
+	fn = timer->function;
+	if (timer->cb_mode == HRTIMER_CB_IRQSAFE_NO_SOFTIRQ) {
+		/*
+		 * Used for scheduler timers, avoid lock inversion with
+		 * rq->lock and tasklist_lock.
+		 *
+		 * These timers are required to deal with enqueue expiry
+		 * themselves and are not allowed to migrate.
+		 */
+		spin_unlock(&cpu_base->lock);
+		restart = fn(timer);
+		spin_lock(&cpu_base->lock);
+	} else
+		restart = fn(timer);
+
+	/*
+	 * Note: We clear the CALLBACK bit after enqueue_hrtimer to avoid
+	 * reprogramming of the event hardware. This happens at the end of this
+	 * function anyway.
+	 */
+	if (restart != HRTIMER_NORESTART) {
+		BUG_ON(timer->state != HRTIMER_STATE_CALLBACK);
+		enqueue_hrtimer(timer, base, 0);
+	}
+	timer->state &= ~HRTIMER_STATE_CALLBACK;
+}
+
 #ifdef CONFIG_HIGH_RES_TIMERS
 #ifdef CONFIG_HIGH_RES_TIMERS
 
 
 /*
 /*
@@ -1087,21 +1168,7 @@ void hrtimer_interrupt(struct clock_event_device *dev)
 				continue;
 				continue;
 			}
 			}
 
 
-			__remove_hrtimer(timer, base,
-					 HRTIMER_STATE_CALLBACK, 0);
-			timer_stats_account_hrtimer(timer);
-
-			/*
-			 * Note: We clear the CALLBACK bit after
-			 * enqueue_hrtimer to avoid reprogramming of
-			 * the event hardware. This happens at the end
-			 * of this function anyway.
-			 */
-			if (timer->function(timer) != HRTIMER_NORESTART) {
-				BUG_ON(timer->state != HRTIMER_STATE_CALLBACK);
-				enqueue_hrtimer(timer, base, 0);
-			}
-			timer->state &= ~HRTIMER_STATE_CALLBACK;
+			__run_hrtimer(timer);
 		}
 		}
 		spin_unlock(&cpu_base->lock);
 		spin_unlock(&cpu_base->lock);
 		base++;
 		base++;
@@ -1122,52 +1189,41 @@ void hrtimer_interrupt(struct clock_event_device *dev)
 
 
 static void run_hrtimer_softirq(struct softirq_action *h)
 static void run_hrtimer_softirq(struct softirq_action *h)
 {
 {
-	struct hrtimer_cpu_base *cpu_base = &__get_cpu_var(hrtimer_bases);
-
-	spin_lock_irq(&cpu_base->lock);
-
-	while (!list_empty(&cpu_base->cb_pending)) {
-		enum hrtimer_restart (*fn)(struct hrtimer *);
-		struct hrtimer *timer;
-		int restart;
-
-		timer = list_entry(cpu_base->cb_pending.next,
-				   struct hrtimer, cb_entry);
+	run_hrtimer_pending(&__get_cpu_var(hrtimer_bases));
+}
 
 
-		timer_stats_account_hrtimer(timer);
+#endif	/* CONFIG_HIGH_RES_TIMERS */
 
 
-		fn = timer->function;
-		__remove_hrtimer(timer, timer->base, HRTIMER_STATE_CALLBACK, 0);
-		spin_unlock_irq(&cpu_base->lock);
+/*
+ * Called from timer softirq every jiffy, expire hrtimers:
+ *
+ * For HRT it's the fallback code to run the softirq in the timer
+ * softirq context in case the hrtimer initialization failed or has
+ * not been done yet.
+ */
+void hrtimer_run_pending(void)
+{
+	struct hrtimer_cpu_base *cpu_base = &__get_cpu_var(hrtimer_bases);
 
 
-		restart = fn(timer);
+	if (hrtimer_hres_active())
+		return;
 
 
-		spin_lock_irq(&cpu_base->lock);
+	/*
+	 * This _is_ ugly: We have to check in the softirq context,
+	 * whether we can switch to highres and / or nohz mode. The
+	 * clocksource switch happens in the timer interrupt with
+	 * xtime_lock held. Notification from there only sets the
+	 * check bit in the tick_oneshot code, otherwise we might
+	 * deadlock vs. xtime_lock.
+	 */
+	if (tick_check_oneshot_change(!hrtimer_is_hres_enabled()))
+		hrtimer_switch_to_hres();
 
 
-		timer->state &= ~HRTIMER_STATE_CALLBACK;
-		if (restart == HRTIMER_RESTART) {
-			BUG_ON(hrtimer_active(timer));
-			/*
-			 * Enqueue the timer, allow reprogramming of the event
-			 * device
-			 */
-			enqueue_hrtimer(timer, timer->base, 1);
-		} else if (hrtimer_active(timer)) {
-			/*
-			 * If the timer was rearmed on another CPU, reprogram
-			 * the event device.
-			 */
-			if (timer->base->first == &timer->node)
-				hrtimer_reprogram(timer, timer->base);
-		}
-	}
-	spin_unlock_irq(&cpu_base->lock);
+	run_hrtimer_pending(cpu_base);
 }
 }
 
 
-#endif	/* CONFIG_HIGH_RES_TIMERS */
-
 /*
 /*
- * Expire the per base hrtimer-queue:
+ * Called from hardirq context every jiffy
  */
  */
 static inline void run_hrtimer_queue(struct hrtimer_cpu_base *cpu_base,
 static inline void run_hrtimer_queue(struct hrtimer_cpu_base *cpu_base,
 				     int index)
 				     int index)
@@ -1181,46 +1237,27 @@ static inline void run_hrtimer_queue(struct hrtimer_cpu_base *cpu_base,
 	if (base->get_softirq_time)
 	if (base->get_softirq_time)
 		base->softirq_time = base->get_softirq_time();
 		base->softirq_time = base->get_softirq_time();
 
 
-	spin_lock_irq(&cpu_base->lock);
+	spin_lock(&cpu_base->lock);
 
 
 	while ((node = base->first)) {
 	while ((node = base->first)) {
 		struct hrtimer *timer;
 		struct hrtimer *timer;
-		enum hrtimer_restart (*fn)(struct hrtimer *);
-		int restart;
 
 
 		timer = rb_entry(node, struct hrtimer, node);
 		timer = rb_entry(node, struct hrtimer, node);
 		if (base->softirq_time.tv64 <= timer->expires.tv64)
 		if (base->softirq_time.tv64 <= timer->expires.tv64)
 			break;
 			break;
 
 
-#ifdef CONFIG_HIGH_RES_TIMERS
-		WARN_ON_ONCE(timer->cb_mode == HRTIMER_CB_IRQSAFE_NO_SOFTIRQ);
-#endif
-		timer_stats_account_hrtimer(timer);
-
-		fn = timer->function;
-		__remove_hrtimer(timer, base, HRTIMER_STATE_CALLBACK, 0);
-		spin_unlock_irq(&cpu_base->lock);
-
-		restart = fn(timer);
-
-		spin_lock_irq(&cpu_base->lock);
-
-		timer->state &= ~HRTIMER_STATE_CALLBACK;
-		if (restart != HRTIMER_NORESTART) {
-			BUG_ON(hrtimer_active(timer));
-			enqueue_hrtimer(timer, base, 0);
+		if (timer->cb_mode == HRTIMER_CB_SOFTIRQ) {
+			__remove_hrtimer(timer, base, HRTIMER_STATE_PENDING, 0);
+			list_add_tail(&timer->cb_entry,
+					&base->cpu_base->cb_pending);
+			continue;
 		}
 		}
+
+		__run_hrtimer(timer);
 	}
 	}
-	spin_unlock_irq(&cpu_base->lock);
+	spin_unlock(&cpu_base->lock);
 }
 }
 
 
-/*
- * Called from timer softirq every jiffy, expire hrtimers:
- *
- * For HRT its the fall back code to run the softirq in the timer
- * softirq context in case the hrtimer initialization failed or has
- * not been done yet.
- */
 void hrtimer_run_queues(void)
 void hrtimer_run_queues(void)
 {
 {
 	struct hrtimer_cpu_base *cpu_base = &__get_cpu_var(hrtimer_bases);
 	struct hrtimer_cpu_base *cpu_base = &__get_cpu_var(hrtimer_bases);
@@ -1229,18 +1266,6 @@ void hrtimer_run_queues(void)
 	if (hrtimer_hres_active())
 	if (hrtimer_hres_active())
 		return;
 		return;
 
 
-	/*
-	 * This _is_ ugly: We have to check in the softirq context,
-	 * whether we can switch to highres and / or nohz mode. The
-	 * clocksource switch happens in the timer interrupt with
-	 * xtime_lock held. Notification from there only sets the
-	 * check bit in the tick_oneshot code, otherwise we might
-	 * deadlock vs. xtime_lock.
-	 */
-	if (tick_check_oneshot_change(!hrtimer_is_hres_enabled()))
-		if (hrtimer_switch_to_hres())
-			return;
-
 	hrtimer_get_softirq_time(cpu_base);
 	hrtimer_get_softirq_time(cpu_base);
 
 
 	for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++)
 	for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++)
@@ -1268,7 +1293,7 @@ void hrtimer_init_sleeper(struct hrtimer_sleeper *sl, struct task_struct *task)
 	sl->timer.function = hrtimer_wakeup;
 	sl->timer.function = hrtimer_wakeup;
 	sl->task = task;
 	sl->task = task;
 #ifdef CONFIG_HIGH_RES_TIMERS
 #ifdef CONFIG_HIGH_RES_TIMERS
-	sl->timer.cb_mode = HRTIMER_CB_IRQSAFE_NO_RESTART;
+	sl->timer.cb_mode = HRTIMER_CB_IRQSAFE_NO_SOFTIRQ;
 #endif
 #endif
 }
 }
 
 
@@ -1279,6 +1304,8 @@ static int __sched do_nanosleep(struct hrtimer_sleeper *t, enum hrtimer_mode mod
 	do {
 	do {
 		set_current_state(TASK_INTERRUPTIBLE);
 		set_current_state(TASK_INTERRUPTIBLE);
 		hrtimer_start(&t->timer, t->timer.expires, mode);
 		hrtimer_start(&t->timer, t->timer.expires, mode);
+		if (!hrtimer_active(&t->timer))
+			t->task = NULL;
 
 
 		if (likely(t->task))
 		if (likely(t->task))
 			schedule();
 			schedule();
@@ -1389,6 +1416,7 @@ static void __cpuinit init_hrtimers_cpu(int cpu)
 	for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++)
 	for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++)
 		cpu_base->clock_base[i].cpu_base = cpu_base;
 		cpu_base->clock_base[i].cpu_base = cpu_base;
 
 
+	INIT_LIST_HEAD(&cpu_base->cb_pending);
 	hrtimer_init_hres(cpu_base);
 	hrtimer_init_hres(cpu_base);
 }
 }
 
 

+ 11 - 1
kernel/kthread.c

@@ -15,6 +15,8 @@
 #include <linux/mutex.h>
 #include <linux/mutex.h>
 #include <asm/semaphore.h>
 #include <asm/semaphore.h>
 
 
+#define KTHREAD_NICE_LEVEL (-5)
+
 static DEFINE_SPINLOCK(kthread_create_lock);
 static DEFINE_SPINLOCK(kthread_create_lock);
 static LIST_HEAD(kthread_create_list);
 static LIST_HEAD(kthread_create_list);
 struct task_struct *kthreadd_task;
 struct task_struct *kthreadd_task;
@@ -94,10 +96,18 @@ static void create_kthread(struct kthread_create_info *create)
 	if (pid < 0) {
 	if (pid < 0) {
 		create->result = ERR_PTR(pid);
 		create->result = ERR_PTR(pid);
 	} else {
 	} else {
+		struct sched_param param = { .sched_priority = 0 };
 		wait_for_completion(&create->started);
 		wait_for_completion(&create->started);
 		read_lock(&tasklist_lock);
 		read_lock(&tasklist_lock);
 		create->result = find_task_by_pid(pid);
 		create->result = find_task_by_pid(pid);
 		read_unlock(&tasklist_lock);
 		read_unlock(&tasklist_lock);
+		/*
+		 * root may have changed our (kthreadd's) priority or CPU mask.
+		 * The kernel thread should not inherit these properties.
+		 */
+		sched_setscheduler(create->result, SCHED_NORMAL, &param);
+		set_user_nice(create->result, KTHREAD_NICE_LEVEL);
+		set_cpus_allowed(create->result, CPU_MASK_ALL);
 	}
 	}
 	complete(&create->done);
 	complete(&create->done);
 }
 }
@@ -221,7 +231,7 @@ int kthreadd(void *unused)
 	/* Setup a clean context for our children to inherit. */
 	/* Setup a clean context for our children to inherit. */
 	set_task_comm(tsk, "kthreadd");
 	set_task_comm(tsk, "kthreadd");
 	ignore_signals(tsk);
 	ignore_signals(tsk);
-	set_user_nice(tsk, -5);
+	set_user_nice(tsk, KTHREAD_NICE_LEVEL);
 	set_cpus_allowed(tsk, CPU_MASK_ALL);
 	set_cpus_allowed(tsk, CPU_MASK_ALL);
 
 
 	current->flags |= PF_NOFREEZE;
 	current->flags |= PF_NOFREEZE;

+ 239 - 0
kernel/latencytop.c

@@ -0,0 +1,239 @@
+/*
+ * latencytop.c: Latency display infrastructure
+ *
+ * (C) Copyright 2008 Intel Corporation
+ * Author: Arjan van de Ven <arjan@linux.intel.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; version 2
+ * of the License.
+ */
+#include <linux/latencytop.h>
+#include <linux/kallsyms.h>
+#include <linux/seq_file.h>
+#include <linux/notifier.h>
+#include <linux/spinlock.h>
+#include <linux/proc_fs.h>
+#include <linux/module.h>
+#include <linux/sched.h>
+#include <linux/list.h>
+#include <linux/slab.h>
+#include <linux/stacktrace.h>
+
+/*
+ * Taken (with interrupts disabled) around every update in this file of the
+ * global latency_record[] table and of the per-task latency records.
+ */
+static DEFINE_SPINLOCK(latency_lock);
+
+/* Number of slots in the system-wide latency table below. */
+#define MAXLR 128
+static struct latency_record latency_record[MAXLR];
+
+/* While zero, the accounting and per-task clearing paths return immediately. */
+int latencytop_enabled;
+
+/*
+ * Wipe the latency records stored in task @p and reset its record count.
+ * Does nothing while latencytop is disabled.
+ */
+void clear_all_latency_tracing(struct task_struct *p)
+{
+	unsigned long irqflags;
+
+	if (latencytop_enabled) {
+		spin_lock_irqsave(&latency_lock, irqflags);
+		p->latency_record_count = 0;
+		memset(&p->latency_record, 0, sizeof(p->latency_record));
+		spin_unlock_irqrestore(&latency_lock, irqflags);
+	}
+}
+
+/* Zero every slot of the system-wide latency table under latency_lock. */
+static void clear_global_latency_tracing(void)
+{
+	unsigned long irqflags;
+
+	spin_lock_irqsave(&latency_lock, irqflags);
+	memset(latency_record, 0, sizeof(latency_record));
+	spin_unlock_irqrestore(&latency_lock, irqflags);
+}
+
+/*
+ * Fold the sample @lat into the system-wide latency table.
+ *
+ * If a slot already holds the same backtrace, its count, total time and
+ * maximum are updated.  Otherwise the sample is copied into the first
+ * empty slot; when no slot is empty the sample is dropped.
+ *
+ * Samples from tasks without an mm (kernel threads) are ignored.
+ * The only caller in this file holds latency_lock around the call.
+ */
+static void __sched
+account_global_scheduler_latency(struct task_struct *tsk, struct latency_record *lat)
+{
+	int first_free = -1;
+	int i;
+
+	if (!latencytop_enabled)
+		return;
+
+	/* skip kernel threads for now */
+	if (!tsk->mm)
+		return;
+
+	for (i = 0; i < MAXLR; i++) {
+		struct latency_record *slot = &latency_record[i];
+		int q;
+		int same = 1;
+
+		/* Nothing stored: remember the first empty slot we see. */
+		if (!slot->backtrace[0]) {
+			if (first_free < 0)
+				first_free = i;
+			continue;
+		}
+		for (q = 0; q < LT_BACKTRACEDEPTH; q++) {
+			unsigned long entry = lat->backtrace[q];
+
+			/* First mismatch settles it; no need to scan on. */
+			if (slot->backtrace[q] != entry) {
+				same = 0;
+				break;
+			}
+			/* 0 and ULONG_MAX are treated as end-of-backtrace. */
+			if (entry == 0 || entry == ULONG_MAX)
+				break;
+		}
+		if (same) {
+			slot->count++;
+			slot->time += lat->time;
+			if (lat->time > slot->max)
+				slot->max = lat->time;
+			return;
+		}
+	}
+
+	/* Table full: drop the sample. */
+	if (first_free < 0)
+		return;
+
+	/* Allocate a new one: */
+	memcpy(&latency_record[first_free], lat, sizeof(struct latency_record));
+}
+
+/*
+ * Capture the stack trace of @tsk into lat->backtrace, storing at most
+ * LT_BACKTRACEDEPTH entries and skipping none.
+ */
+static inline void store_stacktrace(struct task_struct *tsk, struct latency_record *lat)
+{
+	struct stack_trace trace = {
+		.max_entries	= LT_BACKTRACEDEPTH,
+		.entries	= lat->backtrace,
+		.skip		= 0,
+	};
+
+	save_stack_trace_tsk(tsk, &trace);
+}
+
+/*
+ * Record one scheduler latency sample for @tsk.
+ *
+ * @tsk:   task that suffered the latency; its stack trace identifies the
+ *         sample
+ * @usecs: length of the latency (stored as the sample's time and max)
+ * @inter: non-zero for an interruptible wait; such waits longer than 5000
+ *         are ignored as user requested
+ *
+ * The sample is folded into the system-wide table and into the per-task
+ * records of @tsk, both under latency_lock.  Per-task records are kept
+ * packed in slots [0, latency_record_count): a sample whose backtrace
+ * matches an existing record updates it, otherwise it takes the next free
+ * slot.  Once all LT_SAVECOUNT slots are in use, new backtraces are dropped
+ * but existing records keep being updated.
+ *
+ * NOTE(review): readers of tsk->latency_record[] outside this file are
+ * presumed to skip empty slots rather than rely on the counter - confirm.
+ */
+void __sched
+account_scheduler_latency(struct task_struct *tsk, int usecs, int inter)
+{
+	unsigned long flags;
+	int i, q;
+	struct latency_record lat;
+
+	if (!latencytop_enabled)
+		return;
+
+	/* Long interruptible waits are generally user requested... */
+	if (inter && usecs > 5000)
+		return;
+
+	memset(&lat, 0, sizeof(lat));
+	lat.count = 1;
+	lat.time = usecs;
+	lat.max = usecs;
+	store_stacktrace(tsk, &lat);
+
+	spin_lock_irqsave(&latency_lock, flags);
+
+	account_global_scheduler_latency(tsk, &lat);
+
+	/* Look for an existing record with the same backtrace first. */
+	for (i = 0; i < tsk->latency_record_count; i++) {
+		struct latency_record *mylat;
+		int same = 1;
+
+		mylat = &tsk->latency_record[i];
+		for (q = 0; q < LT_BACKTRACEDEPTH; q++) {
+			unsigned long entry = lat.backtrace[q];
+
+			if (mylat->backtrace[q] != entry) {
+				same = 0;
+				break;
+			}
+			/* 0 and ULONG_MAX are treated as end-of-backtrace. */
+			if (entry == 0 || entry == ULONG_MAX)
+				break;
+		}
+		if (same) {
+			mylat->count++;
+			mylat->time += lat.time;
+			if (lat.time > mylat->max)
+				mylat->max = lat.time;
+			goto out_unlock;
+		}
+	}
+
+	/*
+	 * short term hack; once all LT_SAVECOUNT slots are used we stop;
+	 * in future we recycle:
+	 */
+	if (tsk->latency_record_count >= LT_SAVECOUNT)
+		goto out_unlock;
+
+	/*
+	 * Allocate a new one.  The counter only advances here, so it counts
+	 * distinct backtraces rather than samples.
+	 */
+	i = tsk->latency_record_count++;
+	memcpy(&tsk->latency_record[i], &lat, sizeof(struct latency_record));
+
+out_unlock:
+	spin_unlock_irqrestore(&latency_lock, flags);
+}
+
+/*
+ * seq_file show callback for the latency statistics file.
+ *
+ * Prints a version banner, then one line per used slot of the global
+ * latency table: count, total time and maximum, followed by the symbol
+ * name of each backtrace entry (offset stripped).  A backtrace entry of 0
+ * or ULONG_MAX ends the list.  Always returns 0.
+ *
+ * NOTE(review): the table is read here without taking latency_lock.
+ */
+static int lstats_show(struct seq_file *m, void *v)
+{
+	int i;
+
+	seq_puts(m, "Latency Top version : v0.1\n");
+
+	for (i = 0; i < MAXLR; i++) {
+		struct latency_record *lr = &latency_record[i];
+		int q;
+
+		/* Unused slot */
+		if (!lr->backtrace[0])
+			continue;
+
+		seq_printf(m, "%i %li %li ",
+			lr->count,
+			lr->time,
+			lr->max);
+		for (q = 0; q < LT_BACKTRACEDEPTH; q++) {
+			/*
+			 * sprint_symbol() emits "name+off/size [module]",
+			 * which does not fit in KSYM_NAME_LEN bytes; size
+			 * the buffer for the full string to avoid an
+			 * overflow.
+			 */
+			char sym[KSYM_SYMBOL_LEN];
+			char *c;
+
+			if (!lr->backtrace[q])
+				break;
+			if (lr->backtrace[q] == ULONG_MAX)
+				break;
+			sprint_symbol(sym, lr->backtrace[q]);
+			/* Keep only the bare symbol name. */
+			c = strchr(sym, '+');
+			if (c)
+				*c = 0;
+			seq_printf(m, "%s ", sym);
+		}
+		seq_printf(m, "\n");
+	}
+	return 0;
+}
+
+/*
+ * Write handler for the latency statistics file: any write clears the
+ * global latency table.  The written data itself is not examined; the
+ * full @count is reported as consumed.
+ */
+static ssize_t
+lstats_write(struct file *file, const char __user *buf, size_t count,
+	     loff_t *offs)
+{
+	clear_global_latency_tracing();
+
+	return count;
+}
+
+/* Open handler: bind lstats_show() to @filp via single_open(), no private data. */
+static int lstats_open(struct inode *inode, struct file *filp)
+{
+	return single_open(filp, lstats_show, NULL);
+}
+
+/*
+ * File operations for the latency statistics proc file: reads go through
+ * the seq_file helpers, writes reset the global table.
+ */
+static struct file_operations lstats_fops = {
+	.open		= lstats_open,
+	.release	= single_release,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.write		= lstats_write,
+};
+
+/*
+ * Create the "latency_stats" proc entry (mode 0644) and attach lstats_fops.
+ * Returns 0 on success, -ENOMEM if the entry cannot be created.
+ */
+static int __init init_lstats_procfs(void)
+{
+	struct proc_dir_entry *entry;
+
+	entry = create_proc_entry("latency_stats", 0644, NULL);
+	if (entry) {
+		entry->proc_fops = &lstats_fops;
+		return 0;
+	}
+
+	return -ENOMEM;
+}
+__initcall(init_lstats_procfs);

+ 11 - 1
kernel/lockdep.c

@@ -3206,7 +3206,11 @@ retry:
 
 
 EXPORT_SYMBOL_GPL(debug_show_all_locks);
 EXPORT_SYMBOL_GPL(debug_show_all_locks);
 
 
-void debug_show_held_locks(struct task_struct *task)
+/*
+ * Careful: only use this function if you are sure that
+ * the task cannot run in parallel!
+ */
+void __debug_show_held_locks(struct task_struct *task)
 {
 {
 	if (unlikely(!debug_locks)) {
 	if (unlikely(!debug_locks)) {
 		printk("INFO: lockdep is turned off.\n");
 		printk("INFO: lockdep is turned off.\n");
@@ -3214,6 +3218,12 @@ void debug_show_held_locks(struct task_struct *task)
 	}
 	}
 	lockdep_print_held_locks(task);
 	lockdep_print_held_locks(task);
 }
 }
+EXPORT_SYMBOL_GPL(__debug_show_held_locks);
+
+/*
+ * Print the locks held by @task; forwards directly to
+ * __debug_show_held_locks().
+ *
+ * NOTE(review): nothing here checks whether @task may be running in
+ * parallel, so the caveat on __debug_show_held_locks() presumably applies
+ * to callers of this function too - confirm.
+ */
+void debug_show_held_locks(struct task_struct *task)
+{
+		__debug_show_held_locks(task);
+}
 
 
 EXPORT_SYMBOL_GPL(debug_show_held_locks);
 EXPORT_SYMBOL_GPL(debug_show_held_locks);
 
 

+ 21 - 6
kernel/module.c

@@ -496,6 +496,8 @@ static struct module_attribute modinfo_##field = {                    \
 MODINFO_ATTR(version);
 MODINFO_ATTR(version);
 MODINFO_ATTR(srcversion);
 MODINFO_ATTR(srcversion);
 
 
+static char last_unloaded_module[MODULE_NAME_LEN+1];
+
 #ifdef CONFIG_MODULE_UNLOAD
 #ifdef CONFIG_MODULE_UNLOAD
 /* Init the unload section of the module. */
 /* Init the unload section of the module. */
 static void module_unload_init(struct module *mod)
 static void module_unload_init(struct module *mod)
@@ -719,6 +721,8 @@ sys_delete_module(const char __user *name_user, unsigned int flags)
 		mod->exit();
 		mod->exit();
 		mutex_lock(&module_mutex);
 		mutex_lock(&module_mutex);
 	}
 	}
+	/* Store the name of the last unloaded module for diagnostic purposes */
+	sprintf(last_unloaded_module, mod->name);
 	free_module(mod);
 	free_module(mod);
 
 
  out:
  out:
@@ -2357,21 +2361,30 @@ static void m_stop(struct seq_file *m, void *p)
 	mutex_unlock(&module_mutex);
 	mutex_unlock(&module_mutex);
 }
 }
 
 
-static char *taint_flags(unsigned int taints, char *buf)
+static char *module_flags(struct module *mod, char *buf)
 {
 {
 	int bx = 0;
 	int bx = 0;
 
 
-	if (taints) {
+	if (mod->taints ||
+	    mod->state == MODULE_STATE_GOING ||
+	    mod->state == MODULE_STATE_COMING) {
 		buf[bx++] = '(';
 		buf[bx++] = '(';
-		if (taints & TAINT_PROPRIETARY_MODULE)
+		if (mod->taints & TAINT_PROPRIETARY_MODULE)
 			buf[bx++] = 'P';
 			buf[bx++] = 'P';
-		if (taints & TAINT_FORCED_MODULE)
+		if (mod->taints & TAINT_FORCED_MODULE)
 			buf[bx++] = 'F';
 			buf[bx++] = 'F';
 		/*
 		/*
 		 * TAINT_FORCED_RMMOD: could be added.
 		 * TAINT_FORCED_RMMOD: could be added.
 		 * TAINT_UNSAFE_SMP, TAINT_MACHINE_CHECK, TAINT_BAD_PAGE don't
 		 * TAINT_UNSAFE_SMP, TAINT_MACHINE_CHECK, TAINT_BAD_PAGE don't
 		 * apply to modules.
 		 * apply to modules.
 		 */
 		 */
+
+		/* Show a - for module-is-being-unloaded */
+		if (mod->state == MODULE_STATE_GOING)
+			buf[bx++] = '-';
+		/* Show a + for module-is-being-loaded */
+		if (mod->state == MODULE_STATE_COMING)
+			buf[bx++] = '+';
 		buf[bx++] = ')';
 		buf[bx++] = ')';
 	}
 	}
 	buf[bx] = '\0';
 	buf[bx] = '\0';
@@ -2398,7 +2411,7 @@ static int m_show(struct seq_file *m, void *p)
 
 
 	/* Taints info */
 	/* Taints info */
 	if (mod->taints)
 	if (mod->taints)
-		seq_printf(m, " %s", taint_flags(mod->taints, buf));
+		seq_printf(m, " %s", module_flags(mod, buf));
 
 
 	seq_printf(m, "\n");
 	seq_printf(m, "\n");
 	return 0;
 	return 0;
@@ -2493,7 +2506,9 @@ void print_modules(void)
 
 
 	printk("Modules linked in:");
 	printk("Modules linked in:");
 	list_for_each_entry(mod, &modules, list)
 	list_for_each_entry(mod, &modules, list)
-		printk(" %s%s", mod->name, taint_flags(mod->taints, buf));
+		printk(" %s%s", mod->name, module_flags(mod, buf));
+	if (last_unloaded_module[0])
+		printk(" [last unloaded: %s]", last_unloaded_module);
 	printk("\n");
 	printk("\n");
 }
 }
 
 

+ 30 - 0
kernel/posix-cpu-timers.c

@@ -967,6 +967,7 @@ static void check_thread_timers(struct task_struct *tsk,
 {
 {
 	int maxfire;
 	int maxfire;
 	struct list_head *timers = tsk->cpu_timers;
 	struct list_head *timers = tsk->cpu_timers;
+	struct signal_struct *const sig = tsk->signal;
 
 
 	maxfire = 20;
 	maxfire = 20;
 	tsk->it_prof_expires = cputime_zero;
 	tsk->it_prof_expires = cputime_zero;
@@ -1011,6 +1012,35 @@ static void check_thread_timers(struct task_struct *tsk,
 		t->firing = 1;
 		t->firing = 1;
 		list_move_tail(&t->entry, firing);
 		list_move_tail(&t->entry, firing);
 	}
 	}
+
+	/*
+	 * Check for the special case thread timers.
+	 */
+	if (sig->rlim[RLIMIT_RTTIME].rlim_cur != RLIM_INFINITY) {
+		unsigned long hard = sig->rlim[RLIMIT_RTTIME].rlim_max;
+		unsigned long *soft = &sig->rlim[RLIMIT_RTTIME].rlim_cur;
+
+		if (hard != RLIM_INFINITY &&
+		    tsk->rt.timeout > DIV_ROUND_UP(hard, USEC_PER_SEC/HZ)) {
+			/*
+			 * At the hard limit, we just die.
+			 * No need to calculate anything else now.
+			 */
+			__group_send_sig_info(SIGKILL, SEND_SIG_PRIV, tsk);
+			return;
+		}
+		if (tsk->rt.timeout > DIV_ROUND_UP(*soft, USEC_PER_SEC/HZ)) {
+			/*
+			 * At the soft limit, send a SIGXCPU every second.
+			 */
+			if (sig->rlim[RLIMIT_RTTIME].rlim_cur
+			    < sig->rlim[RLIMIT_RTTIME].rlim_max) {
+				sig->rlim[RLIMIT_RTTIME].rlim_cur +=
+								USEC_PER_SEC;
+			}
+			__group_send_sig_info(SIGXCPU, SEND_SIG_PRIV, tsk);
+		}
+	}
 }
 }
 
 
 /*
 /*

+ 41 - 16
kernel/printk.c

@@ -573,11 +573,6 @@ static int __init printk_time_setup(char *str)
 
 
 __setup("time", printk_time_setup);
 __setup("time", printk_time_setup);
 
 
-__attribute__((weak)) unsigned long long printk_clock(void)
-{
-	return sched_clock();
-}
-
 /* Check if we have any console registered that can be called early in boot. */
 /* Check if we have any console registered that can be called early in boot. */
 static int have_callable_console(void)
 static int have_callable_console(void)
 {
 {
@@ -628,30 +623,57 @@ asmlinkage int printk(const char *fmt, ...)
 /* cpu currently holding logbuf_lock */
 /* cpu currently holding logbuf_lock */
 static volatile unsigned int printk_cpu = UINT_MAX;
 static volatile unsigned int printk_cpu = UINT_MAX;
 
 
+const char printk_recursion_bug_msg [] =
+			KERN_CRIT "BUG: recent printk recursion!\n";
+static int printk_recursion_bug;
+
 asmlinkage int vprintk(const char *fmt, va_list args)
 asmlinkage int vprintk(const char *fmt, va_list args)
 {
 {
+	static int log_level_unknown = 1;
+	static char printk_buf[1024];
+
 	unsigned long flags;
 	unsigned long flags;
-	int printed_len;
+	int printed_len = 0;
+	int this_cpu;
 	char *p;
 	char *p;
-	static char printk_buf[1024];
-	static int log_level_unknown = 1;
 
 
 	boot_delay_msec();
 	boot_delay_msec();
 
 
 	preempt_disable();
 	preempt_disable();
-	if (unlikely(oops_in_progress) && printk_cpu == smp_processor_id())
-		/* If a crash is occurring during printk() on this CPU,
-		 * make sure we can't deadlock */
-		zap_locks();
-
 	/* This stops the holder of console_sem just where we want him */
 	/* This stops the holder of console_sem just where we want him */
 	raw_local_irq_save(flags);
 	raw_local_irq_save(flags);
+	this_cpu = smp_processor_id();
+
+	/*
+	 * Ouch, printk recursed into itself!
+	 */
+	if (unlikely(printk_cpu == this_cpu)) {
+		/*
+		 * If a crash is occurring during printk() on this CPU,
+		 * then try to get the crash message out but make sure
+		 * we can't deadlock. Otherwise just return to avoid the
+		 * recursion and return - but flag the recursion so that
+		 * it can be printed at the next appropriate moment:
+		 */
+		if (!oops_in_progress) {
+			printk_recursion_bug = 1;
+			goto out_restore_irqs;
+		}
+		zap_locks();
+	}
+
 	lockdep_off();
 	lockdep_off();
 	spin_lock(&logbuf_lock);
 	spin_lock(&logbuf_lock);
-	printk_cpu = smp_processor_id();
+	printk_cpu = this_cpu;
 
 
+	if (printk_recursion_bug) {
+		printk_recursion_bug = 0;
+		strcpy(printk_buf, printk_recursion_bug_msg);
+		printed_len = sizeof(printk_recursion_bug_msg);
+	}
 	/* Emit the output into the temporary buffer */
 	/* Emit the output into the temporary buffer */
-	printed_len = vscnprintf(printk_buf, sizeof(printk_buf), fmt, args);
+	printed_len += vscnprintf(printk_buf + printed_len,
+				  sizeof(printk_buf), fmt, args);
 
 
 	/*
 	/*
 	 * Copy the output into log_buf.  If the caller didn't provide
 	 * Copy the output into log_buf.  If the caller didn't provide
@@ -680,7 +702,9 @@ asmlinkage int vprintk(const char *fmt, va_list args)
 					loglev_char = default_message_loglevel
 					loglev_char = default_message_loglevel
 						+ '0';
 						+ '0';
 				}
 				}
-				t = printk_clock();
+				t = 0;
+				if (system_state != SYSTEM_BOOTING)
+					t = ktime_to_ns(ktime_get());
 				nanosec_rem = do_div(t, 1000000000);
 				nanosec_rem = do_div(t, 1000000000);
 				tlen = sprintf(tbuf,
 				tlen = sprintf(tbuf,
 						"<%c>[%5lu.%06lu] ",
 						"<%c>[%5lu.%06lu] ",
@@ -744,6 +768,7 @@ asmlinkage int vprintk(const char *fmt, va_list args)
 		printk_cpu = UINT_MAX;
 		printk_cpu = UINT_MAX;
 		spin_unlock(&logbuf_lock);
 		spin_unlock(&logbuf_lock);
 		lockdep_on();
 		lockdep_on();
+out_restore_irqs:
 		raw_local_irq_restore(flags);
 		raw_local_irq_restore(flags);
 	}
 	}
 
 

+ 49 - 50
kernel/profile.c

@@ -52,7 +52,7 @@ static DEFINE_PER_CPU(int, cpu_profile_flip);
 static DEFINE_MUTEX(profile_flip_mutex);
 static DEFINE_MUTEX(profile_flip_mutex);
 #endif /* CONFIG_SMP */
 #endif /* CONFIG_SMP */
 
 
-static int __init profile_setup(char * str)
+static int __init profile_setup(char *str)
 {
 {
 	static char __initdata schedstr[] = "schedule";
 	static char __initdata schedstr[] = "schedule";
 	static char __initdata sleepstr[] = "sleep";
 	static char __initdata sleepstr[] = "sleep";
@@ -104,28 +104,28 @@ __setup("profile=", profile_setup);
 
 
 void __init profile_init(void)
 void __init profile_init(void)
 {
 {
-	if (!prof_on) 
+	if (!prof_on)
 		return;
 		return;
- 
+
 	/* only text is profiled */
 	/* only text is profiled */
 	prof_len = (_etext - _stext) >> prof_shift;
 	prof_len = (_etext - _stext) >> prof_shift;
 	prof_buffer = alloc_bootmem(prof_len*sizeof(atomic_t));
 	prof_buffer = alloc_bootmem(prof_len*sizeof(atomic_t));
 }
 }
 
 
 /* Profile event notifications */
 /* Profile event notifications */
- 
+
 #ifdef CONFIG_PROFILING
 #ifdef CONFIG_PROFILING
- 
+
 static BLOCKING_NOTIFIER_HEAD(task_exit_notifier);
 static BLOCKING_NOTIFIER_HEAD(task_exit_notifier);
 static ATOMIC_NOTIFIER_HEAD(task_free_notifier);
 static ATOMIC_NOTIFIER_HEAD(task_free_notifier);
 static BLOCKING_NOTIFIER_HEAD(munmap_notifier);
 static BLOCKING_NOTIFIER_HEAD(munmap_notifier);
- 
-void profile_task_exit(struct task_struct * task)
+
+void profile_task_exit(struct task_struct *task)
 {
 {
 	blocking_notifier_call_chain(&task_exit_notifier, 0, task);
 	blocking_notifier_call_chain(&task_exit_notifier, 0, task);
 }
 }
- 
-int profile_handoff_task(struct task_struct * task)
+
+int profile_handoff_task(struct task_struct *task)
 {
 {
 	int ret;
 	int ret;
 	ret = atomic_notifier_call_chain(&task_free_notifier, 0, task);
 	ret = atomic_notifier_call_chain(&task_free_notifier, 0, task);
@@ -137,52 +137,55 @@ void profile_munmap(unsigned long addr)
 	blocking_notifier_call_chain(&munmap_notifier, 0, (void *)addr);
 	blocking_notifier_call_chain(&munmap_notifier, 0, (void *)addr);
 }
 }
 
 
-int task_handoff_register(struct notifier_block * n)
+int task_handoff_register(struct notifier_block *n)
 {
 {
 	return atomic_notifier_chain_register(&task_free_notifier, n);
 	return atomic_notifier_chain_register(&task_free_notifier, n);
 }
 }
+EXPORT_SYMBOL_GPL(task_handoff_register);
 
 
-int task_handoff_unregister(struct notifier_block * n)
+int task_handoff_unregister(struct notifier_block *n)
 {
 {
 	return atomic_notifier_chain_unregister(&task_free_notifier, n);
 	return atomic_notifier_chain_unregister(&task_free_notifier, n);
 }
 }
+EXPORT_SYMBOL_GPL(task_handoff_unregister);
 
 
-int profile_event_register(enum profile_type type, struct notifier_block * n)
+int profile_event_register(enum profile_type type, struct notifier_block *n)
 {
 {
 	int err = -EINVAL;
 	int err = -EINVAL;
- 
+
 	switch (type) {
 	switch (type) {
-		case PROFILE_TASK_EXIT:
-			err = blocking_notifier_chain_register(
-					&task_exit_notifier, n);
-			break;
-		case PROFILE_MUNMAP:
-			err = blocking_notifier_chain_register(
-					&munmap_notifier, n);
-			break;
+	case PROFILE_TASK_EXIT:
+		err = blocking_notifier_chain_register(
+				&task_exit_notifier, n);
+		break;
+	case PROFILE_MUNMAP:
+		err = blocking_notifier_chain_register(
+				&munmap_notifier, n);
+		break;
 	}
 	}
- 
+
 	return err;
 	return err;
 }
 }
+EXPORT_SYMBOL_GPL(profile_event_register);
 
 
- 
-int profile_event_unregister(enum profile_type type, struct notifier_block * n)
+int profile_event_unregister(enum profile_type type, struct notifier_block *n)
 {
 {
 	int err = -EINVAL;
 	int err = -EINVAL;
- 
+
 	switch (type) {
 	switch (type) {
-		case PROFILE_TASK_EXIT:
-			err = blocking_notifier_chain_unregister(
-					&task_exit_notifier, n);
-			break;
-		case PROFILE_MUNMAP:
-			err = blocking_notifier_chain_unregister(
-					&munmap_notifier, n);
-			break;
+	case PROFILE_TASK_EXIT:
+		err = blocking_notifier_chain_unregister(
+				&task_exit_notifier, n);
+		break;
+	case PROFILE_MUNMAP:
+		err = blocking_notifier_chain_unregister(
+				&munmap_notifier, n);
+		break;
 	}
 	}
 
 
 	return err;
 	return err;
 }
 }
+EXPORT_SYMBOL_GPL(profile_event_unregister);
 
 
 int register_timer_hook(int (*hook)(struct pt_regs *))
 int register_timer_hook(int (*hook)(struct pt_regs *))
 {
 {
@@ -191,6 +194,7 @@ int register_timer_hook(int (*hook)(struct pt_regs *))
 	timer_hook = hook;
 	timer_hook = hook;
 	return 0;
 	return 0;
 }
 }
+EXPORT_SYMBOL_GPL(register_timer_hook);
 
 
 void unregister_timer_hook(int (*hook)(struct pt_regs *))
 void unregister_timer_hook(int (*hook)(struct pt_regs *))
 {
 {
@@ -199,13 +203,7 @@ void unregister_timer_hook(int (*hook)(struct pt_regs *))
 	/* make sure all CPUs see the NULL hook */
 	/* make sure all CPUs see the NULL hook */
 	synchronize_sched();  /* Allow ongoing interrupts to complete. */
 	synchronize_sched();  /* Allow ongoing interrupts to complete. */
 }
 }
-
-EXPORT_SYMBOL_GPL(register_timer_hook);
 EXPORT_SYMBOL_GPL(unregister_timer_hook);
 EXPORT_SYMBOL_GPL(unregister_timer_hook);
-EXPORT_SYMBOL_GPL(task_handoff_register);
-EXPORT_SYMBOL_GPL(task_handoff_unregister);
-EXPORT_SYMBOL_GPL(profile_event_register);
-EXPORT_SYMBOL_GPL(profile_event_unregister);
 
 
 #endif /* CONFIG_PROFILING */
 #endif /* CONFIG_PROFILING */
 
 
@@ -366,7 +364,7 @@ static int __devinit profile_cpu_callback(struct notifier_block *info,
 			per_cpu(cpu_profile_hits, cpu)[0] = page_address(page);
 			per_cpu(cpu_profile_hits, cpu)[0] = page_address(page);
 		}
 		}
 		break;
 		break;
-	out_free:
+out_free:
 		page = virt_to_page(per_cpu(cpu_profile_hits, cpu)[1]);
 		page = virt_to_page(per_cpu(cpu_profile_hits, cpu)[1]);
 		per_cpu(cpu_profile_hits, cpu)[1] = NULL;
 		per_cpu(cpu_profile_hits, cpu)[1] = NULL;
 		__free_page(page);
 		__free_page(page);
@@ -409,7 +407,6 @@ void profile_hits(int type, void *__pc, unsigned int nr_hits)
 	atomic_add(nr_hits, &prof_buffer[min(pc, prof_len - 1)]);
 	atomic_add(nr_hits, &prof_buffer[min(pc, prof_len - 1)]);
 }
 }
 #endif /* !CONFIG_SMP */
 #endif /* !CONFIG_SMP */
-
 EXPORT_SYMBOL_GPL(profile_hits);
 EXPORT_SYMBOL_GPL(profile_hits);
 
 
 void profile_tick(int type)
 void profile_tick(int type)
@@ -427,7 +424,7 @@ void profile_tick(int type)
 #include <asm/uaccess.h>
 #include <asm/uaccess.h>
 #include <asm/ptrace.h>
 #include <asm/ptrace.h>
 
 
-static int prof_cpu_mask_read_proc (char *page, char **start, off_t off,
+static int prof_cpu_mask_read_proc(char *page, char **start, off_t off,
 			int count, int *eof, void *data)
 			int count, int *eof, void *data)
 {
 {
 	int len = cpumask_scnprintf(page, count, *(cpumask_t *)data);
 	int len = cpumask_scnprintf(page, count, *(cpumask_t *)data);
@@ -437,8 +434,8 @@ static int prof_cpu_mask_read_proc (char *page, char **start, off_t off,
 	return len;
 	return len;
 }
 }
 
 
-static int prof_cpu_mask_write_proc (struct file *file, const char __user *buffer,
-					unsigned long count, void *data)
+static int prof_cpu_mask_write_proc(struct file *file,
+	const char __user *buffer,  unsigned long count, void *data)
 {
 {
 	cpumask_t *mask = (cpumask_t *)data;
 	cpumask_t *mask = (cpumask_t *)data;
 	unsigned long full_count = count, err;
 	unsigned long full_count = count, err;
@@ -457,7 +454,8 @@ void create_prof_cpu_mask(struct proc_dir_entry *root_irq_dir)
 	struct proc_dir_entry *entry;
 	struct proc_dir_entry *entry;
 
 
 	/* create /proc/irq/prof_cpu_mask */
 	/* create /proc/irq/prof_cpu_mask */
-	if (!(entry = create_proc_entry("prof_cpu_mask", 0600, root_irq_dir)))
+	entry = create_proc_entry("prof_cpu_mask", 0600, root_irq_dir);
+	if (!entry)
 		return;
 		return;
 	entry->data = (void *)&prof_cpu_mask;
 	entry->data = (void *)&prof_cpu_mask;
 	entry->read_proc = prof_cpu_mask_read_proc;
 	entry->read_proc = prof_cpu_mask_read_proc;
@@ -475,7 +473,7 @@ read_profile(struct file *file, char __user *buf, size_t count, loff_t *ppos)
 {
 {
 	unsigned long p = *ppos;
 	unsigned long p = *ppos;
 	ssize_t read;
 	ssize_t read;
-	char * pnt;
+	char *pnt;
 	unsigned int sample_step = 1 << prof_shift;
 	unsigned int sample_step = 1 << prof_shift;
 
 
 	profile_flip_buffers();
 	profile_flip_buffers();
@@ -486,12 +484,12 @@ read_profile(struct file *file, char __user *buf, size_t count, loff_t *ppos)
 	read = 0;
 	read = 0;
 
 
 	while (p < sizeof(unsigned int) && count > 0) {
 	while (p < sizeof(unsigned int) && count > 0) {
-		if (put_user(*((char *)(&sample_step)+p),buf))
+		if (put_user(*((char *)(&sample_step)+p), buf))
 			return -EFAULT;
 			return -EFAULT;
 		buf++; p++; count--; read++;
 		buf++; p++; count--; read++;
 	}
 	}
 	pnt = (char *)prof_buffer + p - sizeof(atomic_t);
 	pnt = (char *)prof_buffer + p - sizeof(atomic_t);
-	if (copy_to_user(buf,(void *)pnt,count))
+	if (copy_to_user(buf, (void *)pnt, count))
 		return -EFAULT;
 		return -EFAULT;
 	read += count;
 	read += count;
 	*ppos += read;
 	*ppos += read;
@@ -508,7 +506,7 @@ static ssize_t write_profile(struct file *file, const char __user *buf,
 			     size_t count, loff_t *ppos)
 			     size_t count, loff_t *ppos)
 {
 {
 #ifdef CONFIG_SMP
 #ifdef CONFIG_SMP
-	extern int setup_profiling_timer (unsigned int multiplier);
+	extern int setup_profiling_timer(unsigned int multiplier);
 
 
 	if (count == sizeof(int)) {
 	if (count == sizeof(int)) {
 		unsigned int multiplier;
 		unsigned int multiplier;
@@ -591,7 +589,8 @@ static int __init create_proc_profile(void)
 		return 0;
 		return 0;
 	if (create_hash_tables())
 	if (create_hash_tables())
 		return -1;
 		return -1;
-	if (!(entry = create_proc_entry("profile", S_IWUSR | S_IRUGO, NULL)))
+	entry = create_proc_entry("profile", S_IWUSR | S_IRUGO, NULL);
+	if (!entry)
 		return 0;
 		return 0;
 	entry->proc_fops = &proc_profile_operations;
 	entry->proc_fops = &proc_profile_operations;
 	entry->size = (1+prof_len) * sizeof(atomic_t);
 	entry->size = (1+prof_len) * sizeof(atomic_t);

+ 575 - 0
kernel/rcuclassic.c

@@ -0,0 +1,575 @@
+/*
+ * Read-Copy Update mechanism for mutual exclusion
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+ *
+ * Copyright IBM Corporation, 2001
+ *
+ * Authors: Dipankar Sarma <dipankar@in.ibm.com>
+ *	    Manfred Spraul <manfred@colorfullife.com>
+ *
+ * Based on the original work by Paul McKenney <paulmck@us.ibm.com>
+ * and inputs from Rusty Russell, Andrea Arcangeli and Andi Kleen.
+ * Papers:
+ * http://www.rdrop.com/users/paulmck/paper/rclockpdcsproof.pdf
+ * http://lse.sourceforge.net/locking/rclock_OLS.2001.05.01c.sc.pdf (OLS2001)
+ *
+ * For detailed explanation of Read-Copy Update mechanism see -
+ * 		Documentation/RCU
+ *
+ */
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/spinlock.h>
+#include <linux/smp.h>
+#include <linux/rcupdate.h>
+#include <linux/interrupt.h>
+#include <linux/sched.h>
+#include <asm/atomic.h>
+#include <linux/bitops.h>
+#include <linux/module.h>
+#include <linux/completion.h>
+#include <linux/moduleparam.h>
+#include <linux/percpu.h>
+#include <linux/notifier.h>
+#include <linux/cpu.h>
+#include <linux/mutex.h>
+
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+static struct lock_class_key rcu_lock_key;	/* lock class key referenced by rcu_lock_map */
+struct lockdep_map rcu_lock_map =
+	STATIC_LOCKDEP_MAP_INIT("rcu_read_lock", &rcu_lock_key);
+EXPORT_SYMBOL_GPL(rcu_lock_map);
+#endif /* CONFIG_DEBUG_LOCK_ALLOC */
+
+
+/*
+ * Definitions for the rcupdate control blocks: rcu_ctrlblk is the one
+ * used by call_rcu(), rcu_bh_ctrlblk the one used by call_rcu_bh().
+ * Both start with .cur == .completed, which is the "no active batch"
+ * condition tested by rcu_start_batch(), and with an empty cpumask.
+ * NOTE(review): the reason for the initial value -300 is not visible
+ * in this file - confirm before changing it.
+ */
+static struct rcu_ctrlblk rcu_ctrlblk = {
+	.cur = -300,
+	.completed = -300,
+	.lock = __SPIN_LOCK_UNLOCKED(&rcu_ctrlblk.lock),
+	.cpumask = CPU_MASK_NONE,
+};
+static struct rcu_ctrlblk rcu_bh_ctrlblk = {
+	.cur = -300,
+	.completed = -300,
+	.lock = __SPIN_LOCK_UNLOCKED(&rcu_bh_ctrlblk.lock),
+	.cpumask = CPU_MASK_NONE,
+};
+
+DEFINE_PER_CPU(struct rcu_data, rcu_data) = { 0L };	/* per-CPU state used by call_rcu() */
+DEFINE_PER_CPU(struct rcu_data, rcu_bh_data) = { 0L };	/* per-CPU state used by call_rcu_bh() */
+
+static int blimit = 10;		/* default rdp->blimit: max callbacks per rcu_do_batch() call */
+static int qhimark = 10000;	/* qlen above this lifts blimit and forces a quiescent state */
+static int qlowmark = 100;	/* qlen at or below this restores the default blimit */
+
+#ifdef CONFIG_SMP
+static void force_quiescent_state(struct rcu_data *rdp,
+			struct rcu_ctrlblk *rcp)
+{
+	int cpu;
+	cpumask_t cpumask;
+	set_need_resched();
+	if (unlikely(!rcp->signaled)) {
+		rcp->signaled = 1;	/* signal only once; reset by rcu_start_batch() */
+		/*
+		 * Send a reschedule IPI to every cpu that is still set in
+		 * rcp->cpumask, i.e. that has not yet been cleared by
+		 * cpu_quiet().  A local copy of the mask is used so that
+		 * rcp->cpumask itself is left untouched.
+		 * Don't send IPI to itself. With irqs disabled,
+		 * rdp->cpu is the current cpu.
+		 */
+		cpumask = rcp->cpumask;
+		cpu_clear(rdp->cpu, cpumask);
+		for_each_cpu_mask(cpu, cpumask)
+			smp_send_reschedule(cpu);
+	}
+}
+#else /* !CONFIG_SMP */
+static inline void force_quiescent_state(struct rcu_data *rdp,
+			struct rcu_ctrlblk *rcp)
+{
+	set_need_resched();	/* uniprocessor: there is no other cpu to signal */
+}
+#endif /* CONFIG_SMP */
+
+/**
+ * call_rcu - Queue an RCU callback for invocation after a grace period.
+ * @head: structure to be used for queueing the RCU updates.
+ * @func: actual update function to be invoked after the grace period
+ *
+ * The update function will be invoked some time after a full grace
+ * period elapses, in other words after all currently executing RCU
+ * read-side critical sections have completed.  RCU read-side critical
+ * sections are delimited by rcu_read_lock() and rcu_read_unlock(),
+ * and may be nested.
+ *
+ * The callback is appended to the current cpu's rcu_data next list with
+ * local interrupts disabled.  If that cpu's queue length then exceeds
+ * qhimark, its batch limit is raised to INT_MAX and
+ * force_quiescent_state() is called.
+ */
+void call_rcu(struct rcu_head *head,
+				void (*func)(struct rcu_head *rcu))
+{
+	unsigned long flags;
+	struct rcu_data *rdp;
+
+	head->func = func;
+	head->next = NULL;
+	local_irq_save(flags);
+	rdp = &__get_cpu_var(rcu_data);
+	*rdp->nxttail = head;		/* link behind the current last entry */
+	rdp->nxttail = &head->next;	/* head is now the tail of the next list */
+	if (unlikely(++rdp->qlen > qhimark)) {
+		rdp->blimit = INT_MAX;
+		force_quiescent_state(rdp, &rcu_ctrlblk);
+	}
+	local_irq_restore(flags);
+}
+EXPORT_SYMBOL_GPL(call_rcu);
+
+/**
+ * call_rcu_bh - Queue an RCU for invocation after a quicker grace period.
+ * @head: structure to be used for queueing the RCU updates.
+ * @func: actual update function to be invoked after the grace period
+ *
+ * The update function will be invoked some time after a full grace
+ * period elapses, in other words after all currently executing RCU
+ * read-side critical sections have completed. call_rcu_bh() assumes
+ * that the read-side critical sections end on completion of a softirq
+ * handler. This means that read-side critical sections in process
+ * context must not be interrupted by softirqs. This interface is to be
+ * used when most of the read-side critical sections are in softirq context.
+ * RCU read-side critical sections are delimited by rcu_read_lock() and
+ * rcu_read_unlock(), if in interrupt context, or rcu_read_lock_bh()
+ * and rcu_read_unlock_bh(), if in process context. These may be nested.
+ *
+ * The queueing itself mirrors call_rcu(), but uses the current cpu's
+ * rcu_bh_data and the rcu_bh_ctrlblk control block.
+ */
+void call_rcu_bh(struct rcu_head *head,
+				void (*func)(struct rcu_head *rcu))
+{
+	unsigned long flags;
+	struct rcu_data *rdp;
+
+	head->func = func;
+	head->next = NULL;
+	local_irq_save(flags);
+	rdp = &__get_cpu_var(rcu_bh_data);
+	*rdp->nxttail = head;		/* link behind the current last entry */
+	rdp->nxttail = &head->next;	/* head is now the tail of the next list */
+
+	if (unlikely(++rdp->qlen > qhimark)) {
+		rdp->blimit = INT_MAX;
+		force_quiescent_state(rdp, &rcu_bh_ctrlblk);
+	}
+
+	local_irq_restore(flags);
+}
+EXPORT_SYMBOL_GPL(call_rcu_bh);
+
+/*
+ * Return the number of the most recently completed RCU batch, i.e. the
+ * current value of rcu_ctrlblk.completed.  Useful for debug and
+ * statistics.  The counter does not start at zero (see the rcu_ctrlblk
+ * initializer), so only the difference between two readings says how
+ * many batches were processed in between.
+ */
+long rcu_batches_completed(void)
+{
+	return rcu_ctrlblk.completed;
+}
+EXPORT_SYMBOL_GPL(rcu_batches_completed);
+
+/*
+ * Return the number of the most recently completed RCU-bh batch, i.e.
+ * the current value of rcu_bh_ctrlblk.completed.  Useful for debug and
+ * statistics.  The counter does not start at zero (see the
+ * rcu_bh_ctrlblk initializer), so only the difference between two
+ * readings says how many batches were processed in between.
+ */
+long rcu_batches_completed_bh(void)
+{
+	return rcu_bh_ctrlblk.completed;
+}
+EXPORT_SYMBOL_GPL(rcu_batches_completed_bh);
+
+/*
+ * Raises the softirq for processing rcu_callbacks (RCU_SOFTIRQ, whose
+ * handler in this file is rcu_process_callbacks()).
+ */
+static inline void raise_rcu_softirq(void)
+{
+	raise_softirq(RCU_SOFTIRQ);
+	/*
+	 * The smp_mb() here is required to ensure that this cpu's
+	 * __rcu_process_callbacks() reads the most recently updated
+	 * value of rcp->cur.
+	 */
+	smp_mb();
+}
+
+/*
+ * Invoke the completed RCU callbacks. They are expected to be in
+ * a per-cpu list.
+ *
+ * Callbacks are taken from the head of rdp->donelist; at most
+ * rdp->blimit of them are invoked per call.  If any are left over, the
+ * RCU softirq is raised again so that they are handled on a later pass.
+ */
+static void rcu_do_batch(struct rcu_data *rdp)
+{
+	struct rcu_head *next, *list;
+	int count = 0;
+
+	list = rdp->donelist;
+	while (list) {
+		next = list->next;	/* read before the callback is handed the entry */
+		prefetch(next);
+		list->func(list);
+		list = next;
+		if (++count >= rdp->blimit)
+			break;
+	}
+	rdp->donelist = list;	/* whatever was not invoked stays queued */
+
+	local_irq_disable();	/* qlen is also updated by call_rcu() with irqs off */
+	rdp->qlen -= count;
+	local_irq_enable();
+	if (rdp->blimit == INT_MAX && rdp->qlen <= qlowmark)
+		rdp->blimit = blimit;	/* backlog drained: back to the default limit */
+
+	if (!rdp->donelist)
+		rdp->donetail = &rdp->donelist;	/* list is empty: reset the tail pointer */
+	else
+		raise_rcu_softirq();
+}
+
+/*
+ * Grace period handling:
+ * The grace period handling consists of two steps:
+ * - A new grace period is started.
+ *   This is done by rcu_start_batch. The start is not broadcast to
+ *   all cpus, they must pick this up by comparing rcp->cur with
+ *   rdp->quiescbatch. All online cpus that are not in nohz_cpu_mask
+ *   are recorded in the rcu_ctrlblk.cpumask bitmap.
+ * - All cpus must go through a quiescent state.
+ *   Since the start of the grace period is not broadcast, at least two
+ *   calls to rcu_check_quiescent_state are required:
+ *   The first call just notices that a new grace period is running. The
+ *   following calls check if there was a quiescent state since the beginning
+ *   of the grace period. If so, it updates rcu_ctrlblk.cpumask. If
+ *   the bitmap is empty, then the grace period is completed.
+ *   rcu_check_quiescent_state calls rcu_start_batch() (via cpu_quiet()) to
+ *   start the next grace period (if necessary).
+ */
+/*
+ * Register a new batch of callbacks, and start it up if there is currently no
+ * active batch and the batch to be registered has not already occurred.
+ * Caller must hold rcp->lock.
+ *
+ * Nothing happens unless rcp->next_pending is set and the previous
+ * batch has completed (rcp->completed == rcp->cur).
+ */
+static void rcu_start_batch(struct rcu_ctrlblk *rcp)
+{
+	if (rcp->next_pending &&
+			rcp->completed == rcp->cur) {
+		rcp->next_pending = 0;
+		/*
+		 * next_pending == 0 must be visible in
+		 * __rcu_process_callbacks() before it can see new value of cur.
+		 */
+		smp_wmb();
+		rcp->cur++;
+
+		/*
+		 * Accessing nohz_cpu_mask before incrementing rcp->cur needs a
+		 * barrier. Otherwise it can cause tickless idle CPUs to be
+		 * included in rcp->cpumask, which will extend grace periods
+		 * unnecessarily.
+		 */
+		smp_mb();
+		cpus_andnot(rcp->cpumask, cpu_online_map, nohz_cpu_mask);
+
+		rcp->signaled = 0;	/* allow force_quiescent_state() to signal again */
+	}
+}
+
+/*
+ * cpu went through a quiescent state since the beginning of the grace period.
+ * Clear it from the cpu mask and complete the grace period if it was the last
+ * cpu. Start another grace period if someone has further entries pending.
+ *
+ * Both callers in this file hold rcp->lock, which rcu_start_batch()
+ * requires.
+ */
+static void cpu_quiet(int cpu, struct rcu_ctrlblk *rcp)
+{
+	cpu_clear(cpu, rcp->cpumask);
+	if (cpus_empty(rcp->cpumask)) {
+		/* batch completed ! */
+		rcp->completed = rcp->cur;
+		rcu_start_batch(rcp);
+	}
+}
+
+/*
+ * Check if the cpu has gone through a quiescent state (say context
+ * switch). If so and if it already hasn't done so in this RCU
+ * quiescent cycle, then indicate that it has done so.
+ *
+ * @rcp is the control block of the RCU flavor, @rdp the matching
+ * per-cpu data.  rcp->lock is only taken on the path that actually
+ * reports the quiescent state.
+ */
+static void rcu_check_quiescent_state(struct rcu_ctrlblk *rcp,
+					struct rcu_data *rdp)
+{
+	if (rdp->quiescbatch != rcp->cur) {
+		/* start new grace period: */
+		rdp->qs_pending = 1;
+		rdp->passed_quiesc = 0;
+		rdp->quiescbatch = rcp->cur;
+		return;
+	}
+
+	/* Grace period already completed for this cpu?
+	 * qs_pending is checked instead of the actual bitmap to avoid
+	 * cacheline thrashing.
+	 */
+	if (!rdp->qs_pending)
+		return;
+
+	/*
+	 * Was there a quiescent state since the beginning of the grace
+	 * period? If no, then exit and wait for the next call.
+	 */
+	if (!rdp->passed_quiesc)
+		return;
+	rdp->qs_pending = 0;
+
+	spin_lock(&rcp->lock);
+	/*
+	 * rdp->quiescbatch/rcp->cur and the cpu bitmap can come out of sync
+	 * during cpu startup. Ignore the quiescent state.
+	 */
+	if (likely(rdp->quiescbatch == rcp->cur))
+		cpu_quiet(rdp->cpu, rcp);
+
+	spin_unlock(&rcp->lock);
+}
+
+
+#ifdef CONFIG_HOTPLUG_CPU
+
+/* warning! helper for rcu_offline_cpu. do not use elsewhere without reviewing
+ * locking requirements, the list it's pulling from has to belong to a cpu
+ * which is dead and hence not processing interrupts.
+ *
+ * Appends @list, whose last ->next pointer is *@tail, to the end of
+ * this_rdp's next list.  Local interrupts are disabled around the
+ * update of this_rdp; the tail pointer is only replaced when @list is
+ * not empty.
+ */
+static void rcu_move_batch(struct rcu_data *this_rdp, struct rcu_head *list,
+				struct rcu_head **tail)
+{
+	local_irq_disable();
+	*this_rdp->nxttail = list;
+	if (list)
+		this_rdp->nxttail = tail;
+	local_irq_enable();
+}
+
+/*
+ * Take over the RCU state of a cpu that has gone offline.
+ * @this_rdp: per-cpu data of the cpu running this code; receives the callbacks
+ * @rcp: control block of the RCU flavor being handled
+ * @rdp: per-cpu data of the dead cpu
+ *
+ * The dead cpu is removed from a grace period that is still in progress,
+ * then its done, current and next lists are appended, in that order, to
+ * this_rdp's next list.  The dead cpu's queue length is added to
+ * this_rdp->qlen so that the decrement done by rcu_do_batch() when the
+ * moved callbacks are finally invoked stays balanced.
+ */
+static void __rcu_offline_cpu(struct rcu_data *this_rdp,
+				struct rcu_ctrlblk *rcp, struct rcu_data *rdp)
+{
+	/* if the cpu going offline owns the grace period
+	 * we can block indefinitely waiting for it, so flush
+	 * it here
+	 */
+	spin_lock_bh(&rcp->lock);
+	if (rcp->cur != rcp->completed)
+		cpu_quiet(rdp->cpu, rcp);
+	spin_unlock_bh(&rcp->lock);
+	rcu_move_batch(this_rdp, rdp->donelist, rdp->donetail);
+	rcu_move_batch(this_rdp, rdp->curlist, rdp->curtail);
+	rcu_move_batch(this_rdp, rdp->nxtlist, rdp->nxttail);
+
+	/*
+	 * The moved callbacks were counted in rdp->qlen when they were
+	 * queued.  Account for them on this cpu, with irqs off as for
+	 * every other qlen update.
+	 */
+	local_irq_disable();
+	this_rdp->qlen += rdp->qlen;
+	local_irq_enable();
+}
+
+static void rcu_offline_cpu(int cpu)
+{
+	struct rcu_data *this_rdp = &get_cpu_var(rcu_data);	/* released by put_cpu_var() below */
+	struct rcu_data *this_bh_rdp = &get_cpu_var(rcu_bh_data);
+
+	__rcu_offline_cpu(this_rdp, &rcu_ctrlblk,
+					&per_cpu(rcu_data, cpu));	/* dead cpu's call_rcu() state */
+	__rcu_offline_cpu(this_bh_rdp, &rcu_bh_ctrlblk,
+					&per_cpu(rcu_bh_data, cpu));	/* dead cpu's call_rcu_bh() state */
+	put_cpu_var(rcu_data);
+	put_cpu_var(rcu_bh_data);
+}
+
+#else /* !CONFIG_HOTPLUG_CPU */
+
+static void rcu_offline_cpu(int cpu)
+{
+}
+
+#endif /* CONFIG_HOTPLUG_CPU */
+
+/*
+ * This does the RCU processing work from softirq context.
+ *
+ * For one RCU flavor (@rcp) and this cpu's data for it (@rdp):
+ * - move the current list to the done list once its batch has completed,
+ * - promote the next list to the current list and request a new batch,
+ * - report a quiescent state if one is pending,
+ * - invoke callbacks that are on the done list.
+ */
+static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp,
+					struct rcu_data *rdp)
+{
+	if (rdp->curlist && !rcu_batch_before(rcp->completed, rdp->batch)) {
+		*rdp->donetail = rdp->curlist;	/* append curlist to donelist */
+		rdp->donetail = rdp->curtail;
+		rdp->curlist = NULL;
+		rdp->curtail = &rdp->curlist;
+	}
+
+	if (rdp->nxtlist && !rdp->curlist) {
+		local_irq_disable();	/* call_rcu() updates nxtlist with irqs off */
+		rdp->curlist = rdp->nxtlist;
+		rdp->curtail = rdp->nxttail;
+		rdp->nxtlist = NULL;
+		rdp->nxttail = &rdp->nxtlist;
+		local_irq_enable();
+
+		/*
+		 * start the next batch of callbacks
+		 */
+
+		/* determine batch number */
+		rdp->batch = rcp->cur + 1;
+		/* see the comment and corresponding wmb() in
+		 * the rcu_start_batch()
+		 */
+		smp_rmb();
+
+		if (!rcp->next_pending) {
+			/* and start it/schedule start if it's a new batch */
+			spin_lock(&rcp->lock);
+			rcp->next_pending = 1;
+			rcu_start_batch(rcp);
+			spin_unlock(&rcp->lock);
+		}
+	}
+
+	rcu_check_quiescent_state(rcp, rdp);
+	if (rdp->donelist)
+		rcu_do_batch(rdp);
+}
+
+static void rcu_process_callbacks(struct softirq_action *unused)
+{
+	__rcu_process_callbacks(&rcu_ctrlblk, &__get_cpu_var(rcu_data));	/* RCU_SOFTIRQ handler: call_rcu() flavor */
+	__rcu_process_callbacks(&rcu_bh_ctrlblk, &__get_cpu_var(rcu_bh_data));	/* then the call_rcu_bh() flavor */
+}
+
+static int __rcu_pending(struct rcu_ctrlblk *rcp, struct rcu_data *rdp)
+{
+	/* Returns 1 as soon as one of the checks below finds work for
+	 * this flavor (@rcp) on this cpu's data (@rdp), 0 otherwise.
+	 *
+	 * This cpu has pending rcu entries and the grace period
+	 * for them has completed.
+	 */
+	if (rdp->curlist && !rcu_batch_before(rcp->completed, rdp->batch))
+		return 1;
+
+	/* This cpu has no pending entries, but there are new entries */
+	if (!rdp->curlist && rdp->nxtlist)
+		return 1;
+
+	/* This cpu has finished callbacks to invoke */
+	if (rdp->donelist)
+		return 1;
+
+	/* The rcu core waits for a quiescent state from the cpu */
+	if (rdp->quiescbatch != rcp->cur || rdp->qs_pending)
+		return 1;
+
+	/* nothing to do */
+	return 0;
+}
+
+/*
+ * Check to see if there is any immediate RCU-related work to be done
+ * by the specified CPU, returning 1 if so.  Both the call_rcu() and the
+ * call_rcu_bh() state of @cpu are checked.  This function is part of the
+ * RCU implementation; it is -not- an exported member of the RCU API.
+ */
+int rcu_pending(int cpu)
+{
+	return __rcu_pending(&rcu_ctrlblk, &per_cpu(rcu_data, cpu)) ||
+		__rcu_pending(&rcu_bh_ctrlblk, &per_cpu(rcu_bh_data, cpu));
+}
+
+/*
+ * Check to see if any future RCU-related work will need to be done
+ * by the specified CPU, even if none need be done immediately, returning
+ * 1 if so.  That is the case when either flavor has callbacks on its
+ * current list for @cpu, or when rcu_pending() reports work.
+ * This function is part of the RCU implementation; it is -not-
+ * an exported member of the RCU API.
+ */
+int rcu_needs_cpu(int cpu)
+{
+	struct rcu_data *rdp = &per_cpu(rcu_data, cpu);
+	struct rcu_data *rdp_bh = &per_cpu(rcu_bh_data, cpu);
+
+	return (!!rdp->curlist || !!rdp_bh->curlist || rcu_pending(cpu));
+}
+
+void rcu_check_callbacks(int cpu, int user)
+{
+	if (user ||
+	    (idle_cpu(cpu) && !in_softirq() &&
+				hardirq_count() <= (1 << HARDIRQ_SHIFT))) {
+		rcu_qsctr_inc(cpu);	/* @user set, or idle outside softirq with at most one hardirq level */
+		rcu_bh_qsctr_inc(cpu);	/* NOTE(review): both helpers are defined elsewhere; presumably they record a quiescent state */
+	} else if (!in_softirq())
+		rcu_bh_qsctr_inc(cpu);	/* outside softirq: only the bh counter is bumped */
+	raise_rcu_softirq();	/* unconditionally schedule rcu_process_callbacks() */
+}
+
+static void rcu_init_percpu_data(int cpu, struct rcu_ctrlblk *rcp,
+						struct rcu_data *rdp)
+{
+	memset(rdp, 0, sizeof(*rdp));	/* reset all of @cpu's state for this flavor */
+	rdp->curtail = &rdp->curlist;	/* the three lists start out empty: */
+	rdp->nxttail = &rdp->nxtlist;	/* each tail points at its own list head */
+	rdp->donetail = &rdp->donelist;
+	rdp->quiescbatch = rcp->completed;
+	rdp->qs_pending = 0;	/* already zero after the memset() above */
+	rdp->cpu = cpu;
+	rdp->blimit = blimit;	/* start with the default batch limit */
+}
+
+static void __cpuinit rcu_online_cpu(int cpu)
+{
+	struct rcu_data *rdp = &per_cpu(rcu_data, cpu);
+	struct rcu_data *bh_rdp = &per_cpu(rcu_bh_data, cpu);
+
+	rcu_init_percpu_data(cpu, &rcu_ctrlblk, rdp);	/* call_rcu() state of @cpu */
+	rcu_init_percpu_data(cpu, &rcu_bh_ctrlblk, bh_rdp);	/* call_rcu_bh() state of @cpu */
+	open_softirq(RCU_SOFTIRQ, rcu_process_callbacks, NULL);	/* repeated for every cpu that is brought up */
+}
+
+static int __cpuinit rcu_cpu_notify(struct notifier_block *self,
+				unsigned long action, void *hcpu)
+{
+	long cpu = (long)hcpu;	/* the cpu number is passed in the pointer argument */
+
+	switch (action) {
+	case CPU_UP_PREPARE:
+	case CPU_UP_PREPARE_FROZEN:
+		rcu_online_cpu(cpu);	/* (re)initialize the cpu's RCU state */
+		break;
+	case CPU_DEAD:
+	case CPU_DEAD_FROZEN:
+		rcu_offline_cpu(cpu);	/* hand its callbacks to the current cpu */
+		break;
+	default:
+		break;	/* all other hotplug events are ignored */
+	}
+	return NOTIFY_OK;
+}
+
+static struct notifier_block __cpuinitdata rcu_nb = {
+	.notifier_call	= rcu_cpu_notify,	/* registered by __rcu_init() */
+};
+
+/*
+ * Initializes rcu mechanism.  Assumed to be called early.
+ * That is before local timer(SMP) or jiffie timer (uniproc) is setup.
+ * Note that rcu_qsctr and friends are implicitly
+ * initialized due to the choice of ``0'' for RCU_CTR_INVALID.
+ *
+ * The cpu running this code is set up by invoking the notifier callback
+ * directly with CPU_UP_PREPARE; the notifier is registered only after
+ * that.
+ */
+void __init __rcu_init(void)
+{
+	rcu_cpu_notify(&rcu_nb, CPU_UP_PREPARE,
+			(void *)(long)smp_processor_id());
+	/* Register notifier for non-boot CPUs */
+	register_cpu_notifier(&rcu_nb);
+}
+
+module_param(blimit, int, 0);	/* NOTE(review): permission argument 0 - presumably no sysfs entry; confirm */
+module_param(qhimark, int, 0);
+module_param(qlowmark, int, 0);

+ 35 - 541
kernel/rcupdate.c

@@ -15,7 +15,7 @@
  * along with this program; if not, write to the Free Software
  * along with this program; if not, write to the Free Software
  * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
  * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
  *
  *
- * Copyright (C) IBM Corporation, 2001
+ * Copyright IBM Corporation, 2001
  *
  *
  * Authors: Dipankar Sarma <dipankar@in.ibm.com>
  * Authors: Dipankar Sarma <dipankar@in.ibm.com>
  *	    Manfred Spraul <manfred@colorfullife.com>
  *	    Manfred Spraul <manfred@colorfullife.com>
@@ -35,165 +35,57 @@
 #include <linux/init.h>
 #include <linux/init.h>
 #include <linux/spinlock.h>
 #include <linux/spinlock.h>
 #include <linux/smp.h>
 #include <linux/smp.h>
-#include <linux/rcupdate.h>
 #include <linux/interrupt.h>
 #include <linux/interrupt.h>
 #include <linux/sched.h>
 #include <linux/sched.h>
 #include <asm/atomic.h>
 #include <asm/atomic.h>
 #include <linux/bitops.h>
 #include <linux/bitops.h>
-#include <linux/module.h>
 #include <linux/completion.h>
 #include <linux/completion.h>
-#include <linux/moduleparam.h>
 #include <linux/percpu.h>
 #include <linux/percpu.h>
 #include <linux/notifier.h>
 #include <linux/notifier.h>
 #include <linux/cpu.h>
 #include <linux/cpu.h>
 #include <linux/mutex.h>
 #include <linux/mutex.h>
+#include <linux/module.h>
 
 
-#ifdef CONFIG_DEBUG_LOCK_ALLOC
-static struct lock_class_key rcu_lock_key;
-struct lockdep_map rcu_lock_map =
-	STATIC_LOCKDEP_MAP_INIT("rcu_read_lock", &rcu_lock_key);
-
-EXPORT_SYMBOL_GPL(rcu_lock_map);
-#endif
-
-/* Definition for rcupdate control block. */
-static struct rcu_ctrlblk rcu_ctrlblk = {
-	.cur = -300,
-	.completed = -300,
-	.lock = __SPIN_LOCK_UNLOCKED(&rcu_ctrlblk.lock),
-	.cpumask = CPU_MASK_NONE,
-};
-static struct rcu_ctrlblk rcu_bh_ctrlblk = {
-	.cur = -300,
-	.completed = -300,
-	.lock = __SPIN_LOCK_UNLOCKED(&rcu_bh_ctrlblk.lock),
-	.cpumask = CPU_MASK_NONE,
+struct rcu_synchronize {
+	struct rcu_head head;
+	struct completion completion;
 };
 };
 
 
-DEFINE_PER_CPU(struct rcu_data, rcu_data) = { 0L };
-DEFINE_PER_CPU(struct rcu_data, rcu_bh_data) = { 0L };
-
-/* Fake initialization required by compiler */
-static DEFINE_PER_CPU(struct tasklet_struct, rcu_tasklet) = {NULL};
-static int blimit = 10;
-static int qhimark = 10000;
-static int qlowmark = 100;
-
+static DEFINE_PER_CPU(struct rcu_head, rcu_barrier_head) = {NULL};
 static atomic_t rcu_barrier_cpu_count;
 static atomic_t rcu_barrier_cpu_count;
 static DEFINE_MUTEX(rcu_barrier_mutex);
 static DEFINE_MUTEX(rcu_barrier_mutex);
 static struct completion rcu_barrier_completion;
 static struct completion rcu_barrier_completion;
 
 
-#ifdef CONFIG_SMP
-static void force_quiescent_state(struct rcu_data *rdp,
-			struct rcu_ctrlblk *rcp)
-{
-	int cpu;
-	cpumask_t cpumask;
-	set_need_resched();
-	if (unlikely(!rcp->signaled)) {
-		rcp->signaled = 1;
-		/*
-		 * Don't send IPI to itself. With irqs disabled,
-		 * rdp->cpu is the current cpu.
-		 */
-		cpumask = rcp->cpumask;
-		cpu_clear(rdp->cpu, cpumask);
-		for_each_cpu_mask(cpu, cpumask)
-			smp_send_reschedule(cpu);
-	}
-}
-#else
-static inline void force_quiescent_state(struct rcu_data *rdp,
-			struct rcu_ctrlblk *rcp)
+/* Because of FASTCALL declaration of complete, we use this wrapper */
+static void wakeme_after_rcu(struct rcu_head  *head)
 {
 {
-	set_need_resched();
+	struct rcu_synchronize *rcu;
+
+	rcu = container_of(head, struct rcu_synchronize, head);
+	complete(&rcu->completion);
 }
 }
-#endif
 
 
 /**
 /**
- * call_rcu - Queue an RCU callback for invocation after a grace period.
- * @head: structure to be used for queueing the RCU updates.
- * @func: actual update function to be invoked after the grace period
+ * synchronize_rcu - wait until a grace period has elapsed.
  *
  *
- * The update function will be invoked some time after a full grace
- * period elapses, in other words after all currently executing RCU
+ * Control will return to the caller some time after a full grace
+ * period has elapsed, in other words after all currently executing RCU
  * read-side critical sections have completed.  RCU read-side critical
  * read-side critical sections have completed.  RCU read-side critical
  * sections are delimited by rcu_read_lock() and rcu_read_unlock(),
  * sections are delimited by rcu_read_lock() and rcu_read_unlock(),
  * and may be nested.
  * and may be nested.
  */
  */
-void fastcall call_rcu(struct rcu_head *head,
-				void (*func)(struct rcu_head *rcu))
-{
-	unsigned long flags;
-	struct rcu_data *rdp;
-
-	head->func = func;
-	head->next = NULL;
-	local_irq_save(flags);
-	rdp = &__get_cpu_var(rcu_data);
-	*rdp->nxttail = head;
-	rdp->nxttail = &head->next;
-	if (unlikely(++rdp->qlen > qhimark)) {
-		rdp->blimit = INT_MAX;
-		force_quiescent_state(rdp, &rcu_ctrlblk);
-	}
-	local_irq_restore(flags);
-}
-
-/**
- * call_rcu_bh - Queue an RCU for invocation after a quicker grace period.
- * @head: structure to be used for queueing the RCU updates.
- * @func: actual update function to be invoked after the grace period
- *
- * The update function will be invoked some time after a full grace
- * period elapses, in other words after all currently executing RCU
- * read-side critical sections have completed. call_rcu_bh() assumes
- * that the read-side critical sections end on completion of a softirq
- * handler. This means that read-side critical sections in process
- * context must not be interrupted by softirqs. This interface is to be
- * used when most of the read-side critical sections are in softirq context.
- * RCU read-side critical sections are delimited by rcu_read_lock() and
- * rcu_read_unlock(), * if in interrupt context or rcu_read_lock_bh()
- * and rcu_read_unlock_bh(), if in process context. These may be nested.
- */
-void fastcall call_rcu_bh(struct rcu_head *head,
-				void (*func)(struct rcu_head *rcu))
+void synchronize_rcu(void)
 {
 {
-	unsigned long flags;
-	struct rcu_data *rdp;
-
-	head->func = func;
-	head->next = NULL;
-	local_irq_save(flags);
-	rdp = &__get_cpu_var(rcu_bh_data);
-	*rdp->nxttail = head;
-	rdp->nxttail = &head->next;
-
-	if (unlikely(++rdp->qlen > qhimark)) {
-		rdp->blimit = INT_MAX;
-		force_quiescent_state(rdp, &rcu_bh_ctrlblk);
-	}
-
-	local_irq_restore(flags);
-}
+	struct rcu_synchronize rcu;
 
 
-/*
- * Return the number of RCU batches processed thus far.  Useful
- * for debug and statistics.
- */
-long rcu_batches_completed(void)
-{
-	return rcu_ctrlblk.completed;
-}
+	init_completion(&rcu.completion);
+	/* Will wake me after RCU finished */
+	call_rcu(&rcu.head, wakeme_after_rcu);
 
 
-/*
- * Return the number of RCU batches processed thus far.  Useful
- * for debug and statistics.
- */
-long rcu_batches_completed_bh(void)
-{
-	return rcu_bh_ctrlblk.completed;
+	/* Wait for it */
+	wait_for_completion(&rcu.completion);
 }
 }
+EXPORT_SYMBOL_GPL(synchronize_rcu);
 
 
 static void rcu_barrier_callback(struct rcu_head *notused)
 static void rcu_barrier_callback(struct rcu_head *notused)
 {
 {
@@ -207,10 +99,8 @@ static void rcu_barrier_callback(struct rcu_head *notused)
 static void rcu_barrier_func(void *notused)
 static void rcu_barrier_func(void *notused)
 {
 {
 	int cpu = smp_processor_id();
 	int cpu = smp_processor_id();
-	struct rcu_data *rdp = &per_cpu(rcu_data, cpu);
-	struct rcu_head *head;
+	struct rcu_head *head = &per_cpu(rcu_barrier_head, cpu);
 
 
-	head = &rdp->barrier;
 	atomic_inc(&rcu_barrier_cpu_count);
 	atomic_inc(&rcu_barrier_cpu_count);
 	call_rcu(head, rcu_barrier_callback);
 	call_rcu(head, rcu_barrier_callback);
 }
 }
@@ -225,420 +115,24 @@ void rcu_barrier(void)
 	mutex_lock(&rcu_barrier_mutex);
 	mutex_lock(&rcu_barrier_mutex);
 	init_completion(&rcu_barrier_completion);
 	init_completion(&rcu_barrier_completion);
 	atomic_set(&rcu_barrier_cpu_count, 0);
 	atomic_set(&rcu_barrier_cpu_count, 0);
+	/*
+	 * The queueing of callbacks in all CPUs must be atomic with
+	 * respect to RCU, otherwise one CPU may queue a callback,
+	 * wait for a grace period, decrement barrier count and call
+	 * complete(), while other CPUs have not yet queued anything.
+	 * So, we need to make sure that grace periods cannot complete
+	 * until all the callbacks are queued.
+	 */
+	rcu_read_lock();
 	on_each_cpu(rcu_barrier_func, NULL, 0, 1);
 	on_each_cpu(rcu_barrier_func, NULL, 0, 1);
+	rcu_read_unlock();
 	wait_for_completion(&rcu_barrier_completion);
 	wait_for_completion(&rcu_barrier_completion);
 	mutex_unlock(&rcu_barrier_mutex);
 	mutex_unlock(&rcu_barrier_mutex);
 }
 }
 EXPORT_SYMBOL_GPL(rcu_barrier);
 EXPORT_SYMBOL_GPL(rcu_barrier);
 
 
-/*
- * Invoke the completed RCU callbacks. They are expected to be in
- * a per-cpu list.
- */
-static void rcu_do_batch(struct rcu_data *rdp)
-{
-	struct rcu_head *next, *list;
-	int count = 0;
-
-	list = rdp->donelist;
-	while (list) {
-		next = list->next;
-		prefetch(next);
-		list->func(list);
-		list = next;
-		if (++count >= rdp->blimit)
-			break;
-	}
-	rdp->donelist = list;
-
-	local_irq_disable();
-	rdp->qlen -= count;
-	local_irq_enable();
-	if (rdp->blimit == INT_MAX && rdp->qlen <= qlowmark)
-		rdp->blimit = blimit;
-
-	if (!rdp->donelist)
-		rdp->donetail = &rdp->donelist;
-	else
-		tasklet_schedule(&per_cpu(rcu_tasklet, rdp->cpu));
-}
-
-/*
- * Grace period handling:
- * The grace period handling consists out of two steps:
- * - A new grace period is started.
- *   This is done by rcu_start_batch. The start is not broadcasted to
- *   all cpus, they must pick this up by comparing rcp->cur with
- *   rdp->quiescbatch. All cpus are recorded  in the
- *   rcu_ctrlblk.cpumask bitmap.
- * - All cpus must go through a quiescent state.
- *   Since the start of the grace period is not broadcasted, at least two
- *   calls to rcu_check_quiescent_state are required:
- *   The first call just notices that a new grace period is running. The
- *   following calls check if there was a quiescent state since the beginning
- *   of the grace period. If so, it updates rcu_ctrlblk.cpumask. If
- *   the bitmap is empty, then the grace period is completed.
- *   rcu_check_quiescent_state calls rcu_start_batch(0) to start the next grace
- *   period (if necessary).
- */
-/*
- * Register a new batch of callbacks, and start it up if there is currently no
- * active batch and the batch to be registered has not already occurred.
- * Caller must hold rcu_ctrlblk.lock.
- */
-static void rcu_start_batch(struct rcu_ctrlblk *rcp)
-{
-	if (rcp->next_pending &&
-			rcp->completed == rcp->cur) {
-		rcp->next_pending = 0;
-		/*
-		 * next_pending == 0 must be visible in
-		 * __rcu_process_callbacks() before it can see new value of cur.
-		 */
-		smp_wmb();
-		rcp->cur++;
-
-		/*
-		 * Accessing nohz_cpu_mask before incrementing rcp->cur needs a
-		 * Barrier  Otherwise it can cause tickless idle CPUs to be
-		 * included in rcp->cpumask, which will extend graceperiods
-		 * unnecessarily.
-		 */
-		smp_mb();
-		cpus_andnot(rcp->cpumask, cpu_online_map, nohz_cpu_mask);
-
-		rcp->signaled = 0;
-	}
-}
-
-/*
- * cpu went through a quiescent state since the beginning of the grace period.
- * Clear it from the cpu mask and complete the grace period if it was the last
- * cpu. Start another grace period if someone has further entries pending
- */
-static void cpu_quiet(int cpu, struct rcu_ctrlblk *rcp)
-{
-	cpu_clear(cpu, rcp->cpumask);
-	if (cpus_empty(rcp->cpumask)) {
-		/* batch completed ! */
-		rcp->completed = rcp->cur;
-		rcu_start_batch(rcp);
-	}
-}
-
-/*
- * Check if the cpu has gone through a quiescent state (say context
- * switch). If so and if it already hasn't done so in this RCU
- * quiescent cycle, then indicate that it has done so.
- */
-static void rcu_check_quiescent_state(struct rcu_ctrlblk *rcp,
-					struct rcu_data *rdp)
-{
-	if (rdp->quiescbatch != rcp->cur) {
-		/* start new grace period: */
-		rdp->qs_pending = 1;
-		rdp->passed_quiesc = 0;
-		rdp->quiescbatch = rcp->cur;
-		return;
-	}
-
-	/* Grace period already completed for this cpu?
-	 * qs_pending is checked instead of the actual bitmap to avoid
-	 * cacheline trashing.
-	 */
-	if (!rdp->qs_pending)
-		return;
-
-	/* 
-	 * Was there a quiescent state since the beginning of the grace
-	 * period? If no, then exit and wait for the next call.
-	 */
-	if (!rdp->passed_quiesc)
-		return;
-	rdp->qs_pending = 0;
-
-	spin_lock(&rcp->lock);
-	/*
-	 * rdp->quiescbatch/rcp->cur and the cpu bitmap can come out of sync
-	 * during cpu startup. Ignore the quiescent state.
-	 */
-	if (likely(rdp->quiescbatch == rcp->cur))
-		cpu_quiet(rdp->cpu, rcp);
-
-	spin_unlock(&rcp->lock);
-}
-
-
-#ifdef CONFIG_HOTPLUG_CPU
-
-/* warning! helper for rcu_offline_cpu. do not use elsewhere without reviewing
- * locking requirements, the list it's pulling from has to belong to a cpu
- * which is dead and hence not processing interrupts.
- */
-static void rcu_move_batch(struct rcu_data *this_rdp, struct rcu_head *list,
-				struct rcu_head **tail)
-{
-	local_irq_disable();
-	*this_rdp->nxttail = list;
-	if (list)
-		this_rdp->nxttail = tail;
-	local_irq_enable();
-}
-
-static void __rcu_offline_cpu(struct rcu_data *this_rdp,
-				struct rcu_ctrlblk *rcp, struct rcu_data *rdp)
-{
-	/* if the cpu going offline owns the grace period
-	 * we can block indefinitely waiting for it, so flush
-	 * it here
-	 */
-	spin_lock_bh(&rcp->lock);
-	if (rcp->cur != rcp->completed)
-		cpu_quiet(rdp->cpu, rcp);
-	spin_unlock_bh(&rcp->lock);
-	rcu_move_batch(this_rdp, rdp->curlist, rdp->curtail);
-	rcu_move_batch(this_rdp, rdp->nxtlist, rdp->nxttail);
-	rcu_move_batch(this_rdp, rdp->donelist, rdp->donetail);
-}
-
-static void rcu_offline_cpu(int cpu)
-{
-	struct rcu_data *this_rdp = &get_cpu_var(rcu_data);
-	struct rcu_data *this_bh_rdp = &get_cpu_var(rcu_bh_data);
-
-	__rcu_offline_cpu(this_rdp, &rcu_ctrlblk,
-					&per_cpu(rcu_data, cpu));
-	__rcu_offline_cpu(this_bh_rdp, &rcu_bh_ctrlblk,
-					&per_cpu(rcu_bh_data, cpu));
-	put_cpu_var(rcu_data);
-	put_cpu_var(rcu_bh_data);
-	tasklet_kill_immediate(&per_cpu(rcu_tasklet, cpu), cpu);
-}
-
-#else
-
-static void rcu_offline_cpu(int cpu)
-{
-}
-
-#endif
-
-/*
- * This does the RCU processing work from tasklet context. 
- */
-static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp,
-					struct rcu_data *rdp)
-{
-	if (rdp->curlist && !rcu_batch_before(rcp->completed, rdp->batch)) {
-		*rdp->donetail = rdp->curlist;
-		rdp->donetail = rdp->curtail;
-		rdp->curlist = NULL;
-		rdp->curtail = &rdp->curlist;
-	}
-
-	if (rdp->nxtlist && !rdp->curlist) {
-		local_irq_disable();
-		rdp->curlist = rdp->nxtlist;
-		rdp->curtail = rdp->nxttail;
-		rdp->nxtlist = NULL;
-		rdp->nxttail = &rdp->nxtlist;
-		local_irq_enable();
-
-		/*
-		 * start the next batch of callbacks
-		 */
-
-		/* determine batch number */
-		rdp->batch = rcp->cur + 1;
-		/* see the comment and corresponding wmb() in
-		 * the rcu_start_batch()
-		 */
-		smp_rmb();
-
-		if (!rcp->next_pending) {
-			/* and start it/schedule start if it's a new batch */
-			spin_lock(&rcp->lock);
-			rcp->next_pending = 1;
-			rcu_start_batch(rcp);
-			spin_unlock(&rcp->lock);
-		}
-	}
-
-	rcu_check_quiescent_state(rcp, rdp);
-	if (rdp->donelist)
-		rcu_do_batch(rdp);
-}
-
-static void rcu_process_callbacks(unsigned long unused)
-{
-	__rcu_process_callbacks(&rcu_ctrlblk, &__get_cpu_var(rcu_data));
-	__rcu_process_callbacks(&rcu_bh_ctrlblk, &__get_cpu_var(rcu_bh_data));
-}
-
-static int __rcu_pending(struct rcu_ctrlblk *rcp, struct rcu_data *rdp)
-{
-	/* This cpu has pending rcu entries and the grace period
-	 * for them has completed.
-	 */
-	if (rdp->curlist && !rcu_batch_before(rcp->completed, rdp->batch))
-		return 1;
-
-	/* This cpu has no pending entries, but there are new entries */
-	if (!rdp->curlist && rdp->nxtlist)
-		return 1;
-
-	/* This cpu has finished callbacks to invoke */
-	if (rdp->donelist)
-		return 1;
-
-	/* The rcu core waits for a quiescent state from the cpu */
-	if (rdp->quiescbatch != rcp->cur || rdp->qs_pending)
-		return 1;
-
-	/* nothing to do */
-	return 0;
-}
-
-/*
- * Check to see if there is any immediate RCU-related work to be done
- * by the current CPU, returning 1 if so.  This function is part of the
- * RCU implementation; it is -not- an exported member of the RCU API.
- */
-int rcu_pending(int cpu)
-{
-	return __rcu_pending(&rcu_ctrlblk, &per_cpu(rcu_data, cpu)) ||
-		__rcu_pending(&rcu_bh_ctrlblk, &per_cpu(rcu_bh_data, cpu));
-}
-
-/*
- * Check to see if any future RCU-related work will need to be done
- * by the current CPU, even if none need be done immediately, returning
- * 1 if so.  This function is part of the RCU implementation; it is -not-
- * an exported member of the RCU API.
- */
-int rcu_needs_cpu(int cpu)
-{
-	struct rcu_data *rdp = &per_cpu(rcu_data, cpu);
-	struct rcu_data *rdp_bh = &per_cpu(rcu_bh_data, cpu);
-
-	return (!!rdp->curlist || !!rdp_bh->curlist || rcu_pending(cpu));
-}
-
-void rcu_check_callbacks(int cpu, int user)
-{
-	if (user || 
-	    (idle_cpu(cpu) && !in_softirq() && 
-				hardirq_count() <= (1 << HARDIRQ_SHIFT))) {
-		rcu_qsctr_inc(cpu);
-		rcu_bh_qsctr_inc(cpu);
-	} else if (!in_softirq())
-		rcu_bh_qsctr_inc(cpu);
-	tasklet_schedule(&per_cpu(rcu_tasklet, cpu));
-}
-
-static void rcu_init_percpu_data(int cpu, struct rcu_ctrlblk *rcp,
-						struct rcu_data *rdp)
-{
-	memset(rdp, 0, sizeof(*rdp));
-	rdp->curtail = &rdp->curlist;
-	rdp->nxttail = &rdp->nxtlist;
-	rdp->donetail = &rdp->donelist;
-	rdp->quiescbatch = rcp->completed;
-	rdp->qs_pending = 0;
-	rdp->cpu = cpu;
-	rdp->blimit = blimit;
-}
-
-static void __cpuinit rcu_online_cpu(int cpu)
-{
-	struct rcu_data *rdp = &per_cpu(rcu_data, cpu);
-	struct rcu_data *bh_rdp = &per_cpu(rcu_bh_data, cpu);
-
-	rcu_init_percpu_data(cpu, &rcu_ctrlblk, rdp);
-	rcu_init_percpu_data(cpu, &rcu_bh_ctrlblk, bh_rdp);
-	tasklet_init(&per_cpu(rcu_tasklet, cpu), rcu_process_callbacks, 0UL);
-}
-
-static int __cpuinit rcu_cpu_notify(struct notifier_block *self,
-				unsigned long action, void *hcpu)
-{
-	long cpu = (long)hcpu;
-	switch (action) {
-	case CPU_UP_PREPARE:
-	case CPU_UP_PREPARE_FROZEN:
-		rcu_online_cpu(cpu);
-		break;
-	case CPU_DEAD:
-	case CPU_DEAD_FROZEN:
-		rcu_offline_cpu(cpu);
-		break;
-	default:
-		break;
-	}
-	return NOTIFY_OK;
-}
-
-static struct notifier_block __cpuinitdata rcu_nb = {
-	.notifier_call	= rcu_cpu_notify,
-};
-
-/*
- * Initializes rcu mechanism.  Assumed to be called early.
- * That is before local timer(SMP) or jiffie timer (uniproc) is setup.
- * Note that rcu_qsctr and friends are implicitly
- * initialized due to the choice of ``0'' for RCU_CTR_INVALID.
- */
 void __init rcu_init(void)
 void __init rcu_init(void)
 {
 {
-	rcu_cpu_notify(&rcu_nb, CPU_UP_PREPARE,
-			(void *)(long)smp_processor_id());
-	/* Register notifier for non-boot CPUs */
-	register_cpu_notifier(&rcu_nb);
-}
-
-struct rcu_synchronize {
-	struct rcu_head head;
-	struct completion completion;
-};
-
-/* Because of FASTCALL declaration of complete, we use this wrapper */
-static void wakeme_after_rcu(struct rcu_head  *head)
-{
-	struct rcu_synchronize *rcu;
-
-	rcu = container_of(head, struct rcu_synchronize, head);
-	complete(&rcu->completion);
+	__rcu_init();
 }
 }
 
 
-/**
- * synchronize_rcu - wait until a grace period has elapsed.
- *
- * Control will return to the caller some time after a full grace
- * period has elapsed, in other words after all currently executing RCU
- * read-side critical sections have completed.  RCU read-side critical
- * sections are delimited by rcu_read_lock() and rcu_read_unlock(),
- * and may be nested.
- *
- * If your read-side code is not protected by rcu_read_lock(), do -not-
- * use synchronize_rcu().
- */
-void synchronize_rcu(void)
-{
-	struct rcu_synchronize rcu;
-
-	init_completion(&rcu.completion);
-	/* Will wake me after RCU finished */
-	call_rcu(&rcu.head, wakeme_after_rcu);
-
-	/* Wait for it */
-	wait_for_completion(&rcu.completion);
-}
-
-module_param(blimit, int, 0);
-module_param(qhimark, int, 0);
-module_param(qlowmark, int, 0);
-EXPORT_SYMBOL_GPL(rcu_batches_completed);
-EXPORT_SYMBOL_GPL(rcu_batches_completed_bh);
-EXPORT_SYMBOL_GPL(call_rcu);
-EXPORT_SYMBOL_GPL(call_rcu_bh);
-EXPORT_SYMBOL_GPL(synchronize_rcu);

+ 953 - 0
kernel/rcupreempt.c

@@ -0,0 +1,953 @@
+/*
+ * Read-Copy Update mechanism for mutual exclusion, realtime implementation
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+ *
+ * Copyright IBM Corporation, 2006
+ *
+ * Authors: Paul E. McKenney <paulmck@us.ibm.com>
+ *		With thanks to Esben Nielsen, Bill Huey, and Ingo Molnar
+ *		for pushing me away from locks and towards counters, and
+ *		to Suparna Bhattacharya for pushing me completely away
+ *		from atomic instructions on the read side.
+ *
+ * Papers:  http://www.rdrop.com/users/paulmck/RCU
+ *
+ * Design Document: http://lwn.net/Articles/253651/
+ *
+ * For detailed explanation of Read-Copy Update mechanism see -
+ * 		Documentation/RCU/ *.txt
+ *
+ */
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/spinlock.h>
+#include <linux/smp.h>
+#include <linux/rcupdate.h>
+#include <linux/interrupt.h>
+#include <linux/sched.h>
+#include <asm/atomic.h>
+#include <linux/bitops.h>
+#include <linux/module.h>
+#include <linux/completion.h>
+#include <linux/moduleparam.h>
+#include <linux/percpu.h>
+#include <linux/notifier.h>
+#include <linux/rcupdate.h>
+#include <linux/cpu.h>
+#include <linux/random.h>
+#include <linux/delay.h>
+#include <linux/byteorder/swabb.h>
+#include <linux/cpumask.h>
+#include <linux/rcupreempt_trace.h>
+
+/*
+ * Macro that prevents the compiler from reordering accesses, but does
+ * absolutely -nothing- to prevent CPUs from reordering.  This is used
+ * only to mediate communication between mainline code and hardware
+ * interrupt and NMI handlers.
+ */
+#define ACCESS_ONCE(x) (*(volatile typeof(x) *)&(x))
+
+/*
+ * PREEMPT_RCU data structures.
+ */
+
+/*
+ * GP_STAGES specifies the number of times the state machine has
+ * to go through all the rcu_try_flip_states (see below)
+ * in a single Grace Period.
+ *
+ * GP in GP_STAGES stands for Grace Period ;)
+ */
+#define GP_STAGES    2
+struct rcu_data {
+	spinlock_t	lock;		/* Protect rcu_data fields. */
+	long		completed;	/* Number of last completed batch. */
+	/* Count of non-empty wait stages, set by __rcu_advance_callbacks(). */
+	int		waitlistcount;
+	/* NOTE(review): not referenced elsewhere in this file -- confirm it is still needed. */
+	struct tasklet_struct rcu_tasklet;
+	/* Callbacks queued by call_rcu() that have not yet entered a wait stage. */
+	struct rcu_head *nextlist;
+	struct rcu_head **nexttail;
+	/* One callback list per stage; shifted when a counter flip is noticed. */
+	struct rcu_head *waitlist[GP_STAGES];
+	struct rcu_head **waittail[GP_STAGES];
+	/* Callbacks ready to be invoked by rcu_process_callbacks(). */
+	struct rcu_head *donelist;
+	struct rcu_head **donetail;
+	/* Read-side counters, indexed by the low bit of rcu_ctrlblk.completed. */
+	long rcu_flipctr[2];
+#ifdef CONFIG_RCU_TRACE
+	struct rcupreempt_trace trace;
+#endif /* #ifdef CONFIG_RCU_TRACE */
+};
+
+/*
+ * States for rcu_try_flip() and friends.
+ */
+
+enum rcu_try_flip_states {
+
+	/*
+	 * Stay here if nothing is happening. Flip the counter if something
+	 * starts happening. Denoted by "I"
+	 */
+	rcu_try_flip_idle_state,
+
+	/*
+	 * Wait here for all CPUs to notice that the counter has flipped. This
+	 * prevents the old set of counters from ever being incremented once
+	 * we leave this state, which in turn is necessary because we cannot
+	 * test any individual counter for zero -- we can only check the sum.
+	 * Denoted by "A".
+	 */
+	rcu_try_flip_waitack_state,
+
+	/*
+	 * Wait here for the sum of the old per-CPU counters to reach zero.
+	 * Denoted by "Z".
+	 */
+	rcu_try_flip_waitzero_state,
+
+	/*
+	 * Wait here for each of the other CPUs to execute a memory barrier.
+	 * This is necessary to ensure that these other CPUs really have
+	 * completed executing their RCU read-side critical sections, despite
+	 * their CPUs wildly reordering memory. Denoted by "M".
+	 */
+	rcu_try_flip_waitmb_state,
+};
+
+struct rcu_ctrlblk {
+	spinlock_t	fliplock;	/* Protect state-machine transitions. */
+	long		completed;	/* Number of last completed batch. */
+	enum rcu_try_flip_states rcu_try_flip_state; /* The current state of
+							the rcu state machine */
+};
+
+static DEFINE_PER_CPU(struct rcu_data, rcu_data);
+static struct rcu_ctrlblk rcu_ctrlblk = {
+	.fliplock = __SPIN_LOCK_UNLOCKED(rcu_ctrlblk.fliplock),
+	.completed = 0,
+	.rcu_try_flip_state = rcu_try_flip_idle_state,
+};
+
+
+#ifdef CONFIG_RCU_TRACE
+static char *rcu_try_flip_state_names[] =
+	{ "idle", "waitack", "waitzero", "waitmb" };
+#endif /* #ifdef CONFIG_RCU_TRACE */
+
+static cpumask_t rcu_cpu_online_map __read_mostly = CPU_MASK_NONE;
+
+/*
+ * Enum and per-CPU flag to determine when each CPU has seen
+ * the most recent counter flip.
+ */
+
+enum rcu_flip_flag_values {
+	rcu_flip_seen,		/* Steady/initial state, last flip seen. */
+				/* Only GP detector can update. */
+	rcu_flipped		/* Flip just completed, need confirmation. */
+				/* Only corresponding CPU can update. */
+};
+static DEFINE_PER_CPU_SHARED_ALIGNED(enum rcu_flip_flag_values, rcu_flip_flag)
+								= rcu_flip_seen;
+
+/*
+ * Enum and per-CPU flag to determine when each CPU has executed the
+ * needed memory barrier to fence in memory references from its last RCU
+ * read-side critical section in the just-completed grace period.
+ */
+
+enum rcu_mb_flag_values {
+	rcu_mb_done,		/* Steady/initial state, no mb()s required. */
+				/* Only GP detector can update. */
+	rcu_mb_needed		/* Flip just completed, need an mb(). */
+				/* Only corresponding CPU can update. */
+};
+static DEFINE_PER_CPU_SHARED_ALIGNED(enum rcu_mb_flag_values, rcu_mb_flag)
+								= rcu_mb_done;
+
+/*
+ * RCU_DATA_ME: find the current CPU's rcu_data structure.
+ * RCU_DATA_CPU: find the specified CPU's rcu_data structure.
+ */
+#define RCU_DATA_ME()		(&__get_cpu_var(rcu_data))
+#define RCU_DATA_CPU(cpu)	(&per_cpu(rcu_data, cpu))
+
+/*
+ * Helper macro for tracing when the appropriate rcu_data is not
+ * cached in a local variable, but where the CPU number is so cached.
+ *
+ * None of these helpers ends in a semicolon: the caller supplies it,
+ * so the wrapper itself never adds an extra empty statement.
+ */
+#define RCU_TRACE_CPU(f, cpu) RCU_TRACE(f, &(RCU_DATA_CPU(cpu)->trace))
+
+/*
+ * Helper macro for tracing when the appropriate rcu_data is not
+ * cached in a local variable.
+ */
+#define RCU_TRACE_ME(f) RCU_TRACE(f, &(RCU_DATA_ME()->trace))
+
+/*
+ * Helper macro for tracing when the appropriate rcu_data is pointed
+ * to by a local variable.
+ */
+#define RCU_TRACE_RDP(f, rdp) RCU_TRACE(f, &((rdp)->trace))
+
+/*
+ * Return the number of RCU batches processed thus far.  Useful
+ * for debug and statistics.
+ */
+long rcu_batches_completed(void)
+{
+	/* Plain, lockless read of the global flip counter. */
+	return rcu_ctrlblk.completed;
+}
+EXPORT_SYMBOL_GPL(rcu_batches_completed);
+
+/*
+ * NOTE(review): rcu_batches_completed_bh is not defined in this file;
+ * presumably a header maps it onto rcu_batches_completed -- confirm.
+ */
+EXPORT_SYMBOL_GPL(rcu_batches_completed_bh);
+
+/*
+ * Enter an RCU read-side critical section.  A nested call only bumps
+ * the task's ->rcu_read_lock_nesting.  The outermost call additionally
+ * increments this CPU's rcu_flipctr[] element selected by the low bit
+ * of rcu_ctrlblk.completed, and records that index in the task so that
+ * the matching __rcu_read_unlock() decrements the same index.
+ */
+void __rcu_read_lock(void)
+{
+	int idx;
+	struct task_struct *t = current;
+	int nesting;
+
+	nesting = ACCESS_ONCE(t->rcu_read_lock_nesting);
+	if (nesting != 0) {
+
+		/* An earlier rcu_read_lock() covers us, just count it. */
+
+		t->rcu_read_lock_nesting = nesting + 1;
+
+	} else {
+		unsigned long flags;
+
+		/*
+		 * We disable interrupts for the following reasons:
+		 * - If we get scheduling clock interrupt here, and we
+		 *   end up acking the counter flip, it's like a promise
+		 *   that we will never increment the old counter again.
+		 *   Thus we will break that promise if that
+		 *   scheduling clock interrupt happens between the time
+		 *   we pick the .completed field and the time that we
+		 *   increment our counter.
+		 *
+		 * - We don't want to be preempted out here.
+		 *
+		 * NMIs can still occur, of course, and might themselves
+		 * contain rcu_read_lock().
+		 */
+
+		local_irq_save(flags);
+
+		/*
+		 * Outermost nesting of rcu_read_lock(), so increment
+		 * the current counter for the current CPU.  Use volatile
+		 * casts to prevent the compiler from reordering.
+		 */
+
+		idx = ACCESS_ONCE(rcu_ctrlblk.completed) & 0x1;
+		ACCESS_ONCE(RCU_DATA_ME()->rcu_flipctr[idx])++;
+
+		/*
+		 * Now that the per-CPU counter has been incremented, we
+		 * are protected from races with rcu_read_lock() invoked
+		 * from NMI handlers on this CPU.  We can therefore safely
+		 * increment the nesting counter, relieving further NMIs
+		 * of the need to increment the per-CPU counter.
+		 */
+
+		ACCESS_ONCE(t->rcu_read_lock_nesting) = nesting + 1;
+
+		/*
+		 * Now that we have prevented any NMIs from storing
+		 * to ->rcu_flipctr_idx, we can safely use it to
+		 * remember which counter to decrement in the matching
+		 * rcu_read_unlock().
+		 */
+
+		ACCESS_ONCE(t->rcu_flipctr_idx) = idx;
+		local_irq_restore(flags);
+	}
+}
+EXPORT_SYMBOL_GPL(__rcu_read_lock);
+
+/*
+ * Leave an RCU read-side critical section.  A nested call only drops
+ * the task's ->rcu_read_lock_nesting.  The outermost call additionally
+ * decrements the current CPU's rcu_flipctr[] element at the index that
+ * the matching __rcu_read_lock() saved in ->rcu_flipctr_idx.
+ */
+void __rcu_read_unlock(void)
+{
+	int idx;
+	struct task_struct *t = current;
+	int nesting;
+
+	nesting = ACCESS_ONCE(t->rcu_read_lock_nesting);
+	if (nesting > 1) {
+
+		/*
+		 * We are still protected by the enclosing rcu_read_lock(),
+		 * so simply decrement the counter.
+		 */
+
+		t->rcu_read_lock_nesting = nesting - 1;
+
+	} else {
+		unsigned long flags;
+
+		/*
+		 * Disable local interrupts to prevent the grace-period
+		 * detection state machine from seeing us half-done.
+		 * NMIs can still occur, of course, and might themselves
+		 * contain rcu_read_lock() and rcu_read_unlock().
+		 */
+
+		local_irq_save(flags);
+
+		/*
+		 * Outermost nesting of rcu_read_unlock(), so we must
+		 * decrement the current counter for the current CPU.
+		 * This must be done carefully, because NMIs can
+		 * occur at any point in this code, and any rcu_read_lock()
+		 * and rcu_read_unlock() pairs in the NMI handlers
+		 * must interact non-destructively with this code.
+		 * Lots of volatile casts, and -very- careful ordering.
+		 *
+		 * Changes to this code, including this one, must be
+		 * inspected, validated, and tested extremely carefully!!!
+		 */
+
+		/*
+		 * First, pick up the index.
+		 */
+
+		idx = ACCESS_ONCE(t->rcu_flipctr_idx);
+
+		/*
+		 * Now that we have fetched the counter index, it is
+		 * safe to decrement the per-task RCU nesting counter.
+		 * After this, any interrupts or NMIs will increment and
+		 * decrement the per-CPU counters.
+		 */
+		ACCESS_ONCE(t->rcu_read_lock_nesting) = nesting - 1;
+
+		/*
+		 * The nesting count has been dropped, so it is now safe
+		 * to decrement the per-CPU counter.  NMIs that occur
+		 * after the statement above will route their
+		 * rcu_read_lock() calls through the "else" clause of
+		 * __rcu_read_lock(), and will thus start incrementing
+		 * the per-CPU counter on their own.  They will also
+		 * clobber ->rcu_flipctr_idx, but that is OK, since we
+		 * have already fetched it.
+		 */
+
+		ACCESS_ONCE(RCU_DATA_ME()->rcu_flipctr[idx])--;
+		local_irq_restore(flags);
+	}
+}
+EXPORT_SYMBOL_GPL(__rcu_read_unlock);
+
+/*
+ * If a global counter flip has occurred since the last time that we
+ * advanced callbacks, advance them.  Hardware interrupts must be
+ * disabled when calling this function.
+ */
+static void __rcu_advance_callbacks(struct rcu_data *rdp)
+{
+	int cpu;
+	int i;
+	int wlc = 0;	/* Number of non-empty wait stages after the shift. */
+
+	/*
+	 * Lists move forward by exactly one stage per call, however far
+	 * rdp->completed lags behind rcu_ctrlblk.completed.
+	 */
+	if (rdp->completed != rcu_ctrlblk.completed) {
+		/* The final wait stage is appended to the done list. */
+		if (rdp->waitlist[GP_STAGES - 1] != NULL) {
+			*rdp->donetail = rdp->waitlist[GP_STAGES - 1];
+			rdp->donetail = rdp->waittail[GP_STAGES - 1];
+			RCU_TRACE_RDP(rcupreempt_trace_move2done, rdp);
+		}
+		/* Shift the remaining stages up by one, last stage first. */
+		for (i = GP_STAGES - 2; i >= 0; i--) {
+			if (rdp->waitlist[i] != NULL) {
+				rdp->waitlist[i + 1] = rdp->waitlist[i];
+				rdp->waittail[i + 1] = rdp->waittail[i];
+				wlc++;
+			} else {
+				rdp->waitlist[i + 1] = NULL;
+				rdp->waittail[i + 1] =
+					&rdp->waitlist[i + 1];
+			}
+		}
+		/* Newly queued callbacks enter the first wait stage. */
+		if (rdp->nextlist != NULL) {
+			rdp->waitlist[0] = rdp->nextlist;
+			rdp->waittail[0] = rdp->nexttail;
+			wlc++;
+			rdp->nextlist = NULL;
+			rdp->nexttail = &rdp->nextlist;
+			RCU_TRACE_RDP(rcupreempt_trace_move2wait, rdp);
+		} else {
+			rdp->waitlist[0] = NULL;
+			rdp->waittail[0] = &rdp->waitlist[0];
+		}
+		rdp->waitlistcount = wlc;
+		rdp->completed = rcu_ctrlblk.completed;
+	}
+
+	/*
+	 * Check to see if this CPU needs to report that it has seen
+	 * the most recent counter flip, thereby declaring that all
+	 * subsequent rcu_read_lock() invocations will respect this flip.
+	 */
+
+	cpu = raw_smp_processor_id();
+	if (per_cpu(rcu_flip_flag, cpu) == rcu_flipped) {
+		smp_mb();  /* Subsequent counter accesses must see new value */
+		per_cpu(rcu_flip_flag, cpu) = rcu_flip_seen;
+		smp_mb();  /* Subsequent RCU read-side critical sections */
+			   /*  seen -after- acknowledgement. */
+	}
+}
+
+/*
+ * Get here when RCU is idle.  Decide whether we need to
+ * move out of idle state, and return non-zero if so.
+ * "Straightforward" approach for the moment, might later
+ * use callback-list lengths, grace-period duration, or
+ * some such to determine when to exit idle state.
+ * Might also need a pre-idle test that does not acquire
+ * the lock, but let's get the simple case working first...
+ */
+
+static int
+rcu_try_flip_idle(void)
+{
+	int cpu;
+
+	RCU_TRACE_ME(rcupreempt_trace_try_flip_i1);
+	/* Only the CPU running this code is asked whether work is pending. */
+	if (!rcu_pending(smp_processor_id())) {
+		RCU_TRACE_ME(rcupreempt_trace_try_flip_ie1);
+		return 0;
+	}
+
+	/*
+	 * Do the flip.
+	 */
+
+	RCU_TRACE_ME(rcupreempt_trace_try_flip_g1);
+	rcu_ctrlblk.completed++;  /* stands in for rcu_try_flip_g2 */
+
+	/*
+	 * Need a memory barrier so that other CPUs see the new
+	 * counter value before they see the subsequent change of all
+	 * the rcu_flip_flag instances to rcu_flipped.
+	 */
+
+	smp_mb();	/* see above block comment. */
+
+	/* Now ask each CPU for acknowledgement of the flip. */
+
+	for_each_cpu_mask(cpu, rcu_cpu_online_map)
+		per_cpu(rcu_flip_flag, cpu) = rcu_flipped;
+
+	return 1;
+}
+
+/*
+ * Wait for CPUs to acknowledge the flip.
+ */
+
+static int
+rcu_try_flip_waitack(void)
+{
+	int cpu;
+
+	RCU_TRACE_ME(rcupreempt_trace_try_flip_a1);
+	/* Return 0 as soon as one CPU in rcu_cpu_online_map has not acked. */
+	for_each_cpu_mask(cpu, rcu_cpu_online_map)
+		if (per_cpu(rcu_flip_flag, cpu) != rcu_flip_seen) {
+			RCU_TRACE_ME(rcupreempt_trace_try_flip_ae1);
+			return 0;
+		}
+
+	/*
+	 * Make sure our checks above don't bleed into subsequent
+	 * waiting for the sum of the counters to reach zero.
+	 */
+
+	smp_mb();	/* see above block comment. */
+	RCU_TRACE_ME(rcupreempt_trace_try_flip_a2);
+	return 1;
+}
+
+/*
+ * Wait for collective ``last'' counter to reach zero,
+ * then tell all CPUs to do an end-of-grace-period memory barrier.
+ */
+
+static int
+rcu_try_flip_waitzero(void)
+{
+	int cpu;
+	/* Index of the counter set that was current before the last flip. */
+	int lastidx = !(rcu_ctrlblk.completed & 0x1);
+	/* NOTE(review): rcu_flipctr[] elements are long but are summed into an int -- confirm intended. */
+	int sum = 0;
+
+	/* Check to see if the sum of the "last" counters is zero. */
+
+	RCU_TRACE_ME(rcupreempt_trace_try_flip_z1);
+	for_each_cpu_mask(cpu, rcu_cpu_online_map)
+		sum += RCU_DATA_CPU(cpu)->rcu_flipctr[lastidx];
+	if (sum != 0) {
+		RCU_TRACE_ME(rcupreempt_trace_try_flip_ze1);
+		return 0;
+	}
+
+	/*
+	 * This ensures that the other CPUs see the call for
+	 * memory barriers -after- the sum to zero has been
+	 * detected here
+	 */
+	smp_mb();  /*  ^^^^^^^^^^^^ */
+
+	/* Call for a memory barrier from each CPU. */
+	for_each_cpu_mask(cpu, rcu_cpu_online_map)
+		per_cpu(rcu_mb_flag, cpu) = rcu_mb_needed;
+
+	RCU_TRACE_ME(rcupreempt_trace_try_flip_z2);
+	return 1;
+}
+
+/*
+ * Wait for all CPUs to do their end-of-grace-period memory barrier.
+ * Return 1 once all CPUs have done so, and 0 while any CPU in
+ * rcu_cpu_online_map still has a barrier outstanding.
+ */
+
+static int
+rcu_try_flip_waitmb(void)
+{
+	int cpu;
+
+	RCU_TRACE_ME(rcupreempt_trace_try_flip_m1);
+	for_each_cpu_mask(cpu, rcu_cpu_online_map)
+		if (per_cpu(rcu_mb_flag, cpu) != rcu_mb_done) {
+			RCU_TRACE_ME(rcupreempt_trace_try_flip_me1);
+			return 0;
+		}
+
+	smp_mb(); /* Ensure that the above checks precede any following flip. */
+	RCU_TRACE_ME(rcupreempt_trace_try_flip_m2);
+	return 1;
+}
+
+/*
+ * Attempt a single flip of the counters.  Remember, a single flip does
+ * -not- constitute a grace period.  Instead, the interval between
+ * at least GP_STAGES consecutive flips is a grace period.
+ *
+ * If anyone is nuts enough to run this CONFIG_PREEMPT_RCU implementation
+ * on a large SMP, they might want to use a hierarchical organization of
+ * the per-CPU-counter pairs.
+ */
+static void rcu_try_flip(void)
+{
+	unsigned long flags;
+
+	RCU_TRACE_ME(rcupreempt_trace_try_flip_1);
+	/* Do not spin: if the flip lock is already held, give up for now. */
+	if (unlikely(!spin_trylock_irqsave(&rcu_ctrlblk.fliplock, flags))) {
+		RCU_TRACE_ME(rcupreempt_trace_try_flip_e1);
+		return;
+	}
+
+	/*
+	 * Take the next transition(s) through the RCU grace-period
+	 * flip-counter state machine.
+	 */
+
+	/* At most one state transition is attempted per call. */
+	switch (rcu_ctrlblk.rcu_try_flip_state) {
+	case rcu_try_flip_idle_state:
+		if (rcu_try_flip_idle())
+			rcu_ctrlblk.rcu_try_flip_state =
+				rcu_try_flip_waitack_state;
+		break;
+	case rcu_try_flip_waitack_state:
+		if (rcu_try_flip_waitack())
+			rcu_ctrlblk.rcu_try_flip_state =
+				rcu_try_flip_waitzero_state;
+		break;
+	case rcu_try_flip_waitzero_state:
+		if (rcu_try_flip_waitzero())
+			rcu_ctrlblk.rcu_try_flip_state =
+				rcu_try_flip_waitmb_state;
+		break;
+	case rcu_try_flip_waitmb_state:
+		if (rcu_try_flip_waitmb())
+			rcu_ctrlblk.rcu_try_flip_state =
+				rcu_try_flip_idle_state;
+		/* Last case: control simply leaves the switch. */
+	}
+	spin_unlock_irqrestore(&rcu_ctrlblk.fliplock, flags);
+}
+
+/*
+ * Check to see if this CPU needs to do a memory barrier in order to
+ * ensure that any prior RCU read-side critical sections have committed
+ * their counter manipulations and critical-section memory references
+ * before declaring the grace period to be completed.
+ */
+static void rcu_check_mb(int cpu)
+{
+	/* Nothing to do unless a barrier has been requested from this CPU. */
+	if (per_cpu(rcu_mb_flag, cpu) != rcu_mb_needed)
+		return;
+
+	smp_mb();  /* Ensure RCU read-side accesses are visible. */
+	per_cpu(rcu_mb_flag, cpu) = rcu_mb_done;
+}
+
+/*
+ * Perform any requested memory barrier, try to move the grace-period
+ * state machine forward if this CPU is caught up with the global
+ * counter, advance this CPU's callback lists, and raise RCU_SOFTIRQ
+ * when callbacks are ready to be invoked.
+ */
+void rcu_check_callbacks(int cpu, int user)
+{
+	struct rcu_data *rdp = RCU_DATA_CPU(cpu);
+	unsigned long flags;
+	int have_done;
+
+	rcu_check_mb(cpu);
+	if (rcu_ctrlblk.completed == rdp->completed)
+		rcu_try_flip();
+
+	spin_lock_irqsave(&rdp->lock, flags);
+	RCU_TRACE_RDP(rcupreempt_trace_check_callbacks, rdp);
+	__rcu_advance_callbacks(rdp);
+	/* Sample the done list while the lock is still held. */
+	have_done = (rdp->donelist != NULL);
+	spin_unlock_irqrestore(&rdp->lock, flags);
+
+	if (have_done)
+		raise_softirq(RCU_SOFTIRQ);
+}
+
+/*
+ * Needed by dynticks, to make sure all RCU processing has finished
+ * when we go idle:
+ */
+void rcu_advance_callbacks(int cpu, int user)
+{
+	/* The "user" argument is not used by this function. */
+	unsigned long flags;
+	struct rcu_data *rdp = RCU_DATA_CPU(cpu);
+
+	if (rcu_ctrlblk.completed == rdp->completed) {
+		rcu_try_flip();
+		/* Still caught up after the attempt: nothing to advance. */
+		if (rcu_ctrlblk.completed == rdp->completed)
+			return;
+	}
+	spin_lock_irqsave(&rdp->lock, flags);
+	RCU_TRACE_RDP(rcupreempt_trace_check_callbacks, rdp);
+	__rcu_advance_callbacks(rdp);
+	spin_unlock_irqrestore(&rdp->lock, flags);
+}
+
+#ifdef CONFIG_HOTPLUG_CPU
+/*
+ * Append the list (srclist, srctail) to the list ending at dsttail.
+ * When the source is non-empty, dsttail takes over the source tail and
+ * the source is reset to an empty list.
+ */
+#define rcu_offline_cpu_enqueue(srclist, srctail, dstlist, dsttail) do { \
+		*dsttail = srclist; \
+		if (srclist != NULL) { \
+			dsttail = srctail; \
+			srclist = NULL; \
+			srctail = &srclist;\
+		} \
+	} while (0)
+
+/*
+ * Take the CPU "cpu" out of RCU's view: collect all of its callbacks,
+ * satisfy any flip acknowledgement or memory barrier still owed by it,
+ * fold its rcu_flipctr[] values into the current CPU's counters, clear
+ * it from rcu_cpu_online_map, and requeue the collected callbacks on
+ * the current CPU's next list.
+ */
+void rcu_offline_cpu(int cpu)
+{
+	int i;
+	struct rcu_head *list = NULL;
+	unsigned long flags;
+	struct rcu_data *rdp = RCU_DATA_CPU(cpu);
+	struct rcu_head **tail = &list;
+
+	/*
+	 * Remove all callbacks from the newly dead CPU, retaining order.
+	 * Otherwise rcu_barrier() will fail
+	 */
+
+	spin_lock_irqsave(&rdp->lock, flags);
+	rcu_offline_cpu_enqueue(rdp->donelist, rdp->donetail, list, tail);
+	for (i = GP_STAGES - 1; i >= 0; i--)
+		rcu_offline_cpu_enqueue(rdp->waitlist[i], rdp->waittail[i],
+						list, tail);
+	rcu_offline_cpu_enqueue(rdp->nextlist, rdp->nexttail, list, tail);
+	/* Every wait stage is now empty; update the count under the lock. */
+	rdp->waitlistcount = 0;
+	spin_unlock_irqrestore(&rdp->lock, flags);
+
+	/* Disengage the newly dead CPU from the grace-period computation. */
+
+	spin_lock_irqsave(&rcu_ctrlblk.fliplock, flags);
+	rcu_check_mb(cpu);
+	if (per_cpu(rcu_flip_flag, cpu) == rcu_flipped) {
+		smp_mb();  /* Subsequent counter accesses must see new value */
+		per_cpu(rcu_flip_flag, cpu) = rcu_flip_seen;
+		smp_mb();  /* Subsequent RCU read-side critical sections */
+			   /*  seen -after- acknowledgement. */
+	}
+
+	RCU_DATA_ME()->rcu_flipctr[0] += RCU_DATA_CPU(cpu)->rcu_flipctr[0];
+	RCU_DATA_ME()->rcu_flipctr[1] += RCU_DATA_CPU(cpu)->rcu_flipctr[1];
+
+	RCU_DATA_CPU(cpu)->rcu_flipctr[0] = 0;
+	RCU_DATA_CPU(cpu)->rcu_flipctr[1] = 0;
+
+	cpu_clear(cpu, rcu_cpu_online_map);
+
+	spin_unlock_irqrestore(&rcu_ctrlblk.fliplock, flags);
+
+	/*
+	 * Place the removed callbacks on the current CPU's queue.
+	 * Make them all start a new grace period: simple approach,
+	 * in theory could starve a given set of callbacks, but
+	 * you would need to be doing some serious CPU hotplugging
+	 * to make this happen.  If this becomes a problem, adding
+	 * a synchronize_rcu() to the hotplug path would be a simple
+	 * fix.
+	 *
+	 * Interrupts are disabled before RCU_DATA_ME() is evaluated, as
+	 * in call_rcu(), so that the per-CPU pointer is picked up and
+	 * used without this code being moved to another CPU in between.
+	 */
+
+	local_irq_save(flags);
+	rdp = RCU_DATA_ME();
+	spin_lock(&rdp->lock);
+	*rdp->nexttail = list;
+	if (list)
+		rdp->nexttail = tail;
+	spin_unlock(&rdp->lock);
+	local_irq_restore(flags);
+}
+
+/*
+ * Add "cpu" to rcu_cpu_online_map, the set of CPUs that the
+ * rcu_try_flip_*() functions iterate over.  The map is updated
+ * under the flip lock.
+ */
+void __devinit rcu_online_cpu(int cpu)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(&rcu_ctrlblk.fliplock, flags);
+	cpu_set(cpu, rcu_cpu_online_map);
+	spin_unlock_irqrestore(&rcu_ctrlblk.fliplock, flags);
+}
+
+#else /* #ifdef CONFIG_HOTPLUG_CPU */
+
+/* Without CONFIG_HOTPLUG_CPU, taking a CPU offline is a no-op here. */
+void rcu_offline_cpu(int cpu)
+{
+}
+
+/*
+ * Without CONFIG_HOTPLUG_CPU this is a no-op as well.
+ * NOTE(review): rcu_cpu_online_map is therefore never populated by this
+ * path in such a build -- confirm that this is intended.
+ */
+void __devinit rcu_online_cpu(int cpu)
+{
+}
+
+#endif /* #else #ifdef CONFIG_HOTPLUG_CPU */
+
+/*
+ * RCU_SOFTIRQ handler: detach the current CPU's done list under the
+ * per-CPU lock, then invoke each callback with the lock released.
+ */
+static void rcu_process_callbacks(struct softirq_action *unused)
+{
+	struct rcu_data *rdp = RCU_DATA_ME();
+	struct rcu_head *cb, *following;
+	unsigned long flags;
+
+	spin_lock_irqsave(&rdp->lock, flags);
+	cb = rdp->donelist;
+	if (cb != NULL) {
+		rdp->donelist = NULL;
+		rdp->donetail = &rdp->donelist;
+		RCU_TRACE_RDP(rcupreempt_trace_done_remove, rdp);
+	}
+	spin_unlock_irqrestore(&rdp->lock, flags);
+
+	/* Fetch ->next before calling ->func, exactly as the list walk requires. */
+	for (; cb != NULL; cb = following) {
+		following = cb->next;
+		cb->func(cb);
+		RCU_TRACE_ME(rcupreempt_trace_invoke);
+	}
+}
+
+/*
+ * Queue "head" on the current CPU's next list so that "func" is later
+ * invoked on it by rcu_process_callbacks(), once the callback has moved
+ * through the wait stages onto the done list.
+ */
+void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu))
+{
+	unsigned long flags;
+	struct rcu_data *rdp;
+
+	head->func = func;
+	head->next = NULL;
+	/* Interrupts go off before the per-CPU pointer is picked up. */
+	local_irq_save(flags);
+	rdp = RCU_DATA_ME();
+	spin_lock(&rdp->lock);
+	/* Catch up with any flip first, then append to the next list. */
+	__rcu_advance_callbacks(rdp);
+	*rdp->nexttail = head;
+	rdp->nexttail = &head->next;
+	RCU_TRACE_RDP(rcupreempt_trace_next_add, rdp);
+	spin_unlock(&rdp->lock);
+	local_irq_restore(flags);
+}
+EXPORT_SYMBOL_GPL(call_rcu);
+
+/*
+ * Wait until all currently running preempt_disable() code segments
+ * (including hardware-irq-disable segments) complete.  Note that
+ * in -rt this does -not- necessarily result in all currently executing
+ * interrupt -handlers- having completed.
+ */
+void __synchronize_sched(void)
+{
+	cpumask_t oldmask;
+	int cpu;
+
+	/* Remember the caller's affinity; fall back to all possible CPUs. */
+	if (sched_getaffinity(0, &oldmask) < 0)
+		oldmask = cpu_possible_map;
+	/* Bind to each online CPU in turn and call schedule() there. */
+	for_each_online_cpu(cpu) {
+		/* The return value of sched_setaffinity() is not checked. */
+		sched_setaffinity(0, cpumask_of_cpu(cpu));
+		schedule();
+	}
+	sched_setaffinity(0, oldmask);
+}
+EXPORT_SYMBOL_GPL(__synchronize_sched);
+
+/*
+ * Check to see if any future RCU-related work will need to be done
+ * by the current CPU, even if none need be done immediately, returning
+ * 1 if so.  Assumes that notifiers would take care of handling any
+ * outstanding requests from the RCU core.
+ *
+ * This function is part of the RCU implementation; it is -not-
+ * an exported member of the RCU API.
+ */
+int rcu_needs_cpu(int cpu)
+{
+	struct rcu_data *rdp = RCU_DATA_CPU(cpu);
+
+	/* A callback on any of the done, wait or next lists keeps the CPU needed. */
+	if (rdp->donelist != NULL)
+		return 1;
+	if (rdp->waitlistcount != 0)
+		return 1;
+	return rdp->nextlist != NULL;
+}
+
+/*
+ * Return 1 if the given CPU has RCU work to do, namely queued
+ * callbacks, an acknowledgement or memory barrier owed to the
+ * grace-period state machine, or a stale grace-period number.
+ * Return 0 otherwise.
+ */
+int rcu_pending(int cpu)
+{
+	struct rcu_data *rdp = RCU_DATA_CPU(cpu);
+
+	/* The CPU has at least one callback queued somewhere. */
+	if (rdp->donelist != NULL)
+		return 1;
+	if (rdp->waitlistcount != 0)
+		return 1;
+	if (rdp->nextlist != NULL)
+		return 1;
+
+	/* The RCU core needs an acknowledgement from this CPU. */
+	if (per_cpu(rcu_flip_flag, cpu) == rcu_flipped)
+		return 1;
+	if (per_cpu(rcu_mb_flag, cpu) == rcu_mb_needed)
+		return 1;
+
+	/* Otherwise work is pending only if this CPU has fallen behind. */
+	return rdp->completed != rcu_ctrlblk.completed;
+}
+
+/*
+ * CPU-hotplug notifier callback: calls rcu_online_cpu() when a CPU is
+ * being prepared and rcu_offline_cpu() when bring-up is cancelled or
+ * the CPU is dead.  All other actions are ignored.  Always returns
+ * NOTIFY_OK.
+ */
+static int __cpuinit rcu_cpu_notify(struct notifier_block *self,
+				unsigned long action, void *hcpu)
+{
+	/* The CPU number arrives encoded in the opaque pointer argument. */
+	long cpu = (long)hcpu;
+
+	switch (action) {
+	case CPU_UP_PREPARE:
+	case CPU_UP_PREPARE_FROZEN:
+		rcu_online_cpu(cpu);
+		break;
+	case CPU_UP_CANCELED:
+	case CPU_UP_CANCELED_FROZEN:
+	case CPU_DEAD:
+	case CPU_DEAD_FROZEN:
+		rcu_offline_cpu(cpu);
+		break;
+	default:
+		break;
+	}
+	return NOTIFY_OK;
+}
+
+static struct notifier_block __cpuinitdata rcu_nb = {
+	.notifier_call = rcu_cpu_notify,
+};
+
+/*
+ * Boot-time setup: reset the rcu_data of every possible CPU to empty
+ * lists and zero counters, register the hotplug notifier, mark the
+ * already-online CPUs in rcu_cpu_online_map, and install the
+ * RCU_SOFTIRQ handler.
+ */
+void __init __rcu_init(void)
+{
+	int cpu;
+	int i;
+	struct rcu_data *rdp;
+
+	printk(KERN_NOTICE "Preemptible RCU implementation.\n");
+	for_each_possible_cpu(cpu) {
+		rdp = RCU_DATA_CPU(cpu);
+		spin_lock_init(&rdp->lock);
+		rdp->completed = 0;
+		rdp->waitlistcount = 0;
+		/* An empty list has its tail pointing back at its own head. */
+		rdp->nextlist = NULL;
+		rdp->nexttail = &rdp->nextlist;
+		for (i = 0; i < GP_STAGES; i++) {
+			rdp->waitlist[i] = NULL;
+			rdp->waittail[i] = &rdp->waitlist[i];
+		}
+		rdp->donelist = NULL;
+		rdp->donetail = &rdp->donelist;
+		rdp->rcu_flipctr[0] = 0;
+		rdp->rcu_flipctr[1] = 0;
+	}
+	register_cpu_notifier(&rcu_nb);
+
+	/*
+	 * We don't need protection against CPU-Hotplug here
+	 * since
+	 * a) If a CPU comes online while we are iterating over the
+	 *    cpu_online_map below, we would only end up making a
+	 *    duplicate call to rcu_online_cpu() which sets the corresponding
+	 *    CPU's mask in the rcu_cpu_online_map.
+	 *
+	 * b) A CPU cannot go offline at this point in time since the user
+	 *    does not have access to the sysfs interface, nor do we
+	 *    suspend the system.
+	 */
+	/* Invoke the notifier callback by hand for CPUs that are already up. */
+	for_each_online_cpu(cpu)
+		rcu_cpu_notify(&rcu_nb, CPU_UP_PREPARE,	(void *)(long) cpu);
+
+	open_softirq(RCU_SOFTIRQ, rcu_process_callbacks, NULL);
+}
+
+/*
+ * Deprecated, use synchronize_rcu() or synchronize_sched() instead.
+ * This function simply forwards to synchronize_rcu().
+ */
+void synchronize_kernel(void)
+{
+	synchronize_rcu();
+}
+
+#ifdef CONFIG_RCU_TRACE
+/* Return a pointer to the first element of the given CPU's rcu_flipctr[] pair. */
+long *rcupreempt_flipctr(int cpu)
+{
+	return &RCU_DATA_CPU(cpu)->rcu_flipctr[0];
+}
+EXPORT_SYMBOL_GPL(rcupreempt_flipctr);
+
+/* Return the given CPU's rcu_flip_flag value as an int. */
+int rcupreempt_flip_flag(int cpu)
+{
+	return per_cpu(rcu_flip_flag, cpu);
+}
+EXPORT_SYMBOL_GPL(rcupreempt_flip_flag);
+
+/* Return the given CPU's rcu_mb_flag value as an int. */
+int rcupreempt_mb_flag(int cpu)
+{
+	return per_cpu(rcu_mb_flag, cpu);
+}
+EXPORT_SYMBOL_GPL(rcupreempt_mb_flag);
+
+/* Return the name of the current state; read without taking the flip lock. */
+char *rcupreempt_try_flip_state_name(void)
+{
+	return rcu_try_flip_state_names[rcu_ctrlblk.rcu_try_flip_state];
+}
+EXPORT_SYMBOL_GPL(rcupreempt_try_flip_state_name);
+
+/* Return a pointer to the trace structure embedded in the given CPU's rcu_data. */
+struct rcupreempt_trace *rcupreempt_trace_cpu(int cpu)
+{
+	struct rcu_data *rdp = RCU_DATA_CPU(cpu);
+
+	return &rdp->trace;
+}
+EXPORT_SYMBOL_GPL(rcupreempt_trace_cpu);
+
+#endif /* #ifdef CONFIG_RCU_TRACE */

+ 330 - 0
kernel/rcupreempt_trace.c

@@ -0,0 +1,330 @@
+/*
+ * Read-Copy Update tracing for realtime implementation
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+ *
+ * Copyright IBM Corporation, 2006
+ *
+ * Papers:  http://www.rdrop.com/users/paulmck/RCU
+ *
+ * For detailed explanation of Read-Copy Update mechanism see -
+ * 		Documentation/RCU/ *.txt
+ *
+ */
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/spinlock.h>
+#include <linux/smp.h>
+#include <linux/rcupdate.h>
+#include <linux/interrupt.h>
+#include <linux/sched.h>
+#include <asm/atomic.h>
+#include <linux/bitops.h>
+#include <linux/module.h>
+#include <linux/completion.h>
+#include <linux/moduleparam.h>
+#include <linux/percpu.h>
+#include <linux/notifier.h>
+#include <linux/rcupdate.h>
+#include <linux/cpu.h>
+#include <linux/mutex.h>
+#include <linux/rcupreempt_trace.h>
+#include <linux/debugfs.h>
+
+static struct mutex rcupreempt_trace_mutex;
+static char *rcupreempt_trace_buf;
+#define RCUPREEMPT_TRACE_BUF_SIZE 4096
+
+/*
+ * Account for the whole wait list moving to the done list: the current
+ * wait_length is added to both done_length and the cumulative done_add
+ * counter, after which wait_length is reset to zero.
+ */
+void rcupreempt_trace_move2done(struct rcupreempt_trace *trace)
+{
+	trace->done_length += trace->wait_length;
+	trace->done_add += trace->wait_length;
+	trace->wait_length = 0;
+}
+/*
+ * Account for the whole next list moving to the wait list: the current
+ * next_length is added to both wait_length and the cumulative wait_add
+ * counter, after which next_length is reset to zero.
+ */
+void rcupreempt_trace_move2wait(struct rcupreempt_trace *trace)
+{
+	trace->wait_length += trace->next_length;
+	trace->wait_add += trace->next_length;
+	trace->next_length = 0;
+}
+/*
+ * Per-event counters.  Each helper from here through
+ * rcupreempt_trace_check_callbacks() bumps exactly one field of @trace by
+ * one: rcu_try_flip_1 and rcu_try_flip_e1 via atomic_inc(), all the
+ * others with a plain (non-atomic) increment.
+ */
+void rcupreempt_trace_try_flip_1(struct rcupreempt_trace *trace)
+{
+	atomic_inc(&trace->rcu_try_flip_1);
+}
+void rcupreempt_trace_try_flip_e1(struct rcupreempt_trace *trace)
+{
+	atomic_inc(&trace->rcu_try_flip_e1);
+}
+void rcupreempt_trace_try_flip_i1(struct rcupreempt_trace *trace)
+{
+	trace->rcu_try_flip_i1++;
+}
+void rcupreempt_trace_try_flip_ie1(struct rcupreempt_trace *trace)
+{
+	trace->rcu_try_flip_ie1++;
+}
+void rcupreempt_trace_try_flip_g1(struct rcupreempt_trace *trace)
+{
+	trace->rcu_try_flip_g1++;
+}
+void rcupreempt_trace_try_flip_a1(struct rcupreempt_trace *trace)
+{
+	trace->rcu_try_flip_a1++;
+}
+void rcupreempt_trace_try_flip_ae1(struct rcupreempt_trace *trace)
+{
+	trace->rcu_try_flip_ae1++;
+}
+void rcupreempt_trace_try_flip_a2(struct rcupreempt_trace *trace)
+{
+	trace->rcu_try_flip_a2++;
+}
+void rcupreempt_trace_try_flip_z1(struct rcupreempt_trace *trace)
+{
+	trace->rcu_try_flip_z1++;
+}
+void rcupreempt_trace_try_flip_ze1(struct rcupreempt_trace *trace)
+{
+	trace->rcu_try_flip_ze1++;
+}
+void rcupreempt_trace_try_flip_z2(struct rcupreempt_trace *trace)
+{
+	trace->rcu_try_flip_z2++;
+}
+void rcupreempt_trace_try_flip_m1(struct rcupreempt_trace *trace)
+{
+	trace->rcu_try_flip_m1++;
+}
+void rcupreempt_trace_try_flip_me1(struct rcupreempt_trace *trace)
+{
+	trace->rcu_try_flip_me1++;
+}
+void rcupreempt_trace_try_flip_m2(struct rcupreempt_trace *trace)
+{
+	trace->rcu_try_flip_m2++;
+}
+void rcupreempt_trace_check_callbacks(struct rcupreempt_trace *trace)
+{
+	trace->rcu_check_callbacks++;
+}
+/*
+ * Account for the whole done list being removed: done_length is added to
+ * the cumulative done_remove counter and then reset to zero.
+ */
+void rcupreempt_trace_done_remove(struct rcupreempt_trace *trace)
+{
+	trace->done_remove += trace->done_length;
+	trace->done_length = 0;
+}
+/* Atomically bump @trace's done_invoked counter by one. */
+void rcupreempt_trace_invoke(struct rcupreempt_trace *trace)
+{
+	atomic_inc(&trace->done_invoked);
+}
+/*
+ * Account for one addition to the next list: both the cumulative
+ * next_add counter and the current next_length go up by one.
+ */
+void rcupreempt_trace_next_add(struct rcupreempt_trace *trace)
+{
+	trace->next_add++;
+	trace->next_length++;
+}
+
+/*
+ * Fill @sp with the sum, over all possible CPUs, of every counter in the
+ * per-CPU rcupreempt_trace records.
+ *
+ * @sp is zeroed first, so every field (including the atomic_t ones)
+ * starts accumulating from zero.  The per-CPU records are read without
+ * any locking, so the totals are a best-effort snapshot.
+ */
+static void rcupreempt_trace_sum(struct rcupreempt_trace *sp)
+{
+	struct rcupreempt_trace *cp;
+	int cpu;
+
+	memset(sp, 0, sizeof(*sp));
+	for_each_possible_cpu(cpu) {
+		cp = rcupreempt_trace_cpu(cpu);
+		sp->next_length += cp->next_length;
+		sp->next_add += cp->next_add;
+		sp->wait_length += cp->wait_length;
+		sp->wait_add += cp->wait_add;
+		sp->done_length += cp->done_length;
+		sp->done_add += cp->done_add;
+		sp->done_remove += cp->done_remove;
+		/*
+		 * The atomic counters must be accumulated as well; using
+		 * atomic_set() here would leave only the last CPU's value
+		 * in the "sum".
+		 */
+		atomic_add(atomic_read(&cp->done_invoked), &sp->done_invoked);
+		sp->rcu_check_callbacks += cp->rcu_check_callbacks;
+		atomic_add(atomic_read(&cp->rcu_try_flip_1),
+			   &sp->rcu_try_flip_1);
+		atomic_add(atomic_read(&cp->rcu_try_flip_e1),
+			   &sp->rcu_try_flip_e1);
+		sp->rcu_try_flip_i1 += cp->rcu_try_flip_i1;
+		sp->rcu_try_flip_ie1 += cp->rcu_try_flip_ie1;
+		sp->rcu_try_flip_g1 += cp->rcu_try_flip_g1;
+		sp->rcu_try_flip_a1 += cp->rcu_try_flip_a1;
+		sp->rcu_try_flip_ae1 += cp->rcu_try_flip_ae1;
+		sp->rcu_try_flip_a2 += cp->rcu_try_flip_a2;
+		sp->rcu_try_flip_z1 += cp->rcu_try_flip_z1;
+		sp->rcu_try_flip_ze1 += cp->rcu_try_flip_ze1;
+		sp->rcu_try_flip_z2 += cp->rcu_try_flip_z2;
+		sp->rcu_try_flip_m1 += cp->rcu_try_flip_m1;
+		sp->rcu_try_flip_me1 += cp->rcu_try_flip_me1;
+		sp->rcu_try_flip_m2 += cp->rcu_try_flip_m2;
+	}
+}
+
+/*
+ * debugfs read handler for "rcustats".
+ *
+ * Sums the per-CPU trace counters, formats them into the shared
+ * rcupreempt_trace_buf under rcupreempt_trace_mutex, and copies the
+ * requested window of that text to userspace via
+ * simple_read_from_buffer().  Returns whatever that helper returns.
+ */
+static ssize_t rcustats_read(struct file *filp, char __user *buffer,
+				size_t count, loff_t *ppos)
+{
+	struct rcupreempt_trace trace;
+	ssize_t bcount;
+	int cnt = 0;
+
+	rcupreempt_trace_sum(&trace);
+	mutex_lock(&rcupreempt_trace_mutex);
+	/*
+	 * Advance cnt past the first line; otherwise the second snprintf()
+	 * starts at offset 0 again and overwrites the "ggp=... rcc=..." line.
+	 */
+	cnt += snprintf(&rcupreempt_trace_buf[cnt],
+		 RCUPREEMPT_TRACE_BUF_SIZE - cnt,
+		 "ggp=%ld rcc=%ld\n",
+		 rcu_batches_completed(),
+		 trace.rcu_check_callbacks);
+	snprintf(&rcupreempt_trace_buf[cnt], RCUPREEMPT_TRACE_BUF_SIZE - cnt,
+		 "na=%ld nl=%ld wa=%ld wl=%ld da=%ld dl=%ld dr=%ld di=%d\n"
+		 "1=%d e1=%d i1=%ld ie1=%ld g1=%ld a1=%ld ae1=%ld a2=%ld\n"
+		 "z1=%ld ze1=%ld z2=%ld m1=%ld me1=%ld m2=%ld\n",
+
+		 trace.next_add, trace.next_length,
+		 trace.wait_add, trace.wait_length,
+		 trace.done_add, trace.done_length,
+		 trace.done_remove, atomic_read(&trace.done_invoked),
+		 atomic_read(&trace.rcu_try_flip_1),
+		 atomic_read(&trace.rcu_try_flip_e1),
+		 trace.rcu_try_flip_i1, trace.rcu_try_flip_ie1,
+		 trace.rcu_try_flip_g1,
+		 trace.rcu_try_flip_a1, trace.rcu_try_flip_ae1,
+			 trace.rcu_try_flip_a2,
+		 trace.rcu_try_flip_z1, trace.rcu_try_flip_ze1,
+			 trace.rcu_try_flip_z2,
+		 trace.rcu_try_flip_m1, trace.rcu_try_flip_me1,
+			trace.rcu_try_flip_m2);
+	bcount = simple_read_from_buffer(buffer, count, ppos,
+			rcupreempt_trace_buf, strlen(rcupreempt_trace_buf));
+	mutex_unlock(&rcupreempt_trace_mutex);
+	return bcount;
+}
+
+/*
+ * debugfs read handler for "rcugp".
+ *
+ * Samples rcu_batches_completed(), waits in synchronize_rcu() while
+ * holding rcupreempt_trace_mutex, samples the counter again, and reports
+ * both values through the shared trace buffer.
+ */
+static ssize_t rcugp_read(struct file *filp, char __user *buffer,
+				size_t count, loff_t *ppos)
+{
+	ssize_t nread;
+	long gp_before;
+
+	gp_before = rcu_batches_completed();
+
+	mutex_lock(&rcupreempt_trace_mutex);
+	synchronize_rcu();
+	snprintf(rcupreempt_trace_buf, RCUPREEMPT_TRACE_BUF_SIZE,
+		 "oldggp=%ld  newggp=%ld\n",
+		 gp_before, rcu_batches_completed());
+	nread = simple_read_from_buffer(buffer, count, ppos,
+					rcupreempt_trace_buf,
+					strlen(rcupreempt_trace_buf));
+	mutex_unlock(&rcupreempt_trace_mutex);
+
+	return nread;
+}
+
+/*
+ * debugfs read handler for "rcuctrs".
+ *
+ * Emits a header, one line per online CPU with its two flip counters
+ * (indexed by the low bit of rcu_batches_completed(), sampled once on
+ * entry) plus its flip and mb flags, and a trailer with the current
+ * batch number and try-flip state name.  The text is built in the shared
+ * rcupreempt_trace_buf under rcupreempt_trace_mutex.
+ *
+ * scnprintf() is used rather than snprintf() because it returns the
+ * number of characters actually stored.  With snprintf() a system with
+ * enough CPUs to overflow the buffer would push cnt past
+ * RCUPREEMPT_TRACE_BUF_SIZE, making both the destination pointer and the
+ * remaining-size argument invalid on the next call.  With scnprintf()
+ * the output is simply truncated and cnt is always the exact text length.
+ */
+static ssize_t rcuctrs_read(struct file *filp, char __user *buffer,
+				size_t count, loff_t *ppos)
+{
+	int cnt = 0;
+	int cpu;
+	int f = rcu_batches_completed() & 0x1;
+	ssize_t bcount;
+
+	mutex_lock(&rcupreempt_trace_mutex);
+
+	cnt += scnprintf(&rcupreempt_trace_buf[cnt],
+			 RCUPREEMPT_TRACE_BUF_SIZE - cnt,
+			 "CPU last cur F M\n");
+	for_each_online_cpu(cpu) {
+		long *flipctr = rcupreempt_flipctr(cpu);
+		cnt += scnprintf(&rcupreempt_trace_buf[cnt],
+				 RCUPREEMPT_TRACE_BUF_SIZE - cnt,
+				 "%3d %4ld %3ld %d %d\n",
+				 cpu,
+				 flipctr[!f],
+				 flipctr[f],
+				 rcupreempt_flip_flag(cpu),
+				 rcupreempt_mb_flag(cpu));
+	}
+	cnt += scnprintf(&rcupreempt_trace_buf[cnt],
+			 RCUPREEMPT_TRACE_BUF_SIZE - cnt,
+			 "ggp = %ld, state = %s\n",
+			 rcu_batches_completed(),
+			 rcupreempt_try_flip_state_name());
+	cnt += scnprintf(&rcupreempt_trace_buf[cnt],
+			 RCUPREEMPT_TRACE_BUF_SIZE - cnt,
+			 "\n");
+	bcount = simple_read_from_buffer(buffer, count, ppos,
+			rcupreempt_trace_buf, cnt);
+	mutex_unlock(&rcupreempt_trace_mutex);
+	return bcount;
+}
+
+/* File operations for the "rcustats" debugfs file: only .read is provided. */
+static struct file_operations rcustats_fops = {
+	.owner = THIS_MODULE,
+	.read = rcustats_read,
+};
+
+/* File operations for the "rcugp" debugfs file: only .read is provided. */
+static struct file_operations rcugp_fops = {
+	.owner = THIS_MODULE,
+	.read = rcugp_read,
+};
+
+/* File operations for the "rcuctrs" debugfs file: only .read is provided. */
+static struct file_operations rcuctrs_fops = {
+	.owner = THIS_MODULE,
+	.read = rcuctrs_read,
+};
+
+static struct dentry *rcudir, *statdir, *ctrsdir, *gpdir;
+/*
+ * Create the debugfs directory "rcu" and, inside it, the read-only files
+ * "rcustats", "rcugp" and "rcuctrs".
+ *
+ * Returns 0 on success.  If any creation call returns NULL, whatever was
+ * created so far is removed again and 1 is returned.
+ */
+static int rcupreempt_debugfs_init(void)
+{
+	rcudir = debugfs_create_dir("rcu", NULL);
+	if (!rcudir)
+		return 1;
+
+	statdir = debugfs_create_file("rcustats", 0444, rcudir, NULL,
+				      &rcustats_fops);
+	if (!statdir)
+		goto undo;
+
+	gpdir = debugfs_create_file("rcugp", 0444, rcudir, NULL,
+				    &rcugp_fops);
+	if (!gpdir)
+		goto undo;
+
+	ctrsdir = debugfs_create_file("rcuctrs", 0444, rcudir, NULL,
+				      &rcuctrs_fops);
+	if (!ctrsdir)
+		goto undo;
+
+	return 0;
+
+undo:
+	/* Remove the files that did get created, then the directory itself. */
+	if (statdir)
+		debugfs_remove(statdir);
+	if (gpdir)
+		debugfs_remove(gpdir);
+	debugfs_remove(rcudir);
+	return 1;
+}
+
+/*
+ * Module init: set up the mutex, allocate the shared formatting buffer
+ * and create the debugfs entries.
+ *
+ * Returns 0 on success and non-zero on failure.  If the debugfs setup
+ * fails, the buffer is freed again so that a failed init does not leak it.
+ */
+static int __init rcupreempt_trace_init(void)
+{
+	int ret;
+
+	mutex_init(&rcupreempt_trace_mutex);
+	rcupreempt_trace_buf = kmalloc(RCUPREEMPT_TRACE_BUF_SIZE, GFP_KERNEL);
+	if (!rcupreempt_trace_buf)
+		return 1;
+
+	ret = rcupreempt_debugfs_init();
+	if (ret) {
+		/* Nothing can reach the buffer without the debugfs files. */
+		kfree(rcupreempt_trace_buf);
+		rcupreempt_trace_buf = NULL;
+	}
+	return ret;
+}
+
+/*
+ * Module exit: remove the three debugfs files, then the "rcu" directory
+ * that contains them, and finally free the shared formatting buffer.
+ */
+static void __exit rcupreempt_trace_cleanup(void)
+{
+	/* Files first, containing directory last. */
+	debugfs_remove(statdir);
+	debugfs_remove(gpdir);
+	debugfs_remove(ctrsdir);
+	debugfs_remove(rcudir);
+	kfree(rcupreempt_trace_buf);
+}
+
+
+module_init(rcupreempt_trace_init);
+module_exit(rcupreempt_trace_cleanup);

+ 3 - 3
kernel/rcutorture.c

@@ -726,11 +726,11 @@ static void rcu_torture_shuffle_tasks(void)
 	cpumask_t tmp_mask = CPU_MASK_ALL;
 	cpumask_t tmp_mask = CPU_MASK_ALL;
 	int i;
 	int i;
 
 
-	lock_cpu_hotplug();
+	get_online_cpus();
 
 
 	/* No point in shuffling if there is only one online CPU (ex: UP) */
 	/* No point in shuffling if there is only one online CPU (ex: UP) */
 	if (num_online_cpus() == 1) {
 	if (num_online_cpus() == 1) {
-		unlock_cpu_hotplug();
+		put_online_cpus();
 		return;
 		return;
 	}
 	}
 
 
@@ -762,7 +762,7 @@ static void rcu_torture_shuffle_tasks(void)
 	else
 	else
 		rcu_idle_cpu--;
 		rcu_idle_cpu--;
 
 
-	unlock_cpu_hotplug();
+	put_online_cpus();
 }
 }
 
 
 /* Shuffle tasks across CPUs, with the intent of allowing each CPU in the
 /* Shuffle tasks across CPUs, with the intent of allowing each CPU in the

+ 1014 - 370
kernel/sched.c

@@ -22,6 +22,8 @@
  *              by Peter Williams
  *              by Peter Williams
  *  2007-05-06  Interactivity improvements to CFS by Mike Galbraith
  *  2007-05-06  Interactivity improvements to CFS by Mike Galbraith
  *  2007-07-01  Group scheduling enhancements by Srivatsa Vaddagiri
  *  2007-07-01  Group scheduling enhancements by Srivatsa Vaddagiri
+ *  2007-11-29  RT balancing improvements by Steven Rostedt, Gregory Haskins,
+ *              Thomas Gleixner, Mike Kravetz
  */
  */
 
 
 #include <linux/mm.h>
 #include <linux/mm.h>
@@ -63,6 +65,7 @@
 #include <linux/reciprocal_div.h>
 #include <linux/reciprocal_div.h>
 #include <linux/unistd.h>
 #include <linux/unistd.h>
 #include <linux/pagemap.h>
 #include <linux/pagemap.h>
+#include <linux/hrtimer.h>
 
 
 #include <asm/tlb.h>
 #include <asm/tlb.h>
 #include <asm/irq_regs.h>
 #include <asm/irq_regs.h>
@@ -96,10 +99,9 @@ unsigned long long __attribute__((weak)) sched_clock(void)
 #define MAX_USER_PRIO		(USER_PRIO(MAX_PRIO))
 #define MAX_USER_PRIO		(USER_PRIO(MAX_PRIO))
 
 
 /*
 /*
- * Some helpers for converting nanosecond timing to jiffy resolution
+ * Helpers for converting nanosecond timing to jiffy resolution
  */
  */
 #define NS_TO_JIFFIES(TIME)	((unsigned long)(TIME) / (NSEC_PER_SEC / HZ))
 #define NS_TO_JIFFIES(TIME)	((unsigned long)(TIME) / (NSEC_PER_SEC / HZ))
-#define JIFFIES_TO_NS(TIME)	((TIME) * (NSEC_PER_SEC / HZ))
 
 
 #define NICE_0_LOAD		SCHED_LOAD_SCALE
 #define NICE_0_LOAD		SCHED_LOAD_SCALE
 #define NICE_0_SHIFT		SCHED_LOAD_SHIFT
 #define NICE_0_SHIFT		SCHED_LOAD_SHIFT
@@ -159,6 +161,8 @@ struct rt_prio_array {
 
 
 struct cfs_rq;
 struct cfs_rq;
 
 
+static LIST_HEAD(task_groups);
+
 /* task group related information */
 /* task group related information */
 struct task_group {
 struct task_group {
 #ifdef CONFIG_FAIR_CGROUP_SCHED
 #ifdef CONFIG_FAIR_CGROUP_SCHED
@@ -168,10 +172,50 @@ struct task_group {
 	struct sched_entity **se;
 	struct sched_entity **se;
 	/* runqueue "owned" by this group on each cpu */
 	/* runqueue "owned" by this group on each cpu */
 	struct cfs_rq **cfs_rq;
 	struct cfs_rq **cfs_rq;
+
+	struct sched_rt_entity **rt_se;
+	struct rt_rq **rt_rq;
+
+	unsigned int rt_ratio;
+
+	/*
+	 * shares assigned to a task group govern how much of cpu bandwidth
+	 * is allocated to the group. The more shares a group has, the more is
+	 * the cpu bandwidth allocated to it.
+	 *
+	 * For ex, lets say that there are three task groups, A, B and C which
+	 * have been assigned shares 1000, 2000 and 3000 respectively. Then,
+	 * cpu bandwidth allocated by the scheduler to task groups A, B and C
+	 * should be:
+	 *
+	 *	Bw(A) = 1000/(1000+2000+3000) * 100 = 16.66%
+	 *	Bw(B) = 2000/(1000+2000+3000) * 100 = 33.33%
+	 *	Bw(C) = 3000/(1000+2000+3000) * 100 = 50%
+	 *
+	 * The weight assigned to a task group's schedulable entities on every
+	 * cpu (task_group.se[a_cpu]->load.weight) is derived from the task
+	 * group's shares. For ex: lets say that task group A has been
+	 * assigned shares of 1000 and there are two CPUs in a system. Then,
+	 *
+	 *  tg_A->se[0]->load.weight = tg_A->se[1]->load.weight = 1000;
+	 *
+	 * Note: It's not necessary that each of a task group's schedulable
+	 *	 entities has the same weight on all CPUs. If the group
+	 *	 has 2 of its tasks on CPU0 and 1 task on CPU1, then a
+	 *	 better distribution of weight could be:
+	 *
+	 *	tg_A->se[0]->load.weight = 2/3 * 2000 = 1333
+	 *	tg_A->se[1]->load.weight = 1/3 * 2000 =  667
+	 *
+	 * rebalance_shares() is responsible for distributing the shares of a
+	 * task group like this among the group's schedulable entities across
+	 * cpus.
+	 *
+	 */
 	unsigned long shares;
 	unsigned long shares;
-	/* spinlock to serialize modification to shares */
-	spinlock_t lock;
+
 	struct rcu_head rcu;
 	struct rcu_head rcu;
+	struct list_head list;
 };
 };
 
 
 /* Default task group's sched entity on each cpu */
 /* Default task group's sched entity on each cpu */
@@ -179,24 +223,51 @@ static DEFINE_PER_CPU(struct sched_entity, init_sched_entity);
 /* Default task group's cfs_rq on each cpu */
 /* Default task group's cfs_rq on each cpu */
 static DEFINE_PER_CPU(struct cfs_rq, init_cfs_rq) ____cacheline_aligned_in_smp;
 static DEFINE_PER_CPU(struct cfs_rq, init_cfs_rq) ____cacheline_aligned_in_smp;
 
 
+static DEFINE_PER_CPU(struct sched_rt_entity, init_sched_rt_entity);
+static DEFINE_PER_CPU(struct rt_rq, init_rt_rq) ____cacheline_aligned_in_smp;
+
 static struct sched_entity *init_sched_entity_p[NR_CPUS];
 static struct sched_entity *init_sched_entity_p[NR_CPUS];
 static struct cfs_rq *init_cfs_rq_p[NR_CPUS];
 static struct cfs_rq *init_cfs_rq_p[NR_CPUS];
 
 
+static struct sched_rt_entity *init_sched_rt_entity_p[NR_CPUS];
+static struct rt_rq *init_rt_rq_p[NR_CPUS];
+
+/* task_group_mutex serializes add/remove of task groups and also changes to
+ * a task group's cpu shares.
+ */
+static DEFINE_MUTEX(task_group_mutex);
+
+/* doms_cur_mutex serializes access to doms_cur[] array */
+static DEFINE_MUTEX(doms_cur_mutex);
+
+#ifdef CONFIG_SMP
+/* kernel thread that runs rebalance_shares() periodically */
+static struct task_struct *lb_monitor_task;
+static int load_balance_monitor(void *unused);
+#endif
+
+static void set_se_shares(struct sched_entity *se, unsigned long shares);
+
 /* Default task group.
 /* Default task group.
  *	Every task in system belong to this group at bootup.
  *	Every task in system belong to this group at bootup.
  */
  */
 struct task_group init_task_group = {
 struct task_group init_task_group = {
-	.se     = init_sched_entity_p,
+	.se	= init_sched_entity_p,
 	.cfs_rq = init_cfs_rq_p,
 	.cfs_rq = init_cfs_rq_p,
+
+	.rt_se	= init_sched_rt_entity_p,
+	.rt_rq	= init_rt_rq_p,
 };
 };
 
 
 #ifdef CONFIG_FAIR_USER_SCHED
 #ifdef CONFIG_FAIR_USER_SCHED
-# define INIT_TASK_GRP_LOAD	2*NICE_0_LOAD
+# define INIT_TASK_GROUP_LOAD	(2*NICE_0_LOAD)
 #else
 #else
-# define INIT_TASK_GRP_LOAD	NICE_0_LOAD
+# define INIT_TASK_GROUP_LOAD	NICE_0_LOAD
 #endif
 #endif
 
 
-static int init_task_group_load = INIT_TASK_GRP_LOAD;
+#define MIN_GROUP_SHARES	2
+
+static int init_task_group_load = INIT_TASK_GROUP_LOAD;
 
 
 /* return group to which a task belongs */
 /* return group to which a task belongs */
 static inline struct task_group *task_group(struct task_struct *p)
 static inline struct task_group *task_group(struct task_struct *p)
@@ -215,15 +286,42 @@ static inline struct task_group *task_group(struct task_struct *p)
 }
 }
 
 
 /* Change a task's cfs_rq and parent entity if it moves across CPUs/groups */
 /* Change a task's cfs_rq and parent entity if it moves across CPUs/groups */
-static inline void set_task_cfs_rq(struct task_struct *p, unsigned int cpu)
+static inline void set_task_rq(struct task_struct *p, unsigned int cpu)
 {
 {
 	p->se.cfs_rq = task_group(p)->cfs_rq[cpu];
 	p->se.cfs_rq = task_group(p)->cfs_rq[cpu];
 	p->se.parent = task_group(p)->se[cpu];
 	p->se.parent = task_group(p)->se[cpu];
+
+	p->rt.rt_rq  = task_group(p)->rt_rq[cpu];
+	p->rt.parent = task_group(p)->rt_se[cpu];
+}
+
+static inline void lock_task_group_list(void)
+{
+	mutex_lock(&task_group_mutex);
+}
+
+static inline void unlock_task_group_list(void)
+{
+	mutex_unlock(&task_group_mutex);
+}
+
+static inline void lock_doms_cur(void)
+{
+	mutex_lock(&doms_cur_mutex);
+}
+
+static inline void unlock_doms_cur(void)
+{
+	mutex_unlock(&doms_cur_mutex);
 }
 }
 
 
 #else
 #else
 
 
-static inline void set_task_cfs_rq(struct task_struct *p, unsigned int cpu) { }
+static inline void set_task_rq(struct task_struct *p, unsigned int cpu) { }
+static inline void lock_task_group_list(void) { }
+static inline void unlock_task_group_list(void) { }
+static inline void lock_doms_cur(void) { }
+static inline void unlock_doms_cur(void) { }
 
 
 #endif	/* CONFIG_FAIR_GROUP_SCHED */
 #endif	/* CONFIG_FAIR_GROUP_SCHED */
 
 
@@ -264,10 +362,56 @@ struct cfs_rq {
 /* Real-Time classes' related field in a runqueue: */
 /* Real-Time classes' related field in a runqueue: */
 struct rt_rq {
 struct rt_rq {
 	struct rt_prio_array active;
 	struct rt_prio_array active;
-	int rt_load_balance_idx;
-	struct list_head *rt_load_balance_head, *rt_load_balance_curr;
+	unsigned long rt_nr_running;
+#if defined CONFIG_SMP || defined CONFIG_FAIR_GROUP_SCHED
+	int highest_prio; /* highest queued rt task prio */
+#endif
+#ifdef CONFIG_SMP
+	unsigned long rt_nr_migratory;
+	int overloaded;
+#endif
+	int rt_throttled;
+	u64 rt_time;
+
+#ifdef CONFIG_FAIR_GROUP_SCHED
+	struct rq *rq;
+	struct list_head leaf_rt_rq_list;
+	struct task_group *tg;
+	struct sched_rt_entity *rt_se;
+#endif
 };
 };
 
 
+#ifdef CONFIG_SMP
+
+/*
+ * We add the notion of a root-domain which will be used to define per-domain
+ * variables. Each exclusive cpuset essentially defines an island domain by
+ * fully partitioning the member cpus from any other cpuset. Whenever a new
+ * exclusive cpuset is created, we also create and attach a new root-domain
+ * object.
+ *
+ */
+struct root_domain {
+	atomic_t refcount;
+	cpumask_t span;
+	cpumask_t online;
+
+	/*
+	 * The "RT overload" flag: it gets set if a CPU has more than
+	 * one runnable RT task.
+	 */
+	cpumask_t rto_mask;
+	atomic_t rto_count;
+};
+
+/*
+ * By default the system creates a single root-domain with all cpus as
+ * members (mimicking the global state we have today).
+ */
+static struct root_domain def_root_domain;
+
+#endif
+
 /*
 /*
  * This is the main, per-CPU runqueue data structure.
  * This is the main, per-CPU runqueue data structure.
  *
  *
@@ -296,11 +440,15 @@ struct rq {
 	u64 nr_switches;
 	u64 nr_switches;
 
 
 	struct cfs_rq cfs;
 	struct cfs_rq cfs;
+	struct rt_rq rt;
+	u64 rt_period_expire;
+	int rt_throttled;
+
 #ifdef CONFIG_FAIR_GROUP_SCHED
 #ifdef CONFIG_FAIR_GROUP_SCHED
 	/* list of leaf cfs_rq on this cpu: */
 	/* list of leaf cfs_rq on this cpu: */
 	struct list_head leaf_cfs_rq_list;
 	struct list_head leaf_cfs_rq_list;
+	struct list_head leaf_rt_rq_list;
 #endif
 #endif
-	struct rt_rq rt;
 
 
 	/*
 	/*
 	 * This is part of a global counter where only the total sum
 	 * This is part of a global counter where only the total sum
@@ -317,7 +465,7 @@ struct rq {
 	u64 clock, prev_clock_raw;
 	u64 clock, prev_clock_raw;
 	s64 clock_max_delta;
 	s64 clock_max_delta;
 
 
-	unsigned int clock_warps, clock_overflows;
+	unsigned int clock_warps, clock_overflows, clock_underflows;
 	u64 idle_clock;
 	u64 idle_clock;
 	unsigned int clock_deep_idle_events;
 	unsigned int clock_deep_idle_events;
 	u64 tick_timestamp;
 	u64 tick_timestamp;
@@ -325,6 +473,7 @@ struct rq {
 	atomic_t nr_iowait;
 	atomic_t nr_iowait;
 
 
 #ifdef CONFIG_SMP
 #ifdef CONFIG_SMP
+	struct root_domain *rd;
 	struct sched_domain *sd;
 	struct sched_domain *sd;
 
 
 	/* For active balancing */
 	/* For active balancing */
@@ -337,6 +486,12 @@ struct rq {
 	struct list_head migration_queue;
 	struct list_head migration_queue;
 #endif
 #endif
 
 
+#ifdef CONFIG_SCHED_HRTICK
+	unsigned long hrtick_flags;
+	ktime_t hrtick_expire;
+	struct hrtimer hrtick_timer;
+#endif
+
 #ifdef CONFIG_SCHEDSTATS
 #ifdef CONFIG_SCHEDSTATS
 	/* latency stats */
 	/* latency stats */
 	struct sched_info rq_sched_info;
 	struct sched_info rq_sched_info;
@@ -363,7 +518,6 @@ struct rq {
 };
 };
 
 
 static DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
 static DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
-static DEFINE_MUTEX(sched_hotcpu_mutex);
 
 
 static inline void check_preempt_curr(struct rq *rq, struct task_struct *p)
 static inline void check_preempt_curr(struct rq *rq, struct task_struct *p)
 {
 {
@@ -441,6 +595,23 @@ static void update_rq_clock(struct rq *rq)
 #define task_rq(p)		cpu_rq(task_cpu(p))
 #define task_rq(p)		cpu_rq(task_cpu(p))
 #define cpu_curr(cpu)		(cpu_rq(cpu)->curr)
 #define cpu_curr(cpu)		(cpu_rq(cpu)->curr)
 
 
+/*
+ * Report how far @cpu's runqueue is from the end of its current RT
+ * period, in jiffies.
+ *
+ * Returns 0 when the runqueue is not RT-throttled, 1 when the runqueue
+ * clock has already passed rt_period_expire, and otherwise the remaining
+ * nanoseconds divided by the length of one jiffy.  The division
+ * truncates, so this can also be 0 when less than one jiffy remains.
+ */
+unsigned long rt_needs_cpu(int cpu)
+{
+	struct rq *rq = cpu_rq(cpu);
+	u64 delta;
+
+	if (!rq->rt_throttled)
+		return 0;
+
+	if (rq->clock > rq->rt_period_expire)
+		return 1;
+
+	delta = rq->rt_period_expire - rq->clock;
+	/* do_div() leaves the quotient in delta; NSEC_PER_SEC / HZ is ns per jiffy. */
+	do_div(delta, NSEC_PER_SEC / HZ);
+
+	return (unsigned long)delta;
+}
+
 /*
 /*
  * Tunables that become constants when CONFIG_SCHED_DEBUG is off:
  * Tunables that become constants when CONFIG_SCHED_DEBUG is off:
  */
  */
@@ -459,6 +630,8 @@ enum {
 	SCHED_FEAT_START_DEBIT		= 4,
 	SCHED_FEAT_START_DEBIT		= 4,
 	SCHED_FEAT_TREE_AVG		= 8,
 	SCHED_FEAT_TREE_AVG		= 8,
 	SCHED_FEAT_APPROX_AVG		= 16,
 	SCHED_FEAT_APPROX_AVG		= 16,
+	SCHED_FEAT_HRTICK		= 32,
+	SCHED_FEAT_DOUBLE_TICK		= 64,
 };
 };
 
 
 const_debug unsigned int sysctl_sched_features =
 const_debug unsigned int sysctl_sched_features =
@@ -466,7 +639,9 @@ const_debug unsigned int sysctl_sched_features =
 		SCHED_FEAT_WAKEUP_PREEMPT	* 1 |
 		SCHED_FEAT_WAKEUP_PREEMPT	* 1 |
 		SCHED_FEAT_START_DEBIT		* 1 |
 		SCHED_FEAT_START_DEBIT		* 1 |
 		SCHED_FEAT_TREE_AVG		* 0 |
 		SCHED_FEAT_TREE_AVG		* 0 |
-		SCHED_FEAT_APPROX_AVG		* 0;
+		SCHED_FEAT_APPROX_AVG		* 0 |
+		SCHED_FEAT_HRTICK		* 1 |
+		SCHED_FEAT_DOUBLE_TICK		* 0;
 
 
 #define sched_feat(x) (sysctl_sched_features & SCHED_FEAT_##x)
 #define sched_feat(x) (sysctl_sched_features & SCHED_FEAT_##x)
 
 
@@ -476,6 +651,21 @@ const_debug unsigned int sysctl_sched_features =
  */
  */
 const_debug unsigned int sysctl_sched_nr_migrate = 32;
 const_debug unsigned int sysctl_sched_nr_migrate = 32;
 
 
+/*
+ * period over which we measure -rt task cpu usage in ms.
+ * default: 1s
+ */
+const_debug unsigned int sysctl_sched_rt_period = 1000;
+
+#define SCHED_RT_FRAC_SHIFT	16
+#define SCHED_RT_FRAC		(1UL << SCHED_RT_FRAC_SHIFT)
+
+/*
+ * ratio of time -rt tasks may consume.
+ * default: 95%
+ */
+const_debug unsigned int sysctl_sched_rt_ratio = 62259;
+
 /*
 /*
  * For kernel-internal use: high-speed (but slightly incorrect) per-cpu
  * For kernel-internal use: high-speed (but slightly incorrect) per-cpu
  * clock constructed from sched_clock():
  * clock constructed from sched_clock():
@@ -668,7 +858,6 @@ void sched_clock_idle_wakeup_event(u64 delta_ns)
 	struct rq *rq = cpu_rq(smp_processor_id());
 	struct rq *rq = cpu_rq(smp_processor_id());
 	u64 now = sched_clock();
 	u64 now = sched_clock();
 
 
-	touch_softlockup_watchdog();
 	rq->idle_clock += delta_ns;
 	rq->idle_clock += delta_ns;
 	/*
 	/*
 	 * Override the previous timestamp and ignore all
 	 * Override the previous timestamp and ignore all
@@ -680,9 +869,177 @@ void sched_clock_idle_wakeup_event(u64 delta_ns)
 	rq->prev_clock_raw = now;
 	rq->prev_clock_raw = now;
 	rq->clock += delta_ns;
 	rq->clock += delta_ns;
 	spin_unlock(&rq->lock);
 	spin_unlock(&rq->lock);
+	touch_softlockup_watchdog();
 }
 }
 EXPORT_SYMBOL_GPL(sched_clock_idle_wakeup_event);
 EXPORT_SYMBOL_GPL(sched_clock_idle_wakeup_event);
 
 
+static void __resched_task(struct task_struct *p, int tif_bit);
+
+static inline void resched_task(struct task_struct *p)
+{
+	__resched_task(p, TIF_NEED_RESCHED);
+}
+
+#ifdef CONFIG_SCHED_HRTICK
+/*
+ * Use HR-timers to deliver accurate preemption points.
+ *
+ * It's all a bit involved since we cannot program an hrt while holding the
+ * rq->lock. So what we do is store a state in rq->hrtick_* and ask for a
+ * reschedule event.
+ *
+ * When we get rescheduled we reprogram the hrtick_timer outside of the
+ * rq->lock.
+ */
+static inline void resched_hrt(struct task_struct *p)
+{
+	__resched_task(p, TIF_HRTICK_RESCHED);
+}
+
+/*
+ * Mark @rq's current task for rescheduling.  Takes rq->lock with
+ * interrupts disabled around the call, since resched_task() is invoked
+ * with the runqueue lock held.
+ */
+static inline void resched_rq(struct rq *rq)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(&rq->lock, flags);
+	resched_task(rq->curr);
+	spin_unlock_irqrestore(&rq->lock, flags);
+}
+
+enum {
+	HRTICK_SET,		/* re-program hrtick_timer */
+	HRTICK_RESET,		/* not a new slice */
+};
+
+/*
+ * Use hrtick when:
+ *  - enabled by features
+ *  - hrtimer is actually high res
+ */
+static inline int hrtick_enabled(struct rq *rq)
+{
+	/* The feature bit gates everything; only then ask the hrtimer core. */
+	return sched_feat(HRTICK) ?
+		hrtimer_is_hres_active(&rq->hrtick_timer) : 0;
+}
+
+/*
+ * Called to set the hrtick timer state.
+ *
+ * called with rq->lock held and irqs disabled
+ */
+static void hrtick_start(struct rq *rq, u64 delay, int reset)
+{
+	ktime_t now;
+
+	assert_spin_locked(&rq->lock);
+
+	/*
+	 * Record the absolute preemption point: the timer base's current
+	 * time plus @delay nanoseconds.
+	 */
+	now = rq->hrtick_timer.base->get_time();
+	rq->hrtick_expire = ktime_add_ns(now, delay);
+
+	/*
+	 * Only flag that the timer needs programming; the actual
+	 * programming is left to hrtick_set().
+	 */
+	__set_bit(HRTICK_SET, &rq->hrtick_flags);
+	if (!reset)
+		return;
+
+	/*
+	 * New slices are called from the schedule path and don't need a
+	 * forced reschedule; a reset does, so note it and raise
+	 * TIF_HRTICK_RESCHED on the current task.
+	 */
+	__set_bit(HRTICK_RESET, &rq->hrtick_flags);
+	resched_hrt(rq->curr);
+}
+
+/* Cancel @rq's hrtick timer if it is currently active; otherwise do nothing. */
+static void hrtick_clear(struct rq *rq)
+{
+	if (hrtimer_active(&rq->hrtick_timer))
+		hrtimer_cancel(&rq->hrtick_timer);
+}
+
+/*
+ * Update the timer from the possible pending state.
+ */
+static void hrtick_set(struct rq *rq)
+{
+	ktime_t time;
+	int set, reset;
+	unsigned long flags;
+
+	/* Warn (once) if called for a runqueue that is not this CPU's. */
+	WARN_ON_ONCE(cpu_of(rq) != smp_processor_id());
+
+	/*
+	 * Under rq->lock: consume the pending HRTICK_SET/HRTICK_RESET
+	 * requests, snapshot the requested expiry and drop the
+	 * TIF_HRTICK_RESCHED flag of the current thread.
+	 */
+	spin_lock_irqsave(&rq->lock, flags);
+	set = __test_and_clear_bit(HRTICK_SET, &rq->hrtick_flags);
+	reset = __test_and_clear_bit(HRTICK_RESET, &rq->hrtick_flags);
+	time = rq->hrtick_expire;
+	clear_thread_flag(TIF_HRTICK_RESCHED);
+	spin_unlock_irqrestore(&rq->lock, flags);
+
+	/* The timer itself is only touched after rq->lock has been dropped. */
+	if (set) {
+		hrtimer_start(&rq->hrtick_timer, time, HRTIMER_MODE_ABS);
+		/*
+		 * A reset request whose timer is not active after being
+		 * started gets a plain reschedule of the runqueue instead.
+		 */
+		if (reset && !hrtimer_active(&rq->hrtick_timer))
+			resched_rq(rq);
+	} else
+		hrtick_clear(rq);
+}
+
+/*
+ * High-resolution timer tick.
+ * Runs from hardirq context with interrupts disabled.
+ */
+static enum hrtimer_restart hrtick(struct hrtimer *timer)
+{
+	/* The timer is embedded in its runqueue, so recover the rq from it. */
+	struct rq *rq = container_of(timer, struct rq, hrtick_timer);
+
+	WARN_ON_ONCE(cpu_of(rq) != smp_processor_id());
+
+	/*
+	 * Under rq->lock: refresh the runqueue clock and run the current
+	 * task's scheduling-class tick handler.
+	 * NOTE(review): the final argument (1) presumably marks this as a
+	 * tick from the hrtick path - confirm against the task_tick()
+	 * prototype.
+	 */
+	spin_lock(&rq->lock);
+	__update_rq_clock(rq);
+	rq->curr->sched_class->task_tick(rq, rq->curr, 1);
+	spin_unlock(&rq->lock);
+
+	/* One-shot: the timer is not re-armed from here. */
+	return HRTIMER_NORESTART;
+}
+
+/*
+ * Initialise @rq's hrtick state: clear the pending-request flags and set
+ * up hrtick_timer on CLOCK_MONOTONIC with hrtick() as its callback,
+ * using callback mode HRTIMER_CB_IRQSAFE_NO_SOFTIRQ.
+ */
+static inline void init_rq_hrtick(struct rq *rq)
+{
+	rq->hrtick_flags = 0;
+	hrtimer_init(&rq->hrtick_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
+	rq->hrtick_timer.function = hrtick;
+	rq->hrtick_timer.cb_mode = HRTIMER_CB_IRQSAFE_NO_SOFTIRQ;
+}
+
+/*
+ * If the current thread has TIF_HRTICK_RESCHED set, apply the pending
+ * hrtick request for this CPU's runqueue via hrtick_set(); otherwise
+ * return immediately.
+ */
+void hrtick_resched(void)
+{
+	struct rq *rq;
+	unsigned long flags;
+
+	if (!test_thread_flag(TIF_HRTICK_RESCHED))
+		return;
+
+	/* Interrupts stay off while this CPU's runqueue is looked up and used. */
+	local_irq_save(flags);
+	rq = cpu_rq(smp_processor_id());
+	hrtick_set(rq);
+	local_irq_restore(flags);
+}
+#else
+static inline void hrtick_clear(struct rq *rq)
+{
+}
+
+static inline void hrtick_set(struct rq *rq)
+{
+}
+
+static inline void init_rq_hrtick(struct rq *rq)
+{
+}
+
+void hrtick_resched(void)
+{
+}
+#endif
+
 /*
 /*
  * resched_task - mark a task 'to be rescheduled now'.
  * resched_task - mark a task 'to be rescheduled now'.
  *
  *
@@ -696,16 +1053,16 @@ EXPORT_SYMBOL_GPL(sched_clock_idle_wakeup_event);
 #define tsk_is_polling(t) test_tsk_thread_flag(t, TIF_POLLING_NRFLAG)
 #define tsk_is_polling(t) test_tsk_thread_flag(t, TIF_POLLING_NRFLAG)
 #endif
 #endif
 
 
-static void resched_task(struct task_struct *p)
+static void __resched_task(struct task_struct *p, int tif_bit)
 {
 {
 	int cpu;
 	int cpu;
 
 
 	assert_spin_locked(&task_rq(p)->lock);
 	assert_spin_locked(&task_rq(p)->lock);
 
 
-	if (unlikely(test_tsk_thread_flag(p, TIF_NEED_RESCHED)))
+	if (unlikely(test_tsk_thread_flag(p, tif_bit)))
 		return;
 		return;
 
 
-	set_tsk_thread_flag(p, TIF_NEED_RESCHED);
+	set_tsk_thread_flag(p, tif_bit);
 
 
 	cpu = task_cpu(p);
 	cpu = task_cpu(p);
 	if (cpu == smp_processor_id())
 	if (cpu == smp_processor_id())
@@ -728,10 +1085,10 @@ static void resched_cpu(int cpu)
 	spin_unlock_irqrestore(&rq->lock, flags);
 	spin_unlock_irqrestore(&rq->lock, flags);
 }
 }
 #else
 #else
-static inline void resched_task(struct task_struct *p)
+static void __resched_task(struct task_struct *p, int tif_bit)
 {
 {
 	assert_spin_locked(&task_rq(p)->lock);
 	assert_spin_locked(&task_rq(p)->lock);
-	set_tsk_need_resched(p);
+	set_tsk_thread_flag(p, tif_bit);
 }
 }
 #endif
 #endif
 
 
@@ -871,6 +1228,23 @@ static void cpuacct_charge(struct task_struct *tsk, u64 cputime);
 static inline void cpuacct_charge(struct task_struct *tsk, u64 cputime) {}
 static inline void cpuacct_charge(struct task_struct *tsk, u64 cputime) {}
 #endif
 #endif
 
 
+/* Add @load to @rq's aggregate load via update_load_add(). */
+static inline void inc_cpu_load(struct rq *rq, unsigned long load)
+{
+	update_load_add(&rq->load, load);
+}
+
+/* Subtract @load from @rq's aggregate load via update_load_sub(). */
+static inline void dec_cpu_load(struct rq *rq, unsigned long load)
+{
+	update_load_sub(&rq->load, load);
+}
+
+#ifdef CONFIG_SMP
+static unsigned long source_load(int cpu, int type);
+static unsigned long target_load(int cpu, int type);
+static unsigned long cpu_avg_load_per_task(int cpu);
+static int task_hot(struct task_struct *p, u64 now, struct sched_domain *sd);
+#endif /* CONFIG_SMP */
+
 #include "sched_stats.h"
 #include "sched_stats.h"
 #include "sched_idletask.c"
 #include "sched_idletask.c"
 #include "sched_fair.c"
 #include "sched_fair.c"
@@ -881,41 +1255,14 @@ static inline void cpuacct_charge(struct task_struct *tsk, u64 cputime) {}
 
 
 #define sched_class_highest (&rt_sched_class)
 #define sched_class_highest (&rt_sched_class)
 
 
-/*
- * Update delta_exec, delta_fair fields for rq.
- *
- * delta_fair clock advances at a rate inversely proportional to
- * total load (rq->load.weight) on the runqueue, while
- * delta_exec advances at the same rate as wall-clock (provided
- * cpu is not idle).
- *
- * delta_exec / delta_fair is a measure of the (smoothened) load on this
- * runqueue over any given interval. This (smoothened) load is used
- * during load balance.
- *
- * This function is called /before/ updating rq->load
- * and when switching tasks.
- */
-static inline void inc_load(struct rq *rq, const struct task_struct *p)
-{
-	update_load_add(&rq->load, p->se.load.weight);
-}
-
-static inline void dec_load(struct rq *rq, const struct task_struct *p)
-{
-	update_load_sub(&rq->load, p->se.load.weight);
-}
-
 static void inc_nr_running(struct task_struct *p, struct rq *rq)
 static void inc_nr_running(struct task_struct *p, struct rq *rq)
 {
 {
 	rq->nr_running++;
 	rq->nr_running++;
-	inc_load(rq, p);
 }
 }
 
 
 static void dec_nr_running(struct task_struct *p, struct rq *rq)
 static void dec_nr_running(struct task_struct *p, struct rq *rq)
 {
 {
 	rq->nr_running--;
 	rq->nr_running--;
-	dec_load(rq, p);
 }
 }
 
 
 static void set_load_weight(struct task_struct *p)
 static void set_load_weight(struct task_struct *p)
@@ -1039,7 +1386,7 @@ unsigned long weighted_cpuload(const int cpu)
 
 
 static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu)
 static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu)
 {
 {
-	set_task_cfs_rq(p, cpu);
+	set_task_rq(p, cpu);
 #ifdef CONFIG_SMP
 #ifdef CONFIG_SMP
 	/*
 	/*
 	 * After ->cpu is set up to a new value, task_rq_lock(p, ...) can be
 	 * After ->cpu is set up to a new value, task_rq_lock(p, ...) can be
@@ -1051,12 +1398,24 @@ static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu)
 #endif
 #endif
 }
 }
 
 
+static inline void check_class_changed(struct rq *rq, struct task_struct *p,
+				       const struct sched_class *prev_class,
+				       int oldprio, int running)
+{
+	if (prev_class != p->sched_class) {
+		if (prev_class->switched_from)
+			prev_class->switched_from(rq, p, running);
+		p->sched_class->switched_to(rq, p, running);
+	} else
+		p->sched_class->prio_changed(rq, p, oldprio, running);
+}
+
 #ifdef CONFIG_SMP
 #ifdef CONFIG_SMP
 
 
 /*
 /*
  * Is this task likely cache-hot:
  * Is this task likely cache-hot:
  */
  */
-static inline int
+static int
 task_hot(struct task_struct *p, u64 now, struct sched_domain *sd)
 task_hot(struct task_struct *p, u64 now, struct sched_domain *sd)
 {
 {
 	s64 delta;
 	s64 delta;
@@ -1281,7 +1640,7 @@ static unsigned long target_load(int cpu, int type)
 /*
 /*
  * Return the average load per task on the cpu's run queue
  * Return the average load per task on the cpu's run queue
  */
  */
-static inline unsigned long cpu_avg_load_per_task(int cpu)
+static unsigned long cpu_avg_load_per_task(int cpu)
 {
 {
 	struct rq *rq = cpu_rq(cpu);
 	struct rq *rq = cpu_rq(cpu);
 	unsigned long total = weighted_cpuload(cpu);
 	unsigned long total = weighted_cpuload(cpu);
@@ -1438,58 +1797,6 @@ static int sched_balance_self(int cpu, int flag)
 
 
 #endif /* CONFIG_SMP */
 #endif /* CONFIG_SMP */
 
 
-/*
- * wake_idle() will wake a task on an idle cpu if task->cpu is
- * not idle and an idle cpu is available.  The span of cpus to
- * search starts with cpus closest then further out as needed,
- * so we always favor a closer, idle cpu.
- *
- * Returns the CPU we should wake onto.
- */
-#if defined(ARCH_HAS_SCHED_WAKE_IDLE)
-static int wake_idle(int cpu, struct task_struct *p)
-{
-	cpumask_t tmp;
-	struct sched_domain *sd;
-	int i;
-
-	/*
-	 * If it is idle, then it is the best cpu to run this task.
-	 *
-	 * This cpu is also the best, if it has more than one task already.
-	 * Siblings must be also busy(in most cases) as they didn't already
-	 * pickup the extra load from this cpu and hence we need not check
-	 * sibling runqueue info. This will avoid the checks and cache miss
-	 * penalities associated with that.
-	 */
-	if (idle_cpu(cpu) || cpu_rq(cpu)->nr_running > 1)
-		return cpu;
-
-	for_each_domain(cpu, sd) {
-		if (sd->flags & SD_WAKE_IDLE) {
-			cpus_and(tmp, sd->span, p->cpus_allowed);
-			for_each_cpu_mask(i, tmp) {
-				if (idle_cpu(i)) {
-					if (i != task_cpu(p)) {
-						schedstat_inc(p,
-							se.nr_wakeups_idle);
-					}
-					return i;
-				}
-			}
-		} else {
-			break;
-		}
-	}
-	return cpu;
-}
-#else
-static inline int wake_idle(int cpu, struct task_struct *p)
-{
-	return cpu;
-}
-#endif
-
 /***
 /***
  * try_to_wake_up - wake up a thread
  * try_to_wake_up - wake up a thread
  * @p: the to-be-woken-up thread
  * @p: the to-be-woken-up thread
@@ -1510,11 +1817,6 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync)
 	unsigned long flags;
 	unsigned long flags;
 	long old_state;
 	long old_state;
 	struct rq *rq;
 	struct rq *rq;
-#ifdef CONFIG_SMP
-	struct sched_domain *sd, *this_sd = NULL;
-	unsigned long load, this_load;
-	int new_cpu;
-#endif
 
 
 	rq = task_rq_lock(p, &flags);
 	rq = task_rq_lock(p, &flags);
 	old_state = p->state;
 	old_state = p->state;
@@ -1532,92 +1834,9 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync)
 	if (unlikely(task_running(rq, p)))
 	if (unlikely(task_running(rq, p)))
 		goto out_activate;
 		goto out_activate;
 
 
-	new_cpu = cpu;
-
-	schedstat_inc(rq, ttwu_count);
-	if (cpu == this_cpu) {
-		schedstat_inc(rq, ttwu_local);
-		goto out_set_cpu;
-	}
-
-	for_each_domain(this_cpu, sd) {
-		if (cpu_isset(cpu, sd->span)) {
-			schedstat_inc(sd, ttwu_wake_remote);
-			this_sd = sd;
-			break;
-		}
-	}
-
-	if (unlikely(!cpu_isset(this_cpu, p->cpus_allowed)))
-		goto out_set_cpu;
-
-	/*
-	 * Check for affine wakeup and passive balancing possibilities.
-	 */
-	if (this_sd) {
-		int idx = this_sd->wake_idx;
-		unsigned int imbalance;
-
-		imbalance = 100 + (this_sd->imbalance_pct - 100) / 2;
-
-		load = source_load(cpu, idx);
-		this_load = target_load(this_cpu, idx);
-
-		new_cpu = this_cpu; /* Wake to this CPU if we can */
-
-		if (this_sd->flags & SD_WAKE_AFFINE) {
-			unsigned long tl = this_load;
-			unsigned long tl_per_task;
-
-			/*
-			 * Attract cache-cold tasks on sync wakeups:
-			 */
-			if (sync && !task_hot(p, rq->clock, this_sd))
-				goto out_set_cpu;
-
-			schedstat_inc(p, se.nr_wakeups_affine_attempts);
-			tl_per_task = cpu_avg_load_per_task(this_cpu);
-
-			/*
-			 * If sync wakeup then subtract the (maximum possible)
-			 * effect of the currently running task from the load
-			 * of the current CPU:
-			 */
-			if (sync)
-				tl -= current->se.load.weight;
-
-			if ((tl <= load &&
-				tl + target_load(cpu, idx) <= tl_per_task) ||
-			       100*(tl + p->se.load.weight) <= imbalance*load) {
-				/*
-				 * This domain has SD_WAKE_AFFINE and
-				 * p is cache cold in this domain, and
-				 * there is no bad imbalance.
-				 */
-				schedstat_inc(this_sd, ttwu_move_affine);
-				schedstat_inc(p, se.nr_wakeups_affine);
-				goto out_set_cpu;
-			}
-		}
-
-		/*
-		 * Start passive balancing when half the imbalance_pct
-		 * limit is reached.
-		 */
-		if (this_sd->flags & SD_WAKE_BALANCE) {
-			if (imbalance*this_load <= 100*load) {
-				schedstat_inc(this_sd, ttwu_move_balance);
-				schedstat_inc(p, se.nr_wakeups_passive);
-				goto out_set_cpu;
-			}
-		}
-	}
-
-	new_cpu = cpu; /* Could not wake to this_cpu. Wake to cpu instead */
-out_set_cpu:
-	new_cpu = wake_idle(new_cpu, p);
-	if (new_cpu != cpu) {
-		set_task_cpu(p, new_cpu);
+	cpu = p->sched_class->select_task_rq(p, sync);
+	if (cpu != orig_cpu) {
+		set_task_cpu(p, cpu);
 		task_rq_unlock(rq, &flags);
 		task_rq_unlock(rq, &flags);
 		/* might preempt at this point */
 		/* might preempt at this point */
 		rq = task_rq_lock(p, &flags);
 		rq = task_rq_lock(p, &flags);
@@ -1631,6 +1850,21 @@ out_set_cpu:
 		cpu = task_cpu(p);
 		cpu = task_cpu(p);
 	}
 	}
 
 
+#ifdef CONFIG_SCHEDSTATS
+	schedstat_inc(rq, ttwu_count);
+	if (cpu == this_cpu)
+		schedstat_inc(rq, ttwu_local);
+	else {
+		struct sched_domain *sd;
+		for_each_domain(this_cpu, sd) {
+			if (cpu_isset(cpu, sd->span)) {
+				schedstat_inc(sd, ttwu_wake_remote);
+				break;
+			}
+		}
+	}
+#endif
+
 out_activate:
 out_activate:
 #endif /* CONFIG_SMP */
 #endif /* CONFIG_SMP */
 	schedstat_inc(p, se.nr_wakeups);
 	schedstat_inc(p, se.nr_wakeups);
@@ -1649,6 +1883,10 @@ out_activate:
 
 
 out_running:
 out_running:
 	p->state = TASK_RUNNING;
 	p->state = TASK_RUNNING;
+#ifdef CONFIG_SMP
+	if (p->sched_class->task_wake_up)
+		p->sched_class->task_wake_up(rq, p);
+#endif
 out:
 out:
 	task_rq_unlock(rq, &flags);
 	task_rq_unlock(rq, &flags);
 
 
@@ -1691,7 +1929,7 @@ static void __sched_fork(struct task_struct *p)
 	p->se.wait_max			= 0;
 	p->se.wait_max			= 0;
 #endif
 #endif
 
 
-	INIT_LIST_HEAD(&p->run_list);
+	INIT_LIST_HEAD(&p->rt.run_list);
 	p->se.on_rq = 0;
 	p->se.on_rq = 0;
 
 
 #ifdef CONFIG_PREEMPT_NOTIFIERS
 #ifdef CONFIG_PREEMPT_NOTIFIERS
@@ -1771,6 +2009,10 @@ void fastcall wake_up_new_task(struct task_struct *p, unsigned long clone_flags)
 		inc_nr_running(p, rq);
 		inc_nr_running(p, rq);
 	}
 	}
 	check_preempt_curr(rq, p);
 	check_preempt_curr(rq, p);
+#ifdef CONFIG_SMP
+	if (p->sched_class->task_wake_up)
+		p->sched_class->task_wake_up(rq, p);
+#endif
 	task_rq_unlock(rq, &flags);
 	task_rq_unlock(rq, &flags);
 }
 }
 
 
@@ -1891,6 +2133,11 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev)
 	prev_state = prev->state;
 	prev_state = prev->state;
 	finish_arch_switch(prev);
 	finish_arch_switch(prev);
 	finish_lock_switch(rq, prev);
 	finish_lock_switch(rq, prev);
+#ifdef CONFIG_SMP
+	if (current->sched_class->post_schedule)
+		current->sched_class->post_schedule(rq);
+#endif
+
 	fire_sched_in_preempt_notifiers(current);
 	fire_sched_in_preempt_notifiers(current);
 	if (mm)
 	if (mm)
 		mmdrop(mm);
 		mmdrop(mm);
@@ -2124,11 +2371,13 @@ static void double_rq_unlock(struct rq *rq1, struct rq *rq2)
 /*
 /*
  * double_lock_balance - lock the busiest runqueue, this_rq is locked already.
  * double_lock_balance - lock the busiest runqueue, this_rq is locked already.
  */
  */
-static void double_lock_balance(struct rq *this_rq, struct rq *busiest)
+static int double_lock_balance(struct rq *this_rq, struct rq *busiest)
 	__releases(this_rq->lock)
 	__releases(this_rq->lock)
 	__acquires(busiest->lock)
 	__acquires(busiest->lock)
 	__acquires(this_rq->lock)
 	__acquires(this_rq->lock)
 {
 {
+	int ret = 0;
+
 	if (unlikely(!irqs_disabled())) {
 	if (unlikely(!irqs_disabled())) {
 		/* printk() doesn't work good under rq->lock */
 		/* printk() doesn't work good under rq->lock */
 		spin_unlock(&this_rq->lock);
 		spin_unlock(&this_rq->lock);
@@ -2139,9 +2388,11 @@ static void double_lock_balance(struct rq *this_rq, struct rq *busiest)
 			spin_unlock(&this_rq->lock);
 			spin_unlock(&this_rq->lock);
 			spin_lock(&busiest->lock);
 			spin_lock(&busiest->lock);
 			spin_lock(&this_rq->lock);
 			spin_lock(&this_rq->lock);
+			ret = 1;
 		} else
 		} else
 			spin_lock(&busiest->lock);
 			spin_lock(&busiest->lock);
 	}
 	}
+	return ret;
 }
 }
 
 
 /*
 /*
@@ -3485,12 +3736,14 @@ void scheduler_tick(void)
 	/*
 	/*
 	 * Let rq->clock advance by at least TICK_NSEC:
 	 * Let rq->clock advance by at least TICK_NSEC:
 	 */
 	 */
-	if (unlikely(rq->clock < next_tick))
+	if (unlikely(rq->clock < next_tick)) {
 		rq->clock = next_tick;
 		rq->clock = next_tick;
+		rq->clock_underflows++;
+	}
 	rq->tick_timestamp = rq->clock;
 	rq->tick_timestamp = rq->clock;
 	update_cpu_load(rq);
 	update_cpu_load(rq);
-	if (curr != rq->idle) /* FIXME: needed? */
-		curr->sched_class->task_tick(rq, curr);
+	curr->sched_class->task_tick(rq, curr, 0);
+	update_sched_rt_period(rq);
 	spin_unlock(&rq->lock);
 	spin_unlock(&rq->lock);
 
 
 #ifdef CONFIG_SMP
 #ifdef CONFIG_SMP
@@ -3636,6 +3889,8 @@ need_resched_nonpreemptible:
 
 
 	schedule_debug(prev);
 	schedule_debug(prev);
 
 
+	hrtick_clear(rq);
+
 	/*
 	/*
 	 * Do the rq-clock update outside the rq lock:
 	 * Do the rq-clock update outside the rq lock:
 	 */
 	 */
@@ -3654,6 +3909,11 @@ need_resched_nonpreemptible:
 		switch_count = &prev->nvcsw;
 		switch_count = &prev->nvcsw;
 	}
 	}
 
 
+#ifdef CONFIG_SMP
+	if (prev->sched_class->pre_schedule)
+		prev->sched_class->pre_schedule(rq, prev);
+#endif
+
 	if (unlikely(!rq->nr_running))
 	if (unlikely(!rq->nr_running))
 		idle_balance(cpu, rq);
 		idle_balance(cpu, rq);
 
 
@@ -3668,14 +3928,20 @@ need_resched_nonpreemptible:
 		++*switch_count;
 		++*switch_count;
 
 
 		context_switch(rq, prev, next); /* unlocks the rq */
 		context_switch(rq, prev, next); /* unlocks the rq */
+		/*
+		 * the context switch might have flipped the stack from under
+		 * us, hence refresh the local variables.
+		 */
+		cpu = smp_processor_id();
+		rq = cpu_rq(cpu);
 	} else
 	} else
 		spin_unlock_irq(&rq->lock);
 		spin_unlock_irq(&rq->lock);
 
 
-	if (unlikely(reacquire_kernel_lock(current) < 0)) {
-		cpu = smp_processor_id();
-		rq = cpu_rq(cpu);
+	hrtick_set(rq);
+
+	if (unlikely(reacquire_kernel_lock(current) < 0))
 		goto need_resched_nonpreemptible;
 		goto need_resched_nonpreemptible;
-	}
+
 	preempt_enable_no_resched();
 	preempt_enable_no_resched();
 	if (unlikely(test_thread_flag(TIF_NEED_RESCHED)))
 	if (unlikely(test_thread_flag(TIF_NEED_RESCHED)))
 		goto need_resched;
 		goto need_resched;
@@ -3691,10 +3957,9 @@ EXPORT_SYMBOL(schedule);
 asmlinkage void __sched preempt_schedule(void)
 asmlinkage void __sched preempt_schedule(void)
 {
 {
 	struct thread_info *ti = current_thread_info();
 	struct thread_info *ti = current_thread_info();
-#ifdef CONFIG_PREEMPT_BKL
 	struct task_struct *task = current;
 	struct task_struct *task = current;
 	int saved_lock_depth;
 	int saved_lock_depth;
-#endif
+
 	/*
 	/*
 	 * If there is a non-zero preempt_count or interrupts are disabled,
 	 * If there is a non-zero preempt_count or interrupts are disabled,
 	 * we do not want to preempt the current task. Just return..
 	 * we do not want to preempt the current task. Just return..
@@ -3710,14 +3975,10 @@ asmlinkage void __sched preempt_schedule(void)
 		 * clear ->lock_depth so that schedule() doesnt
 		 * clear ->lock_depth so that schedule() doesnt
 		 * auto-release the semaphore:
 		 * auto-release the semaphore:
 		 */
 		 */
-#ifdef CONFIG_PREEMPT_BKL
 		saved_lock_depth = task->lock_depth;
 		saved_lock_depth = task->lock_depth;
 		task->lock_depth = -1;
 		task->lock_depth = -1;
-#endif
 		schedule();
 		schedule();
-#ifdef CONFIG_PREEMPT_BKL
 		task->lock_depth = saved_lock_depth;
 		task->lock_depth = saved_lock_depth;
-#endif
 		sub_preempt_count(PREEMPT_ACTIVE);
 		sub_preempt_count(PREEMPT_ACTIVE);
 
 
 		/*
 		/*
@@ -3738,10 +3999,9 @@ EXPORT_SYMBOL(preempt_schedule);
 asmlinkage void __sched preempt_schedule_irq(void)
 asmlinkage void __sched preempt_schedule_irq(void)
 {
 {
 	struct thread_info *ti = current_thread_info();
 	struct thread_info *ti = current_thread_info();
-#ifdef CONFIG_PREEMPT_BKL
 	struct task_struct *task = current;
 	struct task_struct *task = current;
 	int saved_lock_depth;
 	int saved_lock_depth;
-#endif
+
 	/* Catch callers which need to be fixed */
 	/* Catch callers which need to be fixed */
 	BUG_ON(ti->preempt_count || !irqs_disabled());
 	BUG_ON(ti->preempt_count || !irqs_disabled());
 
 
@@ -3753,16 +4013,12 @@ asmlinkage void __sched preempt_schedule_irq(void)
 		 * clear ->lock_depth so that schedule() doesnt
 		 * clear ->lock_depth so that schedule() doesnt
 		 * auto-release the semaphore:
 		 * auto-release the semaphore:
 		 */
 		 */
-#ifdef CONFIG_PREEMPT_BKL
 		saved_lock_depth = task->lock_depth;
 		saved_lock_depth = task->lock_depth;
 		task->lock_depth = -1;
 		task->lock_depth = -1;
-#endif
 		local_irq_enable();
 		local_irq_enable();
 		schedule();
 		schedule();
 		local_irq_disable();
 		local_irq_disable();
-#ifdef CONFIG_PREEMPT_BKL
 		task->lock_depth = saved_lock_depth;
 		task->lock_depth = saved_lock_depth;
-#endif
 		sub_preempt_count(PREEMPT_ACTIVE);
 		sub_preempt_count(PREEMPT_ACTIVE);
 
 
 		/*
 		/*
@@ -4019,6 +4275,7 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
 	unsigned long flags;
 	unsigned long flags;
 	int oldprio, on_rq, running;
 	int oldprio, on_rq, running;
 	struct rq *rq;
 	struct rq *rq;
+	const struct sched_class *prev_class = p->sched_class;
 
 
 	BUG_ON(prio < 0 || prio > MAX_PRIO);
 	BUG_ON(prio < 0 || prio > MAX_PRIO);
 
 
@@ -4044,18 +4301,10 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
 	if (on_rq) {
 	if (on_rq) {
 		if (running)
 		if (running)
 			p->sched_class->set_curr_task(rq);
 			p->sched_class->set_curr_task(rq);
+
 		enqueue_task(rq, p, 0);
 		enqueue_task(rq, p, 0);
-		/*
-		 * Reschedule if we are currently running on this runqueue and
-		 * our priority decreased, or if we are not currently running on
-		 * this runqueue and our priority is higher than the current's
-		 */
-		if (running) {
-			if (p->prio > oldprio)
-				resched_task(rq->curr);
-		} else {
-			check_preempt_curr(rq, p);
-		}
+
+		check_class_changed(rq, p, prev_class, oldprio, running);
 	}
 	}
 	task_rq_unlock(rq, &flags);
 	task_rq_unlock(rq, &flags);
 }
 }
@@ -4087,10 +4336,8 @@ void set_user_nice(struct task_struct *p, long nice)
 		goto out_unlock;
 		goto out_unlock;
 	}
 	}
 	on_rq = p->se.on_rq;
 	on_rq = p->se.on_rq;
-	if (on_rq) {
+	if (on_rq)
 		dequeue_task(rq, p, 0);
 		dequeue_task(rq, p, 0);
-		dec_load(rq, p);
-	}
 
 
 	p->static_prio = NICE_TO_PRIO(nice);
 	p->static_prio = NICE_TO_PRIO(nice);
 	set_load_weight(p);
 	set_load_weight(p);
@@ -4100,7 +4347,6 @@ void set_user_nice(struct task_struct *p, long nice)
 
 
 	if (on_rq) {
 	if (on_rq) {
 		enqueue_task(rq, p, 0);
 		enqueue_task(rq, p, 0);
-		inc_load(rq, p);
 		/*
 		/*
 		 * If the task increased its priority or is running and
 		 * If the task increased its priority or is running and
 		 * lowered its priority, then reschedule its CPU:
 		 * lowered its priority, then reschedule its CPU:
@@ -4258,6 +4504,7 @@ int sched_setscheduler(struct task_struct *p, int policy,
 {
 {
 	int retval, oldprio, oldpolicy = -1, on_rq, running;
 	int retval, oldprio, oldpolicy = -1, on_rq, running;
 	unsigned long flags;
 	unsigned long flags;
+	const struct sched_class *prev_class = p->sched_class;
 	struct rq *rq;
 	struct rq *rq;
 
 
 	/* may grab non-irq protected spin_locks */
 	/* may grab non-irq protected spin_locks */
@@ -4351,18 +4598,10 @@ recheck:
 	if (on_rq) {
 	if (on_rq) {
 		if (running)
 		if (running)
 			p->sched_class->set_curr_task(rq);
 			p->sched_class->set_curr_task(rq);
+
 		activate_task(rq, p, 0);
 		activate_task(rq, p, 0);
-		/*
-		 * Reschedule if we are currently running on this runqueue and
-		 * our priority decreased, or if we are not currently running on
-		 * this runqueue and our priority is higher than the current's
-		 */
-		if (running) {
-			if (p->prio > oldprio)
-				resched_task(rq->curr);
-		} else {
-			check_preempt_curr(rq, p);
-		}
+
+		check_class_changed(rq, p, prev_class, oldprio, running);
 	}
 	}
 	__task_rq_unlock(rq);
 	__task_rq_unlock(rq);
 	spin_unlock_irqrestore(&p->pi_lock, flags);
 	spin_unlock_irqrestore(&p->pi_lock, flags);
@@ -4490,13 +4729,13 @@ long sched_setaffinity(pid_t pid, cpumask_t new_mask)
 	struct task_struct *p;
 	struct task_struct *p;
 	int retval;
 	int retval;
 
 
-	mutex_lock(&sched_hotcpu_mutex);
+	get_online_cpus();
 	read_lock(&tasklist_lock);
 	read_lock(&tasklist_lock);
 
 
 	p = find_process_by_pid(pid);
 	p = find_process_by_pid(pid);
 	if (!p) {
 	if (!p) {
 		read_unlock(&tasklist_lock);
 		read_unlock(&tasklist_lock);
-		mutex_unlock(&sched_hotcpu_mutex);
+		put_online_cpus();
 		return -ESRCH;
 		return -ESRCH;
 	}
 	}
 
 
@@ -4536,7 +4775,7 @@ long sched_setaffinity(pid_t pid, cpumask_t new_mask)
 	}
 	}
 out_unlock:
 out_unlock:
 	put_task_struct(p);
 	put_task_struct(p);
-	mutex_unlock(&sched_hotcpu_mutex);
+	put_online_cpus();
 	return retval;
 	return retval;
 }
 }
 
 
@@ -4593,7 +4832,7 @@ long sched_getaffinity(pid_t pid, cpumask_t *mask)
 	struct task_struct *p;
 	struct task_struct *p;
 	int retval;
 	int retval;
 
 
-	mutex_lock(&sched_hotcpu_mutex);
+	get_online_cpus();
 	read_lock(&tasklist_lock);
 	read_lock(&tasklist_lock);
 
 
 	retval = -ESRCH;
 	retval = -ESRCH;
@@ -4609,7 +4848,7 @@ long sched_getaffinity(pid_t pid, cpumask_t *mask)
 
 
 out_unlock:
 out_unlock:
 	read_unlock(&tasklist_lock);
 	read_unlock(&tasklist_lock);
-	mutex_unlock(&sched_hotcpu_mutex);
+	put_online_cpus();
 
 
 	return retval;
 	return retval;
 }
 }
@@ -4683,7 +4922,8 @@ static void __cond_resched(void)
 	} while (need_resched());
 	} while (need_resched());
 }
 }
 
 
-int __sched cond_resched(void)
+#if !defined(CONFIG_PREEMPT) || defined(CONFIG_PREEMPT_VOLUNTARY)
+int __sched _cond_resched(void)
 {
 {
 	if (need_resched() && !(preempt_count() & PREEMPT_ACTIVE) &&
 	if (need_resched() && !(preempt_count() & PREEMPT_ACTIVE) &&
 					system_state == SYSTEM_RUNNING) {
 					system_state == SYSTEM_RUNNING) {
@@ -4692,7 +4932,8 @@ int __sched cond_resched(void)
 	}
 	}
 	return 0;
 	return 0;
 }
 }
-EXPORT_SYMBOL(cond_resched);
+EXPORT_SYMBOL(_cond_resched);
+#endif
 
 
 /*
 /*
  * cond_resched_lock() - if a reschedule is pending, drop the given lock,
  * cond_resched_lock() - if a reschedule is pending, drop the given lock,
@@ -4890,7 +5131,7 @@ out_unlock:
 
 
 static const char stat_nam[] = "RSDTtZX";
 static const char stat_nam[] = "RSDTtZX";
 
 
-static void show_task(struct task_struct *p)
+void sched_show_task(struct task_struct *p)
 {
 {
 	unsigned long free = 0;
 	unsigned long free = 0;
 	unsigned state;
 	unsigned state;
@@ -4920,8 +5161,7 @@ static void show_task(struct task_struct *p)
 	printk(KERN_CONT "%5lu %5d %6d\n", free,
 	printk(KERN_CONT "%5lu %5d %6d\n", free,
 		task_pid_nr(p), task_pid_nr(p->real_parent));
 		task_pid_nr(p), task_pid_nr(p->real_parent));
 
 
-	if (state != TASK_RUNNING)
-		show_stack(p, NULL);
+	show_stack(p, NULL);
 }
 }
 
 
 void show_state_filter(unsigned long state_filter)
 void show_state_filter(unsigned long state_filter)
@@ -4943,7 +5183,7 @@ void show_state_filter(unsigned long state_filter)
 		 */
 		 */
 		touch_nmi_watchdog();
 		touch_nmi_watchdog();
 		if (!state_filter || (p->state & state_filter))
 		if (!state_filter || (p->state & state_filter))
-			show_task(p);
+			sched_show_task(p);
 	} while_each_thread(g, p);
 	} while_each_thread(g, p);
 
 
 	touch_all_softlockup_watchdogs();
 	touch_all_softlockup_watchdogs();
@@ -4992,11 +5232,8 @@ void __cpuinit init_idle(struct task_struct *idle, int cpu)
 	spin_unlock_irqrestore(&rq->lock, flags);
 	spin_unlock_irqrestore(&rq->lock, flags);
 
 
 	/* Set the preempt count _outside_ the spinlocks! */
 	/* Set the preempt count _outside_ the spinlocks! */
-#if defined(CONFIG_PREEMPT) && !defined(CONFIG_PREEMPT_BKL)
-	task_thread_info(idle)->preempt_count = (idle->lock_depth >= 0);
-#else
 	task_thread_info(idle)->preempt_count = 0;
 	task_thread_info(idle)->preempt_count = 0;
-#endif
+
 	/*
 	/*
 	 * The idle tasks have their own, simple scheduling class:
 	 * The idle tasks have their own, simple scheduling class:
 	 */
 	 */
@@ -5077,7 +5314,13 @@ int set_cpus_allowed(struct task_struct *p, cpumask_t new_mask)
 		goto out;
 		goto out;
 	}
 	}
 
 
-	p->cpus_allowed = new_mask;
+	if (p->sched_class->set_cpus_allowed)
+		p->sched_class->set_cpus_allowed(p, &new_mask);
+	else {
+		p->cpus_allowed = new_mask;
+		p->rt.nr_cpus_allowed = cpus_weight(new_mask);
+	}
+
 	/* Can the task run on the task's current CPU? If so, we're done */
 	/* Can the task run on the task's current CPU? If so, we're done */
 	if (cpu_isset(task_cpu(p), new_mask))
 	if (cpu_isset(task_cpu(p), new_mask))
 		goto out;
 		goto out;
@@ -5569,9 +5812,6 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
 	struct rq *rq;
 	struct rq *rq;
 
 
 	switch (action) {
 	switch (action) {
-	case CPU_LOCK_ACQUIRE:
-		mutex_lock(&sched_hotcpu_mutex);
-		break;
 
 
 	case CPU_UP_PREPARE:
 	case CPU_UP_PREPARE:
 	case CPU_UP_PREPARE_FROZEN:
 	case CPU_UP_PREPARE_FROZEN:
@@ -5590,6 +5830,15 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
 	case CPU_ONLINE_FROZEN:
 	case CPU_ONLINE_FROZEN:
 		/* Strictly unnecessary, as first user will wake it. */
 		/* Strictly unnecessary, as first user will wake it. */
 		wake_up_process(cpu_rq(cpu)->migration_thread);
 		wake_up_process(cpu_rq(cpu)->migration_thread);
+
+		/* Update our root-domain */
+		rq = cpu_rq(cpu);
+		spin_lock_irqsave(&rq->lock, flags);
+		if (rq->rd) {
+			BUG_ON(!cpu_isset(cpu, rq->rd->span));
+			cpu_set(cpu, rq->rd->online);
+		}
+		spin_unlock_irqrestore(&rq->lock, flags);
 		break;
 		break;
 
 
 #ifdef CONFIG_HOTPLUG_CPU
 #ifdef CONFIG_HOTPLUG_CPU
@@ -5640,10 +5889,18 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
 		}
 		}
 		spin_unlock_irq(&rq->lock);
 		spin_unlock_irq(&rq->lock);
 		break;
 		break;
-#endif
-	case CPU_LOCK_RELEASE:
-		mutex_unlock(&sched_hotcpu_mutex);
+
+	case CPU_DOWN_PREPARE:
+		/* Update our root-domain */
+		rq = cpu_rq(cpu);
+		spin_lock_irqsave(&rq->lock, flags);
+		if (rq->rd) {
+			BUG_ON(!cpu_isset(cpu, rq->rd->span));
+			cpu_clear(cpu, rq->rd->online);
+		}
+		spin_unlock_irqrestore(&rq->lock, flags);
 		break;
 		break;
+#endif
 	}
 	}
 	return NOTIFY_OK;
 	return NOTIFY_OK;
 }
 }
@@ -5831,11 +6088,76 @@ sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent)
 	return 1;
 	return 1;
 }
 }
 
 
+static void rq_attach_root(struct rq *rq, struct root_domain *rd)
+{
+	unsigned long flags;
+	const struct sched_class *class;
+
+	spin_lock_irqsave(&rq->lock, flags);
+
+	if (rq->rd) {
+		struct root_domain *old_rd = rq->rd;
+
+		for (class = sched_class_highest; class; class = class->next) {
+			if (class->leave_domain)
+				class->leave_domain(rq);
+		}
+
+		cpu_clear(rq->cpu, old_rd->span);
+		cpu_clear(rq->cpu, old_rd->online);
+
+		if (atomic_dec_and_test(&old_rd->refcount))
+			kfree(old_rd);
+	}
+
+	atomic_inc(&rd->refcount);
+	rq->rd = rd;
+
+	cpu_set(rq->cpu, rd->span);
+	if (cpu_isset(rq->cpu, cpu_online_map))
+		cpu_set(rq->cpu, rd->online);
+
+	for (class = sched_class_highest; class; class = class->next) {
+		if (class->join_domain)
+			class->join_domain(rq);
+	}
+
+	spin_unlock_irqrestore(&rq->lock, flags);
+}
+
+static void init_rootdomain(struct root_domain *rd)
+{
+	memset(rd, 0, sizeof(*rd));
+
+	cpus_clear(rd->span);
+	cpus_clear(rd->online);
+}
+
+static void init_defrootdomain(void)
+{
+	init_rootdomain(&def_root_domain);
+	atomic_set(&def_root_domain.refcount, 1);
+}
+
+static struct root_domain *alloc_rootdomain(void)
+{
+	struct root_domain *rd;
+
+	rd = kmalloc(sizeof(*rd), GFP_KERNEL);
+	if (!rd)
+		return NULL;
+
+	init_rootdomain(rd);
+
+	return rd;
+}
+
 /*
 /*
- * Attach the domain 'sd' to 'cpu' as its base domain.  Callers must
+ * Attach the domain 'sd' to 'cpu' as its base domain. Callers must
  * hold the hotplug lock.
  * hold the hotplug lock.
  */
  */
-static void cpu_attach_domain(struct sched_domain *sd, int cpu)
+static void
+cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu)
 {
 {
 	struct rq *rq = cpu_rq(cpu);
 	struct rq *rq = cpu_rq(cpu);
 	struct sched_domain *tmp;
 	struct sched_domain *tmp;
@@ -5860,6 +6182,7 @@ static void cpu_attach_domain(struct sched_domain *sd, int cpu)
 
 
 	sched_domain_debug(sd, cpu);
 	sched_domain_debug(sd, cpu);
 
 
+	rq_attach_root(rq, rd);
 	rcu_assign_pointer(rq->sd, sd);
 	rcu_assign_pointer(rq->sd, sd);
 }
 }
 
 
@@ -6228,6 +6551,7 @@ static void init_sched_groups_power(int cpu, struct sched_domain *sd)
 static int build_sched_domains(const cpumask_t *cpu_map)
 static int build_sched_domains(const cpumask_t *cpu_map)
 {
 {
 	int i;
 	int i;
+	struct root_domain *rd;
 #ifdef CONFIG_NUMA
 #ifdef CONFIG_NUMA
 	struct sched_group **sched_group_nodes = NULL;
 	struct sched_group **sched_group_nodes = NULL;
 	int sd_allnodes = 0;
 	int sd_allnodes = 0;
@@ -6244,6 +6568,12 @@ static int build_sched_domains(const cpumask_t *cpu_map)
 	sched_group_nodes_bycpu[first_cpu(*cpu_map)] = sched_group_nodes;
 	sched_group_nodes_bycpu[first_cpu(*cpu_map)] = sched_group_nodes;
 #endif
 #endif
 
 
+	rd = alloc_rootdomain();
+	if (!rd) {
+		printk(KERN_WARNING "Cannot alloc root domain\n");
+		return -ENOMEM;
+	}
+
 	/*
 	/*
 	 * Set up domains for cpus specified by the cpu_map.
 	 * Set up domains for cpus specified by the cpu_map.
 	 */
 	 */
@@ -6460,7 +6790,7 @@ static int build_sched_domains(const cpumask_t *cpu_map)
 #else
 #else
 		sd = &per_cpu(phys_domains, i);
 		sd = &per_cpu(phys_domains, i);
 #endif
 #endif
-		cpu_attach_domain(sd, i);
+		cpu_attach_domain(sd, rd, i);
 	}
 	}
 
 
 	return 0;
 	return 0;
@@ -6518,7 +6848,7 @@ static void detach_destroy_domains(const cpumask_t *cpu_map)
 	unregister_sched_domain_sysctl();
 	unregister_sched_domain_sysctl();
 
 
 	for_each_cpu_mask(i, *cpu_map)
 	for_each_cpu_mask(i, *cpu_map)
-		cpu_attach_domain(NULL, i);
+		cpu_attach_domain(NULL, &def_root_domain, i);
 	synchronize_sched();
 	synchronize_sched();
 	arch_destroy_sched_domains(cpu_map);
 	arch_destroy_sched_domains(cpu_map);
 }
 }
@@ -6548,6 +6878,8 @@ void partition_sched_domains(int ndoms_new, cpumask_t *doms_new)
 {
 {
 	int i, j;
 	int i, j;
 
 
+	lock_doms_cur();
+
 	/* always unregister in case we don't destroy any domains */
 	/* always unregister in case we don't destroy any domains */
 	unregister_sched_domain_sysctl();
 	unregister_sched_domain_sysctl();
 
 
@@ -6588,6 +6920,8 @@ match2:
 	ndoms_cur = ndoms_new;
 	ndoms_cur = ndoms_new;
 
 
 	register_sched_domain_sysctl();
 	register_sched_domain_sysctl();
+
+	unlock_doms_cur();
 }
 }
 
 
 #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
 #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
@@ -6595,10 +6929,10 @@ static int arch_reinit_sched_domains(void)
 {
 {
 	int err;
 	int err;
 
 
-	mutex_lock(&sched_hotcpu_mutex);
+	get_online_cpus();
 	detach_destroy_domains(&cpu_online_map);
 	detach_destroy_domains(&cpu_online_map);
 	err = arch_init_sched_domains(&cpu_online_map);
 	err = arch_init_sched_domains(&cpu_online_map);
-	mutex_unlock(&sched_hotcpu_mutex);
+	put_online_cpus();
 
 
 	return err;
 	return err;
 }
 }
@@ -6709,12 +7043,12 @@ void __init sched_init_smp(void)
 {
 {
 	cpumask_t non_isolated_cpus;
 	cpumask_t non_isolated_cpus;
 
 
-	mutex_lock(&sched_hotcpu_mutex);
+	get_online_cpus();
 	arch_init_sched_domains(&cpu_online_map);
 	arch_init_sched_domains(&cpu_online_map);
 	cpus_andnot(non_isolated_cpus, cpu_possible_map, cpu_isolated_map);
 	cpus_andnot(non_isolated_cpus, cpu_possible_map, cpu_isolated_map);
 	if (cpus_empty(non_isolated_cpus))
 	if (cpus_empty(non_isolated_cpus))
 		cpu_set(smp_processor_id(), non_isolated_cpus);
 		cpu_set(smp_processor_id(), non_isolated_cpus);
-	mutex_unlock(&sched_hotcpu_mutex);
+	put_online_cpus();
 	/* XXX: Theoretical race here - CPU may be hotplugged now */
 	/* XXX: Theoretical race here - CPU may be hotplugged now */
 	hotcpu_notifier(update_sched_domains, 0);
 	hotcpu_notifier(update_sched_domains, 0);
 
 
@@ -6722,6 +7056,21 @@ void __init sched_init_smp(void)
 	if (set_cpus_allowed(current, non_isolated_cpus) < 0)
 	if (set_cpus_allowed(current, non_isolated_cpus) < 0)
 		BUG();
 		BUG();
 	sched_init_granularity();
 	sched_init_granularity();
+
+#ifdef CONFIG_FAIR_GROUP_SCHED
+	if (nr_cpu_ids == 1)
+		return;
+
+	lb_monitor_task = kthread_create(load_balance_monitor, NULL,
+					 "group_balance");
+	if (!IS_ERR(lb_monitor_task)) {
+		lb_monitor_task->flags |= PF_NOFREEZE;
+		wake_up_process(lb_monitor_task);
+	} else {
+		printk(KERN_ERR "Could not create load balance monitor thread"
+			"(error = %ld) \n", PTR_ERR(lb_monitor_task));
+	}
+#endif
 }
 }
 #else
 #else
 void __init sched_init_smp(void)
 void __init sched_init_smp(void)
@@ -6746,13 +7095,87 @@ static void init_cfs_rq(struct cfs_rq *cfs_rq, struct rq *rq)
 	cfs_rq->min_vruntime = (u64)(-(1LL << 20));
 	cfs_rq->min_vruntime = (u64)(-(1LL << 20));
 }
 }
 
 
+/*
+ * Put @rt_rq into its initial, empty state: every per-priority list of the
+ * active array is emptied and its bitmap bit cleared, and the run-time and
+ * throttle accounting is zeroed.  @rq is only stored (as rt_rq->rq) when
+ * CONFIG_FAIR_GROUP_SCHED is set.
+ */
+static void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq)
+{
+	struct rt_prio_array *array;
+	int i;
+
+	array = &rt_rq->active;
+	for (i = 0; i < MAX_RT_PRIO; i++) {
+		INIT_LIST_HEAD(array->queue + i);
+		__clear_bit(i, array->bitmap);
+	}
+	/* delimiter for bitsearch: */
+	__set_bit(MAX_RT_PRIO, array->bitmap);
+
+#if defined CONFIG_SMP || defined CONFIG_FAIR_GROUP_SCHED
+	/*
+	 * MAX_RT_PRIO is one past the last queue index initialised above;
+	 * presumably it stands for "nothing queued" - confirm against users.
+	 */
+	rt_rq->highest_prio = MAX_RT_PRIO;
+#endif
+#ifdef CONFIG_SMP
+	rt_rq->rt_nr_migratory = 0;
+	rt_rq->overloaded = 0;
+#endif
+
+	rt_rq->rt_time = 0;
+	rt_rq->rt_throttled = 0;
+
+#ifdef CONFIG_FAIR_GROUP_SCHED
+	rt_rq->rq = rq;
+#endif
+}
+
+#ifdef CONFIG_FAIR_GROUP_SCHED
+/*
+ * Wire up the CFS side of task group @tg on @cpu.
+ *
+ * @cfs_rq becomes the group's runqueue on that cpu (tg->cfs_rq[cpu]) and is
+ * initialised against @rq; @se becomes the entity that represents the group
+ * (tg->se[cpu]), queued on the cpu's top-level rq->cfs with no parent.
+ *
+ * @add: when non-zero, also link @cfs_rq onto rq->leaf_cfs_rq_list with a
+ *       plain list_add().  Callers passing 0 do the linking themselves
+ *       (sched_create_group() uses list_add_rcu() under the task-group lock).
+ *
+ * tg->shares is used as a divisor below, so it must already be set and
+ * non-zero when this is called.
+ */
+static void init_tg_cfs_entry(struct rq *rq, struct task_group *tg,
+		struct cfs_rq *cfs_rq, struct sched_entity *se,
+		int cpu, int add)
+{
+	tg->cfs_rq[cpu] = cfs_rq;
+	init_cfs_rq(cfs_rq, rq);
+	cfs_rq->tg = tg;
+	if (add)
+		list_add(&cfs_rq->leaf_cfs_rq_list, &rq->leaf_cfs_rq_list);
+
+	tg->se[cpu] = se;
+	se->cfs_rq = &rq->cfs;
+	se->my_q = cfs_rq;
+	se->load.weight = tg->shares;
+	/* inverse weight is 2^32 / weight */
+	se->load.inv_weight = div64_64(1ULL<<32, se->load.weight);
+	se->parent = NULL;
+}
+
+/*
+ * Wire up the RT side of task group @tg on @cpu; the RT counterpart of
+ * init_tg_cfs_entry().
+ *
+ * @rt_rq becomes the group's RT runqueue on that cpu (tg->rt_rq[cpu]) and is
+ * initialised against @rq; @rt_se becomes the entity that represents the
+ * group (tg->rt_se[cpu]), attached to the cpu's top-level rq->rt with no
+ * parent and an empty run_list.
+ *
+ * @add: when non-zero, also link @rt_rq onto rq->leaf_rt_rq_list with a
+ *       plain list_add(); callers passing 0 link it themselves.
+ */
+static void init_tg_rt_entry(struct rq *rq, struct task_group *tg,
+		struct rt_rq *rt_rq, struct sched_rt_entity *rt_se,
+		int cpu, int add)
+{
+	tg->rt_rq[cpu] = rt_rq;
+	init_rt_rq(rt_rq, rq);
+	rt_rq->tg = tg;
+	rt_rq->rt_se = rt_se;
+	if (add)
+		list_add(&rt_rq->leaf_rt_rq_list, &rq->leaf_rt_rq_list);
+
+	tg->rt_se[cpu] = rt_se;
+	rt_se->rt_rq = &rq->rt;
+	rt_se->my_q = rt_rq;
+	rt_se->parent = NULL;
+	INIT_LIST_HEAD(&rt_se->run_list);
+}
+#endif
+
 void __init sched_init(void)
 void __init sched_init(void)
 {
 {
 	int highest_cpu = 0;
 	int highest_cpu = 0;
 	int i, j;
 	int i, j;
 
 
+#ifdef CONFIG_SMP
+	init_defrootdomain();
+#endif
+
+#ifdef CONFIG_FAIR_GROUP_SCHED
+	list_add(&init_task_group.list, &task_groups);
+#endif
+
 	for_each_possible_cpu(i) {
 	for_each_possible_cpu(i) {
-		struct rt_prio_array *array;
 		struct rq *rq;
 		struct rq *rq;
 
 
 		rq = cpu_rq(i);
 		rq = cpu_rq(i);
@@ -6761,52 +7184,39 @@ void __init sched_init(void)
 		rq->nr_running = 0;
 		rq->nr_running = 0;
 		rq->clock = 1;
 		rq->clock = 1;
 		init_cfs_rq(&rq->cfs, rq);
 		init_cfs_rq(&rq->cfs, rq);
+		init_rt_rq(&rq->rt, rq);
 #ifdef CONFIG_FAIR_GROUP_SCHED
 #ifdef CONFIG_FAIR_GROUP_SCHED
-		INIT_LIST_HEAD(&rq->leaf_cfs_rq_list);
-		{
-			struct cfs_rq *cfs_rq = &per_cpu(init_cfs_rq, i);
-			struct sched_entity *se =
-					 &per_cpu(init_sched_entity, i);
-
-			init_cfs_rq_p[i] = cfs_rq;
-			init_cfs_rq(cfs_rq, rq);
-			cfs_rq->tg = &init_task_group;
-			list_add(&cfs_rq->leaf_cfs_rq_list,
-							 &rq->leaf_cfs_rq_list);
-
-			init_sched_entity_p[i] = se;
-			se->cfs_rq = &rq->cfs;
-			se->my_q = cfs_rq;
-			se->load.weight = init_task_group_load;
-			se->load.inv_weight =
-				 div64_64(1ULL<<32, init_task_group_load);
-			se->parent = NULL;
-		}
 		init_task_group.shares = init_task_group_load;
 		init_task_group.shares = init_task_group_load;
-		spin_lock_init(&init_task_group.lock);
+		INIT_LIST_HEAD(&rq->leaf_cfs_rq_list);
+		init_tg_cfs_entry(rq, &init_task_group,
+				&per_cpu(init_cfs_rq, i),
+				&per_cpu(init_sched_entity, i), i, 1);
+
+		init_task_group.rt_ratio = sysctl_sched_rt_ratio; /* XXX */
+		INIT_LIST_HEAD(&rq->leaf_rt_rq_list);
+		init_tg_rt_entry(rq, &init_task_group,
+				&per_cpu(init_rt_rq, i),
+				&per_cpu(init_sched_rt_entity, i), i, 1);
 #endif
 #endif
+		rq->rt_period_expire = 0;
+		rq->rt_throttled = 0;
 
 
 		for (j = 0; j < CPU_LOAD_IDX_MAX; j++)
 		for (j = 0; j < CPU_LOAD_IDX_MAX; j++)
 			rq->cpu_load[j] = 0;
 			rq->cpu_load[j] = 0;
 #ifdef CONFIG_SMP
 #ifdef CONFIG_SMP
 		rq->sd = NULL;
 		rq->sd = NULL;
+		rq->rd = NULL;
 		rq->active_balance = 0;
 		rq->active_balance = 0;
 		rq->next_balance = jiffies;
 		rq->next_balance = jiffies;
 		rq->push_cpu = 0;
 		rq->push_cpu = 0;
 		rq->cpu = i;
 		rq->cpu = i;
 		rq->migration_thread = NULL;
 		rq->migration_thread = NULL;
 		INIT_LIST_HEAD(&rq->migration_queue);
 		INIT_LIST_HEAD(&rq->migration_queue);
+		rq_attach_root(rq, &def_root_domain);
 #endif
 #endif
+		init_rq_hrtick(rq);
 		atomic_set(&rq->nr_iowait, 0);
 		atomic_set(&rq->nr_iowait, 0);
-
-		array = &rq->rt.active;
-		for (j = 0; j < MAX_RT_PRIO; j++) {
-			INIT_LIST_HEAD(array->queue + j);
-			__clear_bit(j, array->bitmap);
-		}
 		highest_cpu = i;
 		highest_cpu = i;
-		/* delimiter for bitsearch: */
-		__set_bit(MAX_RT_PRIO, array->bitmap);
 	}
 	}
 
 
 	set_load_weight(&init_task);
 	set_load_weight(&init_task);
@@ -6975,12 +7385,187 @@ void set_curr_task(int cpu, struct task_struct *p)
 
 
 #ifdef CONFIG_FAIR_GROUP_SCHED
 #ifdef CONFIG_FAIR_GROUP_SCHED
 
 
+#ifdef CONFIG_SMP
+/*
+ * distribute shares of all task groups among their schedulable entities,
+ * to reflect load distribution across cpus.
+ *
+ * @sd:       sched domain whose span (sd->span) is rebalanced.
+ * @this_cpu: cpu whose rq->leaf_cfs_rq_list is walked to enumerate groups.
+ *
+ * Returns 1 if no entity's weight had to be changed ("balanced"), 0 if at
+ * least one set_se_shares() call was made.
+ *
+ * The leaf list is walked with for_each_leaf_cfs_rq(); the caller in
+ * load_balance_monitor() holds rcu_read_lock() around this call.  Each
+ * cpu's rq->lock is taken here around set_se_shares().
+ */
+static int rebalance_shares(struct sched_domain *sd, int this_cpu)
+{
+	struct cfs_rq *cfs_rq;
+	struct rq *rq = cpu_rq(this_cpu);
+	cpumask_t sdspan = sd->span;
+	int balanced = 1;
+
+	/* Walk through all the task groups that we have */
+	for_each_leaf_cfs_rq(rq, cfs_rq) {
+		int i;
+		unsigned long total_load = 0, total_shares;
+		struct task_group *tg = cfs_rq->tg;
+
+		/* Gather total task load of this group across cpus */
+		for_each_cpu_mask(i, sdspan)
+			total_load += tg->cfs_rq[i]->load.weight;
+
+		/* Nothing to do if this group has no load */
+		if (!total_load)
+			continue;
+
+		/*
+		 * tg->shares represents the number of cpu shares the task group
+		 * is eligible to hold on a single cpu. On N cpus, it is
+		 * eligible to hold (N * tg->shares) number of cpu shares.
+		 */
+		total_shares = tg->shares * cpus_weight(sdspan);
+
+		/*
+		 * redistribute total_shares across cpus as per the task load
+		 * distribution.
+		 */
+		for_each_cpu_mask(i, sdspan) {
+			unsigned long local_load, local_shares;
+
+			local_load = tg->cfs_rq[i]->load.weight;
+			/* this cpu's slice, proportional to its part of the load */
+			local_shares = (local_load * total_shares) / total_load;
+			/* never hand out a zero weight */
+			if (!local_shares)
+				local_shares = MIN_GROUP_SHARES;
+			/* already at the target weight: leave this cpu alone */
+			if (local_shares == tg->se[i]->load.weight)
+				continue;
+
+			spin_lock_irq(&cpu_rq(i)->lock);
+			set_se_shares(tg->se[i], local_shares);
+			spin_unlock_irq(&cpu_rq(i)->lock);
+			balanced = 0;
+		}
+	}
+
+	return balanced;
+}
+
+/*
+ * How frequently should we rebalance_shares() across cpus?
+ *
+ * The more frequently we rebalance shares, the more accurate is the fairness
+ * of cpu bandwidth distribution between task groups. However higher frequency
+ * also implies increased scheduling overhead.
+ *
+ * sysctl_sched_min_bal_int_shares represents the minimum interval between
+ * consecutive calls to rebalance_shares() in the same sched domain.
+ *
+ * sysctl_sched_max_bal_int_shares represents the maximum interval between
+ * consecutive calls to rebalance_shares() in the same sched domain.
+ *
+ * These settings allow for the appropriate trade-off between accuracy of
+ * fairness and the associated overhead.
+ *
+ */
+
+/* default: 8ms, units: milliseconds */
+const_debug unsigned int sysctl_sched_min_bal_int_shares = 8;
+
+/* default: 128ms, units: milliseconds */
+const_debug unsigned int sysctl_sched_max_bal_int_shares = 128;
+
+/*
+ * kernel thread that runs rebalance_shares() periodically
+ *
+ * Each pass visits every entry of doms_cur[], picks the highest sched domain
+ * with SD_LOAD_BALANCE set on the first cpu of that entry, and rebalances
+ * group shares across it.  The sleep between passes (milliseconds) starts at
+ * sysctl_sched_min_bal_int_shares, is doubled after a pass that changed
+ * nothing while it is still below sysctl_sched_max_bal_int_shares, and drops
+ * back to the minimum as soon as any pass had to adjust shares.
+ *
+ * @unused: thread argument, not used.
+ * Returns 0 once kthread_should_stop() becomes true.
+ */
+static int load_balance_monitor(void *unused)
+{
+	unsigned int timeout = sysctl_sched_min_bal_int_shares;
+	struct sched_param schedparm;
+	int ret;
+
+	/*
+	 * We don't want this thread's execution to be limited by the shares
+	 * assigned to default group (init_task_group). Hence make it run
+	 * as a SCHED_RR RT task at the lowest priority.
+	 */
+	schedparm.sched_priority = 1;
+	ret = sched_setscheduler(current, SCHED_RR, &schedparm);
+	/* failure is only reported; the thread carries on with its policy */
+	if (ret)
+		printk(KERN_ERR "Couldn't set SCHED_RR policy for load balance"
+				" monitor thread (error = %d) \n", ret);
+
+	while (!kthread_should_stop()) {
+		int i, cpu, balanced = 1;
+
+		/* Prevent cpus going down or coming up */
+		get_online_cpus();
+		/* lockout changes to doms_cur[] array */
+		lock_doms_cur();
+		/*
+		 * Enter a rcu read-side critical section to safely walk rq->sd
+		 * chain on various cpus and to walk task group list
+		 * (rq->leaf_cfs_rq_list) in rebalance_shares().
+		 */
+		rcu_read_lock();
+
+		for (i = 0; i < ndoms_cur; i++) {
+			cpumask_t cpumap = doms_cur[i];
+			struct sched_domain *sd = NULL, *sd_prev = NULL;
+
+			cpu = first_cpu(cpumap);
+
+			/* Find the highest domain at which to balance shares */
+			for_each_domain(cpu, sd) {
+				if (!(sd->flags & SD_LOAD_BALANCE))
+					continue;
+				/* remember the last (highest) balancing domain */
+				sd_prev = sd;
+			}
+
+			sd = sd_prev;
+			/* sd == NULL? No load balance reqd in this domain */
+			if (!sd)
+				continue;
+
+			/* stays 1 only if every domain reported "balanced" */
+			balanced &= rebalance_shares(sd, cpu);
+		}
+
+		rcu_read_unlock();
+
+		/* release in the reverse order of acquisition */
+		unlock_doms_cur();
+		put_online_cpus();
+
+		if (!balanced)
+			timeout = sysctl_sched_min_bal_int_shares;
+		else if (timeout < sysctl_sched_max_bal_int_shares)
+			timeout *= 2;
+
+		msleep_interruptible(timeout);
+	}
+
+	return 0;
+}
+#endif	/* CONFIG_SMP */
+
+/*
+ * Free task group @tg together with its per-cpu cfs_rq, sched_entity, rt_rq
+ * and sched_rt_entity objects and the four pointer arrays that index them.
+ *
+ * Usable on a partially constructed group (the error path of
+ * sched_create_group()): a pointer array is only indexed if the array itself
+ * was allocated, and slots that were never filled are handed to kfree() as
+ * they are - this relies on the arrays having been zero-allocated.
+ */
+static void free_sched_group(struct task_group *tg)
+{
+	int i;
+
+	for_each_possible_cpu(i) {
+		if (tg->cfs_rq)
+			kfree(tg->cfs_rq[i]);
+		if (tg->se)
+			kfree(tg->se[i]);
+		if (tg->rt_rq)
+			kfree(tg->rt_rq[i]);
+		if (tg->rt_se)
+			kfree(tg->rt_se[i]);
+	}
+
+	kfree(tg->cfs_rq);
+	kfree(tg->se);
+	kfree(tg->rt_rq);
+	kfree(tg->rt_se);
+	kfree(tg);
+}
+
 /* allocate runqueue etc for a new task group */
 /* allocate runqueue etc for a new task group */
 struct task_group *sched_create_group(void)
 struct task_group *sched_create_group(void)
 {
 {
 	struct task_group *tg;
 	struct task_group *tg;
 	struct cfs_rq *cfs_rq;
 	struct cfs_rq *cfs_rq;
 	struct sched_entity *se;
 	struct sched_entity *se;
+	struct rt_rq *rt_rq;
+	struct sched_rt_entity *rt_se;
 	struct rq *rq;
 	struct rq *rq;
 	int i;
 	int i;
 
 
@@ -6994,97 +7579,89 @@ struct task_group *sched_create_group(void)
 	tg->se = kzalloc(sizeof(se) * NR_CPUS, GFP_KERNEL);
 	tg->se = kzalloc(sizeof(se) * NR_CPUS, GFP_KERNEL);
 	if (!tg->se)
 	if (!tg->se)
 		goto err;
 		goto err;
+	tg->rt_rq = kzalloc(sizeof(rt_rq) * NR_CPUS, GFP_KERNEL);
+	if (!tg->rt_rq)
+		goto err;
+	tg->rt_se = kzalloc(sizeof(rt_se) * NR_CPUS, GFP_KERNEL);
+	if (!tg->rt_se)
+		goto err;
+
+	tg->shares = NICE_0_LOAD;
+	tg->rt_ratio = 0; /* XXX */
 
 
 	for_each_possible_cpu(i) {
 	for_each_possible_cpu(i) {
 		rq = cpu_rq(i);
 		rq = cpu_rq(i);
 
 
-		cfs_rq = kmalloc_node(sizeof(struct cfs_rq), GFP_KERNEL,
-							 cpu_to_node(i));
+		cfs_rq = kmalloc_node(sizeof(struct cfs_rq),
+				GFP_KERNEL|__GFP_ZERO, cpu_to_node(i));
 		if (!cfs_rq)
 		if (!cfs_rq)
 			goto err;
 			goto err;
 
 
-		se = kmalloc_node(sizeof(struct sched_entity), GFP_KERNEL,
-							cpu_to_node(i));
+		se = kmalloc_node(sizeof(struct sched_entity),
+				GFP_KERNEL|__GFP_ZERO, cpu_to_node(i));
 		if (!se)
 		if (!se)
 			goto err;
 			goto err;
 
 
-		memset(cfs_rq, 0, sizeof(struct cfs_rq));
-		memset(se, 0, sizeof(struct sched_entity));
+		rt_rq = kmalloc_node(sizeof(struct rt_rq),
+				GFP_KERNEL|__GFP_ZERO, cpu_to_node(i));
+		if (!rt_rq)
+			goto err;
 
 
-		tg->cfs_rq[i] = cfs_rq;
-		init_cfs_rq(cfs_rq, rq);
-		cfs_rq->tg = tg;
+		rt_se = kmalloc_node(sizeof(struct sched_rt_entity),
+				GFP_KERNEL|__GFP_ZERO, cpu_to_node(i));
+		if (!rt_se)
+			goto err;
 
 
-		tg->se[i] = se;
-		se->cfs_rq = &rq->cfs;
-		se->my_q = cfs_rq;
-		se->load.weight = NICE_0_LOAD;
-		se->load.inv_weight = div64_64(1ULL<<32, NICE_0_LOAD);
-		se->parent = NULL;
+		init_tg_cfs_entry(rq, tg, cfs_rq, se, i, 0);
+		init_tg_rt_entry(rq, tg, rt_rq, rt_se, i, 0);
 	}
 	}
 
 
+	lock_task_group_list();
 	for_each_possible_cpu(i) {
 	for_each_possible_cpu(i) {
 		rq = cpu_rq(i);
 		rq = cpu_rq(i);
 		cfs_rq = tg->cfs_rq[i];
 		cfs_rq = tg->cfs_rq[i];
 		list_add_rcu(&cfs_rq->leaf_cfs_rq_list, &rq->leaf_cfs_rq_list);
 		list_add_rcu(&cfs_rq->leaf_cfs_rq_list, &rq->leaf_cfs_rq_list);
+		rt_rq = tg->rt_rq[i];
+		list_add_rcu(&rt_rq->leaf_rt_rq_list, &rq->leaf_rt_rq_list);
 	}
 	}
-
-	tg->shares = NICE_0_LOAD;
-	spin_lock_init(&tg->lock);
+	list_add_rcu(&tg->list, &task_groups);
+	unlock_task_group_list();
 
 
 	return tg;
 	return tg;
 
 
 err:
 err:
-	for_each_possible_cpu(i) {
-		if (tg->cfs_rq)
-			kfree(tg->cfs_rq[i]);
-		if (tg->se)
-			kfree(tg->se[i]);
-	}
-	kfree(tg->cfs_rq);
-	kfree(tg->se);
-	kfree(tg);
-
+	free_sched_group(tg);
 	return ERR_PTR(-ENOMEM);
 	return ERR_PTR(-ENOMEM);
 }
 }
 
 
 /* rcu callback to free various structures associated with a task group */
 /* rcu callback to free various structures associated with a task group */
-static void free_sched_group(struct rcu_head *rhp)
+static void free_sched_group_rcu(struct rcu_head *rhp)
 {
 {
-	struct task_group *tg = container_of(rhp, struct task_group, rcu);
-	struct cfs_rq *cfs_rq;
-	struct sched_entity *se;
-	int i;
-
 	/* now it should be safe to free those cfs_rqs */
 	/* now it should be safe to free those cfs_rqs */
-	for_each_possible_cpu(i) {
-		cfs_rq = tg->cfs_rq[i];
-		kfree(cfs_rq);
-
-		se = tg->se[i];
-		kfree(se);
-	}
-
-	kfree(tg->cfs_rq);
-	kfree(tg->se);
-	kfree(tg);
+	free_sched_group(container_of(rhp, struct task_group, rcu));
 }
 }
 
 
 /* Destroy runqueue etc associated with a task group */
 /* Destroy runqueue etc associated with a task group */
 void sched_destroy_group(struct task_group *tg)
 void sched_destroy_group(struct task_group *tg)
 {
 {
 	struct cfs_rq *cfs_rq = NULL;
 	struct cfs_rq *cfs_rq = NULL;
+	struct rt_rq *rt_rq = NULL;
 	int i;
 	int i;
 
 
+	lock_task_group_list();
 	for_each_possible_cpu(i) {
 	for_each_possible_cpu(i) {
 		cfs_rq = tg->cfs_rq[i];
 		cfs_rq = tg->cfs_rq[i];
 		list_del_rcu(&cfs_rq->leaf_cfs_rq_list);
 		list_del_rcu(&cfs_rq->leaf_cfs_rq_list);
+		rt_rq = tg->rt_rq[i];
+		list_del_rcu(&rt_rq->leaf_rt_rq_list);
 	}
 	}
+	list_del_rcu(&tg->list);
+	unlock_task_group_list();
 
 
 	BUG_ON(!cfs_rq);
 	BUG_ON(!cfs_rq);
 
 
 	/* wait for possible concurrent references to cfs_rqs complete */
 	/* wait for possible concurrent references to cfs_rqs complete */
-	call_rcu(&tg->rcu, free_sched_group);
+	call_rcu(&tg->rcu, free_sched_group_rcu);
 }
 }
 
 
 /* change task's runqueue when it moves between groups.
 /* change task's runqueue when it moves between groups.
@@ -7100,11 +7677,6 @@ void sched_move_task(struct task_struct *tsk)
 
 
 	rq = task_rq_lock(tsk, &flags);
 	rq = task_rq_lock(tsk, &flags);
 
 
-	if (tsk->sched_class != &fair_sched_class) {
-		set_task_cfs_rq(tsk, task_cpu(tsk));
-		goto done;
-	}
-
 	update_rq_clock(rq);
 	update_rq_clock(rq);
 
 
 	running = task_current(rq, tsk);
 	running = task_current(rq, tsk);
@@ -7116,7 +7688,7 @@ void sched_move_task(struct task_struct *tsk)
 			tsk->sched_class->put_prev_task(rq, tsk);
 			tsk->sched_class->put_prev_task(rq, tsk);
 	}
 	}
 
 
-	set_task_cfs_rq(tsk, task_cpu(tsk));
+	set_task_rq(tsk, task_cpu(tsk));
 
 
 	if (on_rq) {
 	if (on_rq) {
 		if (unlikely(running))
 		if (unlikely(running))
@@ -7124,53 +7696,82 @@ void sched_move_task(struct task_struct *tsk)
 		enqueue_task(rq, tsk, 0);
 		enqueue_task(rq, tsk, 0);
 	}
 	}
 
 
-done:
 	task_rq_unlock(rq, &flags);
 	task_rq_unlock(rq, &flags);
 }
 }
 
 
+/* rq->lock to be locked by caller */
 static void set_se_shares(struct sched_entity *se, unsigned long shares)
 static void set_se_shares(struct sched_entity *se, unsigned long shares)
 {
 {
 	struct cfs_rq *cfs_rq = se->cfs_rq;
 	struct cfs_rq *cfs_rq = se->cfs_rq;
 	struct rq *rq = cfs_rq->rq;
 	struct rq *rq = cfs_rq->rq;
 	int on_rq;
 	int on_rq;
 
 
-	spin_lock_irq(&rq->lock);
+	if (!shares)
+		shares = MIN_GROUP_SHARES;
 
 
 	on_rq = se->on_rq;
 	on_rq = se->on_rq;
-	if (on_rq)
+	if (on_rq) {
 		dequeue_entity(cfs_rq, se, 0);
 		dequeue_entity(cfs_rq, se, 0);
+		dec_cpu_load(rq, se->load.weight);
+	}
 
 
 	se->load.weight = shares;
 	se->load.weight = shares;
 	se->load.inv_weight = div64_64((1ULL<<32), shares);
 	se->load.inv_weight = div64_64((1ULL<<32), shares);
 
 
-	if (on_rq)
+	if (on_rq) {
 		enqueue_entity(cfs_rq, se, 0);
 		enqueue_entity(cfs_rq, se, 0);
-
-	spin_unlock_irq(&rq->lock);
+		inc_cpu_load(rq, se->load.weight);
+	}
 }
 }
 
 
 int sched_group_set_shares(struct task_group *tg, unsigned long shares)
 int sched_group_set_shares(struct task_group *tg, unsigned long shares)
 {
 {
 	int i;
 	int i;
+	struct cfs_rq *cfs_rq;
+	struct rq *rq;
+
+	lock_task_group_list();
+	if (tg->shares == shares)
+		goto done;
+
+	if (shares < MIN_GROUP_SHARES)
+		shares = MIN_GROUP_SHARES;
 
 
 	/*
 	/*
-	 * A weight of 0 or 1 can cause arithmetics problems.
-	 * (The default weight is 1024 - so there's no practical
-	 *  limitation from this.)
+	 * Prevent any load balance activity (rebalance_shares,
+	 * load_balance_fair) from referring to this group first,
+	 * by taking it off the rq->leaf_cfs_rq_list on each cpu.
 	 */
 	 */
-	if (shares < 2)
-		shares = 2;
+	for_each_possible_cpu(i) {
+		cfs_rq = tg->cfs_rq[i];
+		list_del_rcu(&cfs_rq->leaf_cfs_rq_list);
+	}
 
 
-	spin_lock(&tg->lock);
-	if (tg->shares == shares)
-		goto done;
+	/* wait for any ongoing reference to this group to finish */
+	synchronize_sched();
 
 
+	/*
+	 * Now we are free to modify the group's share on each cpu
+	 * w/o tripping rebalance_share or load_balance_fair.
+	 */
 	tg->shares = shares;
 	tg->shares = shares;
-	for_each_possible_cpu(i)
+	for_each_possible_cpu(i) {
+		spin_lock_irq(&cpu_rq(i)->lock);
 		set_se_shares(tg->se[i], shares);
 		set_se_shares(tg->se[i], shares);
+		spin_unlock_irq(&cpu_rq(i)->lock);
+	}
 
 
+	/*
+	 * Enable load balance activity on this group, by inserting it back on
+	 * each cpu's rq->leaf_cfs_rq_list.
+	 */
+	for_each_possible_cpu(i) {
+		rq = cpu_rq(i);
+		cfs_rq = tg->cfs_rq[i];
+		list_add_rcu(&cfs_rq->leaf_cfs_rq_list, &rq->leaf_cfs_rq_list);
+	}
 done:
 done:
-	spin_unlock(&tg->lock);
+	unlock_task_group_list();
 	return 0;
 	return 0;
 }
 }
 
 
@@ -7179,6 +7780,31 @@ unsigned long sched_group_shares(struct task_group *tg)
 	return tg->shares;
 	return tg->shares;
 }
 }
 
 
+/*
+ * Ensure the total rt_ratio <= sysctl_sched_rt_ratio
+ *
+ * Sets @tg's rt_ratio to @rt_ratio provided the sum over all task groups,
+ * with @tg's current value replaced by the new one, does not exceed
+ * sysctl_sched_rt_ratio.
+ *
+ * Returns 0 on success, -EINVAL if the new total would exceed the limit (in
+ * which case tg->rt_ratio is left unchanged).
+ *
+ * NOTE(review): the sum is taken under rcu_read_lock() only and the store
+ * happens after it is dropped; nothing visible here serialises two
+ * concurrent callers - confirm that callers provide that.
+ */
+int sched_group_set_rt_ratio(struct task_group *tg, unsigned long rt_ratio)
+{
+	struct task_group *tgi;
+	unsigned long total = 0;
+
+	rcu_read_lock();
+	list_for_each_entry_rcu(tgi, &task_groups, list)
+		total += tgi->rt_ratio;
+	rcu_read_unlock();
+
+	/* total still contains tg's old ratio, so swap it for the new one */
+	if (total + rt_ratio - tg->rt_ratio > sysctl_sched_rt_ratio)
+		return -EINVAL;
+
+	tg->rt_ratio = rt_ratio;
+	return 0;
+}
+
+/* Return the rt_ratio currently stored in task group @tg. */
+unsigned long sched_group_rt_ratio(struct task_group *tg)
+{
+	return tg->rt_ratio;
+}
+
 #endif	/* CONFIG_FAIR_GROUP_SCHED */
 #endif	/* CONFIG_FAIR_GROUP_SCHED */
 
 
 #ifdef CONFIG_FAIR_CGROUP_SCHED
 #ifdef CONFIG_FAIR_CGROUP_SCHED
@@ -7254,12 +7880,30 @@ static u64 cpu_shares_read_uint(struct cgroup *cgrp, struct cftype *cft)
 	return (u64) tg->shares;
 	return (u64) tg->shares;
 }
 }
 
 
+/*
+ * cgroup write handler for the "rt_ratio" file: forwards @rt_ratio_val to
+ * sched_group_set_rt_ratio() for the task group behind @cgrp and returns its
+ * result (0 or -EINVAL).  @cftype is not used.
+ *
+ * NOTE(review): the u64 value is passed to an unsigned long parameter, so
+ * it is narrowed wherever unsigned long is 32 bits wide.
+ */
+static int cpu_rt_ratio_write_uint(struct cgroup *cgrp, struct cftype *cftype,
+		u64 rt_ratio_val)
+{
+	return sched_group_set_rt_ratio(cgroup_tg(cgrp), rt_ratio_val);
+}
+
+/*
+ * cgroup read handler for the "rt_ratio" file: returns the rt_ratio of the
+ * task group behind @cgrp, widened to u64.  @cft is not used.
+ */
+static u64 cpu_rt_ratio_read_uint(struct cgroup *cgrp, struct cftype *cft)
+{
+	struct task_group *tg = cgroup_tg(cgrp);
+
+	return (u64) tg->rt_ratio;
+}
+
 static struct cftype cpu_files[] = {
 static struct cftype cpu_files[] = {
 	{
 	{
 		.name = "shares",
 		.name = "shares",
 		.read_uint = cpu_shares_read_uint,
 		.read_uint = cpu_shares_read_uint,
 		.write_uint = cpu_shares_write_uint,
 		.write_uint = cpu_shares_write_uint,
 	},
 	},
+	{
+		.name = "rt_ratio",
+		.read_uint = cpu_rt_ratio_read_uint,
+		.write_uint = cpu_rt_ratio_write_uint,
+	},
 };
 };
 
 
 static int cpu_cgroup_populate(struct cgroup_subsys *ss, struct cgroup *cont)
 static int cpu_cgroup_populate(struct cgroup_subsys *ss, struct cgroup *cont)

+ 5 - 0
kernel/sched_debug.c

@@ -179,6 +179,7 @@ static void print_cpu(struct seq_file *m, int cpu)
 	PN(prev_clock_raw);
 	PN(prev_clock_raw);
 	P(clock_warps);
 	P(clock_warps);
 	P(clock_overflows);
 	P(clock_overflows);
+	P(clock_underflows);
 	P(clock_deep_idle_events);
 	P(clock_deep_idle_events);
 	PN(clock_max_delta);
 	PN(clock_max_delta);
 	P(cpu_load[0]);
 	P(cpu_load[0]);
@@ -299,6 +300,8 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m)
 	PN(se.exec_max);
 	PN(se.exec_max);
 	PN(se.slice_max);
 	PN(se.slice_max);
 	PN(se.wait_max);
 	PN(se.wait_max);
+	PN(se.wait_sum);
+	P(se.wait_count);
 	P(sched_info.bkl_count);
 	P(sched_info.bkl_count);
 	P(se.nr_migrations);
 	P(se.nr_migrations);
 	P(se.nr_migrations_cold);
 	P(se.nr_migrations_cold);
@@ -366,6 +369,8 @@ void proc_sched_set_task(struct task_struct *p)
 {
 {
 #ifdef CONFIG_SCHEDSTATS
 #ifdef CONFIG_SCHEDSTATS
 	p->se.wait_max				= 0;
 	p->se.wait_max				= 0;
+	p->se.wait_sum				= 0;
+	p->se.wait_count			= 0;
 	p->se.sleep_max				= 0;
 	p->se.sleep_max				= 0;
 	p->se.sum_sleep_runtime			= 0;
 	p->se.sum_sleep_runtime			= 0;
 	p->se.block_max				= 0;
 	p->se.block_max				= 0;

+ 349 - 42
kernel/sched_fair.c

@@ -20,6 +20,8 @@
  *  Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com>
  *  Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com>
  */
  */
 
 
+#include <linux/latencytop.h>
+
 /*
 /*
  * Targeted preemption latency for CPU-bound tasks:
  * Targeted preemption latency for CPU-bound tasks:
  * (default: 20ms * (1 + ilog(ncpus)), units: nanoseconds)
  * (default: 20ms * (1 + ilog(ncpus)), units: nanoseconds)
@@ -248,8 +250,8 @@ static u64 __sched_period(unsigned long nr_running)
 	unsigned long nr_latency = sched_nr_latency;
 	unsigned long nr_latency = sched_nr_latency;
 
 
 	if (unlikely(nr_running > nr_latency)) {
 	if (unlikely(nr_running > nr_latency)) {
+		period = sysctl_sched_min_granularity;
 		period *= nr_running;
 		period *= nr_running;
-		do_div(period, nr_latency);
 	}
 	}
 
 
 	return period;
 	return period;
@@ -383,6 +385,9 @@ update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se)
 {
 {
 	schedstat_set(se->wait_max, max(se->wait_max,
 	schedstat_set(se->wait_max, max(se->wait_max,
 			rq_of(cfs_rq)->clock - se->wait_start));
 			rq_of(cfs_rq)->clock - se->wait_start));
+	schedstat_set(se->wait_count, se->wait_count + 1);
+	schedstat_set(se->wait_sum, se->wait_sum +
+			rq_of(cfs_rq)->clock - se->wait_start);
 	schedstat_set(se->wait_start, 0);
 	schedstat_set(se->wait_start, 0);
 }
 }
 
 
@@ -434,6 +439,7 @@ static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)
 #ifdef CONFIG_SCHEDSTATS
 #ifdef CONFIG_SCHEDSTATS
 	if (se->sleep_start) {
 	if (se->sleep_start) {
 		u64 delta = rq_of(cfs_rq)->clock - se->sleep_start;
 		u64 delta = rq_of(cfs_rq)->clock - se->sleep_start;
+		struct task_struct *tsk = task_of(se);
 
 
 		if ((s64)delta < 0)
 		if ((s64)delta < 0)
 			delta = 0;
 			delta = 0;
@@ -443,9 +449,12 @@ static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)
 
 
 		se->sleep_start = 0;
 		se->sleep_start = 0;
 		se->sum_sleep_runtime += delta;
 		se->sum_sleep_runtime += delta;
+
+		account_scheduler_latency(tsk, delta >> 10, 1);
 	}
 	}
 	if (se->block_start) {
 	if (se->block_start) {
 		u64 delta = rq_of(cfs_rq)->clock - se->block_start;
 		u64 delta = rq_of(cfs_rq)->clock - se->block_start;
+		struct task_struct *tsk = task_of(se);
 
 
 		if ((s64)delta < 0)
 		if ((s64)delta < 0)
 			delta = 0;
 			delta = 0;
@@ -462,11 +471,11 @@ static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)
 		 * time that the task spent sleeping:
 		 * time that the task spent sleeping:
 		 */
 		 */
 		if (unlikely(prof_on == SLEEP_PROFILING)) {
 		if (unlikely(prof_on == SLEEP_PROFILING)) {
-			struct task_struct *tsk = task_of(se);
 
 
 			profile_hits(SLEEP_PROFILING, (void *)get_wchan(tsk),
 			profile_hits(SLEEP_PROFILING, (void *)get_wchan(tsk),
 				     delta >> 20);
 				     delta >> 20);
 		}
 		}
+		account_scheduler_latency(tsk, delta >> 10, 0);
 	}
 	}
 #endif
 #endif
 }
 }
@@ -642,13 +651,29 @@ static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev)
 	cfs_rq->curr = NULL;
 	cfs_rq->curr = NULL;
 }
 }
 
 
-static void entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr)
+static void
+entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued)
 {
 {
 	/*
 	/*
 	 * Update run-time statistics of the 'current'.
 	 * Update run-time statistics of the 'current'.
 	 */
 	 */
 	update_curr(cfs_rq);
 	update_curr(cfs_rq);
 
 
+#ifdef CONFIG_SCHED_HRTICK
+	/*
+	 * queued ticks are scheduled to match the slice, so don't bother
+	 * validating it and just reschedule.
+	 */
+	if (queued)
+		return resched_task(rq_of(cfs_rq)->curr);
+	/*
+	 * don't let the period tick interfere with the hrtick preemption
+	 */
+	if (!sched_feat(DOUBLE_TICK) &&
+			hrtimer_active(&rq_of(cfs_rq)->hrtick_timer))
+		return;
+#endif
+
 	if (cfs_rq->nr_running > 1 || !sched_feat(WAKEUP_PREEMPT))
 	if (cfs_rq->nr_running > 1 || !sched_feat(WAKEUP_PREEMPT))
 		check_preempt_tick(cfs_rq, curr);
 		check_preempt_tick(cfs_rq, curr);
 }
 }
@@ -690,7 +715,7 @@ static inline struct cfs_rq *cpu_cfs_rq(struct cfs_rq *cfs_rq, int this_cpu)
 
 
 /* Iterate thr' all leaf cfs_rq's on a runqueue */
 /* Iterate thr' all leaf cfs_rq's on a runqueue */
 #define for_each_leaf_cfs_rq(rq, cfs_rq) \
 #define for_each_leaf_cfs_rq(rq, cfs_rq) \
-	list_for_each_entry(cfs_rq, &rq->leaf_cfs_rq_list, leaf_cfs_rq_list)
+	list_for_each_entry_rcu(cfs_rq, &rq->leaf_cfs_rq_list, leaf_cfs_rq_list)
 
 
 /* Do the two (enqueued) entities belong to the same group ? */
 /* Do the two (enqueued) entities belong to the same group ? */
 static inline int
 static inline int
@@ -707,6 +732,8 @@ static inline struct sched_entity *parent_entity(struct sched_entity *se)
 	return se->parent;
 	return se->parent;
 }
 }
 
 
+#define GROUP_IMBALANCE_PCT	20
+
 #else	/* CONFIG_FAIR_GROUP_SCHED */
 #else	/* CONFIG_FAIR_GROUP_SCHED */
 
 
 #define for_each_sched_entity(se) \
 #define for_each_sched_entity(se) \
@@ -752,6 +779,43 @@ static inline struct sched_entity *parent_entity(struct sched_entity *se)
 
 
 #endif	/* CONFIG_FAIR_GROUP_SCHED */
 #endif	/* CONFIG_FAIR_GROUP_SCHED */
 
 
+#ifdef CONFIG_SCHED_HRTICK
+/*
+ * Arm the hrtick timer of @rq so that it fires when @p has used up what is
+ * left of its slice.
+ *
+ * Nothing is done unless hrtick is enabled on @rq and @p's cfs_rq has more
+ * than one runnable entity.  If @p has already run longer than its slice, no
+ * timer is started; a reschedule is requested instead, but only when @p is
+ * the task currently running on @rq.
+ */
+static void hrtick_start_fair(struct rq *rq, struct task_struct *p)
+{
+	/* non-zero when p is the task currently running on rq */
+	int requeue = rq->curr == p;
+	struct sched_entity *se = &p->se;
+	struct cfs_rq *cfs_rq = cfs_rq_of(se);
+
+	WARN_ON(task_rq(p) != rq);
+
+	if (hrtick_enabled(rq) && cfs_rq->nr_running > 1) {
+		u64 slice = sched_slice(cfs_rq, se);
+		/* runtime used since prev_sum_exec_runtime was last recorded */
+		u64 ran = se->sum_exec_runtime - se->prev_sum_exec_runtime;
+		/* signed, so an overrun shows up as a negative remainder */
+		s64 delta = slice - ran;
+
+		if (delta < 0) {
+			if (rq->curr == p)
+				resched_task(p);
+			return;
+		}
+
+		/*
+		 * Don't schedule slices shorter than 10000ns, that just
+		 * doesn't make sense. Rely on vruntime for fairness.
+		 *
+		 * (The floor is only applied when p is not rq->curr.)
+		 */
+		if (!requeue)
+			delta = max(10000LL, delta);
+
+		hrtick_start(rq, delta, requeue);
+	}
+}
+#else
+/* !CONFIG_SCHED_HRTICK: no high-resolution tick, nothing to arm. */
+static inline void
+hrtick_start_fair(struct rq *rq, struct task_struct *p)
+{
+}
+#endif
+
 /*
 /*
  * The enqueue_task method is called before nr_running is
  * The enqueue_task method is called before nr_running is
  * increased. Here we update the fair scheduling stats and
  * increased. Here we update the fair scheduling stats and
@@ -760,15 +824,28 @@ static inline struct sched_entity *parent_entity(struct sched_entity *se)
 static void enqueue_task_fair(struct rq *rq, struct task_struct *p, int wakeup)
 static void enqueue_task_fair(struct rq *rq, struct task_struct *p, int wakeup)
 {
 {
 	struct cfs_rq *cfs_rq;
 	struct cfs_rq *cfs_rq;
-	struct sched_entity *se = &p->se;
+	struct sched_entity *se = &p->se,
+			    *topse = NULL;	/* Highest schedulable entity */
+	int incload = 1;
 
 
 	for_each_sched_entity(se) {
 	for_each_sched_entity(se) {
-		if (se->on_rq)
+		topse = se;
+		if (se->on_rq) {
+			incload = 0;
 			break;
 			break;
+		}
 		cfs_rq = cfs_rq_of(se);
 		cfs_rq = cfs_rq_of(se);
 		enqueue_entity(cfs_rq, se, wakeup);
 		enqueue_entity(cfs_rq, se, wakeup);
 		wakeup = 1;
 		wakeup = 1;
 	}
 	}
+	/* Increment cpu load if we just enqueued the first task of a group on
+	 * 'rq->cpu'. 'topse' represents the group to which task 'p' belongs
+	 * at the highest grouping level.
+	 */
+	if (incload)
+		inc_cpu_load(rq, topse->load.weight);
+
+	hrtick_start_fair(rq, rq->curr);
 }
 }
 
 
 /*
 /*
@@ -779,16 +856,30 @@ static void enqueue_task_fair(struct rq *rq, struct task_struct *p, int wakeup)
 static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int sleep)
 static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int sleep)
 {
 {
 	struct cfs_rq *cfs_rq;
 	struct cfs_rq *cfs_rq;
-	struct sched_entity *se = &p->se;
+	struct sched_entity *se = &p->se,
+			    *topse = NULL; 	/* Highest schedulable entity */
+	int decload = 1;
 
 
 	for_each_sched_entity(se) {
 	for_each_sched_entity(se) {
+		topse = se;
 		cfs_rq = cfs_rq_of(se);
 		cfs_rq = cfs_rq_of(se);
 		dequeue_entity(cfs_rq, se, sleep);
 		dequeue_entity(cfs_rq, se, sleep);
 		/* Don't dequeue parent if it has other entities besides us */
 		/* Don't dequeue parent if it has other entities besides us */
-		if (cfs_rq->load.weight)
+		if (cfs_rq->load.weight) {
+			if (parent_entity(se))
+				decload = 0;
 			break;
 			break;
+		}
 		sleep = 1;
 		sleep = 1;
 	}
 	}
+	/* Decrement cpu load if we just dequeued the last task of a group on
+	 * 'rq->cpu'. 'topse' represents the group to which task 'p' belongs
+	 * at the highest grouping level.
+	 */
+	if (decload)
+		dec_cpu_load(rq, topse->load.weight);
+
+	hrtick_start_fair(rq, rq->curr);
 }
 }
 
 
 /*
 /*
@@ -835,6 +926,154 @@ static void yield_task_fair(struct rq *rq)
 	se->vruntime = rightmost->vruntime + 1;
 	se->vruntime = rightmost->vruntime + 1;
 }
 }
 
 
+/*
+ * wake_idle() will wake a task on an idle cpu if task->cpu is
+ * not idle and an idle cpu is available.  The span of cpus to
+ * search starts with cpus closest then further out as needed,
+ * so we always favor a closer, idle cpu.
+ *
+ * @cpu: the cpu the task would otherwise be woken on.
+ * @p:   the task being woken; only cpus in p->cpus_allowed are considered.
+ *
+ * Returns the CPU we should wake onto.
+ *
+ * Without ARCH_HAS_SCHED_WAKE_IDLE this is a stub that returns @cpu.
+ */
+#if defined(ARCH_HAS_SCHED_WAKE_IDLE)
+static int wake_idle(int cpu, struct task_struct *p)
+{
+	cpumask_t tmp;
+	struct sched_domain *sd;
+	int i;
+
+	/*
+	 * If it is idle, then it is the best cpu to run this task.
+	 *
+	 * This cpu is also the best, if it has more than one task already.
+	 * Siblings must be also busy(in most cases) as they didn't already
+	 * pickup the extra load from this cpu and hence we need not check
+	 * sibling runqueue info. This will avoid the checks and cache miss
+	 * penalties associated with that.
+	 */
+	if (idle_cpu(cpu) || cpu_rq(cpu)->nr_running > 1)
+		return cpu;
+
+	/* walk outwards, but only through domains flagged SD_WAKE_IDLE */
+	for_each_domain(cpu, sd) {
+		if (sd->flags & SD_WAKE_IDLE) {
+			cpus_and(tmp, sd->span, p->cpus_allowed);
+			for_each_cpu_mask(i, tmp) {
+				if (idle_cpu(i)) {
+					/* count it only if the task really moves */
+					if (i != task_cpu(p)) {
+						schedstat_inc(p,
+						       se.nr_wakeups_idle);
+					}
+					return i;
+				}
+			}
+		} else {
+			break;
+		}
+	}
+	return cpu;
+}
+#else
+static inline int wake_idle(int cpu, struct task_struct *p)
+{
+	return cpu;
+}
+#endif
+
+#ifdef CONFIG_SMP
+/*
+ * Choose the cpu on which to wake up fair-class task @p.
+ *
+ * The two candidates are the cpu @p is currently assigned to (task_cpu(p))
+ * and the cpu executing this function (this_cpu).  this_cpu is chosen when
+ * it is allowed for @p, shares a sched domain with the task's cpu, and one
+ * of the affine-wakeup or passive-balancing tests below passes; otherwise
+ * the task's own cpu is kept.  The choice is finally passed through
+ * wake_idle(), whose result is returned.
+ *
+ * @sync: non-zero for a synchronous wakeup; the current task's weight is
+ *        then discounted from this cpu's load in the affine test.
+ */
+static int select_task_rq_fair(struct task_struct *p, int sync)
+{
+	int cpu, this_cpu;
+	struct rq *rq;
+	struct sched_domain *sd, *this_sd = NULL;
+	int new_cpu;
+
+	cpu      = task_cpu(p);
+	rq       = task_rq(p);
+	this_cpu = smp_processor_id();
+	new_cpu  = cpu;
+
+	/* waker and wakee already share a cpu: nothing to choose */
+	if (cpu == this_cpu)
+		goto out_set_cpu;
+
+	/* first domain of this_cpu whose span also contains the task's cpu */
+	for_each_domain(this_cpu, sd) {
+		if (cpu_isset(cpu, sd->span)) {
+			this_sd = sd;
+			break;
+		}
+	}
+
+	/* p may not run here at all: keep its own cpu */
+	if (unlikely(!cpu_isset(this_cpu, p->cpus_allowed)))
+		goto out_set_cpu;
+
+	/*
+	 * Check for affine wakeup and passive balancing possibilities.
+	 */
+	if (this_sd) {
+		int idx = this_sd->wake_idx;
+		unsigned int imbalance;
+		unsigned long load, this_load;
+
+		/* half of the domain's imbalance_pct margin, as a percentage */
+		imbalance = 100 + (this_sd->imbalance_pct - 100) / 2;
+
+		load = source_load(cpu, idx);
+		this_load = target_load(this_cpu, idx);
+
+		new_cpu = this_cpu; /* Wake to this CPU if we can */
+
+		if (this_sd->flags & SD_WAKE_AFFINE) {
+			unsigned long tl = this_load;
+			unsigned long tl_per_task;
+
+			/*
+			 * Attract cache-cold tasks on sync wakeups:
+			 */
+			if (sync && !task_hot(p, rq->clock, this_sd))
+				goto out_set_cpu;
+
+			schedstat_inc(p, se.nr_wakeups_affine_attempts);
+			tl_per_task = cpu_avg_load_per_task(this_cpu);
+
+			/*
+			 * If sync wakeup then subtract the (maximum possible)
+			 * effect of the currently running task from the load
+			 * of the current CPU:
+			 */
+			if (sync)
+				tl -= current->se.load.weight;
+
+			if ((tl <= load &&
+				tl + target_load(cpu, idx) <= tl_per_task) ||
+			       100*(tl + p->se.load.weight) <= imbalance*load) {
+				/*
+				 * This domain has SD_WAKE_AFFINE and
+				 * p is cache cold in this domain, and
+				 * there is no bad imbalance.
+				 */
+				schedstat_inc(this_sd, ttwu_move_affine);
+				schedstat_inc(p, se.nr_wakeups_affine);
+				goto out_set_cpu;
+			}
+		}
+
+		/*
+		 * Start passive balancing when half the imbalance_pct
+		 * limit is reached.
+		 */
+		if (this_sd->flags & SD_WAKE_BALANCE) {
+			if (imbalance*this_load <= 100*load) {
+				schedstat_inc(this_sd, ttwu_move_balance);
+				schedstat_inc(p, se.nr_wakeups_passive);
+				goto out_set_cpu;
+			}
+		}
+	}
+
+	new_cpu = cpu; /* Could not wake to this_cpu. Wake to cpu instead */
+out_set_cpu:
+	return wake_idle(new_cpu, p);
+}
+#endif /* CONFIG_SMP */
+
+
 /*
 /*
  * Preempt the current task with a newly woken task if needed:
  * Preempt the current task with a newly woken task if needed:
  */
  */
@@ -876,6 +1115,7 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p)
 
 
 static struct task_struct *pick_next_task_fair(struct rq *rq)
 static struct task_struct *pick_next_task_fair(struct rq *rq)
 {
 {
+	struct task_struct *p;
 	struct cfs_rq *cfs_rq = &rq->cfs;
 	struct cfs_rq *cfs_rq = &rq->cfs;
 	struct sched_entity *se;
 	struct sched_entity *se;
 
 
@@ -887,7 +1127,10 @@ static struct task_struct *pick_next_task_fair(struct rq *rq)
 		cfs_rq = group_cfs_rq(se);
 		cfs_rq = group_cfs_rq(se);
 	} while (cfs_rq);
 	} while (cfs_rq);
 
 
-	return task_of(se);
+	p = task_of(se);
+	hrtick_start_fair(rq, p);
+
+	return p;
 }
 }
 
 
 /*
 /*
@@ -944,25 +1187,6 @@ static struct task_struct *load_balance_next_fair(void *arg)
 	return __load_balance_iterator(cfs_rq, cfs_rq->rb_load_balance_curr);
 	return __load_balance_iterator(cfs_rq, cfs_rq->rb_load_balance_curr);
 }
 }
 
 
-#ifdef CONFIG_FAIR_GROUP_SCHED
-static int cfs_rq_best_prio(struct cfs_rq *cfs_rq)
-{
-	struct sched_entity *curr;
-	struct task_struct *p;
-
-	if (!cfs_rq->nr_running)
-		return MAX_PRIO;
-
-	curr = cfs_rq->curr;
-	if (!curr)
-		curr = __pick_next_entity(cfs_rq);
-
-	p = task_of(curr);
-
-	return p->prio;
-}
-#endif
-
 static unsigned long
 static unsigned long
 load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
 load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
 		  unsigned long max_load_move,
 		  unsigned long max_load_move,
@@ -972,28 +1196,45 @@ load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
 	struct cfs_rq *busy_cfs_rq;
 	struct cfs_rq *busy_cfs_rq;
 	long rem_load_move = max_load_move;
 	long rem_load_move = max_load_move;
 	struct rq_iterator cfs_rq_iterator;
 	struct rq_iterator cfs_rq_iterator;
+	unsigned long load_moved;
 
 
 	cfs_rq_iterator.start = load_balance_start_fair;
 	cfs_rq_iterator.start = load_balance_start_fair;
 	cfs_rq_iterator.next = load_balance_next_fair;
 	cfs_rq_iterator.next = load_balance_next_fair;
 
 
 	for_each_leaf_cfs_rq(busiest, busy_cfs_rq) {
 	for_each_leaf_cfs_rq(busiest, busy_cfs_rq) {
 #ifdef CONFIG_FAIR_GROUP_SCHED
 #ifdef CONFIG_FAIR_GROUP_SCHED
-		struct cfs_rq *this_cfs_rq;
-		long imbalance;
-		unsigned long maxload;
+		struct cfs_rq *this_cfs_rq = busy_cfs_rq->tg->cfs_rq[this_cpu];
+		unsigned long maxload, task_load, group_weight;
+		unsigned long thisload, per_task_load;
+		struct sched_entity *se = busy_cfs_rq->tg->se[busiest->cpu];
+
+		task_load = busy_cfs_rq->load.weight;
+		group_weight = se->load.weight;
 
 
-		this_cfs_rq = cpu_cfs_rq(busy_cfs_rq, this_cpu);
+		/*
+		 * 'group_weight' is contributed by tasks of total weight
+		 * 'task_load'. To move 'rem_load_move' worth of weight only,
+		 * we need to move a maximum task load of:
+		 *
+		 * 	maxload = (rem_load_move / group_weight) * task_load;
+		 */
+		maxload = (rem_load_move * task_load) / group_weight;
 
 
-		imbalance = busy_cfs_rq->load.weight - this_cfs_rq->load.weight;
-		/* Don't pull if this_cfs_rq has more load than busy_cfs_rq */
-		if (imbalance <= 0)
+		if (!maxload || !task_load)
 			continue;
 			continue;
 
 
-		/* Don't pull more than imbalance/2 */
-		imbalance /= 2;
-		maxload = min(rem_load_move, imbalance);
+		per_task_load = task_load / busy_cfs_rq->nr_running;
+		/*
+		 * balance_tasks will try to forcibly move at least one task if
+		 * possible (because of SCHED_LOAD_SCALE_FUZZ). Avoid that if
+		 * maxload is less than GROUP_IMBALANCE_PCT% of the per_task_load.
+		 */
+		 if (100 * maxload < GROUP_IMBALANCE_PCT * per_task_load)
+			continue;
 
 
-		*this_best_prio = cfs_rq_best_prio(this_cfs_rq);
+		/* Disable priority-based load balance */
+		*this_best_prio = 0;
+		thisload = this_cfs_rq->load.weight;
 #else
 #else
 # define maxload rem_load_move
 # define maxload rem_load_move
 #endif
 #endif
@@ -1002,11 +1243,33 @@ load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
 		 * load_balance_[start|next]_fair iterators
 		 * load_balance_[start|next]_fair iterators
 		 */
 		 */
 		cfs_rq_iterator.arg = busy_cfs_rq;
 		cfs_rq_iterator.arg = busy_cfs_rq;
-		rem_load_move -= balance_tasks(this_rq, this_cpu, busiest,
+		load_moved = balance_tasks(this_rq, this_cpu, busiest,
 					       maxload, sd, idle, all_pinned,
 					       maxload, sd, idle, all_pinned,
 					       this_best_prio,
 					       this_best_prio,
 					       &cfs_rq_iterator);
 					       &cfs_rq_iterator);
 
 
+#ifdef CONFIG_FAIR_GROUP_SCHED
+		/*
+		 * load_moved holds the task load that was moved. The
+		 * effective (group) weight moved would be:
+		 * 	load_moved_eff = load_moved/task_load * group_weight;
+		 */
+		load_moved = (group_weight * load_moved) / task_load;
+
+		/* Adjust shares on both cpus to reflect load_moved */
+		group_weight -= load_moved;
+		set_se_shares(se, group_weight);
+
+		se = busy_cfs_rq->tg->se[this_cpu];
+		if (!thisload)
+			group_weight = load_moved;
+		else
+			group_weight = se->load.weight + load_moved;
+		set_se_shares(se, group_weight);
+#endif
+
+		rem_load_move -= load_moved;
+
 		if (rem_load_move <= 0)
 		if (rem_load_move <= 0)
 			break;
 			break;
 	}
 	}
@@ -1042,14 +1305,14 @@ move_one_task_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
 /*
 /*
  * scheduler tick hitting a task of our scheduling class:
  * scheduler tick hitting a task of our scheduling class:
  */
  */
-static void task_tick_fair(struct rq *rq, struct task_struct *curr)
+static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued)
 {
 {
 	struct cfs_rq *cfs_rq;
 	struct cfs_rq *cfs_rq;
 	struct sched_entity *se = &curr->se;
 	struct sched_entity *se = &curr->se;
 
 
 	for_each_sched_entity(se) {
 	for_each_sched_entity(se) {
 		cfs_rq = cfs_rq_of(se);
 		cfs_rq = cfs_rq_of(se);
-		entity_tick(cfs_rq, se);
+		entity_tick(cfs_rq, se, queued);
 	}
 	}
 }
 }
 
 
@@ -1087,6 +1350,42 @@ static void task_new_fair(struct rq *rq, struct task_struct *p)
 	resched_task(rq->curr);
 	resched_task(rq->curr);
 }
 }
 
 
+/*
+ * Priority of the task has changed. Check to see if we preempt
+ * the current task.
+ *
+ * @rq:      runqueue whose current task may need to be rescheduled
+ * @p:       task whose priority changed
+ * @oldprio: value of p->prio before the change
+ * @running: non-zero if @p is currently running on @rq
+ */
+static void prio_changed_fair(struct rq *rq, struct task_struct *p,
+			      int oldprio, int running)
+{
+	/*
+	 * Reschedule if we are currently running on this runqueue and
+	 * our priority decreased, or if we are not currently running on
+	 * this runqueue and our priority is higher than the current's
+	 */
+	if (running) {
+		/* a larger ->prio value is the "priority decreased" case */
+		if (p->prio > oldprio)
+			resched_task(rq->curr);
+	} else
+		check_preempt_curr(rq, p);
+}
+
+/*
+ * We switched to the sched_fair class.
+ *
+ * @rq:      runqueue whose current task may need to be rescheduled
+ * @p:       task that now belongs to the fair class
+ * @running: non-zero if @p is currently running on @rq
+ */
+static void switched_to_fair(struct rq *rq, struct task_struct *p,
+			     int running)
+{
+	/*
+	 * We were most likely switched from sched_rt, so
+	 * kick off the schedule if running, otherwise just see
+	 * if we can still preempt the current task.
+	 */
+	if (running)
+		resched_task(rq->curr);
+	else
+		check_preempt_curr(rq, p);
+}
+
 /* Account for a task changing its policy or group.
 /* Account for a task changing its policy or group.
  *
  *
  * This routine is mostly called to set cfs_rq->curr field when a task
  * This routine is mostly called to set cfs_rq->curr field when a task
@@ -1108,6 +1407,9 @@ static const struct sched_class fair_sched_class = {
 	.enqueue_task		= enqueue_task_fair,
 	.enqueue_task		= enqueue_task_fair,
 	.dequeue_task		= dequeue_task_fair,
 	.dequeue_task		= dequeue_task_fair,
 	.yield_task		= yield_task_fair,
 	.yield_task		= yield_task_fair,
+#ifdef CONFIG_SMP
+	.select_task_rq		= select_task_rq_fair,
+#endif /* CONFIG_SMP */
 
 
 	.check_preempt_curr	= check_preempt_wakeup,
 	.check_preempt_curr	= check_preempt_wakeup,
 
 
@@ -1122,6 +1424,9 @@ static const struct sched_class fair_sched_class = {
 	.set_curr_task          = set_curr_task_fair,
 	.set_curr_task          = set_curr_task_fair,
 	.task_tick		= task_tick_fair,
 	.task_tick		= task_tick_fair,
 	.task_new		= task_new_fair,
 	.task_new		= task_new_fair,
+
+	.prio_changed		= prio_changed_fair,
+	.switched_to		= switched_to_fair,
 };
 };
 
 
 #ifdef CONFIG_SCHED_DEBUG
 #ifdef CONFIG_SCHED_DEBUG
@@ -1132,7 +1437,9 @@ static void print_cfs_stats(struct seq_file *m, int cpu)
 #ifdef CONFIG_FAIR_GROUP_SCHED
 #ifdef CONFIG_FAIR_GROUP_SCHED
 	print_cfs_rq(m, cpu, &cpu_rq(cpu)->cfs);
 	print_cfs_rq(m, cpu, &cpu_rq(cpu)->cfs);
 #endif
 #endif
+	rcu_read_lock();
 	for_each_leaf_cfs_rq(cpu_rq(cpu), cfs_rq)
 	for_each_leaf_cfs_rq(cpu_rq(cpu), cfs_rq)
 		print_cfs_rq(m, cpu, cfs_rq);
 		print_cfs_rq(m, cpu, cfs_rq);
+	rcu_read_unlock();
 }
 }
 #endif
 #endif

+ 41 - 1
kernel/sched_idletask.c

@@ -5,6 +5,12 @@
  *  handled in sched_fair.c)
  *  handled in sched_fair.c)
  */
  */
 
 
+#ifdef CONFIG_SMP
+/*
+ * CPU selection hook of the idle class: always answers with the CPU
+ * the task is already on (task_cpu(p)). @sync is not looked at.
+ */
+static int select_task_rq_idle(struct task_struct *p, int sync)
+{
+	return task_cpu(p); /* IDLE tasks are never migrated */
+}
+#endif /* CONFIG_SMP */
 /*
 /*
  * Idle tasks are unconditionally rescheduled:
  * Idle tasks are unconditionally rescheduled:
  */
  */
@@ -55,7 +61,7 @@ move_one_task_idle(struct rq *this_rq, int this_cpu, struct rq *busiest,
 }
 }
 #endif
 #endif
 
 
-static void task_tick_idle(struct rq *rq, struct task_struct *curr)
+static void task_tick_idle(struct rq *rq, struct task_struct *curr, int queued)
 {
 {
 }
 }
 
 
@@ -63,6 +69,33 @@ static void set_curr_task_idle(struct rq *rq)
 {
 {
 }
 }
 
 
+/*
+ * Installed as idle_sched_class.switched_to.
+ *
+ * If @running is non-zero the current task of @rq is asked to
+ * reschedule; otherwise check_preempt_curr() decides whether @p
+ * should preempt it.
+ */
+static void switched_to_idle(struct rq *rq, struct task_struct *p,
+			     int running)
+{
+	/* Can this actually happen?? */
+	if (running)
+		resched_task(rq->curr);
+	else
+		check_preempt_curr(rq, p);
+}
+
+/*
+ * Installed as idle_sched_class.prio_changed.
+ *
+ * @oldprio is the value p->prio had before the change; @running is
+ * non-zero if @p is currently running on @rq.
+ */
+static void prio_changed_idle(struct rq *rq, struct task_struct *p,
+			      int oldprio, int running)
+{
+	/* This can happen for hot plug CPUS */
+
+	/*
+	 * Reschedule if we are currently running on this runqueue and
+	 * our priority decreased, or if we are not currently running on
+	 * this runqueue and our priority is higher than the current's
+	 */
+	if (running) {
+		/* a larger ->prio value is the "priority decreased" case */
+		if (p->prio > oldprio)
+			resched_task(rq->curr);
+	} else
+		check_preempt_curr(rq, p);
+}
+
 /*
 /*
  * Simple, special scheduling class for the per-CPU idle tasks:
  * Simple, special scheduling class for the per-CPU idle tasks:
  */
  */
@@ -72,6 +105,9 @@ const struct sched_class idle_sched_class = {
 
 
 	/* dequeue is not valid, we print a debug message there: */
 	/* dequeue is not valid, we print a debug message there: */
 	.dequeue_task		= dequeue_task_idle,
 	.dequeue_task		= dequeue_task_idle,
+#ifdef CONFIG_SMP
+	.select_task_rq		= select_task_rq_idle,
+#endif /* CONFIG_SMP */
 
 
 	.check_preempt_curr	= check_preempt_curr_idle,
 	.check_preempt_curr	= check_preempt_curr_idle,
 
 
@@ -85,5 +121,9 @@ const struct sched_class idle_sched_class = {
 
 
 	.set_curr_task          = set_curr_task_idle,
 	.set_curr_task          = set_curr_task_idle,
 	.task_tick		= task_tick_idle,
 	.task_tick		= task_tick_idle,
+
+	.prio_changed		= prio_changed_idle,
+	.switched_to		= switched_to_idle,
+
 	/* no .task_new for idle tasks */
 	/* no .task_new for idle tasks */
 };
 };

+ 1023 - 89
kernel/sched_rt.c

@@ -3,6 +3,217 @@
  * policies)
  * policies)
  */
  */
 
 
+#ifdef CONFIG_SMP
+
+/*
+ * Read rq->rd->rto_count, the counter that rt_set_overload() increments
+ * and rt_clear_overload() decrements. A non-zero result means at least
+ * one runqueue sharing this rd is currently flagged as RT-overloaded.
+ */
+static inline int rt_overloaded(struct rq *rq)
+{
+	return atomic_read(&rq->rd->rto_count);
+}
+
+/*
+ * Flag @rq as RT-overloaded: set its CPU bit in rq->rd->rto_mask, then
+ * bump rq->rd->rto_count. The write barrier between the two orders the
+ * mask update before the count update.
+ */
+static inline void rt_set_overload(struct rq *rq)
+{
+	cpu_set(rq->cpu, rq->rd->rto_mask);
+	/*
+	 * Make sure the mask is visible before we set
+	 * the overload count. That is checked to determine
+	 * if we should look at the mask. It would be a shame
+	 * if we looked at the mask, but the mask was not
+	 * updated yet.
+	 */
+	wmb();
+	atomic_inc(&rq->rd->rto_count);
+}
+
+/*
+ * Undo rt_set_overload(): drop rq->rd->rto_count and clear @rq's CPU
+ * bit in rq->rd->rto_mask.
+ */
+static inline void rt_clear_overload(struct rq *rq)
+{
+	/* the order here really doesn't matter */
+	atomic_dec(&rq->rd->rto_count);
+	cpu_clear(rq->cpu, rq->rd->rto_mask);
+}
+
+/*
+ * Keep the overload state of @rq in sync with its RT counters.
+ *
+ * The runqueue counts as overloaded when it has at least one migratory
+ * RT task (rt_nr_migratory) and more than one RT task running
+ * (rt_nr_running > 1). rq->rt.overloaded remembers the last published
+ * state so rt_set_overload()/rt_clear_overload() are only called on an
+ * actual transition.
+ */
+static void update_rt_migration(struct rq *rq)
+{
+	if (rq->rt.rt_nr_migratory && (rq->rt.rt_nr_running > 1)) {
+		if (!rq->rt.overloaded) {
+			rt_set_overload(rq);
+			rq->rt.overloaded = 1;
+		}
+	} else if (rq->rt.overloaded) {
+		rt_clear_overload(rq);
+		rq->rt.overloaded = 0;
+	}
+}
+#endif /* CONFIG_SMP */
+
+/*
+ * Map an RT scheduling entity back to the task_struct that embeds it
+ * as its ->rt member. Only meaningful for entities that really are
+ * embedded in a task_struct.
+ */
+static inline struct task_struct *rt_task_of(struct sched_rt_entity *rt_se)
+{
+	return container_of(rt_se, struct task_struct, rt);
+}
+
+/*
+ * Non-zero when @rt_se is linked into a list through its run_list
+ * node, i.e. the node is not an empty (self-pointing) list head.
+ */
+static inline int on_rt_rq(struct sched_rt_entity *rt_se)
+{
+	return !list_empty(&rt_se->run_list);
+}
+
+#ifdef CONFIG_FAIR_GROUP_SCHED
+
+/*
+ * RT ratio that applies to @rt_rq: the rt_ratio of its task group, or
+ * SCHED_RT_FRAC when no task group is attached (a value that
+ * sched_rt_ratio_exceeded() treats as "never exceeded").
+ */
+static inline unsigned int sched_rt_ratio(struct rt_rq *rt_rq)
+{
+	if (!rt_rq->tg)
+		return SCHED_RT_FRAC;
+
+	return rt_rq->tg->rt_ratio;
+}
+
+/* Walk every rt_rq linked on rq->leaf_rt_rq_list. */
+#define for_each_leaf_rt_rq(rt_rq, rq) \
+	list_for_each_entry(rt_rq, &rq->leaf_rt_rq_list, leaf_rt_rq_list)
+
+/* The runqueue an rt_rq belongs to, taken from its ->rq back-pointer. */
+static inline struct rq *rq_of_rt_rq(struct rt_rq *rt_rq)
+{
+	return rt_rq->rq;
+}
+
+/* The rt_rq an entity is (or would be) queued on: its ->rt_rq pointer. */
+static inline struct rt_rq *rt_rq_of_se(struct sched_rt_entity *rt_se)
+{
+	return rt_se->rt_rq;
+}
+
+/*
+ * Walk from @rt_se up through its ->parent links. The caller's rt_se
+ * variable is advanced in place and is NULL once the loop finishes.
+ */
+#define for_each_sched_rt_entity(rt_se) \
+	for (; rt_se; rt_se = rt_se->parent)
+
+/*
+ * The rt_rq owned by @rt_se (its ->my_q). Callers in this file treat a
+ * NULL result as "this entity has no queue of its own".
+ */
+static inline struct rt_rq *group_rt_rq(struct sched_rt_entity *rt_se)
+{
+	return rt_se->my_q;
+}
+
+static void enqueue_rt_entity(struct sched_rt_entity *rt_se);
+static void dequeue_rt_entity(struct sched_rt_entity *rt_se);
+
+/*
+ * Put the entity that represents @rt_rq (rt_rq->rt_se) back on its
+ * queue, provided there is such an entity, it is not already queued
+ * and @rt_rq has RT entities to run. If the queue's highest priority
+ * then beats the runqueue's current task, ask that task to reschedule.
+ */
+static void sched_rt_ratio_enqueue(struct rt_rq *rt_rq)
+{
+	struct sched_rt_entity *rt_se = rt_rq->rt_se;
+
+	if (rt_se && !on_rt_rq(rt_se) && rt_rq->rt_nr_running) {
+		struct task_struct *curr = rq_of_rt_rq(rt_rq)->curr;
+
+		enqueue_rt_entity(rt_se);
+		/* smaller value compares as "should preempt curr" here */
+		if (rt_rq->highest_prio < curr->prio)
+			resched_task(curr);
+	}
+}
+
+/*
+ * Take the entity that represents @rt_rq (rt_rq->rt_se) off its queue
+ * if there is such an entity and it is currently queued.
+ */
+static void sched_rt_ratio_dequeue(struct rt_rq *rt_rq)
+{
+	struct sched_rt_entity *rt_se = rt_rq->rt_se;
+
+	if (rt_se && on_rt_rq(rt_se))
+		dequeue_rt_entity(rt_se);
+}
+
+#else
+
+/* Without group scheduling every rt_rq uses the global sysctl ratio. */
+static inline unsigned int sched_rt_ratio(struct rt_rq *rt_rq)
+{
+	return sysctl_sched_rt_ratio;
+}
+
+/* Single-iteration loop: the only rt_rq visited is rq->rt itself. */
+#define for_each_leaf_rt_rq(rt_rq, rq) \
+	for (rt_rq = &rq->rt; rt_rq; rt_rq = NULL)
+
+/* The rt_rq is embedded in struct rq as its ->rt member. */
+static inline struct rq *rq_of_rt_rq(struct rt_rq *rt_rq)
+{
+	return container_of(rt_rq, struct rq, rt);
+}
+
+/*
+ * The rt_rq for an entity is the ->rt member of the runqueue of the
+ * task that embeds the entity.
+ */
+static inline struct rt_rq *rt_rq_of_se(struct sched_rt_entity *rt_se)
+{
+	struct task_struct *p = rt_task_of(rt_se);
+	struct rq *rq = task_rq(p);
+
+	return &rq->rt;
+}
+
+/*
+ * Single-iteration loop over @rt_se only; the caller's rt_se variable
+ * is NULL once the loop finishes.
+ */
+#define for_each_sched_rt_entity(rt_se) \
+	for (; rt_se; rt_se = NULL)
+
+/* No entity owns a queue in this configuration: always NULL. */
+static inline struct rt_rq *group_rt_rq(struct sched_rt_entity *rt_se)
+{
+	return NULL;
+}
+
+/* Nothing to do in this configuration. */
+static inline void sched_rt_ratio_enqueue(struct rt_rq *rt_rq)
+{
+}
+
+/* Nothing to do in this configuration. */
+static inline void sched_rt_ratio_dequeue(struct rt_rq *rt_rq)
+{
+}
+
+#endif
+
+/*
+ * Priority used to queue @rt_se.
+ *
+ * With CONFIG_FAIR_GROUP_SCHED, an entity that owns an rt_rq
+ * (group_rt_rq() != NULL) reports that queue's highest_prio. Every
+ * other entity reports the ->prio of the task that embeds it.
+ */
+static inline int rt_se_prio(struct sched_rt_entity *rt_se)
+{
+#ifdef CONFIG_FAIR_GROUP_SCHED
+	struct rt_rq *rt_rq = group_rt_rq(rt_se);
+
+	if (rt_rq)
+		return rt_rq->highest_prio;
+#endif
+
+	return rt_task_of(rt_se)->prio;
+}
+
+/*
+ * Check whether @rt_rq has used up its share of the current RT period.
+ *
+ * Returns 0 when the ratio is SCHED_RT_FRAC (no check is made) or when
+ * the accumulated rt_time is still within the allowance. Returns 1 when
+ * the queue is already marked throttled, or when rt_time exceeds the
+ * allowance; in the latter case both the rt_rq and its runqueue are
+ * marked throttled and sched_rt_ratio_dequeue() is called for the rt_rq.
+ */
+static int sched_rt_ratio_exceeded(struct rt_rq *rt_rq)
+{
+	unsigned int rt_ratio = sched_rt_ratio(rt_rq);
+	u64 period, ratio;
+
+	if (rt_ratio == SCHED_RT_FRAC)
+		return 0;
+
+	if (rt_rq->rt_throttled)
+		return 1;
+
+	/* allowance = period * rt_ratio / 2^SCHED_RT_FRAC_SHIFT */
+	period = (u64)sysctl_sched_rt_period * NSEC_PER_MSEC;
+	ratio = (period * rt_ratio) >> SCHED_RT_FRAC_SHIFT;
+
+	if (rt_rq->rt_time > ratio) {
+		struct rq *rq = rq_of_rt_rq(rt_rq);
+
+		rq->rt_throttled = 1;
+		rt_rq->rt_throttled = 1;
+
+		sched_rt_ratio_dequeue(rt_rq);
+		return 1;
+	}
+
+	return 0;
+}
+
+/*
+ * Advance the RT accounting period of @rq.
+ *
+ * For every whole period by which rq->clock has passed
+ * rq->rt_period_expire: push the expiry forward by one period, reduce
+ * each leaf rt_rq's rt_time by its per-period allowance (never below
+ * zero), lift the throttle from any throttled rt_rq and re-enqueue it
+ * through sched_rt_ratio_enqueue(), then clear the runqueue-wide
+ * rt_throttled flag.
+ */
+static void update_sched_rt_period(struct rq *rq)
+{
+	struct rt_rq *rt_rq;
+	u64 period;
+
+	while (rq->clock > rq->rt_period_expire) {
+		period = (u64)sysctl_sched_rt_period * NSEC_PER_MSEC;
+		rq->rt_period_expire += period;
+
+		for_each_leaf_rt_rq(rt_rq, rq) {
+			unsigned long rt_ratio = sched_rt_ratio(rt_rq);
+			u64 ratio = (period * rt_ratio) >> SCHED_RT_FRAC_SHIFT;
+
+			/* min() keeps the unsigned rt_time from wrapping */
+			rt_rq->rt_time -= min(rt_rq->rt_time, ratio);
+			if (rt_rq->rt_throttled) {
+				rt_rq->rt_throttled = 0;
+				sched_rt_ratio_enqueue(rt_rq);
+			}
+		}
+
+		rq->rt_throttled = 0;
+	}
+}
+
 /*
 /*
  * Update the current task's runtime statistics. Skip current tasks that
  * Update the current task's runtime statistics. Skip current tasks that
  * are not in our scheduling class.
  * are not in our scheduling class.
@@ -10,6 +221,8 @@
 static void update_curr_rt(struct rq *rq)
 static void update_curr_rt(struct rq *rq)
 {
 {
 	struct task_struct *curr = rq->curr;
 	struct task_struct *curr = rq->curr;
+	struct sched_rt_entity *rt_se = &curr->rt;
+	struct rt_rq *rt_rq = rt_rq_of_se(rt_se);
 	u64 delta_exec;
 	u64 delta_exec;
 
 
 	if (!task_has_rt_policy(curr))
 	if (!task_has_rt_policy(curr))
@@ -24,47 +237,228 @@ static void update_curr_rt(struct rq *rq)
 	curr->se.sum_exec_runtime += delta_exec;
 	curr->se.sum_exec_runtime += delta_exec;
 	curr->se.exec_start = rq->clock;
 	curr->se.exec_start = rq->clock;
 	cpuacct_charge(curr, delta_exec);
 	cpuacct_charge(curr, delta_exec);
+
+	rt_rq->rt_time += delta_exec;
+	/*
+	 * might make it a tad more accurate:
+	 *
+	 * update_sched_rt_period(rq);
+	 */
+	if (sched_rt_ratio_exceeded(rt_rq))
+		resched_task(curr);
 }
 }
 
 
-static void enqueue_task_rt(struct rq *rq, struct task_struct *p, int wakeup)
+/*
+ * Account for @rt_se having been added to @rt_rq.
+ *
+ * Bumps rt_nr_running. With SMP or group scheduling it also lowers
+ * rt_rq->highest_prio when the new entity's priority value is smaller.
+ * With SMP it additionally counts the entity as migratory on the owning
+ * runqueue when it may run on more than one CPU, and then refreshes the
+ * runqueue's overload state.
+ */
+static inline
+void inc_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
+{
+	WARN_ON(!rt_prio(rt_se_prio(rt_se)));
+	rt_rq->rt_nr_running++;
+#if defined CONFIG_SMP || defined CONFIG_FAIR_GROUP_SCHED
+	if (rt_se_prio(rt_se) < rt_rq->highest_prio)
+		rt_rq->highest_prio = rt_se_prio(rt_se);
+#endif
+#ifdef CONFIG_SMP
+	if (rt_se->nr_cpus_allowed > 1) {
+		struct rq *rq = rq_of_rt_rq(rt_rq);
+		rq->rt.rt_nr_migratory++;
+	}
+
+	update_rt_migration(rq_of_rt_rq(rt_rq));
+#endif
+}
+
+/*
+ * Account for @rt_se having been removed from @rt_rq.
+ *
+ * Drops rt_nr_running. With SMP or group scheduling it also refreshes
+ * rt_rq->highest_prio: recomputed from the priority bitmap when the
+ * removed entity was at the highest priority, reset to MAX_RT_PRIO
+ * when the queue became empty. With SMP it additionally undoes the
+ * migratory accounting done by inc_rt_tasks() and refreshes the
+ * runqueue's overload state.
+ */
+static inline
+void dec_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
+{
+	WARN_ON(!rt_prio(rt_se_prio(rt_se)));
+	WARN_ON(!rt_rq->rt_nr_running);
+	rt_rq->rt_nr_running--;
+#if defined CONFIG_SMP || defined CONFIG_FAIR_GROUP_SCHED
+	if (rt_rq->rt_nr_running) {
+		struct rt_prio_array *array;
+
+		WARN_ON(rt_se_prio(rt_se) < rt_rq->highest_prio);
+		if (rt_se_prio(rt_se) == rt_rq->highest_prio) {
+			/* recalculate */
+			array = &rt_rq->active;
+			rt_rq->highest_prio =
+				sched_find_first_bit(array->bitmap);
+		} /* otherwise leave rt_rq->highest_prio alone */
+	} else
+		rt_rq->highest_prio = MAX_RT_PRIO;
+#endif
+#ifdef CONFIG_SMP
+	if (rt_se->nr_cpus_allowed > 1) {
+		struct rq *rq = rq_of_rt_rq(rt_rq);
+		rq->rt.rt_nr_migratory--;
+	}
+
+	update_rt_migration(rq_of_rt_rq(rt_rq));
+#endif /* CONFIG_SMP */
+}
+
+/*
+ * Queue a single entity on its rt_rq.
+ *
+ * The entity is appended to the tail of the list for its priority
+ * (rt_se_prio()), the matching bitmap bit is set and the rt_rq
+ * counters are updated via inc_rt_tasks(). An entity whose own queue
+ * (group_rt_rq()) is marked throttled is left unqueued.
+ */
+static void enqueue_rt_entity(struct sched_rt_entity *rt_se)
+{
+	struct rt_rq *rt_rq = rt_rq_of_se(rt_se);
+	struct rt_prio_array *array = &rt_rq->active;
+	struct rt_rq *group_rq = group_rt_rq(rt_se);
+
+	if (group_rq && group_rq->rt_throttled)
+		return;
+
+	list_add_tail(&rt_se->run_list, array->queue + rt_se_prio(rt_se));
+	__set_bit(rt_se_prio(rt_se), array->bitmap);
+
+	inc_rt_tasks(rt_se, rt_rq);
+}
+
+static void dequeue_rt_entity(struct sched_rt_entity *rt_se)
 {
 {
-	struct rt_prio_array *array = &rq->rt.active;
+	struct rt_rq *rt_rq = rt_rq_of_se(rt_se);
+	struct rt_prio_array *array = &rt_rq->active;
+
+	list_del_init(&rt_se->run_list);
+	if (list_empty(array->queue + rt_se_prio(rt_se)))
+		__clear_bit(rt_se_prio(rt_se), array->bitmap);
 
 
-	list_add_tail(&p->run_list, array->queue + p->prio);
-	__set_bit(p->prio, array->bitmap);
+	dec_rt_tasks(rt_se, rt_rq);
+}
+
+/*
+ * Because the prio of an upper entry depends on the lower
+ * entries, we must remove entries top - down.
+ *
+ * XXX: O(1/2 h^2) because we can only walk up, not down the chain.
+ *      doesn't matter much for now, as h=2 for GROUP_SCHED.
+ */
+static void dequeue_rt_stack(struct task_struct *p)
+{
+	struct sched_rt_entity *rt_se, *top_se;
+
+	/*
+	 * dequeue all, top - down.
+	 */
+	do {
+		rt_se = &p->rt;
+		top_se = NULL;
+		for_each_sched_rt_entity(rt_se) {
+			if (on_rt_rq(rt_se))
+				top_se = rt_se;
+		}
+		if (top_se)
+			dequeue_rt_entity(top_se);
+	} while (top_se);
 }
 }
 
 
 /*
 /*
  * Adding/removing a task to/from a priority array:
  * Adding/removing a task to/from a priority array:
  */
  */
+/*
+ * Add RT task @p to runqueue @rq.
+ *
+ * On a wakeup (@wakeup non-zero) the entity's ->timeout is reset to 0.
+ * All entities on @p's chain are first taken off their queues
+ * (dequeue_rt_stack()) and then queued again starting from the task's
+ * own entity and walking upwards. Finally the task's load weight is
+ * added to the runqueue load.
+ */
+static void enqueue_task_rt(struct rq *rq, struct task_struct *p, int wakeup)
+{
+	struct sched_rt_entity *rt_se = &p->rt;
+
+	if (wakeup)
+		rt_se->timeout = 0;
+
+	dequeue_rt_stack(p);
+
+	/*
+	 * enqueue everybody, bottom - up.
+	 */
+	for_each_sched_rt_entity(rt_se)
+		enqueue_rt_entity(rt_se);
+
+	inc_cpu_load(rq, p->se.load.weight);
+}
+
 static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int sleep)
 static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int sleep)
 {
 {
-	struct rt_prio_array *array = &rq->rt.active;
+	struct sched_rt_entity *rt_se = &p->rt;
+	struct rt_rq *rt_rq;
 
 
 	update_curr_rt(rq);
 	update_curr_rt(rq);
 
 
-	list_del(&p->run_list);
-	if (list_empty(array->queue + p->prio))
-		__clear_bit(p->prio, array->bitmap);
+	dequeue_rt_stack(p);
+
+	/*
+	 * re-enqueue all non-empty rt_rq entities.
+	 */
+	for_each_sched_rt_entity(rt_se) {
+		rt_rq = group_rt_rq(rt_se);
+		if (rt_rq && rt_rq->rt_nr_running)
+			enqueue_rt_entity(rt_se);
+	}
+
+	dec_cpu_load(rq, p->se.load.weight);
 }
 }
 
 
 /*
 /*
  * Put task to the end of the run list without the overhead of dequeue
  * Put task to the end of the run list without the overhead of dequeue
  * followed by enqueue.
  * followed by enqueue.
  */
  */
+/*
+ * Move @rt_se to the tail of the @rt_rq list for its priority
+ * (rt_se_prio()). Bitmap and counters are not touched.
+ */
+static
+void requeue_rt_entity(struct rt_rq *rt_rq, struct sched_rt_entity *rt_se)
+{
+	struct rt_prio_array *array = &rt_rq->active;
+
+	list_move_tail(&rt_se->run_list, array->queue + rt_se_prio(rt_se));
+}
+
 static void requeue_task_rt(struct rq *rq, struct task_struct *p)
 static void requeue_task_rt(struct rq *rq, struct task_struct *p)
 {
 {
-	struct rt_prio_array *array = &rq->rt.active;
+	struct sched_rt_entity *rt_se = &p->rt;
+	struct rt_rq *rt_rq;
 
 
-	list_move_tail(&p->run_list, array->queue + p->prio);
+	for_each_sched_rt_entity(rt_se) {
+		rt_rq = rt_rq_of_se(rt_se);
+		requeue_rt_entity(rt_rq, rt_se);
+	}
 }
 }
 
 
-static void
-yield_task_rt(struct rq *rq)
+static void yield_task_rt(struct rq *rq)
 {
 {
 	requeue_task_rt(rq, rq->curr);
 	requeue_task_rt(rq, rq->curr);
 }
 }
 
 
+#ifdef CONFIG_SMP
+static int find_lowest_rq(struct task_struct *task);
+
+/*
+ * Choose the CPU on which RT task @p should be woken.
+ *
+ * Returns the CPU found by find_lowest_rq() when the task currently
+ * running on @p's runqueue is an RT task and @p is allowed on more
+ * than one CPU. In every other case, including find_lowest_rq()
+ * returning -1, the result is task_cpu(p). @sync is not looked at.
+ */
+static int select_task_rq_rt(struct task_struct *p, int sync)
+{
+	struct rq *rq = task_rq(p);
+
+	/*
+	 * If the current task is an RT task, then
+	 * try to see if we can wake this RT task up on another
+	 * runqueue. Otherwise simply start this RT task
+	 * on its current runqueue.
+	 *
+	 * We want to avoid overloading runqueues. Even if
+	 * the RT task is of higher priority than the current RT task.
+	 * RT tasks behave differently than other tasks. If
+	 * one gets preempted, we try to push it off to another queue.
+	 * So trying to keep a preempting RT task on the same
+	 * cache hot CPU will force the running RT task to
+	 * a cold CPU. So we waste all the cache for the lower
+	 * RT task in hopes of saving some of a RT task
+	 * that is just being woken and probably will have
+	 * cold cache anyway.
+	 */
+	if (unlikely(rt_task(rq->curr)) &&
+	    (p->rt.nr_cpus_allowed > 1)) {
+		int cpu = find_lowest_rq(p);
+
+		/* -1 means no suitable target: stay where we are */
+		return (cpu == -1) ? task_cpu(p) : cpu;
+	}
+
+	/*
+	 * Otherwise, just let it ride on the affined RQ and the
+	 * post-schedule router will push the preempted task away
+	 */
+	return task_cpu(p);
+}
+#endif /* CONFIG_SMP */
+
 /*
 /*
  * Preempt the current task with a newly woken task if needed:
  * Preempt the current task with a newly woken task if needed:
  */
  */
@@ -74,25 +468,48 @@ static void check_preempt_curr_rt(struct rq *rq, struct task_struct *p)
 		resched_task(rq->curr);
 		resched_task(rq->curr);
 }
 }
 
 
-static struct task_struct *pick_next_task_rt(struct rq *rq)
+static struct sched_rt_entity *pick_next_rt_entity(struct rq *rq,
+						   struct rt_rq *rt_rq)
 {
 {
-	struct rt_prio_array *array = &rq->rt.active;
-	struct task_struct *next;
+	struct rt_prio_array *array = &rt_rq->active;
+	struct sched_rt_entity *next = NULL;
 	struct list_head *queue;
 	struct list_head *queue;
 	int idx;
 	int idx;
 
 
 	idx = sched_find_first_bit(array->bitmap);
 	idx = sched_find_first_bit(array->bitmap);
-	if (idx >= MAX_RT_PRIO)
-		return NULL;
+	BUG_ON(idx >= MAX_RT_PRIO);
 
 
 	queue = array->queue + idx;
 	queue = array->queue + idx;
-	next = list_entry(queue->next, struct task_struct, run_list);
-
-	next->se.exec_start = rq->clock;
+	next = list_entry(queue->next, struct sched_rt_entity, run_list);
 
 
 	return next;
 	return next;
 }
 }
 
 
+/*
+ * Pick the next RT task to run on @rq.
+ *
+ * Returns NULL when the top-level rt_rq (rq->rt) has nothing running
+ * or when sched_rt_ratio_exceeded() reports it as throttled. Otherwise
+ * starts at rq->rt, picks the first entity of the highest-priority
+ * list and keeps descending into that entity's own queue
+ * (group_rt_rq()) until an entity without one is reached. That entity
+ * is converted to its task, whose se.exec_start is stamped with
+ * rq->clock before it is returned.
+ */
+static struct task_struct *pick_next_task_rt(struct rq *rq)
+{
+	struct sched_rt_entity *rt_se;
+	struct task_struct *p;
+	struct rt_rq *rt_rq;
+
+	rt_rq = &rq->rt;
+
+	if (unlikely(!rt_rq->rt_nr_running))
+		return NULL;
+
+	if (sched_rt_ratio_exceeded(rt_rq))
+		return NULL;
+
+	do {
+		rt_se = pick_next_rt_entity(rq, rt_rq);
+		BUG_ON(!rt_se);
+		rt_rq = group_rt_rq(rt_se);
+	} while (rt_rq);
+
+	p = rt_task_of(rt_se);
+	p->se.exec_start = rq->clock;
+	return p;
+}
+
 static void put_prev_task_rt(struct rq *rq, struct task_struct *p)
 static void put_prev_task_rt(struct rq *rq, struct task_struct *p)
 {
 {
 	update_curr_rt(rq);
 	update_curr_rt(rq);
@@ -100,76 +517,448 @@ static void put_prev_task_rt(struct rq *rq, struct task_struct *p)
 }
 }
 
 
 #ifdef CONFIG_SMP
 #ifdef CONFIG_SMP
-/*
- * Load-balancing iterator. Note: while the runqueue stays locked
- * during the whole iteration, the current task might be
- * dequeued so the iterator has to be dequeue-safe. Here we
- * achieve that by always pre-iterating before returning
- * the current task:
- */
-static struct task_struct *load_balance_start_rt(void *arg)
+
+/* Only try algorithms three times */
+#define RT_MAX_TRIES 3
+
+static int double_lock_balance(struct rq *this_rq, struct rq *busiest);
+static void deactivate_task(struct rq *rq, struct task_struct *p, int sleep);
+
+static int pick_rt_task(struct rq *rq, struct task_struct *p, int cpu)
 {
 {
-	struct rq *rq = arg;
-	struct rt_prio_array *array = &rq->rt.active;
-	struct list_head *head, *curr;
-	struct task_struct *p;
+	if (!task_running(rq, p) &&
+	    (cpu < 0 || cpu_isset(cpu, p->cpus_allowed)) &&
+	    (p->rt.nr_cpus_allowed > 1))
+		return 1;
+	return 0;
+}
+
+/* Return the second highest RT task, NULL otherwise */
+static struct task_struct *pick_next_highest_task_rt(struct rq *rq, int cpu)
+{
+	struct task_struct *next = NULL;
+	struct sched_rt_entity *rt_se;
+	struct rt_prio_array *array;
+	struct rt_rq *rt_rq;
 	int idx;
 	int idx;
 
 
-	idx = sched_find_first_bit(array->bitmap);
-	if (idx >= MAX_RT_PRIO)
-		return NULL;
+	for_each_leaf_rt_rq(rt_rq, rq) {
+		array = &rt_rq->active;
+		idx = sched_find_first_bit(array->bitmap);
+ next_idx:
+		if (idx >= MAX_RT_PRIO)
+			continue;
+		if (next && next->prio < idx)
+			continue;
+		list_for_each_entry(rt_se, array->queue + idx, run_list) {
+			struct task_struct *p = rt_task_of(rt_se);
+			if (pick_rt_task(rq, p, cpu)) {
+				next = p;
+				break;
+			}
+		}
+		if (!next) {
+			idx = find_next_bit(array->bitmap, MAX_RT_PRIO, idx+1);
+			goto next_idx;
+		}
+	}
 
 
-	head = array->queue + idx;
-	curr = head->prev;
+	return next;
+}
 
 
-	p = list_entry(curr, struct task_struct, run_list);
+static DEFINE_PER_CPU(cpumask_t, local_cpu_mask);
 
 
-	curr = curr->prev;
+static int find_lowest_cpus(struct task_struct *task, cpumask_t *lowest_mask)
+{
+	int       lowest_prio = -1;
+	int       lowest_cpu  = -1;
+	int       count       = 0;
+	int       cpu;
 
 
-	rq->rt.rt_load_balance_idx = idx;
-	rq->rt.rt_load_balance_head = head;
-	rq->rt.rt_load_balance_curr = curr;
+	cpus_and(*lowest_mask, task_rq(task)->rd->online, task->cpus_allowed);
 
 
-	return p;
+	/*
+	 * Scan each rq for the lowest prio.
+	 */
+	for_each_cpu_mask(cpu, *lowest_mask) {
+		struct rq *rq = cpu_rq(cpu);
+
+		/* We look for lowest RT prio or non-rt CPU */
+		if (rq->rt.highest_prio >= MAX_RT_PRIO) {
+			/*
+			 * if we already found a low RT queue
+			 * and now we found this non-rt queue
+			 * clear the mask and set our bit.
+			 * Otherwise just return the queue as is
+			 * and the count==1 will cause the algorithm
+			 * to use the first bit found.
+			 */
+			if (lowest_cpu != -1) {
+				cpus_clear(*lowest_mask);
+				cpu_set(rq->cpu, *lowest_mask);
+			}
+			return 1;
+		}
+
+		/* no locking for now */
+		if ((rq->rt.highest_prio > task->prio)
+		    && (rq->rt.highest_prio >= lowest_prio)) {
+			if (rq->rt.highest_prio > lowest_prio) {
+				/* new low - clear old data */
+				lowest_prio = rq->rt.highest_prio;
+				lowest_cpu = cpu;
+				count = 0;
+			}
+			count++;
+		} else
+			cpu_clear(cpu, *lowest_mask);
+	}
+
+	/*
+	 * Clear out all the set bits that represent
+	 * runqueues that were of higher prio than
+	 * the lowest_prio.
+	 */
+	if (lowest_cpu > 0) {
+		/*
+		 * Perhaps we could add another cpumask op to
+		 * zero out bits. Like cpu_zero_bits(cpumask, nrbits);
+		 * Then that could be optimized to use memset and such.
+		 */
+		for_each_cpu_mask(cpu, *lowest_mask) {
+			if (cpu >= lowest_cpu)
+				break;
+			cpu_clear(cpu, *lowest_mask);
+		}
+	}
+
+	return count;
 }
 }
 
 
-static struct task_struct *load_balance_next_rt(void *arg)
+static inline int pick_optimal_cpu(int this_cpu, cpumask_t *mask)
 {
 {
-	struct rq *rq = arg;
-	struct rt_prio_array *array = &rq->rt.active;
-	struct list_head *head, *curr;
-	struct task_struct *p;
-	int idx;
+	int first;
+
+	/* "this_cpu" is cheaper to preempt than a remote processor */
+	if ((this_cpu != -1) && cpu_isset(this_cpu, *mask))
+		return this_cpu;
+
+	first = first_cpu(*mask);
+	if (first != NR_CPUS)
+		return first;
+
+	return -1;
+}
+
+static int find_lowest_rq(struct task_struct *task)
+{
+	struct sched_domain *sd;
+	cpumask_t *lowest_mask = &__get_cpu_var(local_cpu_mask);
+	int this_cpu = smp_processor_id();
+	int cpu      = task_cpu(task);
+	int count    = find_lowest_cpus(task, lowest_mask);
 
 
-	idx = rq->rt.rt_load_balance_idx;
-	head = rq->rt.rt_load_balance_head;
-	curr = rq->rt.rt_load_balance_curr;
+	if (!count)
+		return -1; /* No targets found */
 
 
 	/*
 	/*
-	 * If we arrived back to the head again then
-	 * iterate to the next queue (if any):
+	 * There is no sense in performing an optimal search if only one
+	 * target is found.
 	 */
 	 */
-	if (unlikely(head == curr)) {
-		int next_idx = find_next_bit(array->bitmap, MAX_RT_PRIO, idx+1);
+	if (count == 1)
+		return first_cpu(*lowest_mask);
 
 
-		if (next_idx >= MAX_RT_PRIO)
-			return NULL;
+	/*
+	 * At this point we have built a mask of cpus representing the
+	 * lowest priority tasks in the system.  Now we want to elect
+	 * the best one based on our affinity and topology.
+	 *
+	 * We prioritize the last cpu that the task executed on since
+	 * it is most likely cache-hot in that location.
+	 */
+	if (cpu_isset(cpu, *lowest_mask))
+		return cpu;
+
+	/*
+	 * Otherwise, we consult the sched_domains span maps to figure
+	 * out which cpu is logically closest to our hot cache data.
+	 */
+	if (this_cpu == cpu)
+		this_cpu = -1; /* Skip this_cpu opt if the same */
+
+	for_each_domain(cpu, sd) {
+		if (sd->flags & SD_WAKE_AFFINE) {
+			cpumask_t domain_mask;
+			int       best_cpu;
 
 
-		idx = next_idx;
-		head = array->queue + idx;
-		curr = head->prev;
+			cpus_and(domain_mask, sd->span, *lowest_mask);
 
 
-		rq->rt.rt_load_balance_idx = idx;
-		rq->rt.rt_load_balance_head = head;
+			best_cpu = pick_optimal_cpu(this_cpu,
+						    &domain_mask);
+			if (best_cpu != -1)
+				return best_cpu;
+		}
 	}
 	}
 
 
-	p = list_entry(curr, struct task_struct, run_list);
+	/*
+	 * And finally, if there were no matches within the domains
+	 * just give the caller *something* to work with from the compatible
+	 * locations.
+	 */
+	return pick_optimal_cpu(this_cpu, lowest_mask);
+}
 
 
-	curr = curr->prev;
+/*
+ * Find a runqueue that @task could be moved to from @rq and lock it.
+ *
+ * Tries at most RT_MAX_TRIES times. Returns the selected runqueue
+ * with its lock held, or NULL when find_lowest_rq() offers no other
+ * cpu, when @task changed state while rq's lock was dropped, or when
+ * no candidate is still suitable after the last attempt.
+ */
+static struct rq *find_lock_lowest_rq(struct task_struct *task, struct rq *rq)
+{
+	struct rq *lowest_rq = NULL;
+	int tries;
+	int cpu;
 
 
-	rq->rt.rt_load_balance_curr = curr;
+	for (tries = 0; tries < RT_MAX_TRIES; tries++) {
+		cpu = find_lowest_rq(task);
 
 
-	return p;
+		if ((cpu == -1) || (cpu == rq->cpu))
+			break;
+
+		lowest_rq = cpu_rq(cpu);
+
+		/*
+		 * Take lowest_rq's lock as well. A non-zero return means
+		 * rq's lock had to be dropped on the way, so the state of
+		 * the task must be revalidated.
+		 */
+		if (double_lock_balance(rq, lowest_rq)) {
+			/*
+			 * We had to unlock the run queue. In
+			 * the mean time, task could have
+			 * migrated already or had its affinity changed.
+			 * Also make sure that it wasn't scheduled on its rq.
+			 */
+			if (unlikely(task_rq(task) != rq ||
+				     !cpu_isset(lowest_rq->cpu,
+						task->cpus_allowed) ||
+				     task_running(rq, task) ||
+				     !task->se.on_rq)) {
+
+				spin_unlock(&lowest_rq->lock);
+				lowest_rq = NULL;
+				break;
+			}
+		}
+
+		/*
+		 * If this rq is still suitable use it: the prio of the
+		 * target runqueue may have changed while we were locking
+		 * it, so compare it against the task once more.
+		 */
+		if (lowest_rq->rt.highest_prio > task->prio)
+			break;
+
+		/* try again, without keeping the unsuitable rq locked */
+		spin_unlock(&lowest_rq->lock);
+		lowest_rq = NULL;
+	}
+
+	return lowest_rq;
+}
+
+/*
+ * If the current CPU has more than one RT task, see if the non
+ * running task can migrate over to a CPU that is running a task
+ * of lesser priority.
+ *
+ * Returns 1 if a task was moved to another runqueue, 0 otherwise.
+ * The rq lock may be dropped and retaken by find_lock_lowest_rq(),
+ * which is why a reference on the candidate task is held across
+ * that call.
+ */
+static int push_rt_task(struct rq *rq)
+{
+	struct task_struct *next_task;
+	struct rq *lowest_rq;
+	int ret = 0;
+	int paranoid = RT_MAX_TRIES;	/* bounds the "goto retry" loop */
+
+	if (!rq->rt.overloaded)
+		return 0;
+
+	next_task = pick_next_highest_task_rt(rq, -1);
+	if (!next_task)
+		return 0;
+
+ retry:
+	if (unlikely(next_task == rq->curr)) {
+		WARN_ON(1);
+		return 0;
+	}
+
+	/*
+	 * It's possible that the next_task slipped in of
+	 * higher priority than current. If that's the case
+	 * just reschedule current.
+	 */
+	if (unlikely(next_task->prio < rq->curr->prio)) {
+		resched_task(rq->curr);
+		return 0;
+	}
+
+	/* We might release rq lock, so pin next_task with a reference */
+	get_task_struct(next_task);
+
+	/* find_lock_lowest_rq locks the rq if found */
+	lowest_rq = find_lock_lowest_rq(next_task, rq);
+	if (!lowest_rq) {
+		struct task_struct *task;
+		/*
+		 * find_lock_lowest_rq may release rq->lock
+		 * so it is possible that next_task has changed.
+		 * If it has, then try again (the reference on the old
+		 * candidate is dropped before retrying with the new one).
+		 */
+		task = pick_next_highest_task_rt(rq, -1);
+		if (unlikely(task != next_task) && task && paranoid--) {
+			put_task_struct(next_task);
+			next_task = task;
+			goto retry;
+		}
+		goto out;
+	}
+
+	/* Both runqueue locks are held here: move the task over */
+	deactivate_task(rq, next_task, 0);
+	set_task_cpu(next_task, lowest_rq->cpu);
+	activate_task(lowest_rq, next_task, 0);
+
+	resched_task(lowest_rq->curr);
+
+	spin_unlock(&lowest_rq->lock);
+
+	ret = 1;
+out:
+	put_task_struct(next_task);
+
+	return ret;
+}
+
+/*
+ * Keep calling push_rt_task() on @rq until it reports that nothing
+ * was moved.
+ *
+ * TODO: Currently we just use the second highest prio task on
+ *       the queue, and stop when it can't migrate (or there's
+ *       no more RT tasks).  There may be a case where a lower
+ *       priority RT task has a different affinity than the
+ *       higher RT task. In this case the lower RT task could
+ *       possibly be able to migrate where as the higher priority
+ *       RT task could not.  We currently ignore this issue.
+ *       Enhancements are welcome!
+ */
+static void push_rt_tasks(struct rq *rq)
+{
+	/* push_rt_task will return true if it moved an RT task */
+	while (push_rt_task(rq))
+		;
+}
+
+static int pull_rt_task(struct rq *this_rq)
+{	/*
+	 * Walk the cpus set in this_rq->rd->rto_mask and move over to
+	 * this_rq any RT task that would preempt what this_rq is about
+	 * to run.
+	 *
+	 * Returns 1 if a task was pulled, or if the next task of this_rq
+	 * changed while its lock was dropped; 0 otherwise.
+	 */
+	int this_cpu = this_rq->cpu, ret = 0, cpu;
+	struct task_struct *p, *next;
+	struct rq *src_rq;
+
+	/* NOTE(review): rt_overloaded() is defined outside this hunk - confirm its exact meaning */
+	if (likely(!rt_overloaded(this_rq)))
+		return 0;
+
+	next = pick_next_task_rt(this_rq);
+
+	for_each_cpu_mask(cpu, this_rq->rd->rto_mask) {
+		if (this_cpu == cpu)
+			continue;
+
+		src_rq = cpu_rq(cpu);
+		/*
+		 * We can potentially drop this_rq's lock in
+		 * double_lock_balance, and another CPU could
+		 * steal our next task - hence we must cause
+		 * the caller to recalculate the next task
+		 * in that case:
+		 */
+		if (double_lock_balance(this_rq, src_rq)) {
+			struct task_struct *old_next = next;
+
+			next = pick_next_task_rt(this_rq);
+			if (next != old_next)
+				ret = 1;
+		}
+
+		/*
+		 * Are there still pullable RT tasks?
+		 */
+		if (src_rq->rt.rt_nr_running <= 1)
+			goto skip;
+
+		p = pick_next_highest_task_rt(src_rq, this_cpu);
+
+		/*
+		 * Do we have an RT task that preempts
+		 * the to-be-scheduled task?
+		 */
+		if (p && (!next || (p->prio < next->prio))) {
+			WARN_ON(p == src_rq->curr);
+			WARN_ON(!p->se.on_rq);
+
+			/*
+			 * There's a chance that p is higher in priority
+			 * than what's currently running on its cpu.
+			 * This is just that p is waking up and hasn't
+			 * had a chance to schedule. We only pull
+			 * p if it is lower in priority than the
+			 * current task on the run queue or
+			 * this_rq next task is lower in prio than
+			 * the current task on that rq.
+			 */
+			if (p->prio < src_rq->curr->prio ||
+			    (next && next->prio < src_rq->curr->prio))
+				goto skip;
+
+			ret = 1;
+
+			deactivate_task(src_rq, p, 0);
+			set_task_cpu(p, this_cpu);
+			activate_task(this_rq, p, 0);
+			/*
+			 * We continue with the search, just in
+			 * case there's an even higher prio task
+			 * in another runqueue. (low likelihood
+			 * but possible)
+			 *
+			 * Update next so that we won't pick a task
+			 * on another cpu with a priority lower (or equal)
+			 * than the one we just picked.
+			 */
+			next = p;
+
+		}
+ skip:
+		/* src_rq was locked by double_lock_balance() above */
+		spin_unlock(&src_rq->lock);
+	}
+
+	return ret;
+}
+
+static void pre_schedule_rt(struct rq *rq, struct task_struct *prev)
+{
+	/*
+	 * Try to pull RT tasks here if we lower this rq's prio: @prev is
+	 * an RT task and the best prio value queued on @rq is numerically
+	 * above prev's, i.e. what runs next is of lower priority.
+	 */
+	if (unlikely(rt_task(prev)) && rq->rt.highest_prio > prev->prio)
+		pull_rt_task(rq);
+}
+
+static void post_schedule_rt(struct rq *rq)
+{
+	/*
+	 * If we have more than one rt_task queued, then
+	 * see if we can push the other rt_tasks off to other CPUS.
+	 * Note we may release the rq lock, and since
+	 * the lock was owned by prev, we need to release it
+	 * first via finish_lock_switch and then reacquire it here.
+	 */
+	if (unlikely(rq->rt.overloaded)) {
+		spin_lock_irq(&rq->lock);
+		push_rt_tasks(rq);
+		spin_unlock_irq(&rq->lock);
+	}
+}
+
+
+static void task_wake_up_rt(struct rq *rq, struct task_struct *p)
+{	/*
+	 * Wake-up hook of the RT class: when @p is not the task running
+	 * on @rq, its prio value is not below rq->rt.highest_prio, and
+	 * the runqueue is marked overloaded, try to push RT tasks away.
+	 */
+	if (!task_running(rq, p) &&
+	    (p->prio >= rq->rt.highest_prio) &&
+	    rq->rt.overloaded)
+		push_rt_tasks(rq);
 }
 }
 
 
 static unsigned long
 static unsigned long
@@ -178,38 +967,170 @@ load_balance_rt(struct rq *this_rq, int this_cpu, struct rq *busiest,
 		struct sched_domain *sd, enum cpu_idle_type idle,
 		struct sched_domain *sd, enum cpu_idle_type idle,
 		int *all_pinned, int *this_best_prio)
 		int *all_pinned, int *this_best_prio)
 {
 {
-	struct rq_iterator rt_rq_iterator;
-
-	rt_rq_iterator.start = load_balance_start_rt;
-	rt_rq_iterator.next = load_balance_next_rt;
-	/* pass 'busiest' rq argument into
-	 * load_balance_[start|next]_rt iterators
-	 */
-	rt_rq_iterator.arg = busiest;
-
-	return balance_tasks(this_rq, this_cpu, busiest, max_load_move, sd,
-			     idle, all_pinned, this_best_prio, &rt_rq_iterator);
+	/* don't touch RT tasks */
+	return 0;
 }
 }
 
 
 static int
 static int
 move_one_task_rt(struct rq *this_rq, int this_cpu, struct rq *busiest,
 move_one_task_rt(struct rq *this_rq, int this_cpu, struct rq *busiest,
 		 struct sched_domain *sd, enum cpu_idle_type idle)
 		 struct sched_domain *sd, enum cpu_idle_type idle)
 {
 {
-	struct rq_iterator rt_rq_iterator;
+	/*
+	 * don't touch RT tasks: this hook no longer iterates over the
+	 * busiest runqueue and always reports that nothing was moved.
+	 */
+	return 0;
+}
+
+static void set_cpus_allowed_rt(struct task_struct *p, cpumask_t *new_mask)
+{
+	int weight = cpus_weight(*new_mask);
+
+	BUG_ON(!rt_task(p));
 
 
-	rt_rq_iterator.start = load_balance_start_rt;
-	rt_rq_iterator.next = load_balance_next_rt;
-	rt_rq_iterator.arg = busiest;
+	/*
+	 * Update the migration status of the RQ if we have an RT task
+	 * which is queued (se.on_rq) AND changing its weight value.
+	 *
+	 * rt_nr_migratory only changes when the task crosses the
+	 * "can run on more than one cpu" boundary in either direction.
+	 */
+	if (p->se.on_rq && (weight != p->rt.nr_cpus_allowed)) {
+		struct rq *rq = task_rq(p);
+
+		if ((p->rt.nr_cpus_allowed <= 1) && (weight > 1)) {
+			rq->rt.rt_nr_migratory++;
+		} else if ((p->rt.nr_cpus_allowed > 1) && (weight <= 1)) {
+			BUG_ON(!rq->rt.rt_nr_migratory);
+			rq->rt.rt_nr_migratory--;
+		}
+
+		update_rt_migration(rq);
+	}
 
 
-	return iter_move_one_task(this_rq, this_cpu, busiest, sd, idle,
-				  &rt_rq_iterator);
+	p->cpus_allowed    = *new_mask;
+	p->rt.nr_cpus_allowed = weight;	/* cached weight of cpus_allowed */
 }
 }
-#endif
 
 
-static void task_tick_rt(struct rq *rq, struct task_struct *p)
+/*
+ * Domain-join hook of the RT class: if @rq is currently marked
+ * overloaded, register that through rt_set_overload().
+ *
+ * Assumes rq->lock is held
+ */
+static void join_domain_rt(struct rq *rq)
+{
+	if (rq->rt.overloaded)
+		rt_set_overload(rq);
+}
+
+/*
+ * Domain-leave hook of the RT class: if @rq is currently marked
+ * overloaded, withdraw that through rt_clear_overload().
+ *
+ * Assumes rq->lock is held
+ */
+static void leave_domain_rt(struct rq *rq)
+{
+	if (rq->rt.overloaded)
+		rt_clear_overload(rq);
+}
+
+/*
+ * When switch from the rt queue, we bring ourselves to a position
+ * that we might want to pull RT tasks from other runqueues.
+ *
+ * @p and @running are not used here; only the number of RT tasks
+ * left on @rq matters.
+ */
+static void switched_from_rt(struct rq *rq, struct task_struct *p,
+			   int running)
+{
+	/*
+	 * If there are other RT tasks then we will reschedule
+	 * and the scheduling of the other RT tasks will handle
+	 * the balancing. But if we are the last RT task
+	 * we may need to handle the pulling of RT tasks
+	 * now.
+	 */
+	if (!rq->rt.rt_nr_running)
+		pull_rt_task(rq);
+}
+#endif /* CONFIG_SMP */
+
+/*
+ * When switching a task to RT, we may overload the runqueue
+ * with RT tasks. In this case we try to push them off to
+ * other runqueues.
+ *
+ * @running is non-zero when @p is the task currently running on @rq.
+ */
+static void switched_to_rt(struct rq *rq, struct task_struct *p,
+			   int running)
+{
+	int check_resched = 1;
+
+	/*
+	 * If we are already running, then there's nothing
+	 * that needs to be done. But if we are not running
+	 * we may need to preempt the current running task.
+	 * If that current running task is also an RT task
+	 * then see if we can move to another run queue.
+	 */
+	if (!running) {
+#ifdef CONFIG_SMP
+		if (rq->rt.overloaded && push_rt_task(rq) &&
+		    /* Don't resched if we changed runqueues */
+		    rq != task_rq(p))
+			check_resched = 0;
+#endif /* CONFIG_SMP */
+		/* A smaller prio value means p should preempt rq->curr */
+		if (check_resched && p->prio < rq->curr->prio)
+			resched_task(rq->curr);
+	}
+}
+
+/*
+ * Priority of the task has changed. This may cause
+ * us to initiate a push or pull.
+ *
+ * @oldprio is the prio value @p had before the change; @running is
+ * non-zero when @p is the task currently running on @rq. A larger
+ * prio value means a lower priority.
+ */
+static void prio_changed_rt(struct rq *rq, struct task_struct *p,
+			    int oldprio, int running)
+{
+	if (running) {
+#ifdef CONFIG_SMP
+		/*
+		 * If our priority decreases while running, we
+		 * may need to pull tasks to this runqueue.
+		 */
+		if (oldprio < p->prio)
+			pull_rt_task(rq);
+		/*
+		 * If there's a higher priority task waiting to run
+		 * then reschedule.
+		 */
+		if (p->prio > rq->rt.highest_prio)
+			resched_task(p);
+#else
+		/* For UP simply resched on drop of prio */
+		if (oldprio < p->prio)
+			resched_task(p);
+#endif /* CONFIG_SMP */
+	} else {
+		/*
+		 * This task is not running, but if it is
+		 * greater than the current running task
+		 * then reschedule.
+		 */
+		if (p->prio < rq->curr->prio)
+			resched_task(rq->curr);
+	}
+}
+
+static void watchdog(struct rq *rq, struct task_struct *p)
+{	/*
+	 * RLIMIT_RTTIME bookkeeping, called from task_tick_rt() for @p.
+	 * Each call bumps p->rt.timeout; once that count exceeds the
+	 * limit (converted to ticks below), p->it_sched_expires is set
+	 * to the task's current sum_exec_runtime.
+	 * NOTE(review): presumably this makes the cpu-timer code act on
+	 * the task at its next check - confirm against the consumer of
+	 * it_sched_expires.
+	 */
+	unsigned long soft, hard;
+
+	if (!p->signal)
+		return;
+
+	soft = p->signal->rlim[RLIMIT_RTTIME].rlim_cur;
+	hard = p->signal->rlim[RLIMIT_RTTIME].rlim_max;
+
+	if (soft != RLIM_INFINITY) {
+		unsigned long next;
+
+		p->rt.timeout++;
+		/* USEC_PER_SEC/HZ is the tick length in usecs: limit -> ticks */
+		next = DIV_ROUND_UP(min(soft, hard), USEC_PER_SEC/HZ);
+		if (p->rt.timeout > next)
+			p->it_sched_expires = p->se.sum_exec_runtime;
+	}
+}
+
+static void task_tick_rt(struct rq *rq, struct task_struct *p, int queued)
 {
 {
 	update_curr_rt(rq);
 	update_curr_rt(rq);
 
 
+	watchdog(rq, p);
+
 	/*
 	/*
 	 * RR tasks need a special form of timeslice management.
 	 * RR tasks need a special form of timeslice management.
 	 * FIFO tasks have no timeslices.
 	 * FIFO tasks have no timeslices.
@@ -217,16 +1138,16 @@ static void task_tick_rt(struct rq *rq, struct task_struct *p)
 	if (p->policy != SCHED_RR)
 	if (p->policy != SCHED_RR)
 		return;
 		return;
 
 
-	if (--p->time_slice)
+	if (--p->rt.time_slice)
 		return;
 		return;
 
 
-	p->time_slice = DEF_TIMESLICE;
+	p->rt.time_slice = DEF_TIMESLICE;
 
 
 	/*
 	/*
 	 * Requeue to the end of queue if we are not the only element
 	 * Requeue to the end of queue if we are not the only element
 	 * on the queue:
 	 * on the queue:
 	 */
 	 */
-	if (p->run_list.prev != p->run_list.next) {
+	if (p->rt.run_list.prev != p->rt.run_list.next) {
 		requeue_task_rt(rq, p);
 		requeue_task_rt(rq, p);
 		set_tsk_need_resched(p);
 		set_tsk_need_resched(p);
 	}
 	}
@@ -244,6 +1165,9 @@ const struct sched_class rt_sched_class = {
 	.enqueue_task		= enqueue_task_rt,
 	.enqueue_task		= enqueue_task_rt,
 	.dequeue_task		= dequeue_task_rt,
 	.dequeue_task		= dequeue_task_rt,
 	.yield_task		= yield_task_rt,
 	.yield_task		= yield_task_rt,
+#ifdef CONFIG_SMP
+	.select_task_rq		= select_task_rq_rt,
+#endif /* CONFIG_SMP */
 
 
 	.check_preempt_curr	= check_preempt_curr_rt,
 	.check_preempt_curr	= check_preempt_curr_rt,
 
 
@@ -253,8 +1177,18 @@ const struct sched_class rt_sched_class = {
 #ifdef CONFIG_SMP
 #ifdef CONFIG_SMP
 	.load_balance		= load_balance_rt,
 	.load_balance		= load_balance_rt,
 	.move_one_task		= move_one_task_rt,
 	.move_one_task		= move_one_task_rt,
+	.set_cpus_allowed       = set_cpus_allowed_rt,
+	.join_domain            = join_domain_rt,
+	.leave_domain           = leave_domain_rt,
+	.pre_schedule		= pre_schedule_rt,
+	.post_schedule		= post_schedule_rt,
+	.task_wake_up		= task_wake_up_rt,
+	.switched_from		= switched_from_rt,
 #endif
 #endif
 
 
 	.set_curr_task          = set_curr_task_rt,
 	.set_curr_task          = set_curr_task_rt,
 	.task_tick		= task_tick_rt,
 	.task_tick		= task_tick_rt,
+
+	.prio_changed		= prio_changed_rt,
+	.switched_to		= switched_to_rt,
 };
 };

+ 105 - 11
kernel/softlockup.c

@@ -8,6 +8,7 @@
  */
  */
 #include <linux/mm.h>
 #include <linux/mm.h>
 #include <linux/cpu.h>
 #include <linux/cpu.h>
+#include <linux/nmi.h>
 #include <linux/init.h>
 #include <linux/init.h>
 #include <linux/delay.h>
 #include <linux/delay.h>
 #include <linux/freezer.h>
 #include <linux/freezer.h>
@@ -23,8 +24,8 @@ static DEFINE_PER_CPU(unsigned long, touch_timestamp);
 static DEFINE_PER_CPU(unsigned long, print_timestamp);
 static DEFINE_PER_CPU(unsigned long, print_timestamp);
 static DEFINE_PER_CPU(struct task_struct *, watchdog_task);
 static DEFINE_PER_CPU(struct task_struct *, watchdog_task);
 
 
-static int did_panic;
-int softlockup_thresh = 10;
+static int __read_mostly did_panic;
+unsigned long __read_mostly softlockup_thresh = 60;
 
 
 static int
 static int
 softlock_panic(struct notifier_block *this, unsigned long event, void *ptr)
 softlock_panic(struct notifier_block *this, unsigned long event, void *ptr)
@@ -45,7 +46,7 @@ static struct notifier_block panic_block = {
  */
  */
 static unsigned long get_timestamp(int this_cpu)
 static unsigned long get_timestamp(int this_cpu)
 {
 {
-	return cpu_clock(this_cpu) >> 30;  /* 2^30 ~= 10^9 */
+	return cpu_clock(this_cpu) >> 30LL;  /* 2^30 ~= 10^9 */
 }
 }
 
 
 void touch_softlockup_watchdog(void)
 void touch_softlockup_watchdog(void)
@@ -100,11 +101,7 @@ void softlockup_tick(void)
 
 
 	now = get_timestamp(this_cpu);
 	now = get_timestamp(this_cpu);
 
 
-	/* Wake up the high-prio watchdog task every second: */
-	if (now > (touch_timestamp + 1))
-		wake_up_process(per_cpu(watchdog_task, this_cpu));
-
-	/* Warn about unreasonable 10+ seconds delays: */
+	/* Warn about unreasonable delays: */
 	if (now <= (touch_timestamp + softlockup_thresh))
 	if (now <= (touch_timestamp + softlockup_thresh))
 		return;
 		return;
 
 
@@ -121,12 +118,94 @@ void softlockup_tick(void)
 	spin_unlock(&print_lock);
 	spin_unlock(&print_lock);
 }
 }
 
 
+/*
+ * Have a reasonable limit on the number of tasks checked
+ * per scan of the task list:
+ */
+unsigned long __read_mostly sysctl_hung_task_check_count = 1024;
+
+/*
+ * Seconds a task may stay blocked before it is reported.
+ * Zero means infinite timeout - no checking done:
+ */
+unsigned long __read_mostly sysctl_hung_task_timeout_secs = 120;
+
+unsigned long __read_mostly sysctl_hung_task_warnings = 10;	/* decremented by check_hung_task() for every report */
+
+/*
+ * Only do the hung-tasks check on one CPU
+ * (-1 until a cpu has been selected):
+ */
+static int check_cpu __read_mostly = -1;
+
+/*
+ * Report @t if it has not been context-switched for at least
+ * sysctl_hung_task_timeout_secs.
+ *
+ * @t:   task to examine
+ * @now: current timestamp, as returned by get_timestamp()
+ *
+ * The task's voluntary + involuntary switch count is sampled; as long
+ * as it keeps changing (or no sample was taken yet) only the sample is
+ * refreshed. At most sysctl_hung_task_warnings reports are printed in
+ * total.
+ */
+static void check_hung_task(struct task_struct *t, unsigned long now)
+{
+	unsigned long switch_count = t->nvcsw + t->nivcsw;
+
+	if (t->flags & PF_FROZEN)
+		return;
+
+	if (switch_count != t->last_switch_count || !t->last_switch_timestamp) {
+		t->last_switch_count = switch_count;
+		t->last_switch_timestamp = now;
+		return;
+	}
+	if ((long)(now - t->last_switch_timestamp) <
+					sysctl_hung_task_timeout_secs)
+		return;
+	/*
+	 * The warning budget is an unsigned counter: test for zero
+	 * before decrementing, a "< 0" test can never be true and
+	 * would let the counter wrap around.
+	 */
+	if (!sysctl_hung_task_warnings)
+		return;
+	sysctl_hung_task_warnings--;
+
+	/*
+	 * Ok, the task did not get scheduled for more than the
+	 * configured timeout, complain:
+	 */
+	printk(KERN_ERR "INFO: task %s:%d blocked for more than "
+			"%lu seconds.\n", t->comm, t->pid,
+			sysctl_hung_task_timeout_secs);
+	printk(KERN_ERR "\"echo 0 > /proc/sys/kernel/hung_task_timeout_secs\""
+			" disables this message.\n");
+	sched_show_task(t);
+	__debug_show_held_locks(t);
+
+	/* Restart the interval so the same task is not reported every scan */
+	t->last_switch_timestamp = now;
+	touch_nmi_watchdog();
+}
+
+/*
+ * Check whether a TASK_UNINTERRUPTIBLE does not get woken up for
+ * a really long time (sysctl_hung_task_timeout_secs, 120 seconds by
+ * default). If that happens, print out a warning.
+ *
+ * @this_cpu: cpu whose clock is used for the timestamp.
+ *
+ * The scan runs under tasklist_lock and is bounded by
+ * sysctl_hung_task_check_count.
+ */
+static void check_hung_uninterruptible_tasks(int this_cpu)
+{
+	int max_count = sysctl_hung_task_check_count;
+	unsigned long now = get_timestamp(this_cpu);
+	struct task_struct *g, *t;
+
+	/*
+	 * If the system crashed already then all bets are off,
+	 * do not report extra hung tasks:
+	 */
+	if ((tainted & TAINT_DIE) || did_panic)
+		return;
+
+	read_lock(&tasklist_lock);
+	do_each_thread(g, t) {
+		/*
+		 * do_each_thread/while_each_thread is a double loop, a
+		 * "break" would only leave the inner per-thread loop and
+		 * the scan would carry on - jump out of both instead.
+		 */
+		if (!--max_count)
+			goto unlock;
+		if (t->state & TASK_UNINTERRUPTIBLE)
+			check_hung_task(t, now);
+	} while_each_thread(g, t);
+unlock:
+	read_unlock(&tasklist_lock);
+}
+
 /*
 /*
  * The watchdog thread - runs every second and touches the timestamp.
  * The watchdog thread - runs every second and touches the timestamp.
  */
  */
 static int watchdog(void *__bind_cpu)
 static int watchdog(void *__bind_cpu)
 {
 {
 	struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 };
 	struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 };
+	int this_cpu = (long)__bind_cpu;
 
 
 	sched_setscheduler(current, SCHED_FIFO, &param);
 	sched_setscheduler(current, SCHED_FIFO, &param);
 
 
@@ -135,13 +214,18 @@ static int watchdog(void *__bind_cpu)
 
 
 	/*
 	/*
 	 * Run briefly once per second to reset the softlockup timestamp.
 	 * Run briefly once per second to reset the softlockup timestamp.
-	 * If this gets delayed for more than 10 seconds then the
+	 * If this gets delayed for more than 60 seconds then the
 	 * debug-printout triggers in softlockup_tick().
 	 * debug-printout triggers in softlockup_tick().
 	 */
 	 */
 	while (!kthread_should_stop()) {
 	while (!kthread_should_stop()) {
-		set_current_state(TASK_INTERRUPTIBLE);
 		touch_softlockup_watchdog();
 		touch_softlockup_watchdog();
-		schedule();
+		msleep_interruptible(10000);
+
+		if (this_cpu != check_cpu)
+			continue;
+
+		if (sysctl_hung_task_timeout_secs)
+			check_hung_uninterruptible_tasks(this_cpu);
 	}
 	}
 
 
 	return 0;
 	return 0;
@@ -171,6 +255,7 @@ cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
 		break;
 		break;
 	case CPU_ONLINE:
 	case CPU_ONLINE:
 	case CPU_ONLINE_FROZEN:
 	case CPU_ONLINE_FROZEN:
+		check_cpu = any_online_cpu(cpu_online_map);
 		wake_up_process(per_cpu(watchdog_task, hotcpu));
 		wake_up_process(per_cpu(watchdog_task, hotcpu));
 		break;
 		break;
 #ifdef CONFIG_HOTPLUG_CPU
 #ifdef CONFIG_HOTPLUG_CPU
@@ -181,6 +266,15 @@ cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
 		/* Unbind so it can run.  Fall thru. */
 		/* Unbind so it can run.  Fall thru. */
 		kthread_bind(per_cpu(watchdog_task, hotcpu),
 		kthread_bind(per_cpu(watchdog_task, hotcpu),
 			     any_online_cpu(cpu_online_map));
 			     any_online_cpu(cpu_online_map));
+	case CPU_DOWN_PREPARE:
+	case CPU_DOWN_PREPARE_FROZEN:
+		if (hotcpu == check_cpu) {
+			cpumask_t temp_cpu_online_map = cpu_online_map;
+
+			cpu_clear(hotcpu, temp_cpu_online_map);
+			check_cpu = any_online_cpu(temp_cpu_online_map);
+		}
+		break;
 	case CPU_DEAD:
 	case CPU_DEAD:
 	case CPU_DEAD_FROZEN:
 	case CPU_DEAD_FROZEN:
 		p = per_cpu(watchdog_task, hotcpu);
 		p = per_cpu(watchdog_task, hotcpu);

+ 2 - 2
kernel/stop_machine.c

@@ -203,13 +203,13 @@ int stop_machine_run(int (*fn)(void *), void *data, unsigned int cpu)
 	int ret;
 	int ret;
 
 
 	/* No CPUs can come up or down during this. */
 	/* No CPUs can come up or down during this. */
-	lock_cpu_hotplug();
+	get_online_cpus();
 	p = __stop_machine_run(fn, data, cpu);
 	p = __stop_machine_run(fn, data, cpu);
 	if (!IS_ERR(p))
 	if (!IS_ERR(p))
 		ret = kthread_stop(p);
 		ret = kthread_stop(p);
 	else
 	else
 		ret = PTR_ERR(p);
 		ret = PTR_ERR(p);
-	unlock_cpu_hotplug();
+	put_online_cpus();
 
 
 	return ret;
 	return ret;
 }
 }

+ 74 - 3
kernel/sysctl.c

@@ -81,6 +81,7 @@ extern int compat_log;
 extern int maps_protect;
 extern int maps_protect;
 extern int sysctl_stat_interval;
 extern int sysctl_stat_interval;
 extern int audit_argv_kb;
 extern int audit_argv_kb;
+extern int latencytop_enabled;
 
 
 /* Constants used for minimum and  maximum */
 /* Constants used for minimum and  maximum */
 #ifdef CONFIG_DETECT_SOFTLOCKUP
 #ifdef CONFIG_DETECT_SOFTLOCKUP
@@ -306,9 +307,43 @@ static struct ctl_table kern_table[] = {
 		.procname	= "sched_nr_migrate",
 		.procname	= "sched_nr_migrate",
 		.data		= &sysctl_sched_nr_migrate,
 		.data		= &sysctl_sched_nr_migrate,
 		.maxlen		= sizeof(unsigned int),
 		.maxlen		= sizeof(unsigned int),
-		.mode		= 644,
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec,
+	},
+	{
+		.ctl_name	= CTL_UNNUMBERED,
+		.procname	= "sched_rt_period_ms",
+		.data		= &sysctl_sched_rt_period,
+		.maxlen		= sizeof(unsigned int),
+		.mode		= 0644,
 		.proc_handler	= &proc_dointvec,
 		.proc_handler	= &proc_dointvec,
 	},
 	},
+	{
+		.ctl_name	= CTL_UNNUMBERED,
+		.procname	= "sched_rt_ratio",
+		.data		= &sysctl_sched_rt_ratio,
+		.maxlen		= sizeof(unsigned int),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec,
+	},
+#if defined(CONFIG_FAIR_GROUP_SCHED) && defined(CONFIG_SMP)
+	{
+		.ctl_name       = CTL_UNNUMBERED,
+		.procname       = "sched_min_bal_int_shares",
+		.data           = &sysctl_sched_min_bal_int_shares,
+		.maxlen         = sizeof(unsigned int),
+		.mode           = 0644,
+		.proc_handler   = &proc_dointvec,
+	},
+	{
+		.ctl_name       = CTL_UNNUMBERED,
+		.procname       = "sched_max_bal_int_shares",
+		.data           = &sysctl_sched_max_bal_int_shares,
+		.maxlen         = sizeof(unsigned int),
+		.mode           = 0644,
+		.proc_handler   = &proc_dointvec,
+	},
+#endif
 #endif
 #endif
 	{
 	{
 		.ctl_name	= CTL_UNNUMBERED,
 		.ctl_name	= CTL_UNNUMBERED,
@@ -382,6 +417,15 @@ static struct ctl_table kern_table[] = {
 		.proc_handler	= &proc_dointvec_taint,
 		.proc_handler	= &proc_dointvec_taint,
 	},
 	},
 #endif
 #endif
+#ifdef CONFIG_LATENCYTOP
+	{
+		.procname	= "latencytop",
+		.data		= &latencytop_enabled,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec,
+	},
+#endif
 #ifdef CONFIG_SECURITY_CAPABILITIES
 #ifdef CONFIG_SECURITY_CAPABILITIES
 	{
 	{
 		.procname	= "cap-bound",
 		.procname	= "cap-bound",
@@ -728,13 +772,40 @@ static struct ctl_table kern_table[] = {
 		.ctl_name	= CTL_UNNUMBERED,
 		.ctl_name	= CTL_UNNUMBERED,
 		.procname	= "softlockup_thresh",
 		.procname	= "softlockup_thresh",
 		.data		= &softlockup_thresh,
 		.data		= &softlockup_thresh,
-		.maxlen		= sizeof(int),
+		.maxlen		= sizeof(unsigned long),
 		.mode		= 0644,
 		.mode		= 0644,
-		.proc_handler	= &proc_dointvec_minmax,
+		.proc_handler	= &proc_doulongvec_minmax,
 		.strategy	= &sysctl_intvec,
 		.strategy	= &sysctl_intvec,
 		.extra1		= &one,
 		.extra1		= &one,
 		.extra2		= &sixty,
 		.extra2		= &sixty,
 	},
 	},
+	{
+		.ctl_name	= CTL_UNNUMBERED,
+		.procname	= "hung_task_check_count",
+		.data		= &sysctl_hung_task_check_count,
+		.maxlen		= sizeof(unsigned long),
+		.mode		= 0644,
+		.proc_handler	= &proc_doulongvec_minmax,
+		.strategy	= &sysctl_intvec,
+	},
+	{
+		.ctl_name	= CTL_UNNUMBERED,
+		.procname	= "hung_task_timeout_secs",
+		.data		= &sysctl_hung_task_timeout_secs,
+		.maxlen		= sizeof(unsigned long),
+		.mode		= 0644,
+		.proc_handler	= &proc_doulongvec_minmax,
+		.strategy	= &sysctl_intvec,
+	},
+	{
+		.ctl_name	= CTL_UNNUMBERED,
+		.procname	= "hung_task_warnings",
+		.data		= &sysctl_hung_task_warnings,
+		.maxlen		= sizeof(unsigned long),
+		.mode		= 0644,
+		.proc_handler	= &proc_doulongvec_minmax,
+		.strategy	= &sysctl_intvec,
+	},
 #endif
 #endif
 #ifdef CONFIG_COMPAT
 #ifdef CONFIG_COMPAT
 	{
 	{

+ 5 - 8
kernel/time/tick-sched.c

@@ -153,6 +153,7 @@ void tick_nohz_update_jiffies(void)
 void tick_nohz_stop_sched_tick(void)
 void tick_nohz_stop_sched_tick(void)
 {
 {
 	unsigned long seq, last_jiffies, next_jiffies, delta_jiffies, flags;
 	unsigned long seq, last_jiffies, next_jiffies, delta_jiffies, flags;
+	unsigned long rt_jiffies;
 	struct tick_sched *ts;
 	struct tick_sched *ts;
 	ktime_t last_update, expires, now, delta;
 	ktime_t last_update, expires, now, delta;
 	struct clock_event_device *dev = __get_cpu_var(tick_cpu_device).evtdev;
 	struct clock_event_device *dev = __get_cpu_var(tick_cpu_device).evtdev;
@@ -216,6 +217,10 @@ void tick_nohz_stop_sched_tick(void)
 	next_jiffies = get_next_timer_interrupt(last_jiffies);
 	next_jiffies = get_next_timer_interrupt(last_jiffies);
 	delta_jiffies = next_jiffies - last_jiffies;
 	delta_jiffies = next_jiffies - last_jiffies;
 
 
+	rt_jiffies = rt_needs_cpu(cpu);
+	if (rt_jiffies && rt_jiffies < delta_jiffies)
+		delta_jiffies = rt_jiffies;
+
 	if (rcu_needs_cpu(cpu))
 	if (rcu_needs_cpu(cpu))
 		delta_jiffies = 1;
 		delta_jiffies = 1;
 	/*
 	/*
@@ -509,7 +514,6 @@ static enum hrtimer_restart tick_sched_timer(struct hrtimer *timer)
 {
 {
 	struct tick_sched *ts =
 	struct tick_sched *ts =
 		container_of(timer, struct tick_sched, sched_timer);
 		container_of(timer, struct tick_sched, sched_timer);
-	struct hrtimer_cpu_base *base = timer->base->cpu_base;
 	struct pt_regs *regs = get_irq_regs();
 	struct pt_regs *regs = get_irq_regs();
 	ktime_t now = ktime_get();
 	ktime_t now = ktime_get();
 	int cpu = smp_processor_id();
 	int cpu = smp_processor_id();
@@ -547,15 +551,8 @@ static enum hrtimer_restart tick_sched_timer(struct hrtimer *timer)
 			touch_softlockup_watchdog();
 			touch_softlockup_watchdog();
 			ts->idle_jiffies++;
 			ts->idle_jiffies++;
 		}
 		}
-		/*
-		 * update_process_times() might take tasklist_lock, hence
-		 * drop the base lock. sched-tick hrtimers are per-CPU and
-		 * never accessible by userspace APIs, so this is safe to do.
-		 */
-		spin_unlock(&base->lock);
 		update_process_times(user_mode(regs));
 		update_process_times(user_mode(regs));
 		profile_tick(CPU_PROFILING);
 		profile_tick(CPU_PROFILING);
-		spin_lock(&base->lock);
 	}
 	}
 
 
 	/* Do not restart, when we are in the idle loop */
 	/* Do not restart, when we are in the idle loop */

+ 2 - 1
kernel/timer.c

@@ -896,7 +896,7 @@ static void run_timer_softirq(struct softirq_action *h)
 {
 {
 	tvec_base_t *base = __get_cpu_var(tvec_bases);
 	tvec_base_t *base = __get_cpu_var(tvec_bases);
 
 
-	hrtimer_run_queues();
+	hrtimer_run_pending();
 
 
 	if (time_after_eq(jiffies, base->timer_jiffies))
 	if (time_after_eq(jiffies, base->timer_jiffies))
 		__run_timers(base);
 		__run_timers(base);
@@ -907,6 +907,7 @@ static void run_timer_softirq(struct softirq_action *h)
  */
  */
 void run_local_timers(void)
 void run_local_timers(void)
 {
 {
+	hrtimer_run_queues();
 	raise_softirq(TIMER_SOFTIRQ);
 	raise_softirq(TIMER_SOFTIRQ);
 	softlockup_tick();
 	softlockup_tick();
 }
 }

+ 20 - 27
kernel/user.c

@@ -319,7 +319,7 @@ void free_uid(struct user_struct *up)
 struct user_struct * alloc_uid(struct user_namespace *ns, uid_t uid)
 struct user_struct * alloc_uid(struct user_namespace *ns, uid_t uid)
 {
 {
 	struct hlist_head *hashent = uidhashentry(ns, uid);
 	struct hlist_head *hashent = uidhashentry(ns, uid);
-	struct user_struct *up;
+	struct user_struct *up, *new;
 
 
 	/* Make uid_hash_find() + uids_user_create() + uid_hash_insert()
 	/* Make uid_hash_find() + uids_user_create() + uid_hash_insert()
 	 * atomic.
 	 * atomic.
@@ -331,13 +331,9 @@ struct user_struct * alloc_uid(struct user_namespace *ns, uid_t uid)
 	spin_unlock_irq(&uidhash_lock);
 	spin_unlock_irq(&uidhash_lock);
 
 
 	if (!up) {
 	if (!up) {
-		struct user_struct *new;
-
 		new = kmem_cache_alloc(uid_cachep, GFP_KERNEL);
 		new = kmem_cache_alloc(uid_cachep, GFP_KERNEL);
-		if (!new) {
-			uids_mutex_unlock();
-			return NULL;
-		}
+		if (!new)
+			goto out_unlock;
 
 
 		new->uid = uid;
 		new->uid = uid;
 		atomic_set(&new->__count, 1);
 		atomic_set(&new->__count, 1);
@@ -353,28 +349,14 @@ struct user_struct * alloc_uid(struct user_namespace *ns, uid_t uid)
 #endif
 #endif
 		new->locked_shm = 0;
 		new->locked_shm = 0;
 
 
-		if (alloc_uid_keyring(new, current) < 0) {
-			kmem_cache_free(uid_cachep, new);
-			uids_mutex_unlock();
-			return NULL;
-		}
+		if (alloc_uid_keyring(new, current) < 0)
+			goto out_free_user;
 
 
-		if (sched_create_user(new) < 0) {
-			key_put(new->uid_keyring);
-			key_put(new->session_keyring);
-			kmem_cache_free(uid_cachep, new);
-			uids_mutex_unlock();
-			return NULL;
-		}
+		if (sched_create_user(new) < 0)
+			goto out_put_keys;
 
 
-		if (uids_user_create(new)) {
-			sched_destroy_user(new);
-			key_put(new->uid_keyring);
-			key_put(new->session_keyring);
-			kmem_cache_free(uid_cachep, new);
-			uids_mutex_unlock();
-			return NULL;
-		}
+		if (uids_user_create(new))
+			goto out_destoy_sched;
 
 
 		/*
 		/*
 		 * Before adding this, check whether we raced
 		 * Before adding this, check whether we raced
@@ -402,6 +384,17 @@ struct user_struct * alloc_uid(struct user_namespace *ns, uid_t uid)
 	uids_mutex_unlock();
 	uids_mutex_unlock();
 
 
 	return up;
 	return up;
+
+out_destoy_sched:
+	sched_destroy_user(new);
+out_put_keys:
+	key_put(new->uid_keyring);
+	key_put(new->session_keyring);
+out_free_user:
+	kmem_cache_free(uid_cachep, new);
+out_unlock:
+	uids_mutex_unlock();
+	return NULL;
 }
 }
 
 
 void switch_uid(struct user_struct *new_user)
 void switch_uid(struct user_struct *new_user)

+ 15 - 20
kernel/workqueue.c

@@ -67,9 +67,8 @@ struct workqueue_struct {
 #endif
 #endif
 };
 };
 
 
-/* All the per-cpu workqueues on the system, for hotplug cpu to add/remove
-   threads to each one as cpus come/go. */
-static DEFINE_MUTEX(workqueue_mutex);
+/* Serializes the accesses to the list of workqueues. */
+static DEFINE_SPINLOCK(workqueue_lock);
 static LIST_HEAD(workqueues);
 static LIST_HEAD(workqueues);
 
 
 static int singlethread_cpu __read_mostly;
 static int singlethread_cpu __read_mostly;
@@ -592,8 +591,6 @@ EXPORT_SYMBOL(schedule_delayed_work_on);
  * Returns zero on success.
  * Returns zero on success.
  * Returns -ve errno on failure.
  * Returns -ve errno on failure.
  *
  *
- * Appears to be racy against CPU hotplug.
- *
  * schedule_on_each_cpu() is very slow.
  * schedule_on_each_cpu() is very slow.
  */
  */
 int schedule_on_each_cpu(work_func_t func)
 int schedule_on_each_cpu(work_func_t func)
@@ -605,7 +602,7 @@ int schedule_on_each_cpu(work_func_t func)
 	if (!works)
 	if (!works)
 		return -ENOMEM;
 		return -ENOMEM;
 
 
-	preempt_disable();		/* CPU hotplug */
+	get_online_cpus();
 	for_each_online_cpu(cpu) {
 	for_each_online_cpu(cpu) {
 		struct work_struct *work = per_cpu_ptr(works, cpu);
 		struct work_struct *work = per_cpu_ptr(works, cpu);
 
 
@@ -613,8 +610,8 @@ int schedule_on_each_cpu(work_func_t func)
 		set_bit(WORK_STRUCT_PENDING, work_data_bits(work));
 		set_bit(WORK_STRUCT_PENDING, work_data_bits(work));
 		__queue_work(per_cpu_ptr(keventd_wq->cpu_wq, cpu), work);
 		__queue_work(per_cpu_ptr(keventd_wq->cpu_wq, cpu), work);
 	}
 	}
-	preempt_enable();
 	flush_workqueue(keventd_wq);
 	flush_workqueue(keventd_wq);
+	put_online_cpus();
 	free_percpu(works);
 	free_percpu(works);
 	return 0;
 	return 0;
 }
 }
@@ -750,8 +747,10 @@ struct workqueue_struct *__create_workqueue_key(const char *name,
 		err = create_workqueue_thread(cwq, singlethread_cpu);
 		err = create_workqueue_thread(cwq, singlethread_cpu);
 		start_workqueue_thread(cwq, -1);
 		start_workqueue_thread(cwq, -1);
 	} else {
 	} else {
-		mutex_lock(&workqueue_mutex);
+		get_online_cpus();
+		spin_lock(&workqueue_lock);
 		list_add(&wq->list, &workqueues);
 		list_add(&wq->list, &workqueues);
+		spin_unlock(&workqueue_lock);
 
 
 		for_each_possible_cpu(cpu) {
 		for_each_possible_cpu(cpu) {
 			cwq = init_cpu_workqueue(wq, cpu);
 			cwq = init_cpu_workqueue(wq, cpu);
@@ -760,7 +759,7 @@ struct workqueue_struct *__create_workqueue_key(const char *name,
 			err = create_workqueue_thread(cwq, cpu);
 			err = create_workqueue_thread(cwq, cpu);
 			start_workqueue_thread(cwq, cpu);
 			start_workqueue_thread(cwq, cpu);
 		}
 		}
-		mutex_unlock(&workqueue_mutex);
+		put_online_cpus();
 	}
 	}
 
 
 	if (err) {
 	if (err) {
@@ -775,7 +774,7 @@ static void cleanup_workqueue_thread(struct cpu_workqueue_struct *cwq, int cpu)
 {
 {
 	/*
 	/*
 	 * Our caller is either destroy_workqueue() or CPU_DEAD,
 	 * Our caller is either destroy_workqueue() or CPU_DEAD,
-	 * workqueue_mutex protects cwq->thread
+	 * get_online_cpus() protects cwq->thread.
 	 */
 	 */
 	if (cwq->thread == NULL)
 	if (cwq->thread == NULL)
 		return;
 		return;
@@ -810,9 +809,11 @@ void destroy_workqueue(struct workqueue_struct *wq)
 	struct cpu_workqueue_struct *cwq;
 	struct cpu_workqueue_struct *cwq;
 	int cpu;
 	int cpu;
 
 
-	mutex_lock(&workqueue_mutex);
+	get_online_cpus();
+	spin_lock(&workqueue_lock);
 	list_del(&wq->list);
 	list_del(&wq->list);
-	mutex_unlock(&workqueue_mutex);
+	spin_unlock(&workqueue_lock);
+	put_online_cpus();
 
 
 	for_each_cpu_mask(cpu, *cpu_map) {
 	for_each_cpu_mask(cpu, *cpu_map) {
 		cwq = per_cpu_ptr(wq->cpu_wq, cpu);
 		cwq = per_cpu_ptr(wq->cpu_wq, cpu);
@@ -835,13 +836,6 @@ static int __devinit workqueue_cpu_callback(struct notifier_block *nfb,
 	action &= ~CPU_TASKS_FROZEN;
 	action &= ~CPU_TASKS_FROZEN;
 
 
 	switch (action) {
 	switch (action) {
-	case CPU_LOCK_ACQUIRE:
-		mutex_lock(&workqueue_mutex);
-		return NOTIFY_OK;
-
-	case CPU_LOCK_RELEASE:
-		mutex_unlock(&workqueue_mutex);
-		return NOTIFY_OK;
 
 
 	case CPU_UP_PREPARE:
 	case CPU_UP_PREPARE:
 		cpu_set(cpu, cpu_populated_map);
 		cpu_set(cpu, cpu_populated_map);
@@ -854,7 +848,8 @@ static int __devinit workqueue_cpu_callback(struct notifier_block *nfb,
 		case CPU_UP_PREPARE:
 		case CPU_UP_PREPARE:
 			if (!create_workqueue_thread(cwq, cpu))
 			if (!create_workqueue_thread(cwq, cpu))
 				break;
 				break;
-			printk(KERN_ERR "workqueue for %i failed\n", cpu);
+			printk(KERN_ERR "workqueue [%s] for %i failed\n",
+				wq->name, cpu);
 			return NOTIFY_BAD;
 			return NOTIFY_BAD;
 
 
 		case CPU_ONLINE:
 		case CPU_ONLINE:

+ 14 - 0
lib/Kconfig.debug

@@ -517,4 +517,18 @@ config FAULT_INJECTION_STACKTRACE_FILTER
 	help
 	help
 	  Provide stacktrace filter for fault-injection capabilities
 	  Provide stacktrace filter for fault-injection capabilities
 
 
+config LATENCYTOP
+	bool "Latency measuring infrastructure"
+	select FRAME_POINTER if !MIPS
+	select KALLSYMS
+	select KALLSYMS_ALL
+	select STACKTRACE
+	select SCHEDSTATS
+	select SCHED_DEBUG
+	depends on X86 || X86_64
+	help
+	  Enable this option if you want to use the LatencyTOP tool to find
+	  out which userspace tasks are blocking on which kernel operations.
+
+
 source "samples/Kconfig"
 source "samples/Kconfig"

+ 0 - 123
lib/kernel_lock.c

@@ -9,7 +9,6 @@
 #include <linux/module.h>
 #include <linux/module.h>
 #include <linux/kallsyms.h>
 #include <linux/kallsyms.h>
 
 
-#ifdef CONFIG_PREEMPT_BKL
 /*
 /*
  * The 'big kernel semaphore'
  * The 'big kernel semaphore'
  *
  *
@@ -86,128 +85,6 @@ void __lockfunc unlock_kernel(void)
 		up(&kernel_sem);
 		up(&kernel_sem);
 }
 }
 
 
-#else
-
-/*
- * The 'big kernel lock'
- *
- * This spinlock is taken and released recursively by lock_kernel()
- * and unlock_kernel().  It is transparently dropped and reacquired
- * over schedule().  It is used to protect legacy code that hasn't
- * been migrated to a proper locking design yet.
- *
- * Don't use in new code.
- */
-static  __cacheline_aligned_in_smp DEFINE_SPINLOCK(kernel_flag);
-
-
-/*
- * Acquire/release the underlying lock from the scheduler.
- *
- * This is called with preemption disabled, and should
- * return an error value if it cannot get the lock and
- * TIF_NEED_RESCHED gets set.
- *
- * If it successfully gets the lock, it should increment
- * the preemption count like any spinlock does.
- *
- * (This works on UP too - _raw_spin_trylock will never
- * return false in that case)
- */
-int __lockfunc __reacquire_kernel_lock(void)
-{
-	while (!_raw_spin_trylock(&kernel_flag)) {
-		if (test_thread_flag(TIF_NEED_RESCHED))
-			return -EAGAIN;
-		cpu_relax();
-	}
-	preempt_disable();
-	return 0;
-}
-
-void __lockfunc __release_kernel_lock(void)
-{
-	_raw_spin_unlock(&kernel_flag);
-	preempt_enable_no_resched();
-}
-
-/*
- * These are the BKL spinlocks - we try to be polite about preemption. 
- * If SMP is not on (ie UP preemption), this all goes away because the
- * _raw_spin_trylock() will always succeed.
- */
-#ifdef CONFIG_PREEMPT
-static inline void __lock_kernel(void)
-{
-	preempt_disable();
-	if (unlikely(!_raw_spin_trylock(&kernel_flag))) {
-		/*
-		 * If preemption was disabled even before this
-		 * was called, there's nothing we can be polite
-		 * about - just spin.
-		 */
-		if (preempt_count() > 1) {
-			_raw_spin_lock(&kernel_flag);
-			return;
-		}
-
-		/*
-		 * Otherwise, let's wait for the kernel lock
-		 * with preemption enabled..
-		 */
-		do {
-			preempt_enable();
-			while (spin_is_locked(&kernel_flag))
-				cpu_relax();
-			preempt_disable();
-		} while (!_raw_spin_trylock(&kernel_flag));
-	}
-}
-
-#else
-
-/*
- * Non-preemption case - just get the spinlock
- */
-static inline void __lock_kernel(void)
-{
-	_raw_spin_lock(&kernel_flag);
-}
-#endif
-
-static inline void __unlock_kernel(void)
-{
-	/*
-	 * the BKL is not covered by lockdep, so we open-code the
-	 * unlocking sequence (and thus avoid the dep-chain ops):
-	 */
-	_raw_spin_unlock(&kernel_flag);
-	preempt_enable();
-}
-
-/*
- * Getting the big kernel lock.
- *
- * This cannot happen asynchronously, so we only need to
- * worry about other CPU's.
- */
-void __lockfunc lock_kernel(void)
-{
-	int depth = current->lock_depth+1;
-	if (likely(!depth))
-		__lock_kernel();
-	current->lock_depth = depth;
-}
-
-void __lockfunc unlock_kernel(void)
-{
-	BUG_ON(current->lock_depth < 0);
-	if (likely(--current->lock_depth < 0))
-		__unlock_kernel();
-}
-
-#endif
-
 EXPORT_SYMBOL(lock_kernel);
 EXPORT_SYMBOL(lock_kernel);
 EXPORT_SYMBOL(unlock_kernel);
 EXPORT_SYMBOL(unlock_kernel);
 
 

+ 1 - 1
mm/oom_kill.c

@@ -286,7 +286,7 @@ static void __oom_kill_task(struct task_struct *p, int verbose)
 	 * all the memory it needs. That way it should be able to
 	 * all the memory it needs. That way it should be able to
 	 * exit() and clear out its resources quickly...
 	 * exit() and clear out its resources quickly...
 	 */
 	 */
-	p->time_slice = HZ;
+	p->rt.time_slice = HZ;
 	set_tsk_thread_flag(p, TIF_MEMDIE);
 	set_tsk_thread_flag(p, TIF_MEMDIE);
 
 
 	force_sig(SIGKILL, p);
 	force_sig(SIGKILL, p);

+ 11 - 7
mm/slab.c

@@ -730,8 +730,7 @@ static inline void init_lock_keys(void)
 #endif
 #endif
 
 
 /*
 /*
- * 1. Guard access to the cache-chain.
- * 2. Protect sanity of cpu_online_map against cpu hotplug events
+ * Guard access to the cache-chain.
  */
  */
 static DEFINE_MUTEX(cache_chain_mutex);
 static DEFINE_MUTEX(cache_chain_mutex);
 static struct list_head cache_chain;
 static struct list_head cache_chain;
@@ -1331,12 +1330,11 @@ static int __cpuinit cpuup_callback(struct notifier_block *nfb,
 	int err = 0;
 	int err = 0;
 
 
 	switch (action) {
 	switch (action) {
-	case CPU_LOCK_ACQUIRE:
-		mutex_lock(&cache_chain_mutex);
-		break;
 	case CPU_UP_PREPARE:
 	case CPU_UP_PREPARE:
 	case CPU_UP_PREPARE_FROZEN:
 	case CPU_UP_PREPARE_FROZEN:
+		mutex_lock(&cache_chain_mutex);
 		err = cpuup_prepare(cpu);
 		err = cpuup_prepare(cpu);
+		mutex_unlock(&cache_chain_mutex);
 		break;
 		break;
 	case CPU_ONLINE:
 	case CPU_ONLINE:
 	case CPU_ONLINE_FROZEN:
 	case CPU_ONLINE_FROZEN:
@@ -1373,9 +1371,8 @@ static int __cpuinit cpuup_callback(struct notifier_block *nfb,
 #endif
 #endif
 	case CPU_UP_CANCELED:
 	case CPU_UP_CANCELED:
 	case CPU_UP_CANCELED_FROZEN:
 	case CPU_UP_CANCELED_FROZEN:
+		mutex_lock(&cache_chain_mutex);
 		cpuup_canceled(cpu);
 		cpuup_canceled(cpu);
-		break;
-	case CPU_LOCK_RELEASE:
 		mutex_unlock(&cache_chain_mutex);
 		mutex_unlock(&cache_chain_mutex);
 		break;
 		break;
 	}
 	}
@@ -2170,6 +2167,7 @@ kmem_cache_create (const char *name, size_t size, size_t align,
 	 * We use cache_chain_mutex to ensure a consistent view of
 	 * We use cache_chain_mutex to ensure a consistent view of
 	 * cpu_online_map as well.  Please see cpuup_callback
 	 * cpu_online_map as well.  Please see cpuup_callback
 	 */
 	 */
+	get_online_cpus();
 	mutex_lock(&cache_chain_mutex);
 	mutex_lock(&cache_chain_mutex);
 
 
 	list_for_each_entry(pc, &cache_chain, next) {
 	list_for_each_entry(pc, &cache_chain, next) {
@@ -2396,6 +2394,7 @@ oops:
 		panic("kmem_cache_create(): failed to create slab `%s'\n",
 		panic("kmem_cache_create(): failed to create slab `%s'\n",
 		      name);
 		      name);
 	mutex_unlock(&cache_chain_mutex);
 	mutex_unlock(&cache_chain_mutex);
+	put_online_cpus();
 	return cachep;
 	return cachep;
 }
 }
 EXPORT_SYMBOL(kmem_cache_create);
 EXPORT_SYMBOL(kmem_cache_create);
@@ -2547,9 +2546,11 @@ int kmem_cache_shrink(struct kmem_cache *cachep)
 	int ret;
 	int ret;
 	BUG_ON(!cachep || in_interrupt());
 	BUG_ON(!cachep || in_interrupt());
 
 
+	get_online_cpus();
 	mutex_lock(&cache_chain_mutex);
 	mutex_lock(&cache_chain_mutex);
 	ret = __cache_shrink(cachep);
 	ret = __cache_shrink(cachep);
 	mutex_unlock(&cache_chain_mutex);
 	mutex_unlock(&cache_chain_mutex);
+	put_online_cpus();
 	return ret;
 	return ret;
 }
 }
 EXPORT_SYMBOL(kmem_cache_shrink);
 EXPORT_SYMBOL(kmem_cache_shrink);
@@ -2575,6 +2576,7 @@ void kmem_cache_destroy(struct kmem_cache *cachep)
 	BUG_ON(!cachep || in_interrupt());
 	BUG_ON(!cachep || in_interrupt());
 
 
 	/* Find the cache in the chain of caches. */
 	/* Find the cache in the chain of caches. */
+	get_online_cpus();
 	mutex_lock(&cache_chain_mutex);
 	mutex_lock(&cache_chain_mutex);
 	/*
 	/*
 	 * the chain is never empty, cache_cache is never destroyed
 	 * the chain is never empty, cache_cache is never destroyed
@@ -2584,6 +2586,7 @@ void kmem_cache_destroy(struct kmem_cache *cachep)
 		slab_error(cachep, "Can't free all objects");
 		slab_error(cachep, "Can't free all objects");
 		list_add(&cachep->next, &cache_chain);
 		list_add(&cachep->next, &cache_chain);
 		mutex_unlock(&cache_chain_mutex);
 		mutex_unlock(&cache_chain_mutex);
+		put_online_cpus();
 		return;
 		return;
 	}
 	}
 
 
@@ -2592,6 +2595,7 @@ void kmem_cache_destroy(struct kmem_cache *cachep)
 
 
 	__kmem_cache_destroy(cachep);
 	__kmem_cache_destroy(cachep);
 	mutex_unlock(&cache_chain_mutex);
 	mutex_unlock(&cache_chain_mutex);
+	put_online_cpus();
 }
 }
 EXPORT_SYMBOL(kmem_cache_destroy);
 EXPORT_SYMBOL(kmem_cache_destroy);
 
 

+ 2 - 2
net/core/flow.c

@@ -293,7 +293,7 @@ void flow_cache_flush(void)
 	static DEFINE_MUTEX(flow_flush_sem);
 	static DEFINE_MUTEX(flow_flush_sem);
 
 
 	/* Don't want cpus going down or up during this. */
 	/* Don't want cpus going down or up during this. */
-	lock_cpu_hotplug();
+	get_online_cpus();
 	mutex_lock(&flow_flush_sem);
 	mutex_lock(&flow_flush_sem);
 	atomic_set(&info.cpuleft, num_online_cpus());
 	atomic_set(&info.cpuleft, num_online_cpus());
 	init_completion(&info.completion);
 	init_completion(&info.completion);
@@ -305,7 +305,7 @@ void flow_cache_flush(void)
 
 
 	wait_for_completion(&info.completion);
 	wait_for_completion(&info.completion);
 	mutex_unlock(&flow_flush_sem);
 	mutex_unlock(&flow_flush_sem);
-	unlock_cpu_hotplug();
+	put_online_cpus();
 }
 }
 
 
 static void __devinit flow_cache_cpu_prepare(int cpu)
 static void __devinit flow_cache_cpu_prepare(int cpu)