9 years ago · 9061cbe62a
--- a/Documentation/RCU/Design/Requirements/2013-08-is-it-dead.png
+++ b/Documentation/RCU/Design/Requirements/2013-08-is-it-dead.png
--- a/Documentation/RCU/Design/Requirements/GPpartitionReaders1.svg
+++ b/Documentation/RCU/Design/Requirements/GPpartitionReaders1.svg
@@ -0,0 +1,374 @@
 
				+<?xml version="1.0" encoding="UTF-8" standalone="no"?>
			
 
				+<!-- Created with Inkscape (http://www.inkscape.org/) -->
			
 
				+
			
 
				+<svg
			
 
				+   xmlns:dc="http://purl.org/dc/elements/1.1/"
			
 
				+   xmlns:cc="http://creativecommons.org/ns#"
			
 
				+   xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"
			
 
				+   xmlns:svg="http://www.w3.org/2000/svg"
			
 
				+   xmlns="http://www.w3.org/2000/svg"
			
 
				+   xmlns:sodipodi="http://sodipodi.sourceforge.net/DTD/sodipodi-0.dtd"
			
 
				+   xmlns:inkscape="http://www.inkscape.org/namespaces/inkscape"
			
 
				+   width="447.99197"
			
 
				+   height="428.19299"
			
 
				+   id="svg2"
			
 
				+   version="1.1"
			
 
				+   inkscape:version="0.48.3.1 r9886"
			
 
				+   sodipodi:docname="GPpartitionReaders1.svg">
			
 
				+  <defs
			
 
				+     id="defs4">
			
 
				+    <marker
			
 
				+       inkscape:stockid="Arrow2Lend"
			
 
				+       orient="auto"
			
 
				+       refY="0"
			
 
				+       refX="0"
			
 
				+       id="Arrow2Lend"
			
 
				+       style="overflow:visible">
			
 
				+      <path
			
 
				+         id="path3792"
			
 
				+         style="fill-rule:evenodd;stroke-width:0.625;stroke-linejoin:round"
			
 
				+         d="M 8.7185878,4.0337352 -2.2072895,0.01601326 8.7185884,-4.0017078 c -1.7454984,2.3720609 -1.7354408,5.6174519 -6e-7,8.035443 z"
			
 
				+         transform="matrix(-1.1,0,0,-1.1,-1.1,0)"
			
 
				+         inkscape:connector-curvature="0" />
			
 
				+    </marker>
			
 
				+    <marker
			
 
				+       inkscape:stockid="Arrow2Lstart"
			
 
				+       orient="auto"
			
 
				+       refY="0"
			
 
				+       refX="0"
			
 
				+       id="Arrow2Lstart"
			
 
				+       style="overflow:visible">
			
 
				+      <path
			
 
				+         id="path3789"
			
 
				+         style="fill-rule:evenodd;stroke-width:0.625;stroke-linejoin:round"
			
 
				+         d="M 8.7185878,4.0337352 -2.2072895,0.01601326 8.7185884,-4.0017078 c -1.7454984,2.3720609 -1.7354408,5.6174519 -6e-7,8.035443 z"
			
 
				+         transform="matrix(1.1,0,0,1.1,1.1,0)"
			
 
				+         inkscape:connector-curvature="0" />
			
 
				+    </marker>
			
 
				+  </defs>
			
 
				+  <sodipodi:namedview
			
 
				+     id="base"
			
 
				+     pagecolor="#ffffff"
			
 
				+     bordercolor="#666666"
			
 
				+     borderopacity="1.0"
			
 
				+     inkscape:pageopacity="0.0"
			
 
				+     inkscape:pageshadow="2"
			
 
				+     inkscape:zoom="1.6184291"
			
 
				+     inkscape:cx="223.99599"
			
 
				+     inkscape:cy="214.0965"
			
 
				+     inkscape:document-units="px"
			
 
				+     inkscape:current-layer="layer1"
			
 
				+     showgrid="false"
			
 
				+     inkscape:window-width="979"
			
 
				+     inkscape:window-height="836"
			
 
				+     inkscape:window-x="571"
			
 
				+     inkscape:window-y="335"
			
 
				+     inkscape:window-maximized="0"
			
 
				+     fit-margin-top="5"
			
 
				+     fit-margin-left="5"
			
 
				+     fit-margin-right="5"
			
 
				+     fit-margin-bottom="5" />
			
 
				+  <metadata
			
 
				+     id="metadata7">
			
 
				+    <rdf:RDF>
			
 
				+      <cc:Work
			
 
				+         rdf:about="">
			
 
				+        <dc:format>image/svg+xml</dc:format>
			
 
				+        <dc:type
			
 
				+           rdf:resource="http://purl.org/dc/dcmitype/StillImage" />
			
 
				+        <dc:title></dc:title>
			
 
				+      </cc:Work>
			
 
				+    </rdf:RDF>
			
 
				+  </metadata>
			
 
				+  <g
			
 
				+     inkscape:label="Layer 1"
			
 
				+     inkscape:groupmode="layer"
			
 
				+     id="layer1"
			
 
				+     transform="translate(-28.441125,-185.60612)">
			
 
				+    <flowRoot
			
 
				+       xml:space="preserve"
			
 
				+       id="flowRoot2985"
			
 
				+       style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:Symbol;-inkscape-font-specification:Symbol"><flowRegion
			
 
				+         id="flowRegion2987"><rect
			
 
				+           id="rect2989"
			
 
				+           width="82.85714"
			
 
				+           height="11.428572"
			
 
				+           x="240"
			
 
				+           y="492.36218" /></flowRegion><flowPara
			
 
				+         id="flowPara2991"></flowPara></flowRoot>    <g
			
 
				+       id="g4433"
			
 
				+       transform="translate(2,0)">
			
 
				+      <text
			
 
				+         sodipodi:linespacing="125%"
			
 
				+         id="text2993"
			
 
				+         y="-261.66608"
			
 
				+         x="412.12299"
			
 
				+         style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:Symbol;-inkscape-font-specification:Symbol"
			
 
				+         xml:space="preserve"
			
 
				+         transform="matrix(0,1,-1,0,0,0)"><tspan
			
 
				+           y="-261.66608"
			
 
				+           x="412.12299"
			
 
				+           id="tspan2995"
			
 
				+           sodipodi:role="line">synchronize_rcu()</tspan></text>
			
 
				+      <g
			
 
				+         id="g4417"
			
 
				+         transform="matrix(0,1,-1,0,730.90257,222.4928)">
			
 
				+        <path
			
 
				+           style="fill:none;stroke:#000000;stroke-width:1px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1;marker-start:url(#Arrow2Lstart);marker-end:url(#Arrow2Lend)"
			
 
				+           d="m 97.580736,477.4048 183.140664,0"
			
 
				+           id="path2997"
			
 
				+           inkscape:connector-curvature="0"
			
 
				+           sodipodi:nodetypes="cc" />
			
 
				+        <path
			
 
				+           style="fill:none;stroke:#000000;stroke-width:1px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1"
			
 
				+           d="m 96.752718,465.38398 0,22.62742"
			
 
				+           id="path4397"
			
 
				+           inkscape:connector-curvature="0"
			
 
				+           sodipodi:nodetypes="cc" />
			
 
				+        <path
			
 
				+           style="fill:none;stroke:#000000;stroke-width:1px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1"
			
 
				+           d="m 281.54942,465.38397 0,22.62742"
			
 
				+           id="path4397-5"
			
 
				+           inkscape:connector-curvature="0"
			
 
				+           sodipodi:nodetypes="cc" />
			
 
				+      </g>
			
 
				+    </g>
			
 
				+    <text
			
 
				+       xml:space="preserve"
			
 
				+       style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:Symbol;-inkscape-font-specification:Symbol"
			
 
				+       x="112.04738"
			
 
				+       y="268.18076"
			
 
				+       id="text4429"
			
 
				+       sodipodi:linespacing="125%"><tspan
			
 
				+         sodipodi:role="line"
			
 
				+         id="tspan4431"
			
 
				+         x="112.04738"
			
 
				+         y="268.18076">WRITE_ONCE(a, 1);</tspan></text>
			
 
				+    <text
			
 
				+       xml:space="preserve"
			
 
				+       style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:Symbol;-inkscape-font-specification:Symbol"
			
 
				+       x="112.04738"
			
 
				+       y="439.13766"
			
 
				+       id="text4441"
			
 
				+       sodipodi:linespacing="125%"><tspan
			
 
				+         sodipodi:role="line"
			
 
				+         id="tspan4443"
			
 
				+         x="112.04738"
			
 
				+         y="439.13766">WRITE_ONCE(b, 1);</tspan></text>
			
 
				+    <text
			
 
				+       xml:space="preserve"
			
 
				+       style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:Symbol;-inkscape-font-specification:Symbol"
			
 
				+       x="255.60869"
			
 
				+       y="309.29346"
			
 
				+       id="text4445"
			
 
				+       sodipodi:linespacing="125%"><tspan
			
 
				+         sodipodi:role="line"
			
 
				+         id="tspan4447"
			
 
				+         x="255.60869"
			
 
				+         y="309.29346">r1 = READ_ONCE(a);</tspan></text>
			
 
				+    <text
			
 
				+       xml:space="preserve"
			
 
				+       style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:Symbol;-inkscape-font-specification:Symbol"
			
 
				+       x="255.14423"
			
 
				+       y="520.61786"
			
 
				+       id="text4449"
			
 
				+       sodipodi:linespacing="125%"><tspan
			
 
				+         sodipodi:role="line"
			
 
				+         id="tspan4451"
			
 
				+         x="255.14423"
			
 
				+         y="520.61786">WRITE_ONCE(c, 1);</tspan></text>
			
 
				+    <text
			
 
				+       xml:space="preserve"
			
 
				+       style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:Symbol;-inkscape-font-specification:Symbol"
			
 
				+       x="396.10254"
			
 
				+       y="384.71124"
			
 
				+       id="text4453"
			
 
				+       sodipodi:linespacing="125%"><tspan
			
 
				+         sodipodi:role="line"
			
 
				+         id="tspan4455"
			
 
				+         x="396.10254"
			
 
				+         y="384.71124">r2 = READ_ONCE(b);</tspan></text>
			
 
				+    <text
			
 
				+       xml:space="preserve"
			
 
				+       style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:Symbol;-inkscape-font-specification:Symbol"
			
 
				+       x="396.10254"
			
 
				+       y="582.13617"
			
 
				+       id="text4457"
			
 
				+       sodipodi:linespacing="125%"><tspan
			
 
				+         sodipodi:role="line"
			
 
				+         id="tspan4459"
			
 
				+         x="396.10254"
			
 
				+         y="582.13617">r3 = READ_ONCE(c);</tspan></text>
			
 
				+    <text
			
 
				+       xml:space="preserve"
			
 
				+       style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:Symbol;-inkscape-font-specification:Symbol"
			
 
				+       x="112.08231"
			
 
				+       y="213.91006"
			
 
				+       id="text4461"
			
 
				+       sodipodi:linespacing="125%"><tspan
			
 
				+         sodipodi:role="line"
			
 
				+         id="tspan4463"
			
 
				+         x="112.08231"
			
 
				+         y="213.91006">thread0()</tspan></text>
			
 
				+    <text
			
 
				+       xml:space="preserve"
			
 
				+       style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:Symbol;-inkscape-font-specification:Symbol"
			
 
				+       x="252.34512"
			
 
				+       y="213.91006"
			
 
				+       id="text4461-6"
			
 
				+       sodipodi:linespacing="125%"><tspan
			
 
				+         sodipodi:role="line"
			
 
				+         id="tspan4463-0"
			
 
				+         x="252.34512"
			
 
				+         y="213.91006">thread1()</tspan></text>
			
 
				+    <text
			
 
				+       xml:space="preserve"
			
 
				+       style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:Symbol;-inkscape-font-specification:Symbol"
			
 
				+       x="396.42557"
			
 
				+       y="213.91006"
			
 
				+       id="text4461-2"
			
 
				+       sodipodi:linespacing="125%"><tspan
			
 
				+         sodipodi:role="line"
			
 
				+         id="tspan4463-2"
			
 
				+         x="396.42557"
			
 
				+         y="213.91006">thread2()</tspan></text>
			
 
				+    <rect
			
 
				+       style="fill:none;stroke:#000000;stroke-width:1;stroke-miterlimit:4;stroke-opacity:1;stroke-dasharray:none;stroke-dashoffset:0"
			
 
				+       id="rect4495"
			
 
				+       width="436.28488"
			
 
				+       height="416.4859"
			
 
				+       x="34.648232"
			
 
				+       y="191.10612" />
			
 
				+    <path
			
 
				+       style="fill:none;stroke:#000000;stroke-width:1;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-opacity:1;stroke-dasharray:none;stroke-dashoffset:0"
			
 
				+       d="m 183.14066,191.10612 0,417.193 -0.70711,0"
			
 
				+       id="path4497"
			
 
				+       inkscape:connector-curvature="0" />
			
 
				+    <path
			
 
				+       style="fill:none;stroke:#000000;stroke-width:1;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-opacity:1;stroke-dasharray:none;stroke-dashoffset:0"
			
 
				+       d="m 325.13867,191.10612 0,417.193 -0.70711,0"
			
 
				+       id="path4497-5"
			
 
				+       inkscape:connector-curvature="0" />
			
 
				+    <text
			
 
				+       xml:space="preserve"
			
 
				+       style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:Symbol;-inkscape-font-specification:Symbol"
			
 
				+       x="111.75929"
			
 
				+       y="251.53981"
			
 
				+       id="text4429-8"
			
 
				+       sodipodi:linespacing="125%"><tspan
			
 
				+         sodipodi:role="line"
			
 
				+         id="tspan4431-9"
			
 
				+         x="111.75929"
			
 
				+         y="251.53981">rcu_read_lock();</tspan></text>
			
 
				+    <text
			
 
				+       xml:space="preserve"
			
 
				+       style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:Symbol;-inkscape-font-specification:Symbol"
			
 
				+       x="396.10254"
			
 
				+       y="367.91556"
			
 
				+       id="text4429-8-9"
			
 
				+       sodipodi:linespacing="125%"><tspan
			
 
				+         sodipodi:role="line"
			
 
				+         id="tspan4431-9-4"
			
 
				+         x="396.10254"
			
 
				+         y="367.91556">rcu_read_lock();</tspan></text>
			
 
				+    <text
			
 
				+       xml:space="preserve"
			
 
				+       style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:Symbol;-inkscape-font-specification:Symbol"
			
 
				+       x="396.10254"
			
 
				+       y="597.40289"
			
 
				+       id="text4429-8-9-3"
			
 
				+       sodipodi:linespacing="125%"><tspan
			
 
				+         sodipodi:role="line"
			
 
				+         id="tspan4431-9-4-4"
			
 
				+         x="396.10254"
			
 
				+         y="597.40289">rcu_read_unlock();</tspan></text>
			
 
				+    <text
			
 
				+       xml:space="preserve"
			
 
				+       style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:Symbol;-inkscape-font-specification:Symbol"
			
 
				+       x="111.75929"
			
 
				+       y="453.15311"
			
 
				+       id="text4429-8-9-3-1"
			
 
				+       sodipodi:linespacing="125%"><tspan
			
 
				+         sodipodi:role="line"
			
 
				+         id="tspan4431-9-4-4-6"
			
 
				+         x="111.75929"
			
 
				+         y="453.15311">rcu_read_unlock();</tspan></text>
			
 
				+    <path
			
 
				+       style="fill:none;stroke:#000000;stroke-width:1px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1"
			
 
				+       d="m 33.941125,227.87568 436.284885,0 0,0.7071"
			
 
				+       id="path4608"
			
 
				+       inkscape:connector-curvature="0" />
			
 
				+    <text
			
 
				+       xml:space="preserve"
			
 
				+       style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:Symbol;-inkscape-font-specification:Symbol"
			
 
				+       x="394.94427"
			
 
				+       y="345.66351"
			
 
				+       id="text4648"
			
 
				+       sodipodi:linespacing="125%"><tspan
			
 
				+         sodipodi:role="line"
			
 
				+         id="tspan4650"
			
 
				+         x="394.94427"
			
 
				+         y="345.66351">QS</tspan></text>
			
 
				+    <path
			
 
				+       sodipodi:type="arc"
			
 
				+       style="fill:none;stroke:#000000;stroke-width:1;stroke-miterlimit:4;stroke-opacity:1;stroke-dasharray:none;stroke-dashoffset:0"
			
 
				+       id="path4652"
			
 
				+       sodipodi:cx="358.85669"
			
 
				+       sodipodi:cy="142.87541"
			
 
				+       sodipodi:rx="10.960155"
			
 
				+       sodipodi:ry="10.253048"
			
 
				+       d="m 358.86939,132.62237 a 10.960155,10.253048 0 1 1 -0.0228,0"
			
 
				+       transform="translate(36.441125,199.60612)"
			
 
				+       sodipodi:start="4.7135481"
			
 
				+       sodipodi:end="10.994651"
			
 
				+       sodipodi:open="true" />
			
 
				+    <text
			
 
				+       xml:space="preserve"
			
 
				+       style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:Symbol;-inkscape-font-specification:Symbol"
			
 
				+       x="112.11968"
			
 
				+       y="475.77856"
			
 
				+       id="text4648-4"
			
 
				+       sodipodi:linespacing="125%"><tspan
			
 
				+         sodipodi:role="line"
			
 
				+         id="tspan4650-4"
			
 
				+         x="112.11968"
			
 
				+         y="475.77856">QS</tspan></text>
			
 
				+    <path
			
 
				+       sodipodi:type="arc"
			
 
				+       style="fill:none;stroke:#000000;stroke-width:1;stroke-miterlimit:4;stroke-opacity:1;stroke-dasharray:none;stroke-dashoffset:0"
			
 
				+       id="path4652-7"
			
 
				+       sodipodi:cx="358.85669"
			
 
				+       sodipodi:cy="142.87541"
			
 
				+       sodipodi:rx="10.960155"
			
 
				+       sodipodi:ry="10.253048"
			
 
				+       d="m 358.86939,132.62237 a 10.960155,10.253048 0 1 1 -0.0228,0"
			
 
				+       transform="translate(-246.38346,329.72117)"
			
 
				+       sodipodi:start="4.7135481"
			
 
				+       sodipodi:end="10.994651"
			
 
				+       sodipodi:open="true" />
			
 
				+    <path
			
 
				+       sodipodi:type="arc"
			
 
				+       style="fill:#ffffff;fill-opacity:1;stroke:#000000;stroke-width:1;stroke-miterlimit:4;stroke-opacity:1;stroke-dasharray:none;stroke-dashoffset:0"
			
 
				+       id="path4652-7-7"
			
 
				+       sodipodi:cx="358.85669"
			
 
				+       sodipodi:cy="142.87541"
			
 
				+       sodipodi:rx="10.960155"
			
 
				+       sodipodi:ry="10.253048"
			
 
				+       d="m 358.86939,132.62237 a 10.960155,10.253048 0 1 1 -0.0228,0"
			
 
				+       transform="translate(-103.65246,202.90878)"
			
 
				+       sodipodi:start="4.7135481"
			
 
				+       sodipodi:end="10.994651"
			
 
				+       sodipodi:open="true" />
			
 
				+    <text
			
 
				+       xml:space="preserve"
			
 
				+       style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:Symbol;-inkscape-font-specification:Symbol"
			
 
				+       x="254.85066"
			
 
				+       y="348.96619"
			
 
				+       id="text4648-4-3"
			
 
				+       sodipodi:linespacing="125%"><tspan
			
 
				+         sodipodi:role="line"
			
 
				+         id="tspan4650-4-5"
			
 
				+         x="254.85066"
			
 
				+         y="348.96619">QS</tspan></text>
			
 
				+  </g>
			
 
				+</svg>
			
--- a/Documentation/RCU/Design/Requirements/RCUApplicability.svg
+++ b/Documentation/RCU/Design/Requirements/RCUApplicability.svg
@@ -0,0 +1,237 @@
 
				+<?xml version="1.0" encoding="UTF-8" standalone="no"?>
			
 
				+<!-- Creator: fig2dev Version 3.2 Patchlevel 5d -->
			
 
				+
			
 
				+<!-- CreationDate: Tue Mar  4 18:34:25 2014 -->
			
 
				+
			
 
				+<!-- Magnification: 3.000 -->
			
 
				+
			
 
				+<svg
			
 
				+   xmlns:dc="http://purl.org/dc/elements/1.1/"
			
 
				+   xmlns:cc="http://creativecommons.org/ns#"
			
 
				+   xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"
			
 
				+   xmlns:svg="http://www.w3.org/2000/svg"
			
 
				+   xmlns="http://www.w3.org/2000/svg"
			
 
				+   xmlns:sodipodi="http://sodipodi.sourceforge.net/DTD/sodipodi-0.dtd"
			
 
				+   xmlns:inkscape="http://www.inkscape.org/namespaces/inkscape"
			
 
				+   width="1089.1382"
			
 
				+   height="668.21368"
			
 
				+   viewBox="-2121 -36 14554.634 8876.4061"
			
 
				+   id="svg2"
			
 
				+   version="1.1"
			
 
				+   inkscape:version="0.48.3.1 r9886"
			
 
				+   sodipodi:docname="RCUApplicability.svg">
			
 
				+  <metadata
			
 
				+     id="metadata40">
			
 
				+    <rdf:RDF>
			
 
				+      <cc:Work
			
 
				+         rdf:about="">
			
 
				+        <dc:format>image/svg+xml</dc:format>
			
 
				+        <dc:type
			
 
				+           rdf:resource="http://purl.org/dc/dcmitype/StillImage" />
			
 
				+        <dc:title />
			
 
				+      </cc:Work>
			
 
				+    </rdf:RDF>
			
 
				+  </metadata>
			
 
				+  <defs
			
 
				+     id="defs38" />
			
 
				+  <sodipodi:namedview
			
 
				+     pagecolor="#ffffff"
			
 
				+     bordercolor="#666666"
			
 
				+     borderopacity="1"
			
 
				+     objecttolerance="10"
			
 
				+     gridtolerance="10"
			
 
				+     guidetolerance="10"
			
 
				+     inkscape:pageopacity="0"
			
 
				+     inkscape:pageshadow="2"
			
 
				+     inkscape:window-width="849"
			
 
				+     inkscape:window-height="639"
			
 
				+     id="namedview36"
			
 
				+     showgrid="false"
			
 
				+     inkscape:zoom="0.51326165"
			
 
				+     inkscape:cx="544.56912"
			
 
				+     inkscape:cy="334.10686"
			
 
				+     inkscape:window-x="149"
			
 
				+     inkscape:window-y="448"
			
 
				+     inkscape:window-maximized="0"
			
 
				+     inkscape:current-layer="g4"
			
 
				+     fit-margin-top="5"
			
 
				+     fit-margin-left="5"
			
 
				+     fit-margin-right="5"
			
 
				+     fit-margin-bottom="5" />
			
 
				+  <g
			
 
				+     style="fill:none;stroke-width:0.025in"
			
 
				+     id="g4"
			
 
				+     transform="translate(-2043.6828,14.791398)">
			
 
				+    <!-- Line: box -->
			
 
				+    <rect
			
 
				+       x="0"
			
 
				+       y="0"
			
 
				+       width="14400"
			
 
				+       height="8775"
			
 
				+       rx="0"
			
 
				+       style="fill:#ffa1a1;stroke:#000000;stroke-width:21;stroke-linecap:butt;stroke-linejoin:miter"
			
 
				+       id="rect6" />
			
 
				+    <!-- Line: box -->
			
 
				+    <rect
			
 
				+       x="1350"
			
 
				+       y="0"
			
 
				+       width="11700"
			
 
				+       height="6075"
			
 
				+       rx="0"
			
 
				+       style="fill:#ffff00;stroke:#000000;stroke-width:21;stroke-linecap:butt;stroke-linejoin:miter"
			
 
				+       id="rect8" />
			
 
				+    <!-- Line: box -->
			
 
				+    <rect
			
 
				+       x="2700"
			
 
				+       y="0"
			
 
				+       width="9000"
			
 
				+       height="4275"
			
 
				+       rx="0"
			
 
				+       style="fill:#00ff00;stroke:#000000;stroke-width:21;stroke-linecap:butt;stroke-linejoin:miter"
			
 
				+       id="rect10" />
			
 
				+    <!-- Line: box -->
			
 
				+    <rect
			
 
				+       x="4050"
			
 
				+       y="0"
			
 
				+       width="6300"
			
 
				+       height="2475"
			
 
				+       rx="0"
			
 
				+       style="fill:#87cfff;stroke:#000000;stroke-width:21;stroke-linecap:butt;stroke-linejoin:miter"
			
 
				+       id="rect12" />
			
 
				+    <!-- Text -->
			
 
				+    <text
			
 
				+       xml:space="preserve"
			
 
				+       x="7200"
			
 
				+       y="900"
			
 
				+       font-style="normal"
			
 
				+       font-weight="normal"
			
 
				+       font-size="324"
			
 
				+       id="text14"
			
 
				+       sodipodi:linespacing="125%"
			
 
				+       style="font-size:427.63009644px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;writing-mode:lr-tb;text-anchor:middle;fill:#000000;font-family:Nimbus Sans L;-inkscape-font-specification:Nimbus Sans L"><tspan
			
 
				+         style="font-size:427.63009644px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;writing-mode:lr-tb;text-anchor:middle;font-family:Nimbus Sans L;-inkscape-font-specification:Nimbus Sans L"
			
 
				+         id="tspan3017">Read-Mostly, Stale &amp;</tspan></text>
			
 
				+    <!-- Text -->
			
 
				+    <text
			
 
				+       xml:space="preserve"
			
 
				+       x="7200"
			
 
				+       y="1350"
			
 
				+       font-style="normal"
			
 
				+       font-weight="normal"
			
 
				+       font-size="324"
			
 
				+       id="text16"
			
 
				+       sodipodi:linespacing="125%"
			
 
				+       style="font-size:427.63009644px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;writing-mode:lr-tb;text-anchor:middle;fill:#000000;font-family:Nimbus Sans L;-inkscape-font-specification:Nimbus Sans L"><tspan
			
 
				+         style="font-size:427.63009644px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;writing-mode:lr-tb;text-anchor:middle;font-family:Nimbus Sans L;-inkscape-font-specification:Nimbus Sans L"
			
 
				+         id="tspan3019">Inconsistent Data OK</tspan></text>
			
 
				+    <!-- Text -->
			
 
				+    <text
			
 
				+       xml:space="preserve"
			
 
				+       x="7200"
			
 
				+       y="1800"
			
 
				+       font-style="normal"
			
 
				+       font-weight="normal"
			
 
				+       font-size="324"
			
 
				+       id="text18"
			
 
				+       sodipodi:linespacing="125%"
			
 
				+       style="font-size:427.63009644px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;writing-mode:lr-tb;text-anchor:middle;fill:#000000;font-family:Nimbus Sans L;-inkscape-font-specification:Nimbus Sans L"><tspan
			
 
				+         style="font-size:427.63009644px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;writing-mode:lr-tb;text-anchor:middle;font-family:Nimbus Sans L;-inkscape-font-specification:Nimbus Sans L"
			
 
				+         id="tspan3021">(RCU Works Great!!!)</tspan></text>
			
 
				+    <!-- Text -->
			
 
				+    <text
			
 
				+       xml:space="preserve"
			
 
				+       x="7200"
			
 
				+       y="3825"
			
 
				+       font-style="normal"
			
 
				+       font-weight="normal"
			
 
				+       font-size="324"
			
 
				+       id="text20"
			
 
				+       sodipodi:linespacing="125%"
			
 
				+       style="font-size:427.63009644px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;writing-mode:lr-tb;text-anchor:middle;fill:#000000;font-family:Nimbus Sans L;-inkscape-font-specification:Nimbus Sans L"><tspan
			
 
				+         style="font-size:427.63009644px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;writing-mode:lr-tb;text-anchor:middle;font-family:Nimbus Sans L;-inkscape-font-specification:Nimbus Sans L"
			
 
				+         id="tspan3023">(RCU Works Well)</tspan></text>
			
 
				+    <!-- Text -->
			
 
				+    <text
			
 
				+       xml:space="preserve"
			
 
				+       x="7200"
			
 
				+       y="3375"
			
 
				+       font-style="normal"
			
 
				+       font-weight="normal"
			
 
				+       font-size="324"
			
 
				+       id="text22"
			
 
				+       sodipodi:linespacing="125%"
			
 
				+       style="font-size:427.63009644px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;writing-mode:lr-tb;text-anchor:middle;fill:#000000;font-family:Nimbus Sans L;-inkscape-font-specification:Nimbus Sans L"><tspan
			
 
				+         style="font-size:427.63009644px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;writing-mode:lr-tb;text-anchor:middle;font-family:Nimbus Sans L;-inkscape-font-specification:Nimbus Sans L"
			
 
				+         id="tspan3025">Read-Mostly, Need Consistent Data</tspan></text>
			
 
				+    <!-- Text -->
			
 
				+    <text
			
 
				+       xml:space="preserve"
			
 
				+       x="7200"
			
 
				+       y="5175"
			
 
				+       font-style="normal"
			
 
				+       font-weight="normal"
			
 
				+       font-size="324"
			
 
				+       id="text24"
			
 
				+       sodipodi:linespacing="125%"
			
 
				+       style="font-size:427.63009644px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;writing-mode:lr-tb;text-anchor:middle;fill:#000000;font-family:Nimbus Sans L;-inkscape-font-specification:Nimbus Sans L"><tspan
			
 
				+         style="font-size:427.63009644px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;writing-mode:lr-tb;text-anchor:middle;font-family:Nimbus Sans L;-inkscape-font-specification:Nimbus Sans L"
			
 
				+         id="tspan3027">Read-Write, Need Consistent Data</tspan></text>
			
 
				+    <!-- Text -->
			
 
				+    <text
			
 
				+       xml:space="preserve"
			
 
				+       x="7200"
			
 
				+       y="6975"
			
 
				+       font-style="normal"
			
 
				+       font-weight="normal"
			
 
				+       font-size="324"
			
 
				+       id="text26"
			
 
				+       style="font-size:427.63009644px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;writing-mode:lr-tb;text-anchor:middle;fill:#000000;font-family:Nimbus Sans L;-inkscape-font-specification:Nimbus Sans L"
			
 
				+       sodipodi:linespacing="125%">Update-Mostly, Need Consistent Data</text>
			
 
				+    <!-- Text -->
			
 
				+    <text
			
 
				+       xml:space="preserve"
			
 
				+       x="7200"
			
 
				+       y="5625"
			
 
				+       font-style="normal"
			
 
				+       font-weight="normal"
			
 
				+       font-size="324"
			
 
				+       id="text28"
			
 
				+       sodipodi:linespacing="125%"
			
 
				+       style="font-size:427.63009644px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;writing-mode:lr-tb;text-anchor:middle;fill:#000000;font-family:Nimbus Sans L;-inkscape-font-specification:Nimbus Sans L"><tspan
			
 
				+         style="font-size:427.63009644px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;writing-mode:lr-tb;text-anchor:middle;font-family:Nimbus Sans L;-inkscape-font-specification:Nimbus Sans L"
			
 
				+         id="tspan3029">(RCU Might Be OK...)</tspan></text>
			
 
				+    <!-- Text -->
			
 
				+    <text
			
 
				+       xml:space="preserve"
			
 
				+       x="7200"
			
 
				+       y="7875"
			
 
				+       font-style="normal"
			
 
				+       font-weight="normal"
			
 
				+       font-size="324"
			
 
				+       id="text30"
			
 
				+       style="font-size:427.63009644px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;writing-mode:lr-tb;text-anchor:middle;fill:#000000;font-family:Nimbus Sans L;-inkscape-font-specification:Nimbus Sans L"
			
 
				+       sodipodi:linespacing="125%">(1) Provide Existence Guarantees For Update-Friendly Mechanisms</text>
			
 
				+    <!-- Text -->
			
 
				+    <text
			
 
				+       xml:space="preserve"
			
 
				+       x="7200"
			
 
				+       y="8325"
			
 
				+       font-style="normal"
			
 
				+       font-weight="normal"
			
 
				+       font-size="324"
			
 
				+       id="text32"
			
 
				+       style="font-size:427.63009644px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;writing-mode:lr-tb;text-anchor:middle;fill:#000000;font-family:Nimbus Sans L;-inkscape-font-specification:Nimbus Sans L"
			
 
				+       sodipodi:linespacing="125%">(2) Provide Wait-Free Read-Side Primitives for Real-Time Use)</text>
			
 
				+    <!-- Text -->
			
 
				+    <text
			
 
				+       xml:space="preserve"
			
 
				+       x="7200"
			
 
				+       y="7425"
			
 
				+       font-style="normal"
			
 
				+       font-weight="normal"
			
 
				+       font-size="324"
			
 
				+       id="text34"
			
 
				+       style="font-size:427.63009644px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;writing-mode:lr-tb;text-anchor:middle;fill:#000000;font-family:Nimbus Sans L;-inkscape-font-specification:Nimbus Sans L"
			
 
				+       sodipodi:linespacing="125%">(RCU is Very Unlikely to be the Right Tool For The Job, But it Can:</text>
			
 
				+  </g>
			
 
				+</svg>
			
--- a/Documentation/RCU/Design/Requirements/ReadersPartitionGP1.svg
+++ b/Documentation/RCU/Design/Requirements/ReadersPartitionGP1.svg
@@ -0,0 +1,639 @@
 
				+<?xml version="1.0" encoding="UTF-8" standalone="no"?>
			
 
				+<!-- Created with Inkscape (http://www.inkscape.org/) -->
			
 
				+
			
 
				+<svg
			
 
				+   xmlns:dc="http://purl.org/dc/elements/1.1/"
			
 
				+   xmlns:cc="http://creativecommons.org/ns#"
			
 
				+   xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"
			
 
				+   xmlns:svg="http://www.w3.org/2000/svg"
			
 
				+   xmlns="http://www.w3.org/2000/svg"
			
 
				+   xmlns:sodipodi="http://sodipodi.sourceforge.net/DTD/sodipodi-0.dtd"
			
 
				+   xmlns:inkscape="http://www.inkscape.org/namespaces/inkscape"
			
 
				+   width="735.25"
			
 
				+   height="516.21875"
			
 
				+   id="svg2"
			
 
				+   version="1.1"
			
 
				+   inkscape:version="0.48.3.1 r9886"
			
 
				+   sodipodi:docname="ReadersPartitionGP1.svg">
			
 
				+  <defs
			
 
				+     id="defs4">
			
 
				+    <marker
			
 
				+       inkscape:stockid="Arrow2Lend"
			
 
				+       orient="auto"
			
 
				+       refY="0"
			
 
				+       refX="0"
			
 
				+       id="Arrow2Lend"
			
 
				+       style="overflow:visible">
			
 
				+      <path
			
 
				+         id="path3792"
			
 
				+         style="fill-rule:evenodd;stroke-width:0.625;stroke-linejoin:round"
			
 
				+         d="M 8.7185878,4.0337352 -2.2072895,0.01601326 8.7185884,-4.0017078 c -1.7454984,2.3720609 -1.7354408,5.6174519 -6e-7,8.035443 z"
			
 
				+         transform="matrix(-1.1,0,0,-1.1,-1.1,0)"
			
 
				+         inkscape:connector-curvature="0" />
			
 
				+    </marker>
			
 
				+    <marker
			
 
				+       inkscape:stockid="Arrow2Lstart"
			
 
				+       orient="auto"
			
 
				+       refY="0"
			
 
				+       refX="0"
			
 
				+       id="Arrow2Lstart"
			
 
				+       style="overflow:visible">
			
 
				+      <path
			
 
				+         id="path3789"
			
 
				+         style="fill-rule:evenodd;stroke-width:0.625;stroke-linejoin:round"
			
 
				+         d="M 8.7185878,4.0337352 -2.2072895,0.01601326 8.7185884,-4.0017078 c -1.7454984,2.3720609 -1.7354408,5.6174519 -6e-7,8.035443 z"
			
 
				+         transform="matrix(1.1,0,0,1.1,1.1,0)"
			
 
				+         inkscape:connector-curvature="0" />
			
 
				+    </marker>
			
 
				+    <marker
			
 
				+       inkscape:stockid="Arrow2Lstart"
			
 
				+       orient="auto"
			
 
				+       refY="0"
			
 
				+       refX="0"
			
 
				+       id="Arrow2Lstart-4"
			
 
				+       style="overflow:visible">
			
 
				+      <path
			
 
				+         id="path3789-9"
			
 
				+         style="fill-rule:evenodd;stroke-width:0.625;stroke-linejoin:round"
			
 
				+         d="M 8.7185878,4.0337352 -2.2072895,0.01601326 8.7185884,-4.0017078 c -1.7454984,2.3720609 -1.7354408,5.6174519 -6e-7,8.035443 z"
			
 
				+         transform="matrix(1.1,0,0,1.1,1.1,0)"
			
 
				+         inkscape:connector-curvature="0" />
			
 
				+    </marker>
			
 
				+    <marker
			
 
				+       inkscape:stockid="Arrow2Lend"
			
 
				+       orient="auto"
			
 
				+       refY="0"
			
 
				+       refX="0"
			
 
				+       id="Arrow2Lend-4"
			
 
				+       style="overflow:visible">
			
 
				+      <path
			
 
				+         id="path3792-4"
			
 
				+         style="fill-rule:evenodd;stroke-width:0.625;stroke-linejoin:round"
			
 
				+         d="M 8.7185878,4.0337352 -2.2072895,0.01601326 8.7185884,-4.0017078 c -1.7454984,2.3720609 -1.7354408,5.6174519 -6e-7,8.035443 z"
			
 
				+         transform="matrix(-1.1,0,0,-1.1,-1.1,0)"
			
 
				+         inkscape:connector-curvature="0" />
			
 
				+    </marker>
			
 
				+  </defs>
			
 
				+  <sodipodi:namedview
			
 
				+     id="base"
			
 
				+     pagecolor="#ffffff"
			
 
				+     bordercolor="#666666"
			
 
				+     borderopacity="1.0"
			
 
				+     inkscape:pageopacity="0.0"
			
 
				+     inkscape:pageshadow="2"
			
 
				+     inkscape:zoom="1.3670394"
			
 
				+     inkscape:cx="367.26465"
			
 
				+     inkscape:cy="258.46182"
			
 
				+     inkscape:document-units="px"
			
 
				+     inkscape:current-layer="g4433-6"
			
 
				+     showgrid="false"
			
 
				+     inkscape:window-width="1351"
			
 
				+     inkscape:window-height="836"
			
 
				+     inkscape:window-x="438"
			
 
				+     inkscape:window-y="335"
			
 
				+     inkscape:window-maximized="0"
			
 
				+     fit-margin-top="5"
			
 
				+     fit-margin-left="5"
			
 
				+     fit-margin-right="5"
			
 
				+     fit-margin-bottom="5" />
			
 
				+  <metadata
			
 
				+     id="metadata7">
			
 
				+    <rdf:RDF>
			
 
				+      <cc:Work
			
 
				+         rdf:about="">
			
 
				+        <dc:format>image/svg+xml</dc:format>
			
 
				+        <dc:type
			
 
				+           rdf:resource="http://purl.org/dc/dcmitype/StillImage" />
			
 
				+        <dc:title />
			
 
				+      </cc:Work>
			
 
				+    </rdf:RDF>
			
 
				+  </metadata>
			
 
				+  <g
			
 
				+     inkscape:label="Layer 1"
			
 
				+     inkscape:groupmode="layer"
			
 
				+     id="layer1"
			
 
				+     transform="translate(-29.15625,-185.59375)">
			
 
				+    <flowRoot
			
 
				+       xml:space="preserve"
			
 
				+       id="flowRoot2985"
			
 
				+       style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:Symbol;-inkscape-font-specification:Symbol"><flowRegion
			
 
				+         id="flowRegion2987"><rect
			
 
				+           id="rect2989"
			
 
				+           width="82.85714"
			
 
				+           height="11.428572"
			
 
				+           x="240"
			
 
				+           y="492.36218" /></flowRegion><flowPara
			
 
				+         id="flowPara2991" /></flowRoot>    <g
			
 
				+       id="g4433"
			
 
				+       transform="translate(2,-12)">
			
 
				+      <text
			
 
				+         sodipodi:linespacing="125%"
			
 
				+         id="text2993"
			
 
				+         y="-261.66608"
			
 
				+         x="436.12299"
			
 
				+         style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:Symbol;-inkscape-font-specification:Symbol"
			
 
				+         xml:space="preserve"
			
 
				+         transform="matrix(0,1,-1,0,0,0)"><tspan
			
 
				+           y="-261.66608"
			
 
				+           x="436.12299"
			
 
				+           id="tspan2995"
			
 
				+           sodipodi:role="line">synchronize_rcu()</tspan></text>
			
 
				+      <g
			
 
				+         id="g4417"
			
 
				+         transform="matrix(0,1,-1,0,730.90257,222.4928)">
			
 
				+        <path
			
 
				+           style="fill:none;stroke:#000000;stroke-width:1px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1;marker-start:url(#Arrow2Lstart);marker-end:url(#Arrow2Lend)"
			
 
				+           d="M 97.580736,477.4048 327.57913,476.09759"
			
 
				+           id="path2997"
			
 
				+           inkscape:connector-curvature="0"
			
 
				+           sodipodi:nodetypes="cc" />
			
 
				+        <path
			
 
				+           style="fill:none;stroke:#000000;stroke-width:1px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1"
			
 
				+           d="m 96.752718,465.38398 0,22.62742"
			
 
				+           id="path4397"
			
 
				+           inkscape:connector-curvature="0"
			
 
				+           sodipodi:nodetypes="cc" />
			
 
				+        <path
			
 
				+           style="fill:none;stroke:#000000;stroke-width:1px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1"
			
 
				+           d="m 328.40703,465.38397 0,22.62742"
			
 
				+           id="path4397-5"
			
 
				+           inkscape:connector-curvature="0"
			
 
				+           sodipodi:nodetypes="cc" />
			
 
				+      </g>
			
 
				+    </g>
			
 
				+    <text
			
 
				+       xml:space="preserve"
			
 
				+       style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:Symbol;-inkscape-font-specification:Symbol"
			
 
				+       x="112.04738"
			
 
				+       y="268.18076"
			
 
				+       id="text4429"
			
 
				+       sodipodi:linespacing="125%"><tspan
			
 
				+         sodipodi:role="line"
			
 
				+         id="tspan4431"
			
 
				+         x="112.04738"
			
 
				+         y="268.18076">WRITE_ONCE(a, 1);</tspan></text>
			
 
				+    <text
			
 
				+       xml:space="preserve"
			
 
				+       style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:Symbol;-inkscape-font-specification:Symbol"
			
 
				+       x="112.04738"
			
 
				+       y="487.13766"
			
 
				+       id="text4441"
			
 
				+       sodipodi:linespacing="125%"><tspan
			
 
				+         sodipodi:role="line"
			
 
				+         id="tspan4443"
			
 
				+         x="112.04738"
			
 
				+         y="487.13766">WRITE_ONCE(b, 1);</tspan></text>
			
 
				+    <text
			
 
				+       xml:space="preserve"
			
 
				+       style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:Symbol;-inkscape-font-specification:Symbol"
			
 
				+       x="255.60869"
			
 
				+       y="297.29346"
			
 
				+       id="text4445"
			
 
				+       sodipodi:linespacing="125%"><tspan
			
 
				+         sodipodi:role="line"
			
 
				+         id="tspan4447"
			
 
				+         x="255.60869"
			
 
				+         y="297.29346">r1 = READ_ONCE(a);</tspan></text>
			
 
				+    <text
			
 
				+       xml:space="preserve"
			
 
				+       style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:Symbol;-inkscape-font-specification:Symbol"
			
 
				+       x="255.14423"
			
 
				+       y="554.61786"
			
 
				+       id="text4449"
			
 
				+       sodipodi:linespacing="125%"><tspan
			
 
				+         sodipodi:role="line"
			
 
				+         id="tspan4451"
			
 
				+         x="255.14423"
			
 
				+         y="554.61786">WRITE_ONCE(c, 1);</tspan></text>
			
 
				+    <text
			
 
				+       xml:space="preserve"
			
 
				+       style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:Symbol;-inkscape-font-specification:Symbol"
			
 
				+       x="396.10254"
			
 
				+       y="370.71124"
			
 
				+       id="text4453"
			
 
				+       sodipodi:linespacing="125%"><tspan
			
 
				+         sodipodi:role="line"
			
 
				+         id="tspan4455"
			
 
				+         x="396.10254"
			
 
				+         y="370.71124">WRITE_ONCE(d, 1);</tspan></text>
			
 
				+    <text
			
 
				+       xml:space="preserve"
			
 
				+       style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:Symbol;-inkscape-font-specification:Symbol"
			
 
				+       x="396.10254"
			
 
				+       y="572.13617"
			
 
				+       id="text4457"
			
 
				+       sodipodi:linespacing="125%"><tspan
			
 
				+         sodipodi:role="line"
			
 
				+         id="tspan4459"
			
 
				+         x="396.10254"
			
 
				+         y="572.13617">r2 = READ_ONCE(c);</tspan></text>
			
 
				+    <text
			
 
				+       xml:space="preserve"
			
 
				+       style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:Symbol;-inkscape-font-specification:Symbol"
			
 
				+       x="112.08231"
			
 
				+       y="213.91006"
			
 
				+       id="text4461"
			
 
				+       sodipodi:linespacing="125%"><tspan
			
 
				+         sodipodi:role="line"
			
 
				+         id="tspan4463"
			
 
				+         x="112.08231"
			
 
				+         y="213.91006">thread0()</tspan></text>
			
 
				+    <text
			
 
				+       xml:space="preserve"
			
 
				+       style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:Symbol;-inkscape-font-specification:Symbol"
			
 
				+       x="252.34512"
			
 
				+       y="213.91006"
			
 
				+       id="text4461-6"
			
 
				+       sodipodi:linespacing="125%"><tspan
			
 
				+         sodipodi:role="line"
			
 
				+         id="tspan4463-0"
			
 
				+         x="252.34512"
			
 
				+         y="213.91006">thread1()</tspan></text>
			
 
				+    <text
			
 
				+       xml:space="preserve"
			
 
				+       style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:Symbol;-inkscape-font-specification:Symbol"
			
 
				+       x="396.42557"
			
 
				+       y="213.91006"
			
 
				+       id="text4461-2"
			
 
				+       sodipodi:linespacing="125%"><tspan
			
 
				+         sodipodi:role="line"
			
 
				+         id="tspan4463-2"
			
 
				+         x="396.42557"
			
 
				+         y="213.91006">thread2()</tspan></text>
			
 
				+    <rect
			
 
				+       style="fill:none;stroke:#000000;stroke-width:1;stroke-miterlimit:4;stroke-opacity:1;stroke-dasharray:none;stroke-dashoffset:0"
			
 
				+       id="rect4495"
			
 
				+       width="724.25244"
			
 
				+       height="505.21201"
			
 
				+       x="34.648232"
			
 
				+       y="191.10612" />
			
 
				+    <path
			
 
				+       style="fill:none;stroke:#000000;stroke-width:1;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-opacity:1;stroke-dasharray:none;stroke-dashoffset:0"
			
 
				+       d="m 183.14066,191.10612 0,504.24243"
			
 
				+       id="path4497"
			
 
				+       inkscape:connector-curvature="0"
			
 
				+       sodipodi:nodetypes="cc" />
			
 
				+    <path
			
 
				+       style="fill:none;stroke:#000000;stroke-width:1;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-opacity:1;stroke-dasharray:none;stroke-dashoffset:0"
			
 
				+       d="m 325.13867,191.10612 0,504.24243"
			
 
				+       id="path4497-5"
			
 
				+       inkscape:connector-curvature="0"
			
 
				+       sodipodi:nodetypes="cc" />
			
 
				+    <text
			
 
				+       xml:space="preserve"
			
 
				+       style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:Symbol;-inkscape-font-specification:Symbol"
			
 
				+       x="111.75929"
			
 
				+       y="251.53981"
			
 
				+       id="text4429-8"
			
 
				+       sodipodi:linespacing="125%"><tspan
			
 
				+         sodipodi:role="line"
			
 
				+         id="tspan4431-9"
			
 
				+         x="111.75929"
			
 
				+         y="251.53981">rcu_read_lock();</tspan></text>
			
 
				+    <text
			
 
				+       xml:space="preserve"
			
 
				+       style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:Symbol;-inkscape-font-specification:Symbol"
			
 
				+       x="396.10254"
			
 
				+       y="353.91556"
			
 
				+       id="text4429-8-9"
			
 
				+       sodipodi:linespacing="125%"><tspan
			
 
				+         sodipodi:role="line"
			
 
				+         id="tspan4431-9-4"
			
 
				+         x="396.10254"
			
 
				+         y="353.91556">rcu_read_lock();</tspan></text>
			
 
				+    <text
			
 
				+       xml:space="preserve"
			
 
				+       style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:Symbol;-inkscape-font-specification:Symbol"
			
 
				+       x="396.10254"
			
 
				+       y="587.40289"
			
 
				+       id="text4429-8-9-3"
			
 
				+       sodipodi:linespacing="125%"><tspan
			
 
				+         sodipodi:role="line"
			
 
				+         id="tspan4431-9-4-4"
			
 
				+         x="396.10254"
			
 
				+         y="587.40289">rcu_read_unlock();</tspan></text>
			
 
				+    <text
			
 
				+       xml:space="preserve"
			
 
				+       style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:Symbol;-inkscape-font-specification:Symbol"
			
 
				+       x="111.75929"
			
 
				+       y="501.15311"
			
 
				+       id="text4429-8-9-3-1"
			
 
				+       sodipodi:linespacing="125%"><tspan
			
 
				+         sodipodi:role="line"
			
 
				+         id="tspan4431-9-4-4-6"
			
 
				+         x="111.75929"
			
 
				+         y="501.15311">rcu_read_unlock();</tspan></text>
			
 
				+    <path
			
 
				+       style="fill:none;stroke:#000000;stroke-width:1px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1"
			
 
				+       d="m 33.941125,227.87568 724.941765,0"
			
 
				+       id="path4608"
			
 
				+       inkscape:connector-curvature="0"
			
 
				+       sodipodi:nodetypes="cc" />
			
 
				+    <text
			
 
				+       xml:space="preserve"
			
 
				+       style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:Symbol;-inkscape-font-specification:Symbol"
			
 
				+       x="394.94427"
			
 
				+       y="331.66351"
			
 
				+       id="text4648"
			
 
				+       sodipodi:linespacing="125%"><tspan
			
 
				+         sodipodi:role="line"
			
 
				+         id="tspan4650"
			
 
				+         x="394.94427"
			
 
				+         y="331.66351">QS</tspan></text>
			
 
				+    <path
			
 
				+       sodipodi:type="arc"
			
 
				+       style="fill:none;stroke:#000000;stroke-width:1;stroke-miterlimit:4;stroke-opacity:1;stroke-dasharray:none;stroke-dashoffset:0"
			
 
				+       id="path4652"
			
 
				+       sodipodi:cx="358.85669"
			
 
				+       sodipodi:cy="142.87541"
			
 
				+       sodipodi:rx="10.960155"
			
 
				+       sodipodi:ry="10.253048"
			
 
				+       d="m 358.86939,132.62237 a 10.960155,10.253048 0 1 1 -0.0228,0"
			
 
				+       transform="translate(36.441125,185.60612)"
			
 
				+       sodipodi:start="4.7135481"
			
 
				+       sodipodi:end="10.994651"
			
 
				+       sodipodi:open="true" />
			
 
				+    <text
			
 
				+       xml:space="preserve"
			
 
				+       style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:Symbol;-inkscape-font-specification:Symbol"
			
 
				+       x="112.11968"
			
 
				+       y="523.77856"
			
 
				+       id="text4648-4"
			
 
				+       sodipodi:linespacing="125%"><tspan
			
 
				+         sodipodi:role="line"
			
 
				+         id="tspan4650-4"
			
 
				+         x="112.11968"
			
 
				+         y="523.77856">QS</tspan></text>
			
 
				+    <path
			
 
				+       sodipodi:type="arc"
			
 
				+       style="fill:none;stroke:#000000;stroke-width:1;stroke-miterlimit:4;stroke-opacity:1;stroke-dasharray:none;stroke-dashoffset:0"
			
 
				+       id="path4652-7"
			
 
				+       sodipodi:cx="358.85669"
			
 
				+       sodipodi:cy="142.87541"
			
 
				+       sodipodi:rx="10.960155"
			
 
				+       sodipodi:ry="10.253048"
			
 
				+       d="m 358.86939,132.62237 a 10.960155,10.253048 0 1 1 -0.0228,0"
			
 
				+       transform="translate(-246.38346,377.72117)"
			
 
				+       sodipodi:start="4.7135481"
			
 
				+       sodipodi:end="10.994651"
			
 
				+       sodipodi:open="true" />
			
 
				+    <path
			
 
				+       sodipodi:type="arc"
			
 
				+       style="fill:#ffffff;fill-opacity:1;stroke:#000000;stroke-width:1;stroke-miterlimit:4;stroke-opacity:1;stroke-dasharray:none;stroke-dashoffset:0"
			
 
				+       id="path4652-7-7"
			
 
				+       sodipodi:cx="358.85669"
			
 
				+       sodipodi:cy="142.87541"
			
 
				+       sodipodi:rx="10.960155"
			
 
				+       sodipodi:ry="10.253048"
			
 
				+       d="m 358.86939,132.62237 a 10.960155,10.253048 0 1 1 -0.0228,0"
			
 
				+       transform="translate(-103.65246,190.90878)"
			
 
				+       sodipodi:start="4.7135481"
			
 
				+       sodipodi:end="10.994651"
			
 
				+       sodipodi:open="true" />
			
 
				+    <text
			
 
				+       xml:space="preserve"
			
 
				+       style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:Symbol;-inkscape-font-specification:Symbol"
			
 
				+       x="254.85066"
			
 
				+       y="336.96619"
			
 
				+       id="text4648-4-3"
			
 
				+       sodipodi:linespacing="125%"><tspan
			
 
				+         sodipodi:role="line"
			
 
				+         id="tspan4650-4-5"
			
 
				+         x="254.85066"
			
 
				+         y="336.96619">QS</tspan></text>
			
 
				+    <path
			
 
				+       style="fill:none;stroke:#000000;stroke-width:1;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-opacity:1;stroke-dasharray:none;stroke-dashoffset:0"
			
 
				+       d="m 470.93311,190.39903 0,504.24243"
			
 
				+       id="path4497-5-6"
			
 
				+       inkscape:connector-curvature="0"
			
 
				+       sodipodi:nodetypes="cc" />
			
 
				+    <path
			
 
				+       style="fill:none;stroke:#000000;stroke-width:1;stroke-linecap:butt;stroke-linejoin:miter;stroke-miterlimit:4;stroke-opacity:1;stroke-dasharray:none;stroke-dashoffset:0"
			
 
				+       d="m 616.22755,190.38323 0,504.24243"
			
 
				+       id="path4497-5-2"
			
 
				+       inkscape:connector-curvature="0"
			
 
				+       sodipodi:nodetypes="cc" />
			
 
				+    <g
			
 
				+       id="g4433-6"
			
 
				+       transform="translate(288.0964,78.32827)">
			
 
				+      <text
			
 
				+         sodipodi:linespacing="125%"
			
 
				+         id="text2993-7"
			
 
				+         y="-261.66608"
			
 
				+         x="440.12299"
			
 
				+         style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:Symbol;-inkscape-font-specification:Symbol"
			
 
				+         xml:space="preserve"
			
 
				+         transform="matrix(0,1,-1,0,0,0)"><tspan
			
 
				+           y="-261.66608"
			
 
				+           x="440.12299"
			
 
				+           id="tspan2995-1"
			
 
				+           sodipodi:role="line">synchronize_rcu()</tspan></text>
			
 
				+      <g
			
 
				+         id="g4417-1"
			
 
				+         transform="matrix(0,1,-1,0,730.90257,222.4928)">
			
 
				+        <path
			
 
				+           style="fill:none;stroke:#000000;stroke-width:1px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1;marker-start:url(#Arrow2Lstart);marker-end:url(#Arrow2Lend)"
			
 
				+           d="M 97.580736,477.4048 328.5624,477.07246"
			
 
				+           id="path2997-2"
			
 
				+           inkscape:connector-curvature="0"
			
 
				+           sodipodi:nodetypes="cc" />
			
 
				+        <path
			
 
				+           style="fill:none;stroke:#000000;stroke-width:1px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1"
			
 
				+           d="m 96.752718,465.38398 0,22.62742"
			
 
				+           id="path4397-3"
			
 
				+           inkscape:connector-curvature="0"
			
 
				+           sodipodi:nodetypes="cc" />
			
 
				+        <path
			
 
				+           style="fill:none;stroke:#000000;stroke-width:1px;stroke-linecap:butt;stroke-linejoin:miter;stroke-opacity:1"
			
 
				+           d="m 329.39039,465.38397 0,22.62742"
			
 
				+           id="path4397-5-4"
			
 
				+           inkscape:connector-curvature="0"
			
 
				+           sodipodi:nodetypes="cc" />
			
 
				+      </g>
			
 
				+    </g>
			
 
				+    <text
			
 
				+       xml:space="preserve"
			
 
				+       style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:Symbol;-inkscape-font-specification:Symbol"
			
 
				+       x="541.70508"
			
 
				+       y="387.6217"
			
 
				+       id="text4445-0"
			
 
				+       sodipodi:linespacing="125%"><tspan
			
 
				+         sodipodi:role="line"
			
 
				+         id="tspan4447-5"
			
 
				+         x="541.70508"
			
 
				+         y="387.6217">r3 = READ_ONCE(d);</tspan></text>
			
 
				+    <text
			
 
				+       xml:space="preserve"
			
 
				+       style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:Symbol;-inkscape-font-specification:Symbol"
			
 
				+       x="541.2406"
			
 
				+       y="646.94611"
			
 
				+       id="text4449-6"
			
 
				+       sodipodi:linespacing="125%"><tspan
			
 
				+         sodipodi:role="line"
			
 
				+         id="tspan4451-6"
			
 
				+         x="541.2406"
			
 
				+         y="646.94611">WRITE_ONCE(e, 1);</tspan></text>
			
 
				+    <path
			
 
				+       sodipodi:type="arc"
			
 
				+       style="fill:#ffffff;fill-opacity:1;stroke:#000000;stroke-width:1;stroke-miterlimit:4;stroke-opacity:1;stroke-dasharray:none;stroke-dashoffset:0"
			
 
				+       id="path4652-7-7-5"
			
 
				+       sodipodi:cx="358.85669"
			
 
				+       sodipodi:cy="142.87541"
			
 
				+       sodipodi:rx="10.960155"
			
 
				+       sodipodi:ry="10.253048"
			
 
				+       d="m 358.86939,132.62237 a 10.960155,10.253048 0 1 1 -0.0228,0"
			
 
				+       transform="translate(182.44393,281.23704)"
			
 
				+       sodipodi:start="4.7135481"
			
 
				+       sodipodi:end="10.994651"
			
 
				+       sodipodi:open="true" />
			
 
				+    <text
			
 
				+       xml:space="preserve"
			
 
				+       style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:Symbol;-inkscape-font-specification:Symbol"
			
 
				+       x="540.94702"
			
 
				+       y="427.29443"
			
 
				+       id="text4648-4-3-1"
			
 
				+       sodipodi:linespacing="125%"><tspan
			
 
				+         sodipodi:role="line"
			
 
				+         id="tspan4650-4-5-7"
			
 
				+         x="540.94702"
			
 
				+         y="427.29443">QS</tspan></text>
			
 
				+    <text
			
 
				+       xml:space="preserve"
			
 
				+       style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:Symbol;-inkscape-font-specification:Symbol"
			
 
				+       x="686.27747"
			
 
				+       y="461.83929"
			
 
				+       id="text4453-7"
			
 
				+       sodipodi:linespacing="125%"><tspan
			
 
				+         sodipodi:role="line"
			
 
				+         id="tspan4455-1"
			
 
				+         x="686.27747"
			
 
				+         y="461.83929">r4 = READ_ONCE(b);</tspan></text>
			
 
				+    <text
			
 
				+       xml:space="preserve"
			
 
				+       style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:Symbol;-inkscape-font-specification:Symbol"
			
 
				+       x="686.27747"
			
 
				+       y="669.26422"
			
 
				+       id="text4457-9"
			
 
				+       sodipodi:linespacing="125%"><tspan
			
 
				+         sodipodi:role="line"
			
 
				+         id="tspan4459-2"
			
 
				+         x="686.27747"
			
 
				+         y="669.26422">r5 = READ_ONCE(e);</tspan></text>
			
 
				+    <text
			
 
				+       xml:space="preserve"
			
 
				+       style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:Symbol;-inkscape-font-specification:Symbol"
			
 
				+       x="686.27747"
			
 
				+       y="445.04358"
			
 
				+       id="text4429-8-9-33"
			
 
				+       sodipodi:linespacing="125%"><tspan
			
 
				+         sodipodi:role="line"
			
 
				+         id="tspan4431-9-4-2"
			
 
				+         x="686.27747"
			
 
				+         y="445.04358">rcu_read_lock();</tspan></text>
			
 
				+    <text
			
 
				+       xml:space="preserve"
			
 
				+       style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:Symbol;-inkscape-font-specification:Symbol"
			
 
				+       x="686.27747"
			
 
				+       y="684.53094"
			
 
				+       id="text4429-8-9-3-8"
			
 
				+       sodipodi:linespacing="125%"><tspan
			
 
				+         sodipodi:role="line"
			
 
				+         id="tspan4431-9-4-4-5"
			
 
				+         x="686.27747"
			
 
				+         y="684.53094">rcu_read_unlock();</tspan></text>
			
 
				+    <text
			
 
				+       xml:space="preserve"
			
 
				+       style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:Symbol;-inkscape-font-specification:Symbol"
			
 
				+       x="685.11914"
			
 
				+       y="422.79153"
			
 
				+       id="text4648-9"
			
 
				+       sodipodi:linespacing="125%"><tspan
			
 
				+         sodipodi:role="line"
			
 
				+         id="tspan4650-7"
			
 
				+         x="685.11914"
			
 
				+         y="422.79153">QS</tspan></text>
			
 
				+    <path
			
 
				+       sodipodi:type="arc"
			
 
				+       style="fill:none;stroke:#000000;stroke-width:1;stroke-miterlimit:4;stroke-opacity:1;stroke-dasharray:none;stroke-dashoffset:0"
			
 
				+       id="path4652-8"
			
 
				+       sodipodi:cx="358.85669"
			
 
				+       sodipodi:cy="142.87541"
			
 
				+       sodipodi:rx="10.960155"
			
 
				+       sodipodi:ry="10.253048"
			
 
				+       d="m 358.86939,132.62237 a 10.960155,10.253048 0 1 1 -0.0228,0"
			
 
				+       transform="translate(326.61602,276.73415)"
			
 
				+       sodipodi:start="4.7135481"
			
 
				+       sodipodi:end="10.994651"
			
 
				+       sodipodi:open="true" />
			
 
				+    <text
			
 
				+       xml:space="preserve"
			
 
				+       style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:Symbol;-inkscape-font-specification:Symbol"
			
 
				+       x="397.85934"
			
 
				+       y="609.59003"
			
 
				+       id="text4648-5"
			
 
				+       sodipodi:linespacing="125%"><tspan
			
 
				+         sodipodi:role="line"
			
 
				+         id="tspan4650-77"
			
 
				+         x="397.85934"
			
 
				+         y="609.59003">QS</tspan></text>
			
 
				+    <path
			
 
				+       sodipodi:type="arc"
			
 
				+       style="fill:none;stroke:#000000;stroke-width:1;stroke-miterlimit:4;stroke-opacity:1;stroke-dasharray:none;stroke-dashoffset:0"
			
 
				+       id="path4652-80"
			
 
				+       sodipodi:cx="358.85669"
			
 
				+       sodipodi:cy="142.87541"
			
 
				+       sodipodi:rx="10.960155"
			
 
				+       sodipodi:ry="10.253048"
			
 
				+       d="m 358.86939,132.62237 a 10.960155,10.253048 0 1 1 -0.0228,0"
			
 
				+       transform="translate(39.356201,463.53264)"
			
 
				+       sodipodi:start="4.7135481"
			
 
				+       sodipodi:end="10.994651"
			
 
				+       sodipodi:open="true" />
			
 
				+    <text
			
 
				+       xml:space="preserve"
			
 
				+       style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:Symbol;-inkscape-font-specification:Symbol"
			
 
				+       x="256.75986"
			
 
				+       y="586.99133"
			
 
				+       id="text4648-5-2"
			
 
				+       sodipodi:linespacing="125%"><tspan
			
 
				+         sodipodi:role="line"
			
 
				+         id="tspan4650-77-7"
			
 
				+         x="256.75986"
			
 
				+         y="586.99133">QS</tspan></text>
			
 
				+    <path
			
 
				+       sodipodi:type="arc"
			
 
				+       style="fill:none;stroke:#000000;stroke-width:1;stroke-miterlimit:4;stroke-opacity:1;stroke-dasharray:none;stroke-dashoffset:0"
			
 
				+       id="path4652-80-5"
			
 
				+       sodipodi:cx="358.85669"
			
 
				+       sodipodi:cy="142.87541"
			
 
				+       sodipodi:rx="10.960155"
			
 
				+       sodipodi:ry="10.253048"
			
 
				+       d="m 358.86939,132.62237 a 10.960155,10.253048 0 1 1 -0.0228,0"
			
 
				+       transform="translate(-101.74328,440.93395)"
			
 
				+       sodipodi:start="4.7135481"
			
 
				+       sodipodi:end="10.994651"
			
 
				+       sodipodi:open="true" />
			
 
				+    <text
			
 
				+       xml:space="preserve"
			
 
				+       style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:Symbol;-inkscape-font-specification:Symbol"
			
 
				+       x="546.22791"
			
 
				+       y="213.91006"
			
 
				+       id="text4461-2-5"
			
 
				+       sodipodi:linespacing="125%"><tspan
			
 
				+         sodipodi:role="line"
			
 
				+         id="tspan4463-2-6"
			
 
				+         x="546.22791"
			
 
				+         y="213.91006">thread3()</tspan></text>
			
 
				+    <text
			
 
				+       xml:space="preserve"
			
 
				+       style="font-size:10px;font-style:normal;font-variant:normal;font-weight:normal;font-stretch:normal;text-align:center;line-height:125%;letter-spacing:0px;word-spacing:0px;writing-mode:lr-tb;text-anchor:middle;fill:#000000;fill-opacity:1;stroke:none;font-family:Symbol;-inkscape-font-specification:Symbol"
			
 
				+       x="684.00067"
			
 
				+       y="213.91006"
			
 
				+       id="text4461-2-1"
			
 
				+       sodipodi:linespacing="125%"><tspan
			
 
				+         sodipodi:role="line"
			
 
				+         id="tspan4463-2-0"
			
 
				+         x="684.00067"
			
 
				+         y="213.91006">thread4()</tspan></text>
			
 
				+  </g>
			
 
				+</svg>
			
--- a/Documentation/RCU/Design/Requirements/Requirements.html
+++ b/Documentation/RCU/Design/Requirements/Requirements.html
@@ -0,0 +1,2897 @@
 
				+<!-- DO NOT HAND EDIT. -->
			
 
				+<!-- Instead, edit Documentation/RCU/Design/Requirements/Requirements.htmlx and run 'sh htmlqqz.sh Documentation/RCU/Design/Requirements/Requirements' -->
			
 
				+<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN"
			
 
				+        "http://www.w3.org/TR/html4/loose.dtd">
			
 
				+        <html>
			
 
				+        <head><title>A Tour Through RCU's Requirements [LWN.net]</title>
			
 
				+        <meta HTTP-EQUIV="Content-Type" CONTENT="text/html; charset=utf-8">
			
 
				+
			
 
				+<h1>A Tour Through RCU's Requirements</h1>
			
 
				+
			
 
				+<p>Copyright IBM Corporation, 2015</p>
			
 
				+<p>Author: Paul E.&nbsp;McKenney</p>
			
 
				+<p><i>The initial version of this document appeared in the
			
 
				+<a href="https://lwn.net/">LWN</a> articles
			
 
				+<a href="https://lwn.net/Articles/652156/">here</a>,
			
 
				+<a href="https://lwn.net/Articles/652677/">here</a>, and
			
 
				+<a href="https://lwn.net/Articles/653326/">here</a>.</i></p>
			
 
				+
			
 
				+<h2>Introduction</h2>
			
 
				+
			
 
				+<p>
			
 
				+Read-copy update (RCU) is a synchronization mechanism that is often
			
 
				+used as a replacement for reader-writer locking.
			
 
				+RCU is unusual in that updaters do not block readers,
			
 
				+which means that RCU's read-side primitives can be exceedingly fast
			
 
				+and scalable.
			
 
				+In addition, updaters can make useful forward progress concurrently
			
 
				+with readers.
			
 
				+However, all this concurrency between RCU readers and updaters does raise
			
 
				+the question of exactly what RCU readers are doing, which in turn
			
 
				+raises the question of exactly what RCU's requirements are.
			
 
				+
			
 
				+<p>
			
 
				+This document therefore summarizes RCU's requirements, and can be thought
			
 
				+of as an informal, high-level specification for RCU.
			
 
				+It is important to understand that RCU's specification is primarily
			
 
				+empirical in nature;
			
 
				+in fact, I learned about many of these requirements the hard way.
			
 
				+This situation might cause some consternation; however, not only
			
 
				+has this learning process been a lot of fun, but it has also been
			
 
				+a great privilege to work with so many people willing to apply
			
 
				+technologies in interesting new ways.
			
 
				+
			
 
				+<p>
			
 
				+All that aside, here are the categories of currently known RCU requirements:
			
 
				+</p>
			
 
				+
			
 
				+<ol>
			
 
				+<li>	<a href="#Fundamental Requirements">
			
 
				+	Fundamental Requirements</a>
			
 
				+<li>	<a href="#Fundamental Non-Requirements">Fundamental Non-Requirements</a>
			
 
				+<li>	<a href="#Parallelism Facts of Life">
			
 
				+	Parallelism Facts of Life</a>
			
 
				+<li>	<a href="#Quality-of-Implementation Requirements">
			
 
				+	Quality-of-Implementation Requirements</a>
			
 
				+<li>	<a href="#Linux Kernel Complications">
			
 
				+	Linux Kernel Complications</a>
			
 
				+<li>	<a href="#Software-Engineering Requirements">
			
 
				+	Software-Engineering Requirements</a>
			
 
				+<li>	<a href="#Other RCU Flavors">
			
 
				+	Other RCU Flavors</a>
			
 
				+<li>	<a href="#Possible Future Changes">
			
 
				+	Possible Future Changes</a>
			
 
				+</ol>
			
 
				+
			
 
				+<p>
			
 
				+This is followed by a <a href="#Summary">summary</a>,
			
 
				+which is in turn followed by the inevitable
			
 
				+<a href="#Answers to Quick Quizzes">answers to the quick quizzes</a>.
			
 
				+
			
 
				+<h2><a name="Fundamental Requirements">Fundamental Requirements</a></h2>
			
 
				+
			
 
				+<p>
			
 
				+RCU's fundamental requirements are the closest thing RCU has to hard
			
 
				+mathematical requirements.
			
 
				+These are:
			
 
				+
			
 
				+<ol>
			
 
				+<li>	<a href="#Grace-Period Guarantee">
			
 
				+	Grace-Period Guarantee</a>
			
 
				+<li>	<a href="#Publish-Subscribe Guarantee">
			
 
				+	Publish-Subscribe Guarantee</a>
			
 
				+<li>	<a href="#Memory-Barrier Guarantees">
			
 
				+	Memory-Barrier Guarantees</a>
			
 
				+<li>	<a href="#RCU Primitives Guaranteed to Execute Unconditionally">
			
 
				+	RCU Primitives Guaranteed to Execute Unconditionally</a>
			
 
				+<li>	<a href="#Guaranteed Read-to-Write Upgrade">
			
 
				+	Guaranteed Read-to-Write Upgrade</a>
			
 
				+</ol>
			
 
				+
			
 
				+<h3><a name="Grace-Period Guarantee">Grace-Period Guarantee</a></h3>
			
 
				+
			
 
				+<p>
			
 
				+RCU's grace-period guarantee is unusual in being premeditated:
			
 
				+Jack Slingwine and I had this guarantee firmly in mind when we started
			
 
				+work on RCU (then called &ldquo;rclock&rdquo;) in the early 1990s.
			
 
				+That said, the past two decades of experience with RCU have produced
			
 
				+a much more detailed understanding of this guarantee.
			
 
				+
			
 
				+<p>
			
 
				+RCU's grace-period guarantee allows updaters to wait for the completion
			
 
				+of all pre-existing RCU read-side critical sections.
			
 
				+An RCU read-side critical section
			
 
				+begins with the marker <tt>rcu_read_lock()</tt> and ends with
			
 
				+the marker <tt>rcu_read_unlock()</tt>.
			
 
				+These markers may be nested, and RCU treats a nested set as one
			
 
				+big RCU read-side critical section.
			
 
				+Production-quality implementations of <tt>rcu_read_lock()</tt> and
			
 
				+<tt>rcu_read_unlock()</tt> are extremely lightweight, and in
			
 
				+fact have exactly zero overhead in Linux kernels built for production
			
 
				+use with <tt>CONFIG_PREEMPT=n</tt>.
			
 
				+
			
 
				+<p>
			
 
				+This guarantee allows ordering to be enforced with extremely low
			
 
				+overhead to readers, for example:
			
 
				+
			
 
				+<blockquote>
			
 
				+<pre>
			
 
				+ 1 int x, y;
			
 
				+ 2
			
 
				+ 3 void thread0(void)
			
 
				+ 4 {
			
 
				+ 5   rcu_read_lock();
			
 
				+ 6   r1 = READ_ONCE(x);
			
 
				+ 7   r2 = READ_ONCE(y);
			
 
				+ 8   rcu_read_unlock();
			
 
				+ 9 }
			
 
				+10
			
 
				+11 void thread1(void)
			
 
				+12 {
			
 
				+13   WRITE_ONCE(x, 1);
			
 
				+14   synchronize_rcu();
			
 
				+15   WRITE_ONCE(y, 1);
			
 
				+16 }
			
 
				+</pre>
			
 
				+</blockquote>
			
 
				+
			
 
				+<p>
			
 
				+Because the <tt>synchronize_rcu()</tt> on line&nbsp;14 waits for
			
 
				+all pre-existing readers, any instance of <tt>thread0()</tt> that
			
 
				+loads a value of zero from <tt>x</tt> must complete before
			
 
				+<tt>thread1()</tt> stores to <tt>y</tt>, so that instance must
			
 
				+also load a value of zero from <tt>y</tt>.
			
 
				+Similarly, any instance of <tt>thread0()</tt> that loads a value of
			
 
				+one from <tt>y</tt> must have started after the
			
 
				+<tt>synchronize_rcu()</tt> started, and must therefore also load
			
 
				+a value of one from <tt>x</tt>.
			
 
				+Therefore, the outcome:
			
 
				+<blockquote>
			
 
				+<pre>
			
 
				+(r1 == 0 &amp;&amp; r2 == 1)
			
 
				+</pre>
			
 
				+</blockquote>
			
 
				+cannot happen.
			
 
				+
			
 
				+<p><a name="Quick Quiz 1"><b>Quick Quiz 1</b>:</a>
			
 
				+Wait a minute!
			
 
				+You said that updaters can make useful forward progress concurrently
			
 
				+with readers, but pre-existing readers will block
			
 
				+<tt>synchronize_rcu()</tt>!!!
			
 
				+Just who are you trying to fool???
			
 
				+<br><a href="#qq1answer">Answer</a>
			
 
				+
			
 
				+<p>
			
 
				+This scenario resembles one of the first uses of RCU in
			
 
				+<a href="https://en.wikipedia.org/wiki/DYNIX">DYNIX/ptx</a>,
			
 
				+which managed a distributed lock manager's transition into
			
 
				+a state suitable for handling recovery from node failure,
			
 
				+more or less as follows:
			
 
				+
			
 
				+<blockquote>
			
 
				+<pre>
			
 
				+ 1 #define STATE_NORMAL        0
			
 
				+ 2 #define STATE_WANT_RECOVERY 1
			
 
				+ 3 #define STATE_RECOVERING    2
			
 
				+ 4 #define STATE_WANT_NORMAL   3
			
 
				+ 5
			
 
				+ 6 int state = STATE_NORMAL;
			
 
				+ 7
			
 
				+ 8 void do_something_dlm(void)
			
 
				+ 9 {
			
 
				+10   int state_snap;
			
 
				+11
			
 
				+12   rcu_read_lock();
			
 
				+13   state_snap = READ_ONCE(state);
			
 
				+14   if (state_snap == STATE_NORMAL)
			
 
				+15     do_something();
			
 
				+16   else
			
 
				+17     do_something_carefully();
			
 
				+18   rcu_read_unlock();
			
 
				+19 }
			
 
				+20
			
 
				+21 void start_recovery(void)
			
 
				+22 {
			
 
				+23   WRITE_ONCE(state, STATE_WANT_RECOVERY);
			
 
				+24   synchronize_rcu();
			
 
				+25   WRITE_ONCE(state, STATE_RECOVERING);
			
 
				+26   recovery();
			
 
				+27   WRITE_ONCE(state, STATE_WANT_NORMAL);
			
 
				+28   synchronize_rcu();
			
 
				+29   WRITE_ONCE(state, STATE_NORMAL);
			
 
				+30 }
			
 
				+</pre>
			
 
				+</blockquote>
			
 
				+
			
 
				+<p>
			
 
				+The RCU read-side critical section in <tt>do_something_dlm()</tt>
			
 
				+works with the <tt>synchronize_rcu()</tt> in <tt>start_recovery()</tt>
			
 
				+to guarantee that <tt>do_something()</tt> never runs concurrently
			
 
				+with <tt>recovery()</tt>, but with little or no synchronization
			
 
				+overhead in <tt>do_something_dlm()</tt>.
			
 
				+
			
 
				+<p><a name="Quick Quiz 2"><b>Quick Quiz 2</b>:</a>
			
 
				+Why is the <tt>synchronize_rcu()</tt> on line&nbsp;28 needed?
			
 
				+<br><a href="#qq2answer">Answer</a>
			
 
				+
			
 
				+<p>
			
 
				+In order to avoid fatal problems such as deadlocks,
			
 
				+an RCU read-side critical section must not contain calls to
			
 
				+<tt>synchronize_rcu()</tt>.
			
 
				+Similarly, an RCU read-side critical section must not
			
 
				+contain anything that waits, directly or indirectly, on completion of
			
 
				+an invocation of <tt>synchronize_rcu()</tt>.
			
 
				+
			
 
				+<p>
			
 
				+Although RCU's grace-period guarantee is useful in and of itself, with
			
 
				+<a href="https://lwn.net/Articles/573497/">quite a few use cases</a>,
			
 
				+it would be good to be able to use RCU to coordinate read-side
			
 
				+access to linked data structures.
			
 
				+For this, the grace-period guarantee is not sufficient, as can
			
 
				+be seen in function <tt>add_gp_buggy()</tt> below.
			
 
				+We will look at the reader's code later, but in the meantime, just think of
			
 
				+the reader as locklessly picking up the <tt>gp</tt> pointer,
			
 
				+and, if the value loaded is non-<tt>NULL</tt>, locklessly accessing the
			
 
				+<tt>-&gt;a</tt> and <tt>-&gt;b</tt> fields.
			
 
				+
			
 
				+<blockquote>
			
 
				+<pre>
			
 
				+ 1 bool add_gp_buggy(int a, int b)
			
 
				+ 2 {
			
 
				+ 3   p = kmalloc(sizeof(*p), GFP_KERNEL);
			
 
				+ 4   if (!p)
			
 
				+ 5     return -ENOMEM;
			
 
				+ 6   spin_lock(&amp;gp_lock);
			
 
				+ 7   if (rcu_access_pointer(gp)) {
			
 
				+ 8     spin_unlock(&amp;gp_lock);
			
 
				+ 9     return false;
			
 
				+10   }
			
 
				+11   p-&gt;a = a;
			
 
				+12   p-&gt;b = b;
			
 
				+13   gp = p; /* ORDERING BUG */
			
 
				+14   spin_unlock(&amp;gp_lock);
			
 
				+15   return true;
			
 
				+16 }
			
 
				+</pre>
			
 
				+</blockquote>
			
 
				+
			
 
				+<p>
			
 
				+The problem is that both the compiler and weakly ordered CPUs are within
			
 
				+their rights to reorder this code as follows:
			
 
				+
			
 
				+<blockquote>
			
 
				+<pre>
			
 
				+ 1 bool add_gp_buggy_optimized(int a, int b)
			
 
				+ 2 {
			
 
				+ 3   p = kmalloc(sizeof(*p), GFP_KERNEL);
			
 
				+ 4   if (!p)
			
 
				+ 5     return -ENOMEM;
			
 
				+ 6   spin_lock(&amp;gp_lock);
			
 
				+ 7   if (rcu_access_pointer(gp)) {
			
 
				+ 8     spin_unlock(&amp;gp_lock);
			
 
				+ 9     return false;
			
 
				+10   }
			
 
				+<b>11   gp = p; /* ORDERING BUG */
			
 
				+12   p-&gt;a = a;
			
 
				+13   p-&gt;b = b;</b>
			
 
				+14   spin_unlock(&amp;gp_lock);
			
 
				+15   return true;
			
 
				+16 }
			
 
				+</pre>
			
 
				+</blockquote>
			
 
				+
			
 
				+<p>
			
 
				+If an RCU reader fetches <tt>gp</tt> just after
			
 
				+<tt>add_gp_buggy_optimized()</tt> executes line&nbsp;11,
			
 
				+it will see garbage in the <tt>-&gt;a</tt> and <tt>-&gt;b</tt>
			
 
				+fields.
			
 
				+And this is but one of many ways in which compiler and hardware optimizations
			
 
				+could cause trouble.
			
 
				+Therefore, we clearly need some way to prevent the compiler and the CPU from
			
 
				+reordering in this manner, which brings us to the publish-subscribe
			
 
				+guarantee discussed in the next section.
			
 
				+
			
 
				+<h3><a name="Publish-Subscribe Guarantee">Publish-Subscribe Guarantee</a></h3>
			
 
				+
			
 
				+<p>
			
 
				+RCU's publish-subscribe guarantee allows data to be inserted
			
 
				+into a linked data structure without disrupting RCU readers.
			
 
				+The updater uses <tt>rcu_assign_pointer()</tt> to insert the
			
 
				+new data, and readers use <tt>rcu_dereference()</tt> to
			
 
				+access data, whether new or old.
			
 
				+The following shows an example of insertion:
			
 
				+
			
 
				+<blockquote>
			
 
				+<pre>
			
 
				+ 1 bool add_gp(int a, int b)
			
 
				+ 2 {
			
 
				+ 3   p = kmalloc(sizeof(*p), GFP_KERNEL);
			
 
				+ 4   if (!p)
			
 
				+ 5     return -ENOMEM;
			
 
				+ 6   spin_lock(&amp;gp_lock);
			
 
				+ 7   if (rcu_access_pointer(gp)) {
			
 
				+ 8     spin_unlock(&amp;gp_lock);
			
 
				+ 9     return false;
			
 
				+10   }
			
 
				+11   p-&gt;a = a;
			
 
				+12   p-&gt;b = b;
			
 
				+13   rcu_assign_pointer(gp, p);
			
 
				+14   spin_unlock(&amp;gp_lock);
			
 
				+15   return true;
			
 
				+16 }
			
 
				+</pre>
			
 
				+</blockquote>
			
 
				+
			
 
				+<p>
			
 
				+The <tt>rcu_assign_pointer()</tt> on line&nbsp;13 is conceptually
			
 
				+equivalent to a simple assignment statement, but also guarantees
			
 
				+that its assignment will
			
 
				+happen after the two assignments in lines&nbsp;11 and&nbsp;12,
			
 
				+similar to the C11 <tt>memory_order_release</tt> store operation.
			
 
				+It also prevents any number of &ldquo;interesting&rdquo; compiler
			
 
				+optimizations, for example, the use of <tt>gp</tt> as a scratch
			
 
				+location immediately preceding the assignment.
			
 
				+
			
 
				+<p><a name="Quick Quiz 3"><b>Quick Quiz 3</b>:</a>
			
 
				+But <tt>rcu_assign_pointer()</tt> does nothing to prevent the
			
 
				+two assignments to <tt>p-&gt;a</tt> and <tt>p-&gt;b</tt>
			
 
				+from being reordered.
			
 
				+Can't that also cause problems?
			
 
				+<br><a href="#qq3answer">Answer</a>
			
 
				+
			
 
				+<p>
			
 
				+It is tempting to assume that the reader need not do anything special
			
 
				+to control its accesses to the RCU-protected data,
			
 
				+as shown in <tt>do_something_gp_buggy()</tt> below:
			
 
				+
			
 
				+<blockquote>
			
 
				+<pre>
			
 
				+ 1 bool do_something_gp_buggy(void)
			
 
				+ 2 {
			
 
				+ 3   rcu_read_lock();
			
 
				+ 4   p = gp;  /* OPTIMIZATIONS GALORE!!! */
			
 
				+ 5   if (p) {
			
 
				+ 6     do_something(p-&gt;a, p-&gt;b);
			
 
				+ 7     rcu_read_unlock();
			
 
				+ 8     return true;
			
 
				+ 9   }
			
 
				+10   rcu_read_unlock();
			
 
				+11   return false;
			
 
				+12 }
			
 
				+</pre>
			
 
				+</blockquote>
			
 
				+
			
 
				+<p>
			
 
				+However, this temptation must be resisted because there are a
			
 
				+surprisingly large number of ways that the compiler
			
 
				+(to say nothing of
			
 
				+<a href="https://h71000.www7.hp.com/wizard/wiz_2637.html">DEC Alpha CPUs</a>)
			
 
				+can trip this code up.
			
 
				+For but one example, if the compiler were short of registers, it
			
 
				+might choose to refetch from <tt>gp</tt> rather than keeping
			
 
				+a separate copy in <tt>p</tt> as follows:
			
 
				+
			
 
				+<blockquote>
			
 
				+<pre>
			
 
				+ 1 bool do_something_gp_buggy_optimized(void)
			
 
				+ 2 {
			
 
				+ 3   rcu_read_lock();
			
 
				+ 4   if (gp) { /* OPTIMIZATIONS GALORE!!! */
			
 
				+<b> 5     do_something(gp-&gt;a, gp-&gt;b);</b>
			
 
				+ 6     rcu_read_unlock();
			
 
				+ 7     return true;
			
 
				+ 8   }
			
 
				+ 9   rcu_read_unlock();
			
 
				+10   return false;
			
 
				+11 }
			
 
				+</pre>
			
 
				+</blockquote>
			
 
				+
			
 
				+<p>
			
 
				+If this function ran concurrently with a series of updates that
			
 
				+replaced the current structure with a new one,
			
 
				+the fetches of <tt>gp-&gt;a</tt>
			
 
				+and <tt>gp-&gt;b</tt> might well come from two different structures,
			
 
				+which could cause serious confusion.
			
 
				+To prevent this (and much else besides), <tt>do_something_gp()</tt> uses
			
 
				+<tt>rcu_dereference()</tt> to fetch from <tt>gp</tt>:
			
 
				+
			
 
				+<blockquote>
			
 
				+<pre>
			
 
				+ 1 bool do_something_gp(void)
			
 
				+ 2 {
			
 
				+ 3   rcu_read_lock();
			
 
				+ 4   p = rcu_dereference(gp);
			
 
				+ 5   if (p) {
			
 
				+ 6     do_something(p-&gt;a, p-&gt;b);
			
 
				+ 7     rcu_read_unlock();
			
 
				+ 8     return true;
			
 
				+ 9   }
			
 
				+10   rcu_read_unlock();
			
 
				+11   return false;
			
 
				+12 }
			
 
				+</pre>
			
 
				+</blockquote>
			
 
				+
			
 
				+<p>
			
 
				+The <tt>rcu_dereference()</tt> uses volatile casts and (for DEC Alpha)
			
 
				+memory barriers in the Linux kernel.
			
 
				+Should a
			
 
				+<a href="http://www.rdrop.com/users/paulmck/RCU/consume.2015.07.13a.pdf">high-quality implementation of C11 <tt>memory_order_consume</tt> [PDF]</a>
			
 
				+ever appear, then <tt>rcu_dereference()</tt> could be implemented
			
 
				+as a <tt>memory_order_consume</tt> load.
			
 
				+Regardless of the exact implementation, a pointer fetched by
			
 
				+<tt>rcu_dereference()</tt> may not be used outside of the
			
 
				+outermost RCU read-side critical section containing that
			
 
				+<tt>rcu_dereference()</tt>, unless protection of
			
 
				+the corresponding data element has been passed from RCU to some
			
 
				+other synchronization mechanism, most commonly locking or
			
 
				+<a href="https://www.kernel.org/doc/Documentation/RCU/rcuref.txt">reference counting</a>.
			
 
				+
			
 
				+<p>
			
 
				+In short, updaters use <tt>rcu_assign_pointer()</tt> and readers
			
 
				+use <tt>rcu_dereference()</tt>, and these two RCU API elements
			
 
				+work together to ensure that readers have a consistent view of
			
 
				+newly added data elements.
			
 
				+
			
 
				+<p>
			
 
				+Of course, it is also necessary to remove elements from RCU-protected
			
 
				+data structures, for example, using the following process:
			
 
				+
			
 
				+<ol>
			
 
				+<li>	Remove the data element from the enclosing structure.
			
 
				+<li>	Wait for all pre-existing RCU read-side critical sections
			
 
				+	to complete (because only pre-existing readers can possibly have
			
 
				+	a reference to the newly removed data element).
			
 
				+<li>	At this point, only the updater has a reference to the
			
 
				+	newly removed data element, so it can safely reclaim
			
 
				+	the data element, for example, by passing it to <tt>kfree()</tt>.
			
 
				+</ol>
			
 
				+
			
 
				+This process is implemented by <tt>remove_gp_synchronous()</tt>:
			
 
				+
			
 
				+<blockquote>
			
 
				+<pre>
			
 
				+ 1 bool remove_gp_synchronous(void)
			
 
				+ 2 {
			
 
				+ 3   struct foo *p;
			
 
				+ 4
			
 
				+ 5   spin_lock(&amp;gp_lock);
			
 
				+ 6   p = rcu_access_pointer(gp);
			
 
				+ 7   if (!p) {
			
 
				+ 8     spin_unlock(&amp;gp_lock);
			
 
				+ 9     return false;
			
 
				+10   }
			
 
				+11   rcu_assign_pointer(gp, NULL);
			
 
				+12   spin_unlock(&amp;gp_lock);
			
 
				+13   synchronize_rcu();
			
 
				+14   kfree(p);
			
 
				+15   return true;
			
 
				+16 }
			
 
				+</pre>
			
 
				+</blockquote>
			
 
				+
			
 
				+<p>
			
 
				+This function is straightforward, with line&nbsp;13 waiting for a grace
			
 
				+period before line&nbsp;14 frees the old data element.
			
 
				+This waiting ensures that readers will reach line&nbsp;7 of
			
 
				+<tt>do_something_gp()</tt> before the data element referenced by
			
 
				+<tt>p</tt> is freed.
			
 
				+The <tt>rcu_access_pointer()</tt> on line&nbsp;6 is similar to
			
 
				+<tt>rcu_dereference()</tt>, except that:
			
 
				+
			
 
				+<ol>
			
 
				+<li>	The value returned by <tt>rcu_access_pointer()</tt>
			
 
				+	cannot be dereferenced.
			
 
				+	If you want to access the value pointed to as well as
			
 
				+	the pointer itself, use <tt>rcu_dereference()</tt>
			
 
				+	instead of <tt>rcu_access_pointer()</tt>.
			
 
				+<li>	The call to <tt>rcu_access_pointer()</tt> need not be
			
 
				+	protected.
			
 
				+	In contrast, <tt>rcu_dereference()</tt> must either be
			
 
				+	within an RCU read-side critical section or in a code
			
 
				+	segment where the pointer cannot change, for example, in
			
 
				+	code protected by the corresponding update-side lock.
			
 
				+</ol>
			
 
				+
			
 
				+<p><a name="Quick Quiz 4"><b>Quick Quiz 4</b>:</a>
			
 
				+Without the <tt>rcu_dereference()</tt> or the
			
 
				+<tt>rcu_access_pointer()</tt>, what destructive optimizations
			
 
				+might the compiler make use of?
			
 
				+<br><a href="#qq4answer">Answer</a>
			
 
				+
			
 
				+<p>
			
 
				+In short, RCU's publish-subscribe guarantee is provided by the combination
			
 
				+of <tt>rcu_assign_pointer()</tt> and <tt>rcu_dereference()</tt>.
			
 
				+This guarantee allows data elements to be safely added to RCU-protected
			
 
				+linked data structures without disrupting RCU readers.
			
 
				+This guarantee can be used in combination with the grace-period
			
 
				+guarantee to also allow data elements to be removed from RCU-protected
			
 
				+linked data structures, again without disrupting RCU readers.
			
 
				+
			
 
				+<p>
			
 
				+This guarantee was only partially premeditated.
			
 
				+DYNIX/ptx used an explicit memory barrier for publication, but had nothing
			
 
				+resembling <tt>rcu_dereference()</tt> for subscription, nor did it
			
 
				+have anything resembling the <tt>smp_read_barrier_depends()</tt>
			
 
				+that was later subsumed into <tt>rcu_dereference()</tt>.
			
 
				+The need for these operations made itself known quite suddenly at a
			
 
				+late-1990s meeting with the DEC Alpha architects, back in the days when
			
 
				+DEC was still a free-standing company.
			
 
				+It took the Alpha architects a good hour to convince me that any sort
			
 
				+of barrier would ever be needed, and it then took me a good <i>two</i> hours
			
 
				+to convince them that their documentation did not make this point clear.
			
 
				+More recent work with the C and C++ standards committees has provided
			
 
				+much education on tricks and traps from the compiler.
			
 
				+In short, compilers were much less tricky in the early 1990s, but in
			
 
				+2015, don't even think about omitting <tt>rcu_dereference()</tt>!
			
 
				+
			
 
				+<h3><a name="Memory-Barrier Guarantees">Memory-Barrier Guarantees</a></h3>
			
 
				+
			
 
				+<p>
			
 
				+The previous section's simple linked-data-structure scenario clearly
			
 
				+demonstrates the need for RCU's stringent memory-ordering guarantees on
			
 
				+systems with more than one CPU:
			
 
				+
			
 
				+<ol>
			
 
				+<li>	Each CPU that has an RCU read-side critical section that
			
 
				+	begins before <tt>synchronize_rcu()</tt> starts is
			
 
				+	guaranteed to execute a full memory barrier between the time
			
 
				+	that the RCU read-side critical section ends and the time that
			
 
				+	<tt>synchronize_rcu()</tt> returns.
			
 
				+	Without this guarantee, a pre-existing RCU read-side critical section
			
 
				+	might hold a reference to the newly removed <tt>struct foo</tt>
			
 
				+	after the <tt>kfree()</tt> on line&nbsp;14 of
			
 
				+	<tt>remove_gp_synchronous()</tt>.
			
 
				+<li>	Each CPU that has an RCU read-side critical section that ends
			
 
				+	after <tt>synchronize_rcu()</tt> returns is guaranteed
			
 
				+	to execute a full memory barrier between the time that
			
 
				+	<tt>synchronize_rcu()</tt> begins and the time that the RCU
			
 
				+	read-side critical section begins.
			
 
				+	Without this guarantee, a later RCU read-side critical section
			
 
				+	running after the <tt>kfree()</tt> on line&nbsp;14 of
			
 
				+	<tt>remove_gp_synchronous()</tt> might
			
 
				+	later run <tt>do_something_gp()</tt> and find the
			
 
				+	newly deleted <tt>struct foo</tt>.
			
 
				+<li>	If the task invoking <tt>synchronize_rcu()</tt> remains
			
 
				+	on a given CPU, then that CPU is guaranteed to execute a full
			
 
				+	memory barrier sometime during the execution of
			
 
				+	<tt>synchronize_rcu()</tt>.
			
 
				+	This guarantee ensures that the <tt>kfree()</tt> on
			
 
				+	line&nbsp;14 of <tt>remove_gp_synchronous()</tt> really does
			
 
				+	execute after the removal on line&nbsp;11.
			
 
				+<li>	If the task invoking <tt>synchronize_rcu()</tt> migrates
			
 
				+	among a group of CPUs during that invocation, then each of the
			
 
				+	CPUs in that group is guaranteed to execute a full memory barrier
			
 
				+	sometime during the execution of <tt>synchronize_rcu()</tt>.
			
 
				+	This guarantee also ensures that the <tt>kfree()</tt> on
			
 
				+	line&nbsp;14 of <tt>remove_gp_synchronous()</tt> really does
			
 
				+	execute after the removal on
			
 
				+	line&nbsp;11, but also in the case where the thread executing the
			
 
				+	<tt>synchronize_rcu()</tt> migrates in the meantime.
			
 
				+</ol>
			
 
				+
			
 
				+<p><a name="Quick Quiz 5"><b>Quick Quiz 5</b>:</a>
			
 
				+Given that multiple CPUs can start RCU read-side critical sections
			
 
				+at any time without any ordering whatsoever, how can RCU possibly tell whether
			
 
				+or not a given RCU read-side critical section starts before a
			
 
				+given instance of <tt>synchronize_rcu()</tt>?
			
 
				+<br><a href="#qq5answer">Answer</a>
			
 
				+
			
 
				+<p><a name="Quick Quiz 6"><b>Quick Quiz 6</b>:</a>
			
 
				+The first and second guarantees require unbelievably strict ordering!
			
 
				+Are all these memory barriers <i>really</i> required?
			
 
				+<br><a href="#qq6answer">Answer</a>
			
 
				+
			
 
				+<p>
			
 
				+Note that these memory-barrier requirements do not replace the fundamental
			
 
				+RCU requirement that a grace period wait for all pre-existing readers.
			
 
				+On the contrary, the memory barriers called out in this section must operate in
			
 
				+such a way as to <i>enforce</i> this fundamental requirement.
			
 
				+Of course, different implementations enforce this requirement in different
			
 
				+ways, but enforce it they must.
			
 
				+
			
 
				+<h3><a name="RCU Primitives Guaranteed to Execute Unconditionally">RCU Primitives Guaranteed to Execute Unconditionally</a></h3>
			
 
				+
			
 
				+<p>
			
 
				+The common-case RCU primitives are unconditional.
			
 
				+They are invoked, they do their job, and they return, with no possibility
			
 
				+of error, and no need to retry.
			
 
				+This is a key RCU design philosophy.
			
 
				+
			
 
				+<p>
			
 
				+However, this philosophy is pragmatic rather than pigheaded.
			
 
				+If someone comes up with a good justification for a particular conditional
			
 
				+RCU primitive, it might well be implemented and added.
			
 
				+After all, this guarantee was reverse-engineered, not premeditated.
			
 
				+The unconditional nature of the RCU primitives was initially an
			
 
				+accident of implementation, and later experience with synchronization
			
 
				+primitives with conditional primitives caused me to elevate this
			
 
				+accident to a guarantee.
			
 
				+Therefore, the justification for adding a conditional primitive to
			
 
				+RCU would need to be based on detailed and compelling use cases.
			
 
				+
			
 
				+<h3><a name="Guaranteed Read-to-Write Upgrade">Guaranteed Read-to-Write Upgrade</a></h3>
			
 
				+
			
 
				+<p>
			
 
				+As far as RCU is concerned, it is always possible to carry out an
			
 
				+update within an RCU read-side critical section.
			
 
				+For example, that RCU read-side critical section might search for
			
 
				+a given data element, and then might acquire the update-side
			
 
				+spinlock in order to update that element, all while remaining
			
 
				+in that RCU read-side critical section.
			
 
				+Of course, it is necessary to exit the RCU read-side critical section
			
 
				+before invoking <tt>synchronize_rcu()</tt>, however, this
			
 
				+inconvenience can be avoided through use of the
			
 
				+<tt>call_rcu()</tt> and <tt>kfree_rcu()</tt> API members
			
 
				+described later in this document.
			
 
				+
			
 
				+<p><a name="Quick Quiz 7"><b>Quick Quiz 7</b>:</a>
			
 
				+But how does the upgrade-to-write operation exclude other readers?
			
 
				+<br><a href="#qq7answer">Answer</a>
			
 
				+
			
 
				+<p>
			
 
				+This guarantee allows lookup code to be shared between read-side
			
 
				+and update-side code, and was premeditated, appearing in the earliest
			
 
				+DYNIX/ptx RCU documentation.
			
 
				+
			
 
				+<h2><a name="Fundamental Non-Requirements">Fundamental Non-Requirements</a></h2>
			
 
				+
			
 
				+<p>
			
 
				+RCU provides extremely lightweight readers, and its read-side guarantees,
			
 
				+though quite useful, are correspondingly lightweight.
			
 
				+It is therefore all too easy to assume that RCU is guaranteeing more
			
 
				+than it really is.
			
 
				+Of course, the list of things that RCU does not guarantee is infinitely
			
 
				+long, however, the following sections list a few non-guarantees that
			
 
				+have caused confusion.
			
 
				+Except where otherwise noted, these non-guarantees were premeditated.
			
 
				+
			
 
				+<ol>
			
 
				+<li>	<a href="#Readers Impose Minimal Ordering">
			
 
				+	Readers Impose Minimal Ordering</a>
			
 
				+<li>	<a href="#Readers Do Not Exclude Updaters">
			
 
				+	Readers Do Not Exclude Updaters</a>
			
 
				+<li>	<a href="#Updaters Only Wait For Old Readers">
			
 
				+	Updaters Only Wait For Old Readers</a>
			
 
				+<li>	<a href="#Grace Periods Don't Partition Read-Side Critical Sections">
			
 
				+	Grace Periods Don't Partition Read-Side Critical Sections</a>
			
 
				+<li>	<a href="#Read-Side Critical Sections Don't Partition Grace Periods">
			
 
				+	Read-Side Critical Sections Don't Partition Grace Periods</a>
			
 
				+<li>	<a href="#Disabling Preemption Does Not Block Grace Periods">
			
 
				+	Disabling Preemption Does Not Block Grace Periods</a>
			
 
				+</ol>
			
 
				+
			
 
				+<h3><a name="Readers Impose Minimal Ordering">Readers Impose Minimal Ordering</a></h3>
			
 
				+
			
 
				+<p>
			
 
				+Reader-side markers such as <tt>rcu_read_lock()</tt> and
			
 
				+<tt>rcu_read_unlock()</tt> provide absolutely no ordering guarantees
			
 
				+except through their interaction with the grace-period APIs such as
			
 
				+<tt>synchronize_rcu()</tt>.
			
 
				+To see this, consider the following pair of threads:
			
 
				+
			
 
				+<blockquote>
			
 
				+<pre>
			
 
				+ 1 void thread0(void)
			
 
				+ 2 {
			
 
				+ 3   rcu_read_lock();
			
 
				+ 4   WRITE_ONCE(x, 1);
			
 
				+ 5   rcu_read_unlock();
			
 
				+ 6   rcu_read_lock();
			
 
				+ 7   WRITE_ONCE(y, 1);
			
 
				+ 8   rcu_read_unlock();
			
 
				+ 9 }
			
 
				+10
			
 
				+11 void thread1(void)
			
 
				+12 {
			
 
				+13   rcu_read_lock();
			
 
				+14   r1 = READ_ONCE(y);
			
 
				+15   rcu_read_unlock();
			
 
				+16   rcu_read_lock();
			
 
				+17   r2 = READ_ONCE(x);
			
 
				+18   rcu_read_unlock();
			
 
				+19 }
			
 
				+</pre>
			
 
				+</blockquote>
			
 
				+
			
 
				+<p>
			
 
				+After <tt>thread0()</tt> and <tt>thread1()</tt> execute
			
 
				+concurrently, it is quite possible to have
			
 
				+
			
 
				+<blockquote>
			
 
				+<pre>
			
 
				+(r1 == 1 &amp;&amp; r2 == 0)
			
 
				+</pre>
			
 
				+</blockquote>
			
 
				+
			
 
				+(that is, <tt>y</tt> appears to have been assigned before <tt>x</tt>),
			
 
				+which would not be possible if <tt>rcu_read_lock()</tt> and
			
 
				+<tt>rcu_read_unlock()</tt> had much in the way of ordering
			
 
				+properties.
			
 
				+But they do not, so the CPU is within its rights
			
 
				+to do significant reordering.
			
 
				+This is by design:  Any significant ordering constraints would slow down
			
 
				+these fast-path APIs.
			
 
				+
			
 
				+<p><a name="Quick Quiz 8"><b>Quick Quiz 8</b>:</a>
			
 
				+Can't the compiler also reorder this code?
			
 
				+<br><a href="#qq8answer">Answer</a>
			
 
				+
			
 
				+<h3><a name="Readers Do Not Exclude Updaters">Readers Do Not Exclude Updaters</a></h3>
			
 
				+
			
 
				+<p>
			
 
				+Neither <tt>rcu_read_lock()</tt> nor <tt>rcu_read_unlock()</tt>
			
 
				+exclude updates.
			
 
				+All they do is to prevent grace periods from ending.
			
 
				+The following example illustrates this:
			
 
				+
			
 
				+<blockquote>
			
 
				+<pre>
			
 
				+ 1 void thread0(void)
			
 
				+ 2 {
			
 
				+ 3   rcu_read_lock();
			
 
				+ 4   r1 = READ_ONCE(y);
			
 
				+ 5   if (r1) {
			
 
				+ 6     do_something_with_nonzero_x();
			
 
				+ 7     r2 = READ_ONCE(x);
			
 
				+ 8     WARN_ON(!r2); /* BUG!!! */
			
 
				+ 9   }
			
 
				+10   rcu_read_unlock();
			
 
				+11 }
			
 
				+12
			
 
				+13 void thread1(void)
			
 
				+14 {
			
 
				+15   spin_lock(&amp;my_lock);
			
 
				+16   WRITE_ONCE(x, 1);
			
 
				+17   WRITE_ONCE(y, 1);
			
 
				+18   spin_unlock(&amp;my_lock);
			
 
				+19 }
			
 
				+</pre>
			
 
				+</blockquote>
			
 
				+
			
 
				+<p>
			
 
				+If the <tt>thread0()</tt> function's <tt>rcu_read_lock()</tt>
			
 
				+excluded the <tt>thread1()</tt> function's update,
			
 
				+the <tt>WARN_ON()</tt> could never fire.
			
 
				+But the fact is that <tt>rcu_read_lock()</tt> does not exclude
			
 
				+much of anything aside from subsequent grace periods, of which
			
 
				+<tt>thread1()</tt> has none, so the
			
 
				+<tt>WARN_ON()</tt> can and does fire.
			
 
				+
			
 
				+<h3><a name="Updaters Only Wait For Old Readers">Updaters Only Wait For Old Readers</a></h3>
			
 
				+
			
 
				+<p>
			
 
				+It might be tempting to assume that after <tt>synchronize_rcu()</tt>
			
 
				+completes, there are no readers executing.
			
 
				+This temptation must be avoided because
			
 
				+new readers can start immediately after <tt>synchronize_rcu()</tt>
			
 
				+starts, and <tt>synchronize_rcu()</tt> is under no
			
 
				+obligation to wait for these new readers.
			
 
				+
			
 
				+<p><a name="Quick Quiz 9"><b>Quick Quiz 9</b>:</a>
			
 
				+Suppose that <tt>synchronize_rcu()</tt> did wait until all readers had completed.
			
 
				+Would the updater be able to rely on this?
			
 
				+<br><a href="#qq9answer">Answer</a>
			
 
				+
			
 
				+<h3><a name="Grace Periods Don't Partition Read-Side Critical Sections">
			
 
				+Grace Periods Don't Partition Read-Side Critical Sections</a></h3>
			
 
				+
			
 
				+<p>
			
 
				+It is tempting to assume that if any part of one RCU read-side critical
			
 
				+section precedes a given grace period, and if any part of another RCU
			
 
				+read-side critical section follows that same grace period, then all of
			
 
				+the first RCU read-side critical section must precede all of the second.
			
 
				+However, this just isn't the case: A single grace period does not
			
 
				+partition the set of RCU read-side critical sections.
			
 
				+An example of this situation can be illustrated as follows, where
			
 
				+<tt>a</tt>, <tt>b</tt>, and <tt>c</tt> are initially all zero:
			
 
				+
			
 
				+<blockquote>
			
 
				+<pre>
			
 
				+ 1 void thread0(void)
			
 
				+ 2 {
			
 
				+ 3   rcu_read_lock();
			
 
				+ 4   WRITE_ONCE(a, 1);
			
 
				+ 5   WRITE_ONCE(b, 1);
			
 
				+ 6   rcu_read_unlock();
			
 
				+ 7 }
			
 
				+ 8
			
 
				+ 9 void thread1(void)
			
 
				+10 {
			
 
				+11   r1 = READ_ONCE(a);
			
 
				+12   synchronize_rcu();
			
 
				+13   WRITE_ONCE(c, 1);
			
 
				+14 }
			
 
				+15
			
 
				+16 void thread2(void)
			
 
				+17 {
			
 
				+18   rcu_read_lock();
			
 
				+19   r2 = READ_ONCE(b);
			
 
				+20   r3 = READ_ONCE(c);
			
 
				+21   rcu_read_unlock();
			
 
				+22 }
			
 
				+</pre>
			
 
				+</blockquote>
			
 
				+
			
 
				+<p>
			
 
				+It turns out that the outcome:
			
 
				+
			
 
				+<blockquote>
			
 
				+<pre>
			
 
				+(r1 == 1 &amp;&amp; r2 == 0 &amp;&amp; r3 == 1)
			
 
				+</pre>
			
 
				+</blockquote>
			
 
				+
			
 
				+is entirely possible.
			
 
				+The following figure shows how this can happen, with each circled
			
 
				+<tt>QS</tt> indicating the point at which RCU recorded a
			
 
				+<i>quiescent state</i> for each thread, that is, a state in which
			
 
				+RCU knows that the thread cannot be in the midst of an RCU read-side
			
 
				+critical section that started before the current grace period:
			
 
				+
			
 
				+<p><img src="GPpartitionReaders1.svg" alt="GPpartitionReaders1.svg" width="60%"></p>
			
 
				+
			
 
				+<p>
			
 
				+If it is necessary to partition RCU read-side critical sections in this
			
 
				+manner, it is necessary to use two grace periods, where the first
			
 
				+grace period is known to end before the second grace period starts:
			
 
				+
			
 
				+<blockquote>
			
 
				+<pre>
			
 
				+ 1 void thread0(void)
			
 
				+ 2 {
			
 
				+ 3   rcu_read_lock();
			
 
				+ 4   WRITE_ONCE(a, 1);
			
 
				+ 5   WRITE_ONCE(b, 1);
			
 
				+ 6   rcu_read_unlock();
			
 
				+ 7 }
			
 
				+ 8
			
 
				+ 9 void thread1(void)
			
 
				+10 {
			
 
				+11   r1 = READ_ONCE(a);
			
 
				+12   synchronize_rcu();
			
 
				+13   WRITE_ONCE(c, 1);
			
 
				+14 }
			
 
				+15
			
 
				+16 void thread2(void)
			
 
				+17 {
			
 
				+18   r2 = READ_ONCE(c);
			
 
				+19   synchronize_rcu();
			
 
				+20   WRITE_ONCE(d, 1);
			
 
				+21 }
			
 
				+22
			
 
				+23 void thread3(void)
			
 
				+24 {
			
 
				+25   rcu_read_lock();
			
 
				+26   r3 = READ_ONCE(b);
			
 
				+27   r4 = READ_ONCE(d);
			
 
				+28   rcu_read_unlock();
			
 
				+29 }
			
 
				+</pre>
			
 
				+</blockquote>
			
 
				+
			
 
				+<p>
			
 
				+Here, if <tt>(r1 == 1)</tt>, then
			
 
				+<tt>thread0()</tt>'s write to <tt>b</tt> must happen
			
 
				+before the end of <tt>thread1()</tt>'s grace period.
			
 
				+If in addition <tt>(r4 == 1)</tt>, then
			
 
				+<tt>thread3()</tt>'s read from <tt>b</tt> must happen
			
 
				+after the beginning of <tt>thread2()</tt>'s grace period.
			
 
				+If it is also the case that <tt>(r2 == 1)</tt>, then the
			
 
				+end of <tt>thread1()</tt>'s grace period must precede the
			
 
				+beginning of <tt>thread2()</tt>'s grace period.
			
 
				+This means that the two RCU read-side critical sections cannot overlap,
			
 
				+guaranteeing that <tt>(r3 == 1)</tt>.
			
 
				+As a result, the outcome:
			
 
				+
			
 
				+<blockquote>
			
 
				+<pre>
			
 
				+(r1 == 1 &amp;&amp; r2 == 1 &amp;&amp; r3 == 0 &amp;&amp; r4 == 1)
			
 
				+</pre>
			
 
				+</blockquote>
			
 
				+
			
 
				+cannot happen.
			
 
				+
			
 
				+<p>
			
 
				+This non-requirement was also non-premeditated, but became apparent
			
 
				+when studying RCU's interaction with memory ordering.
			
 
				+
			
 
				+<h3><a name="Read-Side Critical Sections Don't Partition Grace Periods">
			
 
				+Read-Side Critical Sections Don't Partition Grace Periods</a></h3>
			
 
				+
			
 
				+<p>
			
 
				+It is also tempting to assume that if an RCU read-side critical section
			
 
				+happens between a pair of grace periods, then those grace periods cannot
			
 
				+overlap.
			
 
				+However, this temptation leads nowhere good, as can be illustrated by
			
 
				+the following, with all variables initially zero:
			
 
				+
			
 
				+<blockquote>
			
 
				+<pre>
			
 
				+ 1 void thread0(void)
			
 
				+ 2 {
			
 
				+ 3   rcu_read_lock();
			
 
				+ 4   WRITE_ONCE(a, 1);
			
 
				+ 5   WRITE_ONCE(b, 1);
			
 
				+ 6   rcu_read_unlock();
			
 
				+ 7 }
			
 
				+ 8
			
 
				+ 9 void thread1(void)
			
 
				+10 {
			
 
				+11   r1 = READ_ONCE(a);
			
 
				+12   synchronize_rcu();
			
 
				+13   WRITE_ONCE(c, 1);
			
 
				+14 }
			
 
				+15
			
 
				+16 void thread2(void)
			
 
				+17 {
			
 
				+18   rcu_read_lock();
			
 
				+19   WRITE_ONCE(d, 1);
			
 
				+20   r2 = READ_ONCE(c);
			
 
				+21   rcu_read_unlock();
			
 
				+22 }
			
 
				+23
			
 
				+24 void thread3(void)
			
 
				+25 {
			
 
				+26   r3 = READ_ONCE(d);
			
 
				+27   synchronize_rcu();
			
 
				+28   WRITE_ONCE(e, 1);
			
 
				+29 }
			
 
				+30
			
 
				+31 void thread4(void)
			
 
				+32 {
			
 
				+33   rcu_read_lock();
			
 
				+34   r4 = READ_ONCE(b);
			
 
				+35   r5 = READ_ONCE(e);
			
 
				+36   rcu_read_unlock();
			
 
				+37 }
			
 
				+</pre>
			
 
				+</blockquote>
			
 
				+
			
 
				+<p>
			
 
				+In this case, the outcome:
			
 
				+
			
 
				+<blockquote>
			
 
				+<pre>
			
 
				+(r1 == 1 &amp;&amp; r2 == 1 &amp;&amp; r3 == 1 &amp;&amp; r4 == 0 &amp;&amp; r5 == 1)
			
 
				+</pre>
			
 
				+</blockquote>
			
 
				+
			
 
				+is entirely possible, as illustrated below:
			
 
				+
			
 
				+<p><img src="ReadersPartitionGP1.svg" alt="ReadersPartitionGP1.svg" width="100%"></p>
			
 
				+
			
 
				+<p>
			
 
				+Again, an RCU read-side critical section can overlap almost all of a
			
 
				+given grace period, just so long as it does not overlap the entire
			
 
				+grace period.
			
 
				+As a result, an RCU read-side critical section cannot partition a pair
			
 
				+of RCU grace periods.
			
 
				+
			
 
				+<p><a name="Quick Quiz 10"><b>Quick Quiz 10</b>:</a>
			
 
				+How long a sequence of grace periods, each separated by an RCU read-side
			
 
				+critical section, would be required to partition the RCU read-side
			
 
				+critical sections at the beginning and end of the chain?
			
 
				+<br><a href="#qq10answer">Answer</a>
			
 
				+
			
 
				+<h3><a name="Disabling Preemption Does Not Block Grace Periods">
			
 
				+Disabling Preemption Does Not Block Grace Periods</a></h3>
			
 
				+
			
 
				+<p>
			
 
				+There was a time when disabling preemption on any given CPU would block
			
 
				+subsequent grace periods.
			
 
				+However, this was an accident of implementation and is not a requirement.
			
 
				+And in the current Linux-kernel implementation, disabling preemption
			
 
				+on a given CPU in fact does not block grace periods, as Oleg Nesterov
			
 
				+<a href="https://lkml.kernel.org/g/20150614193825.GA19582@redhat.com">demonstrated</a>.
			
 
				+
			
 
				+<p>
			
 
				+If you need a preempt-disable region to block grace periods, you need to add
			
 
				+<tt>rcu_read_lock()</tt> and <tt>rcu_read_unlock()</tt>, for example
			
 
				+as follows:
			
 
				+
			
 
				+<blockquote>
			
 
				+<pre>
			
 
				+ 1 preempt_disable();
			
 
				+ 2 rcu_read_lock();
			
 
				+ 3 do_something();
			
 
				+ 4 rcu_read_unlock();
			
 
				+ 5 preempt_enable();
			
 
				+ 6
			
 
				+ 7 /* Spinlocks implicitly disable preemption. */
			
 
				+ 8 spin_lock(&amp;mylock);
			
 
				+ 9 rcu_read_lock();
			
 
				+10 do_something();
			
 
				+11 rcu_read_unlock();
			
 
				+12 spin_unlock(&amp;mylock);
			
 
				+</pre>
			
 
				+</blockquote>
			
 
				+
			
 
				+<p>
			
 
				+In theory, you could enter the RCU read-side critical section first,
			
 
				+but it is more efficient to keep the entire RCU read-side critical
			
 
				+section contained in the preempt-disable region as shown above.
			
 
				+Of course, RCU read-side critical sections that extend outside of
			
 
				+preempt-disable regions will work correctly, but such critical sections
			
 
				+can be preempted, which forces <tt>rcu_read_unlock()</tt> to do
			
 
				+more work.
			
 
				+And no, this is <i>not</i> an invitation to enclose all of your RCU
			
 
				+read-side critical sections within preempt-disable regions, because
			
 
				+doing so would degrade real-time response.
			
 
				+
			
 
				+<p>
			
 
				+This non-requirement appeared with preemptible RCU.
			
 
				+If you need a grace period that waits on non-preemptible code regions, use
			
 
				+<a href="#Sched Flavor">RCU-sched</a>.
			
 
				+
			
 
				+<h2><a name="Parallelism Facts of Life">Parallelism Facts of Life</a></h2>
			
 
				+
			
 
				+<p>
			
 
				+These parallelism facts of life are by no means specific to RCU, but
			
 
				+the RCU implementation must abide by them.
			
 
				+They therefore bear repeating:
			
 
				+
			
 
				+<ol>
			
 
				+<li>	Any CPU or task may be delayed at any time,
			
 
				+	and any attempts to avoid these delays by disabling
			
 
				+	preemption, interrupts, or whatever are completely futile.
			
 
				+	This is most obvious in preemptible user-level
			
 
				+	environments and in virtualized environments (where
			
 
				+	a given guest OS's VCPUs can be preempted at any time by
			
 
				+	the underlying hypervisor), but can also happen in bare-metal
			
 
				+	environments due to ECC errors, NMIs, and other hardware
			
 
				+	events.
			
 
				+	Although a delay of more than about 20 seconds can result
			
 
				+	in splats, the RCU implementation is obligated to use
			
 
				+	algorithms that can tolerate extremely long delays, but where
			
 
				+	&ldquo;extremely long&rdquo; is not long enough to allow
			
 
				+	wrap-around when incrementing a 64-bit counter.
			
 
				+<li>	Both the compiler and the CPU can reorder memory accesses.
			
 
				+	Where it matters, RCU must use compiler directives and
			
 
				+	memory-barrier instructions to preserve ordering.
			
 
				+<li>	Conflicting writes to memory locations in any given cache line
			
 
				+	will result in expensive cache misses.
			
 
				+	Greater numbers of concurrent writes and more-frequent
			
 
				+	concurrent writes will result in more dramatic slowdowns.
			
 
				+	RCU is therefore obligated to use algorithms that have
			
 
				+	sufficient locality to avoid significant performance and
			
 
				+	scalability problems.
			
 
				+<li>	As a rough rule of thumb, only one CPU's worth of processing
			
 
				+	may be carried out under the protection of any given exclusive
			
 
				+	lock.
			
 
				+	RCU must therefore use scalable locking designs.
			
 
				+<li>	Counters are finite, especially on 32-bit systems.
			
 
				+	RCU's use of counters must therefore tolerate counter wrap,
			
 
				+	or be designed such that counter wrap would take way more
			
 
				+	time than a single system is likely to run.
			
 
				+	An uptime of ten years is quite possible, a runtime
			
 
				+	of a century much less so.
			
 
				+	As an example of the latter, RCU's dyntick-idle nesting counter
			
 
				+	allows 54 bits for interrupt nesting level (this counter
			
 
				+	is 64 bits even on a 32-bit system).
			
 
				+	Overflowing this counter requires 2<sup>54</sup>
			
 
				+	half-interrupts on a given CPU without that CPU ever going idle.
			
 
				+	If a half-interrupt happened every microsecond, it would take
			
 
				+	570 years of runtime to overflow this counter, which is currently
			
 
				+	believed to be an acceptably long time.
			
 
				+<li>	Linux systems can have thousands of CPUs running a single
			
 
				+	Linux kernel in a single shared-memory environment.
			
 
				+	RCU must therefore pay close attention to high-end scalability.
			
 
				+</ol>
			
 
				+
			
 
				+<p>
			
 
				+This last parallelism fact of life means that RCU must pay special
			
 
				+attention to the preceding facts of life.
			
 
				+The idea that Linux might scale to systems with thousands of CPUs would
			
 
				+have been met with some skepticism in the 1990s, but these requirements
			
 
				+would otherwise have been unsurprising, even in the early 1990s.
			
 
				+
			
 
				+<h2><a name="Quality-of-Implementation Requirements">Quality-of-Implementation Requirements</a></h2>
			
 
				+
			
 
				+<p>
			
 
				+These sections list quality-of-implementation requirements.
			
 
				+Although an RCU implementation that ignores these requirements could
			
 
				+still be used, it would likely be subject to limitations that would
			
 
				+make it inappropriate for industrial-strength production use.
			
 
				+Classes of quality-of-implementation requirements are as follows:
			
 
				+
			
 
				+<ol>
			
 
				+<li>	<a href="#Specialization">Specialization</a>
			
 
				+<li>	<a href="#Performance and Scalability">Performance and Scalability</a>
			
 
				+<li>	<a href="#Composability">Composability</a>
			
 
				+<li>	<a href="#Corner Cases">Corner Cases</a>
			
 
				+</ol>
			
 
				+
			
 
				+<p>
			
 
				+These classes are covered in the following sections.
			
 
				+
			
 
				+<h3><a name="Specialization">Specialization</a></h3>
			
 
				+
			
 
				+<p>
			
 
				+RCU is and always has been intended primarily for read-mostly situations, as
			
 
				+illustrated by the following figure.
			
 
				+This means that RCU's read-side primitives are optimized, often at the
			
 
				+expense of its update-side primitives.
			
 
				+
			
 
				+<p><img src="RCUApplicability.svg" alt="RCUApplicability.svg" width="70%"></p>
			
 
				+
			
 
				+<p>
			
 
				+This focus on read-mostly situations means that RCU must interoperate
			
 
				+with other synchronization primitives.
			
 
				+For example, the <tt>add_gp()</tt> and <tt>remove_gp_synchronous()</tt>
			
 
				+examples discussed earlier use RCU to protect readers and locking to
			
 
				+coordinate updaters.
			
 
				+However, the need extends much farther, requiring that a variety of
			
 
				+synchronization primitives be legal within RCU read-side critical sections,
			
 
				+including spinlocks, sequence locks, atomic operations, reference
			
 
				+counters, and memory barriers.
			
 
				+
			
 
				+<p><a name="Quick Quiz 11"><b>Quick Quiz 11</b>:</a>
			
 
				+What about sleeping locks?
			
 
				+<br><a href="#qq11answer">Answer</a>
			
 
				+
			
 
				+<p>
			
 
				+It often comes as a surprise that many algorithms do not require a
			
 
				+consistent view of data, but many can function in that mode,
			
 
				+with network routing being the poster child.
			
 
				+Internet routing algorithms take significant time to propagate
			
 
				+updates, so that by the time an update arrives at a given system,
			
 
				+that system has been sending network traffic the wrong way for
			
 
				+a considerable length of time.
			
 
				+Having a few threads continue to send traffic the wrong way for a
			
 
				+few more milliseconds is clearly not a problem:  In the worst case,
			
 
				+TCP retransmissions will eventually get the data where it needs to go.
			
 
				+In general, when tracking the state of the universe outside of the
			
 
				+computer, some level of inconsistency must be tolerated due to
			
 
				+speed-of-light delays if nothing else.
			
 
				+
			
 
				+<p>
			
 
				+Furthermore, uncertainty about external state is inherent in many cases.
			
 
				+For example, a pair of veterinarians might use heartbeat to determine
			
 
				+whether or not a given cat was alive.
			
 
				+But how long should they wait after the last heartbeat to decide that
			
 
				+the cat is in fact dead?
			
 
				+Waiting less than 400 milliseconds makes no sense because this would
			
 
				+mean that a relaxed cat would be considered to cycle between death
			
 
				+and life more than 100 times per minute.
			
 
				+Moreover, just as with human beings, a cat's heart might stop for
			
 
				+some period of time, so the exact wait period is a judgment call.
			
 
				+One of our pair of veterinarians might wait 30 seconds before pronouncing
			
 
				+the cat dead, while the other might insist on waiting a full minute.
			
 
				+The two veterinarians would then disagree on the state of the cat during
			
 
				+the final 30 seconds of the minute following the last heartbeat, as
			
 
				+fancifully illustrated below:
			
 
				+
			
 
				+<p><img src="2013-08-is-it-dead.png" alt="2013-08-is-it-dead.png" width="431"></p>
			
 
				+
			
 
				+<p>
			
 
				+Interestingly enough, this same situation applies to hardware.
			
 
				+When push comes to shove, how do we tell whether or not some
			
 
				+external server has failed?
			
 
				+We send messages to it periodically, and declare it failed if we
			
 
				+don't receive a response within a given period of time.
			
 
				+Policy decisions can usually tolerate short
			
 
				+periods of inconsistency.
			
 
				+The policy was decided some time ago, and is only now being put into
			
 
				+effect, so a few milliseconds of delay is normally inconsequential.
			
 
				+
			
 
				+<p>
			
 
				+However, there are algorithms that absolutely must see consistent data.
			
 
				+For example, the translation from a user-level SystemV semaphore
			
 
				+ID to the corresponding in-kernel data structure is protected by RCU,
			
 
				+but it is absolutely forbidden to update a semaphore that has just been
			
 
				+removed.
			
 
				+In the Linux kernel, this need for consistency is accommodated by acquiring
			
 
				+spinlocks located in the in-kernel data structure from within
			
 
				+the RCU read-side critical section, and this is indicated by the
			
 
				+green box in the figure above.
			
 
				+Many other techniques may be used, and are in fact used within the
			
 
				+Linux kernel.
			
 
				+
			
 
				+<p>
			
 
				+In short, RCU is not required to maintain consistency, and other
			
 
				+mechanisms may be used in concert with RCU when consistency is required.
			
 
				+RCU's specialization allows it to do its job extremely well, and its
			
 
				+ability to interoperate with other synchronization mechanisms allows
			
 
				+the right mix of synchronization tools to be used for a given job.
			
 
				+
			
 
				+<h3><a name="Performance and Scalability">Performance and Scalability</a></h3>
			
 
				+
			
 
				+<p>
			
 
				+Energy efficiency is a critical component of performance today,
			
 
				+and Linux-kernel RCU implementations must therefore avoid unnecessarily
			
 
				+awakening idle CPUs.
			
 
				+I cannot claim that this requirement was premeditated.
			
 
				+In fact, I learned of it during a telephone conversation in which I
			
 
				+was given &ldquo;frank and open&rdquo; feedback on the importance
			
 
				+of energy efficiency in battery-powered systems and on specific
			
 
				+energy-efficiency shortcomings of the Linux-kernel RCU implementation.
			
 
				+In my experience, the battery-powered embedded community will consider
			
 
				+any unnecessary wakeups to be extremely unfriendly acts.
			
 
				+So much so that mere Linux-kernel-mailing-list posts are
			
 
				+insufficient to vent their ire.
			
 
				+
			
 
				+<p>
			
 
				+Memory consumption is not particularly important in most
			
 
				+situations, and has become decreasingly
			
 
				+so as memory sizes have expanded and memory
			
 
				+costs have plummeted.
			
 
				+However, as I learned from Matt Mackall's
			
 
				+<a href="http://elinux.org/Linux_Tiny-FAQ">bloatwatch</a>
			
 
				+efforts, memory footprint is critically important on single-CPU systems with
			
 
				+non-preemptible (<tt>CONFIG_PREEMPT=n</tt>) kernels, and thus
			
 
				+<a href="https://lkml.kernel.org/g/20090113221724.GA15307@linux.vnet.ibm.com">tiny RCU</a>
			
 
				+was born.
			
 
				+Josh Triplett has since taken over the small-memory banner with his
			
 
				+<a href="https://tiny.wiki.kernel.org/">Linux kernel tinification</a>
			
 
				+project, which resulted in
			
 
				+<a href="#Sleepable RCU">SRCU</a>
			
 
				+becoming optional for those kernels not needing it.
			
 
				+
			
 
				+<p>
			
 
				+The remaining performance requirements are, for the most part,
			
 
				+unsurprising.
			
 
				+For example, in keeping with RCU's read-side specialization,
			
 
				+<tt>rcu_dereference()</tt> should have negligible overhead (for
			
 
				+example, suppression of a few minor compiler optimizations).
			
 
				+Similarly, in non-preemptible environments, <tt>rcu_read_lock()</tt> and
			
 
				+<tt>rcu_read_unlock()</tt> should have exactly zero overhead.
			
 
				+
			
 
				+<p>
			
 
				+In preemptible environments, in the case where the RCU read-side
			
 
				+critical section was not preempted (as will be the case for the
			
 
				+highest-priority real-time process), <tt>rcu_read_lock()</tt> and
			
 
				+<tt>rcu_read_unlock()</tt> should have minimal overhead.
			
 
				+In particular, they should not contain atomic read-modify-write
			
 
				+operations, memory-barrier instructions, preemption disabling,
			
 
				+interrupt disabling, or backwards branches.
			
 
				+However, in the case where the RCU read-side critical section was preempted,
			
 
				+<tt>rcu_read_unlock()</tt> may acquire spinlocks and disable interrupts.
			
 
				+This is why it is better to nest an RCU read-side critical section
			
 
				+within a preempt-disable region than vice versa, at least in cases
			
 
				+where that critical section is short enough to avoid unduly degrading
			
 
				+real-time latencies.
			
 
				+
			
 
				+<p>
			
 
				+The <tt>synchronize_rcu()</tt> grace-period-wait primitive is
			
 
				+optimized for throughput.
			
 
				+It may therefore incur several milliseconds of latency in addition to
			
 
				+the duration of the longest RCU read-side critical section.
			
 
				+On the other hand, multiple concurrent invocations of
			
 
				+<tt>synchronize_rcu()</tt> are required to use batching optimizations
			
 
				+so that they can be satisfied by a single underlying grace-period-wait
			
 
				+operation.
			
 
				+For example, in the Linux kernel, it is not unusual for a single
			
 
				+grace-period-wait operation to serve more than
			
 
				+<a href="https://www.usenix.org/conference/2004-usenix-annual-technical-conference/making-rcu-safe-deep-sub-millisecond-response">1,000 separate invocations</a>
			
 
				+of <tt>synchronize_rcu()</tt>, thus amortizing the per-invocation
			
 
				+overhead down to nearly zero.
			
 
				+However, the grace-period optimization is also required to avoid
			
 
				+measurable degradation of real-time scheduling and interrupt latencies.
			
 
				+
			
 
				+<p>
			
 
				+In some cases, the multi-millisecond <tt>synchronize_rcu()</tt>
			
 
				+latencies are unacceptable.
			
 
				+In these cases, <tt>synchronize_rcu_expedited()</tt> may be used
			
 
				+instead, reducing the grace-period latency down to a few tens of
			
 
				+microseconds on small systems, at least in cases where the RCU read-side
			
 
				+critical sections are short.
			
 
				+There are currently no special latency requirements for
			
 
				+<tt>synchronize_rcu_expedited()</tt> on large systems, but,
			
 
				+consistent with the empirical nature of the RCU specification,
			
 
				+that is subject to change.
			
 
				+However, there most definitely are scalability requirements:
			
 
				+A storm of <tt>synchronize_rcu_expedited()</tt> invocations on 4096
			
 
				+CPUs should at least make reasonable forward progress.
			
 
				+In return for its shorter latencies, <tt>synchronize_rcu_expedited()</tt>
			
 
				+is permitted to impose modest degradation of real-time latency
			
 
				+on non-idle online CPUs.
			
 
				+That said, it will likely be necessary to take further steps to reduce this
			
 
				+degradation, hopefully to roughly that of a scheduling-clock interrupt.
			
 
				+
			
 
				+<p>
			
 
				+There are a number of situations where even
			
 
				+<tt>synchronize_rcu_expedited()</tt>'s reduced grace-period
			
 
				+latency is unacceptable.
			
 
				+In these situations, the asynchronous <tt>call_rcu()</tt> can be
			
 
				+used in place of <tt>synchronize_rcu()</tt> as follows:
			
 
				+
			
 
				+<blockquote>
			
 
				+<pre>
			
 
				+ 1 struct foo {
			
 
				+ 2   int a;
			
 
				+ 3   int b;
			
 
				+ 4   struct rcu_head rh;
			
 
				+ 5 };
			
 
				+ 6
			
 
				+ 7 static void remove_gp_cb(struct rcu_head *rhp)
			
 
				+ 8 {
			
 
				+ 9   struct foo *p = container_of(rhp, struct foo, rh);
			
 
				+10
			
 
				+11   kfree(p);
			
 
				+12 }
			
 
				+13
			
 
				+14 bool remove_gp_asynchronous(void)
			
 
				+15 {
			
 
				+16   struct foo *p;
			
 
				+17
			
 
				+18   spin_lock(&amp;gp_lock);
			
 
				+19   p = rcu_access_pointer(gp);
			
 
				+20   if (!p) {
			
 
				+21     spin_unlock(&amp;gp_lock);
			
 
				+22     return false;
			
 
				+23   }
			
 
				+24   rcu_assign_pointer(gp, NULL);
			
 
				+25   call_rcu(&amp;p-&gt;rh, remove_gp_cb);
			
 
				+26   spin_unlock(&amp;gp_lock);
			
 
				+27   return true;
			
 
				+28 }
			
 
				+</pre>
			
 
				+</blockquote>
			
 
				+
			
 
				+<p>
			
 
				+A definition of <tt>struct foo</tt> is finally needed, and appears
			
 
				+on lines&nbsp;1-5.
			
 
				+The function <tt>remove_gp_cb()</tt> is passed to <tt>call_rcu()</tt>
			
 
				+on line&nbsp;25, and will be invoked after the end of a subsequent
			
 
				+grace period.
			
 
				+This gets the same effect as <tt>remove_gp_synchronous()</tt>,
			
 
				+but without forcing the updater to wait for a grace period to elapse.
			
 
				+The <tt>call_rcu()</tt> function may be used in a number of
			
 
				+situations where neither <tt>synchronize_rcu()</tt> nor
			
 
				+<tt>synchronize_rcu_expedited()</tt> would be legal,
			
 
				+including within preempt-disable code, <tt>local_bh_disable()</tt> code,
			
 
				+interrupt-disable code, and interrupt handlers.
			
 
				+However, even <tt>call_rcu()</tt> is illegal within NMI handlers.
			
 
				+The callback function (<tt>remove_gp_cb()</tt> in this case) will be
			
 
				+executed within a softirq (software interrupt) environment within the
			
 
				+Linux kernel,
			
 
				+either within a real softirq handler or under the protection
			
 
				+of <tt>local_bh_disable()</tt>.
			
 
				+In both the Linux kernel and in userspace, it is bad practice to
			
 
				+write an RCU callback function that takes too long.
			
 
				+Long-running operations should be relegated to separate threads or
			
 
				+(in the Linux kernel) workqueues.
			
 
				+
			
 
				+<p><a name="Quick Quiz 12"><b>Quick Quiz 12</b>:</a>
			
 
				+Why does line&nbsp;19 use <tt>rcu_access_pointer()</tt>?
			
 
				+After all, <tt>call_rcu()</tt> on line&nbsp;25 stores into the
			
 
				+structure, which would interact badly with concurrent insertions.
			
 
				+Doesn't this mean that <tt>rcu_dereference()</tt> is required?
			
 
				+<br><a href="#qq12answer">Answer</a>
			
 
				+
			
 
				+<p>
			
 
				+However, all that <tt>remove_gp_cb()</tt> is doing is
			
 
				+invoking <tt>kfree()</tt> on the data element.
			
 
				+This is a common idiom, and is supported by <tt>kfree_rcu()</tt>,
			
 
				+which allows &ldquo;fire and forget&rdquo; operation as shown below:
			
 
				+
			
 
				+<blockquote>
			
 
				+<pre>
			
 
				+ 1 struct foo {
			
 
				+ 2   int a;
			
 
				+ 3   int b;
			
 
				+ 4   struct rcu_head rh;
			
 
				+ 5 };
			
 
				+ 6
			
 
				+ 7 bool remove_gp_faf(void)
			
 
				+ 8 {
			
 
				+ 9   struct foo *p;
			
 
				+10
			
 
				+11   spin_lock(&amp;gp_lock);
			
 
				+12   p = rcu_dereference(gp);
			
 
				+13   if (!p) {
			
 
				+14     spin_unlock(&amp;gp_lock);
			
 
				+15     return false;
			
 
				+16   }
			
 
				+17   rcu_assign_pointer(gp, NULL);
			
 
				+18   kfree_rcu(p, rh);
			
 
				+19   spin_unlock(&amp;gp_lock);
			
 
				+20   return true;
			
 
				+21 }
			
 
				+</pre>
			
 
				+</blockquote>
			
 
				+
			
 
				+<p>
			
 
				+Note that <tt>remove_gp_faf()</tt> simply invokes
			
 
				+<tt>kfree_rcu()</tt> and proceeds, without any need to pay any
			
 
				+further attention to the subsequent grace period and <tt>kfree()</tt>.
			
 
				+It is permissible to invoke <tt>kfree_rcu()</tt> from the same
			
 
				+environments as for <tt>call_rcu()</tt>.
			
 
				+Interestingly enough, DYNIX/ptx had the equivalents of
			
 
				+<tt>call_rcu()</tt> and <tt>kfree_rcu()</tt>, but not
			
 
				+<tt>synchronize_rcu()</tt>.
			
 
				+This was due to the fact that RCU was not heavily used within DYNIX/ptx,
			
 
				+so the very few places that needed something like
			
 
				+<tt>synchronize_rcu()</tt> simply open-coded it.
			
 
				+
			
 
				+<p><a name="Quick Quiz 13"><b>Quick Quiz 13</b>:</a>
			
 
				+Earlier it was claimed that <tt>call_rcu()</tt> and
			
 
				+<tt>kfree_rcu()</tt> allowed updaters to avoid being blocked
			
 
				+by readers.
			
 
				+But how can that be correct, given that the invocation of the callback
			
 
				+and the freeing of the memory (respectively) must still wait for
			
 
				+a grace period to elapse?
			
 
				+<br><a href="#qq13answer">Answer</a>
			
 
				+
			
 
				+<p>
			
 
				+But what if the updater must wait for the completion of code to be
			
 
				+executed after the end of the grace period, but has other tasks
			
 
				+that can be carried out in the meantime?
			
 
				+The polling-style <tt>get_state_synchronize_rcu()</tt> and
			
 
				+<tt>cond_synchronize_rcu()</tt> functions may be used for this
			
 
				+purpose, as shown below:
			
 
				+
			
 
				+<blockquote>
			
 
				+<pre>
			
 
				+ 1 bool remove_gp_poll(void)
			
 
				+ 2 {
			
 
				+ 3   struct foo *p;
			
 
				+ 4   unsigned long s;
			
 
				+ 5
			
 
				+ 6   spin_lock(&amp;gp_lock);
			
 
				+ 7   p = rcu_access_pointer(gp);
			
 
				+ 8   if (!p) {
			
 
				+ 9     spin_unlock(&amp;gp_lock);
			
 
				+10     return false;
			
 
				+11   }
			
 
				+12   rcu_assign_pointer(gp, NULL);
			
 
				+13   spin_unlock(&amp;gp_lock);
			
 
				+14   s = get_state_synchronize_rcu();
			
 
				+15   do_something_while_waiting();
			
 
				+16   cond_synchronize_rcu(s);
			
 
				+17   kfree(p);
			
 
				+18   return true;
			
 
				+19 }
			
 
				+</pre>
			
 
				+</blockquote>
			
 
				+
			
 
				+<p>
			
 
				+On line&nbsp;14, <tt>get_state_synchronize_rcu()</tt> obtains a
			
 
				+&ldquo;cookie&rdquo; from RCU,
			
 
				+then line&nbsp;15 carries out other tasks,
			
 
				+and finally, line&nbsp;16 returns immediately if a grace period has
			
 
				+elapsed in the meantime, but otherwise waits as required.
			
 
				+The need for <tt>get_state_synchronize_rcu()</tt> and
			
 
				+<tt>cond_synchronize_rcu()</tt> has appeared quite recently,
			
 
				+so it is too early to tell whether they will stand the test of time.
			
 
				+
			
 
				+<p>
			
 
				+RCU thus provides a range of tools to allow updaters to strike the
			
 
				+required tradeoff between latency, flexibility and CPU overhead.
			
 
				+
			
 
				+<h3><a name="Composability">Composability</a></h3>
			
 
				+
			
 
				+<p>
			
 
				+Composability has received much attention in recent years, perhaps in part
			
 
				+due to the collision of multicore hardware with object-oriented techniques
			
 
				+designed in single-threaded environments for single-threaded use.
			
 
				+And in theory, RCU read-side critical sections may be composed, and in
			
 
				+fact may be nested arbitrarily deeply.
			
 
				+In practice, as with all real-world implementations of composable
			
 
				+constructs, there are limitations.
			
 
				+
			
 
				+<p>
			
 
				+Implementations of RCU for which <tt>rcu_read_lock()</tt>
			
 
				+and <tt>rcu_read_unlock()</tt> generate no code, such as
			
 
				+Linux-kernel RCU when <tt>CONFIG_PREEMPT=n</tt>, can be
			
 
				+nested arbitrarily deeply.
			
 
				+After all, there is no overhead.
			
 
				+Except that if all these instances of <tt>rcu_read_lock()</tt>
			
 
				+and <tt>rcu_read_unlock()</tt> are visible to the compiler,
			
 
				+compilation will eventually fail due to exhausting memory,
			
 
				+mass storage, or user patience, whichever comes first.
			
 
				+If the nesting is not visible to the compiler, as is the case with
			
 
				+mutually recursive functions each in its own translation unit,
			
 
				+stack overflow will result.
			
 
				+If the nesting takes the form of loops, either the control variable
			
 
				+will overflow or (in the Linux kernel) you will get an RCU CPU stall warning.
			
 
				+Nevertheless, this class of RCU implementations is one
			
 
				+of the most composable constructs in existence.
			
 
				+
			
 
				+<p>
			
 
				+RCU implementations that explicitly track nesting depth
			
 
				+are limited by the nesting-depth counter.
			
 
				+For example, the Linux kernel's preemptible RCU limits nesting to
			
 
				+<tt>INT_MAX</tt>.
			
 
				+This should suffice for almost all practical purposes.
			
 
				+That said, a consecutive pair of RCU read-side critical sections
			
 
				+between which there is an operation that waits for a grace period
			
 
				+cannot be enclosed in another RCU read-side critical section.
			
 
				+This is because it is not legal to wait for a grace period within
			
 
				+an RCU read-side critical section:  To do so would result either
			
 
				+in deadlock or
			
 
				+in RCU implicitly splitting the enclosing RCU read-side critical
			
 
				+section, neither of which is conducive to a long-lived and prosperous
			
 
				+kernel.
			
 
				+
			
 
				+<p>
			
 
				+It is worth noting that RCU is not alone in limiting composability.
			
 
				+For example, many transactional-memory implementations prohibit
			
 
				+composing a pair of transactions separated by an irrevocable
			
 
				+operation (for example, a network receive operation).
			
 
				+For another example, lock-based critical sections can be composed
			
 
				+surprisingly freely, but only if deadlock is avoided.
			
 
				+
			
 
				+<p>
			
 
				+In short, although RCU read-side critical sections are highly composable,
			
 
				+care is required in some situations, just as is the case for any other
			
 
				+composable synchronization mechanism.
			
 
				+
			
 
				+<h3><a name="Corner Cases">Corner Cases</a></h3>
			
 
				+
			
 
				+<p>
			
 
				+A given RCU workload might have an endless and intense stream of
			
 
				+RCU read-side critical sections, perhaps even so intense that there
			
 
				+was never a point in time during which there was not at least one
			
 
				+RCU read-side critical section in flight.
			
 
				+RCU cannot allow this situation to block grace periods:  As long as
			
 
				+all the RCU read-side critical sections are finite, grace periods
			
 
				+must also be finite.
			
 
				+
			
 
				+<p>
			
 
				+That said, preemptible RCU implementations could potentially result
			
 
				+in RCU read-side critical sections being preempted for long durations,
			
 
				+which has the effect of creating a long-duration RCU read-side
			
 
				+critical section.
			
 
				+This situation can arise only in heavily loaded systems, but systems using
			
 
				+real-time priorities are of course more vulnerable.
			
 
				+Therefore, RCU priority boosting is provided to help deal with this
			
 
				+case.
			
 
				+That said, the exact requirements on RCU priority boosting will likely
			
 
				+evolve as more experience accumulates.
			
 
				+
			
 
				+<p>
			
 
				+Other workloads might have very high update rates.
			
 
				+Although one can argue that such workloads should instead use
			
 
				+something other than RCU, the fact remains that RCU must
			
 
				+handle such workloads gracefully.
			
 
				+This requirement is another factor driving batching of grace periods,
			
 
				+but it is also the driving force behind the checks for large numbers
			
 
				+of queued RCU callbacks in the <tt>call_rcu()</tt> code path.
			
 
				+Finally, high update rates should not delay RCU read-side critical
			
 
				+sections, although some read-side delays can occur when using
			
 
				+<tt>synchronize_rcu_expedited()</tt>, courtesy of this function's use
			
 
				+of <tt>try_stop_cpus()</tt>.
			
 
				+(In the future, <tt>synchronize_rcu_expedited()</tt> will be
			
 
				+converted to use lighter-weight inter-processor interrupts (IPIs),
			
 
				+but this will still disturb readers, though to a much smaller degree.)
			
 
				+
			
 
				+<p>
			
 
				+Although all three of these corner cases were understood in the early
			
 
				+1990s, a simple user-level test consisting of <tt>close(open(path))</tt>
			
 
				+in a tight loop
			
 
				+in the early 2000s suddenly provided a much deeper appreciation of the
			
 
				+high-update-rate corner case.
			
 
				+This test also motivated addition of some RCU code to react to high update
			
 
				+rates:  For example, if a given CPU finds itself with more than 10,000
			
 
				+RCU callbacks queued, it will cause RCU to take evasive action by
			
 
				+more aggressively starting grace periods and more aggressively forcing
			
 
				+completion of grace-period processing.
			
 
				+This evasive action causes the grace period to complete more quickly,
			
 
				+but at the cost of restricting RCU's batching optimizations, thus
			
 
				+increasing the CPU overhead incurred by that grace period.
			
 
				+
			
 
				+<h2><a name="Software-Engineering Requirements">
			
 
				+Software-Engineering Requirements</a></h2>
			
 
				+
			
 
				+<p>
			
 
				+Between Murphy's Law and &ldquo;To err is human&rdquo;, it is necessary to
			
 
				+guard against mishaps and misuse:
			
 
				+
			
 
				+<ol>
			
 
				+<li>	It is all too easy to forget to use <tt>rcu_read_lock()</tt>
			
 
				+	everywhere that it is needed, so kernels built with
			
 
				+	<tt>CONFIG_PROVE_RCU=y</tt> will splat if
			
 
				+	<tt>rcu_dereference()</tt> is used outside of an
			
 
				+	RCU read-side critical section.
			
 
				+	Update-side code can use <tt>rcu_dereference_protected()</tt>,
			
 
				+	which takes a
			
 
				+	<a href="https://lwn.net/Articles/371986/">lockdep expression</a>
			
 
				+	to indicate what is providing the protection.
			
 
				+	If the indicated protection is not provided, a lockdep splat
			
 
				+	is emitted.
			
 
				+
			
 
				+	<p>
			
 
				+	Code shared between readers and updaters can use
			
 
				+	<tt>rcu_dereference_check()</tt>, which also takes a
			
 
				+	lockdep expression, and emits a lockdep splat if neither
			
 
				+	<tt>rcu_read_lock()</tt> nor the indicated protection
			
 
				+	is in place.
			
 
				+	In addition, <tt>rcu_dereference_raw()</tt> is used in those
			
 
				+	(hopefully rare) cases where the required protection cannot
			
 
				+	be easily described.
			
 
				+	Finally, <tt>rcu_read_lock_held()</tt> is provided to
			
 
				+	allow a function to verify that it has been invoked within
			
 
				+	an RCU read-side critical section.
			
 
				+	I was made aware of this set of requirements shortly after Thomas
			
 
				+	Gleixner audited a number of RCU uses.
			
 
				+<li>	A given function might wish to check for RCU-related preconditions
			
 
				+	upon entry, before using any other RCU API.
			
 
				+	The <tt>rcu_lockdep_assert()</tt> does this job,
			
 
				+	asserting the expression in kernels having lockdep enabled
			
 
				+	and doing nothing otherwise.
			
 
				+<li>	It is also easy to forget to use <tt>rcu_assign_pointer()</tt>
			
 
				+	and <tt>rcu_dereference()</tt>, perhaps (incorrectly)
			
 
				+	substituting a simple assignment.
			
 
				+	To catch this sort of error, a given RCU-protected pointer may be
			
 
				+	tagged with <tt>__rcu</tt>, after which running sparse
			
 
				+	with <tt>CONFIG_SPARSE_RCU_POINTER=y</tt> will complain
			
 
				+	about simple-assignment accesses to that pointer.
			
 
				+	Arnd Bergmann made me aware of this requirement, and also
			
 
				+	supplied the needed
			
 
				+	<a href="https://lwn.net/Articles/376011/">patch series</a>.
			
 
				+<li>	Kernels built with <tt>CONFIG_DEBUG_OBJECTS_RCU_HEAD=y</tt>
			
 
				+	will splat if a data element is passed to <tt>call_rcu()</tt>
			
 
				+	twice in a row, without a grace period in between.
			
 
				+	(This error is similar to a double free.)
			
 
				+	The corresponding <tt>rcu_head</tt> structures that are
			
 
				+	dynamically allocated are automatically tracked, but
			
 
				+	<tt>rcu_head</tt> structures allocated on the stack
			
 
				+	must be initialized with <tt>init_rcu_head_on_stack()</tt>
			
 
				+	and cleaned up with <tt>destroy_rcu_head_on_stack()</tt>.
			
 
				+	Similarly, statically allocated non-stack <tt>rcu_head</tt>
			
 
				+	structures must be initialized with <tt>init_rcu_head()</tt>
			
 
				+	and cleaned up with <tt>destroy_rcu_head()</tt>.
			
 
				+	Mathieu Desnoyers made me aware of this requirement, and also
			
 
				+	supplied the needed
			
 
				+	<a href="https://lkml.kernel.org/g/20100319013024.GA28456@Krystal">patch</a>.
			
 
				+<li>	An infinite loop in an RCU read-side critical section will
			
 
				+	eventually trigger an RCU CPU stall warning splat, with
			
 
				+	the duration of &ldquo;eventually&rdquo; being controlled by the
			
 
				+	<tt>RCU_CPU_STALL_TIMEOUT</tt> <tt>Kconfig</tt> option, or,
			
 
				+	alternatively, by the
			
 
				+	<tt>rcupdate.rcu_cpu_stall_timeout</tt> boot/sysfs
			
 
				+	parameter.
			
 
				+	However, RCU is not obligated to produce this splat
			
 
				+	unless there is a grace period waiting on that particular
			
 
				+	RCU read-side critical section.
			
 
				+	<p>
			
 
				+	Some extreme workloads might intentionally delay
			
 
				+	RCU grace periods, and systems running those workloads can
			
 
				+	be booted with <tt>rcupdate.rcu_cpu_stall_suppress</tt>
			
 
				+	to suppress the splats.
			
 
				+	This kernel parameter may also be set via <tt>sysfs</tt>.
			
 
				+	Furthermore, RCU CPU stall warnings are counter-productive
			
 
				+	during sysrq dumps and during panics.
			
 
				+	RCU therefore supplies the <tt>rcu_sysrq_start()</tt> and
			
 
				+	<tt>rcu_sysrq_end()</tt> API members to be called before
			
 
				+	and after long sysrq dumps.
			
 
				+	RCU also supplies the <tt>rcu_panic()</tt> notifier that is
			
 
				+	automatically invoked at the beginning of a panic to suppress
			
 
				+	further RCU CPU stall warnings.
			
 
				+
			
 
				+	<p>
			
 
				+	This requirement made itself known in the early 1990s, pretty
			
 
				+	much the first time that it was necessary to debug a CPU stall.
			
 
				+	That said, the initial implementation in DYNIX/ptx was quite
			
 
				+	generic in comparison with that of Linux.
			
 
				+<li>	Although it would be very good to detect pointers leaking out
			
 
				+	of RCU read-side critical sections, there is currently no
			
 
				+	good way of doing this.
			
 
				+	One complication is the need to distinguish between pointers
			
 
				+	leaking and pointers that have been handed off from RCU to
			
 
				+	some other synchronization mechanism, for example, reference
			
 
				+	counting.
			
 
				+<li>	In kernels built with <tt>CONFIG_RCU_TRACE=y</tt>, RCU-related
			
 
				+	information is provided via both debugfs and event tracing.
			
 
				+<li>	Open-coded use of <tt>rcu_assign_pointer()</tt> and
			
 
				+	<tt>rcu_dereference()</tt> to create typical linked
			
 
				+	data structures can be surprisingly error-prone.
			
 
				+	Therefore, RCU-protected
			
 
				+	<a href="https://lwn.net/Articles/609973/#RCU List APIs">linked lists</a>
			
 
				+	and, more recently, RCU-protected
			
 
				+	<a href="https://lwn.net/Articles/612100/">hash tables</a>
			
 
				+	are available.
			
 
				+	Many other special-purpose RCU-protected data structures are
			
 
				+	available in the Linux kernel and the userspace RCU library.
			
 
				+<li>	Some linked structures are created at compile time, but still
			
 
				+	require <tt>__rcu</tt> checking.
			
 
				+	The <tt>RCU_POINTER_INITIALIZER()</tt> macro serves this
			
 
				+	purpose.
			
 
				+<li>	It is not necessary to use <tt>rcu_assign_pointer()</tt>
			
 
				+	when creating linked structures that are to be published via
			
 
				+	a single external pointer.
			
 
				+	The <tt>RCU_INIT_POINTER()</tt> macro is provided for
			
 
				+	this task and also for assigning <tt>NULL</tt> pointers
			
 
				+	at runtime.
			
 
				+</ol>
			
 
				+
			
 
				+<p>
			
 
				+This is not a hard-and-fast list:  RCU's diagnostic capabilities will
			
 
				+continue to be guided by the number and type of usage bugs found
			
 
				+in real-world RCU usage.
			
 
				+
			
 
				+<h2><a name="Linux Kernel Complications">Linux Kernel Complications</a></h2>
			
 
				+
			
 
				+<p>
			
 
				+The Linux kernel provides an interesting environment for all kinds of
			
 
				+software, including RCU.
			
 
				+Some of the relevant points of interest are as follows:
			
 
				+
			
 
				+<ol>
			
 
				+<li>	<a href="#Configuration">Configuration</a>.
			
 
				+<li>	<a href="#Firmware Interface">Firmware Interface</a>.
			
 
				+<li>	<a href="#Early Boot">Early Boot</a>.
			
 
				+<li>	<a href="#Interrupts and NMIs">
			
 
				+	Interrupts and non-maskable interrupts (NMIs)</a>.
			
 
				+<li>	<a href="#Loadable Modules">Loadable Modules</a>.
			
 
				+<li>	<a href="#Hotplug CPU">Hotplug CPU</a>.
			
 
				+<li>	<a href="#Scheduler and RCU">Scheduler and RCU</a>.
			
 
				+<li>	<a href="#Tracing and RCU">Tracing and RCU</a>.
			
 
				+<li>	<a href="#Energy Efficiency">Energy Efficiency</a>.
			
 
				+<li>	<a href="#Memory Efficiency">Memory Efficiency</a>.
			
 
				+<li>	<a href="#Performance, Scalability, Response Time, and Reliability">
			
 
				+	Performance, Scalability, Response Time, and Reliability</a>.
			
 
				+</ol>
			
 
				+
			
 
				+<p>
			
 
				+This list is probably incomplete, but it does give a feel for the
			
 
				+most notable Linux-kernel complications.
			
 
				+Each of the following sections covers one of the above topics.
			
 
				+
			
 
				+<h3><a name="Configuration">Configuration</a></h3>
			
 
				+
			
 
				+<p>
			
 
				+RCU's goal is automatic configuration, so that almost nobody
			
 
				+needs to worry about RCU's <tt>Kconfig</tt> options.
			
 
				+And for almost all users, RCU does in fact work well
			
 
				+&ldquo;out of the box.&rdquo;
			
 
				+
			
 
				+<p>
			
 
				+However, there are specialized use cases that are handled by
			
 
				+kernel boot parameters and <tt>Kconfig</tt> options.
			
 
				+Unfortunately, the <tt>Kconfig</tt> system will explicitly ask users
			
 
				+about new <tt>Kconfig</tt> options, which requires that almost all of them
			
 
				+be hidden behind a <tt>CONFIG_RCU_EXPERT</tt> <tt>Kconfig</tt> option.
			
 
				+
			
 
				+<p>
			
 
				+This all should be quite obvious, but the fact remains that
			
 
				+Linus Torvalds recently had to
			
 
				+<a href="https://lkml.kernel.org/g/CA+55aFy4wcCwaL4okTs8wXhGZ5h-ibecy_Meg9C4MNQrUnwMcg@mail.gmail.com">remind</a>
			
 
				+me of this requirement.
			
 
				+
			
 
				+<h3><a name="Firmware Interface">Firmware Interface</a></h3>
			
 
				+
			
 
				+<p>
			
 
				+In many cases, the kernel obtains information about the system from the
			
 
				+firmware, and sometimes things are lost in translation.
			
 
				+Or the translation is accurate, but the original message is bogus.
			
 
				+
			
 
				+<p>
			
 
				+For example, some systems' firmware overreports the number of CPUs,
			
 
				+sometimes by a large factor.
			
 
				+If RCU naively believed the firmware, as it used to do,
			
 
				+it would create too many per-CPU kthreads.
			
 
				+Although the resulting system will still run correctly, the extra
			
 
				+kthreads needlessly consume memory and can cause confusion
			
 
				+when they show up in <tt>ps</tt> listings.
			
 
				+
			
 
				+<p>
			
 
				+RCU must therefore wait for a given CPU to actually come online before
			
 
				+it can allow itself to believe that the CPU actually exists.
			
 
				+The resulting &ldquo;ghost CPUs&rdquo; (which are never going to
			
 
				+come online) cause a number of
			
 
				+<a href="https://paulmck.livejournal.com/37494.html">interesting complications</a>.
			
 
				+
			
 
				+<h3><a name="Early Boot">Early Boot</a></h3>
			
 
				+
			
 
				+<p>
			
 
				+The Linux kernel's boot sequence is an interesting process,
			
 
				+and RCU is used early, even before <tt>rcu_init()</tt>
			
 
				+is invoked.
			
 
				+In fact, a number of RCU's primitives can be used as soon as the
			
 
				+initial task's <tt>task_struct</tt> is available and the
			
 
				+boot CPU's per-CPU variables are set up.
			
 
				+The read-side primitives (<tt>rcu_read_lock()</tt>,
			
 
				+<tt>rcu_read_unlock()</tt>, <tt>rcu_dereference()</tt>,
			
 
				+and <tt>rcu_access_pointer()</tt>) will operate normally very early on,
			
 
				+as will <tt>rcu_assign_pointer()</tt>.
			
 
				+
			
 
				+<p>
			
 
				+Although <tt>call_rcu()</tt> may be invoked at any
			
 
				+time during boot, callbacks are not guaranteed to be invoked until after
			
 
				+the scheduler is fully up and running.
			
 
				+This delay in callback invocation is due to the fact that RCU does not
			
 
				+invoke callbacks until it is fully initialized, and this full initialization
			
 
				+cannot occur until after the scheduler has initialized itself to the
			
 
				+point where RCU can spawn and run its kthreads.
			
 
				+In theory, it would be possible to invoke callbacks earlier,
			
 
				+but this is not a panacea because there would be severe restrictions
			
 
				+on what operations those callbacks could invoke.
			
 
				+
			
 
				+<p>
			
 
				+Perhaps surprisingly, <tt>synchronize_rcu()</tt>,
			
 
				+<a href="#Bottom-Half Flavor"><tt>synchronize_rcu_bh()</tt></a>
			
 
				+(<a href="#Bottom-Half Flavor">discussed below</a>),
			
 
				+and
			
 
				+<a href="#Sched Flavor"><tt>synchronize_sched()</tt></a>
			
 
				+will all operate normally
			
 
				+during very early boot, the reason being that there is only one CPU
			
 
				+and preemption is disabled.
			
 
				+This means that the call to <tt>synchronize_rcu()</tt> (or friends)
			
 
				+itself is a quiescent
			
 
				+state and thus a grace period, so the early-boot implementation can
			
 
				+be a no-op.
			
 
				+
			
 
				+<p>
			
 
				+Both <tt>synchronize_rcu_bh()</tt> and <tt>synchronize_sched()</tt>
			
 
				+continue to operate normally through the remainder of boot, courtesy
			
 
				+of the fact that preemption is disabled across their RCU read-side
			
 
				+critical sections and also courtesy of the fact that there is still
			
 
				+only one CPU.
			
 
				+However, once the scheduler starts initializing, preemption is enabled.
			
 
				+There is still only a single CPU, but the fact that preemption is enabled
			
 
				+means that the no-op implementation of <tt>synchronize_rcu()</tt> no
			
 
				+longer works in <tt>CONFIG_PREEMPT=y</tt> kernels.
			
 
				+Therefore, as soon as the scheduler starts initializing, the early-boot
			
 
				+fastpath is disabled.
			
 
				+This means that <tt>synchronize_rcu()</tt> switches to its runtime
			
 
				+mode of operation where it posts callbacks, which in turn means that
			
 
				+any call to <tt>synchronize_rcu()</tt> will block until the corresponding
			
 
				+callback is invoked.
			
 
				+Unfortunately, the callback cannot be invoked until RCU's runtime
			
 
				+grace-period machinery is up and running, which cannot happen until
			
 
				+the scheduler has initialized itself sufficiently to allow RCU's
			
 
				+kthreads to be spawned.
			
 
				+Therefore, invoking <tt>synchronize_rcu()</tt> during scheduler
			
 
				+initialization can result in deadlock.
			
 
				+
			
 
				+<p><a name="Quick Quiz 14"><b>Quick Quiz 14</b>:</a>
			
 
				+So what happens with <tt>synchronize_rcu()</tt> during
			
 
				+scheduler initialization for <tt>CONFIG_PREEMPT=n</tt>
			
 
				+kernels?
			
 
				+<br><a href="#qq14answer">Answer</a>
			
 
				+
			
 
				+<p>
			
 
				+I learned of these boot-time requirements as a result of a series of
			
 
				+system hangs.
			
 
				+
			
 
				+<h3><a name="Interrupts and NMIs">Interrupts and NMIs</a></h3>
			
 
				+
			
 
				+<p>
			
 
				+The Linux kernel has interrupts, and RCU read-side critical sections are
			
 
				+legal within interrupt handlers and within interrupt-disabled regions
			
 
				+of code, as are invocations of <tt>call_rcu()</tt>.
			
 
				+
			
 
				+<p>
			
 
				+Some Linux-kernel architectures can enter an interrupt handler from
			
 
				+non-idle process context, and then just never leave it, instead stealthily
			
 
				+transitioning back to process context.
			
 
				+This trick is sometimes used to invoke system calls from inside the kernel.
			
 
				+These &ldquo;half-interrupts&rdquo; mean that RCU has to be very careful
			
 
				+about how it counts interrupt nesting levels.
			
 
				+I learned of this requirement the hard way during a rewrite
			
 
				+of RCU's dyntick-idle code.
			
 
				+
			
 
				+<p>
			
 
				+The Linux kernel has non-maskable interrupts (NMIs), and
			
 
				+RCU read-side critical sections are legal within NMI handlers.
			
 
				+Thankfully, RCU update-side primitives, including
			
 
				+<tt>call_rcu()</tt>, are prohibited within NMI handlers.
			
 
				+
			
 
				+<p>
			
 
				+The name notwithstanding, some Linux-kernel architectures
			
 
				+can have nested NMIs, which RCU must handle correctly.
			
 
				+Andy Lutomirski
			
 
				+<a href="https://lkml.kernel.org/g/CALCETrXLq1y7e_dKFPgou-FKHB6Pu-r8+t-6Ds+8=va7anBWDA@mail.gmail.com">surprised me</a>
			
 
				+with this requirement;
			
 
				+he also kindly surprised me with
			
 
				+<a href="https://lkml.kernel.org/g/CALCETrXSY9JpW3uE6H8WYk81sg56qasA2aqmjMPsq5dOtzso=g@mail.gmail.com">an algorithm</a>
			
 
				+that meets this requirement.
			
 
				+
			
 
				+<h3><a name="Loadable Modules">Loadable Modules</a></h3>
			
 
				+
			
 
				+<p>
			
 
				+The Linux kernel has loadable modules, and these modules can
			
 
				+also be unloaded.
			
 
				+After a given module has been unloaded, any attempt to call
			
 
				+one of its functions results in a segmentation fault.
			
 
				+The module-unload functions must therefore cancel any
			
 
				+delayed calls to loadable-module functions, for example,
			
 
				+any outstanding <tt>mod_timer()</tt> must be dealt with
			
 
				+via <tt>del_timer_sync()</tt> or similar.
			
 
				+
			
 
				+<p>
			
 
				+Unfortunately, there is no way to cancel an RCU callback;
			
 
				+once you invoke <tt>call_rcu()</tt>, the callback function is
			
 
				+going to eventually be invoked, unless the system goes down first.
			
 
				+Because it is normally considered socially irresponsible to crash the system
			
 
				+in response to a module unload request, we need some other way
			
 
				+to deal with in-flight RCU callbacks.
			
 
				+
			
 
				+<p>
			
 
				+RCU therefore provides
			
 
				+<tt><a href="https://lwn.net/Articles/217484/">rcu_barrier()</a></tt>,
			
 
				+which waits until all in-flight RCU callbacks have been invoked.
			
 
				+If a module uses <tt>call_rcu()</tt>, its exit function should therefore
			
 
				+prevent any future invocation of <tt>call_rcu()</tt>, then invoke
			
 
				+<tt>rcu_barrier()</tt>.
			
 
				+In theory, the underlying module-unload code could invoke
			
 
				+<tt>rcu_barrier()</tt> unconditionally, but in practice this would
			
 
				+incur unacceptable latencies.
			
 
				+
			
 
				+<p>
			
 
				+Nikita Danilov noted this requirement for an analogous filesystem-unmount
			
 
				+situation, and Dipankar Sarma incorporated <tt>rcu_barrier()</tt> into RCU.
			
 
				+The need for <tt>rcu_barrier()</tt> for module unloading became
			
 
				+apparent later.
			
 
				+
			
 
				+<h3><a name="Hotplug CPU">Hotplug CPU</a></h3>
			
 
				+
			
 
				+<p>
			
 
				+The Linux kernel supports CPU hotplug, which means that CPUs
			
 
				+can come and go.
			
 
				+It is of course illegal to use any RCU API member from an offline CPU.
			
 
				+This requirement was present from day one in DYNIX/ptx, but
			
 
				+on the other hand, the Linux kernel's CPU-hotplug implementation
			
 
				+is &ldquo;interesting.&rdquo;
			
 
				+
			
 
				+<p>
			
 
				+The Linux-kernel CPU-hotplug implementation has notifiers that
			
 
				+are used to allow the various kernel subsystems (including RCU)
			
 
				+to respond appropriately to a given CPU-hotplug operation.
			
 
				+Most RCU operations may be invoked from CPU-hotplug notifiers,
			
 
				+including even normal synchronous grace-period operations
			
 
				+such as <tt>synchronize_rcu()</tt>.
			
 
				+However, expedited grace-period operations such as
			
 
				+<tt>synchronize_rcu_expedited()</tt> are not supported,
			
 
				+due to the fact that current implementations block CPU-hotplug
			
 
				+operations, which could result in deadlock.
			
 
				+
			
 
				+<p>
			
 
				+In addition, all-callback-wait operations such as
			
 
				+<tt>rcu_barrier()</tt> are also not supported, due to the
			
 
				+fact that there are phases of CPU-hotplug operations where
			
 
				+the outgoing CPU's callbacks will not be invoked until after
			
 
				+the CPU-hotplug operation ends, which could also result in deadlock.
			
 
				+
			
 
				+<h3><a name="Scheduler and RCU">Scheduler and RCU</a></h3>
			
 
				+
			
 
				+<p>
			
 
				+RCU depends on the scheduler, and the scheduler uses RCU to
			
 
				+protect some of its data structures.
			
 
				+This means the scheduler is forbidden from acquiring
			
 
				+the runqueue locks and the priority-inheritance locks
			
 
				+in the middle of an outermost RCU read-side critical section unless either
			
 
				+(1)&nbsp;it releases them before exiting that same
			
 
				+RCU read-side critical section, or
			
 
				+(2)&nbsp;interrupts are disabled across
			
 
				+that entire RCU read-side critical section.
			
 
				+This same prohibition also applies (recursively!) to any lock that is acquired
			
 
				+while holding any lock to which this prohibition applies.
			
 
				+Adhering to this rule prevents preemptible RCU from invoking
			
 
				+<tt>rcu_read_unlock_special()</tt> while either runqueue or
			
 
				+priority-inheritance locks are held, thus avoiding deadlock.
			
 
				+
			
 
				+<p>
			
 
				+Prior to v4.4, it was only necessary to disable preemption across
			
 
				+RCU read-side critical sections that acquired scheduler locks.
			
 
				+In v4.4, expedited grace periods started using IPIs, and these
			
 
				+IPIs could force a <tt>rcu_read_unlock()</tt> to take the slowpath.
			
 
				+Therefore, this expedited-grace-period change required disabling of
			
 
				+interrupts, not just preemption.
			
 
				+
			
 
				+<p>
			
 
				+For RCU's part, the preemptible-RCU <tt>rcu_read_unlock()</tt>
			
 
				+implementation must be written carefully to avoid similar deadlocks.
			
 
				+In particular, <tt>rcu_read_unlock()</tt> must tolerate an
			
 
				+interrupt where the interrupt handler invokes both
			
 
				+<tt>rcu_read_lock()</tt> and <tt>rcu_read_unlock()</tt>.
			
 
				+This possibility requires <tt>rcu_read_unlock()</tt> to use
			
 
				+negative nesting levels to avoid destructive recursion via
			
 
				+the interrupt handler's use of RCU.
			
 
				+
			
 
				+<p>
			
 
				+This pair of mutual scheduler-RCU requirements came as a
			
 
				+<a href="https://lwn.net/Articles/453002/">complete surprise</a>.
			
 
				+
			
 
				+<p>
			
 
				+As noted above, RCU makes use of kthreads, and it is necessary to
			
 
				+avoid excessive CPU-time accumulation by these kthreads.
			
 
				+This requirement was no surprise, but RCU's violation of it
			
 
				+when running context-switch-heavy workloads when built with
			
 
				+<tt>CONFIG_NO_HZ_FULL=y</tt>
			
 
				+<a href="http://www.rdrop.com/users/paulmck/scalability/paper/BareMetal.2015.01.15b.pdf">did come as a surprise [PDF]</a>.
			
 
				+RCU has made good progress towards meeting this requirement, even
			
 
				+for context-switch-heavy <tt>CONFIG_NO_HZ_FULL=y</tt> workloads,
			
 
				+but there is room for further improvement.
			
 
				+
			
 
				+<h3><a name="Tracing and RCU">Tracing and RCU</a></h3>
			
 
				+
			
 
				+<p>
			
 
				+It is possible to use tracing on RCU code, but tracing itself
			
 
				+uses RCU.
			
 
				+For this reason, <tt>rcu_dereference_raw_notrace()</tt>
			
 
				+is provided for use by tracing, which avoids the destructive
			
 
				+recursion that could otherwise ensue.
			
 
				+This API is also used by virtualization in some architectures,
			
 
				+where RCU readers execute in environments in which tracing
			
 
				+cannot be used.
			
 
				+The tracing folks both located the requirement and provided the
			
 
				+needed fix, so this surprise requirement was relatively painless.
			
 
				+
			
 
				+<h3><a name="Energy Efficiency">Energy Efficiency</a></h3>
			
 
				+
			
 
				+<p>
			
 
				+Interrupting idle CPUs is considered socially unacceptable,
			
 
				+especially by people with battery-powered embedded systems.
			
 
				+RCU therefore conserves energy by detecting which CPUs are
			
 
				+idle, including tracking CPUs that have been interrupted from idle.
			
 
				+This is a large part of the energy-efficiency requirement,
			
 
				+so I learned of this via an irate phone call.
			
 
				+
			
 
				+<p>
			
 
				+Because RCU avoids interrupting idle CPUs, it is illegal to
			
 
				+execute an RCU read-side critical section on an idle CPU.
			
 
				+(Kernels built with <tt>CONFIG_PROVE_RCU=y</tt> will splat
			
 
				+if you try it.)
			
 
				+The <tt>RCU_NONIDLE()</tt> macro and <tt>_rcuidle</tt>
			
 
				+event tracing are provided to work around this restriction.
			
 
				+In addition, <tt>rcu_is_watching()</tt> may be used to
			
 
				+test whether or not it is currently legal to run RCU read-side
			
 
				+critical sections on this CPU.
			
 
				+I learned of the need for diagnostics on the one hand
			
 
				+and <tt>RCU_NONIDLE()</tt> on the other while inspecting
			
 
				+idle-loop code.
			
 
				+Steven Rostedt supplied <tt>_rcuidle</tt> event tracing,
			
 
				+which is used quite heavily in the idle loop.
			
 
				+
			
 
				+<p>
			
 
				+It is similarly socially unacceptable to interrupt an
			
 
				+<tt>nohz_full</tt> CPU running in userspace.
			
 
				+RCU must therefore track <tt>nohz_full</tt> userspace
			
 
				+execution.
			
 
				+And in
			
 
				+<a href="https://lwn.net/Articles/558284/"><tt>CONFIG_NO_HZ_FULL_SYSIDLE=y</tt></a>
			
 
				+kernels, RCU must separately track idle CPUs on the one hand and
			
 
				+CPUs that are either idle or executing in userspace on the other.
			
 
				+In both cases, RCU must be able to sample state at two points in
			
 
				+time, and be able to determine whether or not some other CPU spent
			
 
				+any time idle and/or executing in userspace.
			
 
				+
			
 
				+<p>
			
 
				+These energy-efficiency requirements have proven quite difficult to
			
 
				+understand and to meet, for example, there have been more than five
			
 
				+clean-sheet rewrites of RCU's energy-efficiency code, the last of
			
 
				+which was finally able to demonstrate
			
 
				+<a href="http://www.rdrop.com/users/paulmck/realtime/paper/AMPenergy.2013.04.19a.pdf">real energy savings running on real hardware [PDF]</a>.
			
 
				+As noted earlier,
			
 
				+I learned of many of these requirements via angry phone calls:
			
 
				+Flaming me on the Linux-kernel mailing list was apparently not
			
 
				+sufficient to fully vent their ire at RCU's energy-efficiency bugs!
			
 
				+
			
 
				+<h3><a name="Memory Efficiency">Memory Efficiency</a></h3>
			
 
				+
			
 
				+<p>
			
 
				+Although small-memory non-realtime systems can simply use Tiny RCU,
			
 
				+code size is only one aspect of memory efficiency.
			
 
				+Another aspect is the size of the <tt>rcu_head</tt> structure
			
 
				+used by <tt>call_rcu()</tt> and <tt>kfree_rcu()</tt>.
			
 
				+Although this structure contains nothing more than a pair of pointers,
			
 
				+it does appear in many RCU-protected data structures, including
			
 
				+some that are size critical.
			
 
				+The <tt>page</tt> structure is a case in point, as evidenced by
			
 
				+the many occurrences of the <tt>union</tt> keyword within that structure.
			
 
				+
			
 
				+<p>
			
 
				+This need for memory efficiency is one reason that RCU uses hand-crafted
			
 
				+singly linked lists to track the <tt>rcu_head</tt> structures that
			
 
				+are waiting for a grace period to elapse.
			
 
				+It is also the reason why <tt>rcu_head</tt> structures do not contain
			
 
				+debug information, such as fields tracking the file and line of the
			
 
				+<tt>call_rcu()</tt> or <tt>kfree_rcu()</tt> that posted them.
			
 
				+Although this information might appear in debug-only kernel builds at some
			
 
				+point, in the meantime, the <tt>-&gt;func</tt> field will often provide
			
 
				+the needed debug information.
			
 
				+
			
 
				+<p>
			
 
				+However, in some cases, the need for memory efficiency leads to even
			
 
				+more extreme measures.
			
 
				+Returning to the <tt>page</tt> structure, the <tt>rcu_head</tt> field
			
 
				+shares storage with a great many other structures that are used at
			
 
				+various points in the corresponding page's lifetime.
			
 
				+In order to correctly resolve certain
			
 
				+<a href="https://lkml.kernel.org/g/1439976106-137226-1-git-send-email-kirill.shutemov@linux.intel.com">race conditions</a>,
			
 
				+the Linux kernel's memory-management subsystem needs a particular bit
			
 
				+to remain zero during all phases of grace-period processing,
			
 
				+and that bit happens to map to the bottom bit of the
			
 
				+<tt>rcu_head</tt> structure's <tt>-&gt;next</tt> field.
			
 
				+RCU makes this guarantee as long as <tt>call_rcu()</tt>
			
 
				+is used to post the callback, as opposed to <tt>kfree_rcu()</tt>
			
 
				+or some future &ldquo;lazy&rdquo;
			
 
				+variant of <tt>call_rcu()</tt> that might one day be created for
			
 
				+energy-efficiency purposes.
			
 
				+
			
 
				+<h3><a name="Performance, Scalability, Response Time, and Reliability">
			
 
				+Performance, Scalability, Response Time, and Reliability</a></h3>
			
 
				+
			
 
				+<p>
			
 
				+Expanding on the
			
 
				+<a href="#Performance and Scalability">earlier discussion</a>,
			
 
				+RCU is used heavily by hot code paths in performance-critical
			
 
				+portions of the Linux kernel's networking, security, virtualization,
			
 
				+and scheduling code paths.
			
 
				+RCU must therefore use efficient implementations, especially in its
			
 
				+read-side primitives.
			
 
				+To that end, it would be good if preemptible RCU's implementation
			
 
				+of <tt>rcu_read_lock()</tt> could be inlined, however, doing
			
 
				+this requires resolving <tt>#include</tt> issues with the
			
 
				+<tt>task_struct</tt> structure.
			
 
				+
			
 
				+<p>
			
 
				+The Linux kernel supports hardware configurations with up to
			
 
				+4096 CPUs, which means that RCU must be extremely scalable.
			
 
				+Algorithms that involve frequent acquisitions of global locks or
			
 
				+frequent atomic operations on global variables simply cannot be
			
 
				+tolerated within the RCU implementation.
			
 
				+RCU therefore makes heavy use of a combining tree based on the
			
 
				+<tt>rcu_node</tt> structure.
			
 
				+RCU is required to tolerate all CPUs continuously invoking any
			
 
				+combination of RCU's runtime primitives with minimal per-operation
			
 
				+overhead.
			
 
				+In fact, in many cases, increasing load must <i>decrease</i> the
			
 
				+per-operation overhead, witness the batching optimizations for
			
 
				+<tt>synchronize_rcu()</tt>, <tt>call_rcu()</tt>,
			
 
				+<tt>synchronize_rcu_expedited()</tt>, and <tt>rcu_barrier()</tt>.
			
 
				+As a general rule, RCU must cheerfully accept whatever the
			
 
				+rest of the Linux kernel decides to throw at it.
			
 
				+
			
 
				+<p>
			
 
				+The Linux kernel is used for real-time workloads, especially
			
 
				+in conjunction with the
			
 
				+<a href="https://rt.wiki.kernel.org/index.php/Main_Page">-rt patchset</a>.
			
 
				+The real-time-latency response requirements are such that the
			
 
				+traditional approach of disabling preemption across RCU
			
 
				+read-side critical sections is inappropriate.
			
 
				+Kernels built with <tt>CONFIG_PREEMPT=y</tt> therefore
			
 
				+use an RCU implementation that allows RCU read-side critical
			
 
				+sections to be preempted.
			
 
				+This requirement made its presence known after users made it
			
 
				+clear that an earlier
			
 
				+<a href="https://lwn.net/Articles/107930/">real-time patch</a>
			
 
				+did not meet their needs, in conjunction with some
			
 
				+<a href="https://lkml.kernel.org/g/20050318002026.GA2693@us.ibm.com">RCU issues</a>
			
 
				+encountered by a very early version of the -rt patchset.
			
 
				+
			
 
				+<p>
			
 
				+In addition, RCU must make do with a sub-100-microsecond real-time latency
			
 
				+budget.
			
 
				+In fact, on smaller systems with the -rt patchset, the Linux kernel
			
 
				+provides sub-20-microsecond real-time latencies for the whole kernel,
			
 
				+including RCU.
			
 
				+RCU's scalability and latency must therefore be sufficient for
			
 
				+these sorts of configurations.
			
 
				+To my surprise, the sub-100-microsecond real-time latency budget
			
 
				+<a href="http://www.rdrop.com/users/paulmck/realtime/paper/bigrt.2013.01.31a.LCA.pdf">
			
 
				+applies to even the largest systems [PDF]</a>,
			
 
				+up to and including systems with 4096 CPUs.
			
 
				+This real-time requirement motivated the grace-period kthread, which
			
 
				+also simplified handling of a number of race conditions.
			
 
				+
			
 
				+<p>
			
 
				+Finally, RCU's status as a synchronization primitive means that
			
 
				+any RCU failure can result in arbitrary memory corruption that can be
			
 
				+extremely difficult to debug.
			
 
				+This means that RCU must be extremely reliable, which in
			
 
				+practice also means that RCU must have an aggressive stress-test
			
 
				+suite.
			
 
				+This stress-test suite is called <tt>rcutorture</tt>.
			
 
				+
			
 
				+<p>
			
 
				+Although the need for <tt>rcutorture</tt> was no surprise,
			
 
				+the current immense popularity of the Linux kernel is posing
			
 
				+interesting&mdash;and perhaps unprecedented&mdash;validation
			
 
				+challenges.
			
 
				+To see this, keep in mind that there are well over one billion
			
 
				+instances of the Linux kernel running today, given Android
			
 
				+smartphones, Linux-powered televisions, and servers.
			
 
				+This number can be expected to increase sharply with the advent of
			
 
				+the celebrated Internet of Things.
			
 
				+
			
 
				+<p>
			
 
				+Suppose that RCU contains a race condition that manifests on average
			
 
				+once per million years of runtime.
			
 
				+This bug will be occurring about three times per <i>day</i> across
			
 
				+the installed base.
			
 
				+RCU could simply hide behind hardware error rates, given that no one
			
 
				+should really expect their smartphone to last for a million years.
			
 
				+However, anyone taking too much comfort from this thought should
			
 
				+consider the fact that in most jurisdictions, a successful multi-year
			
 
				+test of a given mechanism, which might include a Linux kernel,
			
 
				+suffices for a number of types of safety-critical certifications.
			
 
				+In fact, rumor has it that the Linux kernel is already being used
			
 
				+in production for safety-critical applications.
			
 
				+I don't know about you, but I would feel quite bad if a bug in RCU
			
 
				+killed someone.
			
 
				+Which might explain my recent focus on validation and verification.
			
 
				+
			
 
				+<h2><a name="Other RCU Flavors">Other RCU Flavors</a></h2>
			
 
				+
			
 
				+<p>
			
 
				+One of the more surprising things about RCU is that there are now
			
 
				+no fewer than five <i>flavors</i>, or API families.
			
 
				+In addition, the primary flavor that has been the sole focus up to
			
 
				+this point has two different implementations, non-preemptible and
			
 
				+preemptible.
			
 
				+The other four flavors are listed below, with requirements for each
			
 
				+described in a separate section.
			
 
				+
			
 
				+<ol>
			
 
				+<li>	<a href="#Bottom-Half Flavor">Bottom-Half Flavor</a>
			
 
				+<li>	<a href="#Sched Flavor">Sched Flavor</a>
			
 
				+<li>	<a href="#Sleepable RCU">Sleepable RCU</a>
			
 
				+<li>	<a href="#Tasks RCU">Tasks RCU</a>
			
 
				+</ol>
			
 
				+
			
 
				+<h3><a name="Bottom-Half Flavor">Bottom-Half Flavor</a></h3>
			
 
				+
			
 
				+<p>
			
 
				+The softirq-disable (AKA &ldquo;bottom-half&rdquo;,
			
 
				+hence the &ldquo;_bh&rdquo; abbreviations)
			
 
				+flavor of RCU, or <i>RCU-bh</i>, was developed by
			
 
				+Dipankar Sarma to provide a flavor of RCU that could withstand the
			
 
				+network-based denial-of-service attacks researched by Robert
			
 
				+Olsson.
			
 
				+These attacks placed so much networking load on the system
			
 
				+that some of the CPUs never exited softirq execution,
			
 
				+which in turn prevented those CPUs from ever executing a context switch,
			
 
				+which, in the RCU implementation of that time, prevented grace periods
			
 
				+from ever ending.
			
 
				+The result was an out-of-memory condition and a system hang.
			
 
				+
			
 
				+<p>
			
 
				+The solution was the creation of RCU-bh, which does
			
 
				+<tt>local_bh_disable()</tt>
			
 
				+across its read-side critical sections, and which uses the transition
			
 
				+from one type of softirq processing to another as a quiescent state
			
 
				+in addition to context switch, idle, user mode, and offline.
			
 
				+This means that RCU-bh grace periods can complete even when some of
			
 
				+the CPUs execute in softirq indefinitely, thus allowing algorithms
			
 
				+based on RCU-bh to withstand network-based denial-of-service attacks.
			
 
				+
			
 
				+<p>
			
 
				+Because
			
 
				+<tt>rcu_read_lock_bh()</tt> and <tt>rcu_read_unlock_bh()</tt>
			
 
				+disable and re-enable softirq handlers, any attempt to start a softirq
			
 
				+handler during the
			
 
				+RCU-bh read-side critical section will be deferred.
			
 
				+In this case, <tt>rcu_read_unlock_bh()</tt>
			
 
				+will invoke softirq processing, which can take considerable time.
			
 
				+One can of course argue that this softirq overhead should be associated
			
 
				+with the code following the RCU-bh read-side critical section rather
			
 
				+than <tt>rcu_read_unlock_bh()</tt>, but the fact
			
 
				+is that most profiling tools cannot be expected to make this sort
			
 
				+of fine distinction.
			
 
				+For example, suppose that a three-millisecond-long RCU-bh read-side
			
 
				+critical section executes during a time of heavy networking load.
			
 
				+There will very likely be an attempt to invoke at least one softirq
			
 
				+handler during that three milliseconds, but any such invocation will
			
 
				+be delayed until the time of the <tt>rcu_read_unlock_bh()</tt>.
			
 
				+This can of course make it appear at first glance as if
			
 
				+<tt>rcu_read_unlock_bh()</tt> was executing very slowly.
			
 
				+
			
 
				+<p>
			
 
				+The
			
 
				+<a href="https://lwn.net/Articles/609973/#RCU Per-Flavor API Table">RCU-bh API</a>
			
 
				+includes
			
 
				+<tt>rcu_read_lock_bh()</tt>,
			
 
				+<tt>rcu_read_unlock_bh()</tt>,
			
 
				+<tt>rcu_dereference_bh()</tt>,
			
 
				+<tt>rcu_dereference_bh_check()</tt>,
			
 
				+<tt>synchronize_rcu_bh()</tt>,
			
 
				+<tt>synchronize_rcu_bh_expedited()</tt>,
			
 
				+<tt>call_rcu_bh()</tt>,
			
 
				+<tt>rcu_barrier_bh()</tt>, and
			
 
				+<tt>rcu_read_lock_bh_held()</tt>.
			
 
				+
			
 
				+<h3><a name="Sched Flavor">Sched Flavor</a></h3>
			
 
				+
			
 
				+<p>
			
 
				+Before preemptible RCU, waiting for an RCU grace period had the
			
 
				+side effect of also waiting for all pre-existing interrupt
			
 
				+and NMI handlers.
			
 
				+However, there are legitimate preemptible-RCU implementations that
			
 
				+do not have this property, given that any point in the code outside
			
 
				+of an RCU read-side critical section can be a quiescent state.
			
 
				+Therefore, <i>RCU-sched</i> was created, which follows &ldquo;classic&rdquo;
			
 
				+RCU in that an RCU-sched grace period waits for pre-existing
			
 
				+interrupt and NMI handlers.
			
 
				+In kernels built with <tt>CONFIG_PREEMPT=n</tt>, the RCU and RCU-sched
			
 
				+APIs have identical implementations, while kernels built with
			
 
				+<tt>CONFIG_PREEMPT=y</tt> provide a separate implementation for each.
			
 
				+
			
 
				+<p>
			
 
				+Note well that in <tt>CONFIG_PREEMPT=y</tt> kernels,
			
 
				+<tt>rcu_read_lock_sched()</tt> and <tt>rcu_read_unlock_sched()</tt>
			
 
				+disable and re-enable preemption, respectively.
			
 
				+This means that if there was a preemption attempt during the
			
 
				+RCU-sched read-side critical section, <tt>rcu_read_unlock_sched()</tt>
			
 
				+will enter the scheduler, with all the latency and overhead entailed.
			
 
				+Just as with <tt>rcu_read_unlock_bh()</tt>, this can make it look
			
 
				+as if <tt>rcu_read_unlock_sched()</tt> was executing very slowly.
			
 
				+However, the highest-priority task won't be preempted, so that task
			
 
				+will enjoy low-overhead <tt>rcu_read_unlock_sched()</tt> invocations.
			
 
				+
			
 
				+<p>
			
 
				+The
			
 
				+<a href="https://lwn.net/Articles/609973/#RCU Per-Flavor API Table">RCU-sched API</a>
			
 
				+includes
			
 
				+<tt>rcu_read_lock_sched()</tt>,
			
 
				+<tt>rcu_read_unlock_sched()</tt>,
			
 
				+<tt>rcu_read_lock_sched_notrace()</tt>,
			
 
				+<tt>rcu_read_unlock_sched_notrace()</tt>,
			
 
				+<tt>rcu_dereference_sched()</tt>,
			
 
				+<tt>rcu_dereference_sched_check()</tt>,
			
 
				+<tt>synchronize_sched()</tt>,
			
 
				+<tt>synchronize_sched_expedited()</tt>,
			
 
				+<tt>call_rcu_sched()</tt>,
			
 
				+<tt>rcu_barrier_sched()</tt>, and
			
 
				+<tt>rcu_read_lock_sched_held()</tt>.
			
 
				+However, anything that disables preemption also marks an RCU-sched
			
 
				+read-side critical section, including
			
 
				+<tt>preempt_disable()</tt> and <tt>preempt_enable()</tt>,
			
 
				+<tt>local_irq_save()</tt> and <tt>local_irq_restore()</tt>,
			
 
				+and so on.
			
 
				+
			
 
				+<h3><a name="Sleepable RCU">Sleepable RCU</a></h3>
			
 
				+
			
 
				+<p>
			
 
				+For well over a decade, someone saying &ldquo;I need to block within
			
 
				+an RCU read-side critical section&rdquo; was a reliable indication
			
 
				+that this someone did not understand RCU.
			
 
				+After all, if you are always blocking in an RCU read-side critical
			
 
				+section, you can probably afford to use a higher-overhead synchronization
			
 
				+mechanism.
			
 
				+However, that changed with the advent of the Linux kernel's notifiers,
			
 
				+whose RCU read-side critical
			
 
				+sections almost never sleep, but sometimes need to.
			
 
				+This resulted in the introduction of
			
 
				+<a href="https://lwn.net/Articles/202847/">sleepable RCU</a>,
			
 
				+or <i>SRCU</i>.
			
 
				+
			
 
				+<p>
			
 
				+SRCU allows different domains to be defined, with each such domain
			
 
				+defined by an instance of an <tt>srcu_struct</tt> structure.
			
 
				+A pointer to this structure must be passed in to each SRCU function,
			
 
				+for example, <tt>synchronize_srcu(&amp;ss)</tt>, where
			
 
				+<tt>ss</tt> is the <tt>srcu_struct</tt> structure.
			
 
				+The key benefit of these domains is that a slow SRCU reader in one
			
 
				+domain does not delay an SRCU grace period in some other domain.
			
 
				+That said, one consequence of these domains is that read-side code
			
 
				+must pass a &ldquo;cookie&rdquo; from <tt>srcu_read_lock()</tt>
			
 
				+to <tt>srcu_read_unlock()</tt>, for example, as follows:
			
 
				+
			
 
				+<blockquote>
			
 
				+<pre>
			
 
				+ 1 int idx;
			
 
				+ 2
			
 
				+ 3 idx = srcu_read_lock(&amp;ss);
			
 
				+ 4 do_something();
			
 
				+ 5 srcu_read_unlock(&amp;ss, idx);
			
 
				+</pre>
			
 
				+</blockquote>
			
 
				+
			
 
				+<p>
			
 
				+As noted above, it is legal to block within SRCU read-side critical sections,
			
 
				+however, with great power comes great responsibility.
			
 
				+If you block forever in one of a given domain's SRCU read-side critical
			
 
				+sections, then that domain's grace periods will also be blocked forever.
			
 
				+Of course, one good way to block forever is to deadlock, which can
			
 
				+happen if any operation in a given domain's SRCU read-side critical
			
 
				+section can block waiting, either directly or indirectly, for that domain's
			
 
				+grace period to elapse.
			
 
				+For example, this results in a self-deadlock:
			
 
				+
			
 
				+<blockquote>
			
 
				+<pre>
			
 
				+ 1 int idx;
			
 
				+ 2
			
 
				+ 3 idx = srcu_read_lock(&amp;ss);
			
 
				+ 4 do_something();
			
 
				+ 5 synchronize_srcu(&amp;ss);
			
 
				+ 6 srcu_read_unlock(&amp;ss, idx);
			
 
				+</pre>
			
 
				+</blockquote>
			
 
				+
			
 
				+<p>
			
 
				+However, if line&nbsp;5 acquired a mutex that was held across
			
 
				+a <tt>synchronize_srcu()</tt> for domain <tt>ss</tt>,
			
 
				+deadlock would still be possible.
			
 
				+Furthermore, if line&nbsp;5 acquired a mutex that was held across
			
 
				+a <tt>synchronize_srcu()</tt> for some other domain <tt>ss1</tt>,
			
 
				+and if an <tt>ss1</tt>-domain SRCU read-side critical section
			
 
				+acquired another mutex that was held across an <tt>ss</tt>-domain
			
 
				+<tt>synchronize_srcu()</tt>,
			
 
				+deadlock would again be possible.
			
 
				+Such a deadlock cycle could extend across an arbitrarily large number
			
 
				+of different SRCU domains.
			
 
				+Again, with great power comes great responsibility.
			
 
				+
			
 
				+<p>
			
 
				+Unlike the other RCU flavors, SRCU read-side critical sections can
			
 
				+run on idle and even offline CPUs.
			
 
				+This ability requires that <tt>srcu_read_lock()</tt> and
			
 
				+<tt>srcu_read_unlock()</tt> contain memory barriers, which means
			
 
				+that SRCU readers will run a bit slower than would RCU readers.
			
 
				+It also motivates the <tt>smp_mb__after_srcu_read_unlock()</tt>
			
 
				+API, which, in combination with <tt>srcu_read_unlock()</tt>,
			
 
				+guarantees a full memory barrier.
			
 
				+
			
 
				+<p>
			
 
				+The
			
 
				+<a href="https://lwn.net/Articles/609973/#RCU Per-Flavor API Table">SRCU API</a>
			
 
				+includes
			
 
				+<tt>srcu_read_lock()</tt>,
			
 
				+<tt>srcu_read_unlock()</tt>,
			
 
				+<tt>srcu_dereference()</tt>,
			
 
				+<tt>srcu_dereference_check()</tt>,
			
 
				+<tt>synchronize_srcu()</tt>,
			
 
				+<tt>synchronize_srcu_expedited()</tt>,
			
 
				+<tt>call_srcu()</tt>,
			
 
				+<tt>srcu_barrier()</tt>, and
			
 
				+<tt>srcu_read_lock_held()</tt>.
			
 
				+It also includes
			
 
				+<tt>DEFINE_SRCU()</tt>,
			
 
				+<tt>DEFINE_STATIC_SRCU()</tt>, and
			
 
				+<tt>init_srcu_struct()</tt>
			
 
				+APIs for defining and initializing <tt>srcu_struct</tt> structures.
			
 
				+
			
 
				+<h3><a name="Tasks RCU">Tasks RCU</a></h3>
			
 
				+
			
 
				+<p>
			
 
				+Some forms of tracing use &ldquo;trampolines&rdquo; to handle the
			
 
				+binary rewriting required to install different types of probes.
			
 
				+It would be good to be able to free old trampolines, which sounds
			
 
				+like a job for some form of RCU.
			
 
				+However, because it is necessary to be able to install a trace
			
 
				+anywhere in the code, it is not possible to use read-side markers
			
 
				+such as <tt>rcu_read_lock()</tt> and <tt>rcu_read_unlock()</tt>.
			
 
				+In addition, it does not work to have these markers in the trampoline
			
 
				+itself, because there would need to be instructions following
			
 
				+<tt>rcu_read_unlock()</tt>.
			
 
				+Although <tt>synchronize_rcu()</tt> would guarantee that execution
			
 
				+reached the <tt>rcu_read_unlock()</tt>, it would not be able to
			
 
				+guarantee that execution had completely left the trampoline.
			
 
				+
			
 
				+<p>
			
 
				+The solution, in the form of
			
 
				+<a href="https://lwn.net/Articles/607117/"><i>Tasks RCU</i></a>,
			
 
				+is to have implicit
			
 
				+read-side critical sections that are delimited by voluntary context
			
 
				+switches, that is, calls to <tt>schedule()</tt>,
			
 
				+<tt>cond_resched_rcu_qs()</tt>, and
			
 
				+<tt>synchronize_rcu_tasks()</tt>.
			
 
				+In addition, transitions to and from userspace execution also delimit
			
 
				+tasks-RCU read-side critical sections.
			
 
				+
			
 
				+<p>
			
 
				+The tasks-RCU API is quite compact, consisting only of
			
 
				+<tt>call_rcu_tasks()</tt>,
			
 
				+<tt>synchronize_rcu_tasks()</tt>, and
			
 
				+<tt>rcu_barrier_tasks()</tt>.
			
 
				+
			
 
				+<h2><a name="Possible Future Changes">Possible Future Changes</a></h2>
			
 
				+
			
 
				+<p>
			
 
				+One of the tricks that RCU uses to attain update-side scalability is
			
 
				+to increase grace-period latency with increasing numbers of CPUs.
			
 
				+If this becomes a serious problem, it will be necessary to rework the
			
 
				+grace-period state machine so as to avoid the need for the additional
			
 
				+latency.
			
 
				+
			
 
				+<p>
			
 
				+Expedited grace periods scan the CPUs, so their latency and overhead
			
 
				+increase with increasing numbers of CPUs.
			
 
				+If this becomes a serious problem on large systems, it will be necessary
			
 
				+to do some redesign to avoid this scalability problem.
			
 
				+
			
 
				+<p>
			
 
				+RCU disables CPU hotplug in a few places, perhaps most notably in the
			
 
				+expedited grace-period and <tt>rcu_barrier()</tt> operations.
			
 
				+If there is a strong reason to use expedited grace periods in CPU-hotplug
			
 
				+notifiers, it will be necessary to avoid disabling CPU hotplug.
			
 
				+This would introduce some complexity, so there had better be a <i>very</i>
			
 
				+good reason.
			
 
				+
			
 
				+<p>
			
 
				+The tradeoff between grace-period latency on the one hand and interruptions
			
 
				+of other CPUs on the other hand may need to be re-examined.
			
 
				+The desire is of course for zero grace-period latency as well as zero
			
 
				+interprocessor interrupts undertaken during an expedited grace-period
			
 
				+operation.
			
 
				+While this ideal is unlikely to be achievable, it is quite possible that
			
 
				+further improvements can be made.
			
 
				+
			
 
				+<p>
			
 
				+The multiprocessor implementations of RCU use a combining tree that
			
 
				+groups CPUs so as to reduce lock contention and increase cache locality.
			
 
				+However, this combining tree does not spread its memory across NUMA
			
 
				+nodes nor does it align the CPU groups with hardware features such
			
 
				+as sockets or cores.
			
 
				+Such spreading and alignment is currently believed to be unnecessary
			
 
				+because the hotpath read-side primitives do not access the combining
			
 
				+tree, nor does <tt>call_rcu()</tt> in the common case.
			
 
				+If you believe that your architecture needs such spreading and alignment,
			
 
				+then your architecture should also benefit from the
			
 
				+<tt>rcutree.rcu_fanout_leaf</tt> boot parameter, which can be set
			
 
				+to the number of CPUs in a socket, NUMA node, or whatever.
			
 
				+If the number of CPUs is too large, use a fraction of the number of
			
 
				+CPUs.
			
 
				+If the number of CPUs is a large prime number, well, that certainly
			
 
				+is an &ldquo;interesting&rdquo; architectural choice!
			
 
				+More flexible arrangements might be considered, but only if
			
 
				+<tt>rcutree.rcu_fanout_leaf</tt> has proven inadequate, and only
			
 
				+if the inadequacy has been demonstrated by a carefully run and
			
 
				+realistic system-level workload.
			
 
				+
			
 
				+<p>
			
 
				+Please note that arrangements that require RCU to remap CPU numbers will
			
 
				+require extremely good demonstration of need and full exploration of
			
 
				+alternatives.
			
 
				+
			
 
				+<p>
			
 
				+There is an embarrassingly large number of flavors of RCU, and this
			
 
				+number has been increasing over time.
			
 
				+Perhaps it will be possible to combine some at some future date.
			
 
				+
			
 
				+<p>
			
 
				+RCU's various kthreads are reasonably recent additions.
			
 
				+It is quite likely that adjustments will be required to more gracefully
			
 
				+handle extreme loads.
			
 
				+It might also be necessary to be able to relate CPU utilization by
			
 
				+RCU's kthreads and softirq handlers to the code that instigated this
			
 
				+CPU utilization.
			
 
				+For example, RCU callback overhead might be charged back to the
			
 
				+originating <tt>call_rcu()</tt> instance, though probably not
			
 
				+in production kernels.
			
 
				+
			
 
				+<h2><a name="Summary">Summary</a></h2>
			
 
				+
			
 
				+<p>
			
 
				+This document has presented more than two decades' worth of RCU
			
 
				+requirements.
			
 
				+Given that the requirements keep changing, this will not be the last
			
 
				+word on this subject, but at least it serves to get an important
			
 
				+subset of the requirements set forth.
			
 
				+
			
 
				+<h2><a name="Acknowledgments">Acknowledgments</a></h2>
			
 
				+
			
 
				+I am grateful to Steven Rostedt, Lai Jiangshan, Ingo Molnar,
			
 
				+Oleg Nesterov, Borislav Petkov, Peter Zijlstra, Boqun Feng, and
			
 
				+Andy Lutomirski for their help in rendering
			
 
				+this article human readable, and to Michelle Rankin for her support
			
 
				+of this effort.
			
 
				+Other contributions are acknowledged in the Linux kernel's git archive.
			
 
				+The cartoon is copyright (c) 2013 by Melissa Broussard,
			
 
				+and is provided
			
 
				+under the terms of the Creative Commons Attribution-Share Alike 3.0
			
 
				+United States license.
			
 
				+
			
 
				+<h3><a name="Answers to Quick Quizzes">
			
 
				+Answers to Quick Quizzes</a></h3>
			
 
				+
			
 
				+<a name="qq1answer"></a>
			
 
				+<p><b>Quick Quiz 1</b>:
			
 
				+Wait a minute!
			
 
				+You said that updaters can make useful forward progress concurrently
			
 
				+with readers, but pre-existing readers will block
			
 
				+<tt>synchronize_rcu()</tt>!!!
			
 
				+Just who are you trying to fool???
			
 
				+
			
 
				+
			
 
				+</p><p><b>Answer</b>:
			
 
				+First, if updaters do not wish to be blocked by readers, they can use
			
 
				+<tt>call_rcu()</tt> or <tt>kfree_rcu()</tt>, which will
			
 
				+be discussed later.
			
 
				+Second, even when using <tt>synchronize_rcu()</tt>, the other
			
 
				+update-side code does run concurrently with readers, whether pre-existing
			
 
				+or not.
			
 
				+
			
 
				+
			
 
				+</p><p><a href="#Quick%20Quiz%201"><b>Back to Quick Quiz 1</b>.</a>
			
 
				+
			
 
				+<a name="qq2answer"></a>
			
 
				+<p><b>Quick Quiz 2</b>:
			
 
				+Why is the <tt>synchronize_rcu()</tt> on line&nbsp;28 needed?
			
 
				+
			
 
				+
			
 
				+</p><p><b>Answer</b>:
			
 
				+Without that extra grace period, memory reordering could result in
			
 
				+<tt>do_something_dlm()</tt> executing <tt>do_something()</tt>
			
 
				+concurrently with the last bits of <tt>recovery()</tt>.
			
 
				+
			
 
				+
			
 
				+</p><p><a href="#Quick%20Quiz%202"><b>Back to Quick Quiz 2</b>.</a>
			
 
				+
			
 
				+<a name="qq3answer"></a>
			
 
				+<p><b>Quick Quiz 3</b>:
			
 
				+But <tt>rcu_assign_pointer()</tt> does nothing to prevent the
			
 
				+two assignments to <tt>p-&gt;a</tt> and <tt>p-&gt;b</tt>
			
 
				+from being reordered.
			
 
				+Can't that also cause problems?
			
 
				+
			
 
				+
			
 
				+</p><p><b>Answer</b>:
			
 
				+No, it cannot.
			
 
				+The readers cannot see either of these two fields until
			
 
				+the assignment to <tt>gp</tt>, by which time both fields are
			
 
				+fully initialized.
			
 
				+So reordering the assignments
			
 
				+to <tt>p-&gt;a</tt> and <tt>p-&gt;b</tt> cannot possibly
			
 
				+cause any problems.
			
 
				+
			
 
				+
			
 
				+</p><p><a href="#Quick%20Quiz%203"><b>Back to Quick Quiz 3</b>.</a>
			
 
				+
			
 
				+<a name="qq4answer"></a>
			
 
				+<p><b>Quick Quiz 4</b>:
			
 
				+Without the <tt>rcu_dereference()</tt> or the
			
 
				+<tt>rcu_access_pointer()</tt>, what destructive optimizations
			
 
				+might the compiler make use of?
			
 
				+
			
 
				+
			
 
				+</p><p><b>Answer</b>:
			
 
				+Let's start with what happens to <tt>do_something_gp()</tt>
			
 
				+if it fails to use <tt>rcu_dereference()</tt>.
			
 
				+It could reuse a value formerly fetched from this same pointer.
			
 
				+It could also fetch the pointer from <tt>gp</tt> in a byte-at-a-time
			
 
				+manner, resulting in <i>load tearing</i>, in turn resulting in a bytewise
			
 
				+mash-up of two distinct pointer values.
			
 
				+It might even use value-speculation optimizations, where it makes a wrong
			
 
				+guess, but by the time it gets around to checking the value, an update
			
 
				+has changed the pointer to match the wrong guess.
			
 
				+Too bad about any dereferences that returned pre-initialization garbage
			
 
				+in the meantime!
			
 
				+
			
 
				+<p>
			
 
				+For <tt>remove_gp_synchronous()</tt>, as long as all modifications
			
 
				+to <tt>gp</tt> are carried out while holding <tt>gp_lock</tt>,
			
 
				+the above optimizations are harmless.
			
 
				+However,
			
 
				+with <tt>CONFIG_SPARSE_RCU_POINTER=y</tt>,
			
 
				+<tt>sparse</tt> will complain if you
			
 
				+define <tt>gp</tt> with <tt>__rcu</tt> and then
			
 
				+access it without using
			
 
				+either <tt>rcu_access_pointer()</tt> or <tt>rcu_dereference()</tt>.
			
 
				+
			
 
				+
			
 
				+</p><p><a href="#Quick%20Quiz%204"><b>Back to Quick Quiz 4</b>.</a>
			
 
				+
			
 
				+<a name="qq5answer"></a>
			
 
				+<p><b>Quick Quiz 5</b>:
			
 
				+Given that multiple CPUs can start RCU read-side critical sections
			
 
				+at any time without any ordering whatsoever, how can RCU possibly tell whether
			
 
				+or not a given RCU read-side critical section starts before a
			
 
				+given instance of <tt>synchronize_rcu()</tt>?
			
 
				+
			
 
				+
			
 
				+</p><p><b>Answer</b>:
			
 
				+If RCU cannot tell whether or not a given
			
 
				+RCU read-side critical section starts before a
			
 
				+given instance of <tt>synchronize_rcu()</tt>,
			
 
				+then it must assume that the RCU read-side critical section
			
 
				+started first.
			
 
				+In other words, a given instance of <tt>synchronize_rcu()</tt>
			
 
				+can avoid waiting on a given RCU read-side critical section only
			
 
				+if it can prove that <tt>synchronize_rcu()</tt> started first.
			
 
				+
			
 
				+
			
 
				+</p><p><a href="#Quick%20Quiz%205"><b>Back to Quick Quiz 5</b>.</a>
			
 
				+
			
 
				+<a name="qq6answer"></a>
			
 
				+<p><b>Quick Quiz 6</b>:
			
 
				+The first and second guarantees require unbelievably strict ordering!
			
 
				+Are all these memory barriers <i>really</i> required?
			
 
				+
			
 
				+
			
 
				+</p><p><b>Answer</b>:
			
 
				+Yes, they really are required.
			
 
				+To see why the first guarantee is required, consider the following
			
 
				+sequence of events:
			
 
				+
			
 
				+<ol>
			
 
				+<li>	CPU 1: <tt>rcu_read_lock()</tt>
			
 
				+<li>	CPU 1: <tt>q = rcu_dereference(gp);
			
 
				+	/* Very likely to return p. */</tt>
			
 
				+<li>	CPU 0: <tt>list_del_rcu(p);</tt>
			
 
				+<li>	CPU 0: <tt>synchronize_rcu()</tt> starts.
			
 
				+<li>	CPU 1: <tt>do_something_with(q-&gt;a);
			
 
				+	/* No smp_mb(), so might happen after kfree(). */</tt>
			
 
				+<li>	CPU 1: <tt>rcu_read_unlock()</tt>
			
 
				+<li>	CPU 0: <tt>synchronize_rcu()</tt> returns.
			
 
				+<li>	CPU 0: <tt>kfree(p);</tt>
			
 
				+</ol>
			
 
				+
			
 
				+<p>
			
 
				+Therefore, there absolutely must be a full memory barrier between the
			
 
				+end of the RCU read-side critical section and the end of the
			
 
				+grace period.
			
 
				+
			
 
				+<p>
			
 
				+The sequence of events demonstrating the necessity of the second rule
			
 
				+is roughly similar:
			
 
				+
			
 
				+<ol>
			
 
				+<li>	CPU 0: <tt>list_del_rcu(p);</tt>
			
 
				+<li>	CPU 0: <tt>synchronize_rcu()</tt> starts.
			
 
				+<li>	CPU 1: <tt>rcu_read_lock()</tt>
			
 
				+<li>	CPU 1: <tt>q = rcu_dereference(gp);
			
 
				+	/* Might return p if no memory barrier. */</tt>
			
 
				+<li>	CPU 0: <tt>synchronize_rcu()</tt> returns.
			
 
				+<li>	CPU 0: <tt>kfree(p);</tt>
			
 
				+<li>	CPU 1: <tt>do_something_with(q-&gt;a); /* Boom!!! */</tt>
			
 
				+<li>	CPU 1: <tt>rcu_read_unlock()</tt>
			
 
				+</ol>
			
 
				+
			
 
				+<p>
			
 
				+And similarly, without a memory barrier between the beginning of the
			
 
				+grace period and the beginning of the RCU read-side critical section,
			
 
				+CPU&nbsp;1 might end up accessing the freelist.
			
 
				+
			
 
				+<p>
			
 
				+The &ldquo;as if&rdquo; rule of course applies, so that any implementation
			
 
				+that acts as if the appropriate memory barriers were in place is a
			
 
				+correct implementation.
			
 
				+That said, it is much easier to fool yourself into believing that you have
			
 
				+adhered to the as-if rule than it is to actually adhere to it!
			
 
				+
			
 
				+
			
 
				+</p><p><a href="#Quick%20Quiz%206"><b>Back to Quick Quiz 6</b>.</a>
			
 
				+
			
 
				+<a name="qq7answer"></a>
			
 
				+<p><b>Quick Quiz 7</b>:
			
 
				+But how does the upgrade-to-write operation exclude other readers?
			
 
				+
			
 
				+
			
 
				+</p><p><b>Answer</b>:
			
 
				+It doesn't, just like normal RCU updates, which also do not exclude
			
 
				+RCU readers.
			
 
				+
			
 
				+
			
 
				+</p><p><a href="#Quick%20Quiz%207"><b>Back to Quick Quiz 7</b>.</a>
			
 
				+
			
 
				+<a name="qq8answer"></a>
			
 
				+<p><b>Quick Quiz 8</b>:
			
 
				+Can't the compiler also reorder this code?
			
 
				+
			
 
				+
			
 
				+</p><p><b>Answer</b>:
			
 
				+No, the volatile casts in <tt>READ_ONCE()</tt> and
			
 
				+<tt>WRITE_ONCE()</tt> prevent the compiler from reordering in
			
 
				+this particular case.
			
 
				+
			
 
				+
			
 
				+</p><p><a href="#Quick%20Quiz%208"><b>Back to Quick Quiz 8</b>.</a>
			
 
				+
			
 
				+<a name="qq9answer"></a>
			
 
				+<p><b>Quick Quiz 9</b>:
			
 
				+Suppose that <tt>synchronize_rcu()</tt> did wait until all readers had completed.
			
 
				+Would the updater be able to rely on this?
			
 
				+
			
 
				+
			
 
				+</p><p><b>Answer</b>:
			
 
				+No.
			
 
				+Even if <tt>synchronize_rcu()</tt> were to wait until
			
 
				+all readers had completed, a new reader might start immediately after
			
 
				+<tt>synchronize_rcu()</tt> completed.
			
 
				+Therefore, the code following
			
 
				+<tt>synchronize_rcu()</tt> cannot rely on there being no readers
			
 
				+in any case.
			
 
				+
			
 
				+
			
 
				+</p><p><a href="#Quick%20Quiz%209"><b>Back to Quick Quiz 9</b>.</a>
			
 
				+
			
 
				+<a name="qq10answer"></a>
			
 
				+<p><b>Quick Quiz 10</b>:
			
 
				+How long a sequence of grace periods, each separated by an RCU read-side
			
 
				+critical section, would be required to partition the RCU read-side
			
 
				+critical sections at the beginning and end of the chain?
			
 
				+
			
 
				+
			
 
				+</p><p><b>Answer</b>:
			
 
				+In theory, an infinite number.
			
 
				+In practice, an unknown number that is sensitive to both implementation
			
 
				+details and timing considerations.
			
 
				+Therefore, even in practice, RCU users must abide by the theoretical rather
			
 
				+than the practical answer.
			
 
				+
			
 
				+
			
 
				+</p><p><a href="#Quick%20Quiz%2010"><b>Back to Quick Quiz 10</b>.</a>
			
 
				+
			
 
				+<a name="qq11answer"></a>
			
 
				+<p><b>Quick Quiz 11</b>:
			
 
				+What about sleeping locks?
			
 
				+
			
 
				+
			
 
				+</p><p><b>Answer</b>:
			
 
				+These are forbidden within Linux-kernel RCU read-side critical sections
			
 
				+because it is not legal to place a quiescent state (in this case,
			
 
				+voluntary context switch) within an RCU read-side critical section.
			
 
				+However, sleeping locks may be used within userspace RCU read-side critical
			
 
				+sections, and also within Linux-kernel sleepable RCU
			
 
				+<a href="#Sleepable RCU">(SRCU)</a>
			
 
				+read-side critical sections.
			
 
				+In addition, the -rt patchset turns spinlocks into sleeping locks so
			
 
				+that the corresponding critical sections can be preempted, which
			
 
				+also means that these sleeplockified spinlocks (but not other sleeping locks!)
			
 
				+may be acquired within -rt-Linux-kernel RCU read-side critical sections.
			
 
				+
			
 
				+<p>
			
 
				+Note that it <i>is</i> legal for a normal RCU read-side critical section
			
 
				+to conditionally acquire a sleeping lock (as in <tt>mutex_trylock()</tt>),
			
 
				+but only as long as it does not loop indefinitely attempting to
			
 
				+conditionally acquire that sleeping lock.
			
 
				+The key point is that things like <tt>mutex_trylock()</tt>
			
 
				+either return with the mutex held, or return an error indication if
			
 
				+the mutex was not immediately available.
			
 
				+Either way, <tt>mutex_trylock()</tt> returns immediately without sleeping.
			
 
				+
			
 
				+
			
 
				+</p><p><a href="#Quick%20Quiz%2011"><b>Back to Quick Quiz 11</b>.</a>
			
 
				+
			
 
				+<a name="qq12answer"></a>
			
 
				+<p><b>Quick Quiz 12</b>:
			
 
				+Why does line&nbsp;19 use <tt>rcu_access_pointer()</tt>?
			
 
				+After all, <tt>call_rcu()</tt> on line&nbsp;25 stores into the
			
 
				+structure, which would interact badly with concurrent insertions.
			
 
				+Doesn't this mean that <tt>rcu_dereference()</tt> is required?
			
 
				+
			
 
				+
			
 
				+</p><p><b>Answer</b>:
			
 
				+Presumably the <tt>-&gt;gp_lock</tt> acquired on line&nbsp;18 excludes
			
 
				+any changes, including any insertions that <tt>rcu_dereference()</tt>
			
 
				+would protect against.
			
 
				+Therefore, any insertions will be delayed until after <tt>-&gt;gp_lock</tt>
			
 
				+is released on line&nbsp;25, which in turn means that
			
 
				+<tt>rcu_access_pointer()</tt> suffices.
			
 
				+
			
 
				+
			
 
				+</p><p><a href="#Quick%20Quiz%2012"><b>Back to Quick Quiz 12</b>.</a>
			
 
				+
			
 
				+<a name="qq13answer"></a>
			
 
				+<p><b>Quick Quiz 13</b>:
			
 
				+Earlier it was claimed that <tt>call_rcu()</tt> and
			
 
				+<tt>kfree_rcu()</tt> allowed updaters to avoid being blocked
			
 
				+by readers.
			
 
				+But how can that be correct, given that the invocation of the callback
			
 
				+and the freeing of the memory (respectively) must still wait for
			
 
				+a grace period to elapse?
			
 
				+
			
 
				+
			
 
				+</p><p><b>Answer</b>:
			
 
				+We could define things this way, but keep in mind that this sort of
			
 
				+definition would say that updates in garbage-collected languages
			
 
				+cannot complete until the next time the garbage collector runs,
			
 
				+which does not seem at all reasonable.
			
 
				+The key point is that in most cases, an updater using either
			
 
				+<tt>call_rcu()</tt> or <tt>kfree_rcu()</tt> can proceed to the
			
 
				+next update as soon as it has invoked <tt>call_rcu()</tt> or
			
 
				+<tt>kfree_rcu()</tt>, without having to wait for a subsequent
			
 
				+grace period.
			
 
				+
			
 
				+
			
 
				+</p><p><a href="#Quick%20Quiz%2013"><b>Back to Quick Quiz 13</b>.</a>
			
 
				+
			
 
				+<a name="qq14answer"></a>
			
 
				+<p><b>Quick Quiz 14</b>:
			
 
				+So what happens with <tt>synchronize_rcu()</tt> during
			
 
				+scheduler initialization for <tt>CONFIG_PREEMPT=n</tt>
			
 
				+kernels?
			
 
				+
			
 
				+
			
 
				+</p><p><b>Answer</b>:
			
 
				+In <tt>CONFIG_PREEMPT=n</tt> kernels, <tt>synchronize_rcu()</tt>
			
 
				+maps directly to <tt>synchronize_sched()</tt>.
			
 
				+Therefore, <tt>synchronize_rcu()</tt> works normally throughout
			
 
				+boot in <tt>CONFIG_PREEMPT=n</tt> kernels.
			
 
				+However, your code must also work in <tt>CONFIG_PREEMPT=y</tt> kernels,
			
 
				+so it is still necessary to avoid invoking <tt>synchronize_rcu()</tt>
			
 
				+during scheduler initialization.
			
 
				+
			
 
				+
			
 
				+</p><p><a href="#Quick%20Quiz%2014"><b>Back to Quick Quiz 14</b>.</a>
			
 
				+
			
 
				+
			
 
				+</body></html>
			
--- a/Documentation/RCU/Design/Requirements/Requirements.htmlx
+++ b/Documentation/RCU/Design/Requirements/Requirements.htmlx
@@ -0,0 +1,2741 @@
 
				+<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN"
			
 
				+        "http://www.w3.org/TR/html4/loose.dtd">
			
 
				+        <html>
			
 
				+        <head><title>A Tour Through RCU's Requirements [LWN.net]</title>
			
 
				+        <meta HTTP-EQUIV="Content-Type" CONTENT="text/html; charset=utf-8">
			
 
				+
			
 
				+<h1>A Tour Through RCU's Requirements</h1>
			
 
				+
			
 
				+<p>Copyright IBM Corporation, 2015</p>
			
 
				+<p>Author: Paul E.&nbsp;McKenney</p>
			
 
				+<p><i>The initial version of this document appeared in the
			
 
				+<a href="https://lwn.net/">LWN</a> articles
			
 
				+<a href="https://lwn.net/Articles/652156/">here</a>,
			
 
				+<a href="https://lwn.net/Articles/652677/">here</a>, and
			
 
				+<a href="https://lwn.net/Articles/653326/">here</a>.</i></p>
			
 
				+
			
 
				+<h2>Introduction</h2>
			
 
				+
			
 
				+<p>
			
 
				+Read-copy update (RCU) is a synchronization mechanism that is often
			
 
				+used as a replacement for reader-writer locking.
			
 
				+RCU is unusual in that updaters do not block readers,
			
 
				+which means that RCU's read-side primitives can be exceedingly fast
			
 
				+and scalable.
			
 
				+In addition, updaters can make useful forward progress concurrently
			
 
				+with readers.
			
 
				+However, all this concurrency between RCU readers and updaters does raise
			
 
				+the question of exactly what RCU readers are doing, which in turn
			
 
				+raises the question of exactly what RCU's requirements are.
			
 
				+
			
 
				+<p>
			
 
				+This document therefore summarizes RCU's requirements, and can be thought
			
 
				+of as an informal, high-level specification for RCU.
			
 
				+It is important to understand that RCU's specification is primarily
			
 
				+empirical in nature;
			
 
				+in fact, I learned about many of these requirements the hard way.
			
 
				+This situation might cause some consternation; however, not only
			
 
				+has this learning process been a lot of fun, but it has also been
			
 
				+a great privilege to work with so many people willing to apply
			
 
				+technologies in interesting new ways.
			
 
				+
			
 
				+<p>
			
 
				+All that aside, here are the categories of currently known RCU requirements:
			
 
				+</p>
			
 
				+
			
 
				+<ol>
			
 
				+<li>	<a href="#Fundamental Requirements">
			
 
				+	Fundamental Requirements</a>
			
 
				+<li>	<a href="#Fundamental Non-Requirements">Fundamental Non-Requirements</a>
			
 
				+<li>	<a href="#Parallelism Facts of Life">
			
 
				+	Parallelism Facts of Life</a>
			
 
				+<li>	<a href="#Quality-of-Implementation Requirements">
			
 
				+	Quality-of-Implementation Requirements</a>
			
 
				+<li>	<a href="#Linux Kernel Complications">
			
 
				+	Linux Kernel Complications</a>
			
 
				+<li>	<a href="#Software-Engineering Requirements">
			
 
				+	Software-Engineering Requirements</a>
			
 
				+<li>	<a href="#Other RCU Flavors">
			
 
				+	Other RCU Flavors</a>
			
 
				+<li>	<a href="#Possible Future Changes">
			
 
				+	Possible Future Changes</a>
			
 
				+</ol>
			
 
				+
			
 
				+<p>
			
 
				+This is followed by a <a href="#Summary">summary</a>,
			
 
				+which is in turn followed by the inevitable
			
 
				+<a href="#Answers to Quick Quizzes">answers to the quick quizzes</a>.
			
 
				+
			
 
				+<h2><a name="Fundamental Requirements">Fundamental Requirements</a></h2>
			
 
				+
			
 
				+<p>
			
 
				+RCU's fundamental requirements are the closest thing RCU has to hard
			
 
				+mathematical requirements.
			
 
				+These are:
			
 
				+
			
 
				+<ol>
			
 
				+<li>	<a href="#Grace-Period Guarantee">
			
 
				+	Grace-Period Guarantee</a>
			
 
				+<li>	<a href="#Publish-Subscribe Guarantee">
			
 
				+	Publish-Subscribe Guarantee</a>
			
 
				+<li>	<a href="#Memory-Barrier Guarantees">
			
 
				+	Memory-Barrier Guarantees</a>
			
 
				+<li>	<a href="#RCU Primitives Guaranteed to Execute Unconditionally">
			
 
				+	RCU Primitives Guaranteed to Execute Unconditionally</a>
			
 
				+<li>	<a href="#Guaranteed Read-to-Write Upgrade">
			
 
				+	Guaranteed Read-to-Write Upgrade</a>
			
 
				+</ol>
			
 
				+
			
 
				+<h3><a name="Grace-Period Guarantee">Grace-Period Guarantee</a></h3>
			
 
				+
			
 
				+<p>
			
 
				+RCU's grace-period guarantee is unusual in being premeditated:
			
 
				+Jack Slingwine and I had this guarantee firmly in mind when we started
			
 
				+work on RCU (then called &ldquo;rclock&rdquo;) in the early 1990s.
			
 
				+That said, the past two decades of experience with RCU have produced
			
 
				+a much more detailed understanding of this guarantee.
			
 
				+
			
 
				+<p>
			
 
				+RCU's grace-period guarantee allows updaters to wait for the completion
			
 
				+of all pre-existing RCU read-side critical sections.
			
 
				+An RCU read-side critical section
			
 
				+begins with the marker <tt>rcu_read_lock()</tt> and ends with
			
 
				+the marker <tt>rcu_read_unlock()</tt>.
			
 
				+These markers may be nested, and RCU treats a nested set as one
			
 
				+big RCU read-side critical section.
			
 
				+Production-quality implementations of <tt>rcu_read_lock()</tt> and
			
 
				+<tt>rcu_read_unlock()</tt> are extremely lightweight, and in
			
 
				+fact have exactly zero overhead in Linux kernels built for production
			
 
				+use with <tt>CONFIG_PREEMPT=n</tt>.
			
 
				+
			
 
				+<p>
			
 
				+This guarantee allows ordering to be enforced with extremely low
			
 
				+overhead to readers, for example:
			
 
				+
			
 
				+<blockquote>
			
 
				+<pre>
			
 
				+ 1 int x, y;
			
 
				+ 2
			
 
				+ 3 void thread0(void)
			
 
				+ 4 {
			
 
				+ 5   rcu_read_lock();
			
 
				+ 6   r1 = READ_ONCE(x);
			
 
				+ 7   r2 = READ_ONCE(y);
			
 
				+ 8   rcu_read_unlock();
			
 
				+ 9 }
			
 
				+10
			
 
				+11 void thread1(void)
			
 
				+12 {
			
 
				+13   WRITE_ONCE(x, 1);
			
 
				+14   synchronize_rcu();
			
 
				+15   WRITE_ONCE(y, 1);
			
 
				+16 }
			
 
				+</pre>
			
 
				+</blockquote>
			
 
				+
			
 
				+<p>
			
 
				+Because the <tt>synchronize_rcu()</tt> on line&nbsp;14 waits for
			
 
				+all pre-existing readers, any instance of <tt>thread0()</tt> that
			
 
				+loads a value of zero from <tt>x</tt> must complete before
			
 
				+<tt>thread1()</tt> stores to <tt>y</tt>, so that instance must
			
 
				+also load a value of zero from <tt>y</tt>.
			
 
				+Similarly, any instance of <tt>thread0()</tt> that loads a value of
			
 
				+one from <tt>y</tt> must have started after the
			
 
				+<tt>synchronize_rcu()</tt> started, and must therefore also load
			
 
				+a value of one from <tt>x</tt>.
			
 
				+Therefore, the outcome:
			
 
				+<blockquote>
			
 
				+<pre>
			
 
				+(r1 == 0 &amp;&amp; r2 == 1)
			
 
				+</pre>
			
 
				+</blockquote>
			
 
				+cannot happen.
			
 
				+
			
 
				+<p>@@QQ@@
			
 
				+Wait a minute!
			
 
				+You said that updaters can make useful forward progress concurrently
			
 
				+with readers, but pre-existing readers will block
			
 
				+<tt>synchronize_rcu()</tt>!!!
			
 
				+Just who are you trying to fool???
			
 
				+<p>@@QQA@@
			
 
				+First, if updaters do not wish to be blocked by readers, they can use
			
 
				+<tt>call_rcu()</tt> or <tt>kfree_rcu()</tt>, which will
			
 
				+be discussed later.
			
 
				+Second, even when using <tt>synchronize_rcu()</tt>, the other
			
 
				+update-side code does run concurrently with readers, whether pre-existing
			
 
				+or not.
			
 
				+<p>@@QQE@@
			
 
				+
			
 
				+<p>
			
 
				+This scenario resembles one of the first uses of RCU in
			
 
				+<a href="https://en.wikipedia.org/wiki/DYNIX">DYNIX/ptx</a>,
			
 
				+which managed a distributed lock manager's transition into
			
 
				+a state suitable for handling recovery from node failure,
			
 
				+more or less as follows:
			
 
				+
			
 
				+<blockquote>
			
 
				+<pre>
			
 
				+ 1 #define STATE_NORMAL        0
			
 
				+ 2 #define STATE_WANT_RECOVERY 1
			
 
				+ 3 #define STATE_RECOVERING    2
			
 
				+ 4 #define STATE_WANT_NORMAL   3
			
 
				+ 5
			
 
				+ 6 int state = STATE_NORMAL;
			
 
				+ 7
			
 
				+ 8 void do_something_dlm(void)
			
 
				+ 9 {
			
 
				+10   int state_snap;
			
 
				+11
			
 
				+12   rcu_read_lock();
			
 
				+13   state_snap = READ_ONCE(state);
			
 
				+14   if (state_snap == STATE_NORMAL)
			
 
				+15     do_something();
			
 
				+16   else
			
 
				+17     do_something_carefully();
			
 
				+18   rcu_read_unlock();
			
 
				+19 }
			
 
				+20
			
 
				+21 void start_recovery(void)
			
 
				+22 {
			
 
				+23   WRITE_ONCE(state, STATE_WANT_RECOVERY);
			
 
				+24   synchronize_rcu();
			
 
				+25   WRITE_ONCE(state, STATE_RECOVERING);
			
 
				+26   recovery();
			
 
				+27   WRITE_ONCE(state, STATE_WANT_NORMAL);
			
 
				+28   synchronize_rcu();
			
 
				+29   WRITE_ONCE(state, STATE_NORMAL);
			
 
				+30 }
			
 
				+</pre>
			
 
				+</blockquote>
			
 
				+
			
 
				+<p>
			
 
				+The RCU read-side critical section in <tt>do_something_dlm()</tt>
			
 
				+works with the <tt>synchronize_rcu()</tt> in <tt>start_recovery()</tt>
			
 
				+to guarantee that <tt>do_something()</tt> never runs concurrently
			
 
				+with <tt>recovery()</tt>, but with little or no synchronization
			
 
				+overhead in <tt>do_something_dlm()</tt>.
			
 
				+
			
 
				+<p>@@QQ@@
			
 
				+Why is the <tt>synchronize_rcu()</tt> on line&nbsp;28 needed?
			
 
				+<p>@@QQA@@
			
 
				+Without that extra grace period, memory reordering could result in
			
 
				+<tt>do_something_dlm()</tt> executing <tt>do_something()</tt>
			
 
				+concurrently with the last bits of <tt>recovery()</tt>.
			
 
				+<p>@@QQE@@
			
 
				+
			
 
				+<p>
			
 
				+In order to avoid fatal problems such as deadlocks,
			
 
				+an RCU read-side critical section must not contain calls to
			
 
				+<tt>synchronize_rcu()</tt>.
			
 
				+Similarly, an RCU read-side critical section must not
			
 
				+contain anything that waits, directly or indirectly, on completion of
			
 
				+an invocation of <tt>synchronize_rcu()</tt>.
			
 
				+
			
 
				+<p>
			
 
				+Although RCU's grace-period guarantee is useful in and of itself, with
			
 
				+<a href="https://lwn.net/Articles/573497/">quite a few use cases</a>,
			
 
				+it would be good to be able to use RCU to coordinate read-side
			
 
				+access to linked data structures.
			
 
				+For this, the grace-period guarantee is not sufficient, as can
			
 
				+be seen in function <tt>add_gp_buggy()</tt> below.
			
 
				+We will look at the reader's code later, but in the meantime, just think of
			
 
				+the reader as locklessly picking up the <tt>gp</tt> pointer,
			
 
				+and, if the value loaded is non-<tt>NULL</tt>, locklessly accessing the
			
 
				+<tt>-&gt;a</tt> and <tt>-&gt;b</tt> fields.
			
 
				+
			
 
				+<blockquote>
			
 
				+<pre>
			
 
				+ 1 bool add_gp_buggy(int a, int b)
			
 
				+ 2 {
			
 
				+ 3   p = kmalloc(sizeof(*p), GFP_KERNEL);
			
 
				+ 4   if (!p)
			
 
				+ 5     return -ENOMEM;
			
 
				+ 6   spin_lock(&amp;gp_lock);
			
 
				+ 7   if (rcu_access_pointer(gp)) {
			
 
				+ 8     spin_unlock(&amp;gp_lock);
			
 
				+ 9     return false;
			
 
				+10   }
			
 
				+11   p-&gt;a = a;
			
 
				+12   p-&gt;b = a;
			
 
				+13   gp = p; /* ORDERING BUG */
			
 
				+14   spin_unlock(&amp;gp_lock);
			
 
				+15   return true;
			
 
				+16 }
			
 
				+</pre>
			
 
				+</blockquote>
			
 
				+
			
 
				+<p>
			
 
				+The problem is that both the compiler and weakly ordered CPUs are within
			
 
				+their rights to reorder this code as follows:
			
 
				+
			
 
				+<blockquote>
			
 
				+<pre>
			
 
				+ 1 bool add_gp_buggy_optimized(int a, int b)
			
 
				+ 2 {
			
 
				+ 3   p = kmalloc(sizeof(*p), GFP_KERNEL);
			
 
				+ 4   if (!p)
			
 
				+ 5     return -ENOMEM;
			
 
				+ 6   spin_lock(&amp;gp_lock);
			
 
				+ 7   if (rcu_access_pointer(gp)) {
			
 
				+ 8     spin_unlock(&amp;gp_lock);
			
 
				+ 9     return false;
			
 
				+10   }
			
 
				+<b>11   gp = p; /* ORDERING BUG */
			
 
				+12   p-&gt;a = a;
			
 
				+13   p-&gt;b = b;</b>
			
 
				+14   spin_unlock(&amp;gp_lock);
			
 
				+15   return true;
			
 
				+16 }
			
 
				+</pre>
			
 
				+</blockquote>
			
 
				+
			
 
				+<p>
			
 
				+If an RCU reader fetches <tt>gp</tt> just after
			
 
				+<tt>add_gp_buggy_optimized()</tt> executes line&nbsp;11,
			
 
				+it will see garbage in the <tt>-&gt;a</tt> and <tt>-&gt;b</tt>
			
 
				+fields.
			
 
				+And this is but one of many ways in which compiler and hardware optimizations
			
 
				+could cause trouble.
			
 
				+Therefore, we clearly need some way to prevent the compiler and the CPU from
			
 
				+reordering in this manner, which brings us to the publish-subscribe
			
 
				+guarantee discussed in the next section.
			
 
				+
			
 
				+<h3><a name="Publish-Subscribe Guarantee">Publish/Subscribe Guarantee</a></h3>
			
 
				+
			
 
				+<p>
			
 
				+RCU's publish-subscribe guarantee allows data to be inserted
			
 
				+into a linked data structure without disrupting RCU readers.
			
 
				+The updater uses <tt>rcu_assign_pointer()</tt> to insert the
			
 
				+new data, and readers use <tt>rcu_dereference()</tt> to
			
 
				+access data, whether new or old.
			
 
				+The following shows an example of insertion:
			
 
				+
			
 
				+<blockquote>
			
 
				+<pre>
			
 
				+ 1 bool add_gp(int a, int b)
			
 
				+ 2 {
			
 
				+ 3   p = kmalloc(sizeof(*p), GFP_KERNEL);
			
 
				+ 4   if (!p)
			
 
				+ 5     return -ENOMEM;
			
 
				+ 6   spin_lock(&amp;gp_lock);
			
 
				+ 7   if (rcu_access_pointer(gp)) {
			
 
				+ 8     spin_unlock(&amp;gp_lock);
			
 
				+ 9     return false;
			
 
				+10   }
			
 
				+11   p-&gt;a = a;
			
 
				+12   p-&gt;b = b;
			
 
				+13   rcu_assign_pointer(gp, p);
			
 
				+14   spin_unlock(&amp;gp_lock);
			
 
				+15   return true;
			
 
				+16 }
			
 
				+</pre>
			
 
				+</blockquote>
			
 
				+
			
 
				+<p>
			
 
				+The <tt>rcu_assign_pointer()</tt> on line&nbsp;13 is conceptually
			
 
				+equivalent to a simple assignment statement, but also guarantees
			
 
				+that its assignment will
			
 
				+happen after the two assignments in lines&nbsp;11 and&nbsp;12,
			
 
				+similar to the C11 <tt>memory_order_release</tt> store operation.
			
 
				+It also prevents any number of &ldquo;interesting&rdquo; compiler
			
 
				+optimizations, for example, the use of <tt>gp</tt> as a scratch
			
 
				+location immediately preceding the assignment.
			
 
				+
			
 
				+<p>@@QQ@@
			
 
				+But <tt>rcu_assign_pointer()</tt> does nothing to prevent the
			
 
				+two assignments to <tt>p-&gt;a</tt> and <tt>p-&gt;b</tt>
			
 
				+from being reordered.
			
 
				+Can't that also cause problems?
			
 
				+<p>@@QQA@@
			
 
				+No, it cannot.
			
 
				+The readers cannot see either of these two fields until
			
 
				+the assignment to <tt>gp</tt>, by which time both fields are
			
 
				+fully initialized.
			
 
				+So reordering the assignments
			
 
				+to <tt>p-&gt;a</tt> and <tt>p-&gt;b</tt> cannot possibly
			
 
				+cause any problems.
			
 
				+<p>@@QQE@@
			
 
				+
			
 
				+<p>
			
 
				+It is tempting to assume that the reader need not do anything special
			
 
				+to control its accesses to the RCU-protected data,
			
 
				+as shown in <tt>do_something_gp_buggy()</tt> below:
			
 
				+
			
 
				+<blockquote>
			
 
				+<pre>
			
 
				+ 1 bool do_something_gp_buggy(void)
			
 
				+ 2 {
			
 
				+ 3   rcu_read_lock();
			
 
				+ 4   p = gp;  /* OPTIMIZATIONS GALORE!!! */
			
 
				+ 5   if (p) {
			
 
				+ 6     do_something(p-&gt;a, p-&gt;b);
			
 
				+ 7     rcu_read_unlock();
			
 
				+ 8     return true;
			
 
				+ 9   }
			
 
				+10   rcu_read_unlock();
			
 
				+11   return false;
			
 
				+12 }
			
 
				+</pre>
			
 
				+</blockquote>
			
 
				+
			
 
				+<p>
			
 
				+However, this temptation must be resisted because there are a
			
 
				+surprisingly large number of ways that the compiler
			
 
				+(to say nothing of
			
 
				+<a href="https://h71000.www7.hp.com/wizard/wiz_2637.html">DEC Alpha CPUs</a>)
			
 
				+can trip this code up.
			
 
				+For but one example, if the compiler were short of registers, it
			
 
				+might choose to refetch from <tt>gp</tt> rather than keeping
			
 
				+a separate copy in <tt>p</tt> as follows:
			
 
				+
			
 
				+<blockquote>
			
 
				+<pre>
			
 
				+ 1 bool do_something_gp_buggy_optimized(void)
			
 
				+ 2 {
			
 
				+ 3   rcu_read_lock();
			
 
				+ 4   if (gp) { /* OPTIMIZATIONS GALORE!!! */
			
 
				+<b> 5     do_something(gp-&gt;a, gp-&gt;b);</b>
			
 
				+ 6     rcu_read_unlock();
			
 
				+ 7     return true;
			
 
				+ 8   }
			
 
				+ 9   rcu_read_unlock();
			
 
				+10   return false;
			
 
				+11 }
			
 
				+</pre>
			
 
				+</blockquote>
			
 
				+
			
 
				+<p>
			
 
				+If this function ran concurrently with a series of updates that
			
 
				+replaced the current structure with a new one,
			
 
				+the fetches of <tt>gp-&gt;a</tt>
			
 
				+and <tt>gp-&gt;b</tt> might well come from two different structures,
			
 
				+which could cause serious confusion.
			
 
				+To prevent this (and much else besides), <tt>do_something_gp()</tt> uses
			
 
				+<tt>rcu_dereference()</tt> to fetch from <tt>gp</tt>:
			
 
				+
			
 
				+<blockquote>
			
 
				+<pre>
			
 
				+ 1 bool do_something_gp(void)
			
 
				+ 2 {
			
 
				+ 3   rcu_read_lock();
			
 
				+ 4   p = rcu_dereference(gp);
			
 
				+ 5   if (p) {
			
 
				+ 6     do_something(p-&gt;a, p-&gt;b);
			
 
				+ 7     rcu_read_unlock();
			
 
				+ 8     return true;
			
 
				+ 9   }
			
 
				+10   rcu_read_unlock();
			
 
				+11   return false;
			
 
				+12 }
			
 
				+</pre>
			
 
				+</blockquote>
			
 
				+
			
 
				+<p>
			
 
				+The <tt>rcu_dereference()</tt> uses volatile casts and (for DEC Alpha)
			
 
				+memory barriers in the Linux kernel.
			
 
				+Should a
			
 
				+<a href="http://www.rdrop.com/users/paulmck/RCU/consume.2015.07.13a.pdf">high-quality implementation of C11 <tt>memory_order_consume</tt> [PDF]</a>
			
 
				+ever appear, then <tt>rcu_dereference()</tt> could be implemented
			
 
				+as a <tt>memory_order_consume</tt> load.
			
 
				+Regardless of the exact implementation, a pointer fetched by
			
 
				+<tt>rcu_dereference()</tt> may not be used outside of the
			
 
				+outermost RCU read-side critical section containing that
			
 
				+<tt>rcu_dereference()</tt>, unless protection of
			
 
				+the corresponding data element has been passed from RCU to some
			
 
				+other synchronization mechanism, most commonly locking or
			
 
				+<a href="https://www.kernel.org/doc/Documentation/RCU/rcuref.txt">reference counting</a>.
			
 
				+
			
 
				+<p>
			
 
				+In short, updaters use <tt>rcu_assign_pointer()</tt> and readers
			
 
				+use <tt>rcu_dereference()</tt>, and these two RCU API elements
			
 
				+work together to ensure that readers have a consistent view of
			
 
				+newly added data elements.
			
 
				+
			
 
				+<p>
			
 
				+Of course, it is also necessary to remove elements from RCU-protected
			
 
				+data structures, for example, using the following process:
			
 
				+
			
 
				+<ol>
			
 
				+<li>	Remove the data element from the enclosing structure.
			
 
				+<li>	Wait for all pre-existing RCU read-side critical sections
			
 
				+	to complete (because only pre-existing readers can possibly have
			
 
				+	a reference to the newly removed data element).
			
 
				+<li>	At this point, only the updater has a reference to the
			
 
				+	newly removed data element, so it can safely reclaim
			
 
				+	the data element, for example, by passing it to <tt>kfree()</tt>.
			
 
				+</ol>
			
 
				+
			
 
				+This process is implemented by <tt>remove_gp_synchronous()</tt>:
			
 
				+
			
 
				+<blockquote>
			
 
				+<pre>
			
 
				+ 1 bool remove_gp_synchronous(void)
			
 
				+ 2 {
			
 
				+ 3   struct foo *p;
			
 
				+ 4
			
 
				+ 5   spin_lock(&amp;gp_lock);
			
 
				+ 6   p = rcu_access_pointer(gp);
			
 
				+ 7   if (!p) {
			
 
				+ 8     spin_unlock(&amp;gp_lock);
			
 
				+ 9     return false;
			
 
				+10   }
			
 
				+11   rcu_assign_pointer(gp, NULL);
			
 
				+12   spin_unlock(&amp;gp_lock);
			
 
				+13   synchronize_rcu();
			
 
				+14   kfree(p);
			
 
				+15   return true;
			
 
				+16 }
			
 
				+</pre>
			
 
				+</blockquote>
			
 
				+
			
 
				+<p>
			
 
				+This function is straightforward, with line&nbsp;13 waiting for a grace
			
 
				+period before line&nbsp;14 frees the old data element.
			
 
				+This waiting ensures that readers will reach line&nbsp;7 of
			
 
				+<tt>do_something_gp()</tt> before the data element referenced by
			
 
				+<tt>p</tt> is freed.
			
 
				+The <tt>rcu_access_pointer()</tt> on line&nbsp;6 is similar to
			
 
				+<tt>rcu_dereference()</tt>, except that:
			
 
				+
			
 
				+<ol>
			
 
				+<li>	The value returned by <tt>rcu_access_pointer()</tt>
			
 
				+	cannot be dereferenced.
			
 
				+	If you want to access the value pointed to as well as
			
 
				+	the pointer itself, use <tt>rcu_dereference()</tt>
			
 
				+	instead of <tt>rcu_access_pointer()</tt>.
			
 
				+<li>	The call to <tt>rcu_access_pointer()</tt> need not be
			
 
				+	protected.
			
 
				+	In contrast, <tt>rcu_dereference()</tt> must either be
			
 
				+	within an RCU read-side critical section or in a code
			
 
				+	segment where the pointer cannot change, for example, in
			
 
				+	code protected by the corresponding update-side lock.
			
 
				+</ol>
			
 
				+
			
 
				+<p>@@QQ@@
			
 
				+Without the <tt>rcu_dereference()</tt> or the
			
 
				+<tt>rcu_access_pointer()</tt>, what destructive optimizations
			
 
				+might the compiler make use of?
			
 
				+<p>@@QQA@@
			
 
				+Let's start with what happens to <tt>do_something_gp()</tt>
			
 
				+if it fails to use <tt>rcu_dereference()</tt>.
			
 
				+It could reuse a value formerly fetched from this same pointer.
			
 
				+It could also fetch the pointer from <tt>gp</tt> in a byte-at-a-time
			
 
				+manner, resulting in <i>load tearing</i>, in turn resulting in a bytewise
			
 
				+mash-up of two distinct pointer values.
			
 
				+It might even use value-speculation optimizations, where it makes a wrong
			
 
				+guess, but by the time it gets around to checking the value, an update
			
 
				+has changed the pointer to match the wrong guess.
			
 
				+Too bad about any dereferences that returned pre-initialization garbage
			
 
				+in the meantime!
			
 
				+
			
 
				+<p>
			
 
				+For <tt>remove_gp_synchronous()</tt>, as long as all modifications
			
 
				+to <tt>gp</tt> are carried out while holding <tt>gp_lock</tt>,
			
 
				+the above optimizations are harmless.
			
 
				+However,
			
 
				+with <tt>CONFIG_SPARSE_RCU_POINTER=y</tt>,
			
 
				+<tt>sparse</tt> will complain if you
			
 
				+define <tt>gp</tt> with <tt>__rcu</tt> and then
			
 
				+access it without using
			
 
				+either <tt>rcu_access_pointer()</tt> or <tt>rcu_dereference()</tt>.
			
 
				+<p>@@QQE@@
			
 
				+
			
 
				+<p>
			
 
				+In short, RCU's publish-subscribe guarantee is provided by the combination
			
 
				+of <tt>rcu_assign_pointer()</tt> and <tt>rcu_dereference()</tt>.
			
 
				+This guarantee allows data elements to be safely added to RCU-protected
			
 
				+linked data structures without disrupting RCU readers.
			
 
				+This guarantee can be used in combination with the grace-period
			
 
				+guarantee to also allow data elements to be removed from RCU-protected
			
 
				+linked data structures, again without disrupting RCU readers.
			
 
				+
			
 
				+<p>
			
 
				+This guarantee was only partially premeditated.
			
 
				+DYNIX/ptx used an explicit memory barrier for publication, but had nothing
			
 
				+resembling <tt>rcu_dereference()</tt> for subscription, nor did it
			
 
				+have anything resembling the <tt>smp_read_barrier_depends()</tt>
			
 
				+that was later subsumed into <tt>rcu_dereference()</tt>.
			
 
				+The need for these operations made itself known quite suddenly at a
			
 
				+late-1990s meeting with the DEC Alpha architects, back in the days when
			
 
				+DEC was still a free-standing company.
			
 
				+It took the Alpha architects a good hour to convince me that any sort
			
 
				+of barrier would ever be needed, and it then took me a good <i>two</i> hours
			
 
				+to convince them that their documentation did not make this point clear.
			
 
				+More recent work with the C and C++ standards committees has provided
			
 
				+much education on tricks and traps from the compiler.
			
 
				+In short, compilers were much less tricky in the early 1990s, but in
			
 
				+2015, don't even think about omitting <tt>rcu_dereference()</tt>!
			
 
				+
			
 
				+<h3><a name="Memory-Barrier Guarantees">Memory-Barrier Guarantees</a></h3>
			
 
				+
			
 
				+<p>
			
 
				+The previous section's simple linked-data-structure scenario clearly
			
 
				+demonstrates the need for RCU's stringent memory-ordering guarantees on
			
 
				+systems with more than one CPU:
			
 
				+
			
 
				+<ol>
			
 
				+<li>	Each CPU that has an RCU read-side critical section that
			
 
				+	begins before <tt>synchronize_rcu()</tt> starts is
			
 
				+	guaranteed to execute a full memory barrier between the time
			
 
				+	that the RCU read-side critical section ends and the time that
			
 
				+	<tt>synchronize_rcu()</tt> returns.
			
 
				+	Without this guarantee, a pre-existing RCU read-side critical section
			
 
				+	might hold a reference to the newly removed <tt>struct foo</tt>
			
 
				+	after the <tt>kfree()</tt> on line&nbsp;14 of
			
 
				+	<tt>remove_gp_synchronous()</tt>.
			
 
				+<li>	Each CPU that has an RCU read-side critical section that ends
			
 
				+	after <tt>synchronize_rcu()</tt> returns is guaranteed
			
 
				+	to execute a full memory barrier between the time that
			
 
				+	<tt>synchronize_rcu()</tt> begins and the time that the RCU
			
 
				+	read-side critical section begins.
			
 
				+	Without this guarantee, a later RCU read-side critical section
			
 
				+	running after the <tt>kfree()</tt> on line&nbsp;14 of
			
 
				+	<tt>remove_gp_synchronous()</tt> might
			
 
				+	later run <tt>do_something_gp()</tt> and find the
			
 
				+	newly deleted <tt>struct foo</tt>.
			
 
				+<li>	If the task invoking <tt>synchronize_rcu()</tt> remains
			
 
				+	on a given CPU, then that CPU is guaranteed to execute a full
			
 
				+	memory barrier sometime during the execution of
			
 
				+	<tt>synchronize_rcu()</tt>.
			
 
				+	This guarantee ensures that the <tt>kfree()</tt> on
			
 
				+	line&nbsp;14 of <tt>remove_gp_synchronous()</tt> really does
			
 
				+	execute after the removal on line&nbsp;11.
			
 
				+<li>	If the task invoking <tt>synchronize_rcu()</tt> migrates
			
 
				+	among a group of CPUs during that invocation, then each of the
			
 
				+	CPUs in that group is guaranteed to execute a full memory barrier
			
 
				+	sometime during the execution of <tt>synchronize_rcu()</tt>.
			
 
				+	This guarantee also ensures that the <tt>kfree()</tt> on
			
 
				+	line&nbsp;14 of <tt>remove_gp_synchronous()</tt> really does
			
 
				+	execute after the removal on
			
 
				+	line&nbsp;11, but also in the case where the thread executing the
			
 
				+	<tt>synchronize_rcu()</tt> migrates in the meantime.
			
 
				+</ol>
			
 
				+
			
 
				+<p>@@QQ@@
			
 
				+Given that multiple CPUs can start RCU read-side critical sections
			
 
				+at any time without any ordering whatsoever, how can RCU possibly tell whether
			
 
				+or not a given RCU read-side critical section starts before a
			
 
				+given instance of <tt>synchronize_rcu()</tt>?
			
 
				+<p>@@QQA@@
			
 
				+If RCU cannot tell whether or not a given
			
 
				+RCU read-side critical section starts before a
			
 
				+given instance of <tt>synchronize_rcu()</tt>,
			
 
				+then it must assume that the RCU read-side critical section
			
 
				+started first.
			
 
				+In other words, a given instance of <tt>synchronize_rcu()</tt>
			
 
				+can avoid waiting on a given RCU read-side critical section only
			
 
				+if it can prove that <tt>synchronize_rcu()</tt> started first.
			
 
				+<p>@@QQE@@
			
 
				+
			
 
				+<p>@@QQ@@
			
 
				+The first and second guarantees require unbelievably strict ordering!
			
 
				+Are all these memory barriers <i>really</i> required?
			
 
				+<p>@@QQA@@
			
 
				+Yes, they really are required.
			
 
				+To see why the first guarantee is required, consider the following
			
 
				+sequence of events:
			
 
				+
			
 
				+<ol>
			
 
				+<li>	CPU 1: <tt>rcu_read_lock()</tt>
			
 
				+<li>	CPU 1: <tt>q = rcu_dereference(gp);
			
 
				+	/* Very likely to return p. */</tt>
			
 
				+<li>	CPU 0: <tt>list_del_rcu(p);</tt>
			
 
				+<li>	CPU 0: <tt>synchronize_rcu()</tt> starts.
			
 
				+<li>	CPU 1: <tt>do_something_with(q-&gt;a);
			
 
				+	/* No smp_mb(), so might happen after kfree(). */</tt>
			
 
				+<li>	CPU 1: <tt>rcu_read_unlock()</tt>
			
 
				+<li>	CPU 0: <tt>synchronize_rcu()</tt> returns.
			
 
				+<li>	CPU 0: <tt>kfree(p);</tt>
			
 
				+</ol>
			
 
				+
			
 
				+<p>
			
 
				+Therefore, there absolutely must be a full memory barrier between the
			
 
				+end of the RCU read-side critical section and the end of the
			
 
				+grace period.
			
 
				+
			
 
				+<p>
			
 
				+The sequence of events demonstrating the necessity of the second guarantee
			
 
				+is roughly similar:
			
 
				+
			
 
				+<ol>
			
 
				+<li>	CPU 0: <tt>list_del_rcu(p);</tt>
			
 
				+<li>	CPU 0: <tt>synchronize_rcu()</tt> starts.
			
 
				+<li>	CPU 1: <tt>rcu_read_lock()</tt>
			
 
				+<li>	CPU 1: <tt>q = rcu_dereference(gp);
			
 
				+	/* Might return p if no memory barrier. */</tt>
			
 
				+<li>	CPU 0: <tt>synchronize_rcu()</tt> returns.
			
 
				+<li>	CPU 0: <tt>kfree(p);</tt>
			
 
				+<li>	CPU 1: <tt>do_something_with(q-&gt;a); /* Boom!!! */</tt>
			
 
				+<li>	CPU 1: <tt>rcu_read_unlock()</tt>
			
 
				+</ol>
			
 
				+
			
 
				+<p>
			
 
				+And similarly, without a memory barrier between the beginning of the
			
 
				+grace period and the beginning of the RCU read-side critical section,
			
 
				+CPU&nbsp;1 might end up accessing the freelist.
			
 
				+
			
 
				+<p>
			
 
				+The &ldquo;as if&rdquo; rule of course applies, so that any implementation
			
 
				+that acts as if the appropriate memory barriers were in place is a
			
 
				+correct implementation.
			
 
				+That said, it is much easier to fool yourself into believing that you have
			
 
				+adhered to the as-if rule than it is to actually adhere to it!
			
 
				+<p>@@QQE@@
			
 
				+
			
 
				+<p>
			
 
				+Note that these memory-barrier requirements do not replace the fundamental
			
 
				+RCU requirement that a grace period wait for all pre-existing readers.
			
 
				+On the contrary, the memory barriers called out in this section must operate in
			
 
				+such a way as to <i>enforce</i> this fundamental requirement.
			
 
				+Of course, different implementations enforce this requirement in different
			
 
				+ways, but enforce it they must.
			
 
				+
			
 
				+<h3><a name="RCU Primitives Guaranteed to Execute Unconditionally">RCU Primitives Guaranteed to Execute Unconditionally</a></h3>
			
 
				+
			
 
				+<p>
			
 
				+The common-case RCU primitives are unconditional.
			
 
				+They are invoked, they do their job, and they return, with no possibility
			
 
				+of error, and no need to retry.
			
 
				+This is a key RCU design philosophy.
			
 
				+
			
 
				+<p>
			
 
				+However, this philosophy is pragmatic rather than pigheaded.
			
 
				+If someone comes up with a good justification for a particular conditional
			
 
				+RCU primitive, it might well be implemented and added.
			
 
				+After all, this guarantee was reverse-engineered, not premeditated.
			
 
				+The unconditional nature of the RCU primitives was initially an
			
 
				+accident of implementation, and later experience with synchronization
			
 
				+primitives with conditional primitives caused me to elevate this
			
 
				+accident to a guarantee.
			
 
				+Therefore, the justification for adding a conditional primitive to
			
 
				+RCU would need to be based on detailed and compelling use cases.
			
 
				+
			
 
				+<h3><a name="Guaranteed Read-to-Write Upgrade">Guaranteed Read-to-Write Upgrade</a></h3>
			
 
				+
			
 
				+<p>
			
 
				+As far as RCU is concerned, it is always possible to carry out an
			
 
				+update within an RCU read-side critical section.
			
 
				+For example, that RCU read-side critical section might search for
			
 
				+a given data element, and then might acquire the update-side
			
 
				+spinlock in order to update that element, all while remaining
			
 
				+in that RCU read-side critical section.
			
 
				+Of course, it is necessary to exit the RCU read-side critical section
			
 
				+before invoking <tt>synchronize_rcu()</tt>, however, this
			
 
				+inconvenience can be avoided through use of the
			
 
				+<tt>call_rcu()</tt> and <tt>kfree_rcu()</tt> API members
			
 
				+described later in this document.
			
 
				+
			
 
				+<p>@@QQ@@
			
 
				+But how does the upgrade-to-write operation exclude other readers?
			
 
				+<p>@@QQA@@
			
 
				+It doesn't, just like normal RCU updates, which also do not exclude
			
 
				+RCU readers.
			
 
				+<p>@@QQE@@
			
 
				+
			
 
				+<p>
			
 
				+This guarantee allows lookup code to be shared between read-side
			
 
				+and update-side code, and was premeditated, appearing in the earliest
			
 
				+DYNIX/ptx RCU documentation.
			
 
				+
			
 
				+<h2><a name="Fundamental Non-Requirements">Fundamental Non-Requirements</a></h2>
			
 
				+
			
 
				+<p>
			
 
				+RCU provides extremely lightweight readers, and its read-side guarantees,
			
 
				+though quite useful, are correspondingly lightweight.
			
 
				+It is therefore all too easy to assume that RCU is guaranteeing more
			
 
				+than it really is.
			
 
				+Of course, the list of things that RCU does not guarantee is infinitely
			
 
				+long, however, the following sections list a few non-guarantees that
			
 
				+have caused confusion.
			
 
				+Except where otherwise noted, these non-guarantees were premeditated.
			
 
				+
			
 
				+<ol>
			
 
				+<li>	<a href="#Readers Impose Minimal Ordering">
			
 
				+	Readers Impose Minimal Ordering</a>
			
 
				+<li>	<a href="#Readers Do Not Exclude Updaters">
			
 
				+	Readers Do Not Exclude Updaters</a>
			
 
				+<li>	<a href="#Updaters Only Wait For Old Readers">
			
 
				+	Updaters Only Wait For Old Readers</a>
			
 
				+<li>	<a href="#Grace Periods Don't Partition Read-Side Critical Sections">
			
 
				+	Grace Periods Don't Partition Read-Side Critical Sections</a>
			
 
				+<li>	<a href="#Read-Side Critical Sections Don't Partition Grace Periods">
			
 
				+	Read-Side Critical Sections Don't Partition Grace Periods</a>
			
 
				+<li>	<a href="#Disabling Preemption Does Not Block Grace Periods">
			
 
				+	Disabling Preemption Does Not Block Grace Periods</a>
			
 
				+</ol>
			
 
				+
			
 
				+<h3><a name="Readers Impose Minimal Ordering">Readers Impose Minimal Ordering</a></h3>
			
 
				+
			
 
				+<p>
			
 
				+Reader-side markers such as <tt>rcu_read_lock()</tt> and
			
 
				+<tt>rcu_read_unlock()</tt> provide absolutely no ordering guarantees
			
 
				+except through their interaction with the grace-period APIs such as
			
 
				+<tt>synchronize_rcu()</tt>.
			
 
				+To see this, consider the following pair of threads:
			
 
				+
			
 
				+<blockquote>
			
 
				+<pre>
			
 
				+ 1 void thread0(void)
			
 
				+ 2 {
			
 
				+ 3   rcu_read_lock();
			
 
				+ 4   WRITE_ONCE(x, 1);
			
 
				+ 5   rcu_read_unlock();
			
 
				+ 6   rcu_read_lock();
			
 
				+ 7   WRITE_ONCE(y, 1);
			
 
				+ 8   rcu_read_unlock();
			
 
				+ 9 }
			
 
				+10
			
 
				+11 void thread1(void)
			
 
				+12 {
			
 
				+13   rcu_read_lock();
			
 
				+14   r1 = READ_ONCE(y);
			
 
				+15   rcu_read_unlock();
			
 
				+16   rcu_read_lock();
			
 
				+17   r2 = READ_ONCE(x);
			
 
				+18   rcu_read_unlock();
			
 
				+19 }
			
 
				+</pre>
			
 
				+</blockquote>
			
 
				+
			
 
				+<p>
			
 
				+After <tt>thread0()</tt> and <tt>thread1()</tt> execute
			
 
				+concurrently, it is quite possible to have
			
 
				+
			
 
				+<blockquote>
			
 
				+<pre>
			
 
				+(r1 == 1 &amp;&amp; r2 == 0)
			
 
				+</pre>
			
 
				+</blockquote>
			
 
				+
			
 
				+(that is, <tt>y</tt> appears to have been assigned before <tt>x</tt>),
			
 
				+which would not be possible if <tt>rcu_read_lock()</tt> and
			
 
				+<tt>rcu_read_unlock()</tt> had much in the way of ordering
			
 
				+properties.
			
 
				+But they do not, so the CPU is within its rights
			
 
				+to do significant reordering.
			
 
				+This is by design:  Any significant ordering constraints would slow down
			
 
				+these fast-path APIs.
			
 
				+
			
 
				+<p>@@QQ@@
			
 
				+Can't the compiler also reorder this code?
			
 
				+<p>@@QQA@@
			
 
				+No, the volatile casts in <tt>READ_ONCE()</tt> and
			
 
				+<tt>WRITE_ONCE()</tt> prevent the compiler from reordering in
			
 
				+this particular case.
			
 
				+<p>@@QQE@@
			
 
				+
			
 
				+<h3><a name="Readers Do Not Exclude Updaters">Readers Do Not Exclude Updaters</a></h3>
			
 
				+
			
 
				+<p>
			
 
				+Neither <tt>rcu_read_lock()</tt> nor <tt>rcu_read_unlock()</tt>
			
 
				+excludes updates.
			
 
				+All they do is to prevent grace periods from ending.
			
 
				+The following example illustrates this:
			
 
				+
			
 
				+<blockquote>
			
 
				+<pre>
			
 
				+ 1 void thread0(void)
			
 
				+ 2 {
			
 
				+ 3   rcu_read_lock();
			
 
				+ 4   r1 = READ_ONCE(y);
			
 
				+ 5   if (r1) {
			
 
				+ 6     do_something_with_nonzero_x();
			
 
				+ 7     r2 = READ_ONCE(x);
			
 
				+ 8     WARN_ON(!r2); /* BUG!!! */
			
 
				+ 9   }
			
 
				+10   rcu_read_unlock();
			
 
				+11 }
			
 
				+12
			
 
				+13 void thread1(void)
			
 
				+14 {
			
 
				+15   spin_lock(&amp;my_lock);
			
 
				+16   WRITE_ONCE(x, 1);
			
 
				+17   WRITE_ONCE(y, 1);
			
 
				+18   spin_unlock(&amp;my_lock);
			
 
				+19 }
			
 
				+</pre>
			
 
				+</blockquote>
			
 
				+
			
 
				+<p>
			
 
				+If the <tt>thread0()</tt> function's <tt>rcu_read_lock()</tt>
			
 
				+excluded the <tt>thread1()</tt> function's update,
			
 
				+the <tt>WARN_ON()</tt> could never fire.
			
 
				+But the fact is that <tt>rcu_read_lock()</tt> does not exclude
			
 
				+much of anything aside from subsequent grace periods, of which
			
 
				+<tt>thread1()</tt> has none, so the
			
 
				+<tt>WARN_ON()</tt> can and does fire.
			
 
				+
			
 
				+<h3><a name="Updaters Only Wait For Old Readers">Updaters Only Wait For Old Readers</a></h3>
			
 
				+
			
 
				+<p>
			
 
				+It might be tempting to assume that after <tt>synchronize_rcu()</tt>
			
 
				+completes, there are no readers executing.
			
 
				+This temptation must be avoided because
			
 
				+new readers can start immediately after <tt>synchronize_rcu()</tt>
			
 
				+starts, and <tt>synchronize_rcu()</tt> is under no
			
 
				+obligation to wait for these new readers.
			
 
				+
			
 
				+<p>@@QQ@@
			
 
				+Suppose that <tt>synchronize_rcu()</tt> did wait until all readers had completed.
			
 
				+Would the updater be able to rely on this?
			
 
				+<p>@@QQA@@
			
 
				+No.
			
 
				+Even if <tt>synchronize_rcu()</tt> were to wait until
			
 
				+all readers had completed, a new reader might start immediately after
			
 
				+<tt>synchronize_rcu()</tt> completed.
			
 
				+Therefore, the code following
			
 
				+<tt>synchronize_rcu()</tt> cannot rely on there being no readers
			
 
				+in any case.
			
 
				+<p>@@QQE@@
			
 
				+
			
 
				+<h3><a name="Grace Periods Don't Partition Read-Side Critical Sections">
			
 
				+Grace Periods Don't Partition Read-Side Critical Sections</a></h3>
			
 
				+
			
 
				+<p>
			
 
				+It is tempting to assume that if any part of one RCU read-side critical
			
 
				+section precedes a given grace period, and if any part of another RCU
			
 
				+read-side critical section follows that same grace period, then all of
			
 
				+the first RCU read-side critical section must precede all of the second.
			
 
				+However, this just isn't the case: A single grace period does not
			
 
				+partition the set of RCU read-side critical sections.
			
 
				+An example of this situation can be illustrated as follows, where
			
 
				+<tt>a</tt>, <tt>b</tt>, and <tt>c</tt> are initially all zero:
			
 
				+
			
 
				+<blockquote>
			
 
				+<pre>
			
 
				+ 1 void thread0(void)
			
 
				+ 2 {
			
 
				+ 3   rcu_read_lock();
			
 
				+ 4   WRITE_ONCE(a, 1);
			
 
				+ 5   WRITE_ONCE(b, 1);
			
 
				+ 6   rcu_read_unlock();
			
 
				+ 7 }
			
 
				+ 8
			
 
				+ 9 void thread1(void)
			
 
				+10 {
			
 
				+11   r1 = READ_ONCE(a);
			
 
				+12   synchronize_rcu();
			
 
				+13   WRITE_ONCE(c, 1);
			
 
				+14 }
			
 
				+15
			
 
				+16 void thread2(void)
			
 
				+17 {
			
 
				+18   rcu_read_lock();
			
 
				+19   r2 = READ_ONCE(b);
			
 
				+20   r3 = READ_ONCE(c);
			
 
				+21   rcu_read_unlock();
			
 
				+22 }
			
 
				+</pre>
			
 
				+</blockquote>
			
 
				+
			
 
				+<p>
			
 
				+It turns out that the outcome:
			
 
				+
			
 
				+<blockquote>
			
 
				+<pre>
			
 
				+(r1 == 1 &amp;&amp; r2 == 0 &amp;&amp; r3 == 1)
			
 
				+</pre>
			
 
				+</blockquote>
			
 
				+
			
 
				+is entirely possible.
			
 
				+The following figure shows how this can happen, with each circled
			
 
				+<tt>QS</tt> indicating the point at which RCU recorded a
			
 
				+<i>quiescent state</i> for each thread, that is, a state in which
			
 
				+RCU knows that the thread cannot be in the midst of an RCU read-side
			
 
				+critical section that started before the current grace period:
			
 
				+
			
 
				+<p><img src="GPpartitionReaders1.svg" alt="GPpartitionReaders1.svg" width="60%"></p>
			
 
				+
			
 
				+<p>
			
 
				+If it is necessary to partition RCU read-side critical sections in this
			
 
				+manner, it is necessary to use two grace periods, where the first
			
 
				+grace period is known to end before the second grace period starts:
			
 
				+
			
 
				+<blockquote>
			
 
				+<pre>
			
 
				+ 1 void thread0(void)
			
 
				+ 2 {
			
 
				+ 3   rcu_read_lock();
			
 
				+ 4   WRITE_ONCE(a, 1);
			
 
				+ 5   WRITE_ONCE(b, 1);
			
 
				+ 6   rcu_read_unlock();
			
 
				+ 7 }
			
 
				+ 8
			
 
				+ 9 void thread1(void)
			
 
				+10 {
			
 
				+11   r1 = READ_ONCE(a);
			
 
				+12   synchronize_rcu();
			
 
				+13   WRITE_ONCE(c, 1);
			
 
				+14 }
			
 
				+15
			
 
				+16 void thread2(void)
			
 
				+17 {
			
 
				+18   r2 = READ_ONCE(c);
			
 
				+19   synchronize_rcu();
			
 
				+20   WRITE_ONCE(d, 1);
			
 
				+21 }
			
 
				+22
			
 
				+23 void thread3(void)
			
 
				+24 {
			
 
				+25   rcu_read_lock();
			
 
				+26   r3 = READ_ONCE(b);
			
 
				+27   r4 = READ_ONCE(d);
			
 
				+28   rcu_read_unlock();
			
 
				+29 }
			
 
				+</pre>
			
 
				+</blockquote>
			
 
				+
			
 
				+<p>
			
 
				+Here, if <tt>(r1 == 1)</tt>, then
			
 
				+<tt>thread0()</tt>'s write to <tt>b</tt> must happen
			
 
				+before the end of <tt>thread1()</tt>'s grace period.
			
 
				+If in addition <tt>(r4 == 1)</tt>, then
			
 
				+<tt>thread3()</tt>'s read from <tt>b</tt> must happen
			
 
				+after the beginning of <tt>thread2()</tt>'s grace period.
			
 
				+If it is also the case that <tt>(r2 == 1)</tt>, then the
			
 
				+end of <tt>thread1()</tt>'s grace period must precede the
			
 
				+beginning of <tt>thread2()</tt>'s grace period.
			
 
				+This means that the two RCU read-side critical sections cannot overlap,
			
 
				+guaranteeing that <tt>(r3 == 1)</tt>.
			
 
				+As a result, the outcome:
			
 
				+
			
 
				+<blockquote>
			
 
				+<pre>
			
 
				+(r1 == 1 &amp;&amp; r2 == 1 &amp;&amp; r3 == 0 &amp;&amp; r4 == 1)
			
 
				+</pre>
			
 
				+</blockquote>
			
 
				+
			
 
				+cannot happen.
			
 
				+
			
 
				+<p>
			
 
				+This non-requirement was also non-premeditated, but became apparent
			
 
				+when studying RCU's interaction with memory ordering.
			
 
				+
			
 
				+<h3><a name="Read-Side Critical Sections Don't Partition Grace Periods">
			
 
				+Read-Side Critical Sections Don't Partition Grace Periods</a></h3>
			
 
				+
			
 
				+<p>
			
 
				+It is also tempting to assume that if an RCU read-side critical section
			
 
				+happens between a pair of grace periods, then those grace periods cannot
			
 
				+overlap.
			
 
				+However, this temptation leads nowhere good, as can be illustrated by
			
 
				+the following, with all variables initially zero:
			
 
				+
			
 
				+<blockquote>
			
 
				+<pre>
			
 
				+ 1 void thread0(void)
			
 
				+ 2 {
			
 
				+ 3   rcu_read_lock();
			
 
				+ 4   WRITE_ONCE(a, 1);
			
 
				+ 5   WRITE_ONCE(b, 1);
			
 
				+ 6   rcu_read_unlock();
			
 
				+ 7 }
			
 
				+ 8
			
 
				+ 9 void thread1(void)
			
 
				+10 {
			
 
				+11   r1 = READ_ONCE(a);
			
 
				+12   synchronize_rcu();
			
 
				+13   WRITE_ONCE(c, 1);
			
 
				+14 }
			
 
				+15
			
 
				+16 void thread2(void)
			
 
				+17 {
			
 
				+18   rcu_read_lock();
			
 
				+19   WRITE_ONCE(d, 1);
			
 
				+20   r2 = READ_ONCE(c);
			
 
				+21   rcu_read_unlock();
			
 
				+22 }
			
 
				+23
			
 
				+24 void thread3(void)
			
 
				+25 {
			
 
				+26   r3 = READ_ONCE(d);
			
 
				+27   synchronize_rcu();
			
 
				+28   WRITE_ONCE(e, 1);
			
 
				+29 }
			
 
				+30
			
 
				+31 void thread4(void)
			
 
				+32 {
			
 
				+33   rcu_read_lock();
			
 
				+34   r4 = READ_ONCE(b);
			
 
				+35   r5 = READ_ONCE(e);
			
 
				+36   rcu_read_unlock();
			
 
				+37 }
			
 
				+</pre>
			
 
				+</blockquote>
			
 
				+
			
 
				+<p>
			
 
				+In this case, the outcome:
			
 
				+
			
 
				+<blockquote>
			
 
				+<pre>
			
 
				+(r1 == 1 &amp;&amp; r2 == 1 &amp;&amp; r3 == 1 &amp;&amp; r4 == 0 &amp;&amp; r5 == 1)
			
 
				+</pre>
			
 
				+</blockquote>
			
 
				+
			
 
				+is entirely possible, as illustrated below:
			
 
				+
			
 
				+<p><img src="ReadersPartitionGP1.svg" alt="ReadersPartitionGP1.svg" width="100%"></p>
			
 
				+
			
 
				+<p>
			
 
				+Again, an RCU read-side critical section can overlap almost all of a
			
 
				+given grace period, just so long as it does not overlap the entire
			
 
				+grace period.
			
 
				+As a result, an RCU read-side critical section cannot partition a pair
			
 
				+of RCU grace periods.
			
 
				+
			
 
				+<p>@@QQ@@
			
 
				+How long a sequence of grace periods, each separated by an RCU read-side
			
 
				+critical section, would be required to partition the RCU read-side
			
 
				+critical sections at the beginning and end of the chain?
			
 
				+<p>@@QQA@@
			
 
				+In theory, an infinite number.
			
 
				+In practice, an unknown number that is sensitive to both implementation
			
 
				+details and timing considerations.
			
 
				+Therefore, even in practice, RCU users must abide by the theoretical rather
			
 
				+than the practical answer.
			
 
				+<p>@@QQE@@
			
 
				+
			
 
				+<h3><a name="Disabling Preemption Does Not Block Grace Periods">
			
 
				+Disabling Preemption Does Not Block Grace Periods</a></h3>
			
 
				+
			
 
				+<p>
			
 
				+There was a time when disabling preemption on any given CPU would block
			
 
				+subsequent grace periods.
			
 
				+However, this was an accident of implementation and is not a requirement.
			
 
				+And in the current Linux-kernel implementation, disabling preemption
			
 
				+on a given CPU in fact does not block grace periods, as Oleg Nesterov
			
 
				+<a href="https://lkml.kernel.org/g/20150614193825.GA19582@redhat.com">demonstrated</a>.
			
 
				+
			
 
				+<p>
			
 
				+If you need a preempt-disable region to block grace periods, you need to add
			
 
				+<tt>rcu_read_lock()</tt> and <tt>rcu_read_unlock()</tt>, for example
			
 
				+as follows:
			
 
				+
			
 
				+<blockquote>
			
 
				+<pre>
			
 
				+ 1 preempt_disable();
			
 
				+ 2 rcu_read_lock();
			
 
				+ 3 do_something();
			
 
				+ 4 rcu_read_unlock();
			
 
				+ 5 preempt_enable();
			
 
				+ 6
			
 
				+ 7 /* Spinlocks implicitly disable preemption. */
			
 
				+ 8 spin_lock(&amp;mylock);
			
 
				+ 9 rcu_read_lock();
			
 
				+10 do_something();
			
 
				+11 rcu_read_unlock();
			
 
				+12 spin_unlock(&amp;mylock);
			
 
				+</pre>
			
 
				+</blockquote>
			
 
				+
			
 
				+<p>
			
 
				+In theory, you could enter the RCU read-side critical section first,
			
 
				+but it is more efficient to keep the entire RCU read-side critical
			
 
				+section contained in the preempt-disable region as shown above.
			
 
				+Of course, RCU read-side critical sections that extend outside of
			
 
				+preempt-disable regions will work correctly, but such critical sections
			
 
				+can be preempted, which forces <tt>rcu_read_unlock()</tt> to do
			
 
				+more work.
			
 
				+And no, this is <i>not</i> an invitation to enclose all of your RCU
			
 
				+read-side critical sections within preempt-disable regions, because
			
 
				+doing so would degrade real-time response.
			
 
				+
			
 
				+<p>
			
 
				+This non-requirement appeared with preemptible RCU.
			
 
				+If you need a grace period that waits on non-preemptible code regions, use
			
 
				+<a href="#Sched Flavor">RCU-sched</a>.
			
 
				+
			
 
				+<h2><a name="Parallelism Facts of Life">Parallelism Facts of Life</a></h2>
			
 
				+
			
 
				+<p>
			
 
				+These parallelism facts of life are by no means specific to RCU, but
			
 
				+the RCU implementation must abide by them.
			
 
				+They therefore bear repeating:
			
 
				+
			
 
				+<ol>
			
 
				+<li>	Any CPU or task may be delayed at any time,
			
 
				+	and any attempts to avoid these delays by disabling
			
 
				+	preemption, interrupts, or whatever are completely futile.
			
 
				+	This is most obvious in preemptible user-level
			
 
				+	environments and in virtualized environments (where
			
 
				+	a given guest OS's VCPUs can be preempted at any time by
			
 
				+	the underlying hypervisor), but can also happen in bare-metal
			
 
				+	environments due to ECC errors, NMIs, and other hardware
			
 
				+	events.
			
 
				+	Although a delay of more than about 20 seconds can result
			
 
				+	in splats, the RCU implementation is obligated to use
			
 
				+	algorithms that can tolerate extremely long delays, but where
			
 
				+	&ldquo;extremely long&rdquo; is not long enough to allow
			
 
				+	wrap-around when incrementing a 64-bit counter.
			
 
				+<li>	Both the compiler and the CPU can reorder memory accesses.
			
 
				+	Where it matters, RCU must use compiler directives and
			
 
				+	memory-barrier instructions to preserve ordering.
			
 
				+<li>	Conflicting writes to memory locations in any given cache line
			
 
				+	will result in expensive cache misses.
			
 
				+	Greater numbers of concurrent writes and more-frequent
			
 
				+	concurrent writes will result in more dramatic slowdowns.
			
 
				+	RCU is therefore obligated to use algorithms that have
			
 
				+	sufficient locality to avoid significant performance and
			
 
				+	scalability problems.
			
 
				+<li>	As a rough rule of thumb, only one CPU's worth of processing
			
 
				+	may be carried out under the protection of any given exclusive
			
 
				+	lock.
			
 
				+	RCU must therefore use scalable locking designs.
			
 
				+<li>	Counters are finite, especially on 32-bit systems.
			
 
				+	RCU's use of counters must therefore tolerate counter wrap,
			
 
				+	or be designed such that counter wrap would take way more
			
 
				+	time than a single system is likely to run.
			
 
				+	An uptime of ten years is quite possible, a runtime
			
 
				+	of a century much less so.
			
 
				+	As an example of the latter, RCU's dyntick-idle nesting counter
			
 
				+	allows 54 bits for interrupt nesting level (this counter
			
 
				+	is 64 bits even on a 32-bit system).
			
 
				+	Overflowing this counter requires 2<sup>54</sup>
			
 
				+	half-interrupts on a given CPU without that CPU ever going idle.
			
 
				+	If a half-interrupt happened every microsecond, it would take
			
 
				+	570 years of runtime to overflow this counter, which is currently
			
 
				+	believed to be an acceptably long time.
			
 
				+<li>	Linux systems can have thousands of CPUs running a single
			
 
				+	Linux kernel in a single shared-memory environment.
			
 
				+	RCU must therefore pay close attention to high-end scalability.
			
 
				+</ol>
			
 
				+
			
 
				+<p>
			
 
				+This last parallelism fact of life means that RCU must pay special
			
 
				+attention to the preceding facts of life.
			
 
				+The idea that Linux might scale to systems with thousands of CPUs would
			
 
				+have been met with some skepticism in the 1990s, but these requirements
			
 
				+would otherwise have been unsurprising, even in the early 1990s.
			
 
				+
			
 
				+<h2><a name="Quality-of-Implementation Requirements">Quality-of-Implementation Requirements</a></h2>
			
 
				+
			
 
				+<p>
			
 
				+These sections list quality-of-implementation requirements.
			
 
				+Although an RCU implementation that ignores these requirements could
			
 
				+still be used, it would likely be subject to limitations that would
			
 
				+make it inappropriate for industrial-strength production use.
			
 
				+Classes of quality-of-implementation requirements are as follows:
			
 
				+
			
 
				+<ol>
			
 
				+<li>	<a href="#Specialization">Specialization</a>
			
 
				+<li>	<a href="#Performance and Scalability">Performance and Scalability</a>
			
 
				+<li>	<a href="#Composability">Composability</a>
			
 
				+<li>	<a href="#Corner Cases">Corner Cases</a>
			
 
				+</ol>
			
 
				+
			
 
				+<p>
			
 
				+These classes are covered in the following sections.
			
 
				+
			
 
				+<h3><a name="Specialization">Specialization</a></h3>
			
 
				+
			
 
				+<p>
			
 
				+RCU is and always has been intended primarily for read-mostly situations, as
			
 
				+illustrated by the following figure.
			
 
				+This means that RCU's read-side primitives are optimized, often at the
			
 
				+expense of its update-side primitives.
			
 
				+
			
 
				+<p><img src="RCUApplicability.svg" alt="RCUApplicability.svg" width="70%"></p>
			
 
				+
			
 
				+<p>
			
 
				+This focus on read-mostly situations means that RCU must interoperate
			
 
				+with other synchronization primitives.
			
 
				+For example, the <tt>add_gp()</tt> and <tt>remove_gp_synchronous()</tt>
			
 
				+examples discussed earlier use RCU to protect readers and locking to
			
 
				+coordinate updaters.
			
 
				+However, the need extends much farther, requiring that a variety of
			
 
				+synchronization primitives be legal within RCU read-side critical sections,
			
 
				+including spinlocks, sequence locks, atomic operations, reference
			
 
				+counters, and memory barriers.
			
 
				+
			
 
				+<p>@@QQ@@
			
 
				+What about sleeping locks?
			
 
				+<p>@@QQA@@
			
 
				+These are forbidden within Linux-kernel RCU read-side critical sections
			
 
				+because it is not legal to place a quiescent state (in this case,
			
 
				+voluntary context switch) within an RCU read-side critical section.
			
 
				+However, sleeping locks may be used within userspace RCU read-side critical
			
 
				+sections, and also within Linux-kernel sleepable RCU
			
 
				+<a href="#Sleepable RCU">(SRCU)</a>
			
 
				+read-side critical sections.
			
 
				+In addition, the -rt patchset turns spinlocks into sleeping locks so
			
 
				+that the corresponding critical sections can be preempted, which
			
 
				+also means that these sleeplockified spinlocks (but not other sleeping locks!)
			
 
				+may be acquired within -rt-Linux-kernel RCU read-side critical sections.
			
 
				+
			
 
				+<p>
			
 
				+Note that it <i>is</i> legal for a normal RCU read-side critical section
			
 
				+to conditionally acquire a sleeping lock (as in <tt>mutex_trylock()</tt>),
			
 
				+but only as long as it does not loop indefinitely attempting to
			
 
				+conditionally acquire that sleeping lock.
			
 
				+The key point is that things like <tt>mutex_trylock()</tt>
			
 
				+either return with the mutex held, or return an error indication if
			
 
				+the mutex was not immediately available.
			
 
				+Either way, <tt>mutex_trylock()</tt> returns immediately without sleeping.
			
 
				+<p>@@QQE@@
			
 
				+
			
 
				+<p>
			
 
				+It often comes as a surprise that many algorithms do not require a
			
 
				+consistent view of data, but many can function in that mode,
			
 
				+with network routing being the poster child.
			
 
				+Internet routing algorithms take significant time to propagate
			
 
				+updates, so that by the time an update arrives at a given system,
			
 
				+that system has been sending network traffic the wrong way for
			
 
				+a considerable length of time.
			
 
				+Having a few threads continue to send traffic the wrong way for a
			
 
				+few more milliseconds is clearly not a problem:  In the worst case,
			
 
				+TCP retransmissions will eventually get the data where it needs to go.
			
 
				+In general, when tracking the state of the universe outside of the
			
 
				+computer, some level of inconsistency must be tolerated due to
			
 
				+speed-of-light delays if nothing else.
			
 
				+
			
 
				+<p>
			
 
				+Furthermore, uncertainty about external state is inherent in many cases.
			
 
				+For example, a pair of veterinarians might use heartbeat to determine
			
 
				+whether or not a given cat was alive.
			
 
				+But how long should they wait after the last heartbeat to decide that
			
 
				+the cat is in fact dead?
			
 
				+Waiting less than 400 milliseconds makes no sense because this would
			
 
				+mean that a relaxed cat would be considered to cycle between death
			
 
				+and life more than 100 times per minute.
			
 
				+Moreover, just as with human beings, a cat's heart might stop for
			
 
				+some period of time, so the exact wait period is a judgment call.
			
 
				+One of our pair of veterinarians might wait 30 seconds before pronouncing
			
 
				+the cat dead, while the other might insist on waiting a full minute.
			
 
				+The two veterinarians would then disagree on the state of the cat during
			
 
				+the final 30 seconds of the minute following the last heartbeat, as
			
 
				+fancifully illustrated below:
			
 
				+
			
 
				+<p><img src="2013-08-is-it-dead.png" alt="2013-08-is-it-dead.png" width="431"></p>
			
 
				+
			
 
				+<p>
			
 
				+Interestingly enough, this same situation applies to hardware.
			
 
				+When push comes to shove, how do we tell whether or not some
			
 
				+external server has failed?
			
 
				+We send messages to it periodically, and declare it failed if we
			
 
				+don't receive a response within a given period of time.
			
 
				+Policy decisions can usually tolerate short
			
 
				+periods of inconsistency.
			
 
				+The policy was decided some time ago, and is only now being put into
			
 
				+effect, so a few milliseconds of delay is normally inconsequential.
			
 
				+
			
 
				+<p>
			
 
				+However, there are algorithms that absolutely must see consistent data.
			
 
				+For example, the translation from a user-level SystemV semaphore
			
 
				+ID to the corresponding in-kernel data structure is protected by RCU,
			
 
				+but it is absolutely forbidden to update a semaphore that has just been
			
 
				+removed.
			
 
				+In the Linux kernel, this need for consistency is accommodated by acquiring
			
 
				+spinlocks located in the in-kernel data structure from within
			
 
				+the RCU read-side critical section, and this is indicated by the
			
 
				+green box in the figure above.
			
 
				+Many other techniques may be used, and are in fact used within the
			
 
				+Linux kernel.
			
 
				+
			
 
				+<p>
			
 
				+In short, RCU is not required to maintain consistency, and other
			
 
				+mechanisms may be used in concert with RCU when consistency is required.
			
 
				+RCU's specialization allows it to do its job extremely well, and its
			
 
				+ability to interoperate with other synchronization mechanisms allows
			
 
				+the right mix of synchronization tools to be used for a given job.
			
 
				+
			
 
				+<h3><a name="Performance and Scalability">Performance and Scalability</a></h3>
			
 
				+
			
 
				+<p>
			
 
				+Energy efficiency is a critical component of performance today,
			
 
				+and Linux-kernel RCU implementations must therefore avoid unnecessarily
			
 
				+awakening idle CPUs.
			
 
				+I cannot claim that this requirement was premeditated.
			
 
				+In fact, I learned of it during a telephone conversation in which I
			
 
				+was given &ldquo;frank and open&rdquo; feedback on the importance
			
 
				+of energy efficiency in battery-powered systems and on specific
			
 
				+energy-efficiency shortcomings of the Linux-kernel RCU implementation.
			
 
				+In my experience, the battery-powered embedded community will consider
			
 
				+any unnecessary wakeups to be extremely unfriendly acts.
			
 
				+So much so that mere Linux-kernel-mailing-list posts are
			
 
				+insufficient to vent their ire.
			
 
				+
			
 
				+<p>
			
 
				+Memory consumption is not particularly important in most
			
 
				+situations, and has become decreasingly
			
 
				+so as memory sizes have expanded and memory
			
 
				+costs have plummeted.
			
 
				+However, as I learned from Matt Mackall's
			
 
				+<a href="http://elinux.org/Linux_Tiny-FAQ">bloatwatch</a>
			
 
				+efforts, memory footprint is critically important on single-CPU systems with
			
 
				+non-preemptible (<tt>CONFIG_PREEMPT=n</tt>) kernels, and thus
			
 
				+<a href="https://lkml.kernel.org/g/20090113221724.GA15307@linux.vnet.ibm.com">tiny RCU</a>
			
 
				+was born.
			
 
				+Josh Triplett has since taken over the small-memory banner with his
			
 
				+<a href="https://tiny.wiki.kernel.org/">Linux kernel tinification</a>
			
 
				+project, which resulted in
			
 
				+<a href="#Sleepable RCU">SRCU</a>
			
 
				+becoming optional for those kernels not needing it.
			
 
				+
			
 
				+<p>
			
 
				+The remaining performance requirements are, for the most part,
			
 
				+unsurprising.
			
 
				+For example, in keeping with RCU's read-side specialization,
			
 
				+<tt>rcu_dereference()</tt> should have negligible overhead (for
			
 
				+example, suppression of a few minor compiler optimizations).
			
 
				+Similarly, in non-preemptible environments, <tt>rcu_read_lock()</tt> and
			
 
				+<tt>rcu_read_unlock()</tt> should have exactly zero overhead.
			
 
				+
			
 
				+<p>
			
 
				+In preemptible environments, in the case where the RCU read-side
			
 
				+critical section was not preempted (as will be the case for the
			
 
				+highest-priority real-time process), <tt>rcu_read_lock()</tt> and
			
 
				+<tt>rcu_read_unlock()</tt> should have minimal overhead.
			
 
				+In particular, they should not contain atomic read-modify-write
			
 
				+operations, memory-barrier instructions, preemption disabling,
			
 
				+interrupt disabling, or backwards branches.
			
 
				+However, in the case where the RCU read-side critical section was preempted,
			
 
				+<tt>rcu_read_unlock()</tt> may acquire spinlocks and disable interrupts.
			
 
				+This is why it is better to nest an RCU read-side critical section
			
 
				+within a preempt-disable region than vice versa, at least in cases
			
 
				+where that critical section is short enough to avoid unduly degrading
			
 
				+real-time latencies.
			
 
				+
			
 
				+<p>
			
 
				+The <tt>synchronize_rcu()</tt> grace-period-wait primitive is
			
 
				+optimized for throughput.
			
 
				+It may therefore incur several milliseconds of latency in addition to
			
 
				+the duration of the longest RCU read-side critical section.
			
 
				+On the other hand, multiple concurrent invocations of
			
 
				+<tt>synchronize_rcu()</tt> are required to use batching optimizations
			
 
				+so that they can be satisfied by a single underlying grace-period-wait
			
 
				+operation.
			
 
				+For example, in the Linux kernel, it is not unusual for a single
			
 
				+grace-period-wait operation to serve more than
			
 
				+<a href="https://www.usenix.org/conference/2004-usenix-annual-technical-conference/making-rcu-safe-deep-sub-millisecond-response">1,000 separate invocations</a>
			
 
				+of <tt>synchronize_rcu()</tt>, thus amortizing the per-invocation
			
 
				+overhead down to nearly zero.
			
 
				+However, the grace-period optimization is also required to avoid
			
 
				+measurable degradation of real-time scheduling and interrupt latencies.
			
 
				+
			
 
				+<p>
			
 
				+In some cases, the multi-millisecond <tt>synchronize_rcu()</tt>
			
 
				+latencies are unacceptable.
			
 
				+In these cases, <tt>synchronize_rcu_expedited()</tt> may be used
			
 
				+instead, reducing the grace-period latency down to a few tens of
			
 
				+microseconds on small systems, at least in cases where the RCU read-side
			
 
				+critical sections are short.
			
 
				+There are currently no special latency requirements for
			
 
				+<tt>synchronize_rcu_expedited()</tt> on large systems, but,
			
 
				+consistent with the empirical nature of the RCU specification,
			
 
				+that is subject to change.
			
 
				+However, there most definitely are scalability requirements:
			
 
				+A storm of <tt>synchronize_rcu_expedited()</tt> invocations on 4096
			
 
				+CPUs should at least make reasonable forward progress.
			
 
				+In return for its shorter latencies, <tt>synchronize_rcu_expedited()</tt>
			
 
				+is permitted to impose modest degradation of real-time latency
			
 
				+on non-idle online CPUs.
			
 
				+That said, it will likely be necessary to take further steps to reduce this
			
 
				+degradation, hopefully to roughly that of a scheduling-clock interrupt.
			
 
				+
			
 
				+<p>
			
 
				+There are a number of situations where even
			
 
				+<tt>synchronize_rcu_expedited()</tt>'s reduced grace-period
			
 
				+latency is unacceptable.
			
 
				+In these situations, the asynchronous <tt>call_rcu()</tt> can be
			
 
				+used in place of <tt>synchronize_rcu()</tt> as follows:
			
 
				+
			
 
				+<blockquote>
			
 
				+<pre>
			
 
				+ 1 struct foo {
			
 
				+ 2   int a;
			
 
				+ 3   int b;
			
 
				+ 4   struct rcu_head rh;
			
 
				+ 5 };
			
 
				+ 6
			
 
				+ 7 static void remove_gp_cb(struct rcu_head *rhp)
			
 
				+ 8 {
			
 
				+ 9   struct foo *p = container_of(rhp, struct foo, rh);
			
 
				+10
			
 
				+11   kfree(p);
			
 
				+12 }
			
 
				+13
			
 
				+14 bool remove_gp_asynchronous(void)
			
 
				+15 {
			
 
				+16   struct foo *p;
			
 
				+17
			
 
				+18   spin_lock(&amp;gp_lock);
			
 
				+19   p = rcu_access_pointer(gp);
			
 
				+20   if (!p) {
			
 
				+21     spin_unlock(&amp;gp_lock);
			
 
				+22     return false;
			
 
				+23   }
			
 
				+24   rcu_assign_pointer(gp, NULL);
			
 
				+25   call_rcu(&amp;p-&gt;rh, remove_gp_cb);
			
 
				+26   spin_unlock(&amp;gp_lock);
			
 
				+27   return true;
			
 
				+28 }
			
 
				+</pre>
			
 
				+</blockquote>
			
 
				+
			
 
				+<p>
			
 
				+A definition of <tt>struct foo</tt> is finally needed, and appears
			
 
				+on lines&nbsp;1-5.
			
 
				+The function <tt>remove_gp_cb()</tt> is passed to <tt>call_rcu()</tt>
			
 
				+on line&nbsp;25, and will be invoked after the end of a subsequent
			
 
				+grace period.
			
 
				+This gets the same effect as <tt>remove_gp_synchronous()</tt>,
			
 
				+but without forcing the updater to wait for a grace period to elapse.
			
 
				+The <tt>call_rcu()</tt> function may be used in a number of
			
 
				+situations where neither <tt>synchronize_rcu()</tt> nor
			
 
				+<tt>synchronize_rcu_expedited()</tt> would be legal,
			
 
				+including within preempt-disable code, <tt>local_bh_disable()</tt> code,
			
 
				+interrupt-disable code, and interrupt handlers.
			
 
				+However, even <tt>call_rcu()</tt> is illegal within NMI handlers.
			
 
				+The callback function (<tt>remove_gp_cb()</tt> in this case) will be
			
 
				+executed within softirq (software interrupt) environment within the
			
 
				+Linux kernel,
			
 
				+either within a real softirq handler or under the protection
			
 
				+of <tt>local_bh_disable()</tt>.
			
 
				+In both the Linux kernel and in userspace, it is bad practice to
			
 
				+write an RCU callback function that takes too long.
			
 
				+Long-running operations should be relegated to separate threads or
			
 
				+(in the Linux kernel) workqueues.
			
 
				+
			
 
				+<p>@@QQ@@
			
 
				+Why does line&nbsp;19 use <tt>rcu_access_pointer()</tt>?
			
 
				+After all, <tt>call_rcu()</tt> on line&nbsp;25 stores into the
			
 
				+structure, which would interact badly with concurrent insertions.
			
 
				+Doesn't this mean that <tt>rcu_dereference()</tt> is required?
			
 
				+<p>@@QQA@@
			
 
				+Presumably the <tt>-&gt;gp_lock</tt> acquired on line&nbsp;18 excludes
			
 
				+any changes, including any insertions that <tt>rcu_dereference()</tt>
			
 
				+would protect against.
			
 
				+Therefore, any insertions will be delayed until after <tt>-&gt;gp_lock</tt>
			
 
				+is released on line&nbsp;26, which in turn means that
			
 
				+<tt>rcu_access_pointer()</tt> suffices.
			
 
				+<p>@@QQE@@
			
 
				+
			
 
				+<p>
			
 
				+However, all that <tt>remove_gp_cb()</tt> is doing is
			
 
				+invoking <tt>kfree()</tt> on the data element.
			
 
				+This is a common idiom, and is supported by <tt>kfree_rcu()</tt>,
			
 
				+which allows &ldquo;fire and forget&rdquo; operation as shown below:
			
 
				+
			
 
				+<blockquote>
			
 
				+<pre>
			
 
				+ 1 struct foo {
			
 
				+ 2   int a;
			
 
				+ 3   int b;
			
 
				+ 4   struct rcu_head rh;
			
 
				+ 5 };
			
 
				+ 6
			
 
				+ 7 bool remove_gp_faf(void)
			
 
				+ 8 {
			
 
				+ 9   struct foo *p;
			
 
				+10
			
 
				+11   spin_lock(&amp;gp_lock);
			
 
				+12   p = rcu_dereference(gp);
			
 
				+13   if (!p) {
			
 
				+14     spin_unlock(&amp;gp_lock);
			
 
				+15     return false;
			
 
				+16   }
			
 
				+17   rcu_assign_pointer(gp, NULL);
			
 
				+18   kfree_rcu(p, rh);
			
 
				+19   spin_unlock(&amp;gp_lock);
			
 
				+20   return true;
			
 
				+21 }
			
 
				+</pre>
			
 
				+</blockquote>
			
 
				+
			
 
				+<p>
			
 
				+Note that <tt>remove_gp_faf()</tt> simply invokes
			
 
				+<tt>kfree_rcu()</tt> and proceeds, without any need to pay any
			
 
				+further attention to the subsequent grace period and <tt>kfree()</tt>.
			
 
				+It is permissible to invoke <tt>kfree_rcu()</tt> from the same
			
 
				+environments as for <tt>call_rcu()</tt>.
			
 
				+Interestingly enough, DYNIX/ptx had the equivalents of
			
 
				+<tt>call_rcu()</tt> and <tt>kfree_rcu()</tt>, but not
			
 
				+<tt>synchronize_rcu()</tt>.
			
 
				+This was due to the fact that RCU was not heavily used within DYNIX/ptx,
			
 
				+so the very few places that needed something like
			
 
				+<tt>synchronize_rcu()</tt> simply open-coded it.
			
 
				+
			
 
				+<p>@@QQ@@
			
 
				+Earlier it was claimed that <tt>call_rcu()</tt> and
			
 
				+<tt>kfree_rcu()</tt> allowed updaters to avoid being blocked
			
 
				+by readers.
			
 
				+But how can that be correct, given that the invocation of the callback
			
 
				+and the freeing of the memory (respectively) must still wait for
			
 
				+a grace period to elapse?
			
 
				+<p>@@QQA@@
			
 
				+We could define things this way, but keep in mind that this sort of
			
 
				+definition would say that updates in garbage-collected languages
			
 
				+cannot complete until the next time the garbage collector runs,
			
 
				+which does not seem at all reasonable.
			
 
				+The key point is that in most cases, an updater using either
			
 
				+<tt>call_rcu()</tt> or <tt>kfree_rcu()</tt> can proceed to the
			
 
				+next update as soon as it has invoked <tt>call_rcu()</tt> or
			
 
				+<tt>kfree_rcu()</tt>, without having to wait for a subsequent
			
 
				+grace period.
			
 
				+<p>@@QQE@@
			
 
				+
			
 
				+<p>
			
 
				+But what if the updater must wait for the completion of code to be
			
 
				+executed after the end of the grace period, but has other tasks
			
 
				+that can be carried out in the meantime?
			
 
				+The polling-style <tt>get_state_synchronize_rcu()</tt> and
			
 
				+<tt>cond_synchronize_rcu()</tt> functions may be used for this
			
 
				+purpose, as shown below:
			
 
				+
			
 
				+<blockquote>
			
 
				+<pre>
			
 
				+ 1 bool remove_gp_poll(void)
			
 
				+ 2 {
			
 
				+ 3   struct foo *p;
			
 
				+ 4   unsigned long s;
			
 
				+ 5
			
 
				+ 6   spin_lock(&amp;gp_lock);
			
 
				+ 7   p = rcu_access_pointer(gp);
			
 
				+ 8   if (!p) {
			
 
				+ 9     spin_unlock(&amp;gp_lock);
			
 
				+10     return false;
			
 
				+11   }
			
 
				+12   rcu_assign_pointer(gp, NULL);
			
 
				+13   spin_unlock(&amp;gp_lock);
			
 
				+14   s = get_state_synchronize_rcu();
			
 
				+15   do_something_while_waiting();
			
 
				+16   cond_synchronize_rcu(s);
			
 
				+17   kfree(p);
			
 
				+18   return true;
			
 
				+19 }
			
 
				+</pre>
			
 
				+</blockquote>
			
 
				+
			
 
				+<p>
			
 
				+On line&nbsp;14, <tt>get_state_synchronize_rcu()</tt> obtains a
			
 
				+&ldquo;cookie&rdquo; from RCU,
			
 
				+then line&nbsp;15 carries out other tasks,
			
 
				+and finally, line&nbsp;16 returns immediately if a grace period has
			
 
				+elapsed in the meantime, but otherwise waits as required.
			
 
				+The need for <tt>get_state_synchronize_rcu()</tt> and
			
 
				+<tt>cond_synchronize_rcu()</tt> has appeared quite recently,
			
 
				+so it is too early to tell whether they will stand the test of time.
			
 
				+
			
 
				+<p>
			
 
				+RCU thus provides a range of tools to allow updaters to strike the
			
 
				+required tradeoff between latency, flexibility and CPU overhead.
			
 
				+
			
 
				+<h3><a name="Composability">Composability</a></h3>
			
 
				+
			
 
				+<p>
			
 
				+Composability has received much attention in recent years, perhaps in part
			
 
				+due to the collision of multicore hardware with object-oriented techniques
			
 
				+designed in single-threaded environments for single-threaded use.
			
 
				+And in theory, RCU read-side critical sections may be composed, and in
			
 
				+fact may be nested arbitrarily deeply.
			
 
				+In practice, as with all real-world implementations of composable
			
 
				+constructs, there are limitations.
			
 
				+
			
 
				+<p>
			
 
				+Implementations of RCU for which <tt>rcu_read_lock()</tt>
			
 
				+and <tt>rcu_read_unlock()</tt> generate no code, such as
			
 
				+Linux-kernel RCU when <tt>CONFIG_PREEMPT=n</tt>, can be
			
 
				+nested arbitrarily deeply.
			
 
				+After all, there is no overhead.
			
 
				+Except that if all these instances of <tt>rcu_read_lock()</tt>
			
 
				+and <tt>rcu_read_unlock()</tt> are visible to the compiler,
			
 
				+compilation will eventually fail due to exhausting memory,
			
 
				+mass storage, or user patience, whichever comes first.
			
 
				+If the nesting is not visible to the compiler, as is the case with
			
 
				+mutually recursive functions each in its own translation unit,
			
 
				+stack overflow will result.
			
 
				+If the nesting takes the form of loops, either the control variable
			
 
				+will overflow or (in the Linux kernel) you will get an RCU CPU stall warning.
			
 
				+Nevertheless, this class of RCU implementations is one
			
 
				+of the most composable constructs in existence.
			
 
				+
			
 
				+<p>
			
 
				+RCU implementations that explicitly track nesting depth
			
 
				+are limited by the nesting-depth counter.
			
 
				+For example, the Linux kernel's preemptible RCU limits nesting to
			
 
				+<tt>INT_MAX</tt>.
			
 
				+This should suffice for almost all practical purposes.
			
 
				+That said, a consecutive pair of RCU read-side critical sections
			
 
				+between which there is an operation that waits for a grace period
			
 
				+cannot be enclosed in another RCU read-side critical section.
			
 
				+This is because it is not legal to wait for a grace period within
			
 
				+an RCU read-side critical section:  To do so would result either
			
 
				+in deadlock or
			
 
				+in RCU implicitly splitting the enclosing RCU read-side critical
			
 
				+section, neither of which is conducive to a long-lived and prosperous
			
 
				+kernel.
			
 
				+
			
 
				+<p>
			
 
				+It is worth noting that RCU is not alone in limiting composability.
			
 
				+For example, many transactional-memory implementations prohibit
			
 
				+composing a pair of transactions separated by an irrevocable
			
 
				+operation (for example, a network receive operation).
			
 
				+For another example, lock-based critical sections can be composed
			
 
				+surprisingly freely, but only if deadlock is avoided.
			
 
				+
			
 
				+<p>
			
 
				+In short, although RCU read-side critical sections are highly composable,
			
 
				+care is required in some situations, just as is the case for any other
			
 
				+composable synchronization mechanism.
			
 
				+
			
 
				+<h3><a name="Corner Cases">Corner Cases</a></h3>
			
 
				+
			
 
				+<p>
			
 
				+A given RCU workload might have an endless and intense stream of
			
 
				+RCU read-side critical sections, perhaps even so intense that there
			
 
				+was never a point in time during which there was not at least one
			
 
				+RCU read-side critical section in flight.
			
 
				+RCU cannot allow this situation to block grace periods:  As long as
			
 
				+all the RCU read-side critical sections are finite, grace periods
			
 
				+must also be finite.
			
 
				+
			
 
				+<p>
			
 
				+That said, preemptible RCU implementations could potentially result
			
 
				+in RCU read-side critical sections being preempted for long durations,
			
 
				+which has the effect of creating a long-duration RCU read-side
			
 
				+critical section.
			
 
				+This situation can arise only in heavily loaded systems, but systems using
			
 
				+real-time priorities are of course more vulnerable.
			
 
				+Therefore, RCU priority boosting is provided to help deal with this
			
 
				+case.
			
 
				+That said, the exact requirements on RCU priority boosting will likely
			
 
				+evolve as more experience accumulates.
			
 
				+
			
 
				+<p>
			
 
				+Other workloads might have very high update rates.
			
 
				+Although one can argue that such workloads should instead use
			
 
				+something other than RCU, the fact remains that RCU must
			
 
				+handle such workloads gracefully.
			
 
				+This requirement is another factor driving batching of grace periods,
			
 
				+but it is also the driving force behind the checks for large numbers
			
 
				+of queued RCU callbacks in the <tt>call_rcu()</tt> code path.
			
 
				+Finally, high update rates should not delay RCU read-side critical
			
 
				+sections, although some read-side delays can occur when using
			
 
				+<tt>synchronize_rcu_expedited()</tt>, courtesy of this function's use
			
 
				+of <tt>try_stop_cpus()</tt>.
			
 
				+(In the future, <tt>synchronize_rcu_expedited()</tt> will be
			
 
				+converted to use lighter-weight inter-processor interrupts (IPIs),
			
 
				+but this will still disturb readers, though to a much smaller degree.)
			
 
				+
			
 
				+<p>
			
 
				+Although all three of these corner cases were understood in the early
			
 
				+1990s, a simple user-level test consisting of <tt>close(open(path))</tt>
			
 
				+in a tight loop
			
 
				+in the early 2000s suddenly provided a much deeper appreciation of the
			
 
				+high-update-rate corner case.
			
 
				+This test also motivated addition of some RCU code to react to high update
			
 
				+rates, for example, if a given CPU finds itself with more than 10,000
			
 
				+RCU callbacks queued, it will cause RCU to take evasive action by
			
 
				+more aggressively starting grace periods and more aggressively forcing
			
 
				+completion of grace-period processing.
			
 
				+This evasive action causes the grace period to complete more quickly,
			
 
				+but at the cost of restricting RCU's batching optimizations, thus
			
 
				+increasing the CPU overhead incurred by that grace period.
			
 
				+
			
 
				+<h2><a name="Software-Engineering Requirements">
			
 
				+Software-Engineering Requirements</a></h2>
			
 
				+
			
 
				+<p>
			
 
				+Between Murphy's Law and &ldquo;To err is human&rdquo;, it is necessary to
			
 
				+guard against mishaps and misuse:
			
 
				+
			
 
				+<ol>
			
 
				+<li>	It is all too easy to forget to use <tt>rcu_read_lock()</tt>
			
 
				+	everywhere that it is needed, so kernels built with
			
 
				+	<tt>CONFIG_PROVE_RCU=y</tt> will splat if
			
 
				+	<tt>rcu_dereference()</tt> is used outside of an
			
 
				+	RCU read-side critical section.
			
 
				+	Update-side code can use <tt>rcu_dereference_protected()</tt>,
			
 
				+	which takes a
			
 
				+	<a href="https://lwn.net/Articles/371986/">lockdep expression</a>
			
 
				+	to indicate what is providing the protection.
			
 
				+	If the indicated protection is not provided, a lockdep splat
			
 
				+	is emitted.
			
 
				+
			
 
				+	<p>
			
 
				+	Code shared between readers and updaters can use
			
 
				+	<tt>rcu_dereference_check()</tt>, which also takes a
			
 
				+	lockdep expression, and emits a lockdep splat if neither
			
 
				+	<tt>rcu_read_lock()</tt> nor the indicated protection
			
 
				+	is in place.
			
 
				+	In addition, <tt>rcu_dereference_raw()</tt> is used in those
			
 
				+	(hopefully rare) cases where the required protection cannot
			
 
				+	be easily described.
			
 
				+	Finally, <tt>rcu_read_lock_held()</tt> is provided to
			
 
				+	allow a function to verify that it has been invoked within
			
 
				+	an RCU read-side critical section.
			
 
				+	I was made aware of this set of requirements shortly after Thomas
			
 
				+	Gleixner audited a number of RCU uses.
			
 
				+<li>	A given function might wish to check for RCU-related preconditions
			
 
				+	upon entry, before using any other RCU API.
			
 
				+	The <tt>rcu_lockdep_assert()</tt> does this job,
			
 
				+	asserting the expression in kernels having lockdep enabled
			
 
				+	and doing nothing otherwise.
			
 
				+<li>	It is also easy to forget to use <tt>rcu_assign_pointer()</tt>
			
 
				+	and <tt>rcu_dereference()</tt>, perhaps (incorrectly)
			
 
				+	substituting a simple assignment.
			
 
				+	To catch this sort of error, a given RCU-protected pointer may be
			
 
				+	tagged with <tt>__rcu</tt>, after which running sparse
			
 
				+	with <tt>CONFIG_SPARSE_RCU_POINTER=y</tt> will complain
			
 
				+	about simple-assignment accesses to that pointer.
			
 
				+	Arnd Bergmann made me aware of this requirement, and also
			
 
				+	supplied the needed
			
 
				+	<a href="https://lwn.net/Articles/376011/">patch series</a>.
			
 
				+<li>	Kernels built with <tt>CONFIG_DEBUG_OBJECTS_RCU_HEAD=y</tt>
			
 
				+	will splat if a data element is passed to <tt>call_rcu()</tt>
			
 
				+	twice in a row, without a grace period in between.
			
 
				+	(This error is similar to a double free.)
			
 
				+	The corresponding <tt>rcu_head</tt> structures that are
			
 
				+	dynamically allocated are automatically tracked, but
			
 
				+	<tt>rcu_head</tt> structures allocated on the stack
			
 
				+	must be initialized with <tt>init_rcu_head_on_stack()</tt>
			
 
				+	and cleaned up with <tt>destroy_rcu_head_on_stack()</tt>.
			
 
				+	Similarly, statically allocated non-stack <tt>rcu_head</tt>
			
 
				+	structures must be initialized with <tt>init_rcu_head()</tt>
			
 
				+	and cleaned up with <tt>destroy_rcu_head()</tt>.
			
 
				+	Mathieu Desnoyers made me aware of this requirement, and also
			
 
				+	supplied the needed
			
 
				+	<a href="https://lkml.kernel.org/g/20100319013024.GA28456@Krystal">patch</a>.
			
 
				+<li>	An infinite loop in an RCU read-side critical section will
			
 
				+	eventually trigger an RCU CPU stall warning splat, with
			
 
				+	the duration of &ldquo;eventually&rdquo; being controlled by the
			
 
				+	<tt>RCU_CPU_STALL_TIMEOUT</tt> <tt>Kconfig</tt> option, or,
			
 
				+	alternatively, by the
			
 
				+	<tt>rcupdate.rcu_cpu_stall_timeout</tt> boot/sysfs
			
 
				+	parameter.
			
 
				+	However, RCU is not obligated to produce this splat
			
 
				+	unless there is a grace period waiting on that particular
			
 
				+	RCU read-side critical section.
			
 
				+	<p>
			
 
				+	Some extreme workloads might intentionally delay
			
 
				+	RCU grace periods, and systems running those workloads can
			
 
				+	be booted with <tt>rcupdate.rcu_cpu_stall_suppress</tt>
			
 
				+	to suppress the splats.
			
 
				+	This kernel parameter may also be set via <tt>sysfs</tt>.
			
 
				+	Furthermore, RCU CPU stall warnings are counter-productive
			
 
				+	during sysrq dumps and during panics.
			
 
				+	RCU therefore supplies the <tt>rcu_sysrq_start()</tt> and
			
 
				+	<tt>rcu_sysrq_end()</tt> API members to be called before
			
 
				+	and after long sysrq dumps.
			
 
				+	RCU also supplies the <tt>rcu_panic()</tt> notifier that is
			
 
				+	automatically invoked at the beginning of a panic to suppress
			
 
				+	further RCU CPU stall warnings.
			
 
				+
			
 
				+	<p>
			
 
				+	This requirement made itself known in the early 1990s, pretty
			
 
				+	much the first time that it was necessary to debug a CPU stall.
			
 
				+	That said, the initial implementation in DYNIX/ptx was quite
			
 
				+	generic in comparison with that of Linux.
			
 
				+<li>	Although it would be very good to detect pointers leaking out
			
 
				+	of RCU read-side critical sections, there is currently no
			
 
				+	good way of doing this.
			
 
				+	One complication is the need to distinguish between pointers
			
 
				+	leaking and pointers that have been handed off from RCU to
			
 
				+	some other synchronization mechanism, for example, reference
			
 
				+	counting.
			
 
				+<li>	In kernels built with <tt>CONFIG_RCU_TRACE=y</tt>, RCU-related
			
 
				+	information is provided via both debugfs and event tracing.
			
 
				+<li>	Open-coded use of <tt>rcu_assign_pointer()</tt> and
			
 
				+	<tt>rcu_dereference()</tt> to create typical linked
			
 
				+	data structures can be surprisingly error-prone.
			
 
				+	Therefore, RCU-protected
			
 
				+	<a href="https://lwn.net/Articles/609973/#RCU List APIs">linked lists</a>
			
 
				+	and, more recently, RCU-protected
			
 
				+	<a href="https://lwn.net/Articles/612100/">hash tables</a>
			
 
				+	are available.
			
 
				+	Many other special-purpose RCU-protected data structures are
			
 
				+	available in the Linux kernel and the userspace RCU library.
			
 
				+<li>	Some linked structures are created at compile time, but still
			
 
				+	require <tt>__rcu</tt> checking.
			
 
				+	The <tt>RCU_POINTER_INITIALIZER()</tt> macro serves this
			
 
				+	purpose.
			
 
				+<li>	It is not necessary to use <tt>rcu_assign_pointer()</tt>
			
 
				+	when creating linked structures that are to be published via
			
 
				+	a single external pointer.
			
 
				+	The <tt>RCU_INIT_POINTER()</tt> macro is provided for
			
 
				+	this task and also for assigning <tt>NULL</tt> pointers
			
 
				+	at runtime.
			
 
				+</ol>
			
 
				+
			
 
				+<p>
			
 
				+This is not a hard-and-fast list:  RCU's diagnostic capabilities will
			
 
				+continue to be guided by the number and type of usage bugs found
			
 
				+in real-world RCU usage.
			
 
				+
			
 
				+<h2><a name="Linux Kernel Complications">Linux Kernel Complications</a></h2>
			
 
				+
			
 
				+<p>
			
 
				+The Linux kernel provides an interesting environment for all kinds of
			
 
				+software, including RCU.
			
 
				+Some of the relevant points of interest are as follows:
			
 
				+
			
 
				+<ol>
			
 
				+<li>	<a href="#Configuration">Configuration</a>.
			
 
				+<li>	<a href="#Firmware Interface">Firmware Interface</a>.
			
 
				+<li>	<a href="#Early Boot">Early Boot</a>.
			
 
				+<li>	<a href="#Interrupts and NMIs">
			
 
				+	Interrupts and non-maskable interrupts (NMIs)</a>.
			
 
				+<li>	<a href="#Loadable Modules">Loadable Modules</a>.
			
 
				+<li>	<a href="#Hotplug CPU">Hotplug CPU</a>.
			
 
				+<li>	<a href="#Scheduler and RCU">Scheduler and RCU</a>.
			
 
				+<li>	<a href="#Tracing and RCU">Tracing and RCU</a>.
			
 
				+<li>	<a href="#Energy Efficiency">Energy Efficiency</a>.
			
 
				+<li>	<a href="#Memory Efficiency">Memory Efficiency</a>.
			
 
				+<li>	<a href="#Performance, Scalability, Response Time, and Reliability">
			
 
				+	Performance, Scalability, Response Time, and Reliability</a>.
			
 
				+</ol>
			
 
				+
			
 
				+<p>
			
 
				+This list is probably incomplete, but it does give a feel for the
			
 
				+most notable Linux-kernel complications.
			
 
				+Each of the following sections covers one of the above topics.
			
 
				+
			
 
				+<h3><a name="Configuration">Configuration</a></h3>
			
 
				+
			
 
				+<p>
			
 
				+RCU's goal is automatic configuration, so that almost nobody
			
 
				+needs to worry about RCU's <tt>Kconfig</tt> options.
			
 
				+And for almost all users, RCU does in fact work well
			
 
				+&ldquo;out of the box.&rdquo;
			
 
				+
			
 
				+<p>
			
 
				+However, there are specialized use cases that are handled by
			
 
				+kernel boot parameters and <tt>Kconfig</tt> options.
			
 
				+Unfortunately, the <tt>Kconfig</tt> system will explicitly ask users
			
 
				+about new <tt>Kconfig</tt> options, which requires almost all of them
			
 
				+be hidden behind a <tt>CONFIG_RCU_EXPERT</tt> <tt>Kconfig</tt> option.
			
 
				+
			
 
				+<p>
			
 
				+This all should be quite obvious, but the fact remains that
			
 
				+Linus Torvalds recently had to
			
 
				+<a href="https://lkml.kernel.org/g/CA+55aFy4wcCwaL4okTs8wXhGZ5h-ibecy_Meg9C4MNQrUnwMcg@mail.gmail.com">remind</a>
			
 
				+me of this requirement.
			
 
				+
			
 
				+<h3><a name="Firmware Interface">Firmware Interface</a></h3>
			
 
				+
			
 
				+<p>
			
 
				+In many cases, the kernel obtains information about the system from the
			
 
				+firmware, and sometimes things are lost in translation.
			
 
				+Or the translation is accurate, but the original message is bogus.
			
 
				+
			
 
				+<p>
			
 
				+For example, some systems' firmware overreports the number of CPUs,
			
 
				+sometimes by a large factor.
			
 
				+If RCU naively believed the firmware, as it used to do,
			
 
				+it would create too many per-CPU kthreads.
			
 
				+Although the resulting system will still run correctly, the extra
			
 
				+kthreads needlessly consume memory and can cause confusion
			
 
				+when they show up in <tt>ps</tt> listings.
			
 
				+
			
 
				+<p>
			
 
				+RCU must therefore wait for a given CPU to actually come online before
			
 
				+it can allow itself to believe that the CPU actually exists.
			
 
				+The resulting &ldquo;ghost CPUs&rdquo; (which are never going to
			
 
				+come online) cause a number of
			
 
				+<a href="https://paulmck.livejournal.com/37494.html">interesting complications</a>.
			
 
				+
			
 
				+<h3><a name="Early Boot">Early Boot</a></h3>
			
 
				+
			
 
				+<p>
			
 
				+The Linux kernel's boot sequence is an interesting process,
			
 
				+and RCU is used early, even before <tt>rcu_init()</tt>
			
 
				+is invoked.
			
 
				+In fact, a number of RCU's primitives can be used as soon as the
			
 
				+initial task's <tt>task_struct</tt> is available and the
			
 
				+boot CPU's per-CPU variables are set up.
			
 
				+The read-side primitives (<tt>rcu_read_lock()</tt>,
			
 
				+<tt>rcu_read_unlock()</tt>, <tt>rcu_dereference()</tt>,
			
 
				+and <tt>rcu_access_pointer()</tt>) will operate normally very early on,
			
 
				+as will <tt>rcu_assign_pointer()</tt>.
			
 
				+
			
 
				+<p>
			
 
				+Although <tt>call_rcu()</tt> may be invoked at any
			
 
				+time during boot, callbacks are not guaranteed to be invoked until after
			
 
				+the scheduler is fully up and running.
			
 
				+This delay in callback invocation is due to the fact that RCU does not
			
 
				+invoke callbacks until it is fully initialized, and this full initialization
			
 
				+cannot occur until after the scheduler has initialized itself to the
			
 
				+point where RCU can spawn and run its kthreads.
			
 
				+In theory, it would be possible to invoke callbacks earlier,
			
 
				+however, this is not a panacea because there would be severe restrictions
			
 
				+on what operations those callbacks could invoke.
			
 
				+
			
 
				+<p>
			
 
				+Perhaps surprisingly, <tt>synchronize_rcu()</tt>,
			
 
				+<a href="#Bottom-Half Flavor"><tt>synchronize_rcu_bh()</tt></a>
			
 
				+(<a href="#Bottom-Half Flavor">discussed below</a>),
			
 
				+and
			
 
				+<a href="#Sched Flavor"><tt>synchronize_sched()</tt></a>
			
 
				+will all operate normally
			
 
				+during very early boot, the reason being that there is only one CPU
			
 
				+and preemption is disabled.
			
 
				+This means that the call to <tt>synchronize_rcu()</tt> (or friends)
			
 
				+itself is a quiescent
			
 
				+state and thus a grace period, so the early-boot implementation can
			
 
				+be a no-op.
			
 
				+
			
 
				+<p>
			
 
				+Both <tt>synchronize_rcu_bh()</tt> and <tt>synchronize_sched()</tt>
			
 
				+continue to operate normally through the remainder of boot, courtesy
			
 
				+of the fact that preemption is disabled across their RCU read-side
			
 
				+critical sections and also courtesy of the fact that there is still
			
 
				+only one CPU.
			
 
				+However, once the scheduler starts initializing, preemption is enabled.
			
 
				+There is still only a single CPU, but the fact that preemption is enabled
			
 
				+means that the no-op implementation of <tt>synchronize_rcu()</tt> no
			
 
				+longer works in <tt>CONFIG_PREEMPT=y</tt> kernels.
			
 
				+Therefore, as soon as the scheduler starts initializing, the early-boot
			
 
				+fastpath is disabled.
			
 
				+This means that <tt>synchronize_rcu()</tt> switches to its runtime
			
 
				+mode of operation where it posts callbacks, which in turn means that
			
 
				+any call to <tt>synchronize_rcu()</tt> will block until the corresponding
			
 
				+callback is invoked.
			
 
				+Unfortunately, the callback cannot be invoked until RCU's runtime
			
 
				+grace-period machinery is up and running, which cannot happen until
			
 
				+the scheduler has initialized itself sufficiently to allow RCU's
			
 
				+kthreads to be spawned.
			
 
				+Therefore, invoking <tt>synchronize_rcu()</tt> during scheduler
			
 
				+initialization can result in deadlock.
			
 
				+
			
 
				+<p>@@QQ@@
			
 
				+So what happens with <tt>synchronize_rcu()</tt> during
			
 
				+scheduler initialization for <tt>CONFIG_PREEMPT=n</tt>
			
 
				+kernels?
			
 
				+<p>@@QQA@@
			
 
				+In <tt>CONFIG_PREEMPT=n</tt> kernels, <tt>synchronize_rcu()</tt>
			
 
				+maps directly to <tt>synchronize_sched()</tt>.
			
 
				+Therefore, <tt>synchronize_rcu()</tt> works normally throughout
			
 
				+boot in <tt>CONFIG_PREEMPT=n</tt> kernels.
			
 
				+However, your code must also work in <tt>CONFIG_PREEMPT=y</tt> kernels,
			
 
				+so it is still necessary to avoid invoking <tt>synchronize_rcu()</tt>
			
 
				+during scheduler initialization.
			
 
				+<p>@@QQE@@
			
 
				+
			
 
				+<p>
			
 
				+I learned of these boot-time requirements as a result of a series of
			
 
				+system hangs.
			
 
				+
			
 
				+<h3><a name="Interrupts and NMIs">Interrupts and NMIs</a></h3>
			
 
				+
			
 
				+<p>
			
 
				+The Linux kernel has interrupts, and RCU read-side critical sections are
			
 
				+legal within interrupt handlers and within interrupt-disabled regions
			
 
				+of code, as are invocations of <tt>call_rcu()</tt>.
			
 
				+
			
 
				+<p>
			
 
				+Some Linux-kernel architectures can enter an interrupt handler from
			
 
				+non-idle process context, and then just never leave it, instead stealthily
			
 
				+transitioning back to process context.
			
 
				+This trick is sometimes used to invoke system calls from inside the kernel.
			
 
				+These &ldquo;half-interrupts&rdquo; mean that RCU has to be very careful
			
 
				+about how it counts interrupt nesting levels.
			
 
				+I learned of this requirement the hard way during a rewrite
			
 
				+of RCU's dyntick-idle code.
			
 
				+
			
 
				+<p>
			
 
				+The Linux kernel has non-maskable interrupts (NMIs), and
			
 
				+RCU read-side critical sections are legal within NMI handlers.
			
 
				+Thankfully, RCU update-side primitives, including
			
 
				+<tt>call_rcu()</tt>, are prohibited within NMI handlers.
			
 
				+
			
 
				+<p>
			
 
				+The name notwithstanding, some Linux-kernel architectures
			
 
				+can have nested NMIs, which RCU must handle correctly.
			
 
				+Andy Lutomirski
			
 
				+<a href="https://lkml.kernel.org/g/CALCETrXLq1y7e_dKFPgou-FKHB6Pu-r8+t-6Ds+8=va7anBWDA@mail.gmail.com">surprised me</a>
			
 
				+with this requirement;
			
 
				+he also kindly surprised me with
			
 
				+<a href="https://lkml.kernel.org/g/CALCETrXSY9JpW3uE6H8WYk81sg56qasA2aqmjMPsq5dOtzso=g@mail.gmail.com">an algorithm</a>
			
 
				+that meets this requirement.
			
 
				+
			
 
				+<h3><a name="Loadable Modules">Loadable Modules</a></h3>
			
 
				+
			
 
				+<p>
			
 
				+The Linux kernel has loadable modules, and these modules can
			
 
				+also be unloaded.
			
 
				+After a given module has been unloaded, any attempt to call
			
 
				+one of its functions results in a segmentation fault.
			
 
				+The module-unload functions must therefore cancel any
			
 
				+delayed calls to loadable-module functions, for example,
			
 
				+any outstanding <tt>mod_timer()</tt> must be dealt with
			
 
				+via <tt>del_timer_sync()</tt> or similar.
			
 
				+
			
 
				+<p>
			
 
				+Unfortunately, there is no way to cancel an RCU callback;
			
 
				+once you invoke <tt>call_rcu()</tt>, the callback function is
			
 
				+going to eventually be invoked, unless the system goes down first.
			
 
				+Because it is normally considered socially irresponsible to crash the system
			
 
				+in response to a module unload request, we need some other way
			
 
				+to deal with in-flight RCU callbacks.
			
 
				+
			
 
				+<p>
			
 
				+RCU therefore provides
			
 
				+<tt><a href="https://lwn.net/Articles/217484/">rcu_barrier()</a></tt>,
			
 
				+which waits until all in-flight RCU callbacks have been invoked.
			
 
				+If a module uses <tt>call_rcu()</tt>, its exit function should therefore
			
 
				+prevent any future invocation of <tt>call_rcu()</tt>, then invoke
			
 
				+<tt>rcu_barrier()</tt>.
			
 
				+In theory, the underlying module-unload code could invoke
			
 
				+<tt>rcu_barrier()</tt> unconditionally, but in practice this would
			
 
				+incur unacceptable latencies.
			
 
				+
			
 
				+<p>
			
 
				+Nikita Danilov noted this requirement for an analogous filesystem-unmount
			
 
				+situation, and Dipankar Sarma incorporated <tt>rcu_barrier()</tt> into RCU.
			
 
				+The need for <tt>rcu_barrier()</tt> for module unloading became
			
 
				+apparent later.
			
 
				+
			
 
				+<h3><a name="Hotplug CPU">Hotplug CPU</a></h3>
			
 
				+
			
 
				+<p>
			
 
				+The Linux kernel supports CPU hotplug, which means that CPUs
			
 
				+can come and go.
			
 
				+It is of course illegal to use any RCU API member from an offline CPU.
			
 
				+This requirement was present from day one in DYNIX/ptx, but
			
 
				+on the other hand, the Linux kernel's CPU-hotplug implementation
			
 
				+is &ldquo;interesting.&rdquo;
			
 
				+
			
 
				+<p>
			
 
				+The Linux-kernel CPU-hotplug implementation has notifiers that
			
 
				+are used to allow the various kernel subsystems (including RCU)
			
 
				+to respond appropriately to a given CPU-hotplug operation.
			
 
				+Most RCU operations may be invoked from CPU-hotplug notifiers,
			
 
				+including even normal synchronous grace-period operations
			
 
				+such as <tt>synchronize_rcu()</tt>.
			
 
				+However, expedited grace-period operations such as
			
 
				+<tt>synchronize_rcu_expedited()</tt> are not supported,
			
 
				+due to the fact that current implementations block CPU-hotplug
			
 
				+operations, which could result in deadlock.
			
 
				+
			
 
				+<p>
			
 
				+In addition, all-callback-wait operations such as
			
 
				+<tt>rcu_barrier()</tt> are also not supported, due to the
			
 
				+fact that there are phases of CPU-hotplug operations where
			
 
				+the outgoing CPU's callbacks will not be invoked until after
			
 
				+the CPU-hotplug operation ends, which could also result in deadlock.
			
 
				+
			
 
				+<h3><a name="Scheduler and RCU">Scheduler and RCU</a></h3>
			
 
				+
			
 
				+<p>
			
 
				+RCU depends on the scheduler, and the scheduler uses RCU to
			
 
				+protect some of its data structures.
			
 
				+This means the scheduler is forbidden from acquiring
			
 
				+the runqueue locks and the priority-inheritance locks
			
 
				+in the middle of an outermost RCU read-side critical section unless either
			
 
				+(1)&nbsp;it releases them before exiting that same
			
 
				+RCU read-side critical section, or
			
 
				+(2)&nbsp;interrupts are disabled across
			
 
				+that entire RCU read-side critical section.
			
 
				+This same prohibition also applies (recursively!) to any lock that is acquired
			
 
				+while holding any lock to which this prohibition applies.
			
 
				+Adhering to this rule prevents preemptible RCU from invoking
			
 
				+<tt>rcu_read_unlock_special()</tt> while either runqueue or
			
 
				+priority-inheritance locks are held, thus avoiding deadlock.
			
 
				+
			
 
				+<p>
			
 
				+Prior to v4.4, it was only necessary to disable preemption across
			
 
				+RCU read-side critical sections that acquired scheduler locks.
			
 
				+In v4.4, expedited grace periods started using IPIs, and these
			
 
				+IPIs could force a <tt>rcu_read_unlock()</tt> to take the slowpath.
			
 
				+Therefore, this expedited-grace-period change required disabling of
			
 
				+interrupts, not just preemption.
			
 
				+
			
 
				+<p>
			
 
				+For RCU's part, the preemptible-RCU <tt>rcu_read_unlock()</tt>
			
 
				+implementation must be written carefully to avoid similar deadlocks.
			
 
				+In particular, <tt>rcu_read_unlock()</tt> must tolerate an
			
 
				+interrupt where the interrupt handler invokes both
			
 
				+<tt>rcu_read_lock()</tt> and <tt>rcu_read_unlock()</tt>.
			
 
				+This possibility requires <tt>rcu_read_unlock()</tt> to use
			
 
				+negative nesting levels to avoid destructive recursion via
			
 
				+an interrupt handler's use of RCU.
			
 
				+
			
 
				+<p>
			
 
				+This pair of mutual scheduler-RCU requirements came as a
			
 
				+<a href="https://lwn.net/Articles/453002/">complete surprise</a>.
			
 
				+
			
 
				+<p>
			
 
				+As noted above, RCU makes use of kthreads, and it is necessary to
			
 
				+avoid excessive CPU-time accumulation by these kthreads.
			
 
				+This requirement was no surprise, but RCU's violation of it
			
 
				+when running context-switch-heavy workloads when built with
			
 
				+<tt>CONFIG_NO_HZ_FULL=y</tt>
			
 
				+<a href="http://www.rdrop.com/users/paulmck/scalability/paper/BareMetal.2015.01.15b.pdf">did come as a surprise [PDF]</a>.
			
 
				+RCU has made good progress towards meeting this requirement, even
			
 
				+for context-switch-heavy <tt>CONFIG_NO_HZ_FULL=y</tt> workloads,
			
 
				+but there is room for further improvement.
			
 
				+
			
 
				+<h3><a name="Tracing and RCU">Tracing and RCU</a></h3>
			
 
				+
			
 
				+<p>
			
 
				+It is possible to use tracing on RCU code, but tracing itself
			
 
				+uses RCU.
			
 
				+For this reason, <tt>rcu_dereference_raw_notrace()</tt>
			
 
				+is provided for use by tracing, which avoids the destructive
			
 
				+recursion that could otherwise ensue.
			
 
				+This API is also used by virtualization in some architectures,
			
 
				+where RCU readers execute in environments in which tracing
			
 
				+cannot be used.
			
 
				+The tracing folks both located the requirement and provided the
			
 
				+needed fix, so this surprise requirement was relatively painless.
			
 
				+
			
 
				+<h3><a name="Energy Efficiency">Energy Efficiency</a></h3>
			
 
				+
			
 
				+<p>
			
 
				+Interrupting idle CPUs is considered socially unacceptable,
			
 
				+especially by people with battery-powered embedded systems.
			
 
				+RCU therefore conserves energy by detecting which CPUs are
			
 
				+idle, including tracking CPUs that have been interrupted from idle.
			
 
				+This is a large part of the energy-efficiency requirement,
			
 
				+so I learned of this via an irate phone call.
			
 
				+
			
 
				+<p>
			
 
				+Because RCU avoids interrupting idle CPUs, it is illegal to
			
 
				+execute an RCU read-side critical section on an idle CPU.
			
 
				+(Kernels built with <tt>CONFIG_PROVE_RCU=y</tt> will splat
			
 
				+if you try it.)
			
 
				+The <tt>RCU_NONIDLE()</tt> macro and <tt>_rcuidle</tt>
			
 
				+event tracing are provided to work around this restriction.
			
 
				+In addition, <tt>rcu_is_watching()</tt> may be used to
			
 
				+test whether or not it is currently legal to run RCU read-side
			
 
				+critical sections on this CPU.
			
 
				+I learned of the need for diagnostics on the one hand
			
 
				+and <tt>RCU_NONIDLE()</tt> on the other while inspecting
			
 
				+idle-loop code.
			
 
				+Steven Rostedt supplied <tt>_rcuidle</tt> event tracing,
			
 
				+which is used quite heavily in the idle loop.
			
 
				+
			
 
				+<p>
			
 
				+It is similarly socially unacceptable to interrupt an
			
 
				+<tt>nohz_full</tt> CPU running in userspace.
			
 
				+RCU must therefore track <tt>nohz_full</tt> userspace
			
 
				+execution.
			
 
				+And in
			
 
				+<a href="https://lwn.net/Articles/558284/"><tt>CONFIG_NO_HZ_FULL_SYSIDLE=y</tt></a>
			
 
				+kernels, RCU must separately track idle CPUs on the one hand and
			
 
				+CPUs that are either idle or executing in userspace on the other.
			
 
				+In both cases, RCU must be able to sample state at two points in
			
 
				+time, and be able to determine whether or not some other CPU spent
			
 
				+any time idle and/or executing in userspace.
			
 
				+
			
 
				+<p>
			
 
				+These energy-efficiency requirements have proven quite difficult to
			
 
				+understand and to meet, for example, there have been more than five
			
 
				+clean-sheet rewrites of RCU's energy-efficiency code, the last of
			
 
				+which was finally able to demonstrate
			
 
				+<a href="http://www.rdrop.com/users/paulmck/realtime/paper/AMPenergy.2013.04.19a.pdf">real energy savings running on real hardware [PDF]</a>.
			
 
				+As noted earlier,
			
 
				+I learned of many of these requirements via angry phone calls:
			
 
				+Flaming me on the Linux-kernel mailing list was apparently not
			
 
				+sufficient to fully vent their ire at RCU's energy-efficiency bugs!
			
 
				+
			
 
				+<h3><a name="Memory Efficiency">Memory Efficiency</a></h3>
			
 
				+
			
 
				+<p>
			
 
				+Although small-memory non-realtime systems can simply use Tiny RCU,
			
 
				+code size is only one aspect of memory efficiency.
			
 
				+Another aspect is the size of the <tt>rcu_head</tt> structure
			
 
				+used by <tt>call_rcu()</tt> and <tt>kfree_rcu()</tt>.
			
 
				+Although this structure contains nothing more than a pair of pointers,
			
 
				+it does appear in many RCU-protected data structures, including
			
 
				+some that are size critical.
			
 
				+The <tt>page</tt> structure is a case in point, as evidenced by
			
 
				+the many occurrences of the <tt>union</tt> keyword within that structure.
			
 
				+
			
 
				+<p>
			
 
				+This need for memory efficiency is one reason that RCU uses hand-crafted
			
 
				+singly linked lists to track the <tt>rcu_head</tt> structures that
			
 
				+are waiting for a grace period to elapse.
			
 
				+It is also the reason why <tt>rcu_head</tt> structures do not contain
			
 
				+debug information, such as fields tracking the file and line of the
			
 
				+<tt>call_rcu()</tt> or <tt>kfree_rcu()</tt> that posted them.
			
 
				+Although this information might appear in debug-only kernel builds at some
			
 
				+point, in the meantime, the <tt>-&gt;func</tt> field will often provide
			
 
				+the needed debug information.
			
 
				+
			
 
				+<p>
			
 
				+However, in some cases, the need for memory efficiency leads to even
			
 
				+more extreme measures.
			
 
				+Returning to the <tt>page</tt> structure, the <tt>rcu_head</tt> field
			
 
				+shares storage with a great many other structures that are used at
			
 
				+various points in the corresponding page's lifetime.
			
 
				+In order to correctly resolve certain
			
 
				+<a href="https://lkml.kernel.org/g/1439976106-137226-1-git-send-email-kirill.shutemov@linux.intel.com">race conditions</a>,
			
 
				+the Linux kernel's memory-management subsystem needs a particular bit
			
 
				+to remain zero during all phases of grace-period processing,
			
 
				+and that bit happens to map to the bottom bit of the
			
 
				+<tt>rcu_head</tt> structure's <tt>-&gt;next</tt> field.
			
 
				+RCU makes this guarantee as long as <tt>call_rcu()</tt>
			
 
				+is used to post the callback, as opposed to <tt>kfree_rcu()</tt>
			
 
				+or some future &ldquo;lazy&rdquo;
			
 
				+variant of <tt>call_rcu()</tt> that might one day be created for
			
 
				+energy-efficiency purposes.
			
 
				+
			
 
				+<h3><a name="Performance, Scalability, Response Time, and Reliability">
			
 
				+Performance, Scalability, Response Time, and Reliability</a></h3>
			
 
				+
			
 
				+<p>
			
 
				+Expanding on the
			
 
				+<a href="#Performance and Scalability">earlier discussion</a>,
			
 
				+RCU is used heavily by hot code paths in performance-critical
			
 
				+portions of the Linux kernel's networking, security, virtualization,
			
 
				+and scheduling code paths.
			
 
				+RCU must therefore use efficient implementations, especially in its
			
 
				+read-side primitives.
			
 
				+To that end, it would be good if preemptible RCU's implementation
			
 
				+of <tt>rcu_read_lock()</tt> could be inlined, however, doing
			
 
				+this requires resolving <tt>#include</tt> issues with the
			
 
				+<tt>task_struct</tt> structure.
			
 
				+
			
 
				+<p>
			
 
				+The Linux kernel supports hardware configurations with up to
			
 
				+4096 CPUs, which means that RCU must be extremely scalable.
			
 
				+Algorithms that involve frequent acquisitions of global locks or
			
 
				+frequent atomic operations on global variables simply cannot be
			
 
				+tolerated within the RCU implementation.
			
 
				+RCU therefore makes heavy use of a combining tree based on the
			
 
				+<tt>rcu_node</tt> structure.
			
 
				+RCU is required to tolerate all CPUs continuously invoking any
			
 
				+combination of RCU's runtime primitives with minimal per-operation
			
 
				+overhead.
			
 
				+In fact, in many cases, increasing load must <i>decrease</i> the
			
 
				+per-operation overhead, witness the batching optimizations for
			
 
				+<tt>synchronize_rcu()</tt>, <tt>call_rcu()</tt>,
			
 
				+<tt>synchronize_rcu_expedited()</tt>, and <tt>rcu_barrier()</tt>.
			
 
				+As a general rule, RCU must cheerfully accept whatever the
			
 
				+rest of the Linux kernel decides to throw at it.
			
 
				+
			
 
				+<p>
			
 
				+The Linux kernel is used for real-time workloads, especially
			
 
				+in conjunction with the
			
 
				+<a href="https://rt.wiki.kernel.org/index.php/Main_Page">-rt patchset</a>.
			
 
				+The real-time-latency response requirements are such that the
			
 
				+traditional approach of disabling preemption across RCU
			
 
				+read-side critical sections is inappropriate.
			
 
				+Kernels built with <tt>CONFIG_PREEMPT=y</tt> therefore
			
 
				+use an RCU implementation that allows RCU read-side critical
			
 
				+sections to be preempted.
			
 
				+This requirement made its presence known after users made it
			
 
				+clear that an earlier
			
 
				+<a href="https://lwn.net/Articles/107930/">real-time patch</a>
			
 
				+did not meet their needs, in conjunction with some
			
 
				+<a href="https://lkml.kernel.org/g/20050318002026.GA2693@us.ibm.com">RCU issues</a>
			
 
				+encountered by a very early version of the -rt patchset.
			
 
				+
			
 
				+<p>
			
 
				+In addition, RCU must make do with a sub-100-microsecond real-time latency
			
 
				+budget.
			
 
				+In fact, on smaller systems with the -rt patchset, the Linux kernel
			
 
				+provides sub-20-microsecond real-time latencies for the whole kernel,
			
 
				+including RCU.
			
 
				+RCU's scalability and latency must therefore be sufficient for
			
 
				+these sorts of configurations.
			
 
				+To my surprise, the sub-100-microsecond real-time latency budget
			
 
				+<a href="http://www.rdrop.com/users/paulmck/realtime/paper/bigrt.2013.01.31a.LCA.pdf">
			
 
				+applies to even the largest systems [PDF]</a>,
			
 
				+up to and including systems with 4096 CPUs.
			
 
				+This real-time requirement motivated the grace-period kthread, which
			
 
				+also simplified handling of a number of race conditions.
			
 
				+
			
 
				+<p>
			
 
				+Finally, RCU's status as a synchronization primitive means that
			
 
				+any RCU failure can result in arbitrary memory corruption that can be
			
 
				+extremely difficult to debug.
			
 
				+This means that RCU must be extremely reliable, which in
			
 
				+practice also means that RCU must have an aggressive stress-test
			
 
				+suite.
			
 
				+This stress-test suite is called <tt>rcutorture</tt>.
			
 
				+
			
 
				+<p>
			
 
				+Although the need for <tt>rcutorture</tt> was no surprise,
			
 
				+the current immense popularity of the Linux kernel is posing
			
 
				+interesting&mdash;and perhaps unprecedented&mdash;validation
			
 
				+challenges.
			
 
				+To see this, keep in mind that there are well over one billion
			
 
				+instances of the Linux kernel running today, given Android
			
 
				+smartphones, Linux-powered televisions, and servers.
			
 
				+This number can be expected to increase sharply with the advent of
			
 
				+the celebrated Internet of Things.
			
 
				+
			
 
				+<p>
			
 
				+Suppose that RCU contains a race condition that manifests on average
			
 
				+once per million years of runtime.
			
 
				+This bug will be occurring about three times per <i>day</i> across
			
 
				+the installed base.
			
 
				+RCU could simply hide behind hardware error rates, given that no one
			
 
				+should really expect their smartphone to last for a million years.
			
 
				+However, anyone taking too much comfort from this thought should
			
 
				+consider the fact that in most jurisdictions, a successful multi-year
			
 
				+test of a given mechanism, which might include a Linux kernel,
			
 
				+suffices for a number of types of safety-critical certifications.
			
 
				+In fact, rumor has it that the Linux kernel is already being used
			
 
				+in production for safety-critical applications.
			
 
				+I don't know about you, but I would feel quite bad if a bug in RCU
			
 
				+killed someone.
			
 
				+Which might explain my recent focus on validation and verification.
			
 
				+
			
 
				+<h2><a name="Other RCU Flavors">Other RCU Flavors</a></h2>
			
 
				+
			
 
				+<p>
			
 
				+One of the more surprising things about RCU is that there are now
			
 
				+no fewer than five <i>flavors</i>, or API families.
			
 
				+In addition, the primary flavor that has been the sole focus up to
			
 
				+this point has two different implementations, non-preemptible and
			
 
				+preemptible.
			
 
				+The other four flavors are listed below, with requirements for each
			
 
				+described in a separate section.
			
 
				+
			
 
				+<ol>
			
 
				+<li>	<a href="#Bottom-Half Flavor">Bottom-Half Flavor</a>
			
 
				+<li>	<a href="#Sched Flavor">Sched Flavor</a>
			
 
				+<li>	<a href="#Sleepable RCU">Sleepable RCU</a>
			
 
				+<li>	<a href="#Tasks RCU">Tasks RCU</a>
			
 
				+</ol>
			
 
				+
			
 
				+<h3><a name="Bottom-Half Flavor">Bottom-Half Flavor</a></h3>
			
 
				+
			
 
				+<p>
			
 
				+The softirq-disable (AKA &ldquo;bottom-half&rdquo;,
			
 
				+hence the &ldquo;_bh&rdquo; abbreviations)
			
 
				+flavor of RCU, or <i>RCU-bh</i>, was developed by
			
 
				+Dipankar Sarma to provide a flavor of RCU that could withstand the
			
 
				+network-based denial-of-service attacks researched by Robert
			
 
				+Olsson.
			
 
				+These attacks placed so much networking load on the system
			
 
				+that some of the CPUs never exited softirq execution,
			
 
				+which in turn prevented those CPUs from ever executing a context switch,
			
 
				+which, in the RCU implementation of that time, prevented grace periods
			
 
				+from ever ending.
			
 
				+The result was an out-of-memory condition and a system hang.
			
 
				+
			
 
				+<p>
			
 
				+The solution was the creation of RCU-bh, which does
			
 
				+<tt>local_bh_disable()</tt>
			
 
				+across its read-side critical sections, and which uses the transition
			
 
				+from one type of softirq processing to another as a quiescent state
			
 
				+in addition to context switch, idle, user mode, and offline.
			
 
				+This means that RCU-bh grace periods can complete even when some of
			
 
				+the CPUs execute in softirq indefinitely, thus allowing algorithms
			
 
				+based on RCU-bh to withstand network-based denial-of-service attacks.
			
 
				+
			
 
				+<p>
			
 
				+Because
			
 
				+<tt>rcu_read_lock_bh()</tt> and <tt>rcu_read_unlock_bh()</tt>
			
 
				+disable and re-enable softirq handlers, any attempt to start a softirq
			
 
				+handler during the
			
 
				+RCU-bh read-side critical section will be deferred.
			
 
				+In this case, <tt>rcu_read_unlock_bh()</tt>
			
 
				+will invoke softirq processing, which can take considerable time.
			
 
				+One can of course argue that this softirq overhead should be associated
			
 
				+with the code following the RCU-bh read-side critical section rather
			
 
				+than <tt>rcu_read_unlock_bh()</tt>, but the fact
			
 
				+is that most profiling tools cannot be expected to make this sort
			
 
				+of fine distinction.
			
 
				+For example, suppose that a three-millisecond-long RCU-bh read-side
			
 
				+critical section executes during a time of heavy networking load.
			
 
				+There will very likely be an attempt to invoke at least one softirq
			
 
				+handler during that three milliseconds, but any such invocation will
			
 
				+be delayed until the time of the <tt>rcu_read_unlock_bh()</tt>.
			
 
				+This can of course make it appear at first glance as if
			
 
				+<tt>rcu_read_unlock_bh()</tt> was executing very slowly.
			
 
				+
			
 
				+<p>
			
 
				+The
			
 
				+<a href="https://lwn.net/Articles/609973/#RCU Per-Flavor API Table">RCU-bh API</a>
			
 
				+includes
			
 
				+<tt>rcu_read_lock_bh()</tt>,
			
 
				+<tt>rcu_read_unlock_bh()</tt>,
			
 
				+<tt>rcu_dereference_bh()</tt>,
			
 
				+<tt>rcu_dereference_bh_check()</tt>,
			
 
				+<tt>synchronize_rcu_bh()</tt>,
			
 
				+<tt>synchronize_rcu_bh_expedited()</tt>,
			
 
				+<tt>call_rcu_bh()</tt>,
			
 
				+<tt>rcu_barrier_bh()</tt>, and
			
 
				+<tt>rcu_read_lock_bh_held()</tt>.
			
 
				+
			
 
				+<h3><a name="Sched Flavor">Sched Flavor</a></h3>
			
 
				+
			
 
				+<p>
			
 
				+Before preemptible RCU, waiting for an RCU grace period had the
			
 
				+side effect of also waiting for all pre-existing interrupt
			
 
				+and NMI handlers.
			
 
				+However, there are legitimate preemptible-RCU implementations that
			
 
				+do not have this property, given that any point in the code outside
			
 
				+of an RCU read-side critical section can be a quiescent state.
			
 
				+Therefore, <i>RCU-sched</i> was created, which follows &ldquo;classic&rdquo;
			
 
				+RCU in that an RCU-sched grace period waits for pre-existing
			
 
				+interrupt and NMI handlers.
			
 
				+In kernels built with <tt>CONFIG_PREEMPT=n</tt>, the RCU and RCU-sched
			
 
				+APIs have identical implementations, while kernels built with
			
 
				+<tt>CONFIG_PREEMPT=y</tt> provide a separate implementation for each.
			
 
				+
			
 
				+<p>
			
 
				+Note well that in <tt>CONFIG_PREEMPT=y</tt> kernels,
			
 
				+<tt>rcu_read_lock_sched()</tt> and <tt>rcu_read_unlock_sched()</tt>
			
 
				+disable and re-enable preemption, respectively.
			
 
				+This means that if there was a preemption attempt during the
			
 
				+RCU-sched read-side critical section, <tt>rcu_read_unlock_sched()</tt>
			
 
				+will enter the scheduler, with all the latency and overhead entailed.
			
 
				+Just as with <tt>rcu_read_unlock_bh()</tt>, this can make it look
			
 
				+as if <tt>rcu_read_unlock_sched()</tt> was executing very slowly.
			
 
				+However, the highest-priority task won't be preempted, so that task
			
 
				+will enjoy low-overhead <tt>rcu_read_unlock_sched()</tt> invocations.
			
 
				+
			
 
				+<p>
			
 
				+The
			
 
				+<a href="https://lwn.net/Articles/609973/#RCU Per-Flavor API Table">RCU-sched API</a>
			
 
				+includes
			
 
				+<tt>rcu_read_lock_sched()</tt>,
			
 
				+<tt>rcu_read_unlock_sched()</tt>,
			
 
				+<tt>rcu_read_lock_sched_notrace()</tt>,
			
 
				+<tt>rcu_read_unlock_sched_notrace()</tt>,
			
 
				+<tt>rcu_dereference_sched()</tt>,
			
 
				+<tt>rcu_dereference_sched_check()</tt>,
			
 
				+<tt>synchronize_sched()</tt>,
			
 
				+<tt>synchronize_rcu_sched_expedited()</tt>,
			
 
				+<tt>call_rcu_sched()</tt>,
			
 
				+<tt>rcu_barrier_sched()</tt>, and
			
 
				+<tt>rcu_read_lock_sched_held()</tt>.
			
 
				+However, anything that disables preemption also marks an RCU-sched
			
 
				+read-side critical section, including
			
 
				+<tt>preempt_disable()</tt> and <tt>preempt_enable()</tt>,
			
 
				+<tt>local_irq_save()</tt> and <tt>local_irq_restore()</tt>,
			
 
				+and so on.
			
 
				+
			
 
				+<h3><a name="Sleepable RCU">Sleepable RCU</a></h3>
			
 
				+
			
 
				+<p>
			
 
				+For well over a decade, someone saying &ldquo;I need to block within
			
 
				+an RCU read-side critical section&rdquo; was a reliable indication
			
 
				+that this someone did not understand RCU.
			
 
				+After all, if you are always blocking in an RCU read-side critical
			
 
				+section, you can probably afford to use a higher-overhead synchronization
			
 
				+mechanism.
			
 
				+However, that changed with the advent of the Linux kernel's notifiers,
			
 
				+whose RCU read-side critical
			
 
				+sections almost never sleep, but sometimes need to.
			
 
				+This resulted in the introduction of
			
 
				+<a href="https://lwn.net/Articles/202847/">sleepable RCU</a>,
			
 
				+or <i>SRCU</i>.
			
 
				+
			
 
				+<p>
			
 
				+SRCU allows different domains to be defined, with each such domain
			
 
				+defined by an instance of an <tt>srcu_struct</tt> structure.
			
 
				+A pointer to this structure must be passed in to each SRCU function,
			
 
				+for example, <tt>synchronize_srcu(&amp;ss)</tt>, where
			
 
				+<tt>ss</tt> is the <tt>srcu_struct</tt> structure.
			
 
				+The key benefit of these domains is that a slow SRCU reader in one
			
 
				+domain does not delay an SRCU grace period in some other domain.
			
 
				+That said, one consequence of these domains is that read-side code
			
 
				+must pass a &ldquo;cookie&rdquo; from <tt>srcu_read_lock()</tt>
			
 
				+to <tt>srcu_read_unlock()</tt>, for example, as follows:
			
 
				+
			
 
				+<blockquote>
			
 
				+<pre>
			
 
				+ 1 int idx;
			
 
				+ 2
			
 
				+ 3 idx = srcu_read_lock(&amp;ss);
			
 
				+ 4 do_something();
			
 
				+ 5 srcu_read_unlock(&amp;ss, idx);
			
 
				+</pre>
			
 
				+</blockquote>
			
 
				+
			
 
				+<p>
			
 
				+As noted above, it is legal to block within SRCU read-side critical sections,
			
 
				+however, with great power comes great responsibility.
			
 
				+If you block forever in one of a given domain's SRCU read-side critical
			
 
				+sections, then that domain's grace periods will also be blocked forever.
			
 
				+Of course, one good way to block forever is to deadlock, which can
			
 
				+happen if any operation in a given domain's SRCU read-side critical
			
 
				+section can block waiting, either directly or indirectly, for that domain's
			
 
				+grace period to elapse.
			
 
				+For example, this results in a self-deadlock:
			
 
				+
			
 
				+<blockquote>
			
 
				+<pre>
			
 
				+ 1 int idx;
			
 
				+ 2
			
 
				+ 3 idx = srcu_read_lock(&amp;ss);
			
 
				+ 4 do_something();
			
 
				+ 5 synchronize_srcu(&amp;ss);
			
 
				+ 6 srcu_read_unlock(&amp;ss, idx);
			
 
				+</pre>
			
 
				+</blockquote>
			
 
				+
			
 
				+<p>
			
 
				+However, if line&nbsp;5 acquired a mutex that was held across
			
 
				+a <tt>synchronize_srcu()</tt> for domain <tt>ss</tt>,
			
 
				+deadlock would still be possible.
			
 
				+Furthermore, if line&nbsp;5 acquired a mutex that was held across
			
 
				+a <tt>synchronize_srcu()</tt> for some other domain <tt>ss1</tt>,
			
 
				+and if an <tt>ss1</tt>-domain SRCU read-side critical section
			
 
				+acquired another mutex that was held across an <tt>ss</tt>-domain
			
 
				+<tt>synchronize_srcu()</tt>,
			
 
				+deadlock would again be possible.
			
 
				+Such a deadlock cycle could extend across an arbitrarily large number
			
 
				+of different SRCU domains.
			
 
				+Again, with great power comes great responsibility.
			
 
				+
			
 
				+<p>
			
 
				+Unlike the other RCU flavors, SRCU read-side critical sections can
			
 
				+run on idle and even offline CPUs.
			
 
				+This ability requires that <tt>srcu_read_lock()</tt> and
			
 
				+<tt>srcu_read_unlock()</tt> contain memory barriers, which means
			
 
				+that SRCU readers will run a bit slower than would RCU readers.
			
 
				+It also motivates the <tt>smp_mb__after_srcu_read_unlock()</tt>
			
 
				+API, which, in combination with <tt>srcu_read_unlock()</tt>,
			
 
				+guarantees a full memory barrier.
			
 
				+
			
 
				+<p>
			
 
				+The
			
 
				+<a href="https://lwn.net/Articles/609973/#RCU Per-Flavor API Table">SRCU API</a>
			
 
				+includes
			
 
				+<tt>srcu_read_lock()</tt>,
			
 
				+<tt>srcu_read_unlock()</tt>,
			
 
				+<tt>srcu_dereference()</tt>,
			
 
				+<tt>srcu_dereference_check()</tt>,
			
 
				+<tt>synchronize_srcu()</tt>,
			
 
				+<tt>synchronize_srcu_expedited()</tt>,
			
 
				+<tt>call_srcu()</tt>,
			
 
				+<tt>srcu_barrier()</tt>, and
			
 
				+<tt>srcu_read_lock_held()</tt>.
			
 
				+It also includes
			
 
				+<tt>DEFINE_SRCU()</tt>,
			
 
				+<tt>DEFINE_STATIC_SRCU()</tt>, and
			
 
				+<tt>init_srcu_struct()</tt>
			
 
				+APIs for defining and initializing <tt>srcu_struct</tt> structures.
			
 
				+
			
 
				+<h3><a name="Tasks RCU">Tasks RCU</a></h3>
			
 
				+
			
 
				+<p>
			
 
				+Some forms of tracing use &ldquo;trampolines&rdquo; to handle the
			
 
				+binary rewriting required to install different types of probes.
			
 
				+It would be good to be able to free old trampolines, which sounds
			
 
				+like a job for some form of RCU.
			
 
				+However, because it is necessary to be able to install a trace
			
 
				+anywhere in the code, it is not possible to use read-side markers
			
 
				+such as <tt>rcu_read_lock()</tt> and <tt>rcu_read_unlock()</tt>.
			
 
				+In addition, it does not work to have these markers in the trampoline
			
 
				+itself, because there would need to be instructions following
			
 
				+<tt>rcu_read_unlock()</tt>.
			
 
				+Although <tt>synchronize_rcu()</tt> would guarantee that execution
			
 
				+reached the <tt>rcu_read_unlock()</tt>, it would not be able to
			
 
				+guarantee that execution had completely left the trampoline.
			
 
				+
			
 
				+<p>
			
 
				+The solution, in the form of
			
 
				+<a href="https://lwn.net/Articles/607117/"><i>Tasks RCU</i></a>,
			
 
				+is to have implicit
			
 
				+read-side critical sections that are delimited by voluntary context
			
 
				+switches, that is, calls to <tt>schedule()</tt>,
			
 
				+<tt>cond_resched_rcu_qs()</tt>, and
			
 
				+<tt>synchronize_rcu_tasks()</tt>.
			
 
				+In addition, transitions to and from userspace execution also delimit
			
 
				+tasks-RCU read-side critical sections.
			
 
				+
			
 
				+<p>
			
 
				+The tasks-RCU API is quite compact, consisting only of
			
 
				+<tt>call_rcu_tasks()</tt>,
			
 
				+<tt>synchronize_rcu_tasks()</tt>, and
			
 
				+<tt>rcu_barrier_tasks()</tt>.
			
 
				+
			
 
				+<h2><a name="Possible Future Changes">Possible Future Changes</a></h2>
			
 
				+
			
 
				+<p>
			
 
				+One of the tricks that RCU uses to attain update-side scalability is
			
 
				+to increase grace-period latency with increasing numbers of CPUs.
			
 
				+If this becomes a serious problem, it will be necessary to rework the
			
 
				+grace-period state machine so as to avoid the need for the additional
			
 
				+latency.
			
 
				+
			
 
				+<p>
			
 
				+Expedited grace periods scan the CPUs, so their latency and overhead
			
 
				+increase with increasing numbers of CPUs.
			
 
				+If this becomes a serious problem on large systems, it will be necessary
			
 
				+to do some redesign to avoid this scalability problem.
			
 
				+
			
 
				+<p>
			
 
				+RCU disables CPU hotplug in a few places, perhaps most notably in the
			
 
				+expedited grace-period and <tt>rcu_barrier()</tt> operations.
			
 
				+If there is a strong reason to use expedited grace periods in CPU-hotplug
			
 
				+notifiers, it will be necessary to avoid disabling CPU hotplug.
			
 
				+This would introduce some complexity, so there had better be a <i>very</i>
			
 
				+good reason.
			
 
				+
			
 
				+<p>
			
 
				+The tradeoff between grace-period latency on the one hand and interruptions
			
 
				+of other CPUs on the other hand may need to be re-examined.
			
 
				+The desire is of course for zero grace-period latency as well as zero
			
 
				+interprocessor interrupts undertaken during an expedited grace period
			
 
				+operation.
			
 
				+While this ideal is unlikely to be achievable, it is quite possible that
			
 
				+further improvements can be made.
			
 
				+
			
 
				+<p>
			
 
				+The multiprocessor implementations of RCU use a combining tree that
			
 
				+groups CPUs so as to reduce lock contention and increase cache locality.
			
 
				+However, this combining tree does not spread its memory across NUMA
			
 
				+nodes nor does it align the CPU groups with hardware features such
			
 
				+as sockets or cores.
			
 
				+Such spreading and alignment is currently believed to be unnecessary
			
 
				+because the hotpath read-side primitives do not access the combining
			
 
				+tree, nor does <tt>call_rcu()</tt> in the common case.
			
 
				+If you believe that your architecture needs such spreading and alignment,
			
 
				+then your architecture should also benefit from the
			
 
				+<tt>rcutree.rcu_fanout_leaf</tt> boot parameter, which can be set
			
 
				+to the number of CPUs in a socket, NUMA node, or whatever.
			
 
				+If the number of CPUs is too large, use a fraction of the number of
			
 
				+CPUs.
			
 
				+If the number of CPUs is a large prime number, well, that certainly
			
 
				+is an &ldquo;interesting&rdquo; architectural choice!
			
 
				+More flexible arrangements might be considered, but only if
			
 
				+<tt>rcutree.rcu_fanout_leaf</tt> has proven inadequate, and only
			
 
				+if the inadequacy has been demonstrated by a carefully run and
			
 
				+realistic system-level workload.
			
 
				+
			
 
				+<p>
			
 
				+Please note that arrangements that require RCU to remap CPU numbers will
			
 
				+require extremely good demonstration of need and full exploration of
			
 
				+alternatives.
			
 
				+
			
 
				+<p>
			
 
				+There is an embarrassingly large number of flavors of RCU, and this
			
 
				+number has been increasing over time.
			
 
				+Perhaps it will be possible to combine some at some future date.
			
 
				+
			
 
				+<p>
			
 
				+RCU's various kthreads are reasonably recent additions.
			
 
				+It is quite likely that adjustments will be required to more gracefully
			
 
				+handle extreme loads.
			
 
				+It might also be necessary to be able to relate CPU utilization by
			
 
				+RCU's kthreads and softirq handlers to the code that instigated this
			
 
				+CPU utilization.
			
 
				+For example, RCU callback overhead might be charged back to the
			
 
				+originating <tt>call_rcu()</tt> instance, though probably not
			
 
				+in production kernels.
			
 
				+
			
 
				+<h2><a name="Summary">Summary</a></h2>
			
 
				+
			
 
				+<p>
			
 
				+This document has presented more than two decades' worth of RCU
			
 
				+requirements.
			
 
				+Given that the requirements keep changing, this will not be the last
			
 
				+word on this subject, but at least it serves to get an important
			
 
				+subset of the requirements set forth.
			
 
				+
			
 
				+<h2><a name="Acknowledgments">Acknowledgments</a></h2>
			
 
				+
			
 
				+I am grateful to Steven Rostedt, Lai Jiangshan, Ingo Molnar,
			
 
				+Oleg Nesterov, Borislav Petkov, Peter Zijlstra, Boqun Feng, and
			
 
				+Andy Lutomirski for their help in rendering
			
 
				+this article human readable, and to Michelle Rankin for her support
			
 
				+of this effort.
			
 
				+Other contributions are acknowledged in the Linux kernel's git archive.
			
 
				+The cartoon is copyright (c) 2013 by Melissa Broussard,
			
 
				+and is provided
			
 
				+under the terms of the Creative Commons Attribution-Share Alike 3.0
			
 
				+United States license.
			
 
				+
			
 
				+<p>@@QQAL@@
			
 
				+
			
 
				+</body></html>
			
--- a/Documentation/RCU/Design/htmlqqz.sh
+++ b/Documentation/RCU/Design/htmlqqz.sh
@@ -0,0 +1,108 @@
 
				+#!/bin/sh
			
 
				+#
			
 
				+# Usage: sh htmlqqz.sh file
			
 
				+#
			
 
				+# Extracts and converts quick quizzes in a proto-HTML document file.htmlx.
			
 
				+# Commands, all of which must be on a line by themselves:
			
 
				+#
			
 
				+#	"<p>@@QQ@@": Start of a quick quiz.
			
 
				+#	"<p>@@QQA@@": Start of a quick-quiz answer.
			
 
				+#	"<p>@@QQE@@": End of a quick-quiz answer, and thus of the quick quiz.
			
 
				+#	"<p>@@QQAL@@": Place to put quick-quiz answer list.
			
 
				+#
			
 
				+# Places the result in file.html.
			
 
				+#
			
 
				+# This program is free software; you can redistribute it and/or modify
			
 
				+# it under the terms of the GNU General Public License as published by
			
 
				+# the Free Software Foundation; either version 2 of the License, or
			
 
				+# (at your option) any later version.
			
 
				+#
			
 
				+# This program is distributed in the hope that it will be useful,
			
 
				+# but WITHOUT ANY WARRANTY; without even the implied warranty of
			
 
				+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
			
 
				+# GNU General Public License for more details.
			
 
				+#
			
 
				+# You should have received a copy of the GNU General Public License
			
 
				+# along with this program; if not, you can access it online at
			
 
				+# http://www.gnu.org/licenses/gpl-2.0.html.
			
 
				+#
			
 
				+# Copyright (c) 2013 Paul E. McKenney, IBM Corporation.
			
 
				+
			
 
				+fn=$1
			
 
				+if test ! -r $fn.htmlx
			
 
				+then
			
 
				+	echo "Error: $fn.htmlx unreadable."
			
 
				+	exit 1
			
 
				+fi
			
 
				+
			
 
				+echo "<!-- DO NOT HAND EDIT. -->" > $fn.html
			
 
				+echo "<!-- Instead, edit $fn.htmlx and run 'sh htmlqqz.sh $fn' -->" >> $fn.html
			
 
				+awk < $fn.htmlx >> $fn.html '
			
 
				+
			
 
				+state == "" && $1 != "<p>@@QQ@@" && $1 != "<p>@@QQAL@@" {
			
 
				+	print $0;
			
 
				+	if ($0 ~ /^<p>@@QQ/)
			
 
				+		print "Bad Quick Quiz command: " NR " (expected <p>@@QQ@@ or <p>@@QQAL@@)." > "/dev/stderr"
			
 
				+	next;
			
 
				+}
			
 
				+
			
 
				+state == "" && $1 == "<p>@@QQ@@" {
			
 
				+	qqn++;
			
 
				+	qqlineno = NR;
			
 
				+	haveqq = 1;
			
 
				+	state = "qq";
			
 
				+	print "<p><a name=\"Quick Quiz " qqn "\"><b>Quick Quiz " qqn "</b>:</a>"
			
 
				+	next;
			
 
				+}
			
 
				+
			
 
				+state == "qq" && $1 != "<p>@@QQA@@" {
			
 
				+	qq[qqn] = qq[qqn] $0 "\n";
			
 
				+	print $0
			
 
				+	if ($0 ~ /^<p>@@QQ/)
			
 
				+		print "Bad Quick Quiz command: " NR ". (expected <p>@@QQA@@)" > "/dev/stderr"
			
 
				+	next;
			
 
				+}
			
 
				+
			
 
				+state == "qq" && $1 == "<p>@@QQA@@" {
			
 
				+	state = "qqa";
			
 
				+	print "<br><a href=\"#qq" qqn "answer\">Answer</a>"
			
 
				+	next;
			
 
				+}
			
 
				+
			
 
				+state == "qqa" && $1 != "<p>@@QQE@@" {
			
 
				+	qqa[qqn] = qqa[qqn] $0 "\n";
			
 
				+	if ($0 ~ /^<p>@@QQ/)
			
 
				+		print "Bad Quick Quiz command: " NR " (expected <p>@@QQE@@)." > "/dev/stderr"
			
 
				+	next;
			
 
				+}
			
 
				+
			
 
				+state == "qqa" && $1 == "<p>@@QQE@@" {
			
 
				+	state = "";
			
 
				+	next;
			
 
				+}
			
 
				+
			
 
				+state == "" && $1 == "<p>@@QQAL@@" {
			
 
				+	haveqq = "";
			
 
				+	print "<h3><a name=\"Answers to Quick Quizzes\">"
			
 
				+	print "Answers to Quick Quizzes</a></h3>"
			
 
				+	print "";
			
 
				+	for (i = 1; i <= qqn; i++) {
			
 
				+		print "<a name=\"qq" i "answer\"></a>"
			
 
				+		print "<p><b>Quick Quiz " i "</b>:"
			
 
				+		print qq[i];
			
 
				+		print "";
			
 
				+		print "</p><p><b>Answer</b>:"
			
 
				+		print qqa[i];
			
 
				+		print "";
			
 
				+		print "</p><p><a href=\"#Quick%20Quiz%20" i "\"><b>Back to Quick Quiz " i "</b>.</a>"
			
 
				+		print "";
			
 
				+	}
			
 
				+	next;
			
 
				+}
			
 
				+
			
 
				+END {
			
 
				+	if (state != "")
			
 
				+		print "Unterminated Quick Quiz: " qqlineno "." > "/dev/stderr"
			
 
				+	else if (haveqq)
			
 
				+		print "Missing \"<p>@@QQAL@@\", no Quick Quiz." > "/dev/stderr"
			
 
				+}'
			
--- a/Documentation/kernel-parameters.txt
+++ b/Documentation/kernel-parameters.txt
@@ -3296,18 +3296,35 @@ bytes respectively. Such letter suffixes can also be entirely omitted.
 
				 	rcutorture.verbose= [KNL]
			
 
				 			Enable additional printk() statements.
			
 
				 
			
 
				+	rcupdate.rcu_cpu_stall_suppress= [KNL]
			
 
				+			Suppress RCU CPU stall warning messages.
			
 
				+
			
 
				+	rcupdate.rcu_cpu_stall_timeout= [KNL]
			
 
				+			Set timeout for RCU CPU stall warning messages.
			
 
				+
			
 
				 	rcupdate.rcu_expedited= [KNL]
			
 
				 			Use expedited grace-period primitives, for
			
 
				 			example, synchronize_rcu_expedited() instead
			
 
				 			of synchronize_rcu().  This reduces latency,
			
 
				 			but can increase CPU utilization, degrade
			
 
				 			real-time latency, and degrade energy efficiency.
			
 
				-
			
 
				-	rcupdate.rcu_cpu_stall_suppress= [KNL]
			
 
				-			Suppress RCU CPU stall warning messages.
			
 
				-
			
 
				-	rcupdate.rcu_cpu_stall_timeout= [KNL]
			
 
				-			Set timeout for RCU CPU stall warning messages.
			
 
				+			No effect on CONFIG_TINY_RCU kernels.
			
 
				+
			
 
				+	rcupdate.rcu_normal= [KNL]
			
 
				+			Use only normal grace-period primitives,
			
 
				+			for example, synchronize_rcu() instead of
			
 
				+			synchronize_rcu_expedited().  This improves
			
 
				+			real-time latency, CPU utilization, and
			
 
				+			energy efficiency, but can expose users to
			
 
				+			increased grace-period latency.  This parameter
			
 
				+			overrides rcupdate.rcu_expedited.  No effect on
			
 
				+			CONFIG_TINY_RCU kernels.
			
 
				+
			
 
				+	rcupdate.rcu_normal_after_boot= [KNL]
			
 
				+			Once boot has completed (that is, after
			
 
				+			rcu_end_inkernel_boot() has been invoked), use
			
 
				+			only normal grace-period primitives.  No effect
			
 
				+			on CONFIG_TINY_RCU kernels.
			
 
				 
			
 
				 	rcupdate.rcu_task_stall_timeout= [KNL]
			
 
				 			Set timeout in jiffies for RCU task stall warning
			
--- a/Documentation/memory-barriers.txt
+++ b/Documentation/memory-barriers.txt
@@ -194,7 +194,7 @@ There are some minimal guarantees that may be expected of a CPU:
 
				  (*) On any given CPU, dependent memory accesses will be issued in order, with
			
 
				      respect to itself.  This means that for:
			
 
				 
			
 
				-	WRITE_ONCE(Q, P); smp_read_barrier_depends(); D = READ_ONCE(*Q);
			
 
				+	Q = READ_ONCE(P); smp_read_barrier_depends(); D = READ_ONCE(*Q);
			
 
				 
			
 
				      the CPU will issue the following memory operations:
			
 
				 
			
@@ -202,9 +202,9 @@ There are some minimal guarantees that may be expected of a CPU:
 
				 
			
 
				      and always in that order.  On most systems, smp_read_barrier_depends()
			
 
				      does nothing, but it is required for DEC Alpha.  The READ_ONCE()
			
 
				-     and WRITE_ONCE() are required to prevent compiler mischief.  Please
			
 
				-     note that you should normally use something like rcu_dereference()
			
 
				-     instead of open-coding smp_read_barrier_depends().
			
 
				+     is required to prevent compiler mischief.  Please note that you
			
 
				+     should normally use something like rcu_dereference() instead of
			
 
				+     open-coding smp_read_barrier_depends().
			
 
				 
			
 
				  (*) Overlapping loads and stores within a particular CPU will appear to be
			
 
				      ordered within that CPU.  This means that for:
			
--- a/drivers/tty/sysrq.c
+++ b/drivers/tty/sysrq.c
@@ -133,6 +133,12 @@ static void sysrq_handle_crash(int key)
 
				 {
			
 
				 	char *killer = NULL;
			
 
				 
			
 
				+	/* we need to release the RCU read lock here,
			
 
				+	 * otherwise we get an annoying
			
 
				+	 * 'BUG: sleeping function called from invalid context'
			
 
				+	 * complaint from the kernel before the panic.
			
 
				+	 */
			
 
				+	rcu_read_unlock();
			
 
				 	panic_on_oops = 1;	/* force panic */
			
 
				 	wmb();
			
 
				 	*killer = 1;
			
--- a/include/linux/list.h
+++ b/include/linux/list.h
@@ -24,7 +24,7 @@
 
				 
			
 
				 static inline void INIT_LIST_HEAD(struct list_head *list)
			
 
				 {
			
 
				-	list->next = list;
			
 
				+	WRITE_ONCE(list->next, list);
			
 
				 	list->prev = list;
			
 
				 }
			
 
				 
			
@@ -42,7 +42,7 @@ static inline void __list_add(struct list_head *new,
 
				 	next->prev = new;
			
 
				 	new->next = next;
			
 
				 	new->prev = prev;
			
 
				-	prev->next = new;
			
 
				+	WRITE_ONCE(prev->next, new);
			
 
				 }
			
 
				 #else
			
 
				 extern void __list_add(struct list_head *new,
			
@@ -186,7 +186,7 @@ static inline int list_is_last(const struct list_head *list,
 
				  */
			
 
				 static inline int list_empty(const struct list_head *head)
			
 
				 {
			
 
				-	return head->next == head;
			
 
				+	return READ_ONCE(head->next) == head;
			
 
				 }
			
 
				 
			
 
				 /**
			
@@ -608,7 +608,7 @@ static inline int hlist_unhashed(const struct hlist_node *h)
 
				 
			
 
				 static inline int hlist_empty(const struct hlist_head *h)
			
 
				 {
			
 
				-	return !h->first;
			
 
				+	return !READ_ONCE(h->first);
			
 
				 }
			
 
				 
			
 
				 static inline void __hlist_del(struct hlist_node *n)
			
@@ -642,7 +642,7 @@ static inline void hlist_add_head(struct hlist_node *n, struct hlist_head *h)
 
				 	n->next = first;
			
 
				 	if (first)
			
 
				 		first->pprev = &n->next;
			
 
				-	h->first = n;
			
 
				+	WRITE_ONCE(h->first, n);
			
 
				 	n->pprev = &h->first;
			
 
				 }
			
 
				 
			
@@ -653,14 +653,14 @@ static inline void hlist_add_before(struct hlist_node *n,
 
				 	n->pprev = next->pprev;
			
 
				 	n->next = next;
			
 
				 	next->pprev = &n->next;
			
 
				-	*(n->pprev) = n;
			
 
				+	WRITE_ONCE(*(n->pprev), n);
			
 
				 }
			
 
				 
			
 
				 static inline void hlist_add_behind(struct hlist_node *n,
			
 
				 				    struct hlist_node *prev)
			
 
				 {
			
 
				 	n->next = prev->next;
			
 
				-	prev->next = n;
			
 
				+	WRITE_ONCE(prev->next, n);
			
 
				 	n->pprev = &prev->next;
			
 
				 
			
 
				 	if (n->next)
			
--- a/include/linux/list_bl.h
+++ b/include/linux/list_bl.h
@@ -70,7 +70,7 @@ static inline void hlist_bl_set_first(struct hlist_bl_head *h,
 
				 
			
 
				 static inline int hlist_bl_empty(const struct hlist_bl_head *h)
			
 
				 {
			
 
				-	return !((unsigned long)h->first & ~LIST_BL_LOCKMASK);
			
 
				+	return !((unsigned long)READ_ONCE(h->first) & ~LIST_BL_LOCKMASK);
			
 
				 }
			
 
				 
			
 
				 static inline void hlist_bl_add_head(struct hlist_bl_node *n,
			
--- a/include/linux/list_nulls.h
+++ b/include/linux/list_nulls.h
@@ -57,7 +57,7 @@ static inline int hlist_nulls_unhashed(const struct hlist_nulls_node *h)
 
				 
			
 
				 static inline int hlist_nulls_empty(const struct hlist_nulls_head *h)
			
 
				 {
			
 
				-	return is_a_nulls(h->first);
			
 
				+	return is_a_nulls(READ_ONCE(h->first));
			
 
				 }
			
 
				 
			
 
				 static inline void hlist_nulls_add_head(struct hlist_nulls_node *n,
			
--- a/include/linux/rculist.h
+++ b/include/linux/rculist.h
@@ -179,32 +179,31 @@ static inline void list_replace_rcu(struct list_head *old,
 
				 }
			
 
				 
			
 
				 /**
			
 
				- * list_splice_init_rcu - splice an RCU-protected list into an existing list.
			
 
				+ * __list_splice_init_rcu - join an RCU-protected list into an existing list.
			
 
				  * @list:	the RCU-protected list to splice
			
 
				- * @head:	the place in the list to splice the first list into
			
 
				+ * @prev:	points to the last element of the existing list
			
 
				+ * @next:	points to the first element of the existing list
			
 
				  * @sync:	function to sync: synchronize_rcu(), synchronize_sched(), ...
			
 
				  *
			
 
				- * @head can be RCU-read traversed concurrently with this function.
			
 
				+ * The list pointed to by @prev and @next can be RCU-read traversed
			
 
				+ * concurrently with this function.
			
 
				  *
			
 
				  * Note that this function blocks.
			
 
				  *
			
 
				- * Important note: the caller must take whatever action is necessary to
			
 
				- *	prevent any other updates to @head.  In principle, it is possible
			
 
				- *	to modify the list as soon as sync() begins execution.
			
 
				- *	If this sort of thing becomes necessary, an alternative version
			
 
				- *	based on call_rcu() could be created.  But only if -really-
			
 
				- *	needed -- there is no shortage of RCU API members.
			
 
				+ * Important note: the caller must take whatever action is necessary to prevent
			
 
				+ * any other updates to the existing list.  In principle, it is possible to
			
 
				+ * modify the list as soon as sync() begins execution. If this sort of thing
			
 
				+ * becomes necessary, an alternative version based on call_rcu() could be
			
 
				+ * created.  But only if -really- needed -- there is no shortage of RCU API
			
 
				+ * members.
			
 
				  */
			
 
				-static inline void list_splice_init_rcu(struct list_head *list,
			
 
				-					struct list_head *head,
			
 
				-					void (*sync)(void))
			
 
				+static inline void __list_splice_init_rcu(struct list_head *list,
			
 
				+					  struct list_head *prev,
			
 
				+					  struct list_head *next,
			
 
				+					  void (*sync)(void))
			
 
				 {
			
 
				 	struct list_head *first = list->next;
			
 
				 	struct list_head *last = list->prev;
			
 
				-	struct list_head *at = head->next;
			
 
				-
			
 
				-	if (list_empty(list))
			
 
				-		return;
			
 
				 
			
 
				 	/*
			
 
				 	 * "first" and "last" tracking list, so initialize it.  RCU readers
			
@@ -231,10 +230,40 @@ static inline void list_splice_init_rcu(struct list_head *list,
 
				 	 * this function.
			
 
				 	 */
			
 
				 
			
 
				-	last->next = at;
			
 
				-	rcu_assign_pointer(list_next_rcu(head), first);
			
 
				-	first->prev = head;
			
 
				-	at->prev = last;
			
 
				+	last->next = next;
			
 
				+	rcu_assign_pointer(list_next_rcu(prev), first);
			
 
				+	first->prev = prev;
			
 
				+	next->prev = last;
			
 
				+}
			
 
				+
			
 
				+/**
			
 
				+ * list_splice_init_rcu - splice an RCU-protected list into an existing list,
			
 
				+ *                        designed for stacks.
			
 
				+ * @list:	the RCU-protected list to splice
			
 
				+ * @head:	the place in the existing list to splice the first list into
			
 
				+ * @sync:	function to sync: synchronize_rcu(), synchronize_sched(), ...
			
 
				+ */
			
 
				+static inline void list_splice_init_rcu(struct list_head *list,
			
 
				+					struct list_head *head,
			
 
				+					void (*sync)(void))
			
 
				+{
			
 
				+	if (!list_empty(list))
			
 
				+		__list_splice_init_rcu(list, head, head->next, sync);
			
 
				+}
			
 
				+
			
 
				+/**
			
 
				+ * list_splice_tail_init_rcu - splice an RCU-protected list into an existing
			
 
				+ *                             list, designed for queues.
			
 
				+ * @list:	the RCU-protected list to splice
			
 
				+ * @head:	the place in the existing list to splice the first list into
			
 
				+ * @sync:	function to sync: synchronize_rcu(), synchronize_sched(), ...
			
 
				+ */
			
 
				+static inline void list_splice_tail_init_rcu(struct list_head *list,
			
 
				+					     struct list_head *head,
			
 
				+					     void (*sync)(void))
			
 
				+{
			
 
				+	if (!list_empty(list))
			
 
				+		__list_splice_init_rcu(list, head->prev, head, sync);
			
 
				 }
			
 
				 
			
 
				 /**
			
@@ -304,6 +333,42 @@ static inline void list_splice_init_rcu(struct list_head *list,
 
				 		&pos->member != (head); \
			
 
				 		pos = list_entry_rcu(pos->member.next, typeof(*pos), member))
			
 
				 
			
 
				+/**
			
 
				+ * list_entry_lockless - get the struct for this entry
			
 
				+ * @ptr:        the &struct list_head pointer.
			
 
				+ * @type:       the type of the struct this is embedded in.
			
 
				+ * @member:     the name of the list_head within the struct.
			
 
				+ *
			
 
				+ * This primitive may safely run concurrently with the _rcu list-mutation
			
 
				+ * primitives such as list_add_rcu(), but requires some implicit RCU
			
 
				+ * read-side guarding.  One example is running within a special
			
 
				+ * exception-time environment where preemption is disabled and where
			
 
				+ * lockdep cannot be invoked (in which case updaters must use RCU-sched,
			
 
				+ * as in synchronize_sched(), call_rcu_sched(), and friends).  Another
			
 
				+ * example is when items are added to the list, but never deleted.
			
 
				+ */
			
 
				+#define list_entry_lockless(ptr, type, member) \
			
 
				+	container_of((typeof(ptr))lockless_dereference(ptr), type, member)
			
 
				+
			
 
				+/**
			
 
				+ * list_for_each_entry_lockless - iterate over rcu list of given type
			
 
				+ * @pos:	the type * to use as a loop cursor.
			
 
				+ * @head:	the head for your list.
			
 
				+ * @member:	the name of the list_head within the struct.
			
 
				+ *
			
 
				+ * This primitive may safely run concurrently with the _rcu list-mutation
			
 
				+ * primitives such as list_add_rcu(), but requires some implicit RCU
			
 
				+ * read-side guarding.  One example is running within a special
			
 
				+ * exception-time environment where preemption is disabled and where
			
 
				+ * lockdep cannot be invoked (in which case updaters must use RCU-sched,
			
 
				+ * as in synchronize_sched(), call_rcu_sched(), and friends).  Another
			
 
				+ * example is when items are added to the list, but never deleted.
			
 
				+ */
			
 
				+#define list_for_each_entry_lockless(pos, head, member) \
			
 
				+	for (pos = list_entry_lockless((head)->next, typeof(*pos), member); \
			
 
				+	     &pos->member != (head); \
			
 
				+	     pos = list_entry_lockless(pos->member.next, typeof(*pos), member))
			
 
				+
			
 
				 /**
			
 
				  * list_for_each_entry_continue_rcu - continue iteration over list of given type
			
 
				  * @pos:	the type * to use as a loop cursor.
			
--- a/include/linux/rcupdate.h
+++ b/include/linux/rcupdate.h
@@ -48,10 +48,17 @@
 
				 
			
 
				 #include <asm/barrier.h>
			
 
				 
			
 
				+#ifndef CONFIG_TINY_RCU
			
 
				 extern int rcu_expedited; /* for sysctl */
			
 
				+extern int rcu_normal;    /* also for sysctl */
			
 
				+#endif /* #ifndef CONFIG_TINY_RCU */
			
 
				 
			
 
				 #ifdef CONFIG_TINY_RCU
			
 
				 /* Tiny RCU doesn't expedite, as its purpose in life is instead to be tiny. */
			
 
				+static inline bool rcu_gp_is_normal(void)  /* Internal RCU use. */
			
 
				+{
			
 
				+	return true;
			
 
				+}
			
 
				 static inline bool rcu_gp_is_expedited(void)  /* Internal RCU use. */
			
 
				 {
			
 
				 	return false;
			
@@ -65,6 +72,7 @@ static inline void rcu_unexpedite_gp(void)
 
				 {
			
 
				 }
			
 
				 #else /* #ifdef CONFIG_TINY_RCU */
			
 
				+bool rcu_gp_is_normal(void);     /* Internal RCU use. */
			
 
				 bool rcu_gp_is_expedited(void);  /* Internal RCU use. */
			
 
				 void rcu_expedite_gp(void);
			
 
				 void rcu_unexpedite_gp(void);
			
@@ -321,7 +329,6 @@ static inline int rcu_preempt_depth(void)
 
				 
			
 
				 /* Internal to kernel */
			
 
				 void rcu_init(void);
			
 
				-void rcu_end_inkernel_boot(void);
			
 
				 void rcu_sched_qs(void);
			
 
				 void rcu_bh_qs(void);
			
 
				 void rcu_check_callbacks(int user);
			
@@ -329,6 +336,12 @@ struct notifier_block;
 
				 int rcu_cpu_notify(struct notifier_block *self,
			
 
				 		   unsigned long action, void *hcpu);
			
 
				 
			
 
				+#ifndef CONFIG_TINY_RCU
			
 
				+void rcu_end_inkernel_boot(void);
			
 
				+#else /* #ifndef CONFIG_TINY_RCU */
			
 
				+static inline void rcu_end_inkernel_boot(void) { }
			
 
				+#endif /* #ifndef CONFIG_TINY_RCU */
			
 
				+
			
 
				 #ifdef CONFIG_RCU_STALL_COMMON
			
 
				 void rcu_sysrq_start(void);
			
 
				 void rcu_sysrq_end(void);
			
@@ -379,9 +392,9 @@ static inline void rcu_init_nohz(void)
 
				  */
			
 
				 #define RCU_NONIDLE(a) \
			
 
				 	do { \
			
 
				-		rcu_irq_enter(); \
			
 
				+		rcu_irq_enter_irqson(); \
			
 
				 		do { a; } while (0); \
			
 
				-		rcu_irq_exit(); \
			
 
				+		rcu_irq_exit_irqson(); \
			
 
				 	} while (0)
			
 
				 
			
 
				 /*
			
@@ -741,7 +754,7 @@ static inline void rcu_preempt_sleep_check(void)
 
				  * The tracing infrastructure traces RCU (we want that), but unfortunately
			
 
				  * some of the RCU checks causes tracing to lock up the system.
			
 
				  *
			
 
				- * The tracing version of rcu_dereference_raw() must not call
			
 
				+ * The no-tracing version of rcu_dereference_raw() must not call
			
 
				  * rcu_read_lock_held().
			
 
				  */
			
 
				 #define rcu_dereference_raw_notrace(p) __rcu_dereference_check((p), 1, __rcu)
			
--- a/include/linux/rcutiny.h
+++ b/include/linux/rcutiny.h
@@ -181,6 +181,14 @@ static inline void rcu_irq_enter(void)
 
				 {
			
 
				 }
			
 
				 
			
 
				+static inline void rcu_irq_exit_irqson(void)
			
 
				+{
			
 
				+}
			
 
				+
			
 
				+static inline void rcu_irq_enter_irqson(void)
			
 
				+{
			
 
				+}
			
 
				+
			
 
				 static inline void rcu_irq_exit(void)
			
 
				 {
			
 
				 }
			
--- a/include/linux/rcutree.h
+++ b/include/linux/rcutree.h
@@ -37,7 +37,7 @@ void rcu_cpu_stall_reset(void);
 
				 /*
			
 
				  * Note a virtualization-based context switch.  This is simply a
			
 
				  * wrapper around rcu_note_context_switch(), which allows TINY_RCU
			
 
				- * to save a few bytes.
			
 
				+ * to save a few bytes. The caller must have disabled interrupts.
			
 
				  */
			
 
				 static inline void rcu_virt_note_context_switch(int cpu)
			
 
				 {
			
@@ -97,6 +97,8 @@ void rcu_idle_enter(void);
 
				 void rcu_idle_exit(void);
			
 
				 void rcu_irq_enter(void);
			
 
				 void rcu_irq_exit(void);
			
 
				+void rcu_irq_enter_irqson(void);
			
 
				+void rcu_irq_exit_irqson(void);
			
 
				 
			
 
				 void exit_rcu(void);
			
 
				 
			
--- a/include/linux/tracepoint.h
+++ b/include/linux/tracepoint.h
@@ -171,8 +171,8 @@ extern void syscall_unregfunc(void);
 
				 				TP_PROTO(data_proto),			\
			
 
				 				TP_ARGS(data_args),			\
			
 
				 				TP_CONDITION(cond),			\
			
 
				-				rcu_irq_enter(),			\
			
 
				-				rcu_irq_exit());			\
			
 
				+				rcu_irq_enter_irqson(),			\
			
 
				+				rcu_irq_exit_irqson());			\
			
 
				 	}
			
 
				 #else
			
 
				 #define __DECLARE_TRACE_RCU(name, proto, args, cond, data_proto, data_args)
			
--- a/init/main.c
+++ b/init/main.c
@@ -943,6 +943,8 @@ static int __ref kernel_init(void *unused)
 
				 
			
 
				 	flush_delayed_fput();
			
 
				 
			
 
				+	rcu_end_inkernel_boot();
			
 
				+
			
 
				 	if (ramdisk_execute_command) {
			
 
				 		ret = run_init_process(ramdisk_execute_command);
			
 
				 		if (!ret)
			
--- a/kernel/ksysfs.c
+++ b/kernel/ksysfs.c
@@ -20,7 +20,7 @@
 
				 #include <linux/capability.h>
			
 
				 #include <linux/compiler.h>
			
 
				 
			
 
				-#include <linux/rcupdate.h>	/* rcu_expedited */
			
 
				+#include <linux/rcupdate.h>	/* rcu_expedited and rcu_normal */
			
 
				 
			
 
				 #define KERNEL_ATTR_RO(_name) \
			
 
				 static struct kobj_attribute _name##_attr = __ATTR_RO(_name)
			
@@ -144,11 +144,12 @@ static ssize_t fscaps_show(struct kobject *kobj,
 
				 }
			
 
				 KERNEL_ATTR_RO(fscaps);
			
 
				 
			
 
				+#ifndef CONFIG_TINY_RCU
			
 
				 int rcu_expedited;
			
 
				 static ssize_t rcu_expedited_show(struct kobject *kobj,
			
 
				 				  struct kobj_attribute *attr, char *buf)
			
 
				 {
			
 
				-	return sprintf(buf, "%d\n", rcu_expedited);
			
 
				+	return sprintf(buf, "%d\n", READ_ONCE(rcu_expedited));
			
 
				 }
			
 
				 static ssize_t rcu_expedited_store(struct kobject *kobj,
			
 
				 				   struct kobj_attribute *attr,
			
@@ -161,6 +162,24 @@ static ssize_t rcu_expedited_store(struct kobject *kobj,
 
				 }
			
 
				 KERNEL_ATTR_RW(rcu_expedited);
			
 
				 
			
 
				+int rcu_normal;
			
 
				+static ssize_t rcu_normal_show(struct kobject *kobj,
			
 
				+			       struct kobj_attribute *attr, char *buf)
			
 
				+{
			
 
				+	return sprintf(buf, "%d\n", READ_ONCE(rcu_normal));
			
 
				+}
			
 
				+static ssize_t rcu_normal_store(struct kobject *kobj,
			
 
				+				struct kobj_attribute *attr,
			
 
				+				const char *buf, size_t count)
			
 
				+{
			
 
				+	if (kstrtoint(buf, 0, &rcu_normal))
			
 
				+		return -EINVAL;
			
 
				+
			
 
				+	return count;
			
 
				+}
			
 
				+KERNEL_ATTR_RW(rcu_normal);
			
 
				+#endif /* #ifndef CONFIG_TINY_RCU */
			
 
				+
			
 
				 /*
			
 
				  * Make /sys/kernel/notes give the raw contents of our kernel .notes section.
			
 
				  */
			
@@ -202,7 +221,10 @@ static struct attribute * kernel_attrs[] = {
 
				 	&kexec_crash_size_attr.attr,
			
 
				 	&vmcoreinfo_attr.attr,
			
 
				 #endif
			
 
				+#ifndef CONFIG_TINY_RCU
			
 
				 	&rcu_expedited_attr.attr,
			
 
				+	&rcu_normal_attr.attr,
			
 
				+#endif
			
 
				 	NULL
			
 
				 };
			
 
				 
			
--- a/kernel/rcu/rcutorture.c
+++ b/kernel/rcu/rcutorture.c
@@ -162,6 +162,27 @@ static int rcu_torture_writer_state;
 
				 #define RTWS_SYNC		7
			
 
				 #define RTWS_STUTTER		8
			
 
				 #define RTWS_STOPPING		9
			
 
				+static const char * const rcu_torture_writer_state_names[] = {
			
 
				+	"RTWS_FIXED_DELAY",
			
 
				+	"RTWS_DELAY",
			
 
				+	"RTWS_REPLACE",
			
 
				+	"RTWS_DEF_FREE",
			
 
				+	"RTWS_EXP_SYNC",
			
 
				+	"RTWS_COND_GET",
			
 
				+	"RTWS_COND_SYNC",
			
 
				+	"RTWS_SYNC",
			
 
				+	"RTWS_STUTTER",
			
 
				+	"RTWS_STOPPING",
			
 
				+};
			
 
				+
			
 
				+static const char *rcu_torture_writer_state_getname(void)
			
 
				+{
			
 
				+	unsigned int i = READ_ONCE(rcu_torture_writer_state);
			
 
				+
			
 
				+	if (i >= ARRAY_SIZE(rcu_torture_writer_state_names))
			
 
				+		return "???";
			
 
				+	return rcu_torture_writer_state_names[i];
			
 
				+}
			
 
				 
			
 
				 #if defined(MODULE) || defined(CONFIG_RCU_TORTURE_TEST_RUNNABLE)
			
 
				 #define RCUTORTURE_RUNNABLE_INIT 1
			
@@ -1307,7 +1328,8 @@ rcu_torture_stats_print(void)
 
				 
			
 
				 		rcutorture_get_gp_data(cur_ops->ttype,
			
 
				 				       &flags, &gpnum, &completed);
			
 
				-		pr_alert("??? Writer stall state %d g%lu c%lu f%#x\n",
			
 
				+		pr_alert("??? Writer stall state %s(%d) g%lu c%lu f%#x\n",
			
 
				+			 rcu_torture_writer_state_getname(),
			
 
				 			 rcu_torture_writer_state,
			
 
				 			 gpnum, completed, flags);
			
 
				 		show_rcu_gp_kthreads();
			
--- a/kernel/rcu/srcu.c
+++ b/kernel/rcu/srcu.c
@@ -489,7 +489,7 @@ static void __synchronize_srcu(struct srcu_struct *sp, int trycount)
 
				  */
			
 
				 void synchronize_srcu(struct srcu_struct *sp)
			
 
				 {
			
 
				-	__synchronize_srcu(sp, rcu_gp_is_expedited()
			
 
				+	__synchronize_srcu(sp, (rcu_gp_is_expedited() && !rcu_gp_is_normal())
			
 
				 			   ? SYNCHRONIZE_SRCU_EXP_TRYCOUNT
			
 
				 			   : SYNCHRONIZE_SRCU_TRYCOUNT);
			
 
				 }
			
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -68,10 +68,6 @@ MODULE_ALIAS("rcutree");
 
				 
			
 
				 /* Data structures. */
			
 
				 
			
 
				-static struct lock_class_key rcu_node_class[RCU_NUM_LVLS];
			
 
				-static struct lock_class_key rcu_fqs_class[RCU_NUM_LVLS];
			
 
				-static struct lock_class_key rcu_exp_class[RCU_NUM_LVLS];
			
 
				-
			
 
				 /*
			
 
				  * In order to export the rcu_state name to the tracing tools, it
			
 
				  * needs to be added in the __tracepoint_string section.
			
@@ -246,24 +242,17 @@ static int rcu_gp_in_progress(struct rcu_state *rsp)
 
				  */
			
 
				 void rcu_sched_qs(void)
			
 
				 {
			
 
				-	unsigned long flags;
			
 
				-
			
 
				-	if (__this_cpu_read(rcu_sched_data.cpu_no_qs.s)) {
			
 
				-		trace_rcu_grace_period(TPS("rcu_sched"),
			
 
				-				       __this_cpu_read(rcu_sched_data.gpnum),
			
 
				-				       TPS("cpuqs"));
			
 
				-		__this_cpu_write(rcu_sched_data.cpu_no_qs.b.norm, false);
			
 
				-		if (!__this_cpu_read(rcu_sched_data.cpu_no_qs.b.exp))
			
 
				-			return;
			
 
				-		local_irq_save(flags);
			
 
				-		if (__this_cpu_read(rcu_sched_data.cpu_no_qs.b.exp)) {
			
 
				-			__this_cpu_write(rcu_sched_data.cpu_no_qs.b.exp, false);
			
 
				-			rcu_report_exp_rdp(&rcu_sched_state,
			
 
				-					   this_cpu_ptr(&rcu_sched_data),
			
 
				-					   true);
			
 
				-		}
			
 
				-		local_irq_restore(flags);
			
 
				-	}
			
 
				+	if (!__this_cpu_read(rcu_sched_data.cpu_no_qs.s))
			
 
				+		return;
			
 
				+	trace_rcu_grace_period(TPS("rcu_sched"),
			
 
				+			       __this_cpu_read(rcu_sched_data.gpnum),
			
 
				+			       TPS("cpuqs"));
			
 
				+	__this_cpu_write(rcu_sched_data.cpu_no_qs.b.norm, false);
			
 
				+	if (!__this_cpu_read(rcu_sched_data.cpu_no_qs.b.exp))
			
 
				+		return;
			
 
				+	__this_cpu_write(rcu_sched_data.cpu_no_qs.b.exp, false);
			
 
				+	rcu_report_exp_rdp(&rcu_sched_state,
			
 
				+			   this_cpu_ptr(&rcu_sched_data), true);
			
 
				 }
			
 
				 
			
 
				 void rcu_bh_qs(void)
			
@@ -300,17 +289,16 @@ EXPORT_PER_CPU_SYMBOL_GPL(rcu_qs_ctr);
 
				  * We inform the RCU core by emulating a zero-duration dyntick-idle
			
 
				  * period, which we in turn do by incrementing the ->dynticks counter
			
 
				  * by two.
			
 
				+ *
			
 
				+ * The caller must have disabled interrupts.
			
 
				  */
			
 
				 static void rcu_momentary_dyntick_idle(void)
			
 
				 {
			
 
				-	unsigned long flags;
			
 
				 	struct rcu_data *rdp;
			
 
				 	struct rcu_dynticks *rdtp;
			
 
				 	int resched_mask;
			
 
				 	struct rcu_state *rsp;
			
 
				 
			
 
				-	local_irq_save(flags);
			
 
				-
			
 
				 	/*
			
 
				 	 * Yes, we can lose flag-setting operations.  This is OK, because
			
 
				 	 * the flag will be set again after some delay.
			
@@ -340,13 +328,12 @@ static void rcu_momentary_dyntick_idle(void)
 
				 		smp_mb__after_atomic(); /* Later stuff after QS. */
			
 
				 		break;
			
 
				 	}
			
 
				-	local_irq_restore(flags);
			
 
				 }
			
 
				 
			
 
				 /*
			
 
				  * Note a context switch.  This is a quiescent state for RCU-sched,
			
 
				  * and requires special handling for preemptible RCU.
			
 
				- * The caller must have disabled preemption.
			
 
				+ * The caller must have disabled interrupts.
			
 
				  */
			
 
				 void rcu_note_context_switch(void)
			
 
				 {
			
@@ -376,9 +363,14 @@ EXPORT_SYMBOL_GPL(rcu_note_context_switch);
 
				  */
			
 
				 void rcu_all_qs(void)
			
 
				 {
			
 
				+	unsigned long flags;
			
 
				+
			
 
				 	barrier(); /* Avoid RCU read-side critical sections leaking down. */
			
 
				-	if (unlikely(raw_cpu_read(rcu_sched_qs_mask)))
			
 
				+	if (unlikely(raw_cpu_read(rcu_sched_qs_mask))) {
			
 
				+		local_irq_save(flags);
			
 
				 		rcu_momentary_dyntick_idle();
			
 
				+		local_irq_restore(flags);
			
 
				+	}
			
 
				 	this_cpu_inc(rcu_qs_ctr);
			
 
				 	barrier(); /* Avoid RCU read-side critical sections leaking up. */
			
 
				 }
			
@@ -605,25 +597,25 @@ static int rcu_future_needs_gp(struct rcu_state *rsp)
 
				  * The caller must have disabled interrupts to prevent races with
			
 
				  * normal callback registry.
			
 
				  */
			
 
				-static int
			
 
				+static bool
			
 
				 cpu_needs_another_gp(struct rcu_state *rsp, struct rcu_data *rdp)
			
 
				 {
			
 
				 	int i;
			
 
				 
			
 
				 	if (rcu_gp_in_progress(rsp))
			
 
				-		return 0;  /* No, a grace period is already in progress. */
			
 
				+		return false;  /* No, a grace period is already in progress. */
			
 
				 	if (rcu_future_needs_gp(rsp))
			
 
				-		return 1;  /* Yes, a no-CBs CPU needs one. */
			
 
				+		return true;  /* Yes, a no-CBs CPU needs one. */
			
 
				 	if (!rdp->nxttail[RCU_NEXT_TAIL])
			
 
				-		return 0;  /* No, this is a no-CBs (or offline) CPU. */
			
 
				+		return false;  /* No, this is a no-CBs (or offline) CPU. */
			
 
				 	if (*rdp->nxttail[RCU_NEXT_READY_TAIL])
			
 
				-		return 1;  /* Yes, this CPU has newly registered callbacks. */
			
 
				+		return true;  /* Yes, CPU has newly registered callbacks. */
			
 
				 	for (i = RCU_WAIT_TAIL; i < RCU_NEXT_TAIL; i++)
			
 
				 		if (rdp->nxttail[i - 1] != rdp->nxttail[i] &&
			
 
				 		    ULONG_CMP_LT(READ_ONCE(rsp->completed),
			
 
				 				 rdp->nxtcompleted[i]))
			
 
				-			return 1;  /* Yes, CBs for future grace period. */
			
 
				-	return 0; /* No grace period needed. */
			
 
				+			return true;  /* Yes, CBs for future grace period. */
			
 
				+	return false; /* No grace period needed. */
			
 
				 }
			
 
				 
			
 
				 /*
			
@@ -740,7 +732,7 @@ void rcu_user_enter(void)
 
				  *
			
 
				  * Exit from an interrupt handler, which might possibly result in entering
			
 
				  * idle mode, in other words, leaving the mode in which read-side critical
			
 
				- * sections can occur.
			
 
				+ * sections can occur.  The caller must have disabled interrupts.
			
 
				  *
			
 
				  * This code assumes that the idle loop never does anything that might
			
 
				  * result in unbalanced calls to irq_enter() and irq_exit().  If your
			
@@ -753,11 +745,10 @@ void rcu_user_enter(void)
 
				  */
			
 
				 void rcu_irq_exit(void)
			
 
				 {
			
 
				-	unsigned long flags;
			
 
				 	long long oldval;
			
 
				 	struct rcu_dynticks *rdtp;
			
 
				 
			
 
				-	local_irq_save(flags);
			
 
				+	RCU_LOCKDEP_WARN(!irqs_disabled(), "rcu_irq_exit() invoked with irqs enabled!!!");
			
 
				 	rdtp = this_cpu_ptr(&rcu_dynticks);
			
 
				 	oldval = rdtp->dynticks_nesting;
			
 
				 	rdtp->dynticks_nesting--;
			
@@ -768,6 +759,17 @@ void rcu_irq_exit(void)
 
				 	else
			
 
				 		rcu_eqs_enter_common(oldval, true);
			
 
				 	rcu_sysidle_enter(1);
			
 
				+}
			
 
				+
			
 
				+/*
			
 
				+ * Wrapper for rcu_irq_exit() where interrupts are enabled.
			
 
				+ */
			
 
				+void rcu_irq_exit_irqson(void)
			
 
				+{
			
 
				+	unsigned long flags;
			
 
				+
			
 
				+	local_irq_save(flags);
			
 
				+	rcu_irq_exit();
			
 
				 	local_irq_restore(flags);
			
 
				 }
			
 
				 
			
@@ -865,7 +867,7 @@ void rcu_user_exit(void)
 
				  *
			
 
				  * Enter an interrupt handler, which might possibly result in exiting
			
 
				  * idle mode, in other words, entering the mode in which read-side critical
			
 
				- * sections can occur.
			
 
				+ * sections can occur.  The caller must have disabled interrupts.
			
 
				  *
			
 
				  * Note that the Linux kernel is fully capable of entering an interrupt
			
 
				  * handler that it never exits, for example when doing upcalls to
			
@@ -881,11 +883,10 @@ void rcu_user_exit(void)
 
				  */
			
 
				 void rcu_irq_enter(void)
			
 
				 {
			
 
				-	unsigned long flags;
			
 
				 	struct rcu_dynticks *rdtp;
			
 
				 	long long oldval;
			
 
				 
			
 
				-	local_irq_save(flags);
			
 
				+	RCU_LOCKDEP_WARN(!irqs_disabled(), "rcu_irq_enter() invoked with irqs enabled!!!");
			
 
				 	rdtp = this_cpu_ptr(&rcu_dynticks);
			
 
				 	oldval = rdtp->dynticks_nesting;
			
 
				 	rdtp->dynticks_nesting++;
			
@@ -896,6 +897,17 @@ void rcu_irq_enter(void)
 
				 	else
			
 
				 		rcu_eqs_exit_common(oldval, true);
			
 
				 	rcu_sysidle_exit(1);
			
 
				+}
			
 
				+
			
 
				+/*
			
 
				+ * Wrapper for rcu_irq_enter() where interrupts are enabled.
			
 
				+ */
			
 
				+void rcu_irq_enter_irqson(void)
			
 
				+{
			
 
				+	unsigned long flags;
			
 
				+
			
 
				+	local_irq_save(flags);
			
 
				+	rcu_irq_enter();
			
 
				 	local_irq_restore(flags);
			
 
				 }
			
 
				 
			
@@ -1186,6 +1198,16 @@ static void record_gp_stall_check_time(struct rcu_state *rsp)
 
				 	rsp->n_force_qs_gpstart = READ_ONCE(rsp->n_force_qs);
			
 
				 }
			
 
				 
			
 
				+/*
			
 
				+ * Convert a ->gp_state value to a character string.
			
 
				+ */
			
 
				+static const char *gp_state_getname(short gs)
			
 
				+{
			
 
				+	if (gs < 0 || gs >= ARRAY_SIZE(gp_state_names))
			
 
				+		return "???";
			
 
				+	return gp_state_names[gs];
			
 
				+}
			
 
				+
			
 
				 /*
			
 
				  * Complain about starvation of grace-period kthread.
			
 
				  */
			
@@ -1196,12 +1218,16 @@ static void rcu_check_gp_kthread_starvation(struct rcu_state *rsp)
 
				 
			
 
				 	j = jiffies;
			
 
				 	gpa = READ_ONCE(rsp->gp_activity);
			
 
				-	if (j - gpa > 2 * HZ)
			
 
				-		pr_err("%s kthread starved for %ld jiffies! g%lu c%lu f%#x s%d ->state=%#lx\n",
			
 
				+	if (j - gpa > 2 * HZ) {
			
 
				+		pr_err("%s kthread starved for %ld jiffies! g%lu c%lu f%#x %s(%d) ->state=%#lx\n",
			
 
				 		       rsp->name, j - gpa,
			
 
				 		       rsp->gpnum, rsp->completed,
			
 
				-		       rsp->gp_flags, rsp->gp_state,
			
 
				-		       rsp->gp_kthread ? rsp->gp_kthread->state : 0);
			
 
				+		       rsp->gp_flags,
			
 
				+		       gp_state_getname(rsp->gp_state), rsp->gp_state,
			
 
				+		       rsp->gp_kthread ? rsp->gp_kthread->state : ~0);
			
 
				+		if (rsp->gp_kthread)
			
 
				+			sched_show_task(rsp->gp_kthread);
			
 
				+	}
			
 
				 }
			
 
				 
			
 
				 /*
			
@@ -1214,7 +1240,7 @@ static void rcu_dump_cpu_stacks(struct rcu_state *rsp)
 
				 	struct rcu_node *rnp;
			
 
				 
			
 
				 	rcu_for_each_leaf_node(rsp, rnp) {
			
 
				-		raw_spin_lock_irqsave(&rnp->lock, flags);
			
 
				+		raw_spin_lock_irqsave_rcu_node(rnp, flags);
			
 
				 		if (rnp->qsmask != 0) {
			
 
				 			for (cpu = 0; cpu <= rnp->grphi - rnp->grplo; cpu++)
			
 
				 				if (rnp->qsmask & (1UL << cpu))
			
@@ -1237,7 +1263,7 @@ static void print_other_cpu_stall(struct rcu_state *rsp, unsigned long gpnum)
 
				 
			
 
				 	/* Only let one CPU complain about others per time interval. */
			
 
				 
			
 
				-	raw_spin_lock_irqsave(&rnp->lock, flags);
			
 
				+	raw_spin_lock_irqsave_rcu_node(rnp, flags);
			
 
				 	delta = jiffies - READ_ONCE(rsp->jiffies_stall);
			
 
				 	if (delta < RCU_STALL_RAT_DELAY || !rcu_gp_in_progress(rsp)) {
			
 
				 		raw_spin_unlock_irqrestore(&rnp->lock, flags);
			
@@ -1256,7 +1282,7 @@ static void print_other_cpu_stall(struct rcu_state *rsp, unsigned long gpnum)
 
				 	       rsp->name);
			
 
				 	print_cpu_stall_info_begin();
			
 
				 	rcu_for_each_leaf_node(rsp, rnp) {
			
 
				-		raw_spin_lock_irqsave(&rnp->lock, flags);
			
 
				+		raw_spin_lock_irqsave_rcu_node(rnp, flags);
			
 
				 		ndetected += rcu_print_task_stall(rnp);
			
 
				 		if (rnp->qsmask != 0) {
			
 
				 			for (cpu = 0; cpu <= rnp->grphi - rnp->grplo; cpu++)
			
@@ -1327,7 +1353,7 @@ static void print_cpu_stall(struct rcu_state *rsp)
 
				 
			
 
				 	rcu_dump_cpu_stacks(rsp);
			
 
				 
			
 
				-	raw_spin_lock_irqsave(&rnp->lock, flags);
			
 
				+	raw_spin_lock_irqsave_rcu_node(rnp, flags);
			
 
				 	if (ULONG_CMP_GE(jiffies, READ_ONCE(rsp->jiffies_stall)))
			
 
				 		WRITE_ONCE(rsp->jiffies_stall,
			
 
				 			   jiffies + 3 * rcu_jiffies_till_stall_check() + 3);
			
@@ -1534,10 +1560,8 @@ rcu_start_future_gp(struct rcu_node *rnp, struct rcu_data *rdp,
 
				 	 * hold it, acquire the root rcu_node structure's lock in order to
			
 
				 	 * start one (if needed).
			
 
				 	 */
			
 
				-	if (rnp != rnp_root) {
			
 
				-		raw_spin_lock(&rnp_root->lock);
			
 
				-		smp_mb__after_unlock_lock();
			
 
				-	}
			
 
				+	if (rnp != rnp_root)
			
 
				+		raw_spin_lock_rcu_node(rnp_root);
			
 
				 
			
 
				 	/*
			
 
				 	 * Get a new grace-period number.  If there really is no grace
			
@@ -1786,11 +1810,10 @@ static void note_gp_changes(struct rcu_state *rsp, struct rcu_data *rdp)
 
				 	if ((rdp->gpnum == READ_ONCE(rnp->gpnum) &&
			
 
				 	     rdp->completed == READ_ONCE(rnp->completed) &&
			
 
				 	     !unlikely(READ_ONCE(rdp->gpwrap))) || /* w/out lock. */
			
 
				-	    !raw_spin_trylock(&rnp->lock)) { /* irqs already off, so later. */
			
 
				+	    !raw_spin_trylock_rcu_node(rnp)) { /* irqs already off, so later. */
			
 
				 		local_irq_restore(flags);
			
 
				 		return;
			
 
				 	}
			
 
				-	smp_mb__after_unlock_lock();
			
 
				 	needwake = __note_gp_changes(rsp, rnp, rdp);
			
 
				 	raw_spin_unlock_irqrestore(&rnp->lock, flags);
			
 
				 	if (needwake)
			
@@ -1805,21 +1828,20 @@ static void rcu_gp_slow(struct rcu_state *rsp, int delay)
 
				 }
			
 
				 
			
 
				 /*
			
 
				- * Initialize a new grace period.  Return 0 if no grace period required.
			
 
				+ * Initialize a new grace period.  Return false if no grace period required.
			
 
				  */
			
 
				-static int rcu_gp_init(struct rcu_state *rsp)
			
 
				+static bool rcu_gp_init(struct rcu_state *rsp)
			
 
				 {
			
 
				 	unsigned long oldmask;
			
 
				 	struct rcu_data *rdp;
			
 
				 	struct rcu_node *rnp = rcu_get_root(rsp);
			
 
				 
			
 
				 	WRITE_ONCE(rsp->gp_activity, jiffies);
			
 
				-	raw_spin_lock_irq(&rnp->lock);
			
 
				-	smp_mb__after_unlock_lock();
			
 
				+	raw_spin_lock_irq_rcu_node(rnp);
			
 
				 	if (!READ_ONCE(rsp->gp_flags)) {
			
 
				 		/* Spurious wakeup, tell caller to go back to sleep.  */
			
 
				 		raw_spin_unlock_irq(&rnp->lock);
			
 
				-		return 0;
			
 
				+		return false;
			
 
				 	}
			
 
				 	WRITE_ONCE(rsp->gp_flags, 0); /* Clear all flags: New grace period. */
			
 
				 
			
@@ -1829,7 +1851,7 @@ static int rcu_gp_init(struct rcu_state *rsp)
 
				 		 * Not supposed to be able to happen.
			
 
				 		 */
			
 
				 		raw_spin_unlock_irq(&rnp->lock);
			
 
				-		return 0;
			
 
				+		return false;
			
 
				 	}
			
 
				 
			
 
				 	/* Advance to a new grace period and initialize state. */
			
@@ -1847,8 +1869,7 @@ static int rcu_gp_init(struct rcu_state *rsp)
 
				 	 */
			
 
				 	rcu_for_each_leaf_node(rsp, rnp) {
			
 
				 		rcu_gp_slow(rsp, gp_preinit_delay);
			
 
				-		raw_spin_lock_irq(&rnp->lock);
			
 
				-		smp_mb__after_unlock_lock();
			
 
				+		raw_spin_lock_irq_rcu_node(rnp);
			
 
				 		if (rnp->qsmaskinit == rnp->qsmaskinitnext &&
			
 
				 		    !rnp->wait_blkd_tasks) {
			
 
				 			/* Nothing to do on this leaf rcu_node structure. */
			
@@ -1904,8 +1925,7 @@ static int rcu_gp_init(struct rcu_state *rsp)
 
				 	 */
			
 
				 	rcu_for_each_node_breadth_first(rsp, rnp) {
			
 
				 		rcu_gp_slow(rsp, gp_init_delay);
			
 
				-		raw_spin_lock_irq(&rnp->lock);
			
 
				-		smp_mb__after_unlock_lock();
			
 
				+		raw_spin_lock_irq_rcu_node(rnp);
			
 
				 		rdp = this_cpu_ptr(rsp->rda);
			
 
				 		rcu_preempt_check_blocked_tasks(rnp);
			
 
				 		rnp->qsmask = rnp->qsmaskinit;
			
@@ -1923,7 +1943,7 @@ static int rcu_gp_init(struct rcu_state *rsp)
 
				 		WRITE_ONCE(rsp->gp_activity, jiffies);
			
 
				 	}
			
 
				 
			
 
				-	return 1;
			
 
				+	return true;
			
 
				 }
			
 
				 
			
 
				 /*
			
@@ -1973,8 +1993,7 @@ static void rcu_gp_fqs(struct rcu_state *rsp, bool first_time)
 
				 	}
			
 
				 	/* Clear flag to prevent immediate re-entry. */
			
 
				 	if (READ_ONCE(rsp->gp_flags) & RCU_GP_FLAG_FQS) {
			
 
				-		raw_spin_lock_irq(&rnp->lock);
			
 
				-		smp_mb__after_unlock_lock();
			
 
				+		raw_spin_lock_irq_rcu_node(rnp);
			
 
				 		WRITE_ONCE(rsp->gp_flags,
			
 
				 			   READ_ONCE(rsp->gp_flags) & ~RCU_GP_FLAG_FQS);
			
 
				 		raw_spin_unlock_irq(&rnp->lock);
			
@@ -1993,8 +2012,7 @@ static void rcu_gp_cleanup(struct rcu_state *rsp)
 
				 	struct rcu_node *rnp = rcu_get_root(rsp);
			
 
				 
			
 
				 	WRITE_ONCE(rsp->gp_activity, jiffies);
			
 
				-	raw_spin_lock_irq(&rnp->lock);
			
 
				-	smp_mb__after_unlock_lock();
			
 
				+	raw_spin_lock_irq_rcu_node(rnp);
			
 
				 	gp_duration = jiffies - rsp->gp_start;
			
 
				 	if (gp_duration > rsp->gp_max)
			
 
				 		rsp->gp_max = gp_duration;
			
@@ -2019,8 +2037,7 @@ static void rcu_gp_cleanup(struct rcu_state *rsp)
 
				 	 * grace period is recorded in any of the rcu_node structures.
			
 
				 	 */
			
 
				 	rcu_for_each_node_breadth_first(rsp, rnp) {
			
 
				-		raw_spin_lock_irq(&rnp->lock);
			
 
				-		smp_mb__after_unlock_lock();
			
 
				+		raw_spin_lock_irq_rcu_node(rnp);
			
 
				 		WARN_ON_ONCE(rcu_preempt_blocked_readers_cgp(rnp));
			
 
				 		WARN_ON_ONCE(rnp->qsmask);
			
 
				 		WRITE_ONCE(rnp->completed, rsp->gpnum);
			
@@ -2035,8 +2052,7 @@ static void rcu_gp_cleanup(struct rcu_state *rsp)
 
				 		rcu_gp_slow(rsp, gp_cleanup_delay);
			
 
				 	}
			
 
				 	rnp = rcu_get_root(rsp);
			
 
				-	raw_spin_lock_irq(&rnp->lock);
			
 
				-	smp_mb__after_unlock_lock(); /* Order GP before ->completed update. */
			
 
				+	raw_spin_lock_irq_rcu_node(rnp); /* Order GP before ->completed update. */
			
 
				 	rcu_nocb_gp_set(rnp, nocb);
			
 
				 
			
 
				 	/* Declare grace period done. */
			
@@ -2284,8 +2300,7 @@ rcu_report_qs_rnp(unsigned long mask, struct rcu_state *rsp,
 
				 		raw_spin_unlock_irqrestore(&rnp->lock, flags);
			
 
				 		rnp_c = rnp;
			
 
				 		rnp = rnp->parent;
			
 
				-		raw_spin_lock_irqsave(&rnp->lock, flags);
			
 
				-		smp_mb__after_unlock_lock();
			
 
				+		raw_spin_lock_irqsave_rcu_node(rnp, flags);
			
 
				 		oldmask = rnp_c->qsmask;
			
 
				 	}
			
 
				 
			
@@ -2332,8 +2347,7 @@ static void rcu_report_unblock_qs_rnp(struct rcu_state *rsp,
 
				 	gps = rnp->gpnum;
			
 
				 	mask = rnp->grpmask;
			
 
				 	raw_spin_unlock(&rnp->lock);	/* irqs remain disabled. */
			
 
				-	raw_spin_lock(&rnp_p->lock);	/* irqs already disabled. */
			
 
				-	smp_mb__after_unlock_lock();
			
 
				+	raw_spin_lock_rcu_node(rnp_p);	/* irqs already disabled. */
			
 
				 	rcu_report_qs_rnp(mask, rsp, rnp_p, gps, flags);
			
 
				 }
			
 
				 
			
@@ -2355,8 +2369,7 @@ rcu_report_qs_rdp(int cpu, struct rcu_state *rsp, struct rcu_data *rdp)
 
				 	struct rcu_node *rnp;
			
 
				 
			
 
				 	rnp = rdp->mynode;
			
 
				-	raw_spin_lock_irqsave(&rnp->lock, flags);
			
 
				-	smp_mb__after_unlock_lock();
			
 
				+	raw_spin_lock_irqsave_rcu_node(rnp, flags);
			
 
				 	if ((rdp->cpu_no_qs.b.norm &&
			
 
				 	     rdp->rcu_qs_ctr_snap == __this_cpu_read(rcu_qs_ctr)) ||
			
 
				 	    rdp->gpnum != rnp->gpnum || rnp->completed == rnp->gpnum ||
			
@@ -2582,8 +2595,7 @@ static void rcu_cleanup_dead_rnp(struct rcu_node *rnp_leaf)
 
				 		rnp = rnp->parent;
			
 
				 		if (!rnp)
			
 
				 			break;
			
 
				-		raw_spin_lock(&rnp->lock); /* irqs already disabled. */
			
 
				-		smp_mb__after_unlock_lock(); /* GP memory ordering. */
			
 
				+		raw_spin_lock_rcu_node(rnp); /* irqs already disabled. */
			
 
				 		rnp->qsmaskinit &= ~mask;
			
 
				 		rnp->qsmask &= ~mask;
			
 
				 		if (rnp->qsmaskinit) {
			
@@ -2611,8 +2623,7 @@ static void rcu_cleanup_dying_idle_cpu(int cpu, struct rcu_state *rsp)
 
				 
			
 
				 	/* Remove outgoing CPU from mask in the leaf rcu_node structure. */
			
 
				 	mask = rdp->grpmask;
			
 
				-	raw_spin_lock_irqsave(&rnp->lock, flags);
			
 
				-	smp_mb__after_unlock_lock();	/* Enforce GP memory-order guarantee. */
			
 
				+	raw_spin_lock_irqsave_rcu_node(rnp, flags); /* Enforce GP memory-order guarantee. */
			
 
				 	rnp->qsmaskinitnext &= ~mask;
			
 
				 	raw_spin_unlock_irqrestore(&rnp->lock, flags);
			
 
				 }
			
@@ -2809,8 +2820,7 @@ static void force_qs_rnp(struct rcu_state *rsp,
 
				 	rcu_for_each_leaf_node(rsp, rnp) {
			
 
				 		cond_resched_rcu_qs();
			
 
				 		mask = 0;
			
 
				-		raw_spin_lock_irqsave(&rnp->lock, flags);
			
 
				-		smp_mb__after_unlock_lock();
			
 
				+		raw_spin_lock_irqsave_rcu_node(rnp, flags);
			
 
				 		if (rnp->qsmask == 0) {
			
 
				 			if (rcu_state_p == &rcu_sched_state ||
			
 
				 			    rsp != rcu_state_p ||
			
@@ -2881,8 +2891,7 @@ static void force_quiescent_state(struct rcu_state *rsp)
 
				 	/* rnp_old == rcu_get_root(rsp), rnp == NULL. */
			
 
				 
			
 
				 	/* Reached the root of the rcu_node tree, acquire lock. */
			
 
				-	raw_spin_lock_irqsave(&rnp_old->lock, flags);
			
 
				-	smp_mb__after_unlock_lock();
			
 
				+	raw_spin_lock_irqsave_rcu_node(rnp_old, flags);
			
 
				 	raw_spin_unlock(&rnp_old->fqslock);
			
 
				 	if (READ_ONCE(rsp->gp_flags) & RCU_GP_FLAG_FQS) {
			
 
				 		rsp->n_force_qs_lh++;
			
@@ -2914,7 +2923,7 @@ __rcu_process_callbacks(struct rcu_state *rsp)
 
				 	/* Does this CPU require a not-yet-started grace period? */
			
 
				 	local_irq_save(flags);
			
 
				 	if (cpu_needs_another_gp(rsp, rdp)) {
			
 
				-		raw_spin_lock(&rcu_get_root(rsp)->lock); /* irqs disabled. */
			
 
				+		raw_spin_lock_rcu_node(rcu_get_root(rsp)); /* irqs disabled. */
			
 
				 		needwake = rcu_start_gp(rsp);
			
 
				 		raw_spin_unlock_irqrestore(&rcu_get_root(rsp)->lock, flags);
			
 
				 		if (needwake)
			
@@ -3005,8 +3014,7 @@ static void __call_rcu_core(struct rcu_state *rsp, struct rcu_data *rdp,
 
				 		if (!rcu_gp_in_progress(rsp)) {
			
 
				 			struct rcu_node *rnp_root = rcu_get_root(rsp);
			
 
				 
			
 
				-			raw_spin_lock(&rnp_root->lock);
			
 
				-			smp_mb__after_unlock_lock();
			
 
				+			raw_spin_lock_rcu_node(rnp_root);
			
 
				 			needwake = rcu_start_gp(rsp);
			
 
				 			raw_spin_unlock(&rnp_root->lock);
			
 
				 			if (needwake)
			
@@ -3365,7 +3373,6 @@ static unsigned long rcu_seq_snap(unsigned long *sp)
 
				 {
			
 
				 	unsigned long s;
			
 
				 
			
 
				-	smp_mb(); /* Caller's modifications seen first by other CPUs. */
			
 
				 	s = (READ_ONCE(*sp) + 3) & ~0x1;
			
 
				 	smp_mb(); /* Above access must not bleed into critical section. */
			
 
				 	return s;
			
@@ -3392,6 +3399,7 @@ static void rcu_exp_gp_seq_end(struct rcu_state *rsp)
 
				 }
			
 
				 static unsigned long rcu_exp_gp_seq_snap(struct rcu_state *rsp)
			
 
				 {
			
 
				+	smp_mb(); /* Caller's modifications seen first by other CPUs. */
			
 
				 	return rcu_seq_snap(&rsp->expedited_sequence);
			
 
				 }
			
 
				 static bool rcu_exp_gp_seq_done(struct rcu_state *rsp, unsigned long s)
			
@@ -3426,8 +3434,7 @@ static void sync_exp_reset_tree_hotplug(struct rcu_state *rsp)
 
				 	 * CPUs for the current rcu_node structure up the rcu_node tree.
			
 
				 	 */
			
 
				 	rcu_for_each_leaf_node(rsp, rnp) {
			
 
				-		raw_spin_lock_irqsave(&rnp->lock, flags);
			
 
				-		smp_mb__after_unlock_lock();
			
 
				+		raw_spin_lock_irqsave_rcu_node(rnp, flags);
			
 
				 		if (rnp->expmaskinit == rnp->expmaskinitnext) {
			
 
				 			raw_spin_unlock_irqrestore(&rnp->lock, flags);
			
 
				 			continue;  /* No new CPUs, nothing to do. */
			
@@ -3447,8 +3454,7 @@ static void sync_exp_reset_tree_hotplug(struct rcu_state *rsp)
 
				 		rnp_up = rnp->parent;
			
 
				 		done = false;
			
 
				 		while (rnp_up) {
			
 
				-			raw_spin_lock_irqsave(&rnp_up->lock, flags);
			
 
				-			smp_mb__after_unlock_lock();
			
 
				+			raw_spin_lock_irqsave_rcu_node(rnp_up, flags);
			
 
				 			if (rnp_up->expmaskinit)
			
 
				 				done = true;
			
 
				 			rnp_up->expmaskinit |= mask;
			
@@ -3472,8 +3478,7 @@ static void __maybe_unused sync_exp_reset_tree(struct rcu_state *rsp)
 
				 
			
 
				 	sync_exp_reset_tree_hotplug(rsp);
			
 
				 	rcu_for_each_node_breadth_first(rsp, rnp) {
			
 
				-		raw_spin_lock_irqsave(&rnp->lock, flags);
			
 
				-		smp_mb__after_unlock_lock();
			
 
				+		raw_spin_lock_irqsave_rcu_node(rnp, flags);
			
 
				 		WARN_ON_ONCE(rnp->expmask);
			
 
				 		rnp->expmask = rnp->expmaskinit;
			
 
				 		raw_spin_unlock_irqrestore(&rnp->lock, flags);
			
@@ -3531,8 +3536,7 @@ static void __rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp,
 
				 		mask = rnp->grpmask;
			
 
				 		raw_spin_unlock(&rnp->lock); /* irqs remain disabled */
			
 
				 		rnp = rnp->parent;
			
 
				-		raw_spin_lock(&rnp->lock); /* irqs already disabled */
			
 
				-		smp_mb__after_unlock_lock();
			
 
				+		raw_spin_lock_rcu_node(rnp); /* irqs already disabled */
			
 
				 		WARN_ON_ONCE(!(rnp->expmask & mask));
			
 
				 		rnp->expmask &= ~mask;
			
 
				 	}
			
@@ -3549,8 +3553,7 @@ static void __maybe_unused rcu_report_exp_rnp(struct rcu_state *rsp,
 
				 {
			
 
				 	unsigned long flags;
			
 
				 
			
 
				-	raw_spin_lock_irqsave(&rnp->lock, flags);
			
 
				-	smp_mb__after_unlock_lock();
			
 
				+	raw_spin_lock_irqsave_rcu_node(rnp, flags);
			
 
				 	__rcu_report_exp_rnp(rsp, rnp, wake, flags);
			
 
				 }
			
 
				 
			
@@ -3564,8 +3567,7 @@ static void rcu_report_exp_cpu_mult(struct rcu_state *rsp, struct rcu_node *rnp,
 
				 {
			
 
				 	unsigned long flags;
			
 
				 
			
 
				-	raw_spin_lock_irqsave(&rnp->lock, flags);
			
 
				-	smp_mb__after_unlock_lock();
			
 
				+	raw_spin_lock_irqsave_rcu_node(rnp, flags);
			
 
				 	if (!(rnp->expmask & mask)) {
			
 
				 		raw_spin_unlock_irqrestore(&rnp->lock, flags);
			
 
				 		return;
			
@@ -3609,7 +3611,7 @@ static bool sync_exp_work_done(struct rcu_state *rsp, struct rcu_node *rnp,
 
				  */
			
 
				 static struct rcu_node *exp_funnel_lock(struct rcu_state *rsp, unsigned long s)
			
 
				 {
			
 
				-	struct rcu_data *rdp;
			
 
				+	struct rcu_data *rdp = per_cpu_ptr(rsp->rda, raw_smp_processor_id());
			
 
				 	struct rcu_node *rnp0;
			
 
				 	struct rcu_node *rnp1 = NULL;
			
 
				 
			
@@ -3623,7 +3625,7 @@ static struct rcu_node *exp_funnel_lock(struct rcu_state *rsp, unsigned long s)
 
				 	if (!mutex_is_locked(&rnp0->exp_funnel_mutex)) {
			
 
				 		if (mutex_trylock(&rnp0->exp_funnel_mutex)) {
			
 
				 			if (sync_exp_work_done(rsp, rnp0, NULL,
			
 
				-					       &rsp->expedited_workdone0, s))
			
 
				+					       &rdp->expedited_workdone0, s))
			
 
				 				return NULL;
			
 
				 			return rnp0;
			
 
				 		}
			
@@ -3637,14 +3639,13 @@ static struct rcu_node *exp_funnel_lock(struct rcu_state *rsp, unsigned long s)
 
				 	 * can be inexact, as it is just promoting locality and is not
			
 
				 	 * strictly needed for correctness.
			
 
				 	 */
			
 
				-	rdp = per_cpu_ptr(rsp->rda, raw_smp_processor_id());
			
 
				-	if (sync_exp_work_done(rsp, NULL, NULL, &rsp->expedited_workdone1, s))
			
 
				+	if (sync_exp_work_done(rsp, NULL, NULL, &rdp->expedited_workdone1, s))
			
 
				 		return NULL;
			
 
				 	mutex_lock(&rdp->exp_funnel_mutex);
			
 
				 	rnp0 = rdp->mynode;
			
 
				 	for (; rnp0 != NULL; rnp0 = rnp0->parent) {
			
 
				 		if (sync_exp_work_done(rsp, rnp1, rdp,
			
 
				-				       &rsp->expedited_workdone2, s))
			
 
				+				       &rdp->expedited_workdone2, s))
			
 
				 			return NULL;
			
 
				 		mutex_lock(&rnp0->exp_funnel_mutex);
			
 
				 		if (rnp1)
			
@@ -3654,7 +3655,7 @@ static struct rcu_node *exp_funnel_lock(struct rcu_state *rsp, unsigned long s)
 
				 		rnp1 = rnp0;
			
 
				 	}
			
 
				 	if (sync_exp_work_done(rsp, rnp1, rdp,
			
 
				-			       &rsp->expedited_workdone3, s))
			
 
				+			       &rdp->expedited_workdone3, s))
			
 
				 		return NULL;
			
 
				 	return rnp1;
			
 
				 }
			
@@ -3708,8 +3709,7 @@ static void sync_rcu_exp_select_cpus(struct rcu_state *rsp,
 
				 
			
 
				 	sync_exp_reset_tree(rsp);
			
 
				 	rcu_for_each_leaf_node(rsp, rnp) {
			
 
				-		raw_spin_lock_irqsave(&rnp->lock, flags);
			
 
				-		smp_mb__after_unlock_lock();
			
 
				+		raw_spin_lock_irqsave_rcu_node(rnp, flags);
			
 
				 
			
 
				 		/* Each pass checks a CPU for identity, offline, and idle. */
			
 
				 		mask_ofl_test = 0;
			
@@ -3741,24 +3741,22 @@ retry_ipi:
 
				 			ret = smp_call_function_single(cpu, func, rsp, 0);
			
 
				 			if (!ret) {
			
 
				 				mask_ofl_ipi &= ~mask;
			
 
				-			} else {
			
 
				-				/* Failed, raced with offline. */
			
 
				-				raw_spin_lock_irqsave(&rnp->lock, flags);
			
 
				-				if (cpu_online(cpu) &&
			
 
				-				    (rnp->expmask & mask)) {
			
 
				-					raw_spin_unlock_irqrestore(&rnp->lock,
			
 
				-								   flags);
			
 
				-					schedule_timeout_uninterruptible(1);
			
 
				-					if (cpu_online(cpu) &&
			
 
				-					    (rnp->expmask & mask))
			
 
				-						goto retry_ipi;
			
 
				-					raw_spin_lock_irqsave(&rnp->lock,
			
 
				-							      flags);
			
 
				-				}
			
 
				-				if (!(rnp->expmask & mask))
			
 
				-					mask_ofl_ipi &= ~mask;
			
 
				+				continue;
			
 
				+			}
			
 
				+			/* Failed, raced with offline. */
			
 
				+			raw_spin_lock_irqsave_rcu_node(rnp, flags);
			
 
				+			if (cpu_online(cpu) &&
			
 
				+			    (rnp->expmask & mask)) {
			
 
				 				raw_spin_unlock_irqrestore(&rnp->lock, flags);
			
 
				+				schedule_timeout_uninterruptible(1);
			
 
				+				if (cpu_online(cpu) &&
			
 
				+				    (rnp->expmask & mask))
			
 
				+					goto retry_ipi;
			
 
				+				raw_spin_lock_irqsave_rcu_node(rnp, flags);
			
 
				 			}
			
 
				+			if (!(rnp->expmask & mask))
			
 
				+				mask_ofl_ipi &= ~mask;
			
 
				+			raw_spin_unlock_irqrestore(&rnp->lock, flags);
			
 
				 		}
			
 
				 		/* Report quiescent states for those that went offline. */
			
 
				 		mask_ofl_test |= mask_ofl_ipi;
			
@@ -3773,6 +3771,7 @@ static void synchronize_sched_expedited_wait(struct rcu_state *rsp)
 
				 	unsigned long jiffies_stall;
			
 
				 	unsigned long jiffies_start;
			
 
				 	unsigned long mask;
			
 
				+	int ndetected;
			
 
				 	struct rcu_node *rnp;
			
 
				 	struct rcu_node *rnp_root = rcu_get_root(rsp);
			
 
				 	int ret;
			
@@ -3785,7 +3784,7 @@ static void synchronize_sched_expedited_wait(struct rcu_state *rsp)
 
				 				rsp->expedited_wq,
			
 
				 				sync_rcu_preempt_exp_done(rnp_root),
			
 
				 				jiffies_stall);
			
 
				-		if (ret > 0)
			
 
				+		if (ret > 0 || sync_rcu_preempt_exp_done(rnp_root))
			
 
				 			return;
			
 
				 		if (ret < 0) {
			
 
				 			/* Hit a signal, disable CPU stall warnings. */
			
@@ -3795,14 +3794,16 @@ static void synchronize_sched_expedited_wait(struct rcu_state *rsp)
 
				 		}
			
 
				 		pr_err("INFO: %s detected expedited stalls on CPUs/tasks: {",
			
 
				 		       rsp->name);
			
 
				+		ndetected = 0;
			
 
				 		rcu_for_each_leaf_node(rsp, rnp) {
			
 
				-			(void)rcu_print_task_exp_stall(rnp);
			
 
				+			ndetected = rcu_print_task_exp_stall(rnp);
			
 
				 			mask = 1;
			
 
				 			for (cpu = rnp->grplo; cpu <= rnp->grphi; cpu++, mask <<= 1) {
			
 
				 				struct rcu_data *rdp;
			
 
				 
			
 
				 				if (!(rnp->expmask & mask))
			
 
				 					continue;
			
 
				+				ndetected++;
			
 
				 				rdp = per_cpu_ptr(rsp->rda, cpu);
			
 
				 				pr_cont(" %d-%c%c%c", cpu,
			
 
				 					"O."[cpu_online(cpu)],
			
@@ -3811,8 +3812,23 @@ static void synchronize_sched_expedited_wait(struct rcu_state *rsp)
 
				 			}
			
 
				 			mask <<= 1;
			
 
				 		}
			
 
				-		pr_cont(" } %lu jiffies s: %lu\n",
			
 
				-			jiffies - jiffies_start, rsp->expedited_sequence);
			
 
				+		pr_cont(" } %lu jiffies s: %lu root: %#lx/%c\n",
			
 
				+			jiffies - jiffies_start, rsp->expedited_sequence,
			
 
				+			rnp_root->expmask, ".T"[!!rnp_root->exp_tasks]);
			
 
				+		if (!ndetected) {
			
 
				+			pr_err("blocking rcu_node structures:");
			
 
				+			rcu_for_each_node_breadth_first(rsp, rnp) {
			
 
				+				if (rnp == rnp_root)
			
 
				+					continue; /* printed unconditionally */
			
 
				+				if (sync_rcu_preempt_exp_done(rnp))
			
 
				+					continue;
			
 
				+				pr_cont(" l=%u:%d-%d:%#lx/%c",
			
 
				+					rnp->level, rnp->grplo, rnp->grphi,
			
 
				+					rnp->expmask,
			
 
				+					".T"[!!rnp->exp_tasks]);
			
 
				+			}
			
 
				+			pr_cont("\n");
			
 
				+		}
			
 
				 		rcu_for_each_leaf_node(rsp, rnp) {
			
 
				 			mask = 1;
			
 
				 			for (cpu = rnp->grplo; cpu <= rnp->grphi; cpu++, mask <<= 1) {
			
@@ -3847,6 +3863,16 @@ void synchronize_sched_expedited(void)
 
				 	struct rcu_node *rnp;
			
 
				 	struct rcu_state *rsp = &rcu_sched_state;
			
 
				 
			
 
				+	/* If only one CPU, this is automatically a grace period. */
			
 
				+	if (rcu_blocking_is_gp())
			
 
				+		return;
			
 
				+
			
 
				+	/* If expedited grace periods are prohibited, fall back to normal. */
			
 
				+	if (rcu_gp_is_normal()) {
			
 
				+		wait_rcu_gp(call_rcu_sched);
			
 
				+		return;
			
 
				+	}
			
 
				+
			
 
				 	/* Take a snapshot of the sequence number.  */
			
 
				 	s = rcu_exp_gp_seq_snap(rsp);
			
 
				 
			
@@ -4135,7 +4161,7 @@ static void rcu_init_new_rnp(struct rcu_node *rnp_leaf)
 
				 		rnp = rnp->parent;
			
 
				 		if (rnp == NULL)
			
 
				 			return;
			
 
				-		raw_spin_lock(&rnp->lock); /* Interrupts already disabled. */
			
 
				+		raw_spin_lock_rcu_node(rnp); /* Interrupts already disabled. */
			
 
				 		rnp->qsmaskinit |= mask;
			
 
				 		raw_spin_unlock(&rnp->lock); /* Interrupts remain disabled. */
			
 
				 	}
			
@@ -4152,7 +4178,7 @@ rcu_boot_init_percpu_data(int cpu, struct rcu_state *rsp)
 
				 	struct rcu_node *rnp = rcu_get_root(rsp);
			
 
				 
			
 
				 	/* Set up local state, ensuring consistent view of global state. */
			
 
				-	raw_spin_lock_irqsave(&rnp->lock, flags);
			
 
				+	raw_spin_lock_irqsave_rcu_node(rnp, flags);
			
 
				 	rdp->grpmask = 1UL << (cpu - rdp->mynode->grplo);
			
 
				 	rdp->dynticks = &per_cpu(rcu_dynticks, cpu);
			
 
				 	WARN_ON_ONCE(rdp->dynticks->dynticks_nesting != DYNTICK_TASK_EXIT_IDLE);
			
@@ -4179,7 +4205,7 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp)
 
				 	struct rcu_node *rnp = rcu_get_root(rsp);
			
 
				 
			
 
				 	/* Set up local state, ensuring consistent view of global state. */
			
 
				-	raw_spin_lock_irqsave(&rnp->lock, flags);
			
 
				+	raw_spin_lock_irqsave_rcu_node(rnp, flags);
			
 
				 	rdp->qlen_last_fqs_check = 0;
			
 
				 	rdp->n_force_qs_snap = rsp->n_force_qs;
			
 
				 	rdp->blimit = blimit;
			
@@ -4198,8 +4224,7 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp)
 
				 	 */
			
 
				 	rnp = rdp->mynode;
			
 
				 	mask = rdp->grpmask;
			
 
				-	raw_spin_lock(&rnp->lock);		/* irqs already disabled. */
			
 
				-	smp_mb__after_unlock_lock();
			
 
				+	raw_spin_lock_rcu_node(rnp);		/* irqs already disabled. */
			
 
				 	rnp->qsmaskinitnext |= mask;
			
 
				 	rnp->expmaskinitnext |= mask;
			
 
				 	if (!rdp->beenonline)
			
@@ -4327,14 +4352,14 @@ static int __init rcu_spawn_gp_kthread(void)
 
				 		t = kthread_create(rcu_gp_kthread, rsp, "%s", rsp->name);
			
 
				 		BUG_ON(IS_ERR(t));
			
 
				 		rnp = rcu_get_root(rsp);
			
 
				-		raw_spin_lock_irqsave(&rnp->lock, flags);
			
 
				+		raw_spin_lock_irqsave_rcu_node(rnp, flags);
			
 
				 		rsp->gp_kthread = t;
			
 
				 		if (kthread_prio) {
			
 
				 			sp.sched_priority = kthread_prio;
			
 
				 			sched_setscheduler_nocheck(t, SCHED_FIFO, &sp);
			
 
				 		}
			
 
				-		wake_up_process(t);
			
 
				 		raw_spin_unlock_irqrestore(&rnp->lock, flags);
			
 
				+		wake_up_process(t);
			
 
				 	}
			
 
				 	rcu_spawn_nocb_kthreads();
			
 
				 	rcu_spawn_boost_kthreads();
			
@@ -4385,12 +4410,14 @@ static void __init rcu_init_levelspread(int *levelspread, const int *levelcnt)
 
				 /*
			
 
				  * Helper function for rcu_init() that initializes one rcu_state structure.
			
 
				  */
			
 
				-static void __init rcu_init_one(struct rcu_state *rsp,
			
 
				-		struct rcu_data __percpu *rda)
			
 
				+static void __init rcu_init_one(struct rcu_state *rsp)
			
 
				 {
			
 
				 	static const char * const buf[] = RCU_NODE_NAME_INIT;
			
 
				 	static const char * const fqs[] = RCU_FQS_NAME_INIT;
			
 
				 	static const char * const exp[] = RCU_EXP_NAME_INIT;
			
 
				+	static struct lock_class_key rcu_node_class[RCU_NUM_LVLS];
			
 
				+	static struct lock_class_key rcu_fqs_class[RCU_NUM_LVLS];
			
 
				+	static struct lock_class_key rcu_exp_class[RCU_NUM_LVLS];
			
 
				 	static u8 fl_mask = 0x1;
			
 
				 
			
 
				 	int levelcnt[RCU_NUM_LVLS];		/* # nodes in each level. */
			
@@ -4576,8 +4603,8 @@ void __init rcu_init(void)
 
				 
			
 
				 	rcu_bootup_announce();
			
 
				 	rcu_init_geometry();
			
 
				-	rcu_init_one(&rcu_bh_state, &rcu_bh_data);
			
 
				-	rcu_init_one(&rcu_sched_state, &rcu_sched_data);
			
 
				+	rcu_init_one(&rcu_bh_state);
			
 
				+	rcu_init_one(&rcu_sched_state);
			
 
				 	if (dump_tree)
			
 
				 		rcu_dump_rcu_node_tree(&rcu_sched_state);
			
 
				 	__rcu_init_preempt();
			
--- a/kernel/rcu/tree.h
+++ b/kernel/rcu/tree.h
@@ -178,6 +178,8 @@ struct rcu_node {
 
				 				/*  beginning of each expedited GP. */
			
 
				 	unsigned long expmaskinitnext;
			
 
				 				/* Online CPUs for next expedited GP. */
			
 
				+				/*  Any CPU that has ever been online will */
			
 
				+				/*  have its bit set. */
			
 
				 	unsigned long grpmask;	/* Mask to apply to parent qsmask. */
			
 
				 				/*  Only one bit will be set in this mask. */
			
 
				 	int	grplo;		/* lowest-numbered CPU or group here. */
			
@@ -384,6 +386,10 @@ struct rcu_data {
 
				 	struct rcu_head oom_head;
			
 
				 #endif /* #ifdef CONFIG_RCU_FAST_NO_HZ */
			
 
				 	struct mutex exp_funnel_mutex;
			
 
				+	atomic_long_t expedited_workdone0;	/* # done by others #0. */
			
 
				+	atomic_long_t expedited_workdone1;	/* # done by others #1. */
			
 
				+	atomic_long_t expedited_workdone2;	/* # done by others #2. */
			
 
				+	atomic_long_t expedited_workdone3;	/* # done by others #3. */
			
 
				 
			
 
				 	/* 7) Callback offloading. */
			
 
				 #ifdef CONFIG_RCU_NOCB_CPU
			
@@ -498,10 +504,6 @@ struct rcu_state {
 
				 	/* End of fields guarded by barrier_mutex. */
			
 
				 
			
 
				 	unsigned long expedited_sequence;	/* Take a ticket. */
			
 
				-	atomic_long_t expedited_workdone0;	/* # done by others #0. */
			
 
				-	atomic_long_t expedited_workdone1;	/* # done by others #1. */
			
 
				-	atomic_long_t expedited_workdone2;	/* # done by others #2. */
			
 
				-	atomic_long_t expedited_workdone3;	/* # done by others #3. */
			
 
				 	atomic_long_t expedited_normal;		/* # fallbacks to normal. */
			
 
				 	atomic_t expedited_need_qs;		/* # CPUs left to check in. */
			
 
				 	wait_queue_head_t expedited_wq;		/* Wait for check-ins. */
			
@@ -545,6 +547,18 @@ struct rcu_state {
 
				 #define RCU_GP_CLEANUP   5	/* Grace-period cleanup started. */
			
 
				 #define RCU_GP_CLEANED   6	/* Grace-period cleanup complete. */
			
 
				 
			
 
				+#ifndef RCU_TREE_NONCORE
			
 
				+static const char * const gp_state_names[] = {
			
 
				+	"RCU_GP_IDLE",
			
 
				+	"RCU_GP_WAIT_GPS",
			
 
				+	"RCU_GP_DONE_GPS",
			
 
				+	"RCU_GP_WAIT_FQS",
			
 
				+	"RCU_GP_DOING_FQS",
			
 
				+	"RCU_GP_CLEANUP",
			
 
				+	"RCU_GP_CLEANED",
			
 
				+};
			
 
				+#endif /* #ifndef RCU_TREE_NONCORE */
			
 
				+
			
 
				 extern struct list_head rcu_struct_flavors;
			
 
				 
			
 
				 /* Sequence through rcu_state structures for each RCU flavor. */
			
@@ -664,3 +678,42 @@ static inline void rcu_nocb_q_lengths(struct rcu_data *rdp, long *ql, long *qll)
 
				 #else /* #ifdef CONFIG_PPC */
			
 
				 #define smp_mb__after_unlock_lock()	do { } while (0)
			
 
				 #endif /* #else #ifdef CONFIG_PPC */
			
 
				+
			
 
				+/*
			
 
				+ * Wrappers for the rcu_node::lock acquire.
			
 
				+ *
			
 
				+ * Because the rcu_nodes form a tree, the tree traversal locking will observe
			
 
				+ * different lock values, this in turn means that an UNLOCK of one level
			
 
				+ * followed by a LOCK of another level does not imply a full memory barrier;
			
 
				+ * and most importantly transitivity is lost.
			
 
				+ *
			
 
				+ * In order to restore full ordering between tree levels, augment the regular
			
 
				+ * lock acquire functions with smp_mb__after_unlock_lock().
			
 
				+ */
			
 
				+static inline void raw_spin_lock_rcu_node(struct rcu_node *rnp)
			
 
				+{
			
 
				+	raw_spin_lock(&rnp->lock);
			
 
				+	smp_mb__after_unlock_lock();
			
 
				+}
			
 
				+
			
 
				+static inline void raw_spin_lock_irq_rcu_node(struct rcu_node *rnp)
			
 
				+{
			
 
				+	raw_spin_lock_irq(&rnp->lock);
			
 
				+	smp_mb__after_unlock_lock();
			
 
				+}
			
 
				+
			
 
				+#define raw_spin_lock_irqsave_rcu_node(rnp, flags)	\
			
 
				+do {							\
			
 
				+	typecheck(unsigned long, flags);		\
			
 
				+	raw_spin_lock_irqsave(&(rnp)->lock, flags);	\
			
 
				+	smp_mb__after_unlock_lock();			\
			
 
				+} while (0)
			
 
				+
			
 
				+static inline bool raw_spin_trylock_rcu_node(struct rcu_node *rnp)
			
 
				+{
			
 
				+	bool locked = raw_spin_trylock(&rnp->lock);
			
 
				+
			
 
				+	if (locked)
			
 
				+		smp_mb__after_unlock_lock();
			
 
				+	return locked;
			
 
				+}
			
--- a/kernel/rcu/tree_plugin.h
+++ b/kernel/rcu/tree_plugin.h
@@ -63,8 +63,7 @@ static bool __read_mostly rcu_nocb_poll;    /* Offload kthread are to poll. */
 
				 
			
 
				 /*
			
 
				  * Check the RCU kernel configuration parameters and print informative
			
 
				- * messages about anything out of the ordinary.  If you like #ifdef, you
			
 
				- * will love this function.
			
 
				+ * messages about anything out of the ordinary.
			
 
				  */
			
 
				 static void __init rcu_bootup_announce_oddness(void)
			
 
				 {
			
@@ -147,8 +146,8 @@ static void __init rcu_bootup_announce(void)
 
				  * the corresponding expedited grace period will also be the end of the
			
 
				  * normal grace period.
			
 
				  */
			
 
				-static void rcu_preempt_ctxt_queue(struct rcu_node *rnp, struct rcu_data *rdp,
			
 
				-				   unsigned long flags) __releases(rnp->lock)
			
 
				+static void rcu_preempt_ctxt_queue(struct rcu_node *rnp, struct rcu_data *rdp)
			
 
				+	__releases(rnp->lock) /* But leaves rrupts disabled. */
			
 
				 {
			
 
				 	int blkd_state = (rnp->gp_tasks ? RCU_GP_TASKS : 0) +
			
 
				 			 (rnp->exp_tasks ? RCU_EXP_TASKS : 0) +
			
@@ -236,7 +235,7 @@ static void rcu_preempt_ctxt_queue(struct rcu_node *rnp, struct rcu_data *rdp,
 
				 		rnp->gp_tasks = &t->rcu_node_entry;
			
 
				 	if (!rnp->exp_tasks && (blkd_state & RCU_EXP_BLKD))
			
 
				 		rnp->exp_tasks = &t->rcu_node_entry;
			
 
				-	raw_spin_unlock(&rnp->lock);
			
 
				+	raw_spin_unlock(&rnp->lock); /* rrupts remain disabled. */
			
 
				 
			
 
				 	/*
			
 
				 	 * Report the quiescent state for the expedited GP.  This expedited
			
@@ -251,7 +250,6 @@ static void rcu_preempt_ctxt_queue(struct rcu_node *rnp, struct rcu_data *rdp,
 
				 	} else {
			
 
				 		WARN_ON_ONCE(t->rcu_read_unlock_special.b.exp_need_qs);
			
 
				 	}
			
 
				-	local_irq_restore(flags);
			
 
				 }
			
 
				 
			
 
				 /*
			
@@ -286,12 +284,11 @@ static void rcu_preempt_qs(void)
 
				  * predating the current grace period drain, in other words, until
			
 
				  * rnp->gp_tasks becomes NULL.
			
 
				  *
			
 
				- * Caller must disable preemption.
			
 
				+ * Caller must disable interrupts.
			
 
				  */
			
 
				 static void rcu_preempt_note_context_switch(void)
			
 
				 {
			
 
				 	struct task_struct *t = current;
			
 
				-	unsigned long flags;
			
 
				 	struct rcu_data *rdp;
			
 
				 	struct rcu_node *rnp;
			
 
				 
			
@@ -301,8 +298,7 @@ static void rcu_preempt_note_context_switch(void)
 
				 		/* Possibly blocking in an RCU read-side critical section. */
			
 
				 		rdp = this_cpu_ptr(rcu_state_p->rda);
			
 
				 		rnp = rdp->mynode;
			
 
				-		raw_spin_lock_irqsave(&rnp->lock, flags);
			
 
				-		smp_mb__after_unlock_lock();
			
 
				+		raw_spin_lock_rcu_node(rnp);
			
 
				 		t->rcu_read_unlock_special.b.blocked = true;
			
 
				 		t->rcu_blocked_node = rnp;
			
 
				 
			
@@ -318,7 +314,7 @@ static void rcu_preempt_note_context_switch(void)
 
				 				       (rnp->qsmask & rdp->grpmask)
			
 
				 				       ? rnp->gpnum
			
 
				 				       : rnp->gpnum + 1);
			
 
				-		rcu_preempt_ctxt_queue(rnp, rdp, flags);
			
 
				+		rcu_preempt_ctxt_queue(rnp, rdp);
			
 
				 	} else if (t->rcu_read_lock_nesting < 0 &&
			
 
				 		   t->rcu_read_unlock_special.s) {
			
 
				 
			
@@ -450,20 +446,13 @@ void rcu_read_unlock_special(struct task_struct *t)
 
				 
			
 
				 		/*
			
 
				 		 * Remove this task from the list it blocked on.  The task
			
 
				-		 * now remains queued on the rcu_node corresponding to
			
 
				-		 * the CPU it first blocked on, so the first attempt to
			
 
				-		 * acquire the task's rcu_node's ->lock will succeed.
			
 
				-		 * Keep the loop and add a WARN_ON() out of sheer paranoia.
			
 
				+		 * now remains queued on the rcu_node corresponding to the
			
 
				+		 * CPU it first blocked on, so there is no longer any need
			
 
				+		 * to loop.  Retain a WARN_ON_ONCE() out of sheer paranoia.
			
 
				 		 */
			
 
				-		for (;;) {
			
 
				-			rnp = t->rcu_blocked_node;
			
 
				-			raw_spin_lock(&rnp->lock);  /* irqs already disabled. */
			
 
				-			smp_mb__after_unlock_lock();
			
 
				-			if (rnp == t->rcu_blocked_node)
			
 
				-				break;
			
 
				-			WARN_ON_ONCE(1);
			
 
				-			raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */
			
 
				-		}
			
 
				+		rnp = t->rcu_blocked_node;
			
 
				+		raw_spin_lock_rcu_node(rnp); /* irqs already disabled. */
			
 
				+		WARN_ON_ONCE(rnp != t->rcu_blocked_node);
			
 
				 		empty_norm = !rcu_preempt_blocked_readers_cgp(rnp);
			
 
				 		empty_exp = sync_rcu_preempt_exp_done(rnp);
			
 
				 		smp_mb(); /* ensure expedited fastpath sees end of RCU c-s. */
			
@@ -527,7 +516,7 @@ static void rcu_print_detail_task_stall_rnp(struct rcu_node *rnp)
 
				 	unsigned long flags;
			
 
				 	struct task_struct *t;
			
 
				 
			
 
				-	raw_spin_lock_irqsave(&rnp->lock, flags);
			
 
				+	raw_spin_lock_irqsave_rcu_node(rnp, flags);
			
 
				 	if (!rcu_preempt_blocked_readers_cgp(rnp)) {
			
 
				 		raw_spin_unlock_irqrestore(&rnp->lock, flags);
			
 
				 		return;
			
@@ -748,6 +737,12 @@ void synchronize_rcu_expedited(void)
 
				 	struct rcu_state *rsp = rcu_state_p;
			
 
				 	unsigned long s;
			
 
				 
			
 
				+	/* If expedited grace periods are prohibited, fall back to normal. */
			
 
				+	if (rcu_gp_is_normal()) {
			
 
				+		wait_rcu_gp(call_rcu);
			
 
				+		return;
			
 
				+	}
			
 
				+
			
 
				 	s = rcu_exp_gp_seq_snap(rsp);
			
 
				 
			
 
				 	rnp_unlock = exp_funnel_lock(rsp, s);
			
@@ -788,7 +783,7 @@ EXPORT_SYMBOL_GPL(rcu_barrier);
 
				  */
			
 
				 static void __init __rcu_init_preempt(void)
			
 
				 {
			
 
				-	rcu_init_one(rcu_state_p, rcu_data_p);
			
 
				+	rcu_init_one(rcu_state_p);
			
 
				 }
			
 
				 
			
 
				 /*
			
@@ -989,8 +984,7 @@ static int rcu_boost(struct rcu_node *rnp)
 
				 	    READ_ONCE(rnp->boost_tasks) == NULL)
			
 
				 		return 0;  /* Nothing left to boost. */
			
 
				 
			
 
				-	raw_spin_lock_irqsave(&rnp->lock, flags);
			
 
				-	smp_mb__after_unlock_lock();
			
 
				+	raw_spin_lock_irqsave_rcu_node(rnp, flags);
			
 
				 
			
 
				 	/*
			
 
				 	 * Recheck under the lock: all tasks in need of boosting
			
@@ -1176,8 +1170,7 @@ static int rcu_spawn_one_boost_kthread(struct rcu_state *rsp,
 
				 			   "rcub/%d", rnp_index);
			
 
				 	if (IS_ERR(t))
			
 
				 		return PTR_ERR(t);
			
 
				-	raw_spin_lock_irqsave(&rnp->lock, flags);
			
 
				-	smp_mb__after_unlock_lock();
			
 
				+	raw_spin_lock_irqsave_rcu_node(rnp, flags);
			
 
				 	rnp->boost_kthread_task = t;
			
 
				 	raw_spin_unlock_irqrestore(&rnp->lock, flags);
			
 
				 	sp.sched_priority = kthread_prio;
			
@@ -1524,7 +1517,8 @@ static void rcu_prepare_for_idle(void)
 
				 	struct rcu_state *rsp;
			
 
				 	int tne;
			
 
				 
			
 
				-	if (IS_ENABLED(CONFIG_RCU_NOCB_CPU_ALL))
			
 
				+	if (IS_ENABLED(CONFIG_RCU_NOCB_CPU_ALL) ||
			
 
				+	    rcu_is_nocb_cpu(smp_processor_id()))
			
 
				 		return;
			
 
				 
			
 
				 	/* Handle nohz enablement switches conservatively. */
			
@@ -1538,10 +1532,6 @@ static void rcu_prepare_for_idle(void)
 
				 	if (!tne)
			
 
				 		return;
			
 
				 
			
 
				-	/* If this is a no-CBs CPU, no callbacks, just return. */
			
 
				-	if (rcu_is_nocb_cpu(smp_processor_id()))
			
 
				-		return;
			
 
				-
			
 
				 	/*
			
 
				 	 * If a non-lazy callback arrived at a CPU having only lazy
			
 
				 	 * callbacks, invoke RCU core for the side-effect of recalculating
			
@@ -1567,8 +1557,7 @@ static void rcu_prepare_for_idle(void)
 
				 		if (!*rdp->nxttail[RCU_DONE_TAIL])
			
 
				 			continue;
			
 
				 		rnp = rdp->mynode;
			
 
				-		raw_spin_lock(&rnp->lock); /* irqs already disabled. */
			
 
				-		smp_mb__after_unlock_lock();
			
 
				+		raw_spin_lock_rcu_node(rnp); /* irqs already disabled. */
			
 
				 		needwake = rcu_accelerate_cbs(rsp, rnp, rdp);
			
 
				 		raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */
			
 
				 		if (needwake)
			
@@ -2068,8 +2057,7 @@ static void rcu_nocb_wait_gp(struct rcu_data *rdp)
 
				 	bool needwake;
			
 
				 	struct rcu_node *rnp = rdp->mynode;
			
 
				 
			
 
				-	raw_spin_lock_irqsave(&rnp->lock, flags);
			
 
				-	smp_mb__after_unlock_lock();
			
 
				+	raw_spin_lock_irqsave_rcu_node(rnp, flags);
			
 
				 	needwake = rcu_start_future_gp(rnp, rdp, &c);
			
 
				 	raw_spin_unlock_irqrestore(&rnp->lock, flags);
			
 
				 	if (needwake)
			
--- a/kernel/rcu/tree_trace.c
+++ b/kernel/rcu/tree_trace.c
@@ -1,5 +1,5 @@
 
				 /*
			
 
				- * Read-Copy Update tracing for classic implementation
			
 
				+ * Read-Copy Update tracing for hierarchical implementation.
			
 
				  *
			
 
				  * This program is free software; you can redistribute it and/or modify
			
 
				  * it under the terms of the GNU General Public License as published by
			
@@ -16,6 +16,7 @@
 
				  * http://www.gnu.org/licenses/gpl-2.0.html.
			
 
				  *
			
 
				  * Copyright IBM Corporation, 2008
			
 
				+ * Author: Paul E. McKenney
			
 
				  *
			
 
				  * Papers:  http://www.rdrop.com/users/paulmck/RCU
			
 
				  *
			
@@ -33,9 +34,7 @@
 
				 #include <linux/sched.h>
			
 
				 #include <linux/atomic.h>
			
 
				 #include <linux/bitops.h>
			
 
				-#include <linux/module.h>
			
 
				 #include <linux/completion.h>
			
 
				-#include <linux/moduleparam.h>
			
 
				 #include <linux/percpu.h>
			
 
				 #include <linux/notifier.h>
			
 
				 #include <linux/cpu.h>
			
@@ -183,14 +182,20 @@ static const struct file_operations rcudata_fops = {
 
				 
			
 
				 static int show_rcuexp(struct seq_file *m, void *v)
			
 
				 {
			
 
				+	int cpu;
			
 
				 	struct rcu_state *rsp = (struct rcu_state *)m->private;
			
 
				-
			
 
				+	struct rcu_data *rdp;
			
 
				+	unsigned long s0 = 0, s1 = 0, s2 = 0, s3 = 0;
			
 
				+
			
 
				+	for_each_possible_cpu(cpu) {
			
 
				+		rdp = per_cpu_ptr(rsp->rda, cpu);
			
 
				+		s0 += atomic_long_read(&rdp->expedited_workdone0);
			
 
				+		s1 += atomic_long_read(&rdp->expedited_workdone1);
			
 
				+		s2 += atomic_long_read(&rdp->expedited_workdone2);
			
 
				+		s3 += atomic_long_read(&rdp->expedited_workdone3);
			
 
				+	}
			
 
				 	seq_printf(m, "s=%lu wd0=%lu wd1=%lu wd2=%lu wd3=%lu n=%lu enq=%d sc=%lu\n",
			
 
				-		   rsp->expedited_sequence,
			
 
				-		   atomic_long_read(&rsp->expedited_workdone0),
			
 
				-		   atomic_long_read(&rsp->expedited_workdone1),
			
 
				-		   atomic_long_read(&rsp->expedited_workdone2),
			
 
				-		   atomic_long_read(&rsp->expedited_workdone3),
			
 
				+		   rsp->expedited_sequence, s0, s1, s2, s3,
			
 
				 		   atomic_long_read(&rsp->expedited_normal),
			
 
				 		   atomic_read(&rsp->expedited_need_qs),
			
 
				 		   rsp->expedited_sequence / 2);
			
@@ -319,7 +324,7 @@ static void show_one_rcugp(struct seq_file *m, struct rcu_state *rsp)
 
				 	unsigned long gpmax;
			
 
				 	struct rcu_node *rnp = &rsp->node[0];
			
 
				 
			
 
				-	raw_spin_lock_irqsave(&rnp->lock, flags);
			
 
				+	raw_spin_lock_irqsave_rcu_node(rnp, flags);
			
 
				 	completed = READ_ONCE(rsp->completed);
			
 
				 	gpnum = READ_ONCE(rsp->gpnum);
			
 
				 	if (completed == gpnum)
			
@@ -487,16 +492,4 @@ free_out:
 
				 	debugfs_remove_recursive(rcudir);
			
 
				 	return 1;
			
 
				 }
			
 
				-
			
 
				-static void __exit rcutree_trace_cleanup(void)
			
 
				-{
			
 
				-	debugfs_remove_recursive(rcudir);
			
 
				-}
			
 
				-
			
 
				-
			
 
				-module_init(rcutree_trace_init);
			
 
				-module_exit(rcutree_trace_cleanup);
			
 
				-
			
 
				-MODULE_AUTHOR("Paul E. McKenney");
			
 
				-MODULE_DESCRIPTION("Read-Copy Update tracing for hierarchical implementation");
			
 
				-MODULE_LICENSE("GPL");
			
 
				+device_initcall(rcutree_trace_init);
			
--- a/kernel/rcu/update.c
+++ b/kernel/rcu/update.c
@@ -60,7 +60,12 @@ MODULE_ALIAS("rcupdate");
 
				 #endif
			
 
				 #define MODULE_PARAM_PREFIX "rcupdate."
			
 
				 
			
 
				+#ifndef CONFIG_TINY_RCU
			
 
				 module_param(rcu_expedited, int, 0);
			
 
				+module_param(rcu_normal, int, 0);
			
 
				+static int rcu_normal_after_boot;
			
 
				+module_param(rcu_normal_after_boot, int, 0);
			
 
				+#endif /* #ifndef CONFIG_TINY_RCU */
			
 
				 
			
 
				 #if defined(CONFIG_DEBUG_LOCK_ALLOC) && defined(CONFIG_PREEMPT_COUNT)
			
 
				 /**
			
@@ -113,6 +118,17 @@ EXPORT_SYMBOL(rcu_read_lock_sched_held);
 
				 
			
 
				 #ifndef CONFIG_TINY_RCU
			
 
				 
			
 
				+/*
			
 
				+ * Should expedited grace-period primitives always fall back to their
			
 
				+ * non-expedited counterparts?  Intended for use within RCU.  Note
			
 
				+ * that if the user specifies both rcu_expedited and rcu_normal, then
			
 
				+ * rcu_normal wins.
			
 
				+ */
			
 
				+bool rcu_gp_is_normal(void)
			
 
				+{
			
 
				+	return READ_ONCE(rcu_normal);
			
 
				+}
			
 
				+
			
 
				 static atomic_t rcu_expedited_nesting =
			
 
				 	ATOMIC_INIT(IS_ENABLED(CONFIG_RCU_EXPEDITE_BOOT) ? 1 : 0);
			
 
				 
			
@@ -157,8 +173,6 @@ void rcu_unexpedite_gp(void)
 
				 }
			
 
				 EXPORT_SYMBOL_GPL(rcu_unexpedite_gp);
			
 
				 
			
 
				-#endif /* #ifndef CONFIG_TINY_RCU */
			
 
				-
			
 
				 /*
			
 
				  * Inform RCU of the end of the in-kernel boot sequence.
			
 
				  */
			
@@ -166,8 +180,12 @@ void rcu_end_inkernel_boot(void)
 
				 {
			
 
				 	if (IS_ENABLED(CONFIG_RCU_EXPEDITE_BOOT))
			
 
				 		rcu_unexpedite_gp();
			
 
				+	if (rcu_normal_after_boot)
			
 
				+		WRITE_ONCE(rcu_normal, 1);
			
 
				 }
			
 
				 
			
 
				+#endif /* #ifndef CONFIG_TINY_RCU */
			
 
				+
			
 
				 #ifdef CONFIG_PREEMPT_RCU
			
 
				 
			
 
				 /*
			
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -3109,7 +3109,6 @@ static void __sched notrace __schedule(bool preempt)
 
				 
			
 
				 	cpu = smp_processor_id();
			
 
				 	rq = cpu_rq(cpu);
			
 
				-	rcu_note_context_switch();
			
 
				 	prev = rq->curr;
			
 
				 
			
 
				 	/*
			
@@ -3128,13 +3127,16 @@ static void __sched notrace __schedule(bool preempt)
 
				 	if (sched_feat(HRTICK))
			
 
				 		hrtick_clear(rq);
			
 
				 
			
 
				+	local_irq_disable();
			
 
				+	rcu_note_context_switch();
			
 
				+
			
 
				 	/*
			
 
				 	 * Make sure that signal_pending_state()->signal_pending() below
			
 
				 	 * can't be reordered with __set_current_state(TASK_INTERRUPTIBLE)
			
 
				 	 * done by the caller to avoid the race with signal_wake_up().
			
 
				 	 */
			
 
				 	smp_mb__before_spinlock();
			
 
				-	raw_spin_lock_irq(&rq->lock);
			
 
				+	raw_spin_lock(&rq->lock);
			
 
				 	lockdep_pin_lock(&rq->lock);
			
 
				 
			
 
				 	rq->clock_skip_update <<= 1; /* promote REQ to ACT */
			
--- a/lib/list_debug.c
+++ b/lib/list_debug.c
@@ -37,7 +37,7 @@ void __list_add(struct list_head *new,
 
				 	next->prev = new;
			
 
				 	new->next = next;
			
 
				 	new->prev = prev;
			
 
				-	prev->next = new;
			
 
				+	WRITE_ONCE(prev->next, new);
			
 
				 }
			
 
				 EXPORT_SYMBOL(__list_add);
			
 
				 
			
--- a/tools/testing/selftests/rcutorture/bin/kvm-test-1-run.sh
+++ b/tools/testing/selftests/rcutorture/bin/kvm-test-1-run.sh
@@ -38,8 +38,6 @@
 
				 #
			
 
				 # Authors: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
			
 
				 
			
 
				-grace=120
			
 
				-
			
 
				 T=/tmp/kvm-test-1-run.sh.$$
			
 
				 trap 'rm -rf $T' 0
			
 
				 touch $T
			
@@ -152,7 +150,7 @@ fi
 
				 qemu_args="`specify_qemu_cpus "$QEMU" "$qemu_args" "$cpu_count"`"
			
 
				 
			
 
				 # Generate architecture-specific and interaction-specific qemu arguments
			
 
				-qemu_args="$qemu_args `identify_qemu_args "$QEMU" "$builddir/console.log"`"
			
 
				+qemu_args="$qemu_args `identify_qemu_args "$QEMU" "$resdir/console.log"`"
			
 
				 
			
 
				 # Generate qemu -append arguments
			
 
				 qemu_append="`identify_qemu_append "$QEMU"`"
			
@@ -168,7 +166,7 @@ then
 
				 	touch $resdir/buildonly
			
 
				 	exit 0
			
 
				 fi
			
 
				-echo "NOTE: $QEMU either did not run or was interactive" > $builddir/console.log
			
 
				+echo "NOTE: $QEMU either did not run or was interactive" > $resdir/console.log
			
 
				 echo $QEMU $qemu_args -m 512 -kernel $resdir/bzImage -append \"$qemu_append $boot_args\" > $resdir/qemu-cmd
			
 
				 ( $QEMU $qemu_args -m 512 -kernel $resdir/bzImage -append "$qemu_append $boot_args"; echo $? > $resdir/qemu-retval ) &
			
 
				 qemu_pid=$!
			
@@ -214,7 +212,7 @@ then
 
				 		else
			
 
				 			break
			
 
				 		fi
			
 
				-		if test $kruntime -ge $((seconds + grace))
			
 
				+		if test $kruntime -ge $((seconds + $TORTURE_SHUTDOWN_GRACE))
			
 
				 		then
			
 
				 			echo "!!! PID $qemu_pid hung at $kruntime vs. $seconds seconds" >> $resdir/Warnings 2>&1
			
 
				 			kill -KILL $qemu_pid
			
@@ -224,6 +222,5 @@ then
 
				 	done
			
 
				 fi
			
 
				 
			
 
				-cp $builddir/console.log $resdir
			
 
				 parse-torture.sh $resdir/console.log $title
			
 
				 parse-console.sh $resdir/console.log $title
			
--- a/tools/testing/selftests/rcutorture/bin/kvm.sh
+++ b/tools/testing/selftests/rcutorture/bin/kvm.sh
@@ -42,6 +42,7 @@ TORTURE_DEFCONFIG=defconfig
 
				 TORTURE_BOOT_IMAGE=""
			
 
				 TORTURE_INITRD="$KVM/initrd"; export TORTURE_INITRD
			
 
				 TORTURE_KMAKE_ARG=""
			
 
				+TORTURE_SHUTDOWN_GRACE=180
			
 
				 TORTURE_SUITE=rcu
			
 
				 resdir=""
			
 
				 configs=""
			
@@ -149,6 +150,11 @@ do
 
				 		resdir=$2
			
 
				 		shift
			
 
				 		;;
			
 
				+	--shutdown-grace)
			
 
				+		checkarg --shutdown-grace "(seconds)" "$#" "$2" '^[0-9]*$' '^error'
			
 
				+		TORTURE_SHUTDOWN_GRACE=$2
			
 
				+		shift
			
 
				+		;;
			
 
				 	--torture)
			
 
				 		checkarg --torture "(suite name)" "$#" "$2" '^\(lock\|rcu\)$' '^--'
			
 
				 		TORTURE_SUITE=$2
			
@@ -266,6 +272,7 @@ TORTURE_KMAKE_ARG="$TORTURE_KMAKE_ARG"; export TORTURE_KMAKE_ARG
 
				 TORTURE_QEMU_CMD="$TORTURE_QEMU_CMD"; export TORTURE_QEMU_CMD
			
 
				 TORTURE_QEMU_INTERACTIVE="$TORTURE_QEMU_INTERACTIVE"; export TORTURE_QEMU_INTERACTIVE
			
 
				 TORTURE_QEMU_MAC="$TORTURE_QEMU_MAC"; export TORTURE_QEMU_MAC
			
 
				+TORTURE_SHUTDOWN_GRACE="$TORTURE_SHUTDOWN_GRACE"; export TORTURE_SHUTDOWN_GRACE
			
 
				 TORTURE_SUITE="$TORTURE_SUITE"; export TORTURE_SUITE
			
 
				 if ! test -e $resdir
			
 
				 then
			
@@ -307,10 +314,10 @@ awk < $T/cfgcpu.pack \
 
				 }
			
 
				 
			
 
				 # Dump out the scripting required to run one test batch.
			
 
				-function dump(first, pastlast)
			
 
				+function dump(first, pastlast, batchnum)
			
 
				 {
			
 
				-	print "echo ----Start batch: `date`";
			
 
				-	print "echo ----Start batch: `date` >> " rd "/log";
			
 
				+	print "echo ----Start batch " batchnum ": `date`";
			
 
				+	print "echo ----Start batch " batchnum ": `date` >> " rd "/log";
			
 
				 	jn=1
			
 
				 	for (j = first; j < pastlast; j++) {
			
 
				 		builddir=KVM "/b" jn
			
@@ -371,25 +378,28 @@ END {
 
				 	njobs = i;
			
 
				 	nc = ncpus;
			
 
				 	first = 0;
			
 
				+	batchnum = 1;
			
 
				 
			
 
				 	# Each pass through the following loop considers one test.
			
 
				 	for (i = 0; i < njobs; i++) {
			
 
				 		if (ncpus == 0) {
			
 
				 			# Sequential test specified, each test its own batch.
			
 
				-			dump(i, i + 1);
			
 
				+			dump(i, i + 1, batchnum);
			
 
				 			first = i;
			
 
				+			batchnum++;
			
 
				 		} else if (nc < cpus[i] && i != 0) {
			
 
				 			# Out of CPUs, dump out a batch.
			
 
				-			dump(first, i);
			
 
				+			dump(first, i, batchnum);
			
 
				 			first = i;
			
 
				 			nc = ncpus;
			
 
				+			batchnum++;
			
 
				 		}
			
 
				 		# Account for the CPUs needed by the current test.
			
 
				 		nc -= cpus[i];
			
 
				 	}
			
 
				 	# Dump the last batch.
			
 
				 	if (ncpus != 0)
			
 
				-		dump(first, i);
			
 
				+		dump(first, i, batchnum);
			
 
				 }' >> $T/script
			
 
				 
			
 
				 cat << ___EOF___ >> $T/script
			
--- a/tools/testing/selftests/rcutorture/bin/parse-console.sh
+++ b/tools/testing/selftests/rcutorture/bin/parse-console.sh
@@ -24,9 +24,6 @@
 
				 #
			
 
				 # Authors: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
			
 
				 
			
 
				-T=/tmp/abat-chk-badness.sh.$$
			
 
				-trap 'rm -f $T' 0
			
 
				-
			
 
				 file="$1"
			
 
				 title="$2"
			
 
				 
			
@@ -36,9 +33,41 @@ if grep -Pq '\x00' < $file
 
				 then
			
 
				 	print_warning Console output contains nul bytes, old qemu still running?
			
 
				 fi
			
 
				-egrep 'Badness|WARNING:|Warn|BUG|===========|Call Trace:|Oops:|Stall ended before state dump start' < $file | grep -v 'ODEBUG: ' | grep -v 'Warning: unable to open an initial console' > $T
			
 
				-if test -s $T
			
 
				+egrep 'Badness|WARNING:|Warn|BUG|===========|Call Trace:|Oops:|detected stalls on CPUs/tasks:|Stall ended before state dump start' < $file | grep -v 'ODEBUG: ' | grep -v 'Warning: unable to open an initial console' > $1.diags
			
 
				+if test -s $1.diags
			
 
				 then
			
 
				 	print_warning Assertion failure in $file $title
			
 
				-	cat $T
			
 
				+	# cat $1.diags
			
 
				+	summary=""
			
 
				+	n_badness=`grep -c Badness $1`
			
 
				+	if test "$n_badness" -ne 0
			
 
				+	then
			
 
				+		summary="$summary  Badness: $n_badness"
			
 
				+	fi
			
 
				+	n_warn=`grep -v 'Warning: unable to open an initial console' $1 | egrep -c 'WARNING:|Warn'`
			
 
				+	if test "$n_warn" -ne 0
			
 
				+	then
			
 
				+		summary="$summary  Warnings: $n_warn"
			
 
				+	fi
			
 
				+	n_bugs=`egrep -c 'BUG|Oops:' $1`
			
 
				+	if test "$n_bugs" -ne 0
			
 
				+	then
			
 
				+		summary="$summary  Bugs: $n_bugs"
			
 
				+	fi
			
 
				+	n_calltrace=`grep -c 'Call Trace:' $1`
			
 
				+	if test "$n_calltrace" -ne 0
			
 
				+	then
			
 
				+		summary="$summary  Call Traces: $n_calltrace"
			
 
				+	fi
			
 
				+	n_lockdep=`grep -c =========== $1` # count of '===========' banner lines
			
 
				+	if test "$n_lockdep" -ne 0
			
 
				+	then
			
 
				+		summary="$summary  lockdep: $n_lockdep"
			
 
				+	fi
			
 
				+	n_stalls=`egrep -c 'detected stalls on CPUs/tasks:|Stall ended before state dump start' $1`
			
 
				+	if test "$n_stalls" -ne 0
			
 
				+	then
			
 
				+		summary="$summary  Stalls: $n_stalls"
			
 
				+	fi
			
 
				+	print_warning Summary: $summary
			
 
				 fi
			
--- a/tools/testing/selftests/rcutorture/doc/TINY_RCU.txt
+++ b/tools/testing/selftests/rcutorture/doc/TINY_RCU.txt
@@ -20,7 +20,6 @@ CONFIG_PROVE_RCU
 
				 
			
 
				 CONFIG_NO_HZ_FULL_SYSIDLE
			
 
				 CONFIG_RCU_NOCB_CPU
			
 
				-CONFIG_RCU_USER_QS
			
 
				 
			
 
				 	Meaningless for TINY_RCU.
			
 
				 
			
--- a/tools/testing/selftests/rcutorture/doc/TREE_RCU-kconfig.txt
+++ b/tools/testing/selftests/rcutorture/doc/TREE_RCU-kconfig.txt
@@ -72,10 +72,6 @@ CONFIG_RCU_TORTURE_TEST_RUNNABLE
 
				 
			
 
				 	Always used in KVM testing.
			
 
				 
			
 
				-CONFIG_RCU_USER_QS
			
 
				-
			
 
				-	Redundant with CONFIG_NO_HZ_FULL.
			
 
				-
			
 
				 CONFIG_PREEMPT_RCU
			
 
				 CONFIG_TREE_RCU