8 éve · 5ef26e966d
--- a/Documentation/00-INDEX
+++ b/Documentation/00-INDEX
@@ -24,8 +24,6 @@ DMA-ISA-LPC.txt
 
				 	- How to do DMA with ISA (and LPC) devices.
			
 
				 DMA-attributes.txt
			
 
				 	- listing of the various possible attributes a DMA region can have
			
 
				-DocBook/
			
 
				-	- directory with DocBook templates etc. for kernel documentation.
			
 
				 EDID/
			
 
				 	- directory with info on customizing EDID for broken gfx/displays.
			
 
				 IPMI.txt
			
@@ -40,8 +38,6 @@ Intel-IOMMU.txt
 
				 	- basic info on the Intel IOMMU virtualization support.
			
 
				 Makefile
			
 
				 	- It's not of interest for those who aren't touching the build system.
			
 
				-Makefile.sphinx
			
 
				-	- It's not of interest for those who aren't touching the build system.
			
 
				 PCI/
			
 
				 	- info related to PCI drivers.
			
 
				 RCU/
			
@@ -246,8 +242,6 @@ kprobes.txt
 
				 	- documents the kernel probes debugging feature.
			
 
				 kref.txt
			
 
				 	- docs on adding reference counters (krefs) to kernel objects.
			
 
				-kselftest.txt
			
 
				-	- small unittests for (some) individual codepaths in the kernel.
			
 
				 laptops/
			
 
				 	- directory with laptop related info and laptop driver documentation.
			
 
				 ldm.txt
			
@@ -264,6 +258,8 @@ logo.gif
 
				 	- full colour GIF image of Linux logo (penguin - Tux).
			
 
				 logo.txt
			
 
				 	- info on creator of above logo & site to get additional images from.
			
 
				+lsm.txt
			
 
				+	- Linux Security Modules: General Security Hooks for Linux
			
 
				 lzo.txt
			
 
				 	- kernel LZO decompressor input formats
			
 
				 m68k/
			
--- a/Documentation/ABI/stable/sysfs-class-udc
+++ b/Documentation/ABI/stable/sysfs-class-udc
@@ -55,14 +55,6 @@ Description:
 
				 		Indicates the maximum USB speed supported by this port.
			
 
				 Users:
			
 
				 
			
 
				-What:		/sys/class/udc/<udc>/maximum_speed
			
 
				-Date:		June 2011
			
 
				-KernelVersion:	3.1
			
 
				-Contact:	Felipe Balbi <balbi@kernel.org>
			
 
				-Description:
			
 
				-		Indicates the maximum USB speed supported by this port.
			
 
				-Users:
			
 
				-
			
 
				 What:		/sys/class/udc/<udc>/soft_connect
			
 
				 Date:		June 2011
			
 
				 KernelVersion:	3.1
			
@@ -91,3 +83,11 @@ Description:
 
				 		'configured', and 'suspended'; however not all USB Device
			
 
				 		Controllers support reporting all states.
			
 
				 Users:
			
 
				+
			
 
				+What:		/sys/class/udc/<udc>/function
			
 
				+Date:		June 2017
			
 
				+KernelVersion:	4.13
			
 
				+Contact:	Felipe Balbi <balbi@kernel.org>
			
 
				+Description:
			
 
				+		Prints out name of currently running USB Gadget Driver.
			
 
				+Users:
			
--- a/Documentation/ABI/stable/sysfs-driver-aspeed-vuart
+++ b/Documentation/ABI/stable/sysfs-driver-aspeed-vuart
@@ -0,0 +1,15 @@
 
				+What:		/sys/bus/platform/drivers/aspeed-vuart/*/lpc_address
			
 
				+Date:		April 2017
			
 
				+Contact:	Jeremy Kerr <jk@ozlabs.org>
			
 
				+Description:	Configures which IO port the host side of the UART
			
 
				+		will appear on the host <-> BMC LPC bus.
			
 
				+Users:		OpenBMC.  Proposed changes should be mailed to
			
 
				+		openbmc@lists.ozlabs.org
			
 
				+
			
 
				+What:		/sys/bus/platform/drivers/aspeed-vuart*/sirq
			
 
				+Date:		April 2017
			
 
				+Contact:	Jeremy Kerr <jk@ozlabs.org>
			
 
				+Description:	Configures which interrupt number the host side of
			
 
				+		the UART will appear on the host <-> BMC LPC bus.
			
 
				+Users:		OpenBMC.  Proposed changes should be mailed to
			
 
				+		openbmc@lists.ozlabs.org
			
--- a/Documentation/ABI/stable/sysfs-hypervisor-xen
+++ b/Documentation/ABI/stable/sysfs-hypervisor-xen
@@ -0,0 +1,119 @@
 
				+What:		/sys/hypervisor/compilation/compile_date
			
 
				+Date:		March 2009
			
 
				+KernelVersion:	2.6.30
			
 
				+Contact:	xen-devel@lists.xenproject.org
			
 
				+Description:	If running under Xen:
			
 
				+		Contains the build time stamp of the Xen hypervisor
			
 
				+		Might return "<denied>" in case of special security settings
			
 
				+		in the hypervisor.
			
 
				+
			
 
				+What:		/sys/hypervisor/compilation/compiled_by
			
 
				+Date:		March 2009
			
 
				+KernelVersion:	2.6.30
			
 
				+Contact:	xen-devel@lists.xenproject.org
			
 
				+Description:	If running under Xen:
			
 
				+		Contains information on who built the Xen hypervisor
			
 
				+		Might return "<denied>" in case of special security settings
			
 
				+		in the hypervisor.
			
 
				+
			
 
				+What:		/sys/hypervisor/compilation/compiler
			
 
				+Date:		March 2009
			
 
				+KernelVersion:	2.6.30
			
 
				+Contact:	xen-devel@lists.xenproject.org
			
 
				+Description:	If running under Xen:
			
 
				+		Compiler which was used to build the Xen hypervisor
			
 
				+		Might return "<denied>" in case of special security settings
			
 
				+		in the hypervisor.
			
 
				+
			
 
				+What:		/sys/hypervisor/properties/capabilities
			
 
				+Date:		March 2009
			
 
				+KernelVersion:	2.6.30
			
 
				+Contact:	xen-devel@lists.xenproject.org
			
 
				+Description:	If running under Xen:
			
 
				+		Space separated list of supported guest system types. Each type
			
 
				+		is in the format: <class>-<major>.<minor>-<arch>
			
 
				+		With:
			
 
				+			<class>: "xen" -- x86: paravirtualized, arm: standard
			
 
				+				 "hvm" -- x86 only: fully virtualized
			
 
				+			<major>: major guest interface version
			
 
				+			<minor>: minor guest interface version
			
 
				+			<arch>:  architecture, e.g.:
			
 
				+				 "x86_32": 32 bit x86 guest without PAE
			
 
				+				 "x86_32p": 32 bit x86 guest with PAE
			
 
				+				 "x86_64": 64 bit x86 guest
			
 
				+				 "armv7l": 32 bit arm guest
			
 
				+				 "aarch64": 64 bit arm guest
			
 
				+
			
 
				+What:		/sys/hypervisor/properties/changeset
			
 
				+Date:		March 2009
			
 
				+KernelVersion:	2.6.30
			
 
				+Contact:	xen-devel@lists.xenproject.org
			
 
				+Description:	If running under Xen:
			
 
				+		Changeset of the hypervisor (git commit)
			
 
				+		Might return "<denied>" in case of special security settings
			
 
				+		in the hypervisor.
			
 
				+
			
 
				+What:		/sys/hypervisor/properties/features
			
 
				+Date:		March 2009
			
 
				+KernelVersion:	2.6.30
			
 
				+Contact:	xen-devel@lists.xenproject.org
			
 
				+Description:	If running under Xen:
			
 
				+		Features the Xen hypervisor supports for the guest as defined
			
 
				+		in include/xen/interface/features.h printed as a hex value.
			
 
				+
			
 
				+What:		/sys/hypervisor/properties/pagesize
			
 
				+Date:		March 2009
			
 
				+KernelVersion:	2.6.30
			
 
				+Contact:	xen-devel@lists.xenproject.org
			
 
				+Description:	If running under Xen:
			
 
				+		Default page size of the hypervisor printed as a hex value.
			
 
				+		Might return "0" in case of special security settings
			
 
				+		in the hypervisor.
			
 
				+
			
 
				+What:		/sys/hypervisor/properties/virtual_start
			
 
				+Date:		March 2009
			
 
				+KernelVersion:	2.6.30
			
 
				+Contact:	xen-devel@lists.xenproject.org
			
 
				+Description:	If running under Xen:
			
 
				+		Virtual address of the hypervisor as a hex value.
			
 
				+
			
 
				+What:		/sys/hypervisor/type
			
 
				+Date:		March 2009
			
 
				+KernelVersion:	2.6.30
			
 
				+Contact:	xen-devel@lists.xenproject.org
			
 
				+Description:	If running under Xen:
			
 
				+		Type of hypervisor:
			
 
				+		"xen": Xen hypervisor
			
 
				+
			
 
				+What:		/sys/hypervisor/uuid
			
 
				+Date:		March 2009
			
 
				+KernelVersion:	2.6.30
			
 
				+Contact:	xen-devel@lists.xenproject.org
			
 
				+Description:	If running under Xen:
			
 
				+		UUID of the guest as known to the Xen hypervisor.
			
 
				+
			
 
				+What:		/sys/hypervisor/version/extra
			
 
				+Date:		March 2009
			
 
				+KernelVersion:	2.6.30
			
 
				+Contact:	xen-devel@lists.xenproject.org
			
 
				+Description:	If running under Xen:
			
 
				+		The Xen version is in the format <major>.<minor><extra>
			
 
				+		This is the <extra> part of it.
			
 
				+		Might return "<denied>" in case of special security settings
			
 
				+		in the hypervisor.
			
 
				+
			
 
				+What:		/sys/hypervisor/version/major
			
 
				+Date:		March 2009
			
 
				+KernelVersion:	2.6.30
			
 
				+Contact:	xen-devel@lists.xenproject.org
			
 
				+Description:	If running under Xen:
			
 
				+		The Xen version is in the format <major>.<minor><extra>
			
 
				+		This is the <major> part of it.
			
 
				+
			
 
				+What:		/sys/hypervisor/version/minor
			
 
				+Date:		March 2009
			
 
				+KernelVersion:	2.6.30
			
 
				+Contact:	xen-devel@lists.xenproject.org
			
 
				+Description:	If running under Xen:
			
 
				+		The Xen version is in the format <major>.<minor><extra>
			
 
				+		This is the <minor> part of it.
			
--- a/Documentation/ABI/testing/configfs-usb-gadget-uac1
+++ b/Documentation/ABI/testing/configfs-usb-gadget-uac1
@@ -1,12 +1,14 @@
 
				 What:		/config/usb-gadget/gadget/functions/uac1.name
			
 
				-Date:		Sep 2014
			
 
				-KernelVersion:	3.18
			
 
				+Date:		June 2017
			
 
				+KernelVersion:	4.14
			
 
				 Description:
			
 
				 		The attributes:
			
 
				 
			
 
				-		audio_buf_size - audio buffer size
			
 
				-		fn_cap - capture pcm device file name
			
 
				-		fn_cntl - control device file name
			
 
				-		fn_play - playback pcm device file name
			
 
				-		req_buf_size - ISO OUT endpoint request buffer size
			
 
				-		req_count - ISO OUT endpoint request count
			
 
				+		c_chmask - capture channel mask
			
 
				+		c_srate - capture sampling rate
			
 
				+		c_ssize - capture sample size (bytes)
			
 
				+		p_chmask - playback channel mask
			
 
				+		p_srate - playback sampling rate
			
 
				+		p_ssize - playback sample size (bytes)
			
 
				+		req_number - the number of pre-allocated request
			
 
				+			for both capture and playback
			
--- a/Documentation/ABI/testing/configfs-usb-gadget-uac1_legacy
+++ b/Documentation/ABI/testing/configfs-usb-gadget-uac1_legacy
@@ -0,0 +1,12 @@
 
				+What:		/config/usb-gadget/gadget/functions/uac1_legacy.name
			
 
				+Date:		Sep 2014
			
 
				+KernelVersion:	3.18
			
 
				+Description:
			
 
				+		The attributes:
			
 
				+
			
 
				+		audio_buf_size - audio buffer size
			
 
				+		fn_cap - capture pcm device file name
			
 
				+		fn_cntl - control device file name
			
 
				+		fn_play - playback pcm device file name
			
 
				+		req_buf_size - ISO OUT endpoint request buffer size
			
 
				+		req_count - ISO OUT endpoint request count
			
--- a/Documentation/ABI/testing/ima_policy
+++ b/Documentation/ABI/testing/ima_policy
@@ -34,9 +34,10 @@ Description:
 
				 			fsuuid:= file system UUID (e.g 8bcbe394-4f13-4144-be8e-5aa9ea2ce2f6)
			
 
				 			uid:= decimal value
			
 
				 			euid:= decimal value
			
 
				-			fowner:=decimal value
			
 
				+			fowner:= decimal value
			
 
				 		lsm:  	are LSM specific
			
 
				 		option:	appraise_type:= [imasig]
			
 
				+			pcr:= decimal value
			
 
				 
			
 
				 		default policy:
			
 
				 			# PROC_SUPER_MAGIC
			
@@ -96,3 +97,8 @@ Description:
 
				 
			
 
				 		Smack:
			
 
				 			measure subj_user=_ func=FILE_CHECK mask=MAY_READ
			
 
				+
			
 
				+		Example of measure rules using alternate PCRs:
			
 
				+
			
 
				+			measure func=KEXEC_KERNEL_CHECK pcr=4
			
 
				+			measure func=KEXEC_INITRAMFS_CHECK pcr=5
			
--- a/Documentation/ABI/testing/sysfs-bus-fsi
+++ b/Documentation/ABI/testing/sysfs-bus-fsi
@@ -0,0 +1,38 @@
 
				+What:           /sys/bus/platform/devices/fsi-master/rescan
			
 
				+Date:		May 2017
			
 
				+KernelVersion:  4.12
			
 
				+Contact:        cbostic@linux.vnet.ibm.com
			
 
				+Description:
			
 
				+                Initiates a FSI master scan for all connected slave devices
			
 
				+		on its links.
			
 
				+
			
 
				+What:           /sys/bus/platform/devices/fsi-master/break
			
 
				+Date:		May 2017
			
 
				+KernelVersion:  4.12
			
 
				+Contact:        cbostic@linux.vnet.ibm.com
			
 
				+Description:
			
 
				+		Sends an FSI BREAK command on a master's communication
			
 
				+		link to any connected slaves.  A BREAK resets connected
			
 
				+		device's logic and preps it to receive further commands
			
 
				+		from the master.
			
 
				+
			
 
				+What:           /sys/bus/platform/devices/fsi-master/slave@00:00/term
			
 
				+Date:		May 2017
			
 
				+KernelVersion:  4.12
			
 
				+Contact:        cbostic@linux.vnet.ibm.com
			
 
				+Description:
			
 
				+		Sends an FSI terminate command from the master to its
			
 
				+		connected slave. A terminate resets the slave's state machines
			
 
				+		that control access to the internally connected engines.  In
			
 
				+		addition the slave freezes its internal error register for
			
 
				+		debugging purposes.  This command is also needed to abort any
			
 
				+		ongoing operation in case of an expired 'Master Time Out'
			
 
				+		timer.
			
 
				+
			
 
				+What:           /sys/bus/platform/devices/fsi-master/slave@00:00/raw
			
 
				+Date:		May 2017
			
 
				+KernelVersion:  4.12
			
 
				+Contact:        cbostic@linux.vnet.ibm.com
			
 
				+Description:
			
 
				+		Provides a means of reading/writing a 32 bit value from/to a
			
 
				+		specified FSI bus address.
			
--- a/Documentation/ABI/testing/sysfs-bus-iio
+++ b/Documentation/ABI/testing/sysfs-bus-iio
@@ -1425,6 +1425,17 @@ Description:
 
				 		guarantees that the hardware fifo is flushed to the device
			
 
				 		buffer.
			
 
				 
			
 
				+What:		/sys/bus/iio/devices/iio:device*/buffer/hwfifo_timeout
			
 
				+KernelVersion:	4.12
			
 
				+Contact:	linux-iio@vger.kernel.org
			
 
				+Description:
			
 
				+		A read/write property to provide capability to delay reporting of
			
 
				+		samples till a timeout is reached. This allows host processors to
			
 
				+		sleep, while the sensor is storing samples in its internal fifo.
			
 
				+		The maximum timeout in seconds can be specified by setting
			
 
				+		hwfifo_timeout. The current delay can be read by reading
			
 
				+		hwfifo_timeout. A value of 0 means that there is no timeout.
			
 
				+
			
 
				 What:		/sys/bus/iio/devices/iio:deviceX/buffer/hwfifo_watermark
			
 
				 KernelVersion: 4.2
			
 
				 Contact:	linux-iio@vger.kernel.org
			
--- a/Documentation/ABI/testing/sysfs-bus-iio-meas-spec
+++ b/Documentation/ABI/testing/sysfs-bus-iio-meas-spec
@@ -5,4 +5,3 @@ Description:
 
				                 Reading returns either '1' or '0'. '1' means that the
			
 
				                 battery level supplied to sensor is below 2.25V.
			
 
				                 This ABI is available for tsys02d, htu21, ms8607
			
 
				-		This ABI is available for htu21, ms8607
			
--- a/Documentation/ABI/testing/sysfs-bus-iio-timer-stm32
+++ b/Documentation/ABI/testing/sysfs-bus-iio-timer-stm32
@@ -16,6 +16,54 @@ Description:
 
				 		- "OC2REF"    : OC2REF signal is used as trigger output.
			
 
				 		- "OC3REF"    : OC3REF signal is used as trigger output.
			
 
				 		- "OC4REF"    : OC4REF signal is used as trigger output.
			
 
				+		Additional modes (on TRGO2 only):
			
 
				+		- "OC5REF"    : OC5REF signal is used as trigger output.
			
 
				+		- "OC6REF"    : OC6REF signal is used as trigger output.
			
 
				+		- "compare_pulse_OC4REF":
			
 
				+		  OC4REF rising or falling edges generate pulses.
			
 
				+		- "compare_pulse_OC6REF":
			
 
				+		  OC6REF rising or falling edges generate pulses.
			
 
				+		- "compare_pulse_OC4REF_r_or_OC6REF_r":
			
 
				+		  OC4REF or OC6REF rising edges generate pulses.
			
 
				+		- "compare_pulse_OC4REF_r_or_OC6REF_f":
			
 
				+		  OC4REF rising or OC6REF falling edges generate pulses.
			
 
				+		- "compare_pulse_OC5REF_r_or_OC6REF_r":
			
 
				+		  OC5REF or OC6REF rising edges generate pulses.
			
 
				+		- "compare_pulse_OC5REF_r_or_OC6REF_f":
			
 
				+		  OC5REF rising or OC6REF falling edges generate pulses.
			
 
				+
			
 
				+		+-----------+   +-------------+            +---------+
			
 
				+		| Prescaler +-> | Counter     |        +-> | Master  | TRGO(2)
			
 
				+		+-----------+   +--+--------+-+        |-> | Control +-->
			
 
				+		                   |        |          ||  +---------+
			
 
				+		                +--v--------+-+ OCxREF ||  +---------+
			
 
				+		                | Chx compare +----------> | Output  | ChX
			
 
				+		                +-----------+-+         |  | Control +-->
			
 
				+		                      .     |           |  +---------+
			
 
				+		                      .     |           |    .
			
 
				+		                +-----------v-+ OC6REF  |    .
			
 
				+		                | Ch6 compare +---------+>
			
 
				+		                +-------------+
			
 
				+
			
 
				+		Example with: "compare_pulse_OC4REF_r_or_OC6REF_r":
			
 
				+
			
 
				+		                X
			
 
				+		              X   X
			
 
				+		            X .   . X
			
 
				+		          X   .   .   X
			
 
				+		        X     .   .     X
			
 
				+		count X .     .   .     . X
			
 
				+		        .     .   .     .
			
 
				+		        .     .   .     .
			
 
				+		        +---------------+
			
 
				+		OC4REF  |     .   .     |
			
 
				+		      +-+     .   .     +-+
			
 
				+		        .     +---+     .
			
 
				+		OC6REF  .     |   |     .
			
 
				+		      +-------+   +-------+
			
 
				+		        +-+   +-+
			
 
				+		TRGO2   | |   | |
			
 
				+		      +-+ +---+ +---------+
			
 
				 
			
 
				 What:		/sys/bus/iio/devices/triggerX/master_mode
			
 
				 KernelVersion:	4.11
			
@@ -90,3 +138,18 @@ Description:
 
				 			Counting is enabled on rising edge of the connected
			
 
				 			trigger, and remains enabled for the duration of this
			
 
				 			selected mode.
			
 
				+
			
 
				+What:		/sys/bus/iio/devices/iio:deviceX/in_count_trigger_mode_available
			
 
				+KernelVersion:	4.13
			
 
				+Contact:	benjamin.gaignard@st.com
			
 
				+Description:
			
 
				+		Reading returns the list of possible trigger modes.
			
 
				+
			
 
				+What:		/sys/bus/iio/devices/iio:deviceX/in_count0_trigger_mode
			
 
				+KernelVersion:	4.13
			
 
				+Contact:	benjamin.gaignard@st.com
			
 
				+Description:
			
 
				+		Configure the device counter trigger mode
			
 
				+		counting direction is set by in_count0_count_direction
			
 
				+		attribute and the counter is clocked by the connected trigger
			
 
				+		rising edges.
			
--- a/Documentation/ABI/testing/sysfs-bus-thunderbolt
+++ b/Documentation/ABI/testing/sysfs-bus-thunderbolt
@@ -0,0 +1,110 @@
 
				+What: /sys/bus/thunderbolt/devices/.../domainX/security
			
 
				+Date:		Sep 2017
			
 
				+KernelVersion:	4.13
			
 
				+Contact:	thunderbolt-software@lists.01.org
			
 
				+Description:	This attribute holds current Thunderbolt security level
			
 
				+		set by the system BIOS. Possible values are:
			
 
				+
			
 
				+		none: All devices are automatically authorized
			
 
				+		user: Devices are only authorized based on writing
			
 
				+		      appropriate value to the authorized attribute
			
 
				+		secure: Require devices that support secure connect at
			
 
				+			minimum. User needs to authorize each device.
			
 
				+		dponly: Automatically tunnel Display port (and USB). No
			
 
				+			PCIe tunnels are created.
			
 
				+
			
 
				+What: /sys/bus/thunderbolt/devices/.../authorized
			
 
				+Date:		Sep 2017
			
 
				+KernelVersion:	4.13
			
 
				+Contact:	thunderbolt-software@lists.01.org
			
 
				+Description:	This attribute is used to authorize Thunderbolt devices
			
 
				+		after they have been connected. If the device is not
			
 
				+		authorized, no devices such as PCIe and Display port are
			
 
				+		available to the system.
			
 
				+
			
 
				+		Contents of this attribute will be 0 when the device is not
			
 
				+		yet authorized.
			
 
				+
			
 
				+		The following values are supported:
			
 
				+		1: The device will be authorized and connected
			
 
				+
			
 
				+		When key attribute contains 32 byte hex string the possible
			
 
				+		values are:
			
 
				+		1: The 32 byte hex string is added to the device NVM and
			
 
				+		   the device is authorized.
			
 
				+		2: Send a challenge based on the 32 byte hex string. If the
			
 
				+		   challenge response from device is valid, the device is
			
 
				+		   authorized. In case of failure errno will be ENOKEY if
			
 
				+		   the device did not contain a key at all, and
			
 
				+		   EKEYREJECTED if the challenge response did not match.
			
 
				+
			
 
				+What: /sys/bus/thunderbolt/devices/.../key
			
 
				+Date:		Sep 2017
			
 
				+KernelVersion:	4.13
			
 
				+Contact:	thunderbolt-software@lists.01.org
			
 
				+Description:	When a device supports Thunderbolt secure connect it will
			
 
				+		have this attribute. Writing 32 byte hex string changes
			
 
				+		authorization to use the secure connection method instead.
			
 
				+
			
 
				+What:		/sys/bus/thunderbolt/devices/.../device
			
 
				+Date:		Sep 2017
			
 
				+KernelVersion:	4.13
			
 
				+Contact:	thunderbolt-software@lists.01.org
			
 
				+Description:	This attribute contains id of this device extracted from
			
 
				+		the device DROM.
			
 
				+
			
 
				+What:		/sys/bus/thunderbolt/devices/.../device_name
			
 
				+Date:		Sep 2017
			
 
				+KernelVersion:	4.13
			
 
				+Contact:	thunderbolt-software@lists.01.org
			
 
				+Description:	This attribute contains name of this device extracted from
			
 
				+		the device DROM.
			
 
				+
			
 
				+What:		/sys/bus/thunderbolt/devices/.../vendor
			
 
				+Date:		Sep 2017
			
 
				+KernelVersion:	4.13
			
 
				+Contact:	thunderbolt-software@lists.01.org
			
 
				+Description:	This attribute contains vendor id of this device extracted
			
 
				+		from the device DROM.
			
 
				+
			
 
				+What:		/sys/bus/thunderbolt/devices/.../vendor_name
			
 
				+Date:		Sep 2017
			
 
				+KernelVersion:	4.13
			
 
				+Contact:	thunderbolt-software@lists.01.org
			
 
				+Description:	This attribute contains vendor name of this device extracted
			
 
				+		from the device DROM.
			
 
				+
			
 
				+What:		/sys/bus/thunderbolt/devices/.../unique_id
			
 
				+Date:		Sep 2017
			
 
				+KernelVersion:	4.13
			
 
				+Contact:	thunderbolt-software@lists.01.org
			
 
				+Description:	This attribute contains unique_id string of this device.
			
 
				+		This is either read from hardware registers (UUID on
			
 
				+		newer hardware) or based on UID from the device DROM.
			
 
				+		Can be used to uniquely identify particular device.
			
 
				+
			
 
				+What:		/sys/bus/thunderbolt/devices/.../nvm_version
			
 
				+Date:		Sep 2017
			
 
				+KernelVersion:	4.13
			
 
				+Contact:	thunderbolt-software@lists.01.org
			
 
				+Description:	If the device has upgradeable firmware the version
			
 
				+		number is available here. Format: %x.%x, major.minor.
			
 
				+		If the device is in safe mode reading the file returns
			
 
				+		-ENODATA instead as the NVM version is not available.
			
 
				+
			
 
				+What:		/sys/bus/thunderbolt/devices/.../nvm_authenticate
			
 
				+Date:		Sep 2017
			
 
				+KernelVersion:	4.13
			
 
				+Contact:	thunderbolt-software@lists.01.org
			
 
				+Description:	When new NVM image is written to the non-active NVM
			
 
				+		area (through non_activeX NVMem device), the
			
 
				+		authentication procedure is started by writing 1 to
			
 
				+		this file. If everything goes well, the device is
			
 
				+		restarted with the new NVM firmware. If the image
			
 
				+		verification fails an error code is returned instead.
			
 
				+
			
 
				+		When read, holds the status of the last authentication
			
 
				+		operation if an error occurred during the process. This
			
 
				+		is directly the status value from the DMA configuration
			
 
				+		based mailbox before the device is power cycled. Writing
			
 
				+		0 here clears the status.
			
--- a/Documentation/ABI/testing/sysfs-class-mtd
+++ b/Documentation/ABI/testing/sysfs-class-mtd
@@ -229,6 +229,6 @@ KernelVersion:	4.1
 
				 Contact:	linux-mtd@lists.infradead.org
			
 
				 Description:
			
 
				 		For a partition, the offset of that partition from the start
			
 
				-		of the master device in bytes. This attribute is absent on
			
 
				-		main devices, so it can be used to distinguish between
			
 
				-		partitions and devices that aren't partitions.
			
 
				+		of the parent (another partition or a flash device) in bytes.
			
 
				+		This attribute is absent on flash devices, so it can be used
			
 
				+		to distinguish them from partitions.
			
--- a/Documentation/ABI/testing/sysfs-class-mux
+++ b/Documentation/ABI/testing/sysfs-class-mux
@@ -0,0 +1,16 @@
 
				+What:		/sys/class/mux/
			
 
				+Date:		April 2017
			
 
				+KernelVersion:	4.13
			
 
				+Contact:	Peter Rosin <peda@axentia.se>
			
 
				+Description:
			
 
				+		The mux/ class sub-directory belongs to the Generic MUX
			
 
				+		Framework and provides a sysfs interface for using MUX
			
 
				+		controllers.
			
 
				+
			
 
				+What:		/sys/class/mux/muxchipN/
			
 
				+Date:		April 2017
			
 
				+KernelVersion:	4.13
			
 
				+Contact:	Peter Rosin <peda@axentia.se>
			
 
				+Description:
			
 
				+		A /sys/class/mux/muxchipN directory is created for each
			
 
				+		probed MUX chip where N is a simple enumeration.
			
--- a/Documentation/ABI/testing/sysfs-class-net
+++ b/Documentation/ABI/testing/sysfs-class-net
@@ -251,3 +251,11 @@ Contact:	netdev@vger.kernel.org
 
				 Description:
			
 
				 		Indicates the unique physical switch identifier of a switch this
			
 
				 		port belongs to, as a string.
			
 
				+
			
 
				+What:		/sys/class/net/<iface>/phydev
			
 
				+Date:		May 2017
			
 
				+KernelVersion:	4.13
			
 
				+Contact:	netdev@vger.kernel.org
			
 
				+Description:
			
 
				+		Symbolic link to the PHY device this network device is attached
			
 
				+		to.
			
--- a/Documentation/ABI/testing/sysfs-class-net-phydev
+++ b/Documentation/ABI/testing/sysfs-class-net-phydev
@@ -0,0 +1,36 @@
 
				+What:		/sys/class/mdio_bus/<bus>/<device>/attached_dev
			
 
				+Date:		May 2017
			
 
				+KernelVersion:	4.13
			
 
				+Contact:	netdev@vger.kernel.org
			
 
				+Description:
			
 
				+		Symbolic link to the network device this PHY device is
			
 
				+		attached to.
			
 
				+
			
 
				+What:		/sys/class/mdio_bus/<bus>/<device>/phy_has_fixups
			
 
				+Date:		February 2014
			
 
				+KernelVersion:	3.15
			
 
				+Contact:	netdev@vger.kernel.org
			
 
				+Description:
			
 
				+		Boolean value indicating whether the PHY device has
			
 
				+		any fixups registered against it (phy_register_fixup)
			
 
				+
			
 
				+What:		/sys/class/mdio_bus/<bus>/<device>/phy_id
			
 
				+Date:		November 2012
			
 
				+KernelVersion:	3.8
			
 
				+Contact:	netdev@vger.kernel.org
			
 
				+Description:
			
 
				+		32-bit hexadecimal value corresponding to the PHY device's OUI,
			
 
				+		model and revision number.
			
 
				+
			
 
				+What:		/sys/class/mdio_bus/<bus>/<device>/phy_interface
			
 
				+Date:		February 2014
			
 
				+KernelVersion:	3.15
			
 
				+Contact:	netdev@vger.kernel.org
			
 
				+Description:
			
 
				+		String value indicating the PHY interface, possible
			
 
				+		values are:
			
 
				+		<empty> (not available), mii, gmii, sgmii, tbi, rev-mii,
			
 
				+		rmii, rgmii, rgmii-id, rgmii-rxid, rgmii-txid, rtbi, smii,
			
 
				+		xgmii, moca, qsgmii, trgmii, 1000base-x, 2500base-x, rxaui,
			
 
				+		xaui, 10gbase-kr, unknown
			
 
				+
			
--- a/Documentation/ABI/testing/sysfs-class-power-twl4030
+++ b/Documentation/ABI/testing/sysfs-class-power-twl4030
@@ -1,20 +1,3 @@
 
				-What: /sys/class/power_supply/twl4030_ac/max_current
			
 
				-      /sys/class/power_supply/twl4030_usb/max_current
			
 
				-Description:
			
 
				-	Read/Write limit on current which may
			
 
				-	be drawn from the ac (Accessory Charger) or
			
 
				-	USB port.
			
 
				-
			
 
				-	Value is in micro-Amps.
			
 
				-
			
 
				-	Value is set automatically to an appropriate
			
 
				-	value when a cable is plugged or unplugged.
			
 
				-
			
 
				-	Value can the set by writing to the attribute.
			
 
				-	The change will only persist until the next
			
 
				-	plug event.  These event are reported via udev.
			
 
				-
			
 
				-
			
 
				 What: /sys/class/power_supply/twl4030_usb/mode
			
 
				 Description:
			
 
				 	Changing mode for USB port.
			
--- a/Documentation/ABI/testing/sysfs-class-typec
+++ b/Documentation/ABI/testing/sysfs-class-typec
@@ -30,6 +30,21 @@ Description:
 
				 
			
 
				 		Valid values: source, sink
			
 
				 
			
 
				+What:           /sys/class/typec/<port>/port_type
			
 
				+Date:           May 2017
			
 
				+Contact:	Badhri Jagan Sridharan <Badhri@google.com>
			
 
				+Description:
			
 
				+		Indicates the type of the port. This attribute can be used for
			
 
				+		requesting a change in the port type. Port type change is
			
 
				+		supported as a synchronous operation, so write(2) to the
			
 
				+		attribute will not return until the operation has finished.
			
 
				+
			
 
				+		Valid values:
			
 
				+		- source (The port will behave as source only DFP port)
			
 
				+		- sink (The port will behave as sink only UFP port)
			
 
				+		- dual (The port will behave as dual-role-data and
			
 
				+			dual-role-power port)
			
 
				+
			
 
				 What:		/sys/class/typec/<port>/vconn_source
			
 
				 Date:		April 2017
			
 
				 Contact:	Heikki Krogerus <heikki.krogerus@linux.intel.com>
			
--- a/Documentation/ABI/testing/sysfs-firmware-ofw
+++ b/Documentation/ABI/testing/sysfs-firmware-ofw
@@ -1,6 +1,6 @@
 
				 What:		/sys/firmware/devicetree/*
			
 
				 Date:		November 2013
			
 
				-Contact:	Grant Likely <grant.likely@linaro.org>
			
 
				+Contact:	Grant Likely <grant.likely@arm.com>, devicetree@vger.kernel.org
			
 
				 Description:
			
 
				 		When using OpenFirmware or a Flattened Device Tree to enumerate
			
 
				 		hardware, the device tree structure will be exposed in this
			
@@ -26,3 +26,27 @@ Description:
 
				 		name plus address). Properties are represented as files
			
 
				 		in the directory. The contents of each file is the exact
			
 
				 		binary data from the device tree.
			
 
				+
			
 
				+What:		/sys/firmware/fdt
			
 
				+Date:		February 2015
			
 
				+KernelVersion:	3.19
			
 
				+Contact:	Frank Rowand <frowand.list@gmail.com>, devicetree@vger.kernel.org
			
 
				+Description:
			
 
				+		Exports the FDT blob that was passed to the kernel by
			
 
				+		the bootloader. This allows userland applications such
			
 
				+		as kexec to access the raw binary. This blob is also
			
 
				+		useful when debugging since it contains any changes
			
 
				+		made to the blob by the bootloader.
			
 
				+
			
 
				+		The fact that this node does not reside under
			
 
				+		/sys/firmware/devicetree is deliberate: FDT is also used
			
 
				+		on arm64 UEFI/ACPI systems to communicate just the UEFI
			
 
				+		and ACPI entry points, but the FDT is never unflattened
			
 
				+		and used to configure the system.
			
 
				+
			
 
				+		A CRC32 checksum is calculated over the entire FDT
			
 
				+		blob, and verified at late_initcall time. The sysfs
			
 
				+		entry is instantiated only if the checksum is valid,
			
 
				+		i.e., if the FDT blob has not been modified in the mean
			
 
				+		time. Otherwise, a warning is printed.
			
 
				+Users:		kexec, debugging
			
--- a/Documentation/ABI/testing/sysfs-fs-f2fs
+++ b/Documentation/ABI/testing/sysfs-fs-f2fs
@@ -75,7 +75,7 @@ Contact:	"Jaegeuk Kim" <jaegeuk.kim@samsung.com>
 
				 Description:
			
 
				 		 Controls the memory footprint used by f2fs.
			
 
				 
			
 
				-What:		/sys/fs/f2fs/<disk>/trim_sections
			
 
				+What:		/sys/fs/f2fs/<disk>/batched_trim_sections
			
 
				 Date:		February 2015
			
 
				 Contact:	"Jaegeuk Kim" <jaegeuk@kernel.org>
			
 
				 Description:
			
@@ -112,3 +112,21 @@ Date:		January 2016
 
				 Contact:	"Shuoran Liu" <liushuoran@huawei.com>
			
 
				 Description:
			
 
				 		 Shows total written kbytes issued to disk.
			
 
				+
			
 
				+What:		/sys/fs/f2fs/<disk>/inject_rate
			
 
				+Date:		May 2016
			
 
				+Contact:	"Sheng Yong" <shengyong1@huawei.com>
			
 
				+Description:
			
 
				+		 Controls the injection rate.
			
 
				+
			
 
				+What:		/sys/fs/f2fs/<disk>/inject_type
			
 
				+Date:		May 2016
			
 
				+Contact:	"Sheng Yong" <shengyong1@huawei.com>
			
 
				+Description:
			
 
				+		 Controls the injection type.
			
 
				+
			
 
				+What:		/sys/fs/f2fs/<disk>/reserved_blocks
			
 
				+Date:		June 2017
			
 
				+Contact:	"Chao Yu" <yuchao0@huawei.com>
			
 
				+Description:
			
 
				+		 Controls current reserved blocks in system.
			
--- a/Documentation/ABI/testing/sysfs-hypervisor-xen
+++ b/Documentation/ABI/testing/sysfs-hypervisor-xen
@@ -1,8 +1,19 @@
 
				+What:		/sys/hypervisor/guest_type
			
 
				+Date:		June 2017
			
 
				+KernelVersion:	4.13
			
 
				+Contact:	xen-devel@lists.xenproject.org
			
 
				+Description:	If running under Xen:
			
 
				+		Type of guest:
			
 
				+		"Xen": standard guest type on arm
			
 
				+		"HVM": fully virtualized guest (x86)
			
 
				+		"PV": paravirtualized guest (x86)
			
 
				+		"PVH": fully virtualized guest without legacy emulation (x86)
			
 
				+
			
 
				 What:		/sys/hypervisor/pmu/pmu_mode
			
 
				 Date:		August 2015
			
 
				 KernelVersion:	4.3
			
 
				 Contact:	Boris Ostrovsky <boris.ostrovsky@oracle.com>
			
 
				-Description:
			
 
				+Description:	If running under Xen:
			
 
				 		Describes mode that Xen's performance-monitoring unit (PMU)
			
 
				 		uses. Accepted values are
			
 
				 			"off"  -- PMU is disabled
			
@@ -17,7 +28,16 @@ What:           /sys/hypervisor/pmu/pmu_features
 
				 Date:           August 2015
			
 
				 KernelVersion:  4.3
			
 
				 Contact:        Boris Ostrovsky <boris.ostrovsky@oracle.com>
			
 
				-Description:
			
 
				+Description:	If running under Xen:
			
 
				 		Describes Xen PMU features (as an integer). A set bit indicates
			
 
				 		that the corresponding feature is enabled. See
			
 
				 		include/xen/interface/xenpmu.h for available features
			
 
				+
			
 
				+What:		/sys/hypervisor/properties/buildid
			
 
				+Date:		June 2017
			
 
				+KernelVersion:	4.13
			
 
				+Contact:	xen-devel@lists.xenproject.org
			
 
				+Description:	If running under Xen:
			
 
				+		Build id of the hypervisor, needed for hypervisor live patching.
			
 
				+		Might return "<denied>" in case of special security settings
			
 
				+		in the hypervisor.
			
--- a/Documentation/ABI/testing/sysfs-platform-ideapad-laptop
+++ b/Documentation/ABI/testing/sysfs-platform-ideapad-laptop
@@ -17,3 +17,11 @@ Description:
 
				 			* 2 -> Dust Cleaning
			
 
				 			* 4 -> Efficient Thermal Dissipation Mode
			
 
				 
			
 
				+What:		/sys/devices/platform/ideapad/touchpad
			
 
				+Date:		May 2017
			
 
				+KernelVersion:	4.13
			
 
				+Contact:	"Ritesh Raj Sarraf" <rrs@debian.org>
			
 
				+Description:
			
 
				+		Control touchpad mode.
			
 
				+			* 1 -> Switched On
			
 
				+			* 0 -> Switched Off
			
--- a/Documentation/ABI/testing/sysfs-uevent
+++ b/Documentation/ABI/testing/sysfs-uevent
@@ -0,0 +1,47 @@
 
				+What:           /sys/.../uevent
			
 
				+Date:           May 2017
			
 
				+KernelVersion:  4.13
			
 
				+Contact:        Linux kernel mailing list <linux-kernel@vger.kernel.org>
			
 
				+Description:
			
 
				+                Enable passing additional variables for synthetic uevents that
			
 
				+                are generated by writing to the /sys/.../uevent file.
			
 
				+
			
 
				+                Recognized extended format is ACTION [UUID [KEY=VALUE ...]].
			
 
				+
			
 
				+                The ACTION is compulsory - it is the name of the uevent action
			
 
				+                ("add", "change", "remove"). There is no change compared to
			
 
				+                previous functionality here. The rest of the extended format
			
 
				+                is optional.
			
 
				+
			
 
				+                You need to pass UUID first before any KEY=VALUE pairs.
			
 
				+                The UUID must be in "xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx"
			
 
				+                format where 'x' is a hex digit. The UUID is considered to be
			
 
				+                a transaction identifier so it's possible to use the same UUID
			
 
				+                value for one or more synthetic uevents in which case we
			
 
				+                logically group these uevents together for any userspace
			
 
				+                listeners. The UUID value appears in uevent as
			
 
				+                "SYNTH_UUID=xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx" environment
			
 
				+                variable.
			
 
				+
			
 
				+                If UUID is not passed in, the generated synthetic uevent gains
			
 
				+                "SYNTH_UUID=0" environment variable automatically.
			
 
				+
			
 
				+                The KEY=VALUE pairs can contain alphanumeric characters only.
			
 
				+                It's possible to define zero or more pairs - each pair is then
			
 
				+                delimited by a space character ' '. Each pair appears in
			
 
				+                synthetic uevent as "SYNTH_ARG_KEY=VALUE". That means the KEY
			
 
				+                name gains "SYNTH_ARG_" prefix to avoid possible collisions
			
 
				+                with existing variables.
			
 
				+
			
 
				+                Example of valid sequence written to the uevent file:
			
 
				+
			
 
				+                    add fe4d7c9d-b8c6-4a70-9ef1-3d8a58d18eed A=1 B=abc
			
 
				+
			
 
				+                This generates synthetic uevent including these variables:
			
 
				+
			
 
				+                    ACTION=add
			
 
				+                    SYNTH_ARG_A=1
			
 
				+                    SYNTH_ARG_B=abc
			
 
				+                    SYNTH_UUID=fe4d7c9d-b8c6-4a70-9ef1-3d8a58d18eed
			
 
				+Users:
			
 
				+                udev, userspace tools generating synthetic uevents
			
--- a/Documentation/DMA-API-HOWTO.txt
+++ b/Documentation/DMA-API-HOWTO.txt
@@ -1,22 +1,24 @@
 
				-		     Dynamic DMA mapping Guide
			
 
				-		     =========================
			
 
				+=========================
			
 
				+Dynamic DMA mapping Guide
			
 
				+=========================
			
 
				 
			
 
				-		 David S. Miller <davem@redhat.com>
			
 
				-		 Richard Henderson <rth@cygnus.com>
			
 
				-		  Jakub Jelinek <jakub@redhat.com>
			
 
				+:Author: David S. Miller <davem@redhat.com>
			
 
				+:Author: Richard Henderson <rth@cygnus.com>
			
 
				+:Author: Jakub Jelinek <jakub@redhat.com>
			
 
				 
			
 
				 This is a guide to device driver writers on how to use the DMA API
			
 
				 with example pseudo-code.  For a concise description of the API, see
			
 
				 DMA-API.txt.
			
 
				 
			
 
				-                       CPU and DMA addresses
			
 
				+CPU and DMA addresses
			
 
				+=====================
			
 
				 
			
 
				 There are several kinds of addresses involved in the DMA API, and it's
			
 
				 important to understand the differences.
			
 
				 
			
 
				 The kernel normally uses virtual addresses.  Any address returned by
			
 
				 kmalloc(), vmalloc(), and similar interfaces is a virtual address and can
			
 
				-be stored in a "void *".
			
 
				+be stored in a ``void *``.
			
 
				 
			
 
				 The virtual memory system (TLB, page tables, etc.) translates virtual
			
 
				 addresses to CPU physical addresses, which are stored as "phys_addr_t" or
			
@@ -37,7 +39,7 @@ be restricted to a subset of that space.  For example, even if a system
 
				 supports 64-bit addresses for main memory and PCI BARs, it may use an IOMMU
			
 
				 so devices only need to use 32-bit DMA addresses.
			
 
				 
			
 
				-Here's a picture and some examples:
			
 
				+Here's a picture and some examples::
			
 
				 
			
 
				                CPU                  CPU                  Bus
			
 
				              Virtual              Physical             Address
			
@@ -98,15 +100,16 @@ microprocessor architecture. You should use the DMA API rather than the
 
				 bus-specific DMA API, i.e., use the dma_map_*() interfaces rather than the
			
 
				 pci_map_*() interfaces.
			
 
				 
			
 
				-First of all, you should make sure
			
 
				+First of all, you should make sure::
			
 
				 
			
 
				-#include <linux/dma-mapping.h>
			
 
				+	#include <linux/dma-mapping.h>
			
 
				 
			
 
				 is in your driver, which provides the definition of dma_addr_t.  This type
			
 
				 can hold any valid DMA address for the platform and should be used
			
 
				 everywhere you hold a DMA address returned from the DMA mapping functions.
			
 
				 
			
 
				-			 What memory is DMA'able?
			
 
				+What memory is DMA'able?
			
 
				+========================
			
 
				 
			
 
				 The first piece of information you must know is what kernel memory can
			
 
				 be used with the DMA mapping facilities.  There has been an unwritten
			
@@ -143,7 +146,8 @@ What about block I/O and networking buffers?  The block I/O and
 
				 networking subsystems make sure that the buffers they use are valid
			
 
				 for you to DMA from/to.
			
 
				 
			
 
				-			DMA addressing limitations
			
 
				+DMA addressing limitations
			
 
				+==========================
			
 
				 
			
 
				 Does your device have any DMA addressing limitations?  For example, is
			
 
				 your device only capable of driving the low order 24-bits of address?
			
@@ -166,7 +170,7 @@ style to do this even if your device holds the default setting,
 
				 because this shows that you did think about these issues wrt. your
			
 
				 device.
			
 
				 
			
 
				-The query is performed via a call to dma_set_mask_and_coherent():
			
 
				+The query is performed via a call to dma_set_mask_and_coherent()::
			
 
				 
			
 
				 	int dma_set_mask_and_coherent(struct device *dev, u64 mask);
			
 
				 
			
@@ -175,12 +179,12 @@ If you have some special requirements, then the following two separate
 
				 queries can be used instead:
			
 
				 
			
 
				 	The query for streaming mappings is performed via a call to
			
 
				-	dma_set_mask():
			
 
				+	dma_set_mask()::
			
 
				 
			
 
				 		int dma_set_mask(struct device *dev, u64 mask);
			
 
				 
			
 
				 	The query for consistent allocations is performed via a call
			
 
				-	to dma_set_coherent_mask():
			
 
				+	to dma_set_coherent_mask()::
			
 
				 
			
 
				 		int dma_set_coherent_mask(struct device *dev, u64 mask);
			
 
				 
			
@@ -209,7 +213,7 @@ of your driver reports that performance is bad or that the device is not
 
				 even detected, you can ask them for the kernel messages to find out
			
 
				 exactly why.
			
 
				 
			
 
				-The standard 32-bit addressing device would do something like this:
			
 
				+The standard 32-bit addressing device would do something like this::
			
 
				 
			
 
				 	if (dma_set_mask_and_coherent(dev, DMA_BIT_MASK(32))) {
			
 
				 		dev_warn(dev, "mydev: No suitable DMA available\n");
			
@@ -225,7 +229,7 @@ than 64-bit addressing.  For example, Sparc64 PCI SAC addressing is
 
				 more efficient than DAC addressing.
			
 
				 
			
 
				 Here is how you would handle a 64-bit capable device which can drive
			
 
				-all 64-bits when accessing streaming DMA:
			
 
				+all 64-bits when accessing streaming DMA::
			
 
				 
			
 
				 	int using_dac;
			
 
				 
			
@@ -239,7 +243,7 @@ all 64-bits when accessing streaming DMA:
 
				 	}
			
 
				 
			
 
				 If a card is capable of using 64-bit consistent allocations as well,
			
 
				-the case would look like this:
			
 
				+the case would look like this::
			
 
				 
			
 
				 	int using_dac, consistent_using_dac;
			
 
				 
			
@@ -260,7 +264,7 @@ uses consistent allocations, one would have to check the return value from
 
				 dma_set_coherent_mask().
			
 
				 
			
 
				 Finally, if your device can only drive the low 24-bits of
			
 
				-address you might do something like:
			
 
				+address you might do something like::
			
 
				 
			
 
				 	if (dma_set_mask(dev, DMA_BIT_MASK(24))) {
			
 
				 		dev_warn(dev, "mydev: 24-bit DMA addressing not available\n");
			
@@ -280,7 +284,7 @@ only provide the functionality which the machine can handle.  It
 
				 is important that the last call to dma_set_mask() be for the
			
 
				 most specific mask.
			
 
				 
			
 
				-Here is pseudo-code showing how this might be done:
			
 
				+Here is pseudo-code showing how this might be done::
			
 
				 
			
 
				 	#define PLAYBACK_ADDRESS_BITS	DMA_BIT_MASK(32)
			
 
				 	#define RECORD_ADDRESS_BITS	DMA_BIT_MASK(24)
			
@@ -308,7 +312,8 @@ A sound card was used as an example here because this genre of PCI
 
				 devices seems to be littered with ISA chips given a PCI front end,
			
 
				 and thus retaining the 16MB DMA addressing limitations of ISA.
			
 
				 
			
 
				-			Types of DMA mappings
			
 
				+Types of DMA mappings
			
 
				+=====================
			
 
				 
			
 
				 There are two types of DMA mappings:
			
 
				 
			
@@ -336,12 +341,14 @@ There are two types of DMA mappings:
 
				   to memory is immediately visible to the device, and vice
			
 
				   versa.  Consistent mappings guarantee this.
			
 
				 
			
 
				-  IMPORTANT: Consistent DMA memory does not preclude the usage of
			
 
				-             proper memory barriers.  The CPU may reorder stores to
			
 
				+  .. important::
			
 
				+
			
 
				+	     Consistent DMA memory does not preclude the usage of
			
 
				+	     proper memory barriers.  The CPU may reorder stores to
			
 
				 	     consistent memory just as it may normal memory.  Example:
			
 
				 	     if it is important for the device to see the first word
			
 
				 	     of a descriptor updated before the second, you must do
			
 
				-	     something like:
			
 
				+	     something like::
			
 
				 
			
 
				 		desc->word0 = address;
			
 
				 		wmb();
			
@@ -377,16 +384,17 @@ Also, systems with caches that aren't DMA-coherent will work better
 
				 when the underlying buffers don't share cache lines with other data.
			
 
				 
			
 
				 
			
 
				-		 Using Consistent DMA mappings.
			
 
				+Using Consistent DMA mappings
			
 
				+=============================
			
 
				 
			
 
				 To allocate and map large (PAGE_SIZE or so) consistent DMA regions,
			
 
				-you should do:
			
 
				+you should do::
			
 
				 
			
 
				 	dma_addr_t dma_handle;
			
 
				 
			
 
				 	cpu_addr = dma_alloc_coherent(dev, size, &dma_handle, gfp);
			
 
				 
			
 
				-where device is a struct device *. This may be called in interrupt
			
 
				+where device is a ``struct device *``. This may be called in interrupt
			
 
				 context with the GFP_ATOMIC flag.
			
 
				 
			
 
				 Size is the length of the region you want to allocate, in bytes.
			
@@ -415,7 +423,7 @@ exists (for example) to guarantee that if you allocate a chunk
 
				 which is smaller than or equal to 64 kilobytes, the extent of the
			
 
				 buffer you receive will not cross a 64K boundary.
			
 
				 
			
 
				-To unmap and free such a DMA region, you call:
			
 
				+To unmap and free such a DMA region, you call::
			
 
				 
			
 
				 	dma_free_coherent(dev, size, cpu_addr, dma_handle);
			
 
				 
			
@@ -430,7 +438,7 @@ a kmem_cache, but it uses dma_alloc_coherent(), not __get_free_pages().
 
				 Also, it understands common hardware constraints for alignment,
			
 
				 like queue heads needing to be aligned on N byte boundaries.
			
 
				 
			
 
				-Create a dma_pool like this:
			
 
				+Create a dma_pool like this::
			
 
				 
			
 
				 	struct dma_pool *pool;
			
 
				 
			
@@ -444,7 +452,7 @@ pass 0 for boundary; passing 4096 says memory allocated from this pool
 
				 must not cross 4KByte boundaries (but at that time it may be better to
			
 
				 use dma_alloc_coherent() directly instead).
			
 
				 
			
 
				-Allocate memory from a DMA pool like this:
			
 
				+Allocate memory from a DMA pool like this::
			
 
				 
			
 
				 	cpu_addr = dma_pool_alloc(pool, flags, &dma_handle);
			
 
				 
			
@@ -452,7 +460,7 @@ flags are GFP_KERNEL if blocking is permitted (not in_interrupt nor
 
				 holding SMP locks), GFP_ATOMIC otherwise.  Like dma_alloc_coherent(),
			
 
				 this returns two values, cpu_addr and dma_handle.
			
 
				 
			
 
				-Free memory that was allocated from a dma_pool like this:
			
 
				+Free memory that was allocated from a dma_pool like this::
			
 
				 
			
 
				 	dma_pool_free(pool, cpu_addr, dma_handle);
			
 
				 
			
@@ -460,7 +468,7 @@ where pool is what you passed to dma_pool_alloc(), and cpu_addr and
 
				 dma_handle are the values dma_pool_alloc() returned. This function
			
 
				 may be called in interrupt context.
			
 
				 
			
 
				-Destroy a dma_pool by calling:
			
 
				+Destroy a dma_pool by calling::
			
 
				 
			
 
				 	dma_pool_destroy(pool);
			
 
				 
			
@@ -468,11 +476,12 @@ Make sure you've called dma_pool_free() for all memory allocated
 
				 from a pool before you destroy the pool. This function may not
			
 
				 be called in interrupt context.
			
 
				 
			
 
				-			DMA Direction
			
 
				+DMA Direction
			
 
				+=============
			
 
				 
			
 
				 The interfaces described in subsequent portions of this document
			
 
				 take a DMA direction argument, which is an integer and takes on
			
 
				-one of the following values:
			
 
				+one of the following values::
			
 
				 
			
 
				  DMA_BIDIRECTIONAL
			
 
				  DMA_TO_DEVICE
			
@@ -521,14 +530,15 @@ packets, map/unmap them with the DMA_TO_DEVICE direction
 
				 specifier.  For receive packets, just the opposite, map/unmap them
			
 
				 with the DMA_FROM_DEVICE direction specifier.
			
 
				 
			
 
				-		  Using Streaming DMA mappings
			
 
				+Using Streaming DMA mappings
			
 
				+============================
			
 
				 
			
 
				 The streaming DMA mapping routines can be called from interrupt
			
 
				 context.  There are two versions of each map/unmap, one which will
			
 
				 map/unmap a single memory region, and one which will map/unmap a
			
 
				 scatterlist.
			
 
				 
			
 
				-To map a single region, you do:
			
 
				+To map a single region, you do::
			
 
				 
			
 
				 	struct device *dev = &my_dev->dev;
			
 
				 	dma_addr_t dma_handle;
			
@@ -545,37 +555,16 @@ To map a single region, you do:
 
				 		goto map_error_handling;
			
 
				 	}
			
 
				 
			
 
				-and to unmap it:
			
 
				+and to unmap it::
			
 
				 
			
 
				 	dma_unmap_single(dev, dma_handle, size, direction);
			
 
				 
			
 
				 You should call dma_mapping_error() as dma_map_single() could fail and return
			
 
				-error. Not all DMA implementations support the dma_mapping_error() interface.
			
 
				-However, it is a good practice to call dma_mapping_error() interface, which
			
 
				-will invoke the generic mapping error check interface. Doing so will ensure
			
 
				-that the mapping code will work correctly on all DMA implementations without
			
 
				-any dependency on the specifics of the underlying implementation. Using the
			
 
				-returned address without checking for errors could result in failures ranging
			
 
				-from panics to silent data corruption. A couple of examples of incorrect ways
			
 
				-to check for errors that make assumptions about the underlying DMA
			
 
				-implementation are as follows and these are applicable to dma_map_page() as
			
 
				-well.
			
 
				-
			
 
				-Incorrect example 1:
			
 
				-	dma_addr_t dma_handle;
			
 
				-
			
 
				-	dma_handle = dma_map_single(dev, addr, size, direction);
			
 
				-	if ((dma_handle & 0xffff != 0) || (dma_handle >= 0x1000000)) {
			
 
				-		goto map_error;
			
 
				-	}
			
 
				-
			
 
				-Incorrect example 2:
			
 
				-	dma_addr_t dma_handle;
			
 
				-
			
 
				-	dma_handle = dma_map_single(dev, addr, size, direction);
			
 
				-	if (dma_handle == DMA_ERROR_CODE) {
			
 
				-		goto map_error;
			
 
				-	}
			
 
				+error.  Doing so will ensure that the mapping code will work correctly on all
			
 
				+DMA implementations without any dependency on the specifics of the underlying
			
 
				+implementation. Using the returned address without checking for errors could
			
 
				+result in failures ranging from panics to silent data corruption.  The same
			
 
				+applies to dma_map_page() as well.
			
 
				 
			
 
				 You should call dma_unmap_single() when the DMA activity is finished, e.g.,
			
 
				 from the interrupt which told you that the DMA transfer is done.
			
@@ -584,7 +573,7 @@ Using CPU pointers like this for single mappings has a disadvantage:
 
				 you cannot reference HIGHMEM memory in this way.  Thus, there is a
			
 
				 map/unmap interface pair akin to dma_{map,unmap}_single().  These
			
 
				 interfaces deal with page/offset pairs instead of CPU pointers.
			
 
				-Specifically:
			
 
				+Specifically::
			
 
				 
			
 
				 	struct device *dev = &my_dev->dev;
			
 
				 	dma_addr_t dma_handle;
			
@@ -614,7 +603,7 @@ error as outlined under the dma_map_single() discussion.
 
				 You should call dma_unmap_page() when the DMA activity is finished, e.g.,
			
 
				 from the interrupt which told you that the DMA transfer is done.
			
 
				 
			
 
				-With scatterlists, you map a region gathered from several regions by:
			
 
				+With scatterlists, you map a region gathered from several regions by::
			
 
				 
			
 
				 	int i, count = dma_map_sg(dev, sglist, nents, direction);
			
 
				 	struct scatterlist *sg;
			
@@ -638,16 +627,18 @@ Then you should loop count times (note: this can be less than nents times)
 
				 and use sg_dma_address() and sg_dma_len() macros where you previously
			
 
				 accessed sg->address and sg->length as shown above.
			
 
				 
			
 
				-To unmap a scatterlist, just call:
			
 
				+To unmap a scatterlist, just call::
			
 
				 
			
 
				 	dma_unmap_sg(dev, sglist, nents, direction);
			
 
				 
			
 
				 Again, make sure DMA activity has already finished.
			
 
				 
			
 
				-PLEASE NOTE:  The 'nents' argument to the dma_unmap_sg call must be
			
 
				-              the _same_ one you passed into the dma_map_sg call,
			
 
				-	      it should _NOT_ be the 'count' value _returned_ from the
			
 
				-              dma_map_sg call.
			
 
				+.. note::
			
 
				+
			
 
				+	The 'nents' argument to the dma_unmap_sg call must be
			
 
				+	the _same_ one you passed into the dma_map_sg call,
			
 
				+	it should _NOT_ be the 'count' value _returned_ from the
			
 
				+	dma_map_sg call.
			
 
				 
			
 
				 Every dma_map_{single,sg}() call should have its dma_unmap_{single,sg}()
			
 
				 counterpart, because the DMA address space is a shared resource and
			
@@ -659,11 +650,11 @@ properly in order for the CPU and device to see the most up-to-date and
 
				 correct copy of the DMA buffer.
			
 
				 
			
 
				 So, firstly, just map it with dma_map_{single,sg}(), and after each DMA
			
 
				-transfer call either:
			
 
				+transfer call either::
			
 
				 
			
 
				 	dma_sync_single_for_cpu(dev, dma_handle, size, direction);
			
 
				 
			
 
				-or:
			
 
				+or::
			
 
				 
			
 
				 	dma_sync_sg_for_cpu(dev, sglist, nents, direction);
			
 
				 
			
@@ -671,17 +662,19 @@ as appropriate.
 
				 
			
 
				 Then, if you wish to let the device get at the DMA area again,
			
 
				 finish accessing the data with the CPU, and then before actually
			
 
				-giving the buffer to the hardware call either:
			
 
				+giving the buffer to the hardware call either::
			
 
				 
			
 
				 	dma_sync_single_for_device(dev, dma_handle, size, direction);
			
 
				 
			
 
				-or:
			
 
				+or::
			
 
				 
			
 
				 	dma_sync_sg_for_device(dev, sglist, nents, direction);
			
 
				 
			
 
				 as appropriate.
			
 
				 
			
 
				-PLEASE NOTE:  The 'nents' argument to dma_sync_sg_for_cpu() and
			
 
				+.. note::
			
 
				+
			
 
				+	      The 'nents' argument to dma_sync_sg_for_cpu() and
			
 
				 	      dma_sync_sg_for_device() must be the same passed to
			
 
				 	      dma_map_sg(). It is _NOT_ the count returned by
			
 
				 	      dma_map_sg().
			
@@ -692,7 +685,7 @@ dma_map_*() call till dma_unmap_*(), then you don't have to call the
 
				 dma_sync_*() routines at all.
			
 
				 
			
 
				 Here is pseudo code which shows a situation in which you would need
			
 
				-to use the dma_sync_*() interfaces.
			
 
				+to use the dma_sync_*() interfaces::
			
 
				 
			
 
				 	my_card_setup_receive_buffer(struct my_card *cp, char *buffer, int len)
			
 
				 	{
			
@@ -768,7 +761,8 @@ is planned to completely remove virt_to_bus() and bus_to_virt() as
 
				 they are entirely deprecated.  Some ports already do not provide these
			
 
				 as it is impossible to correctly support them.
			
 
				 
			
 
				-			Handling Errors
			
 
				+Handling Errors
			
 
				+===============
			
 
				 
			
 
				 DMA address space is limited on some architectures and an allocation
			
 
				 failure can be determined by:
			
@@ -776,7 +770,7 @@ failure can be determined by:
 
				 - checking if dma_alloc_coherent() returns NULL or dma_map_sg returns 0
			
 
				 
			
 
				 - checking the dma_addr_t returned from dma_map_single() and dma_map_page()
			
 
				-  by using dma_mapping_error():
			
 
				+  by using dma_mapping_error()::
			
 
				 
			
 
				 	dma_addr_t dma_handle;
			
 
				 
			
@@ -794,7 +788,8 @@ failure can be determined by:
 
				   of a multiple page mapping attempt. These example are applicable to
			
 
				   dma_map_page() as well.
			
 
				 
			
 
				-Example 1:
			
 
				+Example 1::
			
 
				+
			
 
				 	dma_addr_t dma_handle1;
			
 
				 	dma_addr_t dma_handle2;
			
 
				 
			
@@ -823,8 +818,12 @@ Example 1:
 
				 		dma_unmap_single(dma_handle1);
			
 
				 	map_error_handling1:
			
 
				 
			
 
				-Example 2: (if buffers are allocated in a loop, unmap all mapped buffers when
			
 
				-	    mapping error is detected in the middle)
			
 
				+Example 2::
			
 
				+
			
 
				+	/*
			
 
				+	 * if buffers are allocated in a loop, unmap all mapped buffers when
			
 
				+	 * mapping error is detected in the middle
			
 
				+	 */
			
 
				 
			
 
				 	dma_addr_t dma_addr;
			
 
				 	dma_addr_t array[DMA_BUFFERS];
			
@@ -867,7 +866,8 @@ SCSI drivers must return SCSI_MLQUEUE_HOST_BUSY if the DMA mapping
 
				 fails in the queuecommand hook. This means that the SCSI subsystem
			
 
				 passes the command to the driver again later.
			
 
				 
			
 
				-		Optimizing Unmap State Space Consumption
			
 
				+Optimizing Unmap State Space Consumption
			
 
				+========================================
			
 
				 
			
 
				 On many platforms, dma_unmap_{single,page}() is simply a nop.
			
 
				 Therefore, keeping track of the mapping address and length is a waste
			
@@ -879,7 +879,7 @@ Actually, instead of describing the macros one by one, we'll
 
				 transform some example code.
			
 
				 
			
 
				 1) Use DEFINE_DMA_UNMAP_{ADDR,LEN} in state saving structures.
			
 
				-   Example, before:
			
 
				+   Example, before::
			
 
				 
			
 
				 	struct ring_state {
			
 
				 		struct sk_buff *skb;
			
@@ -887,7 +887,7 @@ transform some example code.
 
				 		__u32 len;
			
 
				 	};
			
 
				 
			
 
				-   after:
			
 
				+   after::
			
 
				 
			
 
				 	struct ring_state {
			
 
				 		struct sk_buff *skb;
			
@@ -896,23 +896,23 @@ transform some example code.
 
				 	};
			
 
				 
			
 
				 2) Use dma_unmap_{addr,len}_set() to set these values.
			
 
				-   Example, before:
			
 
				+   Example, before::
			
 
				 
			
 
				 	ringp->mapping = FOO;
			
 
				 	ringp->len = BAR;
			
 
				 
			
 
				-   after:
			
 
				+   after::
			
 
				 
			
 
				 	dma_unmap_addr_set(ringp, mapping, FOO);
			
 
				 	dma_unmap_len_set(ringp, len, BAR);
			
 
				 
			
 
				 3) Use dma_unmap_{addr,len}() to access these values.
			
 
				-   Example, before:
			
 
				+   Example, before::
			
 
				 
			
 
				 	dma_unmap_single(dev, ringp->mapping, ringp->len,
			
 
				 			 DMA_FROM_DEVICE);
			
 
				 
			
 
				-   after:
			
 
				+   after::
			
 
				 
			
 
				 	dma_unmap_single(dev,
			
 
				 			 dma_unmap_addr(ringp, mapping),
			
@@ -923,7 +923,8 @@ It really should be self-explanatory.  We treat the ADDR and LEN
 
				 separately, because it is possible for an implementation to only
			
 
				 need the address in order to perform the unmap operation.
			
 
				 
			
 
				-			Platform Issues
			
 
				+Platform Issues
			
 
				+===============
			
 
				 
			
 
				 If you are just writing drivers for Linux and do not maintain
			
 
				 an architecture port for the kernel, you can safely skip down
			
@@ -949,12 +950,13 @@ to "Closing".
 
				    alignment constraints (e.g. the alignment constraints about 64-bit
			
 
				    objects).
			
 
				 
			
 
				-			   Closing
			
 
				+Closing
			
 
				+=======
			
 
				 
			
 
				 This document, and the API itself, would not be in its current
			
 
				 form without the feedback and suggestions from numerous individuals.
			
 
				 We would like to specifically mention, in no particular order, the
			
 
				-following people:
			
 
				+following people::
			
 
				 
			
 
				 	Russell King <rmk@arm.linux.org.uk>
			
 
				 	Leo Dagum <dagum@barrel.engr.sgi.com>
			
--- a/Documentation/DMA-API.txt
+++ b/Documentation/DMA-API.txt
@@ -1,7 +1,8 @@
 
				-               Dynamic DMA mapping using the generic device
			
 
				-               ============================================
			
 
				+============================================
			
 
				+Dynamic DMA mapping using the generic device
			
 
				+============================================
			
 
				 
			
 
				-        James E.J. Bottomley <James.Bottomley@HansenPartnership.com>
			
 
				+:Author: James E.J. Bottomley <James.Bottomley@HansenPartnership.com>
			
 
				 
			
 
				 This document describes the DMA API.  For a more gentle introduction
			
 
				 of the API (and actual examples), see Documentation/DMA-API-HOWTO.txt.
			
@@ -12,10 +13,10 @@ machines.  Unless you know that your driver absolutely has to support
 
				 non-consistent platforms (this is usually only legacy platforms) you
			
 
				 should only use the API described in part I.
			
 
				 
			
 
				-Part I - dma_ API
			
 
				--------------------------------------
			
 
				+Part I - dma_API
			
 
				+----------------
			
 
				 
			
 
				-To get the dma_ API, you must #include <linux/dma-mapping.h>.  This
			
 
				+To get the dma_API, you must #include <linux/dma-mapping.h>.  This
			
 
				 provides dma_addr_t and the interfaces described below.
			
 
				 
			
 
				 A dma_addr_t can hold any valid DMA address for the platform.  It can be
			
@@ -26,9 +27,11 @@ address space and the DMA address space.
 
				 Part Ia - Using large DMA-coherent buffers
			
 
				 ------------------------------------------
			
 
				 
			
 
				-void *
			
 
				-dma_alloc_coherent(struct device *dev, size_t size,
			
 
				-			     dma_addr_t *dma_handle, gfp_t flag)
			
 
				+::
			
 
				+
			
 
				+	void *
			
 
				+	dma_alloc_coherent(struct device *dev, size_t size,
			
 
				+			   dma_addr_t *dma_handle, gfp_t flag)
			
 
				 
			
 
				 Consistent memory is memory for which a write by either the device or
			
 
				 the processor can immediately be read by the processor or device
			
@@ -51,20 +54,24 @@ consolidate your requests for consistent memory as much as possible.
 
				 The simplest way to do that is to use the dma_pool calls (see below).
			
 
				 
			
 
				 The flag parameter (dma_alloc_coherent() only) allows the caller to
			
 
				-specify the GFP_ flags (see kmalloc()) for the allocation (the
			
 
				+specify the ``GFP_`` flags (see kmalloc()) for the allocation (the
			
 
				 implementation may choose to ignore flags that affect the location of
			
 
				 the returned memory, like GFP_DMA).
			
 
				 
			
 
				-void *
			
 
				-dma_zalloc_coherent(struct device *dev, size_t size,
			
 
				-			     dma_addr_t *dma_handle, gfp_t flag)
			
 
				+::
			
 
				+
			
 
				+	void *
			
 
				+	dma_zalloc_coherent(struct device *dev, size_t size,
			
 
				+			    dma_addr_t *dma_handle, gfp_t flag)
			
 
				 
			
 
				 Wraps dma_alloc_coherent() and also zeroes the returned memory if the
			
 
				 allocation attempt succeeded.
			
 
				 
			
 
				-void
			
 
				-dma_free_coherent(struct device *dev, size_t size, void *cpu_addr,
			
 
				-			   dma_addr_t dma_handle)
			
 
				+::
			
 
				+
			
 
				+	void
			
 
				+	dma_free_coherent(struct device *dev, size_t size, void *cpu_addr,
			
 
				+			  dma_addr_t dma_handle)
			
 
				 
			
 
				 Free a region of consistent memory you previously allocated.  dev,
			
 
				 size and dma_handle must all be the same as those passed into
			
@@ -78,7 +85,7 @@ may only be called with IRQs enabled.
 
				 Part Ib - Using small DMA-coherent buffers
			
 
				 ------------------------------------------
			
 
				 
			
 
				-To get this part of the dma_ API, you must #include <linux/dmapool.h>
			
 
				+To get this part of the dma_API, you must #include <linux/dmapool.h>
			
 
				 
			
 
				 Many drivers need lots of small DMA-coherent memory regions for DMA
			
 
				 descriptors or I/O buffers.  Rather than allocating in units of a page
			
@@ -88,6 +95,8 @@ not __get_free_pages().  Also, they understand common hardware constraints
 
				 for alignment, like queue heads needing to be aligned on N-byte boundaries.
			
 
				 
			
 
				 
			
 
				+::
			
 
				+
			
 
				 	struct dma_pool *
			
 
				 	dma_pool_create(const char *name, struct device *dev,
			
 
				 			size_t size, size_t align, size_t alloc);
			
@@ -103,16 +112,21 @@ in bytes, and must be a power of two).  If your device has no boundary
 
				 crossing restrictions, pass 0 for alloc; passing 4096 says memory allocated
			
 
				 from this pool must not cross 4KByte boundaries.
			
 
				 
			
 
				+::
			
 
				 
			
 
				-	void *dma_pool_zalloc(struct dma_pool *pool, gfp_t mem_flags,
			
 
				-			      dma_addr_t *handle)
			
 
				+	void *
			
 
				+	dma_pool_zalloc(struct dma_pool *pool, gfp_t mem_flags,
			
 
				+		        dma_addr_t *handle)
			
 
				 
			
 
				 Wraps dma_pool_alloc() and also zeroes the returned memory if the
			
 
				 allocation attempt succeeded.
			
 
				 
			
 
				 
			
 
				-	void *dma_pool_alloc(struct dma_pool *pool, gfp_t gfp_flags,
			
 
				-			dma_addr_t *dma_handle);
			
 
				+::
			
 
				+
			
 
				+	void *
			
 
				+	dma_pool_alloc(struct dma_pool *pool, gfp_t gfp_flags,
			
 
				+		       dma_addr_t *dma_handle);
			
 
				 
			
 
				 This allocates memory from the pool; the returned memory will meet the
			
 
				 size and alignment requirements specified at creation time.  Pass
			
@@ -122,16 +136,20 @@ blocking.  Like dma_alloc_coherent(), this returns two values:  an
 
				 address usable by the CPU, and the DMA address usable by the pool's
			
 
				 device.
			
 
				 
			
 
				+::
			
 
				 
			
 
				-	void dma_pool_free(struct dma_pool *pool, void *vaddr,
			
 
				-			dma_addr_t addr);
			
 
				+	void
			
 
				+	dma_pool_free(struct dma_pool *pool, void *vaddr,
			
 
				+		      dma_addr_t addr);
			
 
				 
			
 
				 This puts memory back into the pool.  The pool is what was passed to
			
 
				 dma_pool_alloc(); the CPU (vaddr) and DMA addresses are what
			
 
				 were returned when that routine allocated the memory being freed.
			
 
				 
			
 
				+::
			
 
				 
			
 
				-	void dma_pool_destroy(struct dma_pool *pool);
			
 
				+	void
			
 
				+	dma_pool_destroy(struct dma_pool *pool);
			
 
				 
			
 
				 dma_pool_destroy() frees the resources of the pool.  It must be
			
 
				 called in a context which can sleep.  Make sure you've freed all allocated
			
@@ -141,32 +159,40 @@ memory back to the pool before you destroy it.
 
				 Part Ic - DMA addressing limitations
			
 
				 ------------------------------------
			
 
				 
			
 
				-int
			
 
				-dma_set_mask_and_coherent(struct device *dev, u64 mask)
			
 
				+::
			
 
				+
			
 
				+	int
			
 
				+	dma_set_mask_and_coherent(struct device *dev, u64 mask)
			
 
				 
			
 
				 Checks to see if the mask is possible and updates the device
			
 
				 streaming and coherent DMA mask parameters if it is.
			
 
				 
			
 
				 Returns: 0 if successful and a negative error if not.
			
 
				 
			
 
				-int
			
 
				-dma_set_mask(struct device *dev, u64 mask)
			
 
				+::
			
 
				+
			
 
				+	int
			
 
				+	dma_set_mask(struct device *dev, u64 mask)
			
 
				 
			
 
				 Checks to see if the mask is possible and updates the device
			
 
				 parameters if it is.
			
 
				 
			
 
				 Returns: 0 if successful and a negative error if not.
			
 
				 
			
 
				-int
			
 
				-dma_set_coherent_mask(struct device *dev, u64 mask)
			
 
				+::
			
 
				+
			
 
				+	int
			
 
				+	dma_set_coherent_mask(struct device *dev, u64 mask)
			
 
				 
			
 
				 Checks to see if the mask is possible and updates the device
			
 
				 parameters if it is.
			
 
				 
			
 
				 Returns: 0 if successful and a negative error if not.
			
 
				 
			
 
				-u64
			
 
				-dma_get_required_mask(struct device *dev)
			
 
				+::
			
 
				+
			
 
				+	u64
			
 
				+	dma_get_required_mask(struct device *dev)
			
 
				 
			
 
				 This API returns the mask that the platform requires to
			
 
				 operate efficiently.  Usually this means the returned mask
			
@@ -182,94 +208,107 @@ call to set the mask to the value returned.
 
				 Part Id - Streaming DMA mappings
			
 
				 --------------------------------
			
 
				 
			
 
				-dma_addr_t
			
 
				-dma_map_single(struct device *dev, void *cpu_addr, size_t size,
			
 
				-		      enum dma_data_direction direction)
			
 
				+::
			
 
				+
			
 
				+	dma_addr_t
			
 
				+	dma_map_single(struct device *dev, void *cpu_addr, size_t size,
			
 
				+		       enum dma_data_direction direction)
			
 
				 
			
 
				 Maps a piece of processor virtual memory so it can be accessed by the
			
 
				 device and returns the DMA address of the memory.
			
 
				 
			
 
				 The direction for both APIs may be converted freely by casting.
			
 
				-However the dma_ API uses a strongly typed enumerator for its
			
 
				+However the dma_API uses a strongly typed enumerator for its
			
 
				 direction:
			
 
				 
			
 
				+======================= =============================================
			
 
				 DMA_NONE		no direction (used for debugging)
			
 
				 DMA_TO_DEVICE		data is going from the memory to the device
			
 
				 DMA_FROM_DEVICE		data is coming from the device to the memory
			
 
				 DMA_BIDIRECTIONAL	direction isn't known
			
 
				+======================= =============================================
			
 
				+
			
 
				+.. note::
			
 
				+
			
 
				+	Not all memory regions in a machine can be mapped by this API.
			
 
				+	Further, contiguous kernel virtual space may not be contiguous as
			
 
				+	physical memory.  Since this API does not provide any scatter/gather
			
 
				+	capability, it will fail if the user tries to map a non-physically
			
 
				+	contiguous piece of memory.  For this reason, memory to be mapped by
			
 
				+	this API should be obtained from sources which guarantee it to be
			
 
				+	physically contiguous (like kmalloc).
			
 
				+
			
 
				+	Further, the DMA address of the memory must be within the
			
 
				+	dma_mask of the device (the dma_mask is a bit mask of the
			
 
				+	addressable region for the device, i.e., if the DMA address of
			
 
				+	the memory ANDed with the dma_mask is still equal to the DMA
			
 
				+	address, then the device can perform DMA to the memory).  To
			
 
				+	ensure that the memory allocated by kmalloc is within the dma_mask,
			
 
				+	the driver may specify various platform-dependent flags to restrict
			
 
				+	the DMA address range of the allocation (e.g., on x86, GFP_DMA
			
 
				+	guarantees to be within the first 16MB of available DMA addresses,
			
 
				+	as required by ISA devices).
			
 
				+
			
 
				+	Note also that the above constraints on physical contiguity and
			
 
				+	dma_mask may not apply if the platform has an IOMMU (a device which
			
 
				+	maps an I/O DMA address to a physical memory address).  However, to be
			
 
				+	portable, device driver writers may *not* assume that such an IOMMU
			
 
				+	exists.
			
 
				+
			
 
				+.. warning::
			
 
				+
			
 
				+	Memory coherency operates at a granularity called the cache
			
 
				+	line width.  In order for memory mapped by this API to operate
			
 
				+	correctly, the mapped region must begin exactly on a cache line
			
 
				+	boundary and end exactly on one (to prevent two separately mapped
			
 
				+	regions from sharing a single cache line).  Since the cache line size
			
 
				+	may not be known at compile time, the API will not enforce this
			
 
				+	requirement.  Therefore, it is recommended that driver writers who
			
 
				+	don't take special care to determine the cache line size at run time
			
 
				+	only map virtual regions that begin and end on page boundaries (which
			
 
				+	are guaranteed also to be cache line boundaries).
			
 
				+
			
 
				+	DMA_TO_DEVICE synchronisation must be done after the last modification
			
 
				+	of the memory region by the software and before it is handed off to
			
 
				+	the device.  Once this primitive is used, memory covered by this
			
 
				+	primitive should be treated as read-only by the device.  If the device
			
 
				+	may write to it at any point, it should be DMA_BIDIRECTIONAL (see
			
 
				+	below).
			
 
				+
			
 
				+	DMA_FROM_DEVICE synchronisation must be done before the driver
			
 
				+	accesses data that may be changed by the device.  This memory should
			
 
				+	be treated as read-only by the driver.  If the driver needs to write
			
 
				+	to it at any point, it should be DMA_BIDIRECTIONAL (see below).
			
 
				+
			
 
				+	DMA_BIDIRECTIONAL requires special handling: it means that the driver
			
 
				+	isn't sure if the memory was modified before being handed off to the
			
 
				+	device and also isn't sure if the device will also modify it.  Thus,
			
 
				+	you must always sync bidirectional memory twice: once before the
			
 
				+	memory is handed off to the device (to make sure all memory changes
			
 
				+	are flushed from the processor) and once before the data may be
			
 
				+	accessed after being used by the device (to make sure any processor
			
 
				+	cache lines are updated with data that the device may have changed).
			
 
				+
			
 
				+::
			
 
				 
			
 
				-Notes:  Not all memory regions in a machine can be mapped by this API.
			
 
				-Further, contiguous kernel virtual space may not be contiguous as
			
 
				-physical memory.  Since this API does not provide any scatter/gather
			
 
				-capability, it will fail if the user tries to map a non-physically
			
 
				-contiguous piece of memory.  For this reason, memory to be mapped by
			
 
				-this API should be obtained from sources which guarantee it to be
			
 
				-physically contiguous (like kmalloc).
			
 
				-
			
 
				-Further, the DMA address of the memory must be within the
			
 
				-dma_mask of the device (the dma_mask is a bit mask of the
			
 
				-addressable region for the device, i.e., if the DMA address of
			
 
				-the memory ANDed with the dma_mask is still equal to the DMA
			
 
				-address, then the device can perform DMA to the memory).  To
			
 
				-ensure that the memory allocated by kmalloc is within the dma_mask,
			
 
				-the driver may specify various platform-dependent flags to restrict
			
 
				-the DMA address range of the allocation (e.g., on x86, GFP_DMA
			
 
				-guarantees to be within the first 16MB of available DMA addresses,
			
 
				-as required by ISA devices).
			
 
				-
			
 
				-Note also that the above constraints on physical contiguity and
			
 
				-dma_mask may not apply if the platform has an IOMMU (a device which
			
 
				-maps an I/O DMA address to a physical memory address).  However, to be
			
 
				-portable, device driver writers may *not* assume that such an IOMMU
			
 
				-exists.
			
 
				-
			
 
				-Warnings:  Memory coherency operates at a granularity called the cache
			
 
				-line width.  In order for memory mapped by this API to operate
			
 
				-correctly, the mapped region must begin exactly on a cache line
			
 
				-boundary and end exactly on one (to prevent two separately mapped
			
 
				-regions from sharing a single cache line).  Since the cache line size
			
 
				-may not be known at compile time, the API will not enforce this
			
 
				-requirement.  Therefore, it is recommended that driver writers who
			
 
				-don't take special care to determine the cache line size at run time
			
 
				-only map virtual regions that begin and end on page boundaries (which
			
 
				-are guaranteed also to be cache line boundaries).
			
 
				-
			
 
				-DMA_TO_DEVICE synchronisation must be done after the last modification
			
 
				-of the memory region by the software and before it is handed off to
			
 
				-the device.  Once this primitive is used, memory covered by this
			
 
				-primitive should be treated as read-only by the device.  If the device
			
 
				-may write to it at any point, it should be DMA_BIDIRECTIONAL (see
			
 
				-below).
			
 
				-
			
 
				-DMA_FROM_DEVICE synchronisation must be done before the driver
			
 
				-accesses data that may be changed by the device.  This memory should
			
 
				-be treated as read-only by the driver.  If the driver needs to write
			
 
				-to it at any point, it should be DMA_BIDIRECTIONAL (see below).
			
 
				-
			
 
				-DMA_BIDIRECTIONAL requires special handling: it means that the driver
			
 
				-isn't sure if the memory was modified before being handed off to the
			
 
				-device and also isn't sure if the device will also modify it.  Thus,
			
 
				-you must always sync bidirectional memory twice: once before the
			
 
				-memory is handed off to the device (to make sure all memory changes
			
 
				-are flushed from the processor) and once before the data may be
			
 
				-accessed after being used by the device (to make sure any processor
			
 
				-cache lines are updated with data that the device may have changed).
			
 
				-
			
 
				-void
			
 
				-dma_unmap_single(struct device *dev, dma_addr_t dma_addr, size_t size,
			
 
				-		 enum dma_data_direction direction)
			
 
				+	void
			
 
				+	dma_unmap_single(struct device *dev, dma_addr_t dma_addr, size_t size,
			
 
				+			 enum dma_data_direction direction)
			
 
				 
			
 
				 Unmaps the region previously mapped.  All the parameters passed in
			
 
				 must be identical to those passed in (and returned) by the mapping
			
 
				 API.
			
 
				 
			
 
				-dma_addr_t
			
 
				-dma_map_page(struct device *dev, struct page *page,
			
 
				-		    unsigned long offset, size_t size,
			
 
				-		    enum dma_data_direction direction)
			
 
				-void
			
 
				-dma_unmap_page(struct device *dev, dma_addr_t dma_address, size_t size,
			
 
				-	       enum dma_data_direction direction)
			
 
				+::
			
 
				+
			
 
				+	dma_addr_t
			
 
				+	dma_map_page(struct device *dev, struct page *page,
			
 
				+		     unsigned long offset, size_t size,
			
 
				+		     enum dma_data_direction direction)
			
 
				+
			
 
				+	void
			
 
				+	dma_unmap_page(struct device *dev, dma_addr_t dma_address, size_t size,
			
 
				+		       enum dma_data_direction direction)
			
 
				 
			
 
				 API for mapping and unmapping for pages.  All the notes and warnings
			
 
				 for the other mapping APIs apply here.  Also, although the <offset>
			
@@ -277,20 +316,24 @@ and <size> parameters are provided to do partial page mapping, it is
 
				 recommended that you never use these unless you really know what the
			
 
				 cache width is.
			
 
				 
			
 
				-dma_addr_t
			
 
				-dma_map_resource(struct device *dev, phys_addr_t phys_addr, size_t size,
			
 
				-		 enum dma_data_direction dir, unsigned long attrs)
			
 
				+::
			
 
				 
			
 
				-void
			
 
				-dma_unmap_resource(struct device *dev, dma_addr_t addr, size_t size,
			
 
				-		   enum dma_data_direction dir, unsigned long attrs)
			
 
				+	dma_addr_t
			
 
				+	dma_map_resource(struct device *dev, phys_addr_t phys_addr, size_t size,
			
 
				+			 enum dma_data_direction dir, unsigned long attrs)
			
 
				+
			
 
				+	void
			
 
				+	dma_unmap_resource(struct device *dev, dma_addr_t addr, size_t size,
			
 
				+			   enum dma_data_direction dir, unsigned long attrs)
			
 
				 
			
 
				 API for mapping and unmapping for MMIO resources. All the notes and
			
 
				 warnings for the other mapping APIs apply here. The API should only be
			
 
				 used to map device MMIO resources, mapping of RAM is not permitted.
			
 
				 
			
 
				-int
			
 
				-dma_mapping_error(struct device *dev, dma_addr_t dma_addr)
			
 
				+::
			
 
				+
			
 
				+	int
			
 
				+	dma_mapping_error(struct device *dev, dma_addr_t dma_addr)
			
 
				 
			
 
				 In some circumstances dma_map_single(), dma_map_page() and dma_map_resource()
			
 
				 will fail to create a mapping. A driver can check for these errors by testing
			
@@ -298,9 +341,11 @@ the returned DMA address with dma_mapping_error(). A non-zero return value
 
				 means the mapping could not be created and the driver should take appropriate
			
 
				 action (e.g. reduce current DMA mapping usage or delay and try again later).
			
 
				 
			
 
				+::
			
 
				+
			
 
				 	int
			
 
				 	dma_map_sg(struct device *dev, struct scatterlist *sg,
			
 
				-		int nents, enum dma_data_direction direction)
			
 
				+		   int nents, enum dma_data_direction direction)
			
 
				 
			
 
				 Returns: the number of DMA address segments mapped (this may be shorter
			
 
				 than <nents> passed in if some elements of the scatter/gather list are
			
@@ -316,7 +361,7 @@ critical that the driver do something, in the case of a block driver
 
				 aborting the request or even oopsing is better than doing nothing and
			
 
				 corrupting the filesystem.
			
 
				 
			
 
				-With scatterlists, you use the resulting mapping like this:
			
 
				+With scatterlists, you use the resulting mapping like this::
			
 
				 
			
 
				 	int i, count = dma_map_sg(dev, sglist, nents, direction);
			
 
				 	struct scatterlist *sg;
			
@@ -337,9 +382,11 @@ Then you should loop count times (note: this can be less than nents times)
 
				 and use sg_dma_address() and sg_dma_len() macros where you previously
			
 
				 accessed sg->address and sg->length as shown above.
			
 
				 
			
 
				+::
			
 
				+
			
 
				 	void
			
 
				 	dma_unmap_sg(struct device *dev, struct scatterlist *sg,
			
 
				-		int nents, enum dma_data_direction direction)
			
 
				+		     int nents, enum dma_data_direction direction)
			
 
				 
			
 
				 Unmap the previously mapped scatter/gather list.  All the parameters
			
 
				 must be the same as those and passed in to the scatter/gather mapping
			
@@ -348,18 +395,27 @@ API.
 
				 Note: <nents> must be the number you passed in, *not* the number of
			
 
				 DMA address entries returned.
			
 
				 
			
 
				-void
			
 
				-dma_sync_single_for_cpu(struct device *dev, dma_addr_t dma_handle, size_t size,
			
 
				-			enum dma_data_direction direction)
			
 
				-void
			
 
				-dma_sync_single_for_device(struct device *dev, dma_addr_t dma_handle, size_t size,
			
 
				-			   enum dma_data_direction direction)
			
 
				-void
			
 
				-dma_sync_sg_for_cpu(struct device *dev, struct scatterlist *sg, int nents,
			
 
				-		    enum dma_data_direction direction)
			
 
				-void
			
 
				-dma_sync_sg_for_device(struct device *dev, struct scatterlist *sg, int nents,
			
 
				-		       enum dma_data_direction direction)
			
 
				+::
			
 
				+
			
 
				+	void
			
 
				+	dma_sync_single_for_cpu(struct device *dev, dma_addr_t dma_handle,
			
 
				+				size_t size,
			
 
				+				enum dma_data_direction direction)
			
 
				+
			
 
				+	void
			
 
				+	dma_sync_single_for_device(struct device *dev, dma_addr_t dma_handle,
			
 
				+				   size_t size,
			
 
				+				   enum dma_data_direction direction)
			
 
				+
			
 
				+	void
			
 
				+	dma_sync_sg_for_cpu(struct device *dev, struct scatterlist *sg,
			
 
				+			    int nents,
			
 
				+			    enum dma_data_direction direction)
			
 
				+
			
 
				+	void
			
 
				+	dma_sync_sg_for_device(struct device *dev, struct scatterlist *sg,
			
 
				+			       int nents,
			
 
				+			       enum dma_data_direction direction)
			
 
				 
			
 
				 Synchronise a single contiguous or scatter/gather mapping for the CPU
			
 
				 and device. With the sync_sg API, all the parameters must be the same
			
@@ -367,36 +423,41 @@ as those passed into the single mapping API. With the sync_single API,
 
				 you can use dma_handle and size parameters that aren't identical to
			
 
				 those passed into the single mapping API to do a partial sync.
			
 
				 
			
 
				-Notes:  You must do this:
			
 
				 
			
 
				-- Before reading values that have been written by DMA from the device
			
 
				-  (use the DMA_FROM_DEVICE direction)
			
 
				-- After writing values that will be written to the device using DMA
			
 
				-  (use the DMA_TO_DEVICE) direction
			
 
				-- before *and* after handing memory to the device if the memory is
			
 
				-  DMA_BIDIRECTIONAL
			
 
				+.. note::
			
 
				+
			
 
				+   You must do this:
			
 
				+
			
 
				+   - Before reading values that have been written by DMA from the device
			
 
				+     (use the DMA_FROM_DEVICE direction)
			
 
				+   - After writing values that will be written to the device using DMA
			
 
				+     (use the DMA_TO_DEVICE direction)
			
 
				+   - before *and* after handing memory to the device if the memory is
			
 
				+     DMA_BIDIRECTIONAL
			
 
				 
			
 
				 See also dma_map_single().
			
 
				 
			
 
				-dma_addr_t
			
 
				-dma_map_single_attrs(struct device *dev, void *cpu_addr, size_t size,
			
 
				-		     enum dma_data_direction dir,
			
 
				-		     unsigned long attrs)
			
 
				+::
			
 
				+
			
 
				+	dma_addr_t
			
 
				+	dma_map_single_attrs(struct device *dev, void *cpu_addr, size_t size,
			
 
				+			     enum dma_data_direction dir,
			
 
				+			     unsigned long attrs)
			
 
				 
			
 
				-void
			
 
				-dma_unmap_single_attrs(struct device *dev, dma_addr_t dma_addr,
			
 
				-		       size_t size, enum dma_data_direction dir,
			
 
				-		       unsigned long attrs)
			
 
				+	void
			
 
				+	dma_unmap_single_attrs(struct device *dev, dma_addr_t dma_addr,
			
 
				+			       size_t size, enum dma_data_direction dir,
			
 
				+			       unsigned long attrs)
			
 
				 
			
 
				-int
			
 
				-dma_map_sg_attrs(struct device *dev, struct scatterlist *sgl,
			
 
				-		 int nents, enum dma_data_direction dir,
			
 
				-		 unsigned long attrs)
			
 
				+	int
			
 
				+	dma_map_sg_attrs(struct device *dev, struct scatterlist *sgl,
			
 
				+			 int nents, enum dma_data_direction dir,
			
 
				+			 unsigned long attrs)
			
 
				 
			
 
				-void
			
 
				-dma_unmap_sg_attrs(struct device *dev, struct scatterlist *sgl,
			
 
				-		   int nents, enum dma_data_direction dir,
			
 
				-		   unsigned long attrs)
			
 
				+	void
			
 
				+	dma_unmap_sg_attrs(struct device *dev, struct scatterlist *sgl,
			
 
				+			   int nents, enum dma_data_direction dir,
			
 
				+			   unsigned long attrs)
			
 
				 
			
 
				 The four functions above are just like the counterpart functions
			
 
				 without the _attrs suffixes, except that they pass an optional
			
@@ -410,37 +471,38 @@ is identical to those of the corresponding function
 
				 without the _attrs suffix. As a result dma_map_single_attrs()
			
 
				 can generally replace dma_map_single(), etc.
			
 
				 
			
 
				-As an example of the use of the *_attrs functions, here's how
			
 
				+As an example of the use of the ``*_attrs`` functions, here's how
			
 
				 you could pass an attribute DMA_ATTR_FOO when mapping memory
			
 
				-for DMA:
			
 
				+for DMA::
			
 
				 
			
 
				-#include <linux/dma-mapping.h>
			
 
				-/* DMA_ATTR_FOO should be defined in linux/dma-mapping.h and
			
 
				- * documented in Documentation/DMA-attributes.txt */
			
 
				-...
			
 
				+	#include <linux/dma-mapping.h>
			
 
				+	/* DMA_ATTR_FOO should be defined in linux/dma-mapping.h and
			
 
				+	 * documented in Documentation/DMA-attributes.txt */
			
 
				+	...
			
 
				 
			
 
				-	unsigned long attr;
			
 
				-	attr |= DMA_ATTR_FOO;
			
 
				-	....
			
 
				-	n = dma_map_sg_attrs(dev, sg, nents, DMA_TO_DEVICE, attr);
			
 
				-	....
			
 
				+		unsigned long attr;
			
 
				+		attr |= DMA_ATTR_FOO;
			
 
				+		....
			
 
				+		n = dma_map_sg_attrs(dev, sg, nents, DMA_TO_DEVICE, attr);
			
 
				+		....
			
 
				 
			
 
				 Architectures that care about DMA_ATTR_FOO would check for its
			
 
				 presence in their implementations of the mapping and unmapping
			
 
				-routines, e.g.:
			
 
				-
			
 
				-void whizco_dma_map_sg_attrs(struct device *dev, dma_addr_t dma_addr,
			
 
				-			     size_t size, enum dma_data_direction dir,
			
 
				-			     unsigned long attrs)
			
 
				-{
			
 
				-	....
			
 
				-	if (attrs & DMA_ATTR_FOO)
			
 
				-		/* twizzle the frobnozzle */
			
 
				-	....
			
 
				+routines, e.g.::
			
 
				+
			
 
				+	void whizco_dma_map_sg_attrs(struct device *dev, dma_addr_t dma_addr,
			
 
				+				     size_t size, enum dma_data_direction dir,
			
 
				+				     unsigned long attrs)
			
 
				+	{
			
 
				+		....
			
 
				+		if (attrs & DMA_ATTR_FOO)
			
 
				+			/* twizzle the frobnozzle */
			
 
				+		....
			
 
				+	}
			
 
				 
			
 
				 
			
 
				-Part II - Advanced dma_ usage
			
 
				------------------------------
			
 
				+Part II - Advanced dma usage
			
 
				+----------------------------
			
 
				 
			
 
				 Warning: These pieces of the DMA API should not be used in the
			
 
				 majority of cases, since they cater for unlikely corner cases that
			
@@ -450,9 +512,11 @@ If you don't understand how cache line coherency works between a
 
				 processor and an I/O device, you should not be using this part of the
			
 
				 API at all.
			
 
				 
			
 
				-void *
			
 
				-dma_alloc_noncoherent(struct device *dev, size_t size,
			
 
				-			       dma_addr_t *dma_handle, gfp_t flag)
			
 
				+::
			
 
				+
			
 
				+	void *
			
 
				+	dma_alloc_noncoherent(struct device *dev, size_t size,
			
 
				+			      dma_addr_t *dma_handle, gfp_t flag)
			
 
				 
			
 
				 Identical to dma_alloc_coherent() except that the platform will
			
 
				 choose to return either consistent or non-consistent memory as it sees
			
@@ -468,39 +532,49 @@ only use this API if you positively know your driver will be
 
				 required to work on one of the rare (usually non-PCI) architectures
			
 
				 that simply cannot make consistent memory.
			
 
				 
			
 
				-void
			
 
				-dma_free_noncoherent(struct device *dev, size_t size, void *cpu_addr,
			
 
				-			      dma_addr_t dma_handle)
			
 
				+::
			
 
				+
			
 
				+	void
			
 
				+	dma_free_noncoherent(struct device *dev, size_t size, void *cpu_addr,
			
 
				+			     dma_addr_t dma_handle)
			
 
				 
			
 
				 Free memory allocated by the nonconsistent API.  All parameters must
			
 
				 be identical to those passed in (and returned by
			
 
				 dma_alloc_noncoherent()).
			
 
				 
			
 
				-int
			
 
				-dma_get_cache_alignment(void)
			
 
				+::
			
 
				+
			
 
				+	int
			
 
				+	dma_get_cache_alignment(void)
			
 
				 
			
 
				 Returns the processor cache alignment.  This is the absolute minimum
			
 
				 alignment *and* width that you must observe when either mapping
			
 
				 memory or doing partial flushes.
			
 
				 
			
 
				-Notes: This API may return a number *larger* than the actual cache
			
 
				-line, but it will guarantee that one or more cache lines fit exactly
			
 
				-into the width returned by this call.  It will also always be a power
			
 
				-of two for easy alignment.
			
 
				+.. note::
			
 
				 
			
 
				-void
			
 
				-dma_cache_sync(struct device *dev, void *vaddr, size_t size,
			
 
				-	       enum dma_data_direction direction)
			
 
				+	This API may return a number *larger* than the actual cache
			
 
				+	line, but it will guarantee that one or more cache lines fit exactly
			
 
				+	into the width returned by this call.  It will also always be a power
			
 
				+	of two for easy alignment.
			
 
				+
			
 
				+::
			
 
				+
			
 
				+	void
			
 
				+	dma_cache_sync(struct device *dev, void *vaddr, size_t size,
			
 
				+		       enum dma_data_direction direction)
			
 
				 
			
 
				 Do a partial sync of memory that was allocated by
			
 
				 dma_alloc_noncoherent(), starting at virtual address vaddr and
			
 
				 continuing on for size.  Again, you *must* observe the cache line
			
 
				 boundaries when doing this.
			
 
				 
			
 
				-int
			
 
				-dma_declare_coherent_memory(struct device *dev, phys_addr_t phys_addr,
			
 
				-			    dma_addr_t device_addr, size_t size, int
			
 
				-			    flags)
			
 
				+::
			
 
				+
			
 
				+	int
			
 
				+	dma_declare_coherent_memory(struct device *dev, phys_addr_t phys_addr,
			
 
				+				    dma_addr_t device_addr, size_t size, int
			
 
				+				    flags)
			
 
				 
			
 
				 Declare region of memory to be handed out by dma_alloc_coherent() when
			
 
				 it's asked for coherent memory for this device.
			
@@ -516,21 +590,21 @@ size is the size of the area (must be multiples of PAGE_SIZE).
 
				 
			
 
				 flags can be ORed together and are:
			
 
				 
			
 
				-DMA_MEMORY_MAP - request that the memory returned from
			
 
				-dma_alloc_coherent() be directly writable.
			
 
				+- DMA_MEMORY_MAP - request that the memory returned from
			
 
				+  dma_alloc_coherent() be directly writable.
			
 
				 
			
 
				-DMA_MEMORY_IO - request that the memory returned from
			
 
				-dma_alloc_coherent() be addressable using read()/write()/memcpy_toio() etc.
			
 
				+- DMA_MEMORY_IO - request that the memory returned from
			
 
				+  dma_alloc_coherent() be addressable using read()/write()/memcpy_toio() etc.
			
 
				 
			
 
				 One or both of these flags must be present.
			
 
				 
			
 
				-DMA_MEMORY_INCLUDES_CHILDREN - make the declared memory be allocated by
			
 
				-dma_alloc_coherent of any child devices of this one (for memory residing
			
 
				-on a bridge).
			
 
				+- DMA_MEMORY_INCLUDES_CHILDREN - make the declared memory be allocated by
			
 
				+  dma_alloc_coherent of any child devices of this one (for memory residing
			
 
				+  on a bridge).
			
 
				 
			
 
				-DMA_MEMORY_EXCLUSIVE - only allocate memory from the declared regions. 
			
 
				-Do not allow dma_alloc_coherent() to fall back to system memory when
			
 
				-it's out of memory in the declared region.
			
 
				+- DMA_MEMORY_EXCLUSIVE - only allocate memory from the declared regions.
			
 
				+  Do not allow dma_alloc_coherent() to fall back to system memory when
			
 
				+  it's out of memory in the declared region.
			
 
				 
			
 
				 The return value will be either DMA_MEMORY_MAP or DMA_MEMORY_IO and
			
 
				 must correspond to a passed in flag (i.e. no returning DMA_MEMORY_IO
			
@@ -543,15 +617,17 @@ must be accessed using the correct bus functions.  If your driver
 
				 isn't prepared to handle this contingency, it should not specify
			
 
				 DMA_MEMORY_IO in the input flags.
			
 
				 
			
 
				-As a simplification for the platforms, only *one* such region of
			
 
				+As a simplification for the platforms, only **one** such region of
			
 
				 memory may be declared per device.
			
 
				 
			
 
				 For reasons of efficiency, most platforms choose to track the declared
			
 
				 region only at the granularity of a page.  For smaller allocations,
			
 
				 you should use the dma_pool() API.
			
 
				 
			
 
				-void
			
 
				-dma_release_declared_memory(struct device *dev)
			
 
				+::
			
 
				+
			
 
				+	void
			
 
				+	dma_release_declared_memory(struct device *dev)
			
 
				 
			
 
				 Remove the memory region previously declared from the system.  This
			
 
				 API performs *no* in-use checking for this region and will return
			
@@ -559,9 +635,11 @@ unconditionally having removed all the required structures.  It is the
 
				 driver's job to ensure that no parts of this memory region are
			
 
				 currently in use.
			
 
				 
			
 
				-void *
			
 
				-dma_mark_declared_memory_occupied(struct device *dev,
			
 
				-				  dma_addr_t device_addr, size_t size)
			
 
				+::
			
 
				+
			
 
				+	void *
			
 
				+	dma_mark_declared_memory_occupied(struct device *dev,
			
 
				+					  dma_addr_t device_addr, size_t size)
			
 
				 
			
 
				 This is used to occupy specific regions of the declared space
			
 
				 (dma_alloc_coherent() will hand out the first free region it finds).
			
@@ -592,38 +670,37 @@ option has a performance impact. Do not enable it in production kernels.
 
				 If you boot the resulting kernel, it will contain code which does some bookkeeping
			
 
				 about what DMA memory was allocated for which device. If this code detects an
			
 
				 error it prints a warning message with some details into your kernel log. An
			
 
				-example warning message may look like this:
			
 
				-
			
 
				-------------[ cut here ]------------
			
 
				-WARNING: at /data2/repos/linux-2.6-iommu/lib/dma-debug.c:448
			
 
				-	check_unmap+0x203/0x490()
			
 
				-Hardware name:
			
 
				-forcedeth 0000:00:08.0: DMA-API: device driver frees DMA memory with wrong
			
 
				-	function [device address=0x00000000640444be] [size=66 bytes] [mapped as
			
 
				-single] [unmapped as page]
			
 
				-Modules linked in: nfsd exportfs bridge stp llc r8169
			
 
				-Pid: 0, comm: swapper Tainted: G        W  2.6.28-dmatest-09289-g8bb99c0 #1
			
 
				-Call Trace:
			
 
				- <IRQ>  [<ffffffff80240b22>] warn_slowpath+0xf2/0x130
			
 
				- [<ffffffff80647b70>] _spin_unlock+0x10/0x30
			
 
				- [<ffffffff80537e75>] usb_hcd_link_urb_to_ep+0x75/0xc0
			
 
				- [<ffffffff80647c22>] _spin_unlock_irqrestore+0x12/0x40
			
 
				- [<ffffffff8055347f>] ohci_urb_enqueue+0x19f/0x7c0
			
 
				- [<ffffffff80252f96>] queue_work+0x56/0x60
			
 
				- [<ffffffff80237e10>] enqueue_task_fair+0x20/0x50
			
 
				- [<ffffffff80539279>] usb_hcd_submit_urb+0x379/0xbc0
			
 
				- [<ffffffff803b78c3>] cpumask_next_and+0x23/0x40
			
 
				- [<ffffffff80235177>] find_busiest_group+0x207/0x8a0
			
 
				- [<ffffffff8064784f>] _spin_lock_irqsave+0x1f/0x50
			
 
				- [<ffffffff803c7ea3>] check_unmap+0x203/0x490
			
 
				- [<ffffffff803c8259>] debug_dma_unmap_page+0x49/0x50
			
 
				- [<ffffffff80485f26>] nv_tx_done_optimized+0xc6/0x2c0
			
 
				- [<ffffffff80486c13>] nv_nic_irq_optimized+0x73/0x2b0
			
 
				- [<ffffffff8026df84>] handle_IRQ_event+0x34/0x70
			
 
				- [<ffffffff8026ffe9>] handle_edge_irq+0xc9/0x150
			
 
				- [<ffffffff8020e3ab>] do_IRQ+0xcb/0x1c0
			
 
				- [<ffffffff8020c093>] ret_from_intr+0x0/0xa
			
 
				- <EOI> <4>---[ end trace f6435a98e2a38c0e ]---
			
 
				+example warning message may look like this::
			
 
				+
			
 
				+	WARNING: at /data2/repos/linux-2.6-iommu/lib/dma-debug.c:448
			
 
				+		check_unmap+0x203/0x490()
			
 
				+	Hardware name:
			
 
				+	forcedeth 0000:00:08.0: DMA-API: device driver frees DMA memory with wrong
			
 
				+		function [device address=0x00000000640444be] [size=66 bytes] [mapped as
			
 
				+	single] [unmapped as page]
			
 
				+	Modules linked in: nfsd exportfs bridge stp llc r8169
			
 
				+	Pid: 0, comm: swapper Tainted: G        W  2.6.28-dmatest-09289-g8bb99c0 #1
			
 
				+	Call Trace:
			
 
				+	<IRQ>  [<ffffffff80240b22>] warn_slowpath+0xf2/0x130
			
 
				+	[<ffffffff80647b70>] _spin_unlock+0x10/0x30
			
 
				+	[<ffffffff80537e75>] usb_hcd_link_urb_to_ep+0x75/0xc0
			
 
				+	[<ffffffff80647c22>] _spin_unlock_irqrestore+0x12/0x40
			
 
				+	[<ffffffff8055347f>] ohci_urb_enqueue+0x19f/0x7c0
			
 
				+	[<ffffffff80252f96>] queue_work+0x56/0x60
			
 
				+	[<ffffffff80237e10>] enqueue_task_fair+0x20/0x50
			
 
				+	[<ffffffff80539279>] usb_hcd_submit_urb+0x379/0xbc0
			
 
				+	[<ffffffff803b78c3>] cpumask_next_and+0x23/0x40
			
 
				+	[<ffffffff80235177>] find_busiest_group+0x207/0x8a0
			
 
				+	[<ffffffff8064784f>] _spin_lock_irqsave+0x1f/0x50
			
 
				+	[<ffffffff803c7ea3>] check_unmap+0x203/0x490
			
 
				+	[<ffffffff803c8259>] debug_dma_unmap_page+0x49/0x50
			
 
				+	[<ffffffff80485f26>] nv_tx_done_optimized+0xc6/0x2c0
			
 
				+	[<ffffffff80486c13>] nv_nic_irq_optimized+0x73/0x2b0
			
 
				+	[<ffffffff8026df84>] handle_IRQ_event+0x34/0x70
			
 
				+	[<ffffffff8026ffe9>] handle_edge_irq+0xc9/0x150
			
 
				+	[<ffffffff8020e3ab>] do_IRQ+0xcb/0x1c0
			
 
				+	[<ffffffff8020c093>] ret_from_intr+0x0/0xa
			
 
				+	<EOI> <4>---[ end trace f6435a98e2a38c0e ]---
			
 
				 
			
 
				 The driver developer can find the driver and the device including a stacktrace
			
 
				 of the DMA-API call which caused this warning.
			
@@ -637,43 +714,42 @@ details.
 
				 The debugfs directory for the DMA-API debugging code is called dma-api/. In
			
 
				 this directory the following files can currently be found:
			
 
				 
			
 
				-	dma-api/all_errors	This file contains a numeric value. If this
			
 
				+=============================== ===============================================
			
 
				+dma-api/all_errors		This file contains a numeric value. If this
			
 
				 				value is not equal to zero the debugging code
			
 
				 				will print a warning for every error it finds
			
 
				 				into the kernel log. Be careful with this
			
 
				 				option, as it can easily flood your logs.
			
 
				 
			
 
				-	dma-api/disabled	This read-only file contains the character 'Y'
			
 
				+dma-api/disabled		This read-only file contains the character 'Y'
			
 
				 				if the debugging code is disabled. This can
			
 
				 				happen when it runs out of memory or if it was
			
 
				 				disabled at boot time
			
 
				 
			
 
				-	dma-api/error_count	This file is read-only and shows the total
			
 
				+dma-api/error_count		This file is read-only and shows the total
			
 
				 				numbers of errors found.
			
 
				 
			
 
				-	dma-api/num_errors	The number in this file shows how many
			
 
				+dma-api/num_errors		The number in this file shows how many
			
 
				 				warnings will be printed to the kernel log
			
 
				 				before it stops. This number is initialized to
			
 
				 				one at system boot and can be set by writing into
			
 
				 				this file
			
 
				 
			
 
				-	dma-api/min_free_entries
			
 
				-				This read-only file can be read to get the
			
 
				+dma-api/min_free_entries	This read-only file can be read to get the
			
 
				 				minimum number of free dma_debug_entries the
			
 
				 				allocator has ever seen. If this value goes
			
 
				 				down to zero the code will disable itself
			
 
				 				because it is no longer reliable.
			
 
				 
			
 
				-	dma-api/num_free_entries
			
 
				-				The current number of free dma_debug_entries
			
 
				+dma-api/num_free_entries	The current number of free dma_debug_entries
			
 
				 				in the allocator.
			
 
				 
			
 
				-	dma-api/driver-filter
			
 
				-				You can write a name of a driver into this file
			
 
				+dma-api/driver-filter		You can write a name of a driver into this file
			
 
				 				to limit the debug output to requests from that
			
 
				 				particular driver. Write an empty string to
			
 
				 				that file to disable the filter and see
			
 
				 				all errors again.
			
 
				+=============================== ===============================================
			
 
				 
			
 
				 If you have this code compiled into your kernel it will be enabled by default.
			
 
				 If you want to boot without the bookkeeping anyway you can provide
			
@@ -692,7 +768,10 @@ of preallocated entries is defined per architecture. If it is too low for you
 
				 boot with 'dma_debug_entries=<your_desired_number>' to overwrite the
			
 
				 architectural default.
			
 
				 
			
 
				-void debug_dmap_mapping_error(struct device *dev, dma_addr_t dma_addr);
			
 
				+::
			
 
				+
			
 
				+	void
			
 
				+	debug_dma_mapping_error(struct device *dev, dma_addr_t dma_addr);
			
 
				 
			
 
				 dma-debug interface debug_dma_mapping_error() to debug drivers that fail
			
 
				 to check DMA mapping errors on addresses returned by dma_map_single() and
			
@@ -702,4 +781,3 @@ the driver. When driver does unmap, debug_dma_unmap() checks the flag and if
 
				 this flag is still set, prints warning message that includes call trace that
			
 
				 leads up to the unmap. This interface can be called from dma_mapping_error()
			
 
				 routines to enable DMA mapping error check debugging.
			
 
				-
			
--- a/Documentation/DMA-ISA-LPC.txt
+++ b/Documentation/DMA-ISA-LPC.txt
@@ -1,19 +1,20 @@
 
				-                        DMA with ISA and LPC devices
			
 
				-                        ============================
			
 
				+============================
			
 
				+DMA with ISA and LPC devices
			
 
				+============================
			
 
				 
			
 
				-                      Pierre Ossman <drzeus@drzeus.cx>
			
 
				+:Author: Pierre Ossman <drzeus@drzeus.cx>
			
 
				 
			
 
				 This document describes how to do DMA transfers using the old ISA DMA
			
 
				 controller. Even though ISA is more or less dead today the LPC bus
			
 
				 uses the same DMA system so it will be around for quite some time.
			
 
				 
			
 
				-Part I - Headers and dependencies
			
 
				----------------------------------
			
 
				+Headers and dependencies
			
 
				+------------------------
			
 
				 
			
 
				-To do ISA style DMA you need to include two headers:
			
 
				+To do ISA style DMA you need to include two headers::
			
 
				 
			
 
				-#include <linux/dma-mapping.h>
			
 
				-#include <asm/dma.h>
			
 
				+	#include <linux/dma-mapping.h>
			
 
				+	#include <asm/dma.h>
			
 
				 
			
 
				 The first is the generic DMA API used to convert virtual addresses to
			
 
				 bus addresses (see Documentation/DMA-API.txt for details).
			
@@ -23,8 +24,8 @@ this is not present on all platforms make sure you construct your
 
				 Kconfig to be dependent on ISA_DMA_API (not ISA) so that nobody tries
			
 
				 to build your driver on unsupported platforms.
			
 
				 
			
 
				-Part II - Buffer allocation
			
 
				----------------------------
			
 
				+Buffer allocation
			
 
				+-----------------
			
 
				 
			
 
				 The ISA DMA controller has some very strict requirements on which
			
 
				 memory it can access so extra care must be taken when allocating
			
@@ -42,13 +43,13 @@ requirements you pass the flag GFP_DMA to kmalloc.
 
				 
			
 
				 Unfortunately the memory available for ISA DMA is scarce so unless you
			
 
				 allocate the memory during boot-up it's a good idea to also pass
			
 
				-__GFP_REPEAT and __GFP_NOWARN to make the allocator try a bit harder.
			
 
				+__GFP_RETRY_MAYFAIL and __GFP_NOWARN to make the allocator try a bit harder.
			
 
				 
			
 
				 (This scarcity also means that you should allocate the buffer as
			
 
				 early as possible and not release it until the driver is unloaded.)
			
 
				 
			
 
				-Part III - Address translation
			
 
				-------------------------------
			
 
				+Address translation
			
 
				+-------------------
			
 
				 
			
 
				 To translate the virtual address to a bus address, use the normal DMA
			
 
				 API. Do _not_ use isa_virt_to_phys() even though it does the same
			
@@ -61,8 +62,8 @@ Note: x86_64 had a broken DMA API when it came to ISA but has since
 
				 been fixed. If your arch has problems then fix the DMA API instead of
			
 
				 reverting to the ISA functions.
			
 
				 
			
 
				-Part IV - Channels
			
 
				-------------------
			
 
				+Channels
			
 
				+--------
			
 
				 
			
 
				 A normal ISA DMA controller has 8 channels. The lower four are for
			
 
				 8-bit transfers and the upper four are for 16-bit transfers.
			
@@ -80,8 +81,8 @@ The ability to use 16-bit or 8-bit transfers is _not_ up to you as a
 
				 driver author but depends on what the hardware supports. Check your
			
 
				 specs or test different channels.
			
 
				 
			
 
				-Part V - Transfer data
			
 
				-----------------------
			
 
				+Transfer data
			
 
				+-------------
			
 
				 
			
 
				 Now for the good stuff, the actual DMA transfer. :)
			
 
				 
			
@@ -112,37 +113,37 @@ Once the DMA transfer is finished (or timed out) you should disable
 
				 the channel again. You should also check get_dma_residue() to make
			
 
				 sure that all data has been transferred.
			
 
				 
			
 
				-Example:
			
 
				+Example::
			
 
				 
			
 
				-int flags, residue;
			
 
				+	int flags, residue;
			
 
				 
			
 
				-flags = claim_dma_lock();
			
 
				+	flags = claim_dma_lock();
			
 
				 
			
 
				-clear_dma_ff();
			
 
				+	clear_dma_ff();
			
 
				 
			
 
				-set_dma_mode(channel, DMA_MODE_WRITE);
			
 
				-set_dma_addr(channel, phys_addr);
			
 
				-set_dma_count(channel, num_bytes);
			
 
				+	set_dma_mode(channel, DMA_MODE_WRITE);
			
 
				+	set_dma_addr(channel, phys_addr);
			
 
				+	set_dma_count(channel, num_bytes);
			
 
				 
			
 
				-dma_enable(channel);
			
 
				+	dma_enable(channel);
			
 
				 
			
 
				-release_dma_lock(flags);
			
 
				+	release_dma_lock(flags);
			
 
				 
			
 
				-while (!device_done());
			
 
				+	while (!device_done());
			
 
				 
			
 
				-flags = claim_dma_lock();
			
 
				+	flags = claim_dma_lock();
			
 
				 
			
 
				-dma_disable(channel);
			
 
				+	dma_disable(channel);
			
 
				 
			
 
				-residue = dma_get_residue(channel);
			
 
				-if (residue != 0)
			
 
				-	printk(KERN_ERR "driver: Incomplete DMA transfer!"
			
 
				-		" %d bytes left!\n", residue);
			
 
				+	residue = dma_get_residue(channel);
			
 
				+	if (residue != 0)
			
 
				+		printk(KERN_ERR "driver: Incomplete DMA transfer!"
			
 
				+			" %d bytes left!\n", residue);
			
 
				 
			
 
				-release_dma_lock(flags);
			
 
				+	release_dma_lock(flags);
			
 
				 
			
 
				-Part VI - Suspend/resume
			
 
				-------------------------
			
 
				+Suspend/resume
			
 
				+--------------
			
 
				 
			
 
				 It is the driver's responsibility to make sure that the machine isn't
			
 
				 suspended while a DMA transfer is in progress. Also, all DMA settings
			
--- a/Documentation/DMA-attributes.txt
+++ b/Documentation/DMA-attributes.txt
@@ -1,5 +1,6 @@
 
				-			DMA attributes
			
 
				-			==============
			
 
				+==============
			
 
				+DMA attributes
			
 
				+==============
			
 
				 
			
 
				 This document describes the semantics of the DMA attributes that are
			
 
				 defined in linux/dma-mapping.h.
			
@@ -108,6 +109,7 @@ This is a hint to the DMA-mapping subsystem that it's probably not worth
 
				 the time to try to allocate memory in a way that gives better TLB
			
 
				 efficiency (AKA it's not worth trying to build the mapping out of larger
			
 
				 pages).  You might want to specify this if:
			
 
				+
			
 
				 - You know that the accesses to this memory won't thrash the TLB.
			
 
				   You might know that the accesses are likely to be sequential or
			
 
				   that they aren't sequential but it's unlikely you'll ping-pong
			
@@ -121,11 +123,12 @@ pages).  You might want to specify this if:
 
				   the mapping to have a short lifetime then it may be worth it to
			
 
				   optimize allocation (avoid coming up with large pages) instead of
			
 
				   getting the slight performance win of larger pages.
			
 
				+
			
 
				 Setting this hint doesn't guarantee that you won't get huge pages, but it
			
 
				 means that we won't try quite as hard to get them.
			
 
				 
			
 
				-NOTE: At the moment DMA_ATTR_ALLOC_SINGLE_PAGES is only implemented on ARM,
			
 
				-though ARM64 patches will likely be posted soon.
			
 
				+.. note:: At the moment DMA_ATTR_ALLOC_SINGLE_PAGES is only implemented on ARM,
			
 
				+	  though ARM64 patches will likely be posted soon.
			
 
				 
			
 
				 DMA_ATTR_NO_WARN
			
 
				 ----------------
			
@@ -142,10 +145,10 @@ problem at all, depending on the implementation of the retry mechanism.
 
				 So, this provides a way for drivers to avoid those error messages on calls
			
 
				 where allocation failures are not a problem, and shouldn't bother the logs.
			
 
				 
			
 
				-NOTE: At the moment DMA_ATTR_NO_WARN is only implemented on PowerPC.
			
 
				+.. note:: At the moment DMA_ATTR_NO_WARN is only implemented on PowerPC.
			
 
				 
			
 
				 DMA_ATTR_PRIVILEGED
			
 
				-------------------------------
			
 
				+-------------------
			
 
				 
			
 
				 Some advanced peripherals such as remote processors and GPUs perform
			
 
				 accesses to DMA buffers in both privileged "supervisor" and unprivileged
			
--- a/Documentation/DocBook/.gitignore
+++ b/Documentation/DocBook/.gitignore
@@ -1,17 +0,0 @@
 
				-*.xml
			
 
				-*.ps
			
 
				-*.pdf
			
 
				-*.html
			
 
				-*.9.gz
			
 
				-*.9
			
 
				-*.aux
			
 
				-*.dvi
			
 
				-*.log
			
 
				-*.out
			
 
				-*.png
			
 
				-*.gif
			
 
				-*.svg
			
 
				-*.proc
			
 
				-*.db
			
 
				-media-indices.tmpl
			
 
				-media-entities.tmpl
			
--- a/Documentation/DocBook/Makefile
+++ b/Documentation/DocBook/Makefile
@@ -1,282 +0,0 @@
 
				-###
			
 
				-# This makefile is used to generate the kernel documentation,
			
 
				-# primarily based on in-line comments in various source files.
			
 
				-# See Documentation/kernel-doc-nano-HOWTO.txt for instruction in how
			
 
				-# to document the SRC - and how to read it.
			
 
				-# To add a new book the only step required is to add the book to the
			
 
				-# list of DOCBOOKS.
			
 
				-
			
 
				-DOCBOOKS := z8530book.xml  \
			
 
				-	    kernel-hacking.xml kernel-locking.xml \
			
 
				-	    networking.xml \
			
 
				-	    filesystems.xml lsm.xml kgdb.xml \
			
 
				-	    libata.xml mtdnand.xml librs.xml rapidio.xml \
			
 
				-	    s390-drivers.xml scsi.xml \
			
 
				-	    sh.xml w1.xml
			
 
				-
			
 
				-ifeq ($(DOCBOOKS),)
			
 
				-
			
 
				-# Skip DocBook build if the user explicitly requested no DOCBOOKS.
			
 
				-.DEFAULT:
			
 
				-	@echo "  SKIP    DocBook $@ target (DOCBOOKS=\"\" specified)."
			
 
				-else
			
 
				-ifneq ($(SPHINXDIRS),)
			
 
				-
			
 
				-# Skip DocBook build if the user explicitly requested a sphinx dir
			
 
				-.DEFAULT:
			
 
				-	@echo "  SKIP    DocBook $@ target (SPHINXDIRS specified)."
			
 
				-else
			
 
				-
			
 
				-
			
 
				-###
			
 
				-# The build process is as follows (targets):
			
 
				-#              (xmldocs) [by docproc]
			
 
				-# file.tmpl --> file.xml +--> file.ps   (psdocs)   [by db2ps or xmlto]
			
 
				-#                        +--> file.pdf  (pdfdocs)  [by db2pdf or xmlto]
			
 
				-#                        +--> DIR=file  (htmldocs) [by xmlto]
			
 
				-#                        +--> man/      (mandocs)  [by xmlto]
			
 
				-
			
 
				-
			
 
				-# for PDF and PS output you can choose between xmlto and docbook-utils tools
			
 
				-PDF_METHOD	= $(prefer-db2x)
			
 
				-PS_METHOD	= $(prefer-db2x)
			
 
				-
			
 
				-
			
 
				-targets += $(DOCBOOKS)
			
 
				-BOOKS := $(addprefix $(obj)/,$(DOCBOOKS))
			
 
				-xmldocs: $(BOOKS)
			
 
				-sgmldocs: xmldocs
			
 
				-
			
 
				-PS := $(patsubst %.xml, %.ps, $(BOOKS))
			
 
				-psdocs: $(PS)
			
 
				-
			
 
				-PDF := $(patsubst %.xml, %.pdf, $(BOOKS))
			
 
				-pdfdocs: $(PDF)
			
 
				-
			
 
				-HTML := $(sort $(patsubst %.xml, %.html, $(BOOKS)))
			
 
				-htmldocs: $(HTML)
			
 
				-	$(call cmd,build_main_index)
			
 
				-
			
 
				-MAN := $(patsubst %.xml, %.9, $(BOOKS))
			
 
				-mandocs: $(MAN)
			
 
				-	find $(obj)/man -name '*.9' | xargs gzip -nf
			
 
				-
			
 
				-# Default location for installed man pages
			
 
				-export INSTALL_MAN_PATH = $(objtree)/usr
			
 
				-
			
 
				-installmandocs: mandocs
			
 
				-	mkdir -p $(INSTALL_MAN_PATH)/man/man9/
			
 
				-	find $(obj)/man -name '*.9.gz' -printf '%h %f\n' | \
			
 
				-		sort -k 2 -k 1 | uniq -f 1 | sed -e 's: :/:' | \
			
 
				-		xargs install -m 644 -t $(INSTALL_MAN_PATH)/man/man9/
			
 
				-
			
 
				-# no-op for the DocBook toolchain
			
 
				-epubdocs:
			
 
				-latexdocs:
			
 
				-linkcheckdocs:
			
 
				-
			
 
				-###
			
 
				-#External programs used
			
 
				-KERNELDOCXMLREF = $(srctree)/scripts/kernel-doc-xml-ref
			
 
				-KERNELDOC       = $(srctree)/scripts/kernel-doc
			
 
				-DOCPROC         = $(objtree)/scripts/docproc
			
 
				-CHECK_LC_CTYPE = $(objtree)/scripts/check-lc_ctype
			
 
				-
			
 
				-# Use a fixed encoding - UTF-8 if the C library has support built-in
			
 
				-# or ASCII if not
			
 
				-LC_CTYPE := $(call try-run, LC_CTYPE=C.UTF-8 $(CHECK_LC_CTYPE),C.UTF-8,C)
			
 
				-export LC_CTYPE
			
 
				-
			
 
				-XMLTOFLAGS = -m $(srctree)/$(src)/stylesheet.xsl
			
 
				-XMLTOFLAGS += --skip-validation
			
 
				-
			
 
				-###
			
 
				-# DOCPROC is used for two purposes:
			
 
				-# 1) To generate a dependency list for a .tmpl file
			
 
				-# 2) To preprocess a .tmpl file and call kernel-doc with
			
 
				-#     appropriate parameters.
			
 
				-# The following rules are used to generate the .xml documentation
			
 
				-# required to generate the final targets. (ps, pdf, html).
			
 
				-quiet_cmd_docproc = DOCPROC $@
			
 
				-      cmd_docproc = SRCTREE=$(srctree)/ $(DOCPROC) doc $< >$@
			
 
				-define rule_docproc
			
 
				-	set -e;								\
			
 
				-        $(if $($(quiet)cmd_$(1)),echo '  $($(quiet)cmd_$(1))';) 	\
			
 
				-        $(cmd_$(1)); 							\
			
 
				-        ( 								\
			
 
				-          echo 'cmd_$@ := $(cmd_$(1))'; 				\
			
 
				-          echo $@: `SRCTREE=$(srctree) $(DOCPROC) depend $<`; 		\
			
 
				-        ) > $(dir $@).$(notdir $@).cmd
			
 
				-endef
			
 
				-
			
 
				-%.xml: %.tmpl $(KERNELDOC) $(DOCPROC) $(KERNELDOCXMLREF) FORCE
			
 
				-	$(call if_changed_rule,docproc)
			
 
				-
			
 
				-# Tell kbuild to always build the programs
			
 
				-always := $(hostprogs-y)
			
 
				-
			
 
				-notfoundtemplate = echo "*** You have to install docbook-utils or xmlto ***"; \
			
 
				-		   exit 1
			
 
				-db2xtemplate = db2TYPE -o $(dir $@) $<
			
 
				-xmltotemplate = xmlto TYPE $(XMLTOFLAGS) -o $(dir $@) $<
			
 
				-
			
 
				-# determine which methods are available
			
 
				-ifeq ($(shell which db2ps >/dev/null 2>&1 && echo found),found)
			
 
				-	use-db2x = db2x
			
 
				-	prefer-db2x = db2x
			
 
				-else
			
 
				-	use-db2x = notfound
			
 
				-	prefer-db2x = $(use-xmlto)
			
 
				-endif
			
 
				-ifeq ($(shell which xmlto >/dev/null 2>&1 && echo found),found)
			
 
				-	use-xmlto = xmlto
			
 
				-	prefer-xmlto = xmlto
			
 
				-else
			
 
				-	use-xmlto = notfound
			
 
				-	prefer-xmlto = $(use-db2x)
			
 
				-endif
			
 
				-
			
 
				-# the commands, generated from the chosen template
			
 
				-quiet_cmd_db2ps = PS      $@
			
 
				-      cmd_db2ps = $(subst TYPE,ps, $($(PS_METHOD)template))
			
 
				-%.ps : %.xml
			
 
				-	$(call cmd,db2ps)
			
 
				-
			
 
				-quiet_cmd_db2pdf = PDF     $@
			
 
				-      cmd_db2pdf = $(subst TYPE,pdf, $($(PDF_METHOD)template))
			
 
				-%.pdf : %.xml
			
 
				-	$(call cmd,db2pdf)
			
 
				-
			
 
				-
			
 
				-index = index.html
			
 
				-main_idx = $(obj)/$(index)
			
 
				-quiet_cmd_build_main_index = HTML    $(main_idx)
			
 
				-      cmd_build_main_index = rm -rf $(main_idx); \
			
 
				-		   echo '<h1>Linux Kernel HTML Documentation</h1>' >> $(main_idx) && \
			
 
				-		   echo '<h2>Kernel Version: $(KERNELVERSION)</h2>' >> $(main_idx) && \
			
 
				-		   cat $(HTML) >> $(main_idx)
			
 
				-
			
 
				-quiet_cmd_db2html = HTML    $@
			
 
				-      cmd_db2html = xmlto html $(XMLTOFLAGS) -o $(patsubst %.html,%,$@) $< && \
			
 
				-		echo '<a HREF="$(patsubst %.html,%,$(notdir $@))/index.html"> \
			
 
				-		$(patsubst %.html,%,$(notdir $@))</a><p>' > $@
			
 
				-
			
 
				-###
			
 
				-# Rules to create an aux XML and .db, and use them to re-process the DocBook XML
			
 
				-# to fill internal hyperlinks
			
 
				-       gen_aux_xml = :
			
 
				- quiet_gen_aux_xml = echo '  XMLREF  $@'
			
 
				-silent_gen_aux_xml = :
			
 
				-%.aux.xml: %.xml
			
 
				-	@$($(quiet)gen_aux_xml)
			
 
				-	@rm -rf $@
			
 
				-	@(cat $< | egrep "^<refentry id" | egrep -o "\".*\"" | cut -f 2 -d \" > $<.db)
			
 
				-	@$(KERNELDOCXMLREF) -db $<.db $< > $@
			
 
				-.PRECIOUS: %.aux.xml
			
 
				-
			
 
				-%.html:	%.aux.xml
			
 
				-	@(which xmlto > /dev/null 2>&1) || \
			
 
				-	 (echo "*** You need to install xmlto ***"; \
			
 
				-	  exit 1)
			
 
				-	@rm -rf $@ $(patsubst %.html,%,$@)
			
 
				-	$(call cmd,db2html)
			
 
				-	@if [ ! -z "$(PNG-$(basename $(notdir $@)))" ]; then \
			
 
				-            cp $(PNG-$(basename $(notdir $@))) $(patsubst %.html,%,$@); fi
			
 
				-
			
 
				-quiet_cmd_db2man = MAN     $@
			
 
				-      cmd_db2man = if grep -q refentry $<; then xmlto man $(XMLTOFLAGS) -o $(obj)/man/$(*F) $< ; fi
			
 
				-%.9 : %.xml
			
 
				-	@(which xmlto > /dev/null 2>&1) || \
			
 
				-	 (echo "*** You need to install xmlto ***"; \
			
 
				-	  exit 1)
			
 
				-	$(Q)mkdir -p $(obj)/man/$(*F)
			
 
				-	$(call cmd,db2man)
			
 
				-	@touch $@
			
 
				-
			
 
				-###
			
 
				-# Rules to generate postscripts and PNG images from .fig format files
			
 
				-quiet_cmd_fig2eps = FIG2EPS $@
			
 
				-      cmd_fig2eps = fig2dev -Leps $< $@
			
 
				-
			
 
				-%.eps: %.fig
			
 
				-	@(which fig2dev > /dev/null 2>&1) || \
			
 
				-	 (echo "*** You need to install transfig ***"; \
			
 
				-	  exit 1)
			
 
				-	$(call cmd,fig2eps)
			
 
				-
			
 
				-quiet_cmd_fig2png = FIG2PNG $@
			
 
				-      cmd_fig2png = fig2dev -Lpng $< $@
			
 
				-
			
 
				-%.png: %.fig
			
 
				-	@(which fig2dev > /dev/null 2>&1) || \
			
 
				-	 (echo "*** You need to install transfig ***"; \
			
 
				-	  exit 1)
			
 
				-	$(call cmd,fig2png)
			
 
				-
			
 
				-###
			
 
				-# Rule to convert a .c file to inline XML documentation
			
 
				-       gen_xml = :
			
 
				- quiet_gen_xml = echo '  GEN     $@'
			
 
				-silent_gen_xml = :
			
 
				-%.xml: %.c
			
 
				-	@$($(quiet)gen_xml)
			
 
				-	@(                            \
			
 
				-	   echo "<programlisting>";   \
			
 
				-	   expand --tabs=8 < $< |     \
			
 
				-	   sed -e "s/&/\\&amp;/g"     \
			
 
				-	       -e "s/</\\&lt;/g"      \
			
 
				-	       -e "s/>/\\&gt;/g";     \
			
 
				-	   echo "</programlisting>")  > $@
			
 
				-
			
 
				-endif # DOCBOOKS=""
			
 
				-endif # SPHINDIR=...
			
 
				-
			
 
				-###
			
 
				-# Help targets as used by the top-level makefile
			
 
				-dochelp:
			
 
				-	@echo  ' Linux kernel internal documentation in different formats (DocBook):'
			
 
				-	@echo  '  htmldocs        - HTML'
			
 
				-	@echo  '  pdfdocs         - PDF'
			
 
				-	@echo  '  psdocs          - Postscript'
			
 
				-	@echo  '  xmldocs         - XML DocBook'
			
 
				-	@echo  '  mandocs         - man pages'
			
 
				-	@echo  '  installmandocs  - install man pages generated by mandocs to INSTALL_MAN_PATH'; \
			
 
				-	 echo  '                    (default: $(INSTALL_MAN_PATH))'; \
			
 
				-	 echo  ''
			
 
				-	@echo  '  cleandocs       - clean all generated DocBook files'
			
 
				-	@echo
			
 
				-	@echo  '  make DOCBOOKS="s1.xml s2.xml" [target] Generate only docs s1.xml s2.xml'
			
 
				-	@echo  '  valid values for DOCBOOKS are: $(DOCBOOKS)'
			
 
				-	@echo
			
 
				-	@echo  "  make DOCBOOKS=\"\" [target] Don't generate docs from Docbook"
			
 
				-	@echo  '     This is useful to generate only the ReST docs (Sphinx)'
			
 
				-
			
 
				-
			
 
				-###
			
 
				-# Temporary files left by various tools
			
 
				-clean-files := $(DOCBOOKS) \
			
 
				-	$(patsubst %.xml, %.dvi,     $(DOCBOOKS)) \
			
 
				-	$(patsubst %.xml, %.aux,     $(DOCBOOKS)) \
			
 
				-	$(patsubst %.xml, %.tex,     $(DOCBOOKS)) \
			
 
				-	$(patsubst %.xml, %.log,     $(DOCBOOKS)) \
			
 
				-	$(patsubst %.xml, %.out,     $(DOCBOOKS)) \
			
 
				-	$(patsubst %.xml, %.ps,      $(DOCBOOKS)) \
			
 
				-	$(patsubst %.xml, %.pdf,     $(DOCBOOKS)) \
			
 
				-	$(patsubst %.xml, %.html,    $(DOCBOOKS)) \
			
 
				-	$(patsubst %.xml, %.9,       $(DOCBOOKS)) \
			
 
				-	$(patsubst %.xml, %.aux.xml, $(DOCBOOKS)) \
			
 
				-	$(patsubst %.xml, %.xml.db,  $(DOCBOOKS)) \
			
 
				-	$(patsubst %.xml, %.xml,     $(DOCBOOKS)) \
			
 
				-	$(patsubst %.xml, .%.xml.cmd, $(DOCBOOKS)) \
			
 
				-	$(index)
			
 
				-
			
 
				-clean-dirs := $(patsubst %.xml,%,$(DOCBOOKS)) man
			
 
				-
			
 
				-cleandocs:
			
 
				-	$(Q)rm -f $(call objectify, $(clean-files))
			
 
				-	$(Q)rm -rf $(call objectify, $(clean-dirs))
			
 
				-
			
 
				-# Declare the contents of the .PHONY variable as phony.  We keep that
			
 
				-# information in a variable so we can use it in if_changed and friends.
			
 
				-
			
 
				-.PHONY: $(PHONY)
			
--- a/Documentation/DocBook/filesystems.tmpl
+++ b/Documentation/DocBook/filesystems.tmpl
@@ -1,381 +0,0 @@
 
				-<?xml version="1.0" encoding="UTF-8"?>
			
 
				-<!DOCTYPE book PUBLIC "-//OASIS//DTD DocBook XML V4.1.2//EN"
			
 
				-	"http://www.oasis-open.org/docbook/xml/4.1.2/docbookx.dtd" []>
			
 
				-
			
 
				-<book id="Linux-filesystems-API">
			
 
				- <bookinfo>
			
 
				-  <title>Linux Filesystems API</title>
			
 
				-
			
 
				-  <legalnotice>
			
 
				-   <para>
			
 
				-     This documentation is free software; you can redistribute
			
 
				-     it and/or modify it under the terms of the GNU General Public
			
 
				-     License as published by the Free Software Foundation; either
			
 
				-     version 2 of the License, or (at your option) any later
			
 
				-     version.
			
 
				-   </para>
			
 
				-
			
 
				-   <para>
			
 
				-     This program is distributed in the hope that it will be
			
 
				-     useful, but WITHOUT ANY WARRANTY; without even the implied
			
 
				-     warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
			
 
				-     See the GNU General Public License for more details.
			
 
				-   </para>
			
 
				-
			
 
				-   <para>
			
 
				-     You should have received a copy of the GNU General Public
			
 
				-     License along with this program; if not, write to the Free
			
 
				-     Software Foundation, Inc., 59 Temple Place, Suite 330, Boston,
			
 
				-     MA 02111-1307 USA
			
 
				-   </para>
			
 
				-
			
 
				-   <para>
			
 
				-     For more details see the file COPYING in the source
			
 
				-     distribution of Linux.
			
 
				-   </para>
			
 
				-  </legalnotice>
			
 
				- </bookinfo>
			
 
				-
			
 
				-<toc></toc>
			
 
				-
			
 
				-  <chapter id="vfs">
			
 
				-     <title>The Linux VFS</title>
			
 
				-     <sect1 id="the_filesystem_types"><title>The Filesystem types</title>
			
 
				-!Iinclude/linux/fs.h
			
 
				-     </sect1>
			
 
				-     <sect1 id="the_directory_cache"><title>The Directory Cache</title>
			
 
				-!Efs/dcache.c
			
 
				-!Iinclude/linux/dcache.h
			
 
				-     </sect1>
			
 
				-     <sect1 id="inode_handling"><title>Inode Handling</title>
			
 
				-!Efs/inode.c
			
 
				-!Efs/bad_inode.c
			
 
				-     </sect1>
			
 
				-     <sect1 id="registration_and_superblocks"><title>Registration and Superblocks</title>
			
 
				-!Efs/super.c
			
 
				-     </sect1>
			
 
				-     <sect1 id="file_locks"><title>File Locks</title>
			
 
				-!Efs/locks.c
			
 
				-!Ifs/locks.c
			
 
				-     </sect1>
			
 
				-     <sect1 id="other_functions"><title>Other Functions</title>
			
 
				-!Efs/mpage.c
			
 
				-!Efs/namei.c
			
 
				-!Efs/buffer.c
			
 
				-!Eblock/bio.c
			
 
				-!Efs/seq_file.c
			
 
				-!Efs/filesystems.c
			
 
				-!Efs/fs-writeback.c
			
 
				-!Efs/block_dev.c
			
 
				-     </sect1>
			
 
				-  </chapter>
			
 
				-
			
 
				-  <chapter id="proc">
			
 
				-     <title>The proc filesystem</title>
			
 
				-
			
 
				-     <sect1 id="sysctl_interface"><title>sysctl interface</title>
			
 
				-!Ekernel/sysctl.c
			
 
				-     </sect1>
			
 
				-
			
 
				-     <sect1 id="proc_filesystem_interface"><title>proc filesystem interface</title>
			
 
				-!Ifs/proc/base.c
			
 
				-     </sect1>
			
 
				-  </chapter>
			
 
				-
			
 
				-  <chapter id="fs_events">
			
 
				-     <title>Events based on file descriptors</title>
			
 
				-!Efs/eventfd.c
			
 
				-  </chapter>
			
 
				-
			
 
				-  <chapter id="sysfs">
			
 
				-     <title>The Filesystem for Exporting Kernel Objects</title>
			
 
				-!Efs/sysfs/file.c
			
 
				-!Efs/sysfs/symlink.c
			
 
				-  </chapter>
			
 
				-
			
 
				-  <chapter id="debugfs">
			
 
				-     <title>The debugfs filesystem</title>
			
 
				-
			
 
				-     <sect1 id="debugfs_interface"><title>debugfs interface</title>
			
 
				-!Efs/debugfs/inode.c
			
 
				-!Efs/debugfs/file.c
			
 
				-     </sect1>
			
 
				-  </chapter>
			
 
				-
			
 
				-  <chapter id="LinuxJDBAPI">
			
 
				-  <chapterinfo>
			
 
				-  <title>The Linux Journalling API</title>
			
 
				-
			
 
				-  <authorgroup>
			
 
				-  <author>
			
 
				-     <firstname>Roger</firstname>
			
 
				-     <surname>Gammans</surname>
			
 
				-     <affiliation>
			
 
				-     <address>
			
 
				-      <email>rgammans@computer-surgery.co.uk</email>
			
 
				-     </address>
			
 
				-    </affiliation>
			
 
				-     </author>
			
 
				-  </authorgroup>
			
 
				-
			
 
				-  <authorgroup>
			
 
				-   <author>
			
 
				-    <firstname>Stephen</firstname>
			
 
				-    <surname>Tweedie</surname>
			
 
				-    <affiliation>
			
 
				-     <address>
			
 
				-      <email>sct@redhat.com</email>
			
 
				-     </address>
			
 
				-    </affiliation>
			
 
				-   </author>
			
 
				-  </authorgroup>
			
 
				-
			
 
				-  <copyright>
			
 
				-   <year>2002</year>
			
 
				-   <holder>Roger Gammans</holder>
			
 
				-  </copyright>
			
 
				-  </chapterinfo>
			
 
				-
			
 
				-  <title>The Linux Journalling API</title>
			
 
				-
			
 
				-    <sect1 id="journaling_overview">
			
 
				-     <title>Overview</title>
			
 
				-    <sect2 id="journaling_details">
			
 
				-     <title>Details</title>
			
 
				-<para>
			
 
				-The journalling layer is  easy to use. You need to
			
 
				-first of all create a journal_t data structure. There are
			
 
				-two calls to do this dependent on how you decide to allocate the physical
			
 
				-media on which the journal resides. The jbd2_journal_init_inode() call
			
 
				-is for journals stored in filesystem inodes, or the jbd2_journal_init_dev()
			
 
				-call can be used for journal stored on a raw device (in a continuous range
			
 
				-of blocks). A journal_t is a typedef for a struct pointer, so when
			
 
				-you are finally finished make sure you call jbd2_journal_destroy() on it
			
 
				-to free up any used kernel memory.
			
 
				-</para>
			
 
				-
			
 
				-<para>
			
 
				-Once you have got your journal_t object you need to 'mount' or load the journal
			
 
				-file. The journalling layer expects the space for the journal was already
			
 
				-allocated and initialized properly by the userspace tools.  When loading the
			
 
				-journal you must call jbd2_journal_load() to process journal contents.  If the
			
 
				-client file system detects the journal contents do not need to be processed
			
 
				-(or even need not have valid contents), it may call jbd2_journal_wipe() to
			
 
				-clear the journal contents before calling jbd2_journal_load().
			
 
				-</para>
			
 
				-
			
 
				-<para>
			
 
				-Note that jbd2_journal_wipe(..,0) calls jbd2_journal_skip_recovery() for you if
			
 
				-it detects any outstanding transactions in the journal and similarly
			
 
				-jbd2_journal_load() will call jbd2_journal_recover() if necessary.  I would
			
 
				-advise reading ext4_load_journal() in fs/ext4/super.c for examples on this
			
 
				-stage.
			
 
				-</para>
			
 
				-
			
 
				-<para>
			
 
				-Now you can go ahead and start modifying the underlying
			
 
				-filesystem. Almost.
			
 
				-</para>
			
 
				-
			
 
				-<para>
			
 
				-
			
 
				-You still need to actually journal your filesystem changes, this
			
 
				-is done by wrapping them into transactions. Additionally you
			
 
				-also need to wrap the modification of each of the buffers
			
 
				-with calls to the journal layer, so it knows what the modifications
			
 
				-you are actually making are. To do this use jbd2_journal_start() which
			
 
				-returns a transaction handle.
			
 
				-</para>
			
 
				-
			
 
				-<para>
			
 
				-jbd2_journal_start()
			
 
				-and its counterpart jbd2_journal_stop(), which indicates the end of a
			
 
				-transaction are nestable calls, so you can reenter a transaction if necessary,
			
 
				-but remember you must call jbd2_journal_stop() the same number of times as
			
 
				-jbd2_journal_start() before the transaction is completed (or more accurately
			
 
				-leaves the update phase). Ext4/VFS makes use of this feature to simplify
			
 
				-handling of inode dirtying, quota support, etc.
			
 
				-</para>
			
 
				-
			
 
				-<para>
			
 
				-Inside each transaction you need to wrap the modifications to the
			
 
				-individual buffers (blocks). Before you start to modify a buffer you
			
 
				-need to call jbd2_journal_get_{create,write,undo}_access() as appropriate,
			
 
				-this allows the journalling layer to copy the unmodified data if it
			
 
				-needs to. After all, the buffer may be part of a previously uncommitted
			
 
				-transaction.
			
 
				-At this point you are at last ready to modify a buffer, and once
			
 
				-you have done so you need to call jbd2_journal_dirty_{meta,}data().
			
 
				-Or if you've asked for access to a buffer you now know is no longer
			
 
				-required to be pushed back on the device you can call jbd2_journal_forget()
			
 
				-in much the same way as you might have used bforget() in the past.
			
 
				-</para>
			
 
				-
			
 
				-<para>
			
 
				-A jbd2_journal_flush() may be called at any time to commit and checkpoint
			
 
				-all your transactions.
			
 
				-</para>
			
 
				-
			
 
				-<para>
			
 
				-Then at umount time, in your put_super() you can then call jbd2_journal_destroy()
			
 
				-to clean up your in-core journal object.
			
 
				-</para>
			
 
				-
			
 
				-<para>
			
 
				-Unfortunately there are a couple of ways the journal layer can cause a deadlock.
			
 
				-The first thing to note is that each task can only have
			
 
				-a single outstanding transaction at any one time, remember nothing
			
 
				-commits until the outermost jbd2_journal_stop(). This means
			
 
				-you must complete the transaction at the end of each file/inode/address
			
 
				-etc. operation you perform, so that the journalling system isn't re-entered
			
 
				-on another journal. This matters because transactions can't be nested/batched
			
 
				-across differing journals, and another filesystem other than
			
 
				-yours (say ext4) may be modified in a later syscall.
			
 
				-</para>
			
 
				-
			
 
				-<para>
			
 
				-The second case to bear in mind is that jbd2_journal_start() can
			
 
				-block if there isn't enough space in the journal for your transaction
			
 
				-(based on the passed nblocks param) - when it blocks it merely(!) needs to
			
 
				-wait for transactions to complete and be committed from other tasks,
			
 
				-so essentially we are waiting for jbd2_journal_stop(). So to avoid
			
 
				-deadlocks you must treat jbd2_journal_start/stop() as if they
			
 
				-were semaphores and include them in your semaphore ordering rules to prevent
			
 
				-deadlocks. Note that jbd2_journal_extend() has similar blocking behaviour to
			
 
				-jbd2_journal_start() so you can deadlock here just as easily as on
			
 
				-jbd2_journal_start().
			
 
				-</para>
			
 
				-
			
 
				-<para>
			
 
				-Try to reserve the right number of blocks the first time. ;-). This will
			
 
				-be the maximum number of blocks you are going to touch in this transaction.
			
 
				-I advise having a look at at least ext4_jbd.h to see the basis on which
			
 
				-ext4 makes these decisions.
			
 
				-</para>
			
 
				-
			
 
				-<para>
			
 
				-Another wriggle to watch out for is your on-disk block allocation strategy.
			
 
				-Why? Because, if you do a delete, you need to ensure you haven't reused any
			
 
				-of the freed blocks until the transaction freeing these blocks commits. If you
			
 
				-reused these blocks and crash happens, there is no way to restore the contents
			
 
				-of the reallocated blocks at the end of the last fully committed transaction.
			
 
				-
			
 
				-One simple way of doing this is to mark blocks as free in internal in-memory
			
 
				-block allocation structures only after the transaction freeing them commits.
			
 
				-Ext4 uses journal commit callback for this purpose.
			
 
				-</para>
			
 
				-
			
 
				-<para>
			
 
				-With journal commit callbacks you can ask the journalling layer to call a
			
 
				-callback function when the transaction is finally committed to disk, so that
			
 
				-you can do some of your own management. You ask the journalling layer for
			
 
				-calling the callback by simply setting journal->j_commit_callback function
			
 
				-pointer and that function is called after each transaction commit. You can also
			
 
				-use transaction->t_private_list for attaching entries to a transaction that
			
 
				-need processing when the transaction commits.
			
 
				-</para>
			
 
				-
			
 
				-<para>
			
 
				-JBD2 also provides a way to block all transaction updates via
			
 
				-jbd2_journal_{un,}lock_updates(). Ext4 uses this when it wants a window with a
			
 
				-clean and stable fs for a moment.  E.g.
			
 
				-</para>
			
 
				-
			
 
				-<programlisting>
			
 
				-
			
 
				-	jbd2_journal_lock_updates() //stop new stuff happening..
			
 
				-	jbd2_journal_flush()        // checkpoint everything.
			
 
				-	..do stuff on stable fs
			
 
				-	jbd2_journal_unlock_updates() // carry on with filesystem use.
			
 
				-</programlisting>
			
 
				-
			
 
				-<para>
			
 
				-The opportunities for abuse and DOS attacks with this should be obvious,
			
 
				-if you allow unprivileged userspace to trigger codepaths containing these
			
 
				-calls.
			
 
				-</para>
			
 
				-
			
 
				-    </sect2>
			
 
				-
			
 
				-    <sect2 id="jbd_summary">
			
 
				-     <title>Summary</title>
			
 
				-<para>
			
 
				-Using the journal is a matter of wrapping the different context changes,
			
 
				-being each mount, each modification (transaction) and each changed buffer
			
 
				-to tell the journalling layer about them.
			
 
				-</para>
			
 
				-
			
 
				-    </sect2>
			
 
				-
			
 
				-    </sect1>
			
 
				-
			
 
				-    <sect1 id="data_types">
			
 
				-     <title>Data Types</title>
			
 
				-     <para>
			
 
				-	The journalling layer uses typedefs to 'hide' the concrete definitions
			
 
				-	of the structures used. As a client of the JBD2 layer you can
			
 
				-	just rely on using the pointer as a magic cookie of some sort.
			
 
				-
			
 
				-	Obviously the hiding is not enforced as this is 'C'.
			
 
				-     </para>
			
 
				-	<sect2 id="structures"><title>Structures</title>
			
 
				-!Iinclude/linux/jbd2.h
			
 
				-	</sect2>
			
 
				-    </sect1>
			
 
				-
			
 
				-    <sect1 id="functions">
			
 
				-     <title>Functions</title>
			
 
				-     <para>
			
 
				-	The functions here are split into two groups: those that
			
 
				-	affect a journal as a whole, and those which are used to
			
 
				-	manage transactions
			
 
				-     </para>
			
 
				-	<sect2 id="journal_level"><title>Journal Level</title>
			
 
				-!Efs/jbd2/journal.c
			
 
				-!Ifs/jbd2/recovery.c
			
 
				-	</sect2>
			
 
				-	<sect2 id="transaction_level"><title>Transaction Level</title>
			
 
				-!Efs/jbd2/transaction.c
			
 
				-	</sect2>
			
 
				-    </sect1>
			
 
				-    <sect1 id="see_also">
			
 
				-     <title>See also</title>
			
 
				-	<para>
			
 
				-	  <citation>
			
 
				-	   <ulink url="http://kernel.org/pub/linux/kernel/people/sct/ext3/journal-design.ps.gz">
			
 
				-	   	Journaling the Linux ext2fs Filesystem, LinuxExpo 98, Stephen Tweedie
			
 
				-	   </ulink>
			
 
				-	  </citation>
			
 
				-	</para>
			
 
				-	<para>
			
 
				-	   <citation>
			
 
				-	   <ulink url="http://olstrans.sourceforge.net/release/OLS2000-ext3/OLS2000-ext3.html">
			
 
				-	   	Ext3 Journalling FileSystem, OLS 2000, Dr. Stephen Tweedie
			
 
				-	   </ulink>
			
 
				-	   </citation>
			
 
				-	</para>
			
 
				-    </sect1>
			
 
				-
			
 
				-  </chapter>
			
 
				-
			
 
				-  <chapter id="splice">
			
 
				-      <title>splice API</title>
			
 
				-  <para>
			
 
				-	splice is a method for moving blocks of data around inside the
			
 
				-	kernel, without continually transferring them between the kernel
			
 
				-	and user space.
			
 
				-  </para>
			
 
				-!Ffs/splice.c
			
 
				-  </chapter>
			
 
				-
			
 
				-  <chapter id="pipes">
			
 
				-      <title>pipes API</title>
			
 
				-  <para>
			
 
				-	Pipe interfaces are all for in-kernel (builtin image) use.
			
 
				-	They are not exported for use by modules.
			
 
				-  </para>
			
 
				-!Iinclude/linux/pipe_fs_i.h
			
 
				-!Ffs/pipe.c
			
 
				-  </chapter>
			
 
				-
			
 
				-</book>
			
--- a/Documentation/DocBook/kernel-hacking.tmpl
+++ b/Documentation/DocBook/kernel-hacking.tmpl
@@ -1,1312 +0,0 @@
 
				-<?xml version="1.0" encoding="UTF-8"?>
			
 
				-<!DOCTYPE book PUBLIC "-//OASIS//DTD DocBook XML V4.1.2//EN"
			
 
				-	"http://www.oasis-open.org/docbook/xml/4.1.2/docbookx.dtd" []>
			
 
				-
			
 
				-<book id="lk-hacking-guide">
			
 
				- <bookinfo>
			
 
				-  <title>Unreliable Guide To Hacking The Linux Kernel</title>
			
 
				-  
			
 
				-  <authorgroup>
			
 
				-   <author>
			
 
				-    <firstname>Rusty</firstname>
			
 
				-    <surname>Russell</surname>
			
 
				-    <affiliation>
			
 
				-     <address>
			
 
				-      <email>rusty@rustcorp.com.au</email>
			
 
				-     </address>
			
 
				-    </affiliation>
			
 
				-   </author>
			
 
				-  </authorgroup>
			
 
				-
			
 
				-  <copyright>
			
 
				-   <year>2005</year>
			
 
				-   <holder>Rusty Russell</holder>
			
 
				-  </copyright>
			
 
				-
			
 
				-  <legalnotice>
			
 
				-   <para>
			
 
				-    This documentation is free software; you can redistribute
			
 
				-    it and/or modify it under the terms of the GNU General Public
			
 
				-    License as published by the Free Software Foundation; either
			
 
				-    version 2 of the License, or (at your option) any later
			
 
				-    version.
			
 
				-   </para>
			
 
				-   
			
 
				-   <para>
			
 
				-    This program is distributed in the hope that it will be
			
 
				-    useful, but WITHOUT ANY WARRANTY; without even the implied
			
 
				-    warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
			
 
				-    See the GNU General Public License for more details.
			
 
				-   </para>
			
 
				-   
			
 
				-   <para>
			
 
				-    You should have received a copy of the GNU General Public
			
 
				-    License along with this program; if not, write to the Free
			
 
				-    Software Foundation, Inc., 59 Temple Place, Suite 330, Boston,
			
 
				-    MA 02111-1307 USA
			
 
				-   </para>
			
 
				-   
			
 
				-   <para>
			
 
				-    For more details see the file COPYING in the source
			
 
				-    distribution of Linux.
			
 
				-   </para>
			
 
				-  </legalnotice>
			
 
				-
			
 
				-  <releaseinfo>
			
 
				-   This is the first release of this document as part of the kernel tarball.
			
 
				-  </releaseinfo>
			
 
				-
			
 
				- </bookinfo>
			
 
				-
			
 
				- <toc></toc>
			
 
				-
			
 
				- <chapter id="introduction">
			
 
				-  <title>Introduction</title>
			
 
				-  <para>
			
 
				-   Welcome, gentle reader, to Rusty's Remarkably Unreliable Guide to Linux
			
 
				-   Kernel Hacking.  This document describes the common routines and
			
 
				-   general requirements for kernel code: its goal is to serve as a
			
 
				-   primer for Linux kernel development for experienced C
			
 
				-   programmers.  I avoid implementation details: that's what the
			
 
				-   code is for, and I ignore whole tracts of useful routines.
			
 
				-  </para>
			
 
				-  <para>
			
 
				-   Before you read this, please understand that I never wanted to
			
 
				-   write this document, being grossly under-qualified, but I always
			
 
				-   wanted to read it, and this was the only way.  I hope it will
			
 
				-   grow into a compendium of best practice, common starting points
			
 
				-   and random information.
			
 
				-  </para>
			
 
				- </chapter>
			
 
				-
			
 
				- <chapter id="basic-players">
			
 
				-  <title>The Players</title>
			
 
				-
			
 
				-  <para>
			
 
				-   At any time each of the CPUs in a system can be:
			
 
				-  </para>
			
 
				-
			
 
				-  <itemizedlist>
			
 
				-   <listitem>
			
 
				-    <para>
			
 
				-     not associated with any process, serving a hardware interrupt;
			
 
				-    </para>
			
 
				-   </listitem>
			
 
				-
			
 
				-   <listitem>
			
 
				-    <para>
			
 
				-     not associated with any process, serving a softirq or tasklet;
			
 
				-    </para>
			
 
				-   </listitem>
			
 
				-
			
 
				-   <listitem>
			
 
				-    <para>
			
 
				-     running in kernel space, associated with a process (user context);
			
 
				-    </para>
			
 
				-   </listitem>
			
 
				-
			
 
				-   <listitem>
			
 
				-    <para>
			
 
				-     running a process in user space.
			
 
				-    </para>
			
 
				-   </listitem>
			
 
				-  </itemizedlist>
			
 
				-
			
 
				-  <para>
			
 
				-   There is an ordering between these.  The bottom two can preempt
			
 
				-   each other, but above that is a strict hierarchy: each can only be
			
 
				-   preempted by the ones above it.  For example, while a softirq is
			
 
				-   running on a CPU, no other softirq will preempt it, but a hardware
			
 
				-   interrupt can.  However, any other CPUs in the system execute
			
 
				-   independently.
			
 
				-  </para>
			
 
				-
			
 
				-  <para>
			
 
				-   We'll see a number of ways that the user context can block
			
 
				-   interrupts, to become truly non-preemptable.
			
 
				-  </para>
			
 
				-  
			
 
				-  <sect1 id="basics-usercontext">
			
 
				-   <title>User Context</title>
			
 
				-
			
 
				-   <para>
			
 
				-    User context is when you are coming in from a system call or other
			
 
				-    trap: like userspace, you can be preempted by more important tasks
			
 
				-    and by interrupts.  You can sleep, by calling
			
 
				-    <function>schedule()</function>.
			
 
				-   </para>
			
 
				-
			
 
				-   <note>
			
 
				-    <para>
			
 
				-     You are always in user context on module load and unload,
			
 
				-     and on operations on the block device layer.
			
 
				-    </para>
			
 
				-   </note>
			
 
				-
			
 
				-   <para>
			
 
				-    In user context, the <varname>current</varname> pointer (indicating 
			
 
				-    the task we are currently executing) is valid, and
			
 
				-    <function>in_interrupt()</function>
			
 
				-    (<filename>include/linux/interrupt.h</filename>) is <returnvalue>false
			
 
				-    </returnvalue>.  
			
 
				-   </para>
			
 
				-
			
 
				-   <caution>
			
 
				-    <para>
			
 
				-     Beware that if you have preemption or softirqs disabled
			
 
				-     (see below), <function>in_interrupt()</function> will return a 
			
 
				-     false positive.
			
 
				-    </para>
			
 
				-   </caution>
			
 
				-  </sect1>
			
 
				-
			
 
				-  <sect1 id="basics-hardirqs">
			
 
				-   <title>Hardware Interrupts (Hard IRQs)</title>
			
 
				-
			
 
				-   <para>
			
 
				-    Timer ticks, <hardware>network cards</hardware> and 
			
 
				-    <hardware>keyboard</hardware> are examples of real
			
 
				-    hardware which produce interrupts at any time.  The kernel runs
			
 
				-    interrupt handlers, which service the hardware.  The kernel
			
 
				-    guarantees that this handler is never re-entered: if the same
			
 
				-    interrupt arrives, it is queued (or dropped).  Because it
			
 
				-    disables interrupts, this handler has to be fast: frequently it
			
 
				-    simply acknowledges the interrupt, marks a 'software interrupt'
			
 
				-    for execution and exits.
			
 
				-   </para>
			
 
				-
			
 
				-   <para>
			
 
				-    You can tell you are in a hardware interrupt, because 
			
 
				-    <function>in_irq()</function> returns <returnvalue>true</returnvalue>.  
			
 
				-   </para>
			
 
				-   <caution>
			
 
				-    <para>
			
 
				-     Beware that this will return a false positive if interrupts are disabled 
			
 
				-     (see below).
			
 
				-    </para>
			
 
				-   </caution>
			
 
				-  </sect1>
			
 
				-
			
 
				-  <sect1 id="basics-softirqs">
			
 
				-   <title>Software Interrupt Context: Softirqs and Tasklets</title>
			
 
				-
			
 
				-   <para>
			
 
				-    Whenever a system call is about to return to userspace, or a
			
 
				-    hardware interrupt handler exits, any 'software interrupts'
			
 
				-    which are marked pending (usually by hardware interrupts) are
			
 
				-    run (<filename>kernel/softirq.c</filename>).
			
 
				-   </para>
			
 
				-
			
 
				-   <para>
			
 
				-    Much of the real interrupt handling work is done here.  Early in
			
 
				-    the transition to <acronym>SMP</acronym>, there were only 'bottom
			
 
				-    halves' (BHs), which didn't take advantage of multiple CPUs.  Shortly 
			
 
				-    after we switched from wind-up computers made of match-sticks and snot,
			
 
				-    we abandoned this limitation and switched to 'softirqs'.
			
 
				-   </para>
			
 
				-
			
 
				-   <para>
			
 
				-    <filename class="headerfile">include/linux/interrupt.h</filename> lists the
			
 
				-    different softirqs.  A very important softirq is the
			
 
				-    timer softirq (<filename
			
 
				-    class="headerfile">include/linux/timer.h</filename>): you can
			
 
				-    register to have it call functions for you in a given length of
			
 
				-    time.
			
 
				-   </para>
			
 
				-
			
 
				-   <para>
			
 
				-    Softirqs are often a pain to deal with, since the same softirq
			
 
				-    will run simultaneously on more than one CPU.  For this reason,
			
 
				-    tasklets (<filename
			
 
				-    class="headerfile">include/linux/interrupt.h</filename>) are more
			
 
				-    often used: they are dynamically-registrable (meaning you can have
			
 
				-    as many as you want), and they also guarantee that any tasklet
			
 
				-    will only run on one CPU at any time, although different tasklets
			
 
				-    can run simultaneously.
			
 
				-   </para>
			
 
				-   <caution>
			
 
				-    <para>
			
 
				-     The name 'tasklet' is misleading: they have nothing to do with 'tasks',
			
 
				-     and probably more to do with some bad vodka Alexey Kuznetsov had at the 
			
 
				-     time.
			
 
				-    </para>
			
 
				-   </caution>
			
 
				-
			
 
				-   <para>
			
 
				-    You can tell you are in a softirq (or tasklet)
			
 
				-    using the <function>in_softirq()</function> macro 
			
 
				-    (<filename class="headerfile">include/linux/interrupt.h</filename>).
			
 
				-   </para>
			
 
				-   <caution>
			
 
				-    <para>
			
 
				-     Beware that this will return a false positive if a bh lock (see below)
			
 
				-     is held.
			
 
				-    </para>
			
 
				-   </caution>
			
 
				-  </sect1>
			
 
				- </chapter>
			
 
				-
			
 
				- <chapter id="basic-rules">
			
 
				-  <title>Some Basic Rules</title>
			
 
				-
			
 
				-  <variablelist>
			
 
				-   <varlistentry>
			
 
				-    <term>No memory protection</term>
			
 
				-    <listitem>
			
 
				-     <para>
			
 
				-      If you corrupt memory, whether in user context or
			
 
				-      interrupt context, the whole machine will crash.  Are you
			
 
				-      sure you can't do what you want in userspace?
			
 
				-     </para>
			
 
				-    </listitem>
			
 
				-   </varlistentry>
			
 
				-
			
 
				-   <varlistentry>
			
 
				-    <term>No floating point or <acronym>MMX</acronym></term>
			
 
				-    <listitem>
			
 
				-     <para>
			
 
				-      The <acronym>FPU</acronym> context is not saved; even in user
			
 
				-      context the <acronym>FPU</acronym> state probably won't
			
 
				-      correspond with the current process: you would mess with some
			
 
				-      user process' <acronym>FPU</acronym> state.  If you really want
			
 
				-      to do this, you would have to explicitly save/restore the full
			
 
				-      <acronym>FPU</acronym> state (and avoid context switches).  It
			
 
				-      is generally a bad idea; use fixed point arithmetic first.
			
 
				-     </para>
			
 
				-    </listitem>
			
 
				-   </varlistentry>
			
 
				-
			
 
				-   <varlistentry>
			
 
				-    <term>A rigid stack limit</term>
			
 
				-    <listitem>
			
 
				-     <para>
			
 
				-      Depending on configuration options the kernel stack is about 3K to 6K for most 32-bit architectures: it's
			
 
				-      about 14K on most 64-bit archs, and often shared with interrupts
			
 
				-      so you can't use it all.  Avoid deep recursion and huge local
			
 
				-      arrays on the stack (allocate them dynamically instead).
			
 
				-     </para>
			
 
				-    </listitem>
			
 
				-   </varlistentry>
			
 
				-
			
 
				-   <varlistentry>
			
 
				-    <term>The Linux kernel is portable</term>
			
 
				-    <listitem>
			
 
				-     <para>
			
 
				-      Let's keep it that way.  Your code should be 64-bit clean,
			
 
				-      and endian-independent.  You should also minimize CPU
			
 
				-      specific stuff, e.g. inline assembly should be cleanly
			
 
				-      encapsulated and minimized to ease porting.  Generally it
			
 
				-      should be restricted to the architecture-dependent part of
			
 
				-      the kernel tree.
			
 
				-     </para>
			
 
				-    </listitem>
			
 
				-   </varlistentry>
			
 
				-  </variablelist>
			
 
				- </chapter>
			
 
				-
			
 
				- <chapter id="ioctls">
			
 
				-  <title>ioctls: Not writing a new system call</title>
			
 
				-
			
 
				-  <para>
			
 
				-   A system call generally looks like this
			
 
				-  </para>
			
 
				-
			
 
				-  <programlisting>
			
 
				-asmlinkage long sys_mycall(int arg)
			
 
				-{
			
 
				-        return 0; 
			
 
				-}
			
 
				-  </programlisting>
			
 
				-
			
 
				-  <para>
			
 
				-   First, in most cases you don't want to create a new system call.
			
 
				-   You create a character device and implement an appropriate ioctl
			
 
				-   for it.  This is much more flexible than system calls, doesn't have
			
 
				-   to be entered in every architecture's
			
 
				-   <filename class="headerfile">include/asm/unistd.h</filename> and
			
 
				-   <filename>arch/kernel/entry.S</filename> file, and is much more
			
 
				-   likely to be accepted by Linus.
			
 
				-  </para>
			
 
				-
			
 
				-  <para>
			
 
				-   If all your routine does is read or write some parameter, consider
			
 
				-   implementing a <function>sysfs</function> interface instead.
			
 
				-  </para>
			
 
				-
			
 
				-  <para>
			
 
				-   Inside the ioctl you're in user context to a process.  When an
			
 
				-   error occurs you return a negated errno (see
			
 
				-   <filename class="headerfile">include/linux/errno.h</filename>),
			
 
				-   otherwise you return <returnvalue>0</returnvalue>.
			
 
				-  </para>
			
 
				-
			
 
				-  <para>
			
 
				-   After you slept you should check if a signal occurred: the
			
 
				-   Unix/Linux way of handling signals is to temporarily exit the
			
 
				-   system call with the <constant>-ERESTARTSYS</constant> error.  The
			
 
				-   system call entry code will switch back to user context, process
			
 
				-   the signal handler and then your system call will be restarted
			
 
				-   (unless the user disabled that).  So you should be prepared to
			
 
				-   process the restart, e.g. if you're in the middle of manipulating
			
 
				-   some data structure.
			
 
				-  </para>
			
 
				-
			
 
				-  <programlisting>
			
 
				-if (signal_pending(current))
			
 
				-        return -ERESTARTSYS;
			
 
				-  </programlisting>
			
 
				-
			
 
				-  <para>
			
 
				-   If you're doing longer computations: first think userspace. If you
			
 
				-   <emphasis>really</emphasis> want to do it in kernel you should
			
 
				-   regularly check if you need to give up the CPU (remember there is
			
 
				-   cooperative multitasking per CPU).  Idiom:
			
 
				-  </para>
			
 
				-
			
 
				-  <programlisting>
			
 
				-cond_resched(); /* Will sleep */ 
			
 
				-  </programlisting>
			
 
				-
			
 
				-  <para>
			
 
				-   A short note on interface design: the UNIX system call motto is
			
 
				-   "Provide mechanism not policy".
			
 
				-  </para>
			
 
				- </chapter>
			
 
				-
			
 
				- <chapter id="deadlock-recipes">
			
 
				-  <title>Recipes for Deadlock</title>
			
 
				-
			
 
				-  <para>
			
 
				-   You cannot call any routines which may sleep, unless:
			
 
				-  </para>
			
 
				-  <itemizedlist>
			
 
				-   <listitem>
			
 
				-    <para>
			
 
				-     You are in user context.
			
 
				-    </para>
			
 
				-   </listitem>
			
 
				-
			
 
				-   <listitem>
			
 
				-    <para>
			
 
				-     You do not own any spinlocks.
			
 
				-    </para>
			
 
				-   </listitem>
			
 
				-
			
 
				-   <listitem>
			
 
				-    <para>
			
 
				-     You have interrupts enabled (actually, Andi Kleen says
			
 
				-     that the scheduling code will enable them for you, but
			
 
				-     that's probably not what you wanted).
			
 
				-    </para>
			
 
				-   </listitem>
			
 
				-  </itemizedlist>
			
 
				-
			
 
				-  <para>
			
 
				-   Note that some functions may sleep implicitly: common ones are
			
 
				-   the user space access functions (*_user) and memory allocation
			
 
				-   functions without <symbol>GFP_ATOMIC</symbol>.
			
 
				-  </para>
			
 
				-
			
 
				-  <para>
			
 
				-   You should always compile your kernel with
			
 
				-   <symbol>CONFIG_DEBUG_ATOMIC_SLEEP</symbol> on, and it will warn
			
 
				-   you if you break these rules.  If you <emphasis>do</emphasis> break
			
 
				-   the rules, you will eventually lock up your box.
			
 
				-  </para>
			
 
				-
			
 
				-  <para>
			
 
				-   Really.
			
 
				-  </para>
			
 
				- </chapter>
			
 
				-
			
 
				- <chapter id="common-routines">
			
 
				-  <title>Common Routines</title>
			
 
				-
			
 
				-  <sect1 id="routines-printk">
			
 
				-   <title>
			
 
				-    <function>printk()</function>
			
 
				-    <filename class="headerfile">include/linux/kernel.h</filename>
			
 
				-   </title>
			
 
				-
			
 
				-   <para>
			
 
				-    <function>printk()</function> feeds kernel messages to the
			
 
				-    console, dmesg, and the syslog daemon.  It is useful for debugging
			
 
				-    and reporting errors, and can be used inside interrupt context,
			
 
				-    but use with caution: a machine which has its console flooded with
			
 
				-    printk messages is unusable.  It uses a format string mostly
			
 
				-    compatible with ANSI C printf, and C string concatenation to give
			
 
				-    it a first "priority" argument:
			
 
				-   </para>
			
 
				-
			
 
				-   <programlisting>
			
 
				-printk(KERN_INFO "i = %u\n", i);
			
 
				-   </programlisting>
			
 
				-
			
 
				-   <para>
			
 
				-    See <filename class="headerfile">include/linux/kernel.h</filename>
			
 
				-    for other KERN_ values; these are interpreted by syslog as the
			
 
				-    level.  Special case: for printing an IP address use
			
 
				-   </para>
			
 
				-
			
 
				-   <programlisting>
			
 
				-__be32 ipaddress;
			
 
				-printk(KERN_INFO "my ip: %pI4\n", &amp;ipaddress);
			
 
				-   </programlisting>
			
 
				-
			
 
				-   <para>
			
 
				-    <function>printk()</function> internally uses a 1K buffer and does
			
 
				-    not catch overruns.  Make sure that will be enough.
			
 
				-   </para>
			
 
				-
			
 
				-   <note>
			
 
				-    <para>
			
 
				-     You will know when you are a real kernel hacker
			
 
				-     when you start typoing printf as printk in your user programs :)
			
 
				-    </para>
			
 
				-   </note>
			
 
				-
			
 
				-   <!--- From the Lions book reader department --> 
			
 
				-
			
 
				-   <note>
			
 
				-    <para>
			
 
				-     Another sidenote: the original Unix Version 6 sources had a
			
 
				-     comment on top of its printf function: "Printf should not be
			
 
				-     used for chit-chat".  You should follow that advice.
			
 
				-    </para>
			
 
				-   </note>
			
 
				-  </sect1>
			
 
				-
			
 
				-  <sect1 id="routines-copy">
			
 
				-   <title>
			
 
				-    <function>copy_[to/from]_user()</function>
			
 
				-    /
			
 
				-    <function>get_user()</function>
			
 
				-    /
			
 
				-    <function>put_user()</function>
			
 
				-    <filename class="headerfile">include/linux/uaccess.h</filename>
			
 
				-   </title>  
			
 
				-
			
 
				-   <para>
			
 
				-    <emphasis>[SLEEPS]</emphasis>
			
 
				-   </para>
			
 
				-
			
 
				-   <para>
			
 
				-    <function>put_user()</function> and <function>get_user()</function>
			
 
				-    are used to get and put single values (such as an int, char, or
			
 
				-    long) from and to userspace.  A pointer into userspace should
			
 
				-    never be simply dereferenced: data should be copied using these
			
 
				-    routines.  Both return <constant>-EFAULT</constant> or 0.
			
 
				-   </para>
			
 
				-   <para>
			
 
				-    <function>copy_to_user()</function> and
			
 
				-    <function>copy_from_user()</function> are more general: they copy
			
 
				-    an arbitrary amount of data to and from userspace.
			
 
				-    <caution>
			
 
				-     <para>
			
 
				-      Unlike <function>put_user()</function> and
			
 
				-      <function>get_user()</function>, they return the amount of
			
 
				-      uncopied data (ie. <returnvalue>0</returnvalue> still means
			
 
				-      success).
			
 
				-     </para>
			
 
				-    </caution>
			
 
				-    [Yes, this moronic interface makes me cringe.  The flamewar comes up every year or so. --RR.]
			
 
				-   </para>
			
 
				-   <para>
			
 
				-    The functions may sleep implicitly. They should never be called
			
 
				-    outside user context (it makes no sense), with interrupts
			
 
				-    disabled, or a spinlock held.
			
 
				-   </para>
			
 
				-  </sect1>
			
 
				-
			
 
				-  <sect1 id="routines-kmalloc">
			
 
				-   <title><function>kmalloc()</function>/<function>kfree()</function>
			
 
				-    <filename class="headerfile">include/linux/slab.h</filename></title>
			
 
				-
			
 
				-   <para>
			
 
				-    <emphasis>[MAY SLEEP: SEE BELOW]</emphasis>
			
 
				-   </para>
			
 
				-
			
 
				-   <para>
			
 
				-    These routines are used to dynamically request pointer-aligned
			
 
				-    chunks of memory, like malloc and free do in userspace, but
			
 
				-    <function>kmalloc()</function> takes an extra flag word.
			
 
				-    Important values:
			
 
				-   </para>
			
 
				-
			
 
				-   <variablelist>
			
 
				-    <varlistentry>
			
 
				-     <term>
			
 
				-      <constant>
			
 
				-       GFP_KERNEL
			
 
				-      </constant>
			
 
				-     </term>
			
 
				-     <listitem>
			
 
				-      <para>
			
 
				-       May sleep and swap to free memory. Only allowed in user
			
 
				-       context, but is the most reliable way to allocate memory.
			
 
				-      </para>
			
 
				-     </listitem>
			
 
				-    </varlistentry>
			
 
				-    
			
 
				-    <varlistentry>
			
 
				-     <term>
			
 
				-      <constant>
			
 
				-       GFP_ATOMIC
			
 
				-      </constant>
			
 
				-     </term>
			
 
				-     <listitem>
			
 
				-      <para>
			
 
				-       Don't sleep. Less reliable than <constant>GFP_KERNEL</constant>,
			
 
				-       but may be called from interrupt context. You should
			
 
				-       <emphasis>really</emphasis> have a good out-of-memory
			
 
				-       error-handling strategy.
			
 
				-      </para>
			
 
				-     </listitem>
			
 
				-    </varlistentry>
			
 
				-    
			
 
				-    <varlistentry>
			
 
				-     <term>
			
 
				-      <constant>
			
 
				-       GFP_DMA
			
 
				-      </constant>
			
 
				-     </term>
			
 
				-     <listitem>
			
 
				-      <para>
			
 
				-       Allocate ISA DMA lower than 16MB. If you don't know what that
			
 
				-       is you don't need it.  Very unreliable.
			
 
				-      </para>
			
 
				-     </listitem>
			
 
				-    </varlistentry>
			
 
				-   </variablelist>
			
 
				-
			
 
				-   <para>
			
 
				-    If you see a <errorname>sleeping function called from invalid
			
 
				-    context</errorname> warning message, then maybe you called a
			
 
				-    sleeping allocation function from interrupt context without
			
 
				-    <constant>GFP_ATOMIC</constant>.  You should really fix that.
			
 
				-    Run, don't walk.
			
 
				-   </para>
			
 
				-
			
 
				-   <para>
			
 
				-    If you are allocating at least <constant>PAGE_SIZE</constant>
			
 
				-    (<filename class="headerfile">include/asm/page.h</filename>) bytes,
			
 
				-    consider using <function>__get_free_pages()</function>
			
 
				-
			
 
				-    (<filename class="headerfile">include/linux/mm.h</filename>).  It
			
 
				-    takes an order argument (0 for page sized, 1 for double page, 2
			
 
				-    for four pages etc.) and the same memory priority flag word as
			
 
				-    above.
			
 
				-   </para>
			
 
				-
			
 
				-   <para>
			
 
				-    If you are allocating more than a page worth of bytes you can use
			
 
				-    <function>vmalloc()</function>.  It'll allocate virtual memory in
			
 
				-    the kernel map.  This block is not contiguous in physical memory,
			
 
				-    but the <acronym>MMU</acronym> makes it look like it is for you
			
 
				-    (so it'll only look contiguous to the CPUs, not to external device
			
 
				-    drivers).  If you really need large physically contiguous memory
			
 
				-    for some weird device, you have a problem: it is poorly supported
			
 
				-    in Linux because after some time memory fragmentation in a running
			
 
				-    kernel makes it hard.  The best way is to allocate the block early
			
 
				-    in the boot process via the <function>alloc_bootmem()</function>
			
 
				-    routine.
			
 
				-   </para>
			
 
				-
			
 
				-   <para>
			
 
				-    Before inventing your own cache of often-used objects consider
			
 
				-    using a slab cache in
			
 
				-    <filename class="headerfile">include/linux/slab.h</filename>
			
 
				-   </para>
			
 
				-  </sect1>
			
 
				-
			
 
				-  <sect1 id="routines-current">
			
 
				-   <title><function>current</function>
			
 
				-    <filename class="headerfile">include/asm/current.h</filename></title>
			
 
				-
			
 
				-   <para>
			
 
				-    This global variable (really a macro) contains a pointer to
			
 
				-    the current task structure, so is only valid in user context.
			
 
				-    For example, when a process makes a system call, this will
			
 
				-    point to the task structure of the calling process.  It is
			
 
				-    <emphasis>not NULL</emphasis> in interrupt context.
			
 
				-   </para>
			
 
				-  </sect1>
			
 
				-
			
 
				-  <sect1 id="routines-udelay">
			
 
				-   <title><function>mdelay()</function>/<function>udelay()</function>
			
 
				-     <filename class="headerfile">include/asm/delay.h</filename>
			
 
				-     <filename class="headerfile">include/linux/delay.h</filename>
			
 
				-   </title>
			
 
				-
			
 
				-   <para>
			
 
				-    The <function>udelay()</function> and <function>ndelay()</function> functions can be used for small pauses.
			
 
				-    Do not use large values with them as you risk
			
 
				-    overflow - the helper function <function>mdelay()</function> is useful
			
 
				-    here, or consider <function>msleep()</function>.
			
 
				-   </para> 
			
 
				-  </sect1>
			
 
				- 
			
 
				-  <sect1 id="routines-endian">
			
 
				-   <title><function>cpu_to_be32()</function>/<function>be32_to_cpu()</function>/<function>cpu_to_le32()</function>/<function>le32_to_cpu()</function>
			
 
				-     <filename class="headerfile">include/asm/byteorder.h</filename>
			
 
				-   </title>
			
 
				-
			
 
				-   <para>
			
 
				-    The <function>cpu_to_be32()</function> family (where the "32" can
			
 
				-    be replaced by 64 or 16, and the "be" can be replaced by "le") are
			
 
				-    the general way to do endian conversions in the kernel: they
			
 
				-    return the converted value.  All variations supply the reverse as
			
 
				-    well: <function>be32_to_cpu()</function>, etc.
			
 
				-   </para>
			
 
				-
			
 
				-   <para>
			
 
				-    There are two major variations of these functions: the pointer
			
 
				-    variation, such as <function>cpu_to_be32p()</function>, which take
			
 
				-    a pointer to the given type, and return the converted value.  The
			
 
				-    other variation is the "in-situ" family, such as
			
 
				-    <function>cpu_to_be32s()</function>, which convert the value referred
			
 
				-    to by the pointer, and return void.
			
 
				-   </para> 
			
 
				-  </sect1>
			
 
				-
			
 
				-  <sect1 id="routines-local-irqs">
			
 
				-   <title><function>local_irq_save()</function>/<function>local_irq_restore()</function>
			
 
				-    <filename class="headerfile">include/linux/irqflags.h</filename>
			
 
				-   </title>
			
 
				-
			
 
				-   <para>
			
 
				-    These routines disable hard interrupts on the local CPU, and
			
 
				-    restore them.  They are reentrant; saving the previous state in
			
 
				-    their one <varname>unsigned long flags</varname> argument.  If you
			
 
				-    know that interrupts are enabled, you can simply use
			
 
				-    <function>local_irq_disable()</function> and
			
 
				-    <function>local_irq_enable()</function>.
			
 
				-   </para>
			
 
				-  </sect1>
			
 
				-
			
 
				-  <sect1 id="routines-softirqs">
			
 
				-   <title><function>local_bh_disable()</function>/<function>local_bh_enable()</function>
			
 
				-    <filename class="headerfile">include/linux/interrupt.h</filename></title>
			
 
				-
			
 
				-   <para>
			
 
				-    These routines disable soft interrupts on the local CPU, and
			
 
				-    restore them.  They are reentrant; if soft interrupts were
			
 
				-    disabled before, they will still be disabled after this pair
			
 
				-    of functions has been called.  They prevent softirqs and tasklets
			
 
				-    from running on the current CPU.
			
 
				-   </para>
			
 
				-  </sect1>
			
 
				-
			
 
				-  <sect1 id="routines-processorids">
			
 
				-   <title><function>smp_processor_id</function>()
			
 
				-    <filename class="headerfile">include/asm/smp.h</filename></title>
			
 
				-   
			
 
				-   <para>
			
 
				-    <function>get_cpu()</function> disables preemption (so you won't
			
 
				-    suddenly get moved to another CPU) and returns the current
			
 
				-    processor number, between 0 and <symbol>NR_CPUS</symbol>.  Note
			
 
				-    that the CPU numbers are not necessarily continuous.  You return
			
 
				-    it again with <function>put_cpu()</function> when you are done.
			
 
				-   </para>
			
 
				-   <para>
			
 
				-    If you know you cannot be preempted by another task (ie. you are
			
 
				-    in interrupt context, or have preemption disabled) you can use
			
 
				-    smp_processor_id().
			
 
				-   </para>
			
 
				-  </sect1>
			
 
				-
			
 
				-  <sect1 id="routines-init">
			
 
				-   <title><type>__init</type>/<type>__exit</type>/<type>__initdata</type>
			
 
				-    <filename class="headerfile">include/linux/init.h</filename></title>
			
 
				-
			
 
				-   <para>
			
 
				-    After boot, the kernel frees up a special section; functions
			
 
				-    marked with <type>__init</type> and data structures marked with
			
 
				-    <type>__initdata</type> are dropped after boot is complete: similarly
			
 
				-    modules discard this memory after initialization.  <type>__exit</type>
			
 
				-    is used to declare a function which is only required on exit: the
			
 
				-    function will be dropped if this file is not compiled as a module.
			
 
				-    See the header file for use. Note that it makes no sense for a function
			
 
				-    marked with <type>__init</type> to be exported to modules with 
			
 
				-    <function>EXPORT_SYMBOL()</function> - this will break.
			
 
				-   </para>
			
 
				-
			
 
				-  </sect1>
			
 
				-
			
 
				-  <sect1 id="routines-init-again">
			
 
				-   <title><function>__initcall()</function>/<function>module_init()</function>
			
 
				-    <filename class="headerfile">include/linux/init.h</filename></title>
			
 
				-   <para>
			
 
				-    Many parts of the kernel are well served as a module
			
 
				-    (dynamically-loadable parts of the kernel).  Using the
			
 
				-    <function>module_init()</function> and
			
 
				-    <function>module_exit()</function> macros it is easy to write code
			
 
				-    without #ifdefs which can operate both as a module or built into
			
 
				-    the kernel.
			
 
				-   </para>
			
 
				-
			
 
				-   <para>
			
 
				-    The <function>module_init()</function> macro defines which
			
 
				-    function is to be called at module insertion time (if the file is
			
 
				-    compiled as a module), or at boot time: if the file is not
			
 
				-    compiled as a module the <function>module_init()</function> macro
			
 
				-    becomes equivalent to <function>__initcall()</function>, which
			
 
				-    through linker magic ensures that the function is called on boot.
			
 
				-   </para>
			
 
				-
			
 
				-   <para>
			
 
				-    The function can return a negative error number to cause
			
 
				-    module loading to fail (unfortunately, this has no effect if
			
 
				-    the module is compiled into the kernel).  This function is
			
 
				-    called in user context with interrupts enabled, so it can sleep.
			
 
				-   </para>
			
 
				-  </sect1>
			
 
				-  
			
 
				-  <sect1 id="routines-moduleexit">
			
 
				-   <title> <function>module_exit()</function>
			
 
				-    <filename class="headerfile">include/linux/init.h</filename> </title>
			
 
				-
			
 
				-   <para>
			
 
				-    This macro defines the function to be called at module removal
			
 
				-    time (or never, in the case of the file compiled into the
			
 
				-    kernel).  It will only be called if the module usage count has
			
 
				-    reached zero.  This function can also sleep, but cannot fail:
			
 
				-    everything must be cleaned up by the time it returns.
			
 
				-   </para>
			
 
				-
			
 
				-   <para>
			
 
				-    Note that this macro is optional: if it is not present, your
			
 
				-    module will not be removable (except for 'rmmod -f').
			
 
				-   </para>
			
 
				-  </sect1>
			
 
				-
			
 
				-  <sect1 id="routines-module-use-counters">
			
 
				-   <title> <function>try_module_get()</function>/<function>module_put()</function>
			
 
				-    <filename class="headerfile">include/linux/module.h</filename></title>
			
 
				-
			
 
				-   <para>
			
 
				-    These manipulate the module usage count, to protect against
			
 
				-    removal (a module also can't be removed if another module uses one
			
 
				-    of its exported symbols: see below).  Before calling into module
			
 
				-    code, you should call <function>try_module_get()</function> on
			
 
				-    that module: if it fails, then the module is being removed and you
			
 
				-    should act as if it wasn't there.  Otherwise, you can safely enter
			
 
				-    the module, and call <function>module_put()</function> when you're
			
 
				-    finished.
			
 
				-   </para>
			
 
				-
			
 
				-   <para>
			
 
				-   Most registerable structures have an
			
 
				-   <structfield>owner</structfield> field, such as in the
			
 
				-   <structname>file_operations</structname> structure. Set this field
			
 
				-   to the macro <symbol>THIS_MODULE</symbol>.
			
 
				-   </para>
			
 
				-  </sect1>
			
 
				-
			
 
				- <!-- add info on new-style module refcounting here -->
			
 
				- </chapter>
			
 
				-
			
 
				- <chapter id="queues">
			
 
				-  <title>Wait Queues
			
 
				-   <filename class="headerfile">include/linux/wait.h</filename>
			
 
				-  </title>
			
 
				-  <para>
			
 
				-   <emphasis>[SLEEPS]</emphasis>
			
 
				-  </para>
			
 
				-
			
 
				-  <para>
			
 
				-   A wait queue is used to wait for someone to wake you up when a
			
 
				-   certain condition is true.  They must be used carefully to ensure
			
 
				-   there is no race condition.  You declare a
			
 
				-   <type>wait_queue_head_t</type>, and then processes which want to
			
 
				-   wait for that condition declare a <type>wait_queue_t</type>
			
 
				-   referring to themselves, and place that in the queue.
			
 
				-  </para>
			
 
				-
			
 
				-  <sect1 id="queue-declaring">
			
 
				-   <title>Declaring</title>
			
 
				-   
			
 
				-   <para>
			
 
				-    You declare a <type>wait_queue_head_t</type> using the
			
 
				-    <function>DECLARE_WAIT_QUEUE_HEAD()</function> macro, or using the
			
 
				-    <function>init_waitqueue_head()</function> routine in your
			
 
				-    initialization code.
			
 
				-   </para>
			
 
				-  </sect1>
			
 
				-  
			
 
				-  <sect1 id="queue-waitqueue">
			
 
				-   <title>Queuing</title>
			
 
				-   
			
 
				-   <para>
			
 
				-    Placing yourself in the waitqueue is fairly complex, because you
			
 
				-    must put yourself in the queue before checking the condition.
			
 
				-    There is a macro to do this:
			
 
				-    <function>wait_event_interruptible()</function>
			
 
				-
			
 
				-    <filename class="headerfile">include/linux/wait.h</filename> The
			
 
				-    first argument is the wait queue head, and the second is an
			
 
				-    expression which is evaluated; the macro returns
			
 
				-    <returnvalue>0</returnvalue> when this expression is true, or
			
 
				-    <returnvalue>-ERESTARTSYS</returnvalue> if a signal is received.
			
 
				-    The <function>wait_event()</function> version ignores signals.
			
 
				-   </para>
			
 
				- 
			
 
				-  </sect1>
			
 
				-
			
 
				-  <sect1 id="queue-waking">
			
 
				-   <title>Waking Up Queued Tasks</title>
			
 
				-   
			
 
				-   <para>
			
 
				-    Call <function>wake_up()</function>
			
 
				-
			
 
				-    <filename class="headerfile">include/linux/wait.h</filename>,
			
 
				-    which will wake up every process in the queue.  The exception is
			
 
				-    if one has <constant>TASK_EXCLUSIVE</constant> set, in which case
			
 
				-    the remainder of the queue will not be woken.  There are other variants
			
 
				-    of this basic function available in the same header.
			
 
				-   </para>
			
 
				-  </sect1>
			
 
				- </chapter>
			
 
				-
			
 
				- <chapter id="atomic-ops">
			
 
				-  <title>Atomic Operations</title>
			
 
				-
			
 
				-  <para>
			
 
				-   Certain operations are guaranteed atomic on all platforms.  The
			
 
				-   first class of operations work on <type>atomic_t</type>
			
 
				-
			
 
				-   <filename class="headerfile">include/asm/atomic.h</filename>; this
			
 
				-   contains a signed integer (at least 32 bits long), and you must use
			
 
				-   these functions to manipulate or read atomic_t variables.
			
 
				-   <function>atomic_read()</function> and
			
 
				-   <function>atomic_set()</function> get and set the counter,
			
 
				-   <function>atomic_add()</function>,
			
 
				-   <function>atomic_sub()</function>,
			
 
				-   <function>atomic_inc()</function>,
			
 
				-   <function>atomic_dec()</function>, and
			
 
				-   <function>atomic_dec_and_test()</function> (returns
			
 
				-   <returnvalue>true</returnvalue> if it was decremented to zero).
			
 
				-  </para>
			
 
				-
			
 
				-  <para>
			
 
				-   Yes.  It returns <returnvalue>true</returnvalue> (i.e. != 0) if the
			
 
				-   atomic variable is zero.
			
 
				-  </para>
			
 
				-
			
 
				-  <para>
			
 
				-   Note that these functions are slower than normal arithmetic, and
			
 
				-   so should not be used unnecessarily.
			
 
				-  </para>
			
 
				-
			
 
				-  <para>
			
 
				-   The second class of atomic operations is atomic bit operations on an
			
 
				-   <type>unsigned long</type>, defined in
			
 
				-
			
 
				-   <filename class="headerfile">include/linux/bitops.h</filename>.  These
			
 
				-   operations generally take a pointer to the bit pattern, and a bit
			
 
				-   number: 0 is the least significant bit.
			
 
				-   <function>set_bit()</function>, <function>clear_bit()</function>
			
 
				-   and <function>change_bit()</function> set, clear, and flip the
			
 
				-   given bit.  <function>test_and_set_bit()</function>,
			
 
				-   <function>test_and_clear_bit()</function> and
			
 
				-   <function>test_and_change_bit()</function> do the same thing,
			
 
				-   except return true if the bit was previously set; these are
			
 
				-   particularly useful for atomically setting flags.
			
 
				-  </para>
			
 
				-  
			
 
				-  <para>
			
 
				-   It is possible to call these operations with bit indices greater
			
 
				-   than BITS_PER_LONG.  The resulting behavior is strange on big-endian
			
 
				-   platforms though so it is a good idea not to do this.
			
 
				-  </para>
			
 
				- </chapter>
			
 
				-
			
 
				- <chapter id="symbols">
			
 
				-  <title>Symbols</title>
			
 
				-
			
 
				-  <para>
			
 
				-   Within the kernel proper, the normal linking rules apply
			
 
				-   (ie. unless a symbol is declared to be file scope with the
			
 
				-   <type>static</type> keyword, it can be used anywhere in the
			
 
				-   kernel).  However, for modules, a special exported symbol table is
			
 
				-   kept which limits the entry points to the kernel proper.  Modules
			
 
				-   can also export symbols.
			
 
				-  </para>
			
 
				-
			
 
				-  <sect1 id="sym-exportsymbols">
			
 
				-   <title><function>EXPORT_SYMBOL()</function>
			
 
				-    <filename class="headerfile">include/linux/export.h</filename></title>
			
 
				-
			
 
				-   <para>
			
 
				-    This is the classic method of exporting a symbol: dynamically
			
 
				-    loaded modules will be able to use the symbol as normal.
			
 
				-   </para>
			
 
				-  </sect1>
			
 
				-
			
 
				-  <sect1 id="sym-exportsymbols-gpl">
			
 
				-   <title><function>EXPORT_SYMBOL_GPL()</function>
			
 
				-    <filename class="headerfile">include/linux/export.h</filename></title>
			
 
				-
			
 
				-   <para>
			
 
				-    Similar to <function>EXPORT_SYMBOL()</function> except that the
			
 
				-    symbols exported by <function>EXPORT_SYMBOL_GPL()</function> can
			
 
				-    only be seen by modules with a
			
 
				-    <function>MODULE_LICENSE()</function> that specifies a GPL
			
 
				-    compatible license.  It implies that the function is considered
			
 
				-    an internal implementation issue, and not really an interface.
			
 
				-    Some maintainers and developers may however
			
 
				-    require EXPORT_SYMBOL_GPL() when adding any new APIs or functionality.
			
 
				-   </para>
			
 
				-  </sect1>
			
 
				- </chapter>
			
 
				-
			
 
				- <chapter id="conventions">
			
 
				-  <title>Routines and Conventions</title>
			
 
				-
			
 
				-  <sect1 id="conventions-doublelinkedlist">
			
 
				-   <title>Double-linked lists
			
 
				-    <filename class="headerfile">include/linux/list.h</filename></title>
			
 
				-
			
 
				-   <para>
			
 
				-    There used to be three sets of linked-list routines in the kernel
			
 
				-    headers, but this one is the winner.  If you don't have some
			
 
				-    particular pressing need for a single list, it's a good choice.
			
 
				-   </para>
			
 
				-
			
 
				-   <para>
			
 
				-    In particular, <function>list_for_each_entry</function> is useful.
			
 
				-   </para>
			
 
				-  </sect1>
			
 
				-
			
 
				-  <sect1 id="convention-returns">
			
 
				-   <title>Return Conventions</title>
			
 
				-
			
 
				-   <para>
			
 
				-    For code called in user context, it's very common to defy C
			
 
				-    convention, and return <returnvalue>0</returnvalue> for success,
			
 
				-    and a negative error number
			
 
				-    (eg. <returnvalue>-EFAULT</returnvalue>) for failure.  This can be
			
 
				-    unintuitive at first, but it's fairly widespread in the kernel.
			
 
				-   </para>
			
 
				-
			
 
				-   <para>
			
 
				-    Using <function>ERR_PTR()</function>
			
 
				-
			
 
				-    <filename class="headerfile">include/linux/err.h</filename> to
			
 
				-    encode a negative error number into a pointer, and
			
 
				-    <function>IS_ERR()</function> and <function>PTR_ERR()</function>
			
 
				-    to get it back out again: avoids a separate pointer parameter for
			
 
				-    the error number.  Icky, but in a good way.
			
 
				-   </para>
			
 
				-  </sect1>
			
 
				-
			
 
				-  <sect1 id="conventions-borkedcompile">
			
 
				-   <title>Breaking Compilation</title>
			
 
				-
			
 
				-   <para>
			
 
				-    Linus and the other developers sometimes change function or
			
 
				-    structure names in development kernels; this is not done just to
			
 
				-    keep everyone on their toes: it reflects a fundamental change
			
 
				-    (eg. can no longer be called with interrupts on, or does extra
			
 
				-    checks, or doesn't do checks which were caught before).  Usually
			
 
				-    this is accompanied by a fairly complete note to the linux-kernel
			
 
				-    mailing list; search the archive.  Simply doing a global replace
			
 
				-    on the file usually makes things <emphasis>worse</emphasis>.
			
 
				-   </para>
			
 
				-  </sect1>
			
 
				-
			
 
				-  <sect1 id="conventions-initialising">
			
 
				-   <title>Initializing structure members</title>
			
 
				-
			
 
				-   <para>
			
 
				-    The preferred method of initializing structures is to use
			
 
				-    designated initialisers, as defined by ISO C99, eg:
			
 
				-   </para>
			
 
				-   <programlisting>
			
 
				-static struct block_device_operations opt_fops = {
			
 
				-        .open               = opt_open,
			
 
				-        .release            = opt_release,
			
 
				-        .ioctl              = opt_ioctl,
			
 
				-        .check_media_change = opt_media_change,
			
 
				-};
			
 
				-   </programlisting>
			
 
				-   <para>
			
 
				-    This makes it easy to grep for, and makes it clear which
			
 
				-    structure fields are set.  You should do this because it looks
			
 
				-    cool.
			
 
				-   </para>
			
 
				-  </sect1>
			
 
				-
			
 
				-  <sect1 id="conventions-gnu-extns">
			
 
				-   <title>GNU Extensions</title>
			
 
				-
			
 
				-   <para>
			
 
				-    GNU Extensions are explicitly allowed in the Linux kernel.
			
 
				-    Note that some of the more complex ones are not very well
			
 
				-    supported, due to lack of general use, but the following are
			
 
				-    considered standard (see the GCC info page section "C
			
 
				-    Extensions" for more details - Yes, really the info page, the
			
 
				-    man page is only a short summary of the stuff in info).
			
 
				-   </para>
			
 
				-   <itemizedlist>
			
 
				-    <listitem>
			
 
				-     <para>
			
 
				-      Inline functions
			
 
				-     </para>
			
 
				-    </listitem>
			
 
				-    <listitem>
			
 
				-     <para>
			
 
				-      Statement expressions (ie. the ({ and }) constructs).
			
 
				-     </para>
			
 
				-    </listitem>
			
 
				-    <listitem>
			
 
				-     <para>
			
 
				-      Declaring attributes of a function / variable / type
			
 
				-      (__attribute__)
			
 
				-     </para>
			
 
				-    </listitem>
			
 
				-    <listitem>
			
 
				-     <para>
			
 
				-      typeof
			
 
				-     </para>
			
 
				-    </listitem>
			
 
				-    <listitem>
			
 
				-     <para>
			
 
				-      Zero length arrays
			
 
				-     </para>
			
 
				-    </listitem>
			
 
				-    <listitem>
			
 
				-     <para>
			
 
				-      Macro varargs
			
 
				-     </para>
			
 
				-    </listitem>
			
 
				-    <listitem>
			
 
				-     <para>
			
 
				-      Arithmetic on void pointers
			
 
				-     </para>
			
 
				-    </listitem>
			
 
				-    <listitem>
			
 
				-     <para>
			
 
				-      Non-Constant initializers
			
 
				-     </para>
			
 
				-    </listitem>
			
 
				-    <listitem>
			
 
				-     <para>
			
 
				-      Assembler Instructions (not outside arch/ and include/asm/)
			
 
				-     </para>
			
 
				-    </listitem>
			
 
				-    <listitem>
			
 
				-     <para>
			
 
				-      Function names as strings (__func__).
			
 
				-     </para>
			
 
				-    </listitem>
			
 
				-    <listitem>
			
 
				-     <para>
			
 
				-      __builtin_constant_p()
			
 
				-     </para>
			
 
				-    </listitem>
			
 
				-   </itemizedlist>
			
 
				-
			
 
				-   <para>
			
 
				-    Be wary when using long long in the kernel, the code gcc generates for
			
 
				-    it is horrible and worse: division and multiplication do not work
			
 
				-    on i386 because the GCC runtime functions for it are missing from
			
 
				-    the kernel environment.
			
 
				-   </para>
			
 
				-
			
 
				-    <!-- FIXME: add a note about ANSI aliasing cleanness -->
			
 
				-  </sect1>
			
 
				-
			
 
				-  <sect1 id="conventions-cplusplus">
			
 
				-   <title>C++</title>
			
 
				-   
			
 
				-   <para>
			
 
				-    Using C++ in the kernel is usually a bad idea, because the
			
 
				-    kernel does not provide the necessary runtime environment
			
 
				-    and the include files are not tested for it.  It is still
			
 
				-    possible, but not recommended.  If you really want to do
			
 
				-    this, forget about exceptions at least.
			
 
				-   </para>
			
 
				-  </sect1>
			
 
				-
			
 
				-  <sect1 id="conventions-ifdef">
			
 
				-   <title>&num;if</title>
			
 
				-   
			
 
				-   <para>
			
 
				-    It is generally considered cleaner to use macros in header files
			
 
				-    (or at the top of .c files) to abstract away functions rather than
			
 
				-    using `#if' pre-processor statements throughout the source code.
			
 
				-   </para>
			
 
				-  </sect1>
			
 
				- </chapter>
			
 
				-
			
 
				- <chapter id="submitting">
			
 
				-  <title>Putting Your Stuff in the Kernel</title>
			
 
				-
			
 
				-  <para>
			
 
				-   In order to get your stuff into shape for official inclusion, or
			
 
				-   even to make a neat patch, there's administrative work to be
			
 
				-   done:
			
 
				-  </para>
			
 
				-  <itemizedlist>
			
 
				-   <listitem>
			
 
				-    <para>
			
 
				-     Figure out whose pond you've been pissing in.  Look at the top of
			
 
				-     the source files, inside the <filename>MAINTAINERS</filename>
			
 
				-     file, and last of all in the <filename>CREDITS</filename> file.
			
 
				-     You should coordinate with this person to make sure you're not
			
 
				-     duplicating effort, or trying something that's already been
			
 
				-     rejected.
			
 
				-    </para>
			
 
				-
			
 
				-    <para>
			
 
				-     Make sure you put your name and EMail address at the top of
			
 
				-     any files you create or mangle significantly.  This is the
			
 
				-     first place people will look when they find a bug, or when
			
 
				-     <emphasis>they</emphasis> want to make a change.
			
 
				-    </para>
			
 
				-   </listitem>
			
 
				-
			
 
				-   <listitem>
			
 
				-    <para>
			
 
				-     Usually you want a configuration option for your kernel hack.
			
 
				-     Edit <filename>Kconfig</filename> in the appropriate directory.
			
 
				-     The Config language is simple to use by cut and paste, and there's
			
 
				-     complete documentation in
			
 
				-     <filename>Documentation/kbuild/kconfig-language.txt</filename>.
			
 
				-    </para>
			
 
				-
			
 
				-    <para>
			
 
				-     In your description of the option, make sure you address both the
			
 
				-     expert user and the user who knows nothing about your feature.  Mention
			
 
				-     incompatibilities and issues here.  <emphasis> Definitely
			
 
				-     </emphasis> end your description with <quote> if in doubt, say N
			
 
				-     </quote> (or, occasionally, `Y'); this is for people who have no
			
 
				-     idea what you are talking about.
			
 
				-    </para>
			
 
				-   </listitem>
			
 
				-
			
 
				-   <listitem>
			
 
				-    <para>
			
 
				-     Edit the <filename>Makefile</filename>: the CONFIG variables are
			
 
				-     exported here so you can usually just add a "obj-$(CONFIG_xxx) +=
			
 
				-     xxx.o" line.  The syntax is documented in
			
 
				-     <filename>Documentation/kbuild/makefiles.txt</filename>.
			
 
				-    </para>
			
 
				-   </listitem>
			
 
				-
			
 
				-   <listitem>
			
 
				-    <para>
			
 
				-     Put yourself in <filename>CREDITS</filename> if you've done
			
 
				-     something noteworthy, usually beyond a single file (your name
			
 
				-     should be at the top of the source files anyway).
			
 
				-     <filename>MAINTAINERS</filename> means you want to be consulted
			
 
				-     when changes are made to a subsystem, and hear about bugs; it
			
 
				-     implies a more-than-passing commitment to some part of the code.
			
 
				-    </para>
			
 
				-   </listitem>
			
 
				-   
			
 
				-   <listitem>
			
 
				-    <para>
			
 
				-     Finally, don't forget to read <filename>Documentation/process/submitting-patches.rst</filename>
			
 
				-     and possibly <filename>Documentation/process/submitting-drivers.rst</filename>.
			
 
				-    </para>
			
 
				-   </listitem>
			
 
				-  </itemizedlist>
			
 
				- </chapter>
			
 
				-
			
 
				- <chapter id="cantrips">
			
 
				-  <title>Kernel Cantrips</title>
			
 
				-
			
 
				-  <para>
			
 
				-   Some favorites from browsing the source.  Feel free to add to this
			
 
				-   list.
			
 
				-  </para>
			
 
				-
			
 
				-  <para>
			
 
				-   <filename>arch/x86/include/asm/delay.h:</filename>
			
 
				-  </para>
			
 
				-  <programlisting>
			
 
				-#define ndelay(n) (__builtin_constant_p(n) ? \
			
 
				-        ((n) > 20000 ? __bad_ndelay() : __const_udelay((n) * 5ul)) : \
			
 
				-        __ndelay(n))
			
 
				-  </programlisting>
			
 
				-
			
 
				-  <para>
			
 
				-   <filename>include/linux/fs.h</filename>:
			
 
				-  </para>
			
 
				-  <programlisting>
			
 
				-/*
			
 
				- * Kernel pointers have redundant information, so we can use a
			
 
				- * scheme where we can return either an error code or a dentry
			
 
				- * pointer with the same return value.
			
 
				- *
			
 
				- * This should be a per-architecture thing, to allow different
			
 
				- * error and pointer decisions.
			
 
				- */
			
 
				- #define ERR_PTR(err)    ((void *)((long)(err)))
			
 
				- #define PTR_ERR(ptr)    ((long)(ptr))
			
 
				- #define IS_ERR(ptr)     ((unsigned long)(ptr) > (unsigned long)(-1000))
			
 
				-</programlisting>
			
 
				-
			
 
				-  <para>
			
 
				-   <filename>arch/x86/include/asm/uaccess_32.h:</filename>
			
 
				-  </para>
			
 
				-
			
 
				-  <programlisting>
			
 
				-#define copy_to_user(to,from,n)                         \
			
 
				-        (__builtin_constant_p(n) ?                      \
			
 
				-         __constant_copy_to_user((to),(from),(n)) :     \
			
 
				-         __generic_copy_to_user((to),(from),(n)))
			
 
				-  </programlisting>
			
 
				-
			
 
				-  <para>
			
 
				-   <filename>arch/sparc/kernel/head.S:</filename>
			
 
				-  </para>
			
 
				-
			
 
				-  <programlisting>
			
 
				-/*
			
 
				- * Sun people can't spell worth damn. "compatability" indeed.
			
 
				- * At least we *know* we can't spell, and use a spell-checker.
			
 
				- */
			
 
				-
			
 
				-/* Uh, actually Linus it is I who cannot spell. Too much murky
			
 
				- * Sparc assembly will do this to ya.
			
 
				- */
			
 
				-C_LABEL(cputypvar):
			
 
				-        .asciz "compatibility"
			
 
				-
			
 
				-/* Tested on SS-5, SS-10. Probably someone at Sun applied a spell-checker. */
			
 
				-        .align 4
			
 
				-C_LABEL(cputypvar_sun4m):
			
 
				-        .asciz "compatible"
			
 
				-  </programlisting>
			
 
				-
			
 
				-  <para>
			
 
				-   <filename>arch/sparc/lib/checksum.S:</filename>
			
 
				-  </para>
			
 
				-
			
 
				-  <programlisting>
			
 
				-        /* Sun, you just can't beat me, you just can't.  Stop trying,
			
 
				-         * give up.  I'm serious, I am going to kick the living shit
			
 
				-         * out of you, game over, lights out.
			
 
				-         */
			
 
				-  </programlisting>
			
 
				- </chapter>
			
 
				-
			
 
				- <chapter id="credits">
			
 
				-  <title>Thanks</title>
			
 
				-
			
 
				-  <para>
			
 
				-   Thanks to Andi Kleen for the idea, answering my questions, fixing
			
 
				-   my mistakes, filling content, etc.  Philipp Rumpf for more spelling
			
 
				-   and clarity fixes, and some excellent non-obvious points.  Werner
			
 
				-   Almesberger for giving me a great summary of
			
 
				-   <function>disable_irq()</function>, and Jes Sorensen and Andrea
			
 
				-   Arcangeli added caveats. Michael Elizabeth Chastain for checking
			
 
				-   and adding to the Configure section. <!-- Rusty insisted on this
			
 
				-   bit; I didn't do it! --> Telsa Gwynne for teaching me DocBook. 
			
 
				-  </para>
			
 
				- </chapter>
			
 
				-</book>
			
 
				-
			
--- a/Documentation/DocBook/kernel-locking.tmpl
+++ b/Documentation/DocBook/kernel-locking.tmpl
@@ -1,2151 +0,0 @@
 
				-<?xml version="1.0" encoding="UTF-8"?>
			
 
				-<!DOCTYPE book PUBLIC "-//OASIS//DTD DocBook XML V4.1.2//EN"
			
 
				-	"http://www.oasis-open.org/docbook/xml/4.1.2/docbookx.dtd" []>
			
 
				-
			
 
				-<book id="LKLockingGuide">
			
 
				- <bookinfo>
			
 
				-  <title>Unreliable Guide To Locking</title>
			
 
				-  
			
 
				-  <authorgroup>
			
 
				-   <author>
			
 
				-    <firstname>Rusty</firstname>
			
 
				-    <surname>Russell</surname>
			
 
				-    <affiliation>
			
 
				-     <address>
			
 
				-      <email>rusty@rustcorp.com.au</email>
			
 
				-     </address>
			
 
				-    </affiliation>
			
 
				-   </author>
			
 
				-  </authorgroup>
			
 
				-
			
 
				-  <copyright>
			
 
				-   <year>2003</year>
			
 
				-   <holder>Rusty Russell</holder>
			
 
				-  </copyright>
			
 
				-
			
 
				-  <legalnotice>
			
 
				-   <para>
			
 
				-     This documentation is free software; you can redistribute
			
 
				-     it and/or modify it under the terms of the GNU General Public
			
 
				-     License as published by the Free Software Foundation; either
			
 
				-     version 2 of the License, or (at your option) any later
			
 
				-     version.
			
 
				-   </para>
			
 
				-      
			
 
				-   <para>
			
 
				-     This program is distributed in the hope that it will be
			
 
				-     useful, but WITHOUT ANY WARRANTY; without even the implied
			
 
				-     warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
			
 
				-     See the GNU General Public License for more details.
			
 
				-   </para>
			
 
				-      
			
 
				-   <para>
			
 
				-     You should have received a copy of the GNU General Public
			
 
				-     License along with this program; if not, write to the Free
			
 
				-     Software Foundation, Inc., 59 Temple Place, Suite 330, Boston,
			
 
				-     MA 02111-1307 USA
			
 
				-   </para>
			
 
				-      
			
 
				-   <para>
			
 
				-     For more details see the file COPYING in the source
			
 
				-     distribution of Linux.
			
 
				-   </para>
			
 
				-  </legalnotice>
			
 
				- </bookinfo>
			
 
				-
			
 
				- <toc></toc>
			
 
				-  <chapter id="intro">
			
 
				-   <title>Introduction</title>
			
 
				-   <para>
			
 
				-     Welcome, to Rusty's Remarkably Unreliable Guide to Kernel
			
 
				-     Locking issues.  This document describes the locking systems in
			
 
				-     the Linux Kernel in 2.6.
			
 
				-   </para>
			
 
				-   <para>
			
 
				-     With the wide availability of HyperThreading, and <firstterm
			
 
				-     linkend="gloss-preemption">preemption </firstterm> in the Linux
			
 
				-     Kernel, everyone hacking on the kernel needs to know the
			
 
				-     fundamentals of concurrency and locking for
			
 
				-     <firstterm linkend="gloss-smp"><acronym>SMP</acronym></firstterm>.
			
 
				-   </para>
			
 
				-  </chapter>
			
 
				-
			
 
				-   <chapter id="races">
			
 
				-    <title>The Problem With Concurrency</title>
			
 
				-    <para>
			
 
				-      (Skip this if you know what a Race Condition is).
			
 
				-    </para>
			
 
				-    <para>
			
 
				-      In a normal program, you can increment a counter like so:
			
 
				-    </para>
			
 
				-    <programlisting>
			
 
				-      very_important_count++;
			
 
				-    </programlisting>
			
 
				-
			
 
				-    <para>
			
 
				-      This is what they would expect to happen:
			
 
				-    </para>
			
 
				-
			
 
				-    <table>
			
 
				-     <title>Expected Results</title>
			
 
				-
			
 
				-     <tgroup cols="2" align="left">
			
 
				-
			
 
				-      <thead>
			
 
				-       <row>
			
 
				-        <entry>Instance 1</entry>
			
 
				-        <entry>Instance 2</entry>
			
 
				-       </row>
			
 
				-      </thead>
			
 
				-
			
 
				-      <tbody>
			
 
				-       <row>
			
 
				-        <entry>read very_important_count (5)</entry>
			
 
				-        <entry></entry>
			
 
				-       </row>
			
 
				-       <row>
			
 
				-        <entry>add 1 (6)</entry>
			
 
				-        <entry></entry>
			
 
				-       </row>
			
 
				-       <row>
			
 
				-        <entry>write very_important_count (6)</entry>
			
 
				-        <entry></entry>
			
 
				-       </row>
			
 
				-       <row>
			
 
				-        <entry></entry>
			
 
				-        <entry>read very_important_count (6)</entry>
			
 
				-       </row>
			
 
				-       <row>
			
 
				-        <entry></entry>
			
 
				-        <entry>add 1 (7)</entry>
			
 
				-       </row>
			
 
				-       <row>
			
 
				-        <entry></entry>
			
 
				-        <entry>write very_important_count (7)</entry>
			
 
				-       </row>
			
 
				-      </tbody>
			
 
				-
			
 
				-     </tgroup>
			
 
				-    </table>
			
 
				-
			
 
				-    <para>
			
 
				-     This is what might happen:
			
 
				-    </para>
			
 
				-
			
 
				-    <table>
			
 
				-     <title>Possible Results</title>
			
 
				-
			
 
				-     <tgroup cols="2" align="left">
			
 
				-      <thead>
			
 
				-       <row>
			
 
				-        <entry>Instance 1</entry>
			
 
				-        <entry>Instance 2</entry>
			
 
				-       </row>
			
 
				-      </thead>
			
 
				-
			
 
				-      <tbody>
			
 
				-       <row>
			
 
				-        <entry>read very_important_count (5)</entry>
			
 
				-        <entry></entry>
			
 
				-       </row>
			
 
				-       <row>
			
 
				-        <entry></entry>
			
 
				-        <entry>read very_important_count (5)</entry>
			
 
				-       </row>
			
 
				-       <row>
			
 
				-        <entry>add 1 (6)</entry>
			
 
				-        <entry></entry>
			
 
				-       </row>
			
 
				-       <row>
			
 
				-        <entry></entry>
			
 
				-        <entry>add 1 (6)</entry>
			
 
				-       </row>
			
 
				-       <row>
			
 
				-        <entry>write very_important_count (6)</entry>
			
 
				-        <entry></entry>
			
 
				-       </row>
			
 
				-       <row>
			
 
				-        <entry></entry>
			
 
				-        <entry>write very_important_count (6)</entry>
			
 
				-       </row>
			
 
				-      </tbody>
			
 
				-     </tgroup>
			
 
				-    </table>
			
 
				-
			
 
				-    <sect1 id="race-condition">
			
 
				-    <title>Race Conditions and Critical Regions</title>
			
 
				-    <para>
			
 
				-      This overlap, where the result depends on the
			
 
				-      relative timing of multiple tasks, is called a <firstterm>race condition</firstterm>.
			
 
				-      The piece of code containing the concurrency issue is called a
			
 
				-      <firstterm>critical region</firstterm>.  And especially since Linux started running
			
 
				-      on SMP machines, they became one of the major issues in kernel
			
 
				-      design and implementation.
			
 
				-    </para>
			
 
				-    <para>
			
 
				-      Preemption can have the same effect, even if there is only one
			
 
				-      CPU: by preempting one task during the critical region, we have
			
 
				-      exactly the same race condition.  In this case the thread which
			
 
				-      preempts might run the critical region itself.
			
 
				-    </para>
			
 
				-    <para>
			
 
				-      The solution is to recognize when these simultaneous accesses
			
 
				-      occur, and use locks to make sure that only one instance can
			
 
				-      enter the critical region at any time.  There are many
			
 
				-      friendly primitives in the Linux kernel to help you do this.
			
 
				-      And then there are the unfriendly primitives, but I'll pretend
			
 
				-      they don't exist.
			
 
				-    </para>
			
 
				-    </sect1>
			
 
				-  </chapter>
			
 
				-
			
 
				-  <chapter id="locks">
			
 
				-   <title>Locking in the Linux Kernel</title>
			
 
				-
			
 
				-   <para>
			
 
				-     If I could give you one piece of advice: never sleep with anyone
			
 
				-     crazier than yourself.  But if I had to give you advice on
			
 
				-     locking: <emphasis>keep it simple</emphasis>.
			
 
				-   </para>
			
 
				-
			
 
				-   <para>
			
 
				-     Be reluctant to introduce new locks.
			
 
				-   </para>
			
 
				-
			
 
				-   <para>
			
 
				-     Strangely enough, this last one is the exact reverse of my advice when
			
 
				-     you <emphasis>have</emphasis> slept with someone crazier than yourself.
			
 
				-     And you should think about getting a big dog.
			
 
				-   </para>
			
 
				-
			
 
				-   <sect1 id="lock-intro">
			
 
				-   <title>Two Main Types of Kernel Locks: Spinlocks and Mutexes</title>
			
 
				-
			
 
				-   <para>
			
 
				-     There are two main types of kernel locks.  The fundamental type
			
 
				-     is the spinlock 
			
 
				-     (<filename class="headerfile">include/asm/spinlock.h</filename>),
			
 
				-     which is a very simple single-holder lock: if you can't get the 
			
 
				-     spinlock, you keep trying (spinning) until you can.  Spinlocks are 
			
 
				-     very small and fast, and can be used anywhere.
			
 
				-   </para>
			
 
				-   <para>
			
 
				-     The second type is a mutex
			
 
				-     (<filename class="headerfile">include/linux/mutex.h</filename>): it
			
 
				-     is like a spinlock, but you may block holding a mutex.
			
 
				-     If you can't lock a mutex, your task will suspend itself, and be woken
			
 
				-     up when the mutex is released.  This means the CPU can do something
			
 
				-     else while you are waiting.  There are many cases when you simply
			
 
				-     can't sleep (see <xref linkend="sleeping-things"/>), and so have to
			
 
				-     use a spinlock instead.
			
 
				-   </para>
			
 
				-   <para>
			
 
				-     Neither type of lock is recursive: see
			
 
				-     <xref linkend="deadlock"/>.
			
 
				-   </para>
			
 
				-   </sect1>
			
 
				- 
			
 
				-   <sect1 id="uniprocessor">
			
 
				-    <title>Locks and Uniprocessor Kernels</title>
			
 
				-
			
 
				-    <para>
			
 
				-      For kernels compiled without <symbol>CONFIG_SMP</symbol>, and
			
 
				-      without <symbol>CONFIG_PREEMPT</symbol> spinlocks do not exist at
			
 
				-      all.  This is an excellent design decision: when no-one else can
			
 
				-      run at the same time, there is no reason to have a lock.
			
 
				-    </para>
			
 
				-
			
 
				-    <para>
			
 
				-      If the kernel is compiled without <symbol>CONFIG_SMP</symbol>,
			
 
				-      but <symbol>CONFIG_PREEMPT</symbol> is set, then spinlocks
			
 
				-      simply disable preemption, which is sufficient to prevent any
			
 
				-      races.  For most purposes, we can think of preemption as
			
 
				-      equivalent to SMP, and not worry about it separately.
			
 
				-    </para>
			
 
				-
			
 
				-    <para>
			
 
				-      You should always test your locking code with <symbol>CONFIG_SMP</symbol>
			
 
				-      and <symbol>CONFIG_PREEMPT</symbol> enabled, even if you don't have an SMP test box, because it
			
 
				-      will still catch some kinds of locking bugs.
			
 
				-    </para>
			
 
				-
			
 
				-    <para>
			
 
				-      Mutexes still exist, because they are required for
			
 
				-      synchronization between <firstterm linkend="gloss-usercontext">user 
			
 
				-      contexts</firstterm>, as we will see below.
			
 
				-    </para>
			
 
				-   </sect1>
			
 
				-
			
 
				-    <sect1 id="usercontextlocking">
			
 
				-     <title>Locking Only In User Context</title>
			
 
				-
			
 
				-     <para>
			
 
				-       If you have a data structure which is only ever accessed from
			
 
				-       user context, then you can use a simple mutex
			
 
				-       (<filename>include/linux/mutex.h</filename>) to protect it.  This
			
 
				-       is the most trivial case: you initialize the mutex.  Then you can
			
 
				-       call <function>mutex_lock_interruptible()</function> to grab the mutex,
			
 
				-       and <function>mutex_unlock()</function> to release it.  There is also a 
			
 
				-       <function>mutex_lock()</function>, which should be avoided, because it 
			
 
				-       will not return if a signal is received.
			
 
				-     </para>
			
 
				-
			
 
				-     <para>
			
 
				-       Example: <filename>net/netfilter/nf_sockopt.c</filename> allows 
			
 
				-       registration of new <function>setsockopt()</function> and 
			
 
				-       <function>getsockopt()</function> calls, with
			
 
				-       <function>nf_register_sockopt()</function>.  Registration and 
			
 
				-       de-registration are only done on module load and unload (and boot 
			
 
				-       time, where there is no concurrency), and the list of registrations 
			
 
				-       is only consulted for an unknown <function>setsockopt()</function>
			
 
				-       or <function>getsockopt()</function> system call.  The 
			
 
				-       <varname>nf_sockopt_mutex</varname> is perfect to protect this,
			
 
				-       especially since the setsockopt and getsockopt calls may well
			
 
				-       sleep.
			
 
				-     </para>
			
 
				-   </sect1>
			
 
				-
			
 
				-   <sect1 id="lock-user-bh">
			
 
				-    <title>Locking Between User Context and Softirqs</title>
			
 
				-
			
 
				-    <para>
			
 
				-      If a <firstterm linkend="gloss-softirq">softirq</firstterm> shares
			
 
				-      data with user context, you have two problems.  Firstly, the current 
			
 
				-      user context can be interrupted by a softirq, and secondly, the
			
 
				-      critical region could be entered from another CPU.  This is where
			
 
				-      <function>spin_lock_bh()</function> 
			
 
				-      (<filename class="headerfile">include/linux/spinlock.h</filename>) is
			
 
				-      used.  It disables softirqs on that CPU, then grabs the lock.
			
 
				-      <function>spin_unlock_bh()</function> does the reverse.  (The
			
 
				-      '_bh' suffix is a historical reference to "Bottom Halves", the
			
 
				-      old name for software interrupts.  It should really be
			
 
				-      called 'spin_lock_softirq()' in a perfect world).
			
 
				-    </para>
			
 
				-
			
 
				-    <para>
			
 
				-      Note that you can also use <function>spin_lock_irq()</function>
			
 
				-      or <function>spin_lock_irqsave()</function> here, which stop
			
 
				-      hardware interrupts as well: see <xref linkend="hardirq-context"/>.
			
 
				-    </para>
			
 
				-
			
 
				-    <para>
			
 
				-      This works perfectly for <firstterm linkend="gloss-up"><acronym>UP
			
 
				-      </acronym></firstterm> as well: the spin lock vanishes, and this macro 
			
 
				-      simply becomes <function>local_bh_disable()</function>
			
 
				-      (<filename class="headerfile">include/linux/interrupt.h</filename>), which
			
 
				-      protects you from the softirq being run.
			
 
				-    </para>
			
 
				-   </sect1>
			
 
				-
			
 
				-   <sect1 id="lock-user-tasklet">
			
 
				-    <title>Locking Between User Context and Tasklets</title>
			
 
				-
			
 
				-    <para>
			
 
				-      This is exactly the same as above, because <firstterm
			
 
				-      linkend="gloss-tasklet">tasklets</firstterm> are actually run
			
 
				-      from a softirq.
			
 
				-    </para>
			
 
				-   </sect1>
			
 
				-
			
 
				-   <sect1 id="lock-user-timers">
			
 
				-    <title>Locking Between User Context and Timers</title>
			
 
				-
			
 
				-    <para>
			
 
				-      This, too, is exactly the same as above, because <firstterm
			
 
				-      linkend="gloss-timers">timers</firstterm> are actually run from
			
 
				-      a softirq.  From a locking point of view, tasklets and timers
			
 
				-      are identical.
			
 
				-    </para>
			
 
				-   </sect1>
			
 
				-
			
 
				-   <sect1 id="lock-tasklets">
			
 
				-    <title>Locking Between Tasklets/Timers</title>
			
 
				-
			
 
				-    <para>
			
 
				-      Sometimes a tasklet or timer might want to share data with
			
 
				-      another tasklet or timer.
			
 
				-    </para>
			
 
				-
			
 
				-    <sect2 id="lock-tasklets-same">
			
 
				-     <title>The Same Tasklet/Timer</title>
			
 
				-     <para>
			
 
				-       Since a tasklet is never run on two CPUs at once, you don't
			
 
				-       need to worry about your tasklet being reentrant (running
			
 
				-       twice at once), even on SMP.
			
 
				-     </para>
			
 
				-    </sect2>
			
 
				-
			
 
				-    <sect2 id="lock-tasklets-different">
			
 
				-     <title>Different Tasklets/Timers</title>
			
 
				-     <para>
			
 
				-       If another tasklet/timer wants
			
 
				-       to share data with your tasklet or timer, you will both need to use
			
 
				-       <function>spin_lock()</function> and
			
 
				-       <function>spin_unlock()</function> calls.  
			
 
				-       <function>spin_lock_bh()</function> is
			
 
				-       unnecessary here, as you are already in a tasklet, and
			
 
				-       none will be run on the same CPU.
			
 
				-     </para>
			
 
				-    </sect2>
			
 
				-   </sect1>
			
 
				-
			
 
				-   <sect1 id="lock-softirqs">
			
 
				-    <title>Locking Between Softirqs</title>
			
 
				-
			
 
				-    <para>
			
 
				-      Often a softirq might
			
 
				-      want to share data with itself or a tasklet/timer.
			
 
				-    </para>
			
 
				-
			
 
				-    <sect2 id="lock-softirqs-same">
			
 
				-     <title>The Same Softirq</title>
			
 
				-
			
 
				-     <para>
			
 
				-       The same softirq can run on the other CPUs: you can use a
			
 
				-       per-CPU array (see <xref linkend="per-cpu"/>) for better
			
 
				-       performance.  If you're going so far as to use a softirq,
			
 
				-       you probably care about scalable performance enough
			
 
				-       to justify the extra complexity.
			
 
				-     </para>
			
 
				-
			
 
				-     <para>
			
 
				-       You'll need to use <function>spin_lock()</function> and 
			
 
				-       <function>spin_unlock()</function> for shared data.
			
 
				-     </para>
			
 
				-    </sect2>
			
 
				-
			
 
				-    <sect2 id="lock-softirqs-different">
			
 
				-     <title>Different Softirqs</title>
			
 
				-
			
 
				-     <para>
			
 
				-       You'll need to use <function>spin_lock()</function> and
			
 
				-       <function>spin_unlock()</function> for shared data, whether it
			
 
				-       be a timer, tasklet, different softirq or the same or another
			
 
				-       softirq: any of them could be running on a different CPU.
			
 
				-     </para>
			
 
				-    </sect2>
			
 
				-   </sect1>
			
 
				-  </chapter>
			
 
				-
			
 
				-  <chapter id="hardirq-context">
			
 
				-   <title>Hard IRQ Context</title>
			
 
				-
			
 
				-   <para>
			
 
				-     Hardware interrupts usually communicate with a
			
 
				-     tasklet or softirq.  Frequently this involves putting work in a
			
 
				-     queue, which the softirq will take out.
			
 
				-   </para>
			
 
				-
			
 
				-   <sect1 id="hardirq-softirq">
			
 
				-    <title>Locking Between Hard IRQ and Softirqs/Tasklets</title>
			
 
				-
			
 
				-    <para>
			
 
				-      If a hardware irq handler shares data with a softirq, you have
			
 
				-      two concerns.  Firstly, the softirq processing can be
			
 
				-      interrupted by a hardware interrupt, and secondly, the
			
 
				-      critical region could be entered by a hardware interrupt on
			
 
				-      another CPU.  This is where <function>spin_lock_irq()</function> is 
			
 
				-      used.  It is defined to disable interrupts on that cpu, then grab 
			
 
				-      the lock. <function>spin_unlock_irq()</function> does the reverse.
			
 
				-    </para>
			
 
				-
			
 
				-    <para>
			
 
				-      The irq handler does not need to use
			
 
				-      <function>spin_lock_irq()</function>, because the softirq cannot
			
 
				-      run while the irq handler is running: it can use
			
 
				-      <function>spin_lock()</function>, which is slightly faster.  The
			
 
				-      only exception would be if a different hardware irq handler uses
			
 
				-      the same lock: <function>spin_lock_irq()</function> will stop
			
 
				-      that from interrupting us.
			
 
				-    </para>
			
 
				-
			
 
				-    <para>
			
 
				-      This works perfectly for UP as well: the spin lock vanishes,
			
 
				-      and this macro simply becomes <function>local_irq_disable()</function>
			
 
				-      (<filename class="headerfile">include/asm/smp.h</filename>), which
			
 
				-      protects you from the softirq/tasklet/BH being run.
			
 
				-    </para>
			
 
				-
			
 
				-    <para>
			
 
				-      <function>spin_lock_irqsave()</function> 
			
 
				-      (<filename>include/linux/spinlock.h</filename>) is a variant
			
 
				-      which saves whether interrupts were on or off in a flags word,
			
 
				-      which is passed to <function>spin_unlock_irqrestore()</function>.  This
			
 
				-      means that the same code can be used inside a hard irq handler (where
			
 
				-      interrupts are already off) and in softirqs (where the irq
			
 
				-      disabling is required).
			
 
				-    </para>
			
 
				-
			
 
				-    <para>
			
 
				-      Note that softirqs (and hence tasklets and timers) are run on
			
 
				-      return from hardware interrupts, so
			
 
				-      <function>spin_lock_irq()</function> also stops these.  In that
			
 
				-      sense, <function>spin_lock_irqsave()</function> is the most
			
 
				-      general and powerful locking function.
			
 
				-    </para>
			
 
				-
			
 
				-   </sect1>
			
 
				-   <sect1 id="hardirq-hardirq">
			
 
				-    <title>Locking Between Two Hard IRQ Handlers</title>
			
 
				-    <para>
			
 
				-      It is rare to have to share data between two IRQ handlers, but
			
 
				-      if you do, <function>spin_lock_irqsave()</function> should be
			
 
				-      used: it is architecture-specific whether all interrupts are
			
 
				-      disabled inside irq handlers themselves.
			
 
				-    </para>
			
 
				-   </sect1>
			
 
				-
			
 
				-  </chapter>
			
 
				-
			
 
				-  <chapter id="cheatsheet">
			
 
				-   <title>Cheat Sheet For Locking</title>
			
 
				-   <para>
			
 
				-     Pete Zaitcev gives the following summary:
			
 
				-   </para>
			
 
				-   <itemizedlist>
			
 
				-      <listitem>
			
 
				-	<para>
			
 
				-          If you are in a process context (any syscall) and want to
			
 
				-	lock other processes out, use a mutex.  You can take a mutex
			
 
				-	and sleep (<function>copy_from_user()</function> or
			
 
				-	<function>kmalloc(x,GFP_KERNEL)</function>).
			
 
				-      </para>
			
 
				-      </listitem>
			
 
				-      <listitem>
			
 
				-	<para>
			
 
				-	Otherwise (== data can be touched in an interrupt), use
			
 
				-	<function>spin_lock_irqsave()</function> and
			
 
				-	<function>spin_unlock_irqrestore()</function>.
			
 
				-	</para>
			
 
				-      </listitem>
			
 
				-      <listitem>
			
 
				-	<para>
			
 
				-	Avoid holding spinlock for more than 5 lines of code and
			
 
				-	across any function call (except accessors like
			
 
				-	<function>readb</function>).
			
 
				-	</para>
			
 
				-      </listitem>
			
 
				-    </itemizedlist>
			
 
				-
			
 
				-   <sect1 id="minimum-lock-reqirements">
			
 
				-   <title>Table of Minimum Requirements</title>
			
 
				-
			
 
				-   <para> The following table lists the <emphasis>minimum</emphasis>
			
 
				-	locking requirements between various contexts.  In some cases,
			
 
				-	the same context can only be running on one CPU at a time, so
			
 
				-	no locking is required for that context (eg. a particular
			
 
				-	thread can only run on one CPU at a time, but if it needs
			
 
				-	to share data with another thread, locking is required).
			
 
				-   </para>
			
 
				-   <para>
			
 
				-	Remember the advice above: you can always use
			
 
				-	<function>spin_lock_irqsave()</function>, which is a superset
			
 
				-	of all other spinlock primitives.
			
 
				-   </para>
			
 
				-
			
 
				-   <table>
			
 
				-<title>Table of Locking Requirements</title>
			
 
				-<tgroup cols="11">
			
 
				-<tbody>
			
 
				-
			
 
				-<row>
			
 
				-<entry></entry>
			
 
				-<entry>IRQ Handler A</entry>
			
 
				-<entry>IRQ Handler B</entry>
			
 
				-<entry>Softirq A</entry>
			
 
				-<entry>Softirq B</entry>
			
 
				-<entry>Tasklet A</entry>
			
 
				-<entry>Tasklet B</entry>
			
 
				-<entry>Timer A</entry>
			
 
				-<entry>Timer B</entry>
			
 
				-<entry>User Context A</entry>
			
 
				-<entry>User Context B</entry>
			
 
				-</row>
			
 
				-
			
 
				-<row>
			
 
				-<entry>IRQ Handler A</entry>
			
 
				-<entry>None</entry>
			
 
				-</row>
			
 
				-
			
 
				-<row>
			
 
				-<entry>IRQ Handler B</entry>
			
 
				-<entry>SLIS</entry>
			
 
				-<entry>None</entry>
			
 
				-</row>
			
 
				-
			
 
				-<row>
			
 
				-<entry>Softirq A</entry>
			
 
				-<entry>SLI</entry>
			
 
				-<entry>SLI</entry>
			
 
				-<entry>SL</entry>
			
 
				-</row>
			
 
				-
			
 
				-<row>
			
 
				-<entry>Softirq B</entry>
			
 
				-<entry>SLI</entry>
			
 
				-<entry>SLI</entry>
			
 
				-<entry>SL</entry>
			
 
				-<entry>SL</entry>
			
 
				-</row>
			
 
				-
			
 
				-<row>
			
 
				-<entry>Tasklet A</entry>
			
 
				-<entry>SLI</entry>
			
 
				-<entry>SLI</entry>
			
 
				-<entry>SL</entry>
			
 
				-<entry>SL</entry>
			
 
				-<entry>None</entry>
			
 
				-</row>
			
 
				-
			
 
				-<row>
			
 
				-<entry>Tasklet B</entry>
			
 
				-<entry>SLI</entry>
			
 
				-<entry>SLI</entry>
			
 
				-<entry>SL</entry>
			
 
				-<entry>SL</entry>
			
 
				-<entry>SL</entry>
			
 
				-<entry>None</entry>
			
 
				-</row>
			
 
				-
			
 
				-<row>
			
 
				-<entry>Timer A</entry>
			
 
				-<entry>SLI</entry>
			
 
				-<entry>SLI</entry>
			
 
				-<entry>SL</entry>
			
 
				-<entry>SL</entry>
			
 
				-<entry>SL</entry>
			
 
				-<entry>SL</entry>
			
 
				-<entry>None</entry>
			
 
				-</row>
			
 
				-
			
 
				-<row>
			
 
				-<entry>Timer B</entry>
			
 
				-<entry>SLI</entry>
			
 
				-<entry>SLI</entry>
			
 
				-<entry>SL</entry>
			
 
				-<entry>SL</entry>
			
 
				-<entry>SL</entry>
			
 
				-<entry>SL</entry>
			
 
				-<entry>SL</entry>
			
 
				-<entry>None</entry>
			
 
				-</row>
			
 
				-
			
 
				-<row>
			
 
				-<entry>User Context A</entry>
			
 
				-<entry>SLI</entry>
			
 
				-<entry>SLI</entry>
			
 
				-<entry>SLBH</entry>
			
 
				-<entry>SLBH</entry>
			
 
				-<entry>SLBH</entry>
			
 
				-<entry>SLBH</entry>
			
 
				-<entry>SLBH</entry>
			
 
				-<entry>SLBH</entry>
			
 
				-<entry>None</entry>
			
 
				-</row>
			
 
				-
			
 
				-<row>
			
 
				-<entry>User Context B</entry>
			
 
				-<entry>SLI</entry>
			
 
				-<entry>SLI</entry>
			
 
				-<entry>SLBH</entry>
			
 
				-<entry>SLBH</entry>
			
 
				-<entry>SLBH</entry>
			
 
				-<entry>SLBH</entry>
			
 
				-<entry>SLBH</entry>
			
 
				-<entry>SLBH</entry>
			
 
				-<entry>MLI</entry>
			
 
				-<entry>None</entry>
			
 
				-</row>
			
 
				-
			
 
				-</tbody>
			
 
				-</tgroup>
			
 
				-</table>
			
 
				-
			
 
				-   <table>
			
 
				-<title>Legend for Locking Requirements Table</title>
			
 
				-<tgroup cols="2">
			
 
				-<tbody>
			
 
				-
			
 
				-<row>
			
 
				-<entry>SLIS</entry>
			
 
				-<entry>spin_lock_irqsave</entry>
			
 
				-</row>
			
 
				-<row>
			
 
				-<entry>SLI</entry>
			
 
				-<entry>spin_lock_irq</entry>
			
 
				-</row>
			
 
				-<row>
			
 
				-<entry>SL</entry>
			
 
				-<entry>spin_lock</entry>
			
 
				-</row>
			
 
				-<row>
			
 
				-<entry>SLBH</entry>
			
 
				-<entry>spin_lock_bh</entry>
			
 
				-</row>
			
 
				-<row>
			
 
				-<entry>MLI</entry>
			
 
				-<entry>mutex_lock_interruptible</entry>
			
 
				-</row>
			
 
				-
			
 
				-</tbody>
			
 
				-</tgroup>
			
 
				-</table>
			
 
				-
			
 
				-</sect1>
			
 
				-</chapter>
			
 
				-
			
 
				-<chapter id="trylock-functions">
			
 
				- <title>The trylock Functions</title>
			
 
				-  <para>
			
 
				-   There are functions that try to acquire a lock only once and immediately
			
 
				-   return a value telling about success or failure to acquire the lock.
			
 
				-   They can be used if you need no access to the data protected with the lock
			
 
				-   when some other thread is holding the lock. You should acquire the lock
			
 
				-   later if you then need access to the data protected with the lock.
			
 
				-  </para>
			
 
				-
			
 
				-  <para>
			
 
				-    <function>spin_trylock()</function> does not spin but returns non-zero if
			
 
				-    it acquires the spinlock on the first try or 0 if not. This function can
			
 
				-    be used in all contexts like <function>spin_lock</function>: you must have
			
 
				-    disabled the contexts that might interrupt you and acquire the spin lock.
			
 
				-  </para>
			
 
				-
			
 
				-  <para>
			
 
				-    <function>mutex_trylock()</function> does not suspend your task
			
 
				-    but returns non-zero if it could lock the mutex on the first try
			
 
				-    or 0 if not. This function cannot be safely used in hardware or software
			
 
				-    interrupt contexts despite not sleeping.
			
 
				-  </para>
			
 
				-</chapter>
			
 
				-
			
 
				-  <chapter id="Examples">
			
 
				-   <title>Common Examples</title>
			
 
				-    <para>
			
 
				-Let's step through a simple example: a cache of number to name
			
 
				-mappings.  The cache keeps a count of how often each of the objects is
			
 
				-used, and when it gets full, throws out the least used one.
			
 
				-
			
 
				-    </para>
			
 
				-
			
 
				-   <sect1 id="examples-usercontext">
			
 
				-    <title>All In User Context</title>
			
 
				-    <para>
			
 
				-For our first example, we assume that all operations are in user
			
 
				-context (ie. from system calls), so we can sleep.  This means we can
			
 
				-use a mutex to protect the cache and all the objects within
			
 
				-it.  Here's the code:
			
 
				-    </para>
			
 
				-
			
 
				-    <programlisting>
			
 
				-#include &lt;linux/list.h&gt;
			
 
				-#include &lt;linux/slab.h&gt;
			
 
				-#include &lt;linux/string.h&gt;
			
 
				-#include &lt;linux/mutex.h&gt;
			
 
				-#include &lt;asm/errno.h&gt;
			
 
				-
			
 
				-struct object
			
 
				-{
			
 
				-        struct list_head list;
			
 
				-        int id;
			
 
				-        char name[32];
			
 
				-        int popularity;
			
 
				-};
			
 
				-
			
 
				-/* Protects the cache, cache_num, and the objects within it */
			
 
				-static DEFINE_MUTEX(cache_lock);
			
 
				-static LIST_HEAD(cache);
			
 
				-static unsigned int cache_num = 0;
			
 
				-#define MAX_CACHE_SIZE 10
			
 
				-
			
 
				-/* Must be holding cache_lock */
			
 
				-static struct object *__cache_find(int id)
			
 
				-{
			
 
				-        struct object *i;
			
 
				-
			
 
				-        list_for_each_entry(i, &amp;cache, list)
			
 
				-                if (i-&gt;id == id) {
			
 
				-                        i-&gt;popularity++;
			
 
				-                        return i;
			
 
				-                }
			
 
				-        return NULL;
			
 
				-}
			
 
				-
			
 
				-/* Must be holding cache_lock */
			
 
				-static void __cache_delete(struct object *obj)
			
 
				-{
			
 
				-        BUG_ON(!obj);
			
 
				-        list_del(&amp;obj-&gt;list);
			
 
				-        kfree(obj);
			
 
				-        cache_num--;
			
 
				-}
			
 
				-
			
 
				-/* Must be holding cache_lock */
			
 
				-static void __cache_add(struct object *obj)
			
 
				-{
			
 
				-        list_add(&amp;obj-&gt;list, &amp;cache);
			
 
				-        if (++cache_num > MAX_CACHE_SIZE) {
			
 
				-                struct object *i, *outcast = NULL;
			
 
				-                list_for_each_entry(i, &amp;cache, list) {
			
 
				-                        if (!outcast || i-&gt;popularity &lt; outcast-&gt;popularity)
			
 
				-                                outcast = i;
			
 
				-                }
			
 
				-                __cache_delete(outcast);
			
 
				-        }
			
 
				-}
			
 
				-
			
 
				-int cache_add(int id, const char *name)
			
 
				-{
			
 
				-        struct object *obj;
			
 
				-
			
 
				-        if ((obj = kmalloc(sizeof(*obj), GFP_KERNEL)) == NULL)
			
 
				-                return -ENOMEM;
			
 
				-
			
 
				-        strlcpy(obj-&gt;name, name, sizeof(obj-&gt;name));
			
 
				-        obj-&gt;id = id;
			
 
				-        obj-&gt;popularity = 0;
			
 
				-
			
 
				-        mutex_lock(&amp;cache_lock);
			
 
				-        __cache_add(obj);
			
 
				-        mutex_unlock(&amp;cache_lock);
			
 
				-        return 0;
			
 
				-}
			
 
				-
			
 
				-void cache_delete(int id)
			
 
				-{
			
 
				-        mutex_lock(&amp;cache_lock);
			
 
				-        __cache_delete(__cache_find(id));
			
 
				-        mutex_unlock(&amp;cache_lock);
			
 
				-}
			
 
				-
			
 
				-int cache_find(int id, char *name)
			
 
				-{
			
 
				-        struct object *obj;
			
 
				-        int ret = -ENOENT;
			
 
				-
			
 
				-        mutex_lock(&amp;cache_lock);
			
 
				-        obj = __cache_find(id);
			
 
				-        if (obj) {
			
 
				-                ret = 0;
			
 
				-                strcpy(name, obj-&gt;name);
			
 
				-        }
			
 
				-        mutex_unlock(&amp;cache_lock);
			
 
				-        return ret;
			
 
				-}
			
 
				-</programlisting>
			
 
				-
			
 
				-    <para>
			
 
				-Note that we always make sure we have the cache_lock when we add,
			
 
				-delete, or look up the cache: both the cache infrastructure itself and
			
 
				-the contents of the objects are protected by the lock.  In this case
			
 
				-it's easy, since we copy the data for the user, and never let them
			
 
				-access the objects directly.
			
 
				-    </para>
			
 
				-    <para>
			
 
				-There is a slight (and common) optimization here: in
			
 
				-<function>cache_add</function> we set up the fields of the object
			
 
				-before grabbing the lock.  This is safe, as no-one else can access it
			
 
				-until we put it in cache.
			
 
				-    </para>
			
 
				-    </sect1>
			
 
				-
			
 
				-   <sect1 id="examples-interrupt">
			
 
				-    <title>Accessing From Interrupt Context</title>
			
 
				-    <para>
			
 
				-Now consider the case where <function>cache_find</function> can be
			
 
				-called from interrupt context: either a hardware interrupt or a
			
 
				-softirq.  An example would be a timer which deletes objects from the
			
 
				-cache.
			
 
				-    </para>
			
 
				-    <para>
			
 
				-The change is shown below, in standard patch format: the
			
 
				-<symbol>-</symbol> are lines which are taken away, and the
			
 
				-<symbol>+</symbol> are lines which are added.
			
 
				-    </para>
			
 
				-<programlisting>
			
 
				---- cache.c.usercontext	2003-12-09 13:58:54.000000000 +1100
			
 
				-+++ cache.c.interrupt	2003-12-09 14:07:49.000000000 +1100
			
 
				-@@ -12,7 +12,7 @@
			
 
				-         int popularity;
			
 
				- };
			
 
				-
			
 
				--static DEFINE_MUTEX(cache_lock);
			
 
				-+static DEFINE_SPINLOCK(cache_lock);
			
 
				- static LIST_HEAD(cache);
			
 
				- static unsigned int cache_num = 0;
			
 
				- #define MAX_CACHE_SIZE 10
			
 
				-@@ -55,6 +55,7 @@
			
 
				- int cache_add(int id, const char *name)
			
 
				- {
			
 
				-         struct object *obj;
			
 
				-+        unsigned long flags;
			
 
				-
			
 
				-         if ((obj = kmalloc(sizeof(*obj), GFP_KERNEL)) == NULL)
			
 
				-                 return -ENOMEM;
			
 
				-@@ -63,30 +64,33 @@
			
 
				-         obj-&gt;id = id;
			
 
				-         obj-&gt;popularity = 0;
			
 
				-
			
 
				--        mutex_lock(&amp;cache_lock);
			
 
				-+        spin_lock_irqsave(&amp;cache_lock, flags);
			
 
				-         __cache_add(obj);
			
 
				--        mutex_unlock(&amp;cache_lock);
			
 
				-+        spin_unlock_irqrestore(&amp;cache_lock, flags);
			
 
				-         return 0;
			
 
				- }
			
 
				-
			
 
				- void cache_delete(int id)
			
 
				- {
			
 
				--        mutex_lock(&amp;cache_lock);
			
 
				-+        unsigned long flags;
			
 
				-+
			
 
				-+        spin_lock_irqsave(&amp;cache_lock, flags);
			
 
				-         __cache_delete(__cache_find(id));
			
 
				--        mutex_unlock(&amp;cache_lock);
			
 
				-+        spin_unlock_irqrestore(&amp;cache_lock, flags);
			
 
				- }
			
 
				-
			
 
				- int cache_find(int id, char *name)
			
 
				- {
			
 
				-         struct object *obj;
			
 
				-         int ret = -ENOENT;
			
 
				-+        unsigned long flags;
			
 
				-
			
 
				--        mutex_lock(&amp;cache_lock);
			
 
				-+        spin_lock_irqsave(&amp;cache_lock, flags);
			
 
				-         obj = __cache_find(id);
			
 
				-         if (obj) {
			
 
				-                 ret = 0;
			
 
				-                 strcpy(name, obj-&gt;name);
			
 
				-         }
			
 
				--        mutex_unlock(&amp;cache_lock);
			
 
				-+        spin_unlock_irqrestore(&amp;cache_lock, flags);
			
 
				-         return ret;
			
 
				- }
			
 
				-</programlisting>
			
 
				-
			
 
				-    <para>
			
 
				-Note that the <function>spin_lock_irqsave</function> will turn off
			
 
				-interrupts if they are on, otherwise does nothing (if we are already
			
 
				-in an interrupt handler), hence these functions are safe to call from
			
 
				-any context.
			
 
				-    </para>
			
 
				-    <para>
			
 
				-Unfortunately, <function>cache_add</function> calls
			
 
				-<function>kmalloc</function> with the <symbol>GFP_KERNEL</symbol>
			
 
				-flag, which is only legal in user context.  I have assumed that
			
 
				-<function>cache_add</function> is still only called in user context,
			
 
				-otherwise this should become a parameter to
			
 
				-<function>cache_add</function>.
			
 
				-    </para>
			
 
				-  </sect1>
			
 
				-   <sect1 id="examples-refcnt">
			
 
				-    <title>Exposing Objects Outside This File</title>
			
 
				-    <para>
			
 
				-If our objects contained more information, it might not be sufficient
			
 
				-to copy the information in and out: other parts of the code might want
			
 
				-to keep pointers to these objects, for example, rather than looking up
			
 
				-the id every time.  This produces two problems.
			
 
				-    </para>
			
 
				-    <para>
			
 
				-The first problem is that we use the <symbol>cache_lock</symbol> to
			
 
				-protect objects: we'd need to make this non-static so the rest of the
			
 
				-code can use it.  This makes locking trickier, as it is no longer all
			
 
				-in one place.
			
 
				-    </para>
			
 
				-    <para>
			
 
				-The second problem is the lifetime problem: if another structure keeps
			
 
				-a pointer to an object, it presumably expects that pointer to remain
			
 
				-valid.  Unfortunately, this is only guaranteed while you hold the
			
 
				-lock, otherwise someone might call <function>cache_delete</function>
			
 
				-and even worse, add another object, re-using the same address.
			
 
				-    </para>
			
 
				-    <para>
			
 
				-As there is only one lock, you can't hold it forever: no-one else would
			
 
				-get any work done.
			
 
				-    </para>
			
 
				-    <para>
			
 
				-The solution to this problem is to use a reference count: everyone who
			
 
				-has a pointer to the object increases it when they first get the
			
 
				-object, and drops the reference count when they're finished with it.
			
 
				-Whoever drops it to zero knows it is unused, and can actually delete it.
			
 
				-    </para>
			
 
				-    <para>
			
 
				-Here is the code:
			
 
				-    </para>
			
 
				-
			
 
				-<programlisting>
			
 
				---- cache.c.interrupt	2003-12-09 14:25:43.000000000 +1100
			
 
				-+++ cache.c.refcnt	2003-12-09 14:33:05.000000000 +1100
			
 
				-@@ -7,6 +7,7 @@
			
 
				- struct object
			
 
				- {
			
 
				-         struct list_head list;
			
 
				-+        unsigned int refcnt;
			
 
				-         int id;
			
 
				-         char name[32];
			
 
				-         int popularity;
			
 
				-@@ -17,6 +18,35 @@
			
 
				- static unsigned int cache_num = 0;
			
 
				- #define MAX_CACHE_SIZE 10
			
 
				-
			
 
				-+static void __object_put(struct object *obj)
			
 
				-+{
			
 
				-+        if (--obj-&gt;refcnt == 0)
			
 
				-+                kfree(obj);
			
 
				-+}
			
 
				-+
			
 
				-+static void __object_get(struct object *obj)
			
 
				-+{
			
 
				-+        obj-&gt;refcnt++;
			
 
				-+}
			
 
				-+
			
 
				-+void object_put(struct object *obj)
			
 
				-+{
			
 
				-+        unsigned long flags;
			
 
				-+
			
 
				-+        spin_lock_irqsave(&amp;cache_lock, flags);
			
 
				-+        __object_put(obj);
			
 
				-+        spin_unlock_irqrestore(&amp;cache_lock, flags);
			
 
				-+}
			
 
				-+
			
 
				-+void object_get(struct object *obj)
			
 
				-+{
			
 
				-+        unsigned long flags;
			
 
				-+
			
 
				-+        spin_lock_irqsave(&amp;cache_lock, flags);
			
 
				-+        __object_get(obj);
			
 
				-+        spin_unlock_irqrestore(&amp;cache_lock, flags);
			
 
				-+}
			
 
				-+
			
 
				- /* Must be holding cache_lock */
			
 
				- static struct object *__cache_find(int id)
			
 
				- {
			
 
				-@@ -35,6 +65,7 @@
			
 
				- {
			
 
				-         BUG_ON(!obj);
			
 
				-         list_del(&amp;obj-&gt;list);
			
 
				-+        __object_put(obj);
			
 
				-         cache_num--;
			
 
				- }
			
 
				-
			
 
				-@@ -63,6 +94,7 @@
			
 
				-         strlcpy(obj-&gt;name, name, sizeof(obj-&gt;name));
			
 
				-         obj-&gt;id = id;
			
 
				-         obj-&gt;popularity = 0;
			
 
				-+        obj-&gt;refcnt = 1; /* The cache holds a reference */
			
 
				-
			
 
				-         spin_lock_irqsave(&amp;cache_lock, flags);
			
 
				-         __cache_add(obj);
			
 
				-@@ -79,18 +111,15 @@
			
 
				-         spin_unlock_irqrestore(&amp;cache_lock, flags);
			
 
				- }
			
 
				-
			
 
				--int cache_find(int id, char *name)
			
 
				-+struct object *cache_find(int id)
			
 
				- {
			
 
				-         struct object *obj;
			
 
				--        int ret = -ENOENT;
			
 
				-         unsigned long flags;
			
 
				-
			
 
				-         spin_lock_irqsave(&amp;cache_lock, flags);
			
 
				-         obj = __cache_find(id);
			
 
				--        if (obj) {
			
 
				--                ret = 0;
			
 
				--                strcpy(name, obj-&gt;name);
			
 
				--        }
			
 
				-+        if (obj)
			
 
				-+                __object_get(obj);
			
 
				-         spin_unlock_irqrestore(&amp;cache_lock, flags);
			
 
				--        return ret;
			
 
				-+        return obj;
			
 
				- }
			
 
				-</programlisting>
			
 
				-
			
 
				-<para>
			
 
				-We encapsulate the reference counting in the standard 'get' and 'put'
			
 
				-functions.  Now we can return the object itself from
			
 
				-<function>cache_find</function> which has the advantage that the user
			
 
				-can now sleep holding the object (eg. to
			
 
				-<function>copy_to_user</function> the name to userspace).
			
 
				-</para>
			
 
				-<para>
			
 
				-The other point to note is that I said a reference should be held for
			
 
				-every pointer to the object: thus the reference count is 1 when first
			
 
				-inserted into the cache.  In some versions the framework does not hold
			
 
				-a reference count, but they are more complicated.
			
 
				-</para>
			
 
				-
			
 
				-   <sect2 id="examples-refcnt-atomic">
			
 
				-    <title>Using Atomic Operations For The Reference Count</title>
			
 
				-<para>
			
 
				-In practice, <type>atomic_t</type> would usually be used for
			
 
				-<structfield>refcnt</structfield>.  There are a number of atomic
			
 
				-operations defined in
			
 
				-
			
 
				-<filename class="headerfile">include/asm/atomic.h</filename>: these are
			
 
				-guaranteed to be seen atomically from all CPUs in the system, so no
			
 
				-lock is required.  In this case, it is simpler than using spinlocks,
			
 
				-although for anything non-trivial using spinlocks is clearer.  The
			
 
				-<function>atomic_inc</function> and
			
 
				-<function>atomic_dec_and_test</function> are used instead of the
			
 
				-standard increment and decrement operators, and the lock is no longer
			
 
				-used to protect the reference count itself.
			
 
				-</para>
			
 
				-
			
 
				-<programlisting>
			
 
				---- cache.c.refcnt	2003-12-09 15:00:35.000000000 +1100
			
 
				-+++ cache.c.refcnt-atomic	2003-12-11 15:49:42.000000000 +1100
			
 
				-@@ -7,7 +7,7 @@
			
 
				- struct object
			
 
				- {
			
 
				-         struct list_head list;
			
 
				--        unsigned int refcnt;
			
 
				-+        atomic_t refcnt;
			
 
				-         int id;
			
 
				-         char name[32];
			
 
				-         int popularity;
			
 
				-@@ -18,33 +18,15 @@
			
 
				- static unsigned int cache_num = 0;
			
 
				- #define MAX_CACHE_SIZE 10
			
 
				-
			
 
				--static void __object_put(struct object *obj)
			
 
				--{
			
 
				--        if (--obj-&gt;refcnt == 0)
			
 
				--                kfree(obj);
			
 
				--}
			
 
				--
			
 
				--static void __object_get(struct object *obj)
			
 
				--{
			
 
				--        obj-&gt;refcnt++;
			
 
				--}
			
 
				--
			
 
				- void object_put(struct object *obj)
			
 
				- {
			
 
				--        unsigned long flags;
			
 
				--
			
 
				--        spin_lock_irqsave(&amp;cache_lock, flags);
			
 
				--        __object_put(obj);
			
 
				--        spin_unlock_irqrestore(&amp;cache_lock, flags);
			
 
				-+        if (atomic_dec_and_test(&amp;obj-&gt;refcnt))
			
 
				-+                kfree(obj);
			
 
				- }
			
 
				-
			
 
				- void object_get(struct object *obj)
			
 
				- {
			
 
				--        unsigned long flags;
			
 
				--
			
 
				--        spin_lock_irqsave(&amp;cache_lock, flags);
			
 
				--        __object_get(obj);
			
 
				--        spin_unlock_irqrestore(&amp;cache_lock, flags);
			
 
				-+        atomic_inc(&amp;obj-&gt;refcnt);
			
 
				- }
			
 
				-
			
 
				- /* Must be holding cache_lock */
			
 
				-@@ -65,7 +47,7 @@
			
 
				- {
			
 
				-         BUG_ON(!obj);
			
 
				-         list_del(&amp;obj-&gt;list);
			
 
				--        __object_put(obj);
			
 
				-+        object_put(obj);
			
 
				-         cache_num--;
			
 
				- }
			
 
				-
			
 
				-@@ -94,7 +76,7 @@
			
 
				-         strlcpy(obj-&gt;name, name, sizeof(obj-&gt;name));
			
 
				-         obj-&gt;id = id;
			
 
				-         obj-&gt;popularity = 0;
			
 
				--        obj-&gt;refcnt = 1; /* The cache holds a reference */
			
 
				-+        atomic_set(&amp;obj-&gt;refcnt, 1); /* The cache holds a reference */
			
 
				-
			
 
				-         spin_lock_irqsave(&amp;cache_lock, flags);
			
 
				-         __cache_add(obj);
			
 
				-@@ -119,7 +101,7 @@
			
 
				-         spin_lock_irqsave(&amp;cache_lock, flags);
			
 
				-         obj = __cache_find(id);
			
 
				-         if (obj)
			
 
				--                __object_get(obj);
			
 
				-+                object_get(obj);
			
 
				-         spin_unlock_irqrestore(&amp;cache_lock, flags);
			
 
				-         return obj;
			
 
				- }
			
 
				-</programlisting>
			
 
				-</sect2>
			
 
				-</sect1>
			
 
				-
			
 
				-   <sect1 id="examples-lock-per-obj">
			
 
				-    <title>Protecting The Objects Themselves</title>
			
 
				-    <para>
			
 
				-In these examples, we assumed that the objects (except the reference
			
 
				-counts) never changed once they are created.  If we wanted to allow
			
 
				-the name to change, there are three possibilities:
			
 
				-    </para>
			
 
				-    <itemizedlist>
			
 
				-      <listitem>
			
 
				-	<para>
			
 
				-You can make <symbol>cache_lock</symbol> non-static, and tell people
			
 
				-to grab that lock before changing the name in any object.
			
 
				-        </para>
			
 
				-      </listitem>
			
 
				-      <listitem>
			
 
				-        <para>
			
 
				-You can provide a <function>cache_obj_rename</function> which grabs
			
 
				-this lock and changes the name for the caller, and tell everyone to
			
 
				-use that function.
			
 
				-        </para>
			
 
				-      </listitem>
			
 
				-      <listitem>
			
 
				-        <para>
			
 
				-You can make the <symbol>cache_lock</symbol> protect only the cache
			
 
				-itself, and use another lock to protect the name.
			
 
				-        </para>
			
 
				-      </listitem>
			
 
				-    </itemizedlist>
			
 
				-
			
 
				-      <para>
			
 
				-Theoretically, you can make the locks as fine-grained as one lock for
			
 
				-every field, for every object.  In practice, the most common variants
			
 
				-are:
			
 
				-</para>
			
 
				-    <itemizedlist>
			
 
				-      <listitem>
			
 
				-	<para>
			
 
				-One lock which protects the infrastructure (the <symbol>cache</symbol>
			
 
				-list in this example) and all the objects.  This is what we have done
			
 
				-so far.
			
 
				-	</para>
			
 
				-      </listitem>
			
 
				-      <listitem>
			
 
				-        <para>
			
 
				-One lock which protects the infrastructure (including the list
			
 
				-pointers inside the objects), and one lock inside the object which
			
 
				-protects the rest of that object.
			
 
				-        </para>
			
 
				-      </listitem>
			
 
				-      <listitem>
			
 
				-        <para>
			
 
				-Multiple locks to protect the infrastructure (eg. one lock per hash
			
 
				-chain), possibly with a separate per-object lock.
			
 
				-        </para>
			
 
				-      </listitem>
			
 
				-    </itemizedlist>
			
 
				-
			
 
				-<para>
			
 
				-Here is the "lock-per-object" implementation:
			
 
				-</para>
			
 
				-<programlisting>
			
 
				---- cache.c.refcnt-atomic	2003-12-11 15:50:54.000000000 +1100
			
 
				-+++ cache.c.perobjectlock	2003-12-11 17:15:03.000000000 +1100
			
 
				-@@ -6,11 +6,17 @@
			
 
				-
			
 
				- struct object
			
 
				- {
			
 
				-+        /* These two protected by cache_lock. */
			
 
				-         struct list_head list;
			
 
				-+        int popularity;
			
 
				-+
			
 
				-         atomic_t refcnt;
			
 
				-+
			
 
				-+        /* Doesn't change once created. */
			
 
				-         int id;
			
 
				-+
			
 
				-+        spinlock_t lock; /* Protects the name */
			
 
				-         char name[32];
			
 
				--        int popularity;
			
 
				- };
			
 
				-
			
 
				- static DEFINE_SPINLOCK(cache_lock);
			
 
				-@@ -77,6 +84,7 @@
			
 
				-         obj-&gt;id = id;
			
 
				-         obj-&gt;popularity = 0;
			
 
				-         atomic_set(&amp;obj-&gt;refcnt, 1); /* The cache holds a reference */
			
 
				-+        spin_lock_init(&amp;obj-&gt;lock);
			
 
				-
			
 
				-         spin_lock_irqsave(&amp;cache_lock, flags);
			
 
				-         __cache_add(obj);
			
 
				-</programlisting>
			
 
				-
			
 
				-<para>
			
 
				-Note that I decided that the <structfield>popularity</structfield>
			
 
				-count should be protected by the <symbol>cache_lock</symbol> rather
			
 
				-than the per-object lock: this is because it (like the
			
 
				-<structname>struct list_head</structname> inside the object) is
			
 
				-logically part of the infrastructure.  This way, I don't need to grab
			
 
				-the lock of every object in <function>__cache_add</function> when
			
 
				-seeking the least popular.
			
 
				-</para>
			
 
				-
			
 
				-<para>
			
 
				-I also decided that the <structfield>id</structfield> member is
			
 
				-unchangeable, so I don't need to grab each object lock in
			
 
				-<function>__cache_find()</function> to examine the
			
 
				-<structfield>id</structfield>: the object lock is only used by a
			
 
				-caller who wants to read or write the <structfield>name</structfield>
			
 
				-field.
			
 
				-</para>
			
 
				-
			
 
				-<para>
			
 
				-Note also that I added a comment describing what data was protected by
			
 
				-which locks.  This is extremely important, as it describes the runtime
			
 
				-behavior of the code, and can be hard to gain from just reading.  And
			
 
				-as Alan Cox says, <quote>Lock data, not code</quote>.
			
 
				-</para>
			
 
				-</sect1>
			
 
				-</chapter>
			
 
				-
			
 
				-   <chapter id="common-problems">
			
 
				-    <title>Common Problems</title>
			
 
				-    <sect1 id="deadlock">
			
 
				-    <title>Deadlock: Simple and Advanced</title>
			
 
				-
			
 
				-    <para>
			
 
				-      There is a coding bug where a piece of code tries to grab a
			
 
				-      spinlock twice: it will spin forever, waiting for the lock to
			
 
				-      be released (spinlocks, rwlocks and mutexes are not
			
 
				-      recursive in Linux).  This is trivial to diagnose: not a
			
 
				-      stay-up-five-nights-talk-to-fluffy-code-bunnies kind of
			
 
				-      problem.
			
 
				-    </para>
			
 
				-
			
 
				-    <para>
			
 
				-      For a slightly more complex case, imagine you have a region
			
 
				-      shared by a softirq and user context.  If you use a
			
 
				-      <function>spin_lock()</function> call to protect it, it is 
			
 
				-      possible that the user context will be interrupted by the softirq
			
 
				-      while it holds the lock, and the softirq will then spin
			
 
				-      forever trying to get the same lock.
			
 
				-    </para>
			
 
				-
			
 
				-    <para>
			
 
				-      Both of these are called deadlock, and as shown above, it can
			
 
				-      occur even with a single CPU (although not on UP compiles,
			
 
				-      since spinlocks vanish on kernel compiles with 
			
 
				-      <symbol>CONFIG_SMP</symbol>=n. You'll still get data corruption 
			
 
				-      in the second example).
			
 
				-    </para>
			
 
				-
			
 
				-    <para>
			
 
				-      This complete lockup is easy to diagnose: on SMP boxes the
			
 
				-      watchdog timer or compiling with <symbol>DEBUG_SPINLOCK</symbol> set
			
 
				-      (<filename>include/linux/spinlock.h</filename>) will show this up 
			
 
				-      immediately when it happens.
			
 
				-    </para>
			
 
				-
			
 
				-    <para>
			
 
				-      A more complex problem is the so-called 'deadly embrace',
			
 
				-      involving two or more locks.  Say you have a hash table: each
			
 
				-      entry in the table is a spinlock, and a chain of hashed
			
 
				-      objects.  Inside a softirq handler, you sometimes want to
			
 
				-      alter an object from one place in the hash to another: you
			
 
				-      grab the spinlock of the old hash chain and the spinlock of
			
 
				-      the new hash chain, and delete the object from the old one,
			
 
				-      and insert it in the new one.
			
 
				-    </para>
			
 
				-
			
 
				-    <para>
			
 
				-      There are two problems here.  First, if your code ever
			
 
				-      tries to move the object to the same chain, it will deadlock
			
 
				-      with itself as it tries to lock it twice.  Secondly, if the
			
 
				-      same softirq on another CPU is trying to move another object
			
 
				-      in the reverse direction, the following could happen:
			
 
				-    </para>
			
 
				-
			
 
				-    <table>
			
 
				-     <title>Consequences</title>
			
 
				-
			
 
				-     <tgroup cols="2" align="left">
			
 
				-
			
 
				-      <thead>
			
 
				-       <row>
			
 
				-        <entry>CPU 1</entry>
			
 
				-        <entry>CPU 2</entry>
			
 
				-       </row>
			
 
				-      </thead>
			
 
				-
			
 
				-      <tbody>
			
 
				-       <row>
			
 
				-        <entry>Grab lock A -&gt; OK</entry>
			
 
				-        <entry>Grab lock B -&gt; OK</entry>
			
 
				-       </row>
			
 
				-       <row>
			
 
				-        <entry>Grab lock B -&gt; spin</entry>
			
 
				-        <entry>Grab lock A -&gt; spin</entry>
			
 
				-       </row>
			
 
				-      </tbody>
			
 
				-     </tgroup>
			
 
				-    </table>
			
 
				-
			
 
				-    <para>
			
 
				-      The two CPUs will spin forever, waiting for the other to give up
			
 
				-      their lock.  It will look, smell, and feel like a crash.
			
 
				-    </para>
			
 
				-    </sect1>
			
 
				-
			
 
				-    <sect1 id="techs-deadlock-prevent">
			
 
				-     <title>Preventing Deadlock</title>
			
 
				-
			
 
				-     <para>
			
 
				-       Textbooks will tell you that if you always lock in the same
			
 
				-       order, you will never get this kind of deadlock.  Practice
			
 
				-       will tell you that this approach doesn't scale: when I
			
 
				-       create a new lock, I don't understand enough of the kernel
			
 
				-       to figure out where in the 5000 lock hierarchy it will fit.
			
 
				-     </para>
			
 
				-
			
 
				-     <para>
			
 
				-       The best locks are encapsulated: they never get exposed in
			
 
				-       headers, and are never held around calls to non-trivial
			
 
				-       functions outside the same file.  You can read through this
			
 
				-       code and see that it will never deadlock, because it never
			
 
				-       tries to grab another lock while it has that one.  People
			
 
				-       using your code don't even need to know you are using a
			
 
				-       lock.
			
 
				-     </para>
			
 
				-
			
 
				-     <para>
			
 
				-       A classic problem here is when you provide callbacks or
			
 
				-       hooks: if you call these with the lock held, you risk simple
			
 
				-       deadlock, or a deadly embrace (who knows what the callback
			
 
				-       will do?).  Remember, the other programmers are out to get
			
 
				-       you, so don't do this.
			
 
				-     </para>
			
 
				-
			
 
				-    <sect2 id="techs-deadlock-overprevent">
			
 
				-     <title>Overzealous Prevention Of Deadlocks</title>
			
 
				-
			
 
				-     <para>
			
 
				-       Deadlocks are problematic, but not as bad as data
			
 
				-       corruption.  Code which grabs a read lock, searches a list,
			
 
				-       fails to find what it wants, drops the read lock, grabs a
			
 
				-       write lock and inserts the object has a race condition.
			
 
				-     </para>
			
 
				-
			
 
				-     <para>
			
 
				-       If you don't see why, please stay the fuck away from my code.
			
 
				-     </para>
			
 
				-    </sect2>
			
 
				-    </sect1>
			
 
				-
			
 
				-   <sect1 id="racing-timers">
			
 
				-    <title>Racing Timers: A Kernel Pastime</title>
			
 
				-
			
 
				-    <para>
			
 
				-      Timers can produce their own special problems with races.
			
 
				-      Consider a collection of objects (list, hash, etc) where each
			
 
				-      object has a timer which is due to destroy it.
			
 
				-    </para>
			
 
				-
			
 
				-    <para>
			
 
				-      If you want to destroy the entire collection (say on module
			
 
				-      removal), you might do the following:
			
 
				-    </para>
			
 
				-
			
 
				-    <programlisting>
			
 
				-        /* THIS CODE BAD BAD BAD BAD: IF IT WAS ANY WORSE IT WOULD USE
			
 
				-           HUNGARIAN NOTATION */
			
 
				-        spin_lock_bh(&amp;list_lock);
			
 
				-
			
 
				-        while (list) {
			
 
				-                struct foo *next = list-&gt;next;
			
 
				-                del_timer(&amp;list-&gt;timer);
			
 
				-                kfree(list);
			
 
				-                list = next;
			
 
				-        }
			
 
				-
			
 
				-        spin_unlock_bh(&amp;list_lock);
			
 
				-    </programlisting>
			
 
				-
			
 
				-    <para>
			
 
				-      Sooner or later, this will crash on SMP, because a timer can
			
 
				-      have just gone off before the <function>spin_lock_bh()</function>,
			
 
				-      and it will only get the lock after we
			
 
				-      <function>spin_unlock_bh()</function>, and then try to free
			
 
				-      the element (which has already been freed!).
			
 
				-    </para>
			
 
				-
			
 
				-    <para>
			
 
				-      This can be avoided by checking the result of
			
 
				-      <function>del_timer()</function>: if it returns
			
 
				-      <returnvalue>1</returnvalue>, the timer has been deleted.
			
 
				-      If <returnvalue>0</returnvalue>, it means (in this
			
 
				-      case) that it is currently running, so we can do:
			
 
				-    </para>
			
 
				-
			
 
				-    <programlisting>
			
 
				-        retry:
			
 
				-                spin_lock_bh(&amp;list_lock);
			
 
				-
			
 
				-                while (list) {
			
 
				-                        struct foo *next = list-&gt;next;
			
 
				-                        if (!del_timer(&amp;list-&gt;timer)) {
			
 
				-                                /* Give timer a chance to delete this */
			
 
				-                                spin_unlock_bh(&amp;list_lock);
			
 
				-                                goto retry;
			
 
				-                        }
			
 
				-                        kfree(list);
			
 
				-                        list = next;
			
 
				-                }
			
 
				-
			
 
				-                spin_unlock_bh(&amp;list_lock);
			
 
				-    </programlisting>
			
 
				-
			
 
				-    <para>
			
 
				-      Another common problem is deleting timers which restart
			
 
				-      themselves (by calling <function>add_timer()</function> at the end
			
 
				-      of their timer function).  Because this is a fairly common case
			
 
				-      which is prone to races, you should use <function>del_timer_sync()</function>
			
 
				-      (<filename class="headerfile">include/linux/timer.h</filename>)
			
 
				-      to handle this case.  It returns the number of times the timer
			
 
				-      had to be deleted before we finally stopped it from adding itself back
			
 
				-      in.
			
 
				-    </para>
			
 
				-   </sect1>
			
 
				-
			
 
				-  </chapter>
			
 
				-
			
 
				- <chapter id="Efficiency">
			
 
				-    <title>Locking Speed</title>
			
 
				-
			
 
				-    <para>
			
 
				-There are three main things to worry about when considering speed of
			
 
				-some code which does locking.  First is concurrency: how many things
			
 
				-are going to be waiting while someone else is holding a lock.  Second
			
 
				-is the time taken to actually acquire and release an uncontended lock.
			
 
				-Third is using fewer, or smarter locks.  I'm assuming that the lock is
			
 
				-used fairly often: otherwise, you wouldn't be concerned about
			
 
				-efficiency.
			
 
				-</para>
			
 
				-    <para>
			
 
				-Concurrency depends on how long the lock is usually held: you should
			
 
				-hold the lock for as long as needed, but no longer.  In the cache
			
 
				-example, we always create the object without the lock held, and then
			
 
				-grab the lock only when we are ready to insert it in the list.
			
 
				-</para>
			
 
				-    <para>
			
 
				-Acquisition times depend on how much damage the lock operations do to
			
 
				-the pipeline (pipeline stalls) and how likely it is that this CPU was
			
 
				-the last one to grab the lock (ie. is the lock cache-hot for this
			
 
				-CPU): on a machine with more CPUs, this likelihood drops fast.
			
 
				-Consider a 700MHz Intel Pentium III: an instruction takes about 0.7ns,
			
 
				-an atomic increment takes about 58ns, a lock which is cache-hot on
			
 
				-this CPU takes 160ns, and a cacheline transfer from another CPU takes
			
 
				-an additional 170 to 360ns.  (These figures are from Paul McKenney's
			
 
				-<ulink url="http://www.linuxjournal.com/article.php?sid=6993"> Linux
			
 
				-Journal RCU article</ulink>).
			
 
				-</para>
			
 
				-    <para>
			
 
				-These two aims conflict: holding a lock for a short time might be done
			
 
				-by splitting locks into parts (such as in our final per-object-lock
			
 
				-example), but this increases the number of lock acquisitions, and the
			
 
				-results are often slower than having a single lock.  This is another
			
 
				-reason to advocate locking simplicity.
			
 
				-</para>
			
 
				-    <para>
			
 
				-The third concern is addressed below: there are some methods to reduce
			
 
				-the amount of locking which needs to be done.
			
 
				-</para>
			
 
				-
			
 
				-  <sect1 id="efficiency-rwlocks">
			
 
				-   <title>Read/Write Lock Variants</title>
			
 
				-
			
 
				-   <para>
			
 
				-      Both spinlocks and mutexes have read/write variants:
			
 
				-      <type>rwlock_t</type> and <structname>struct rw_semaphore</structname>.
			
 
				-      These divide users into two classes: the readers and the writers.  If
			
 
				-      you are only reading the data, you can get a read lock, but to write to
			
 
				-      the data you need the write lock.  Many people can hold a read lock,
			
 
				-      but a writer must be sole holder.
			
 
				-    </para>
			
 
				-
			
 
				-   <para>
			
 
				-      If your code divides neatly along reader/writer lines (as our
			
 
				-      cache code does), and the lock is held by readers for
			
 
				-      significant lengths of time, using these locks can help.  They
			
 
				-      are slightly slower than the normal locks though, so in practice
			
 
				-      <type>rwlock_t</type> is not usually worthwhile.
			
 
				-    </para>
			
 
				-   </sect1>
			
 
				-
			
 
				-   <sect1 id="efficiency-read-copy-update">
			
 
				-    <title>Avoiding Locks: Read Copy Update</title>
			
 
				-
			
 
				-    <para>
			
 
				-      There is a special method of read/write locking called Read Copy
			
 
				-      Update.  Using RCU, the readers can avoid taking a lock
			
 
				-      altogether: as we expect our cache to be read more often than
			
 
				-      updated (otherwise the cache is a waste of time), it is a
			
 
				-      candidate for this optimization.
			
 
				-    </para>
			
 
				-
			
 
				-    <para>
			
 
				-      How do we get rid of read locks?  Getting rid of read locks
			
 
				-      means that writers may be changing the list underneath the
			
 
				-      readers.  That is actually quite simple: we can read a linked
			
 
				-      list while an element is being added if the writer adds the
			
 
				-      element very carefully.  For example, adding
			
 
				-      <symbol>new</symbol> to a single linked list called
			
 
				-      <symbol>list</symbol>:
			
 
				-    </para>
			
 
				-
			
 
				-    <programlisting>
			
 
				-        new-&gt;next = list-&gt;next;
			
 
				-        wmb();
			
 
				-        list-&gt;next = new;
			
 
				-    </programlisting>
			
 
				-
			
 
				-    <para>
			
 
				-      The <function>wmb()</function> is a write memory barrier.  It
			
 
				-      ensures that the first operation (setting the new element's
			
 
				-      <symbol>next</symbol> pointer) is complete and will be seen by
			
 
				-      all CPUs, before the second operation is (putting the new
			
 
				-      element into the list).  This is important, since modern
			
 
				-      compilers and modern CPUs can both reorder instructions unless
			
 
				-      told otherwise: we want a reader to either not see the new
			
 
				-      element at all, or see the new element with the
			
 
				-      <symbol>next</symbol> pointer correctly pointing at the rest of
			
 
				-      the list.
			
 
				-    </para>
			
 
				-    <para>
			
 
				-      Fortunately, there is a function to do this for standard
			
 
				-      <structname>struct list_head</structname> lists:
			
 
				-      <function>list_add_rcu()</function>
			
 
				-      (<filename>include/linux/list.h</filename>).
			
 
				-    </para>
			
 
				-    <para>
			
 
				-      Removing an element from the list is even simpler: we replace
			
 
				-      the pointer to the old element with a pointer to its successor,
			
 
				-      and readers will either see it, or skip over it.
			
 
				-    </para>
			
 
				-    <programlisting>
			
 
				-        list-&gt;next = old-&gt;next;
			
 
				-    </programlisting>
			
 
				-    <para>
			
 
				-      There is <function>list_del_rcu()</function>
			
 
				-      (<filename>include/linux/list.h</filename>) which does this (the
			
 
				-      normal version poisons the old object, which we don't want).
			
 
				-    </para>
			
 
				-    <para>
			
 
				-      The reader must also be careful: some CPUs can look through the
			
 
				-      <symbol>next</symbol> pointer to start reading the contents of
			
 
				-      the next element early, but don't realize that the pre-fetched
			
 
				-      contents are wrong when the <symbol>next</symbol> pointer changes
			
 
				-      underneath them.  Once again, there is a
			
 
				-      <function>list_for_each_entry_rcu()</function>
			
 
				-      (<filename>include/linux/list.h</filename>) to help you.  Of
			
 
				-      course, writers can just use
			
 
				-      <function>list_for_each_entry()</function>, since there cannot
			
 
				-      be two simultaneous writers.
			
 
				-    </para>
			
 
				-    <para>
			
 
				-      Our final dilemma is this: when can we actually destroy the
			
 
				-      removed element?  Remember, a reader might be stepping through
			
 
				-      this element in the list right now: if we free this element and
			
 
				-      the <symbol>next</symbol> pointer changes, the reader will jump
			
 
				-      off into garbage and crash.  We need to wait until we know that
			
 
				-      all the readers who were traversing the list when we deleted the
			
 
				-      element are finished.  We use <function>call_rcu()</function> to
			
 
				-      register a callback which will actually destroy the object once
			
 
				-      all pre-existing readers are finished.  Alternatively,
			
 
				-      <function>synchronize_rcu()</function> may be used to block until
			
 
				-      all pre-existing readers are finished.
			
 
				-    </para>
			
 
				-    <para>
			
 
				-      But how does Read Copy Update know when the readers are
			
 
				-      finished?  The method is this: firstly, the readers always
			
 
				-      traverse the list inside
			
 
				-      <function>rcu_read_lock()</function>/<function>rcu_read_unlock()</function>
			
 
				-      pairs: these simply disable preemption so the reader won't go to
			
 
				-      sleep while reading the list.
			
 
				-    </para>
			
 
				-    <para>
			
 
				-      RCU then waits until every other CPU has slept at least once:
			
 
				-      since readers cannot sleep, we know that any readers which were
			
 
				-      traversing the list during the deletion are finished, and the
			
 
				-      callback is triggered.  The real Read Copy Update code is a
			
 
				-      little more optimized than this, but this is the fundamental
			
 
				-      idea.
			
 
				-    </para>
			
 
				-
			
 
				-<programlisting>
			
 
				---- cache.c.perobjectlock	2003-12-11 17:15:03.000000000 +1100
			
 
				-+++ cache.c.rcupdate	2003-12-11 17:55:14.000000000 +1100
			
 
				-@@ -1,15 +1,18 @@
			
 
				- #include &lt;linux/list.h&gt;
			
 
				- #include &lt;linux/slab.h&gt;
			
 
				- #include &lt;linux/string.h&gt;
			
 
				-+#include &lt;linux/rcupdate.h&gt;
			
 
				- #include &lt;linux/mutex.h&gt;
			
 
				- #include &lt;asm/errno.h&gt;
			
 
				-
			
 
				- struct object
			
 
				- {
			
 
				--        /* These two protected by cache_lock. */
			
 
				-+        /* This is protected by RCU */
			
 
				-         struct list_head list;
			
 
				-         int popularity;
			
 
				-
			
 
				-+        struct rcu_head rcu;
			
 
				-+
			
 
				-         atomic_t refcnt;
			
 
				-
			
 
				-         /* Doesn't change once created. */
			
 
				-@@ -40,7 +43,7 @@
			
 
				- {
			
 
				-         struct object *i;
			
 
				-
			
 
				--        list_for_each_entry(i, &amp;cache, list) {
			
 
				-+        list_for_each_entry_rcu(i, &amp;cache, list) {
			
 
				-                 if (i-&gt;id == id) {
			
 
				-                         i-&gt;popularity++;
			
 
				-                         return i;
			
 
				-@@ -49,19 +52,25 @@
			
 
				-         return NULL;
			
 
				- }
			
 
				-
			
 
				-+/* Final discard done once we know no readers are looking. */
			
 
				-+static void cache_delete_rcu(void *arg)
			
 
				-+{
			
 
				-+        object_put(arg);
			
 
				-+}
			
 
				-+
			
 
				- /* Must be holding cache_lock */
			
 
				- static void __cache_delete(struct object *obj)
			
 
				- {
			
 
				-         BUG_ON(!obj);
			
 
				--        list_del(&amp;obj-&gt;list);
			
 
				--        object_put(obj);
			
 
				-+        list_del_rcu(&amp;obj-&gt;list);
			
 
				-         cache_num--;
			
 
				-+        call_rcu(&amp;obj-&gt;rcu, cache_delete_rcu);
			
 
				- }
			
 
				-
			
 
				- /* Must be holding cache_lock */
			
 
				- static void __cache_add(struct object *obj)
			
 
				- {
			
 
				--        list_add(&amp;obj-&gt;list, &amp;cache);
			
 
				-+        list_add_rcu(&amp;obj-&gt;list, &amp;cache);
			
 
				-         if (++cache_num > MAX_CACHE_SIZE) {
			
 
				-                 struct object *i, *outcast = NULL;
			
 
				-                 list_for_each_entry(i, &amp;cache, list) {
			
 
				-@@ -104,12 +114,11 @@
			
 
				- struct object *cache_find(int id)
			
 
				- {
			
 
				-         struct object *obj;
			
 
				--        unsigned long flags;
			
 
				-
			
 
				--        spin_lock_irqsave(&amp;cache_lock, flags);
			
 
				-+        rcu_read_lock();
			
 
				-         obj = __cache_find(id);
			
 
				-         if (obj)
			
 
				-                 object_get(obj);
			
 
				--        spin_unlock_irqrestore(&amp;cache_lock, flags);
			
 
				-+        rcu_read_unlock();
			
 
				-         return obj;
			
 
				- }
			
 
				-</programlisting>
			
 
				-
			
 
				-<para>
			
 
				-Note that the reader will alter the
			
 
				-<structfield>popularity</structfield> member in
			
 
				-<function>__cache_find()</function>, and now it doesn't hold a lock.
			
 
				-One solution would be to make it an <type>atomic_t</type>, but for
			
 
				-this usage, we don't really care about races: an approximate result is
			
 
				-good enough, so I didn't change it.
			
 
				-</para>
			
 
				-
			
 
				-<para>
			
 
				-The result is that <function>cache_find()</function> requires no
			
 
				-synchronization with any other functions, so is almost as fast on SMP
			
 
				-as it would be on UP.
			
 
				-</para>
			
 
				-
			
 
				-<para>
			
 
				-There is a further optimization possible here: remember our original
			
 
				-cache code, where there were no reference counts and the caller simply
			
 
				-held the lock whenever using the object?  This is still possible: if
			
 
				-you hold the lock, no one can delete the object, so you don't need to
			
 
				-get and put the reference count.
			
 
				-</para>
			
 
				-
			
 
				-<para>
			
 
				-Now, because the 'read lock' in RCU is simply disabling preemption, a
			
 
				-caller which always has preemption disabled between calling
			
 
				-<function>cache_find()</function> and
			
 
				-<function>object_put()</function> does not need to actually get and
			
 
				-put the reference count: we could expose
			
 
				-<function>__cache_find()</function> by making it non-static, and
			
 
				-such callers could simply call that.
			
 
				-</para>
			
 
				-<para>
			
 
				-The benefit here is that the reference count is not written to: the
			
 
				-object is not altered in any way, which is much faster on SMP
			
 
				-machines due to caching.
			
 
				-</para>
			
 
				-  </sect1>
			
 
				-
			
 
				-   <sect1 id="per-cpu">
			
 
				-    <title>Per-CPU Data</title>
			
 
				-
			
 
				-    <para>
			
 
				-      Another technique for avoiding locking which is used fairly
			
 
				-      widely is to duplicate information for each CPU.  For example,
			
 
				-      if you wanted to keep a count of a common condition, you could
			
 
				-      use a spin lock and a single counter.  Nice and simple.
			
 
				-    </para>
			
 
				-
			
 
				-    <para>
			
 
				-      If that was too slow (it's usually not, but if you've got a
			
 
				-      really big machine to test on and can show that it is), you
			
 
				-      could instead use a counter for each CPU, then none of them need
			
 
				-      an exclusive lock.  See <function>DEFINE_PER_CPU()</function>,
			
 
				-      <function>get_cpu_var()</function> and
			
 
				-      <function>put_cpu_var()</function>
			
 
				-      (<filename class="headerfile">include/linux/percpu.h</filename>).
			
 
				-    </para>
			
 
				-
			
 
				-    <para>
			
 
				-      Of particular use for simple per-cpu counters is the
			
 
				-      <type>local_t</type> type, and the
			
 
				-      <function>cpu_local_inc()</function> and related functions,
			
 
				-      which are more efficient than simple code on some architectures
			
 
				-      (<filename class="headerfile">include/asm/local.h</filename>).
			
 
				-    </para>
			
 
				-
			
 
				-    <para>
			
 
				-      Note that there is no simple, reliable way of getting an exact
			
 
				-      value of such a counter, without introducing more locks.  This
			
 
				-      is not a problem for some uses.
			
 
				-    </para>
			
 
				-   </sect1>
			
 
				-
			
 
				-   <sect1 id="mostly-hardirq">
			
 
				-    <title>Data Which Is Mostly Used By An IRQ Handler</title>
			
 
				-
			
 
				-    <para>
			
 
				-      If data is always accessed from within the same IRQ handler, you
			
 
				-      don't need a lock at all: the kernel already guarantees that the
			
 
				-      irq handler will not run simultaneously on multiple CPUs.
			
 
				-    </para>
			
 
				-    <para>
			
 
				-      Manfred Spraul points out that you can still do this, even if
			
 
				-      the data is very occasionally accessed in user context or
			
 
				-      softirqs/tasklets.  The irq handler doesn't use a lock, and
			
 
				-      all other accesses are done as so:
			
 
				-    </para>
			
 
				-
			
 
				-<programlisting>
			
 
				-	spin_lock(&amp;lock);
			
 
				-	disable_irq(irq);
			
 
				-	...
			
 
				-	enable_irq(irq);
			
 
				-	spin_unlock(&amp;lock);
			
 
				-</programlisting>
			
 
				-    <para>
			
 
				-      The <function>disable_irq()</function> prevents the irq handler
			
 
				-      from running (and waits for it to finish if it's currently
			
 
				-      running on other CPUs).  The spinlock prevents any other
			
 
				-      accesses happening at the same time.  Naturally, this is slower
			
 
				-      than just a <function>spin_lock_irq()</function> call, so it
			
 
				-      only makes sense if this type of access happens extremely
			
 
				-      rarely.
			
 
				-    </para>
			
 
				-   </sect1>
			
 
				-  </chapter>
			
 
				-
			
 
				- <chapter id="sleeping-things">
			
 
				-    <title>What Functions Are Safe To Call From Interrupts?</title>
			
 
				-
			
 
				-    <para>
			
 
				-      Many functions in the kernel sleep (ie. call schedule())
			
 
				-      directly or indirectly: you can never call them while holding a
			
 
				-      spinlock, or with preemption disabled.  This also means you need
			
 
				-      to be in user context: calling them from an interrupt is illegal.
			
 
				-    </para>
			
 
				-
			
 
				-   <sect1 id="sleeping">
			
 
				-    <title>Some Functions Which Sleep</title>
			
 
				-
			
 
				-    <para>
			
 
				-      The most common ones are listed below, but you usually have to
			
 
				-      read the code to find out if other calls are safe.  If everyone
			
 
				-      else who calls it can sleep, you probably need to be able to
			
 
				-      sleep, too.  In particular, registration and deregistration
			
 
				-      functions usually expect to be called from user context, and can
			
 
				-      sleep.
			
 
				-    </para>
			
 
				-
			
 
				-    <itemizedlist>
			
 
				-     <listitem>
			
 
				-      <para>
			
 
				-        Accesses to 
			
 
				-        <firstterm linkend="gloss-userspace">userspace</firstterm>:
			
 
				-      </para>
			
 
				-      <itemizedlist>
			
 
				-       <listitem>
			
 
				-        <para>
			
 
				-          <function>copy_from_user()</function>
			
 
				-        </para>
			
 
				-       </listitem>
			
 
				-       <listitem>
			
 
				-        <para>
			
 
				-          <function>copy_to_user()</function>
			
 
				-        </para>
			
 
				-       </listitem>
			
 
				-       <listitem>
			
 
				-        <para>
			
 
				-          <function>get_user()</function>
			
 
				-        </para>
			
 
				-       </listitem>
			
 
				-       <listitem>
			
 
				-        <para>
			
 
				-          <function>put_user()</function>
			
 
				-        </para>
			
 
				-       </listitem>
			
 
				-      </itemizedlist>
			
 
				-     </listitem>
			
 
				-
			
 
				-     <listitem>
			
 
				-      <para>
			
 
				-        <function>kmalloc(GFP_KERNEL)</function>
			
 
				-      </para>
			
 
				-     </listitem>
			
 
				-
			
 
				-     <listitem>
			
 
				-      <para>
			
 
				-      <function>mutex_lock_interruptible()</function> and
			
 
				-      <function>mutex_lock()</function>
			
 
				-      </para>
			
 
				-      <para>
			
 
				-       There is a <function>mutex_trylock()</function> which does not
			
 
				-       sleep.  Still, it must not be used inside interrupt context since
			
 
				-       its implementation is not safe for that.
			
 
				-       <function>mutex_unlock()</function> will also never sleep.
			
 
				-       It cannot be used in interrupt context either since a mutex
			
 
				-       must be released by the same task that acquired it.
			
 
				-      </para>
			
 
				-     </listitem>
			
 
				-    </itemizedlist>
			
 
				-   </sect1>
			
 
				-
			
 
				-   <sect1 id="dont-sleep">
			
 
				-    <title>Some Functions Which Don't Sleep</title>
			
 
				-
			
 
				-    <para>
			
 
				-     Some functions are safe to call from any context, or holding
			
 
				-     almost any lock.
			
 
				-    </para>
			
 
				-
			
 
				-    <itemizedlist>
			
 
				-     <listitem>
			
 
				-      <para>
			
 
				-	<function>printk()</function>
			
 
				-      </para>
			
 
				-     </listitem>
			
 
				-     <listitem>
			
 
				-      <para>
			
 
				-        <function>kfree()</function>
			
 
				-      </para>
			
 
				-     </listitem>
			
 
				-     <listitem>
			
 
				-      <para>
			
 
				-	<function>add_timer()</function> and <function>del_timer()</function>
			
 
				-      </para>
			
 
				-     </listitem>
			
 
				-    </itemizedlist>
			
 
				-   </sect1>
			
 
				-  </chapter>
			
 
				-
			
 
				-  <chapter id="apiref-mutex">
			
 
				-   <title>Mutex API reference</title>
			
 
				-!Iinclude/linux/mutex.h
			
 
				-!Ekernel/locking/mutex.c
			
 
				-  </chapter>
			
 
				-
			
 
				-  <chapter id="apiref-futex">
			
 
				-   <title>Futex API reference</title>
			
 
				-!Ikernel/futex.c
			
 
				-  </chapter>
			
 
				-
			
 
				-  <chapter id="references">
			
 
				-   <title>Further reading</title>
			
 
				-
			
 
				-   <itemizedlist>
			
 
				-    <listitem>
			
 
				-     <para>
			
 
				-       <filename>Documentation/locking/spinlocks.txt</filename>:
			
 
				-       Linus Torvalds' spinlocking tutorial in the kernel sources.
			
 
				-     </para>
			
 
				-    </listitem>
			
 
				-
			
 
				-    <listitem>
			
 
				-     <para>
			
 
				-       Unix Systems for Modern Architectures: Symmetric
			
 
				-       Multiprocessing and Caching for Kernel Programmers:
			
 
				-     </para>
			
 
				-
			
 
				-     <para>
			
 
				-       Curt Schimmel's very good introduction to kernel level
			
 
				-       locking (not written for Linux, but nearly everything
			
 
				-       applies).  The book is expensive, but really worth every
			
 
				-       penny to understand SMP locking. [ISBN: 0201633388]
			
 
				-     </para>
			
 
				-    </listitem>
			
 
				-   </itemizedlist>
			
 
				-  </chapter>
			
 
				-
			
 
				-  <chapter id="thanks">
			
 
				-    <title>Thanks</title>
			
 
				-
			
 
				-    <para>
			
 
				-      Thanks to Telsa Gwynne for DocBooking, neatening and adding
			
 
				-      style.
			
 
				-    </para>
			
 
				-
			
 
				-    <para>
			
 
				-      Thanks to Martin Pool, Philipp Rumpf, Stephen Rothwell, Paul
			
 
				-      Mackerras, Ruedi Aschwanden, Alan Cox, Manfred Spraul, Tim
			
 
				-      Waugh, Pete Zaitcev, James Morris, Robert Love, Paul McKenney,
			
 
				-      John Ashby for proofreading, correcting, flaming, commenting.
			
 
				-    </para>
			
 
				-
			
 
				-    <para>
			
 
				-      Thanks to the cabal for having no influence on this document.
			
 
				-    </para>
			
 
				-  </chapter>
			
 
				-
			
 
				-  <glossary id="glossary">
			
 
				-   <title>Glossary</title>
			
 
				-
			
 
				-   <glossentry id="gloss-preemption">
			
 
				-    <glossterm>preemption</glossterm>
			
 
				-     <glossdef>
			
 
				-      <para>
			
 
				-        Prior to 2.5, or when <symbol>CONFIG_PREEMPT</symbol> is
			
 
				-        unset, processes in user context inside the kernel would not
			
 
				-        preempt each other (ie. you had that CPU until you gave it up,
			
 
				-        except for interrupts).  With the addition of
			
 
				-        <symbol>CONFIG_PREEMPT</symbol> in 2.5.4, this changed: when
			
 
				-        in user context, higher priority tasks can "cut in": spinlocks
			
 
				-        were changed to disable preemption, even on UP.
			
 
				-     </para>
			
 
				-    </glossdef>
			
 
				-   </glossentry>
			
 
				-
			
 
				-   <glossentry id="gloss-bh">
			
 
				-    <glossterm>bh</glossterm>
			
 
				-     <glossdef>
			
 
				-      <para>
			
 
				-        Bottom Half: for historical reasons, functions with
			
 
				-        '_bh' in them often now refer to any software interrupt, e.g.
			
 
				-        <function>spin_lock_bh()</function> blocks any software interrupt 
			
 
				-        on the current CPU.  Bottom halves are deprecated, and will 
			
 
				-        eventually be replaced by tasklets.  Only one bottom half will be 
			
 
				-        running at any time.
			
 
				-     </para>
			
 
				-    </glossdef>
			
 
				-   </glossentry>
			
 
				-
			
 
				-   <glossentry id="gloss-hwinterrupt">
			
 
				-    <glossterm>Hardware Interrupt / Hardware IRQ</glossterm>
			
 
				-    <glossdef>
			
 
				-     <para>
			
 
				-       Hardware interrupt request.  <function>in_irq()</function> returns 
			
 
				-       <returnvalue>true</returnvalue> in a hardware interrupt handler.
			
 
				-     </para>
			
 
				-    </glossdef>
			
 
				-   </glossentry>
			
 
				-
			
 
				-   <glossentry id="gloss-interruptcontext">
			
 
				-    <glossterm>Interrupt Context</glossterm>
			
 
				-    <glossdef>
			
 
				-     <para>
			
 
				-       Not user context: processing a hardware irq or software irq.
			
 
				-       Indicated by the <function>in_interrupt()</function> macro 
			
 
				-       returning <returnvalue>true</returnvalue>.
			
 
				-     </para>
			
 
				-    </glossdef>
			
 
				-   </glossentry>
			
 
				-
			
 
				-   <glossentry id="gloss-smp">
			
 
				-    <glossterm><acronym>SMP</acronym></glossterm>
			
 
				-    <glossdef>
			
 
				-     <para>
			
 
				-       Symmetric Multi-Processor: kernels compiled for multiple-CPU
			
 
				-       machines.  (CONFIG_SMP=y).
			
 
				-     </para>
			
 
				-    </glossdef>
			
 
				-   </glossentry>
			
 
				-
			
 
				-   <glossentry id="gloss-softirq">
			
 
				-    <glossterm>Software Interrupt / softirq</glossterm>
			
 
				-    <glossdef>
			
 
				-     <para>
			
 
				-       Software interrupt handler.  <function>in_irq()</function> returns
			
 
				-       <returnvalue>false</returnvalue>; <function>in_softirq()</function>
			
 
				-       returns <returnvalue>true</returnvalue>.  Tasklets and softirqs
			
 
				-	both fall into the category of 'software interrupts'.
			
 
				-     </para>
			
 
				-     <para>
			
 
				-       Strictly speaking a softirq is one of up to 32 enumerated software
			
 
				-       interrupts which can run on multiple CPUs at once.
			
 
				-       Sometimes used to refer to tasklets as
			
 
				-       well (ie. all software interrupts).
			
 
				-     </para>
			
 
				-    </glossdef>
			
 
				-   </glossentry>
			
 
				-
			
 
				-   <glossentry id="gloss-tasklet">
			
 
				-    <glossterm>tasklet</glossterm>
			
 
				-    <glossdef>
			
 
				-     <para>
			
 
				-       A dynamically-registrable software interrupt,
			
 
				-       which is guaranteed to only run on one CPU at a time.
			
 
				-     </para>
			
 
				-    </glossdef>
			
 
				-   </glossentry>
			
 
				-
			
 
				-   <glossentry id="gloss-timers">
			
 
				-    <glossterm>timer</glossterm>
			
 
				-    <glossdef>
			
 
				-     <para>
			
 
				-       A dynamically-registrable software interrupt, which is run at
			
 
				-       (or close to) a given time.  When running, it is just like a
			
 
				-       tasklet (in fact, they are called from the TIMER_SOFTIRQ).
			
 
				-     </para>
			
 
				-    </glossdef>
			
 
				-   </glossentry>
			
 
				-
			
 
				-   <glossentry id="gloss-up">
			
 
				-    <glossterm><acronym>UP</acronym></glossterm>
			
 
				-    <glossdef>
			
 
				-     <para>
			
 
				-       Uni-Processor: Non-SMP.  (CONFIG_SMP=n).
			
 
				-     </para>
			
 
				-    </glossdef>
			
 
				-   </glossentry>
			
 
				-
			
 
				-   <glossentry id="gloss-usercontext">
			
 
				-    <glossterm>User Context</glossterm>
			
 
				-    <glossdef>
			
 
				-     <para>
			
 
				-       The kernel executing on behalf of a particular process (ie. a
			
 
				-       system call or trap) or kernel thread.  You can tell which
			
 
				-       process with the <symbol>current</symbol> macro.  Not to
			
 
				-       be confused with userspace.  Can be interrupted by software or
			
 
				-       hardware interrupts.
			
 
				-     </para>
			
 
				-    </glossdef>
			
 
				-   </glossentry>
			
 
				-
			
 
				-   <glossentry id="gloss-userspace">
			
 
				-    <glossterm>Userspace</glossterm>
			
 
				-    <glossdef>
			
 
				-     <para>
			
 
				-       A process executing its own code outside the kernel.
			
 
				-     </para>
			
 
				-    </glossdef>
			
 
				-   </glossentry>      
			
 
				-
			
 
				-  </glossary>
			
 
				-</book>
			
 
				-
			
--- a/Documentation/DocBook/kgdb.tmpl
+++ b/Documentation/DocBook/kgdb.tmpl
@@ -1,918 +0,0 @@
 
				-<?xml version="1.0" encoding="UTF-8"?>
			
 
				-<!DOCTYPE book PUBLIC "-//OASIS//DTD DocBook XML V4.1.2//EN"
			
 
				-	"http://www.oasis-open.org/docbook/xml/4.1.2/docbookx.dtd" []>
			
 
				-
			
 
				-<book id="kgdbOnLinux">
			
 
				- <bookinfo>
			
 
				-  <title>Using kgdb, kdb and the kernel debugger internals</title>
			
 
				-
			
 
				-  <authorgroup>
			
 
				-   <author>
			
 
				-    <firstname>Jason</firstname>
			
 
				-    <surname>Wessel</surname>
			
 
				-    <affiliation>
			
 
				-     <address>
			
 
				-      <email>jason.wessel@windriver.com</email>
			
 
				-     </address>
			
 
				-    </affiliation>
			
 
				-   </author>
			
 
				-  </authorgroup>
			
 
				-  <copyright>
			
 
				-   <year>2008,2010</year>
			
 
				-   <holder>Wind River Systems, Inc.</holder>
			
 
				-  </copyright>
			
 
				-  <copyright>
			
 
				-   <year>2004-2005</year>
			
 
				-   <holder>MontaVista Software, Inc.</holder>
			
 
				-  </copyright>
			
 
				-  <copyright>
			
 
				-   <year>2004</year>
			
 
				-   <holder>Amit S. Kale</holder>
			
 
				-  </copyright>
			
 
				-
			
 
				-  <legalnotice>
			
 
				-   <para>
			
 
				-   This file is licensed under the terms of the GNU General Public License
			
 
				-   version 2. This program is licensed "as is" without any warranty of any
			
 
				-   kind, whether express or implied.
			
 
				-   </para>
			
 
				-
			
 
				-  </legalnotice>
			
 
				- </bookinfo>
			
 
				-
			
 
				-<toc></toc>
			
 
				-  <chapter id="Introduction">
			
 
				-    <title>Introduction</title>
			
 
				-    <para>
			
 
				-    The kernel has two different debugger front ends (kdb and kgdb)
			
 
				-    which interface to the debug core.  It is possible to use either
			
 
				-    of the debugger front ends and dynamically transition between them
			
 
				-    if you configure the kernel properly at compile and runtime.
			
 
				-    </para>
			
 
				-    <para>
			
 
				-    Kdb is a simplistic shell-style interface which you can use on a
			
 
				-    system console with a keyboard or serial console.  You can use it
			
 
				-    to inspect memory, registers, process lists, dmesg, and even set
			
 
				-    breakpoints to stop in a certain location.  Kdb is not a source
			
 
				-    level debugger, although you can set breakpoints and execute some
			
 
				-    basic kernel run control.  Kdb is mainly aimed at doing some
			
 
				-    analysis to aid in development or diagnosing kernel problems.  You
			
 
				-    can access some symbols by name in kernel built-ins or in kernel
			
 
				-    modules if the code was built
			
 
				-    with <symbol>CONFIG_KALLSYMS</symbol>.
			
 
				-    </para>
			
 
				-    <para>
			
 
				-    Kgdb is intended to be used as a source level debugger for the
			
 
				-    Linux kernel. It is used along with gdb to debug a Linux kernel.
			
 
				-    The expectation is that gdb can be used to "break in" to the
			
 
				-    kernel to inspect memory, variables and look through call stack
			
 
				-    information similar to the way an application developer would use
			
 
				-    gdb to debug an application.  It is possible to place breakpoints
			
 
				-    in kernel code and perform some limited execution stepping.
			
 
				-    </para>
			
 
				-    <para>
			
 
				-    Two machines are required for using kgdb. One of these machines is
			
 
				-    a development machine and the other is the target machine.  The
			
 
				-    kernel to be debugged runs on the target machine. The development
			
 
				-    machine runs an instance of gdb against the vmlinux file which
			
 
				-    contains the symbols (not a boot image such as bzImage, zImage,
			
 
				-    uImage...).  In gdb the developer specifies the connection
			
 
				-    parameters and connects to kgdb.  The type of connection a
			
 
				-    developer makes with gdb depends on the availability of kgdb I/O
			
 
				-    modules compiled as built-ins or loadable kernel modules in the test
			
 
				-    machine's kernel.
			
 
				-    </para>
			
 
				-  </chapter>
			
 
				-  <chapter id="CompilingAKernel">
			
 
				-  <title>Compiling a kernel</title>
			
 
				-  <para>
			
 
				-  <itemizedlist>
			
 
				-  <listitem><para>In order to enable compilation of kdb, you must first enable kgdb.</para></listitem>
			
 
				-  <listitem><para>The kgdb test compile options are described in the kgdb test suite chapter.</para></listitem>
			
 
				-  </itemizedlist>
			
 
				-  </para>
			
 
				-  <sect1 id="CompileKGDB">
			
 
				-    <title>Kernel config options for kgdb</title>
			
 
				-    <para>
			
 
				-    To enable <symbol>CONFIG_KGDB</symbol> you should look under
			
 
				-    "Kernel hacking" / "Kernel debugging" and select "KGDB: kernel debugger".
			
 
				-    </para>
			
 
				-    <para>
			
 
				-    While it is not a hard requirement that you have symbols in your
			
 
				-    vmlinux file, gdb tends not to be very useful without the symbolic
			
 
				-    data, so you will want to turn
			
 
				-    on <symbol>CONFIG_DEBUG_INFO</symbol> which is called "Compile the
			
 
				-    kernel with debug info" in the config menu.
			
 
				-    </para>
			
 
				-    <para>
			
 
				-    It is advised, but not required, that you turn on the
			
 
				-    <symbol>CONFIG_FRAME_POINTER</symbol> kernel option which is called "Compile the
			
 
				-    kernel with frame pointers" in the config menu.  This option
			
 
				-    inserts code into the compiled executable which saves the frame
			
 
				-    information in registers or on the stack at different points which
			
 
				-    allows a debugger such as gdb to more accurately construct
			
 
				-    stack back traces while debugging the kernel.
			
 
				-    </para>
			
 
				-    <para>
			
 
				-    If the architecture that you are using supports the kernel option
			
 
				-    CONFIG_STRICT_KERNEL_RWX, you should consider turning it off.  This
			
 
				-    option will prevent the use of software breakpoints because it
			
 
				-    marks certain regions of the kernel's memory space as read-only.
			
 
				-    If kgdb supports it for the architecture you are using, you can
			
 
				-    use hardware breakpoints if you desire to run with the
			
 
				-    CONFIG_STRICT_KERNEL_RWX option turned on, else you need to turn off
			
 
				-    this option.
			
 
				-    </para>
			
 
				-    <para>
			
 
				-    Next you should choose one or more I/O drivers to interconnect
			
 
				-    debugging host and debugged target.  Early boot debugging requires
			
 
				-    a KGDB I/O driver that supports early debugging and the driver
			
 
				-    must be built into the kernel directly. Kgdb I/O driver
			
 
				-    configuration takes place via kernel or module parameters which
			
 
				-    you can learn more about in the section that describes the
			
 
				-    parameter "kgdboc".
			
 
				-    </para>
			
 
				-    <para>Here is an example set of .config symbols to enable or
			
 
				-    disable for kgdb:
			
 
				-    <itemizedlist>
			
 
				-    <listitem><para># CONFIG_STRICT_KERNEL_RWX is not set</para></listitem>
			
 
				-    <listitem><para>CONFIG_FRAME_POINTER=y</para></listitem>
			
 
				-    <listitem><para>CONFIG_KGDB=y</para></listitem>
			
 
				-    <listitem><para>CONFIG_KGDB_SERIAL_CONSOLE=y</para></listitem>
			
 
				-    </itemizedlist>
			
 
				-    </para>
			
 
				-  </sect1>
			
 
				-  <sect1 id="CompileKDB">
			
 
				-    <title>Kernel config options for kdb</title>
			
 
				-    <para>Kdb is quite a bit more complex than the simple gdbstub
			
 
				-    sitting on top of the kernel's debug core.  Kdb must implement a
			
 
				-    shell, and also adds some helper functions in other parts of the
			
 
				-    kernel, responsible for printing out interesting data such as what
			
 
				-    you would see if you ran "lsmod", or "ps".  In order to build kdb
			
 
				-    into the kernel you follow the same steps as you would for kgdb.
			
 
				-    </para>
			
 
				-    <para>The main config option for kdb
			
 
				-    is <symbol>CONFIG_KGDB_KDB</symbol> which is called "KGDB_KDB:
			
 
				-    include kdb frontend for kgdb" in the config menu.  In theory you
			
 
				-    would have already also selected an I/O driver such as the
			
 
				-    CONFIG_KGDB_SERIAL_CONSOLE interface if you plan on using kdb on a
			
 
				-    serial port, when you were configuring kgdb.
			
 
				-    </para>
			
 
				-    <para>If you want to use a PS/2-style keyboard with kdb, you would
			
 
				-    select CONFIG_KDB_KEYBOARD which is called "KGDB_KDB: keyboard as
			
 
				-    input device" in the config menu.  The CONFIG_KDB_KEYBOARD option
			
 
				-    is not used for anything in the gdb interface to kgdb.  The
			
 
				-    CONFIG_KDB_KEYBOARD option only works with kdb.
			
 
				-    </para>
			
 
				-    <para>Here is an example set of .config symbols to enable/disable kdb:
			
 
				-    <itemizedlist>
			
 
				-    <listitem><para># CONFIG_STRICT_KERNEL_RWX is not set</para></listitem>
			
 
				-    <listitem><para>CONFIG_FRAME_POINTER=y</para></listitem>
			
 
				-    <listitem><para>CONFIG_KGDB=y</para></listitem>
			
 
				-    <listitem><para>CONFIG_KGDB_SERIAL_CONSOLE=y</para></listitem>
			
 
				-    <listitem><para>CONFIG_KGDB_KDB=y</para></listitem>
			
 
				-    <listitem><para>CONFIG_KDB_KEYBOARD=y</para></listitem>
			
 
				-    </itemizedlist>
			
 
				-    </para>
			
 
				-  </sect1>
			
 
				-  </chapter>
			
 
				-  <chapter id="kgdbKernelArgs">
			
 
				-  <title>Kernel Debugger Boot Arguments</title>
			
 
				-  <para>This section describes the various runtime kernel
			
 
				-  parameters that affect the configuration of the kernel debugger.
			
 
				-  The following chapter covers using kdb and kgdb as well as
			
 
				-  providing some examples of the configuration parameters.</para>
			
 
				-   <sect1 id="kgdboc">
			
 
				-   <title>Kernel parameter: kgdboc</title>
			
 
				-   <para>The kgdboc driver was originally an abbreviation meant to
			
 
				-   stand for "kgdb over console".  Today it is the primary mechanism
			
 
				-   to configure how to communicate from gdb to kgdb as well as the
			
 
				-   devices you want to use to interact with the kdb shell.
			
 
				-   </para>
			
 
				-   <para>For kgdb/gdb, kgdboc is designed to work with a single serial
			
 
				-   port. It is intended to cover the circumstance where you want to
			
 
				-   use a serial console as your primary console as well as using it to
			
 
				-   perform kernel debugging.  It is also possible to use kgdb on a
			
 
				-   serial port which is not designated as a system console.  Kgdboc
			
 
				-   may be configured as a kernel built-in or a kernel loadable module.
			
 
				-   You can only make use of <constant>kgdbwait</constant> and early
			
 
				-   debugging if you build kgdboc into the kernel as a built-in.
			
 
				-   </para>
			
 
				-   <para>Optionally you can elect to activate kms (Kernel Mode
			
 
				-   Setting) integration.  When you use kms with kgdboc and you have a
			
 
				-   video driver that has atomic mode setting hooks, it is possible to
			
 
				-   enter the debugger on the graphics console.  When the kernel
			
 
				-   execution is resumed, the previous graphics mode will be restored.
			
 
				-   This integration can serve as a useful tool to aid in diagnosing
			
 
				-   crashes or doing analysis of memory with kdb while allowing the
			
 
				-   full graphics console applications to run.
			
 
				-   </para>
			
 
				-   <sect2 id="kgdbocArgs">
			
 
				-   <title>kgdboc arguments</title>
			
 
				-   <para>Usage: <constant>kgdboc=[kms][[,]kbd][[,]serial_device][,baud]</constant></para>
			
 
				-   <para>The order listed above must be observed if you use any of the
			
 
				-   optional configurations together.
			
 
				-   </para>
			
 
				-   <para>Abbreviations:
			
 
				-   <itemizedlist>
			
 
				-   <listitem><para>kms = Kernel Mode Setting</para></listitem>
			
 
				-   <listitem><para>kbd = Keyboard</para></listitem>
			
 
				-   </itemizedlist>
			
 
				-   </para>
			
 
				-   <para>You can configure kgdboc to use the keyboard, and/or a serial
			
 
				-   device depending on if you are using kdb and/or kgdb, in one of the
			
 
				-   following scenarios.  The order listed above must be observed if
			
 
				-   you use any of the optional configurations together.  Using kms +
			
 
				-   only gdb is generally not a useful combination.</para>
			
 
				-   <sect3 id="kgdbocArgs1">
			
 
				-   <title>Using loadable module or built-in</title>
			
 
				-   <para>
			
 
				-   <orderedlist>
			
 
				-   <listitem><para>As a kernel built-in:</para>
			
 
				-   <para>Use the kernel boot argument: <constant>kgdboc=&lt;tty-device&gt;,[baud]</constant></para></listitem>
			
 
				-   <listitem>
			
 
				-   <para>As a kernel loadable module:</para>
			
 
				-   <para>Use the command: <constant>modprobe kgdboc kgdboc=&lt;tty-device&gt;,[baud]</constant></para>
			
 
				-   <para>Here are two examples of how you might format the kgdboc
			
 
				-   string. The first is for an x86 target using the first serial port.
			
 
				-   The second example is for the ARM Versatile AB using the second
			
 
				-   serial port.
			
 
				-   <orderedlist>
			
 
				-   <listitem><para><constant>kgdboc=ttyS0,115200</constant></para></listitem>
			
 
				-   <listitem><para><constant>kgdboc=ttyAMA1,115200</constant></para></listitem>
			
 
				-   </orderedlist>
			
 
				-   </para>
			
 
				-   </listitem>
			
 
				-   </orderedlist></para>
			
 
				-   </sect3>
			
 
				-   <sect3 id="kgdbocArgs2">
			
 
				-   <title>Configure kgdboc at runtime with sysfs</title>
			
 
				-   <para>At run time you can enable or disable kgdboc by echoing
			
 
				-   parameters into the sysfs.  Here are two examples:</para>
			
 
				-   <orderedlist>
			
 
				-   <listitem><para>Enable kgdboc on ttyS0</para>
			
 
				-   <para><constant>echo ttyS0 &gt; /sys/module/kgdboc/parameters/kgdboc</constant></para></listitem>
			
 
				-   <listitem><para>Disable kgdboc</para>
			
 
				-   <para><constant>echo "" &gt; /sys/module/kgdboc/parameters/kgdboc</constant></para></listitem>
			
 
				-   </orderedlist>
			
 
				-   <para>NOTE: You do not need to specify the baud if you are
			
 
				-   configuring the console on tty which is already configured or
			
 
				-   open.</para>
			
 
				-   </sect3>
			
 
				-   <sect3 id="kgdbocArgs3">
			
 
				-   <title>More examples</title>
			
 
				-   <para>You can configure kgdboc to use the keyboard, and/or a serial device
			
 
				-   depending on if you are using kdb and/or kgdb, in one of the
			
 
				-   following scenarios.
			
 
				-   <orderedlist>
			
 
				-   <listitem><para>kdb and kgdb over only a serial port</para>
			
 
				-   <para><constant>kgdboc=&lt;serial_device&gt;[,baud]</constant></para>
			
 
				-   <para>Example: <constant>kgdboc=ttyS0,115200</constant></para>
			
 
				-   </listitem>
			
 
				-   <listitem><para>kdb and kgdb with keyboard and a serial port</para>
			
 
				-   <para><constant>kgdboc=kbd,&lt;serial_device&gt;[,baud]</constant></para>
			
 
				-   <para>Example: <constant>kgdboc=kbd,ttyS0,115200</constant></para>
			
 
				-   </listitem>
			
 
				-   <listitem><para>kdb with a keyboard</para>
			
 
				-   <para><constant>kgdboc=kbd</constant></para>
			
 
				-   </listitem>
			
 
				-   <listitem><para>kdb with kernel mode setting</para>
			
 
				-   <para><constant>kgdboc=kms,kbd</constant></para>
			
 
				-   </listitem>
			
 
				-   <listitem><para>kdb with kernel mode setting and kgdb over a serial port</para>
			
 
				-   <para><constant>kgdboc=kms,kbd,ttyS0,115200</constant></para>
			
 
				-   </listitem>
			
 
				-   </orderedlist>
			
 
				-   </para>
			
 
				-   <para>NOTE: Kgdboc does not support interrupting the target via the
			
 
				-   gdb remote protocol.  You must manually send a sysrq-g unless you
			
 
				-   have a proxy that splits console output to a terminal program.
			
 
				-   A console proxy has a separate TCP port for the debugger and a separate
			
 
				-   TCP port for the "human" console.  The proxy can take care of sending
			
 
				-   the sysrq-g for you.
			
 
				-   </para>
			
 
				-   <para>When using kgdboc with no debugger proxy, you can end up
			
 
				-    connecting the debugger at one of two entry points.  If an
			
 
				-    exception occurs after you have loaded kgdboc, a message should
			
 
				-    print on the console stating it is waiting for the debugger.  In
			
 
				-    this case you disconnect your terminal program and then connect the
			
 
				-    debugger in its place.  If you want to interrupt the target system
			
 
				-    and forcibly enter a debug session you have to issue a Sysrq
			
 
				-    sequence and then type the letter <constant>g</constant>.  Then
			
 
				-    you disconnect the terminal session and connect gdb.  Your options
			
 
				-    if you don't like this are to hack gdb to send the sysrq-g for you
			
 
				-    as well as on the initial connect, or to use a debugger proxy that
			
 
				-    allows an unmodified gdb to do the debugging.
			
 
				-   </para>
			
 
				-   </sect3>
			
 
				-   </sect2>
			
 
				-   </sect1>
			
 
				-   <sect1 id="kgdbwait">
			
 
				-   <title>Kernel parameter: kgdbwait</title>
			
 
				-   <para>
			
 
				-   The Kernel command line option <constant>kgdbwait</constant> makes
			
 
				-   kgdb wait for a debugger connection during booting of a kernel.  You
			
 
				-   can only use this option if you compiled a kgdb I/O driver into the
			
 
				-   kernel and you specified the I/O driver configuration as a kernel
			
 
				-   command line option.  The kgdbwait parameter should always follow the
			
 
				-   configuration parameter for the kgdb I/O driver in the kernel
			
 
				-   command line else the I/O driver will not be configured prior to
			
 
				-   asking the kernel to use it to wait.
			
 
				-   </para>
			
 
				-   <para>
			
 
				-   The kernel will stop and wait as early as the I/O driver and
			
 
				-   architecture allow when you use this option.  If you build the
			
 
				-   kgdb I/O driver as a loadable kernel module kgdbwait will not do
			
 
				-   anything.
			
 
				-   </para>
			
 
				-   </sect1>
			
 
				-   <sect1 id="kgdbcon">
			
 
				-   <title>Kernel parameter: kgdbcon</title>
			
 
				-   <para> The kgdbcon feature allows you to see printk() messages
			
 
				-   inside gdb while gdb is connected to the kernel.  Kdb does not make
			
 
				-    use of the kgdbcon feature.
			
 
				-   </para>
			
 
				-   <para>Kgdb supports using the gdb serial protocol to send console
			
 
				-   messages to the debugger when the debugger is connected and running.
			
 
				-   There are two ways to activate this feature.
			
 
				-   <orderedlist>
			
 
				-   <listitem><para>Activate with the kernel command line option:</para>
			
 
				-   <para><constant>kgdbcon</constant></para>
			
 
				-   </listitem>
			
 
				-   <listitem><para>Use sysfs before configuring an I/O driver</para>
			
 
				-   <para>
			
 
				-   <constant>echo 1 &gt; /sys/module/kgdb/parameters/kgdb_use_con</constant>
			
 
				-   </para>
			
 
				-   <para>
			
 
				-   NOTE: If you do this after you configure the kgdb I/O driver, the
			
 
				-   setting will not take effect until the next point the I/O is
			
 
				-   reconfigured.
			
 
				-   </para>
			
 
				-   </listitem>
			
 
				-   </orderedlist>
			
 
				-  </para>
			
 
				-   <para>IMPORTANT NOTE: You cannot use kgdboc + kgdbcon on a tty that is an
			
 
				-   active system console.  An example of incorrect usage is <constant>console=ttyS0,115200 kgdboc=ttyS0 kgdbcon</constant>
			
 
				-   </para>
			
 
				-   <para>It is possible to use this option with kgdboc on a tty that is not a system console.
			
 
				-   </para>
			
 
				-  </sect1>
			
 
				-   <sect1 id="kgdbreboot">
			
 
				-   <title>Run time parameter: kgdbreboot</title>
			
 
				-   <para> The kgdbreboot feature allows you to change how the debugger
			
 
				-   deals with the reboot notification.  You have 3 choices for the
			
 
				-   behavior.  The default behavior is always set to 0.</para>
			
 
				-   <orderedlist>
			
 
				-   <listitem><para>echo -1 > /sys/module/debug_core/parameters/kgdbreboot</para>
			
 
				-   <para>Ignore the reboot notification entirely.</para>
			
 
				-   </listitem>
			
 
				-   <listitem><para>echo 0 > /sys/module/debug_core/parameters/kgdbreboot</para>
			
 
				-   <para>Send the detach message to any attached debugger client.</para>
			
 
				-   </listitem>
			
 
				-   <listitem><para>echo 1 > /sys/module/debug_core/parameters/kgdbreboot</para>
			
 
				-   <para>Enter the debugger on reboot notify.</para>
			
 
				-   </listitem>
			
 
				-   </orderedlist>
			
 
				-  </sect1>
			
 
				-  </chapter>
			
 
				-  <chapter id="usingKDB">
			
 
				-  <title>Using kdb</title>
			
 
				-  <para>
			
 
				-  </para>
			
 
				-  <sect1 id="quickKDBserial">
			
 
				-  <title>Quick start for kdb on a serial port</title>
			
 
				-  <para>This is a quick example of how to use kdb.</para>
			
 
				-  <para><orderedlist>
			
 
				-  <listitem><para>Configure kgdboc at boot using kernel parameters:
			
 
				-  <itemizedlist>
			
 
				-  <listitem><para><constant>console=ttyS0,115200 kgdboc=ttyS0,115200</constant></para></listitem>
			
 
				-  </itemizedlist></para>
			
 
				-  <para>OR</para>
			
 
				-  <para>Configure kgdboc after the kernel has booted; assuming you are using a serial port console:
			
 
				-  <itemizedlist>
			
 
				-  <listitem><para><constant>echo ttyS0 &gt; /sys/module/kgdboc/parameters/kgdboc</constant></para></listitem>
			
 
				-  </itemizedlist>
			
 
				-  </para>
			
 
				-  </listitem>
			
 
				-  <listitem><para>Enter the kernel debugger manually or by waiting for an oops or fault.  There are several ways you can enter the kernel debugger manually; all involve using the sysrq-g, which means you must have enabled CONFIG_MAGIC_SYSRQ=y in your kernel config.</para>
			
 
				-  <itemizedlist>
			
 
				-  <listitem><para>When logged in as root or with a super user session you can run:</para>
			
 
				-   <para><constant>echo g &gt; /proc/sysrq-trigger</constant></para></listitem>
			
 
				-  <listitem><para>Example using minicom 2.2</para>
			
 
				-  <para>Press: <constant>Control-a</constant></para>
			
 
				-  <para>Press: <constant>f</constant></para>
			
 
				-  <para>Press: <constant>g</constant></para>
			
 
				-  </listitem>
			
 
				-  <listitem><para>When you have telneted to a terminal server that supports sending a remote break</para>
			
 
				-  <para>Press: <constant>Control-]</constant></para>
			
 
				-  <para>Type in: <constant>send break</constant></para>
			
 
				-  <para>Press: <constant>Enter</constant></para>
			
 
				-  <para>Press: <constant>g</constant></para>
			
 
				-  </listitem>
			
 
				-  </itemizedlist>
			
 
				-  </listitem>
			
 
				-  <listitem><para>From the kdb prompt you can run the "help" command to see a complete list of the commands that are available.</para>
			
 
				-  <para>Some useful commands in kdb include:
			
 
				-  <itemizedlist>
			
 
				-  <listitem><para>lsmod  -- Shows where kernel modules are loaded</para></listitem>
			
 
				-  <listitem><para>ps -- Displays only the active processes</para></listitem>
			
 
				-  <listitem><para>ps A -- Shows all the processes</para></listitem>
			
 
				-  <listitem><para>summary -- Shows kernel version info and memory usage</para></listitem>
			
 
				-  <listitem><para>bt -- Get a backtrace of the current process using dump_stack()</para></listitem>
			
 
				-  <listitem><para>dmesg -- View the kernel syslog buffer</para></listitem>
			
 
				-  <listitem><para>go -- Continue the system</para></listitem>
			
 
				-  </itemizedlist>
			
 
				-  </para>
			
 
				-  </listitem>
			
 
				-  <listitem>
			
 
				-  <para>When you are done using kdb you need to consider rebooting the
			
 
				-  system or using the "go" command to resume normal kernel
			
 
				-  execution.  If you have paused the kernel for a lengthy period of
			
 
				-  time, applications that rely on timely networking or anything to do
			
 
				-  with real wall clock time could be adversely affected, so you
			
 
				-  should take this into consideration when using the kernel
			
 
				-  debugger.</para>
			
 
				-  </listitem>
			
 
				-  </orderedlist></para>
			
 
				-  </sect1>
			
 
				-  <sect1 id="quickKDBkeyboard">
			
 
				-  <title>Quick start for kdb using a keyboard connected console</title>
			
 
				-  <para>This is a quick example of how to use kdb with a keyboard.</para>
			
 
				-  <para><orderedlist>
			
 
				-  <listitem><para>Configure kgdboc at boot using kernel parameters:
			
 
				-  <itemizedlist>
			
 
				-  <listitem><para><constant>kgdboc=kbd</constant></para></listitem>
			
 
				-  </itemizedlist></para>
			
 
				-  <para>OR</para>
			
 
				-  <para>Configure kgdboc after the kernel has booted:
			
 
				-  <itemizedlist>
			
 
				-  <listitem><para><constant>echo kbd &gt; /sys/module/kgdboc/parameters/kgdboc</constant></para></listitem>
			
 
				-  </itemizedlist>
			
 
				-  </para>
			
 
				-  </listitem>
			
 
				-  <listitem><para>Enter the kernel debugger manually or by waiting for an oops or fault.  There are several ways you can enter the kernel debugger manually; all involve using the sysrq-g, which means you must have enabled CONFIG_MAGIC_SYSRQ=y in your kernel config.</para>
			
 
				-  <itemizedlist>
			
 
				-  <listitem><para>When logged in as root or with a super user session you can run:</para>
			
 
				-   <para><constant>echo g &gt; /proc/sysrq-trigger</constant></para></listitem>
			
 
				-  <listitem><para>Example using a laptop keyboard</para>
			
 
				-  <para>Press and hold down: <constant>Alt</constant></para>
			
 
				-  <para>Press and hold down: <constant>Fn</constant></para>
			
 
				-  <para>Press and release the key with the label: <constant>SysRq</constant></para>
			
 
				-  <para>Release: <constant>Fn</constant></para>
			
 
				-  <para>Press and release: <constant>g</constant></para>
			
 
				-  <para>Release: <constant>Alt</constant></para>
			
 
				-  </listitem>
			
 
				-  <listitem><para>Example using a PS/2 101-key keyboard</para>
			
 
				-  <para>Press and hold down: <constant>Alt</constant></para>
			
 
				-  <para>Press and release the key with the label: <constant>SysRq</constant></para>
			
 
				-  <para>Press and release: <constant>g</constant></para>
			
 
				-  <para>Release: <constant>Alt</constant></para>
			
 
				-  </listitem>
			
 
				-  </itemizedlist>
			
 
				-  </listitem>
			
 
				-  <listitem>
			
 
				-  <para>Now type in a kdb command such as "help", "dmesg", "bt" or "go" to continue kernel execution.</para>
			
 
				-  </listitem>
			
 
				-  </orderedlist></para>
			
 
				-  </sect1>
			
 
				-  </chapter>
			
 
				-  <chapter id="EnableKGDB">
			
 
				-   <title>Using kgdb / gdb</title>
			
 
				-   <para>In order to use kgdb you must activate it by passing
			
 
				-   configuration information to one of the kgdb I/O drivers.  If you
			
 
				-   do not pass any configuration information kgdb will not do anything
			
 
				-   at all.  Kgdb will only actively hook up to the kernel trap hooks
			
 
				-   if a kgdb I/O driver is loaded and configured.  If you unconfigure
			
 
				-   a kgdb I/O driver, kgdb will unregister all the kernel hook points.
			
 
				-   </para>
			
 
				-   <para> All kgdb I/O drivers can be reconfigured at run time, if
			
 
				-   <symbol>CONFIG_SYSFS</symbol> and <symbol>CONFIG_MODULES</symbol>
			
 
				-   are enabled, by echo'ing a new config string to
			
 
				-   <constant>/sys/module/&lt;driver&gt;/parameter/&lt;option&gt;</constant>.
			
 
				-   The driver can be unconfigured by passing an empty string.  You cannot
			
 
				-   change the configuration while the debugger is attached.  Make sure
			
 
				-   to detach the debugger with the <constant>detach</constant> command
			
 
				-   prior to trying to unconfigure a kgdb I/O driver.
			
 
				-   </para>
			
 
				-  <sect1 id="ConnectingGDB">
			
 
				-  <title>Connecting with gdb to a serial port</title>
			
 
				-  <orderedlist>
			
 
				-  <listitem><para>Configure kgdboc</para>
			
 
				-   <para>Configure kgdboc at boot using kernel parameters:
			
 
				-   <itemizedlist>
			
 
				-    <listitem><para><constant>kgdboc=ttyS0,115200</constant></para></listitem>
			
 
				-   </itemizedlist></para>
			
 
				-   <para>OR</para>
			
 
				-   <para>Configure kgdboc after the kernel has booted:
			
 
				-   <itemizedlist>
			
 
				-    <listitem><para><constant>echo ttyS0 &gt; /sys/module/kgdboc/parameters/kgdboc</constant></para></listitem>
			
 
				-   </itemizedlist></para>
			
 
				-  </listitem>
			
 
				-  <listitem>
			
 
				-  <para>Stop kernel execution (break into the debugger)</para>
			
 
				-  <para>In order to connect to gdb via kgdboc, the kernel must
			
 
				-  first be stopped.  There are several ways to stop the kernel which
			
 
				-  include using kgdbwait as a boot argument, via a sysrq-g, or running
			
 
				-  the kernel until it takes an exception where it waits for the
			
 
				-  debugger to attach.
			
 
				-  <itemizedlist>
			
 
				-  <listitem><para>When logged in as root or with a super user session you can run:</para>
			
 
				-   <para><constant>echo g &gt; /proc/sysrq-trigger</constant></para></listitem>
			
 
				-  <listitem><para>Example using minicom 2.2</para>
			
 
				-  <para>Press: <constant>Control-a</constant></para>
			
 
				-  <para>Press: <constant>f</constant></para>
			
 
				-  <para>Press: <constant>g</constant></para>
			
 
				-  </listitem>
			
 
				-  <listitem><para>When you have telneted to a terminal server that supports sending a remote break</para>
			
 
				-  <para>Press: <constant>Control-]</constant></para>
			
 
				-  <para>Type in:<constant>send break</constant></para>
			
 
				-  <para>Press: <constant>Enter</constant></para>
			
 
				-  <para>Press: <constant>g</constant></para>
			
 
				-  </listitem>
			
 
				-  </itemizedlist>
			
 
				-  </para>
			
 
				-  </listitem>
			
 
				-  <listitem>
			
 
				-    <para>Connect from gdb</para>
			
 
				-    <para>
			
 
				-    Example (using a directly connected port):
			
 
				-    </para>
			
 
				-    <programlisting>
			
 
				-    % gdb ./vmlinux
			
 
				-    (gdb) set remotebaud 115200
			
 
				-    (gdb) target remote /dev/ttyS0
			
 
				-    </programlisting>
			
 
				-    <para>
			
 
				-    Example (kgdb to a terminal server on TCP port 2012):
			
 
				-    </para>
			
 
				-    <programlisting>
			
 
				-    % gdb ./vmlinux
			
 
				-    (gdb) target remote 192.168.2.2:2012
			
 
				-    </programlisting>
			
 
				-    <para>
			
 
				-    Once connected, you can debug a kernel the way you would debug an
			
 
				-    application program.
			
 
				-    </para>
			
 
				-    <para>
			
 
				-    If you are having problems connecting or something is going
			
 
				-    seriously wrong while debugging, it will most often be the case
			
 
				-    that you want to enable gdb to be verbose about its target
			
 
				-    communications.  You do this prior to issuing the <constant>target
			
 
				-    remote</constant> command by typing in: <constant>set debug remote 1</constant>
			
 
				-    </para>
			
 
				-  </listitem>
			
 
				-  </orderedlist>
			
 
				-  <para>Remember if you continue in gdb, and need to "break in" again,
			
 
				-  you need to issue an other sysrq-g.  It is easy to create a simple
			
 
				-  entry point by putting a breakpoint at <constant>sys_sync</constant>
			
 
				-  and then you can run "sync" from a shell or script to break into the
			
 
				-  debugger.</para>
			
 
				-  </sect1>
			
 
				-  </chapter>
			
 
				-  <chapter id="switchKdbKgdb">
			
 
				-  <title>kgdb and kdb interoperability</title>
			
 
				-  <para>It is possible to transition between kdb and kgdb dynamically.
			
 
				-  The debug core will remember which you used the last time and
			
 
				-  automatically start in the same mode.</para>
			
 
				-  <sect1>
			
 
				-  <title>Switching between kdb and kgdb</title>
			
 
				-  <sect2>
			
 
				-  <title>Switching from kgdb to kdb</title>
			
 
				-  <para>
			
 
				-  There are two ways to switch from kgdb to kdb: you can use gdb to
			
 
				-  issue a maintenance packet, or you can blindly type the command $3#33.
			
 
				-  Whenever the kernel debugger stops in kgdb mode it will print the
			
 
				-  message <constant>KGDB or $3#33 for KDB</constant>.  It is important
			
 
				-  to note that you have to type the sequence correctly in one pass.
			
 
				-  You cannot type a backspace or delete because kgdb will interpret
			
 
				-  that as part of the debug stream.
			
 
				-  <orderedlist>
			
 
				-  <listitem><para>Change from kgdb to kdb by blindly typing:</para>
			
 
				-  <para><constant>$3#33</constant></para></listitem>
			
 
				-  <listitem><para>Change from kgdb to kdb with gdb</para>
			
 
				-  <para><constant>maintenance packet 3</constant></para>
			
 
				-  <para>NOTE: Now you must kill gdb. Typically you press control-z and
			
 
				-  issue the command: kill -9 %</para></listitem>
			
 
				-  </orderedlist>
			
 
				-  </para>
			
 
				-  </sect2>
			
 
				-  <sect2>
			
 
				-  <title>Change from kdb to kgdb</title>
			
 
				-  <para>There are two ways you can change from kdb to kgdb.  You can
			
 
				-  manually enter kgdb mode by issuing the kgdb command from the kdb
			
 
				-  shell prompt, or you can connect gdb while the kdb shell prompt is
			
 
				-  active.  The kdb shell looks for the typical first commands that gdb
			
 
				-  would issue with the gdb remote protocol and if it sees one of those
			
 
				-  commands it automatically changes into kgdb mode.</para>
			
 
				-  <orderedlist>
			
 
				-  <listitem><para>From kdb issue the command:</para>
			
 
				-  <para><constant>kgdb</constant></para>
			
 
				-  <para>Now disconnect your terminal program and connect gdb in its place</para></listitem>
			
 
				-  <listitem><para>At the kdb prompt, disconnect the terminal program and connect gdb in its place.</para></listitem>
			
 
				-  </orderedlist>
			
 
				-  </sect2>
			
 
				-  </sect1>
			
 
				-  <sect1>
			
 
				-  <title>Running kdb commands from gdb</title>
			
 
				-  <para>It is possible to run a limited set of kdb commands from gdb,
			
 
				-  using the gdb monitor command.  You don't want to execute any of the
			
 
				-  run control or breakpoint operations, because it can disrupt the
			
 
				-  state of the kernel debugger.  You should be using gdb for
			
 
				-  breakpoints and run control operations if you have gdb connected.
			
 
				-  The more useful commands to run are things like lsmod, dmesg, ps or
			
 
				-  possibly some of the memory information commands.  To see all the kdb
			
 
				-  commands you can run <constant>monitor help</constant>.</para>
			
 
				-  <para>Example:
			
 
				-  <informalexample><programlisting>
			
 
				-(gdb) monitor ps
			
 
				-1 idle process (state I) and
			
 
				-27 sleeping system daemon (state M) processes suppressed,
			
 
				-use 'ps A' to see all.
			
 
				-Task Addr       Pid   Parent [*] cpu State Thread     Command
			
 
				-
			
 
				-0xc78291d0        1        0  0    0   S  0xc7829404  init
			
 
				-0xc7954150      942        1  0    0   S  0xc7954384  dropbear
			
 
				-0xc78789c0      944        1  0    0   S  0xc7878bf4  sh
			
 
				-(gdb)
			
 
				-  </programlisting></informalexample>
			
 
				-  </para>
			
 
				-  </sect1>
			
 
				-  </chapter>
			
 
				-  <chapter id="KGDBTestSuite">
			
 
				-    <title>kgdb Test Suite</title>
			
 
				-    <para>
			
 
				-    When kgdb is enabled in the kernel config you can also elect to
			
 
				-    enable the config parameter KGDB_TESTS.  Turning this on will
			
 
				-    enable a special kgdb I/O module which is designed to test the
			
 
				-    kgdb internal functions.
			
 
				-    </para>
			
 
				-    <para>
			
 
				-    The kgdb tests are mainly intended for developers to test the kgdb
			
 
				-    internals as well as a tool for developing a new kgdb architecture
			
 
				-    specific implementation.  These tests are not really for end users
			
 
				-    of the Linux kernel.  The primary source of documentation would be
			
 
				-    to look in the drivers/misc/kgdbts.c file.
			
 
				-    </para>
			
 
				-    <para>
			
 
				-    The kgdb test suite can also be configured at compile time to run
			
 
				-    the core set of tests by setting the kernel config parameter
			
 
				-    KGDB_TESTS_ON_BOOT.  This particular option is aimed at automated
			
 
				-    regression testing and does not require modifying the kernel boot
			
 
				-    config arguments.  If this is turned on, the kgdb test suite can
			
 
				-    be disabled by specifying "kgdbts=" as a kernel boot argument.
			
 
				-    </para>
			
 
				-  </chapter>
			
 
				-  <chapter id="CommonBackEndReq">
			
 
				-  <title>Kernel Debugger Internals</title>
			
 
				-  <sect1 id="kgdbArchitecture">
			
 
				-    <title>Architecture Specifics</title>
			
 
				-      <para>
			
 
				-      The kernel debugger is organized into a number of components:
			
 
				-      <orderedlist>
			
 
				-      <listitem><para>The debug core</para>
			
 
				-      <para>
			
 
				-      The debug core is found in kernel/debugger/debug_core.c.  It contains:
			
 
				-      <itemizedlist>
			
 
				-      <listitem><para>A generic OS exception handler which includes
			
 
				-      sync'ing the processors into a stopped state on an multi-CPU
			
 
				-      system.</para></listitem>
			
 
				-      <listitem><para>The API to talk to the kgdb I/O drivers</para></listitem>
			
 
				-      <listitem><para>The API to make calls to the arch-specific kgdb implementation</para></listitem>
			
 
				-      <listitem><para>The logic to perform safe memory reads and writes to memory while using the debugger</para></listitem>
			
 
				-      <listitem><para>A full implementation for software breakpoints unless overridden by the arch</para></listitem>
			
 
				-      <listitem><para>The API to invoke either the kdb or kgdb frontend to the debug core.</para></listitem>
			
 
				-      <listitem><para>The structures and callback API for atomic kernel mode setting.</para>
			
 
				-      <para>NOTE: kgdboc is where the kms callbacks are invoked.</para></listitem>
			
 
				-      </itemizedlist>
			
 
				-      </para>
			
 
				-      </listitem>
			
 
				-      <listitem><para>kgdb arch-specific implementation</para>
			
 
				-      <para>
			
 
				-      This implementation is generally found in arch/*/kernel/kgdb.c.
			
 
				-      As an example, arch/x86/kernel/kgdb.c contains the specifics to
			
 
				-      implement HW breakpoint as well as the initialization to
			
 
				-      dynamically register and unregister for the trap handlers on
			
 
				-      this architecture.  The arch-specific portion implements:
			
 
				-      <itemizedlist>
			
 
				-      <listitem><para>contains an arch-specific trap catcher which
			
 
				-      invokes kgdb_handle_exception() to start kgdb about doing its
			
 
				-      work</para></listitem>
			
 
				-      <listitem><para>translation to and from gdb specific packet format to pt_regs</para></listitem>
			
 
				-      <listitem><para>Registration and unregistration of architecture specific trap hooks</para></listitem>
			
 
				-      <listitem><para>Any special exception handling and cleanup</para></listitem>
			
 
				-      <listitem><para>NMI exception handling and cleanup</para></listitem>
			
 
				-      <listitem><para>(optional) HW breakpoints</para></listitem>
			
 
				-      </itemizedlist>
			
 
				-      </para>
			
 
				-      </listitem>
			
 
				-      <listitem><para>gdbstub frontend (aka kgdb)</para>
			
 
				-      <para>The gdbstub is located in kernel/debug/gdbstub.c. It contains:</para>
			
 
				-      <itemizedlist>
			
 
				-        <listitem><para>All the logic to implement the gdb serial protocol</para></listitem>
			
 
				-      </itemizedlist>
			
 
				-      </listitem>
			
 
				-      <listitem><para>kdb frontend</para>
			
 
				-      <para>The kdb debugger shell is broken down into a number of
			
 
				-      components.  The kdb core is located in kernel/debug/kdb.  There
			
 
				-      are a number of helper functions in some of the other kernel
			
 
				-      components to make it possible for kdb to examine and report
			
 
				-      information about the kernel without taking locks that could
			
 
				-      cause a kernel deadlock.  The kdb core contains implements the following functionality.</para>
			
 
				-      <itemizedlist>
			
 
				-        <listitem><para>A simple shell</para></listitem>
			
 
				-        <listitem><para>The kdb core command set</para></listitem>
			
 
				-        <listitem><para>A registration API to register additional kdb shell commands.</para>
			
 
				-	<itemizedlist>
			
 
				-        <listitem><para>A good example of a self-contained kdb module
			
 
				-        is the "ftdump" command for dumping the ftrace buffer.  See:
			
 
				-        kernel/trace/trace_kdb.c</para></listitem>
			
 
				-        <listitem><para>For an example of how to dynamically register
			
 
				-        a new kdb command you can build the kdb_hello.ko kernel module
			
 
				-        from samples/kdb/kdb_hello.c.  To build this example you can
			
 
				-        set CONFIG_SAMPLES=y and CONFIG_SAMPLE_KDB=m in your kernel
			
 
				-        config.  Later run "modprobe kdb_hello" and the next time you
			
 
				-        enter the kdb shell, you can run the "hello"
			
 
				-        command.</para></listitem>
			
 
				-	</itemizedlist></listitem>
			
 
				-        <listitem><para>The implementation for kdb_printf() which
			
 
				-        emits messages directly to I/O drivers, bypassing the kernel
			
 
				-        log.</para></listitem>
			
 
				-        <listitem><para>SW / HW breakpoint management for the kdb shell</para></listitem>
			
 
				-      </itemizedlist>
			
 
				-      </listitem>
			
 
				-      <listitem><para>kgdb I/O driver</para>
			
 
				-      <para>
			
 
				-      Each kgdb I/O driver has to provide an implementation for the following:
			
 
				-      <itemizedlist>
			
 
				-      <listitem><para>configuration via built-in or module</para></listitem>
			
 
				-      <listitem><para>dynamic configuration and kgdb hook registration calls</para></listitem>
			
 
				-      <listitem><para>read and write character interface</para></listitem>
			
 
				-      <listitem><para>A cleanup handler for unconfiguring from the kgdb core</para></listitem>
			
 
				-      <listitem><para>(optional) Early debug methodology</para></listitem>
			
 
				-      </itemizedlist>
			
 
				-      Any given kgdb I/O driver has to operate very closely with the
			
 
				-      hardware and must do it in such a way that does not enable
			
 
				-      interrupts or change other parts of the system context without
			
 
				-      completely restoring them. The kgdb core will repeatedly "poll"
			
 
				-      a kgdb I/O driver for characters when it needs input.  The I/O
			
 
				-      driver is expected to return immediately if there is no data
			
 
				-      available.  Doing so allows for the future possibility to touch
			
 
				-      watchdog hardware in such a way as to have a target system not
			
 
				-      reset when these are enabled.
			
 
				-      </para>
			
 
				-      </listitem>
			
 
				-      </orderedlist>
			
 
				-      </para>
			
 
				-      <para>
			
 
				-      If you are intent on adding kgdb architecture specific support
			
 
				-      for a new architecture, the architecture should define
			
 
				-      <constant>HAVE_ARCH_KGDB</constant> in the architecture specific
			
 
				-      Kconfig file.  This will enable kgdb for the architecture, and
			
 
				-      at that point you must create an architecture specific kgdb
			
 
				-      implementation.
			
 
				-      </para>
			
 
				-      <para>
			
 
				-      There are a few flags which must be set on every architecture in
			
 
				-      their &lt;asm/kgdb.h&gt; file.  These are:
			
 
				-      <itemizedlist>
			
 
				-        <listitem>
			
 
				-          <para>
			
 
				-          NUMREGBYTES: The size in bytes of all of the registers, so
			
 
				-          that we can ensure they will all fit into a packet.
			
 
				-          </para>
			
 
				-        </listitem>
			
 
				-        <listitem>
			
 
				-          <para>
			
 
				-          BUFMAX: The size in bytes of the buffer GDB will read into.
			
 
				-          This must be larger than NUMREGBYTES.
			
 
				-          </para>
			
 
				-        </listitem>
			
 
				-        <listitem>
			
 
				-          <para>
			
 
				-          CACHE_FLUSH_IS_SAFE: Set to 1 if it is always safe to call
			
 
				-          flush_cache_range or flush_icache_range.  On some architectures,
			
 
				-          these functions may not be safe to call on SMP since we keep other
			
 
				-          CPUs in a holding pattern.
			
 
				-          </para>
			
 
				-        </listitem>
			
 
				-      </itemizedlist>
			
 
				-      </para>
			
 
				-      <para>
			
 
				-      There are also the following functions for the common backend,
			
 
				-      found in kernel/kgdb.c, that must be supplied by the
			
 
				-      architecture-specific backend unless marked as (optional), in
			
 
				-      which case a default function maybe used if the architecture
			
 
				-      does not need to provide a specific implementation.
			
 
				-      </para>
			
 
				-!Iinclude/linux/kgdb.h
			
 
				-  </sect1>
			
 
				-  <sect1 id="kgdbocDesign">
			
 
				-  <title>kgdboc internals</title>
			
 
				-  <sect2>
			
 
				-  <title>kgdboc and uarts</title>
			
 
				-  <para>
			
 
				-  The kgdboc driver is actually a very thin driver that relies on the
			
 
				-  underlying low level to the hardware driver having "polling hooks"
			
 
				-  to which the tty driver is attached.  In the initial
			
 
				-  implementation of kgdboc the serial_core was changed to expose a
			
 
				-  low level UART hook for doing polled mode reading and writing of a
			
 
				-  single character while in an atomic context.  When kgdb makes an I/O
			
 
				-  request to the debugger, kgdboc invokes a callback in the serial
			
 
				-  core which in turn uses the callback in the UART driver.</para>
			
 
				-  <para>
			
 
				-  When using kgdboc with a UART, the UART driver must implement two callbacks in the <constant>struct uart_ops</constant>. Example from drivers/8250.c:<programlisting>
			
 
				-#ifdef CONFIG_CONSOLE_POLL
			
 
				-	.poll_get_char = serial8250_get_poll_char,
			
 
				-	.poll_put_char = serial8250_put_poll_char,
			
 
				-#endif
			
 
				-  </programlisting>
			
 
				-  Any implementation specifics around creating a polling driver use the
			
 
				-  <constant>#ifdef CONFIG_CONSOLE_POLL</constant>, as shown above.
			
 
				-  Keep in mind that polling hooks have to be implemented in such a way
			
 
				-  that they can be called from an atomic context and have to restore
			
 
				-  the state of the UART chip on return such that the system can return
			
 
				-  to normal when the debugger detaches.  You need to be very careful
			
 
				-  with any kind of lock you consider, because failing here is most likely
			
 
				-  going to mean pressing the reset button.
			
 
				-  </para>
			
 
				-  </sect2>
			
 
				-  <sect2 id="kgdbocKbd">
			
 
				-  <title>kgdboc and keyboards</title>
			
 
				-  <para>The kgdboc driver contains logic to configure communications
			
 
				-  with an attached keyboard.  The keyboard infrastructure is only
			
 
				-  compiled into the kernel when CONFIG_KDB_KEYBOARD=y is set in the
			
 
				-  kernel configuration.</para>
			
 
				-  <para>The core polled keyboard driver driver for PS/2 type keyboards
			
 
				-  is in drivers/char/kdb_keyboard.c.  This driver is hooked into the
			
 
				-  debug core when kgdboc populates the callback in the array
			
 
				-  called <constant>kdb_poll_funcs[]</constant>.  The
			
 
				-  kdb_get_kbd_char() is the top-level function which polls hardware
			
 
				-  for single character input.
			
 
				-  </para>
			
 
				-  </sect2>
			
 
				-  <sect2 id="kgdbocKms">
			
 
				-  <title>kgdboc and kms</title>
			
 
				-  <para>The kgdboc driver contains logic to request the graphics
			
 
				-  display to switch to a text context when you are using
			
 
				-  "kgdboc=kms,kbd", provided that you have a video driver which has a
			
 
				-  frame buffer console and atomic kernel mode setting support.</para>
			
 
				-  <para>
			
 
				-  Every time the kernel
			
 
				-  debugger is entered it calls kgdboc_pre_exp_handler() which in turn
			
 
				-  calls con_debug_enter() in the virtual console layer.  On resuming kernel
			
 
				-  execution, the kernel debugger calls kgdboc_post_exp_handler() which
			
 
				-  in turn calls con_debug_leave().</para>
			
 
				-  <para>Any video driver that wants to be compatible with the kernel
			
 
				-  debugger and the atomic kms callbacks must implement the
			
 
				-  mode_set_base_atomic, fb_debug_enter and fb_debug_leave operations.
			
 
				-  For the fb_debug_enter and fb_debug_leave the option exists to use
			
 
				-  the generic drm fb helper functions or implement something custom for
			
 
				-  the hardware.  The following example shows the initialization of the
			
 
				-  .mode_set_base_atomic operation in
			
 
				-  drivers/gpu/drm/i915/intel_display.c:
			
 
				-  <informalexample>
			
 
				-  <programlisting>
			
 
				-static const struct drm_crtc_helper_funcs intel_helper_funcs = {
			
 
				-[...]
			
 
				-        .mode_set_base_atomic = intel_pipe_set_base_atomic,
			
 
				-[...]
			
 
				-};
			
 
				-  </programlisting>
			
 
				-  </informalexample>
			
 
				-  </para>
			
 
				-  <para>Here is an example of how the i915 driver initializes the fb_debug_enter and fb_debug_leave functions to use the generic drm helpers in
			
 
				-  drivers/gpu/drm/i915/intel_fb.c:
			
 
				-  <informalexample>
			
 
				-  <programlisting>
			
 
				-static struct fb_ops intelfb_ops = {
			
 
				-[...]
			
 
				-       .fb_debug_enter = drm_fb_helper_debug_enter,
			
 
				-       .fb_debug_leave = drm_fb_helper_debug_leave,
			
 
				-[...]
			
 
				-};
			
 
				-  </programlisting>
			
 
				-  </informalexample>
			
 
				-  </para>
			
 
				-  </sect2>
			
 
				-  </sect1>
			
 
				-  </chapter>
			
 
				-  <chapter id="credits">
			
 
				-     <title>Credits</title>
			
 
				-	<para>
			
 
				-		The following people have contributed to this document:
			
 
				-		<orderedlist>
			
 
				-			<listitem><para>Amit Kale<email>amitkale@linsyssoft.com</email></para></listitem>
			
 
				-			<listitem><para>Tom Rini<email>trini@kernel.crashing.org</email></para></listitem>
			
 
				-		</orderedlist>
			
 
				-                In March 2008 this document was completely rewritten by:
			
 
				-		<itemizedlist>
			
 
				-		<listitem><para>Jason Wessel<email>jason.wessel@windriver.com</email></para></listitem>
			
 
				-		</itemizedlist>
			
 
				-                In Jan 2010 this document was updated to include kdb.
			
 
				-		<itemizedlist>
			
 
				-		<listitem><para>Jason Wessel<email>jason.wessel@windriver.com</email></para></listitem>
			
 
				-		</itemizedlist>
			
 
				-	</para>
			
 
				-  </chapter>
			
 
				-</book>
			
 
				-
			
--- a/Documentation/DocBook/libata.tmpl
+++ b/Documentation/DocBook/libata.tmpl
@@ -1,1625 +0,0 @@
 
				-<?xml version="1.0" encoding="UTF-8"?>
			
 
				-<!DOCTYPE book PUBLIC "-//OASIS//DTD DocBook XML V4.1.2//EN"
			
 
				-	"http://www.oasis-open.org/docbook/xml/4.1.2/docbookx.dtd" []>
			
 
				-
			
 
				-<book id="libataDevGuide">
			
 
				- <bookinfo>
			
 
				-  <title>libATA Developer's Guide</title>
			
 
				-  
			
 
				-  <authorgroup>
			
 
				-   <author>
			
 
				-    <firstname>Jeff</firstname>
			
 
				-    <surname>Garzik</surname>
			
 
				-   </author>
			
 
				-  </authorgroup>
			
 
				-
			
 
				-  <copyright>
			
 
				-   <year>2003-2006</year>
			
 
				-   <holder>Jeff Garzik</holder>
			
 
				-  </copyright>
			
 
				-
			
 
				-  <legalnotice>
			
 
				-   <para>
			
 
				-   The contents of this file are subject to the Open
			
 
				-   Software License version 1.1 that can be found at
			
 
				-   <ulink url="http://fedoraproject.org/wiki/Licensing:OSL1.1">http://fedoraproject.org/wiki/Licensing:OSL1.1</ulink>
			
 
				-   and is included herein by reference.
			
 
				-   </para>
			
 
				-
			
 
				-   <para>
			
 
				-   Alternatively, the contents of this file may be used under the terms
			
 
				-   of the GNU General Public License version 2 (the "GPL") as distributed
			
 
				-   in the kernel source COPYING file, in which case the provisions of
			
 
				-   the GPL are applicable instead of the above.  If you wish to allow
			
 
				-   the use of your version of this file only under the terms of the
			
 
				-   GPL and not to allow others to use your version of this file under
			
 
				-   the OSL, indicate your decision by deleting the provisions above and
			
 
				-   replace them with the notice and other provisions required by the GPL.
			
 
				-   If you do not delete the provisions above, a recipient may use your
			
 
				-   version of this file under either the OSL or the GPL.
			
 
				-   </para>
			
 
				-
			
 
				-  </legalnotice>
			
 
				- </bookinfo>
			
 
				-
			
 
				-<toc></toc>
			
 
				-
			
 
				-  <chapter id="libataIntroduction">
			
 
				-     <title>Introduction</title>
			
 
				-  <para>
			
 
				-  libATA is a library used inside the Linux kernel to support ATA host
			
 
				-  controllers and devices.  libATA provides an ATA driver API, class
			
 
				-  transports for ATA and ATAPI devices, and SCSI&lt;-&gt;ATA translation
			
 
				-  for ATA devices according to the T10 SAT specification.
			
 
				-  </para>
			
 
				-  <para>
			
 
				-  This Guide documents the libATA driver API, library functions, library
			
 
				-  internals, and a couple sample ATA low-level drivers.
			
 
				-  </para>
			
 
				-  </chapter>
			
 
				-
			
 
				-  <chapter id="libataDriverApi">
			
 
				-     <title>libata Driver API</title>
			
 
				-     <para>
			
 
				-     struct ata_port_operations is defined for every low-level libata
			
 
				-     hardware driver, and it controls how the low-level driver
			
 
				-     interfaces with the ATA and SCSI layers.
			
 
				-     </para>
			
 
				-     <para>
			
 
				-     FIS-based drivers will hook into the system with ->qc_prep() and
			
 
				-     ->qc_issue() high-level hooks.  Hardware which behaves in a manner
			
 
				-     similar to PCI IDE hardware may utilize several generic helpers,
			
 
				-     defining at a bare minimum the bus I/O addresses of the ATA shadow
			
 
				-     register blocks.
			
 
				-     </para>
			
 
				-     <sect1>
			
 
				-        <title>struct ata_port_operations</title>
			
 
				-
			
 
				-	<sect2><title>Disable ATA port</title>
			
 
				-	<programlisting>
			
 
				-void (*port_disable) (struct ata_port *);
			
 
				-	</programlisting>
			
 
				-
			
 
				-	<para>
			
 
				-	Called from ata_bus_probe() error path, as well as when
			
 
				-	unregistering from the SCSI module (rmmod, hot unplug).
			
 
				-	This function should do whatever needs to be done to take the
			
 
				-	port out of use.  In most cases, ata_port_disable() can be used
			
 
				-	as this hook.
			
 
				-	</para>
			
 
				-	<para>
			
 
				-	Called from ata_bus_probe() on a failed probe.
			
 
				-	Called from ata_scsi_release().
			
 
				-	</para>
			
 
				-
			
 
				-	</sect2>
			
 
				-
			
 
				-	<sect2><title>Post-IDENTIFY device configuration</title>
			
 
				-	<programlisting>
			
 
				-void (*dev_config) (struct ata_port *, struct ata_device *);
			
 
				-	</programlisting>
			
 
				-
			
 
				-	<para>
			
 
				-	Called after IDENTIFY [PACKET] DEVICE is issued to each device
			
 
				-	found.  Typically used to apply device-specific fixups prior to
			
 
				-	issue of SET FEATURES - XFER MODE, and prior to operation.
			
 
				-	</para>
			
 
				-	<para>
			
 
				-	This entry may be specified as NULL in ata_port_operations.
			
 
				-	</para>
			
 
				-
			
 
				-	</sect2>
			
 
				-
			
 
				-	<sect2><title>Set PIO/DMA mode</title>
			
 
				-	<programlisting>
			
 
				-void (*set_piomode) (struct ata_port *, struct ata_device *);
			
 
				-void (*set_dmamode) (struct ata_port *, struct ata_device *);
			
 
				-void (*post_set_mode) (struct ata_port *);
			
 
				-unsigned int (*mode_filter) (struct ata_port *, struct ata_device *, unsigned int);
			
 
				-	</programlisting>
			
 
				-
			
 
				-	<para>
			
 
				-	Hooks called prior to the issue of SET FEATURES - XFER MODE
			
 
				-	command.  The optional ->mode_filter() hook is called when libata
			
 
				-	has built a mask of the possible modes. This is passed to the 
			
 
				-	->mode_filter() function which should return a mask of valid modes
			
 
				-	after filtering those unsuitable due to hardware limits. It is not
			
 
				-	valid to use this interface to add modes.
			
 
				-	</para>
			
 
				-	<para>
			
 
				-	dev->pio_mode and dev->dma_mode are guaranteed to be valid when
			
 
				-	->set_piomode() and when ->set_dmamode() is called. The timings for
			
 
				-	any other drive sharing the cable will also be valid at this point.
			
 
				-	That is, the library records the decisions for the modes of each
			
 
				-	drive on a channel before it attempts to set any of them.
			
 
				-	</para>
			
 
				-	<para>
			
 
				-	->post_set_mode() is
			
 
				-	called unconditionally, after the SET FEATURES - XFER MODE
			
 
				-	command completes successfully.
			
 
				-	</para>
			
 
				-
			
 
				-	<para>
			
 
				-	->set_piomode() is always called (if present), but
			
 
				-	->set_dmamode() is only called if DMA is possible.
			
 
				-	</para>
			
 
				-
			
 
				-	</sect2>
			
 
				-
			
 
				-	<sect2><title>Taskfile read/write</title>
			
 
				-	<programlisting>
			
 
				-void (*sff_tf_load) (struct ata_port *ap, struct ata_taskfile *tf);
			
 
				-void (*sff_tf_read) (struct ata_port *ap, struct ata_taskfile *tf);
			
 
				-	</programlisting>
			
 
				-
			
 
				-	<para>
			
 
				-	->tf_load() is called to load the given taskfile into hardware
			
 
				-	registers / DMA buffers.  ->tf_read() is called to read the
			
 
				-	hardware registers / DMA buffers, to obtain the current set of
			
 
				-	taskfile register values.
			
 
				-	Most drivers for taskfile-based hardware (PIO or MMIO) use
			
 
				-	ata_sff_tf_load() and ata_sff_tf_read() for these hooks.
			
 
				-	</para>
			
 
				-
			
 
				-	</sect2>
			
 
				-
			
 
				-	<sect2><title>PIO data read/write</title>
			
 
				-	<programlisting>
			
 
				-void (*sff_data_xfer) (struct ata_device *, unsigned char *, unsigned int, int);
			
 
				-	</programlisting>
			
 
				-
			
 
				-	<para>
			
 
				-All bmdma-style drivers must implement this hook.  This is the low-level
			
 
				-operation that actually copies the data bytes during a PIO data
			
 
				-transfer.
			
 
				-Typically the driver will choose one of ata_sff_data_xfer_noirq(),
			
 
				-ata_sff_data_xfer(), or ata_sff_data_xfer32().
			
 
				-	</para>
			
 
				-
			
 
				-	</sect2>
			
 
				-
			
 
				-	<sect2><title>ATA command execute</title>
			
 
				-	<programlisting>
			
 
				-void (*sff_exec_command)(struct ata_port *ap, struct ata_taskfile *tf);
			
 
				-	</programlisting>
			
 
				-
			
 
				-	<para>
			
 
				-	causes an ATA command, previously loaded with
			
 
				-	->tf_load(), to be initiated in hardware.
			
 
				-	Most drivers for taskfile-based hardware use ata_sff_exec_command()
			
 
				-	for this hook.
			
 
				-	</para>
			
 
				-
			
 
				-	</sect2>
			
 
				-
			
 
				-	<sect2><title>Per-cmd ATAPI DMA capabilities filter</title>
			
 
				-	<programlisting>
			
 
				-int (*check_atapi_dma) (struct ata_queued_cmd *qc);
			
 
				-	</programlisting>
			
 
				-
			
 
				-	<para>
			
 
				-Allow low-level driver to filter ATA PACKET commands, returning a status
			
 
				-indicating whether or not it is OK to use DMA for the supplied PACKET
			
 
				-command.
			
 
				-	</para>
			
 
				-	<para>
			
 
				-	This hook may be specified as NULL, in which case libata will
			
 
				-	assume that atapi dma can be supported.
			
 
				-	</para>
			
 
				-
			
 
				-	</sect2>
			
 
				-
			
 
				-	<sect2><title>Read specific ATA shadow registers</title>
			
 
				-	<programlisting>
			
 
				-u8   (*sff_check_status)(struct ata_port *ap);
			
 
				-u8   (*sff_check_altstatus)(struct ata_port *ap);
			
 
				-	</programlisting>
			
 
				-
			
 
				-	<para>
			
 
				-	Reads the Status/AltStatus ATA shadow register from
			
 
				-	hardware.  On some hardware, reading the Status register has
			
 
				-	the side effect of clearing the interrupt condition.
			
 
				-	Most drivers for taskfile-based hardware use
			
 
				-	ata_sff_check_status() for this hook.
			
 
				-	</para>
			
 
				-
			
 
				-	</sect2>
			
 
				-
			
 
				-	<sect2><title>Write specific ATA shadow register</title>
			
 
				-	<programlisting>
			
 
				-void (*sff_set_devctl)(struct ata_port *ap, u8 ctl);
			
 
				-	</programlisting>
			
 
				-
			
 
				-	<para>
			
 
				-	Write the device control ATA shadow register to the hardware.
			
 
				-	Most drivers don't need to define this.
			
 
				-	</para>
			
 
				-
			
 
				-	</sect2>
			
 
				-
			
 
				-	<sect2><title>Select ATA device on bus</title>
			
 
				-	<programlisting>
			
 
				-void (*sff_dev_select)(struct ata_port *ap, unsigned int device);
			
 
				-	</programlisting>
			
 
				-
			
 
				-	<para>
			
 
				-	Issues the low-level hardware command(s) that causes one of N
			
 
				-	hardware devices to be considered 'selected' (active and
			
 
				-	available for use) on the ATA bus.  This generally has no
			
 
				-	meaning on FIS-based devices.
			
 
				-	</para>
			
 
				-	<para>
			
 
				-	Most drivers for taskfile-based hardware use
			
 
				-	ata_sff_dev_select() for this hook.
			
 
				-	</para>
			
 
				-
			
 
				-	</sect2>
			
 
				-
			
 
				-	<sect2><title>Private tuning method</title>
			
 
				-	<programlisting>
			
 
				-void (*set_mode) (struct ata_port *ap);
			
 
				-	</programlisting>
			
 
				-
			
 
				-	<para>
			
 
				-	By default libata performs drive and controller tuning in
			
 
				-	accordance with the ATA timing rules and also applies blacklists
			
 
				-	and cable limits. Some controllers need special handling and have
			
 
				-	custom tuning rules, typically raid controllers that use ATA
			
 
				-	commands but do not actually do drive timing.
			
 
				-	</para>
			
 
				-
			
 
				-	<warning>
			
 
				-	<para>
			
 
				-	This hook should not be used to replace the standard controller
			
 
				-	tuning logic when a controller has quirks. Replacing the default
			
 
				-	tuning logic in that case would bypass handling for drive and
			
 
				-	bridge quirks that may be important to data reliability. If a
			
 
				-	controller needs to filter the mode selection it should use the
			
 
				-	mode_filter hook instead.
			
 
				-	</para>
			
 
				-	</warning>
			
 
				-
			
 
				-	</sect2>
			
 
				-
			
 
				-	<sect2><title>Control PCI IDE BMDMA engine</title>
			
 
				-	<programlisting>
			
 
				-void (*bmdma_setup) (struct ata_queued_cmd *qc);
			
 
				-void (*bmdma_start) (struct ata_queued_cmd *qc);
			
 
				-void (*bmdma_stop) (struct ata_port *ap);
			
 
				-u8   (*bmdma_status) (struct ata_port *ap);
			
 
				-	</programlisting>
			
 
				-
			
 
				-	<para>
			
 
				-When setting up an IDE BMDMA transaction, these hooks arm
			
 
				-(->bmdma_setup), fire (->bmdma_start), and halt (->bmdma_stop)
			
 
				-the hardware's DMA engine.  ->bmdma_status is used to read the standard
			
 
				-PCI IDE DMA Status register.
			
 
				-	</para>
			
 
				-
			
 
				-	<para>
			
 
				-These hooks are typically either no-ops, or simply not implemented, in
			
 
				-FIS-based drivers.
			
 
				-	</para>
			
 
				-	<para>
			
 
				-Most legacy IDE drivers use ata_bmdma_setup() for the bmdma_setup()
			
 
				-hook.  ata_bmdma_setup() will write the pointer to the PRD table to
			
 
				-the IDE PRD Table Address register, enable DMA in the DMA Command
			
 
				-register, and call exec_command() to begin the transfer.
			
 
				-	</para>
			
 
				-	<para>
			
 
				-Most legacy IDE drivers use ata_bmdma_start() for the bmdma_start()
			
 
				-hook.  ata_bmdma_start() will write the ATA_DMA_START flag to the DMA
			
 
				-Command register.
			
 
				-	</para>
			
 
				-	<para>
			
 
				-Many legacy IDE drivers use ata_bmdma_stop() for the bmdma_stop()
			
 
				-hook.  ata_bmdma_stop() clears the ATA_DMA_START flag in the DMA
			
 
				-command register.
			
 
				-	</para>
			
 
				-	<para>
			
 
				-Many legacy IDE drivers use ata_bmdma_status() as the bmdma_status() hook.
			
 
				-	</para>
			
 
				-
			
 
				-	</sect2>
			
 
				-
			
 
				-	<sect2><title>High-level taskfile hooks</title>
			
 
				-	<programlisting>
			
 
				-void (*qc_prep) (struct ata_queued_cmd *qc);
			
 
				-int (*qc_issue) (struct ata_queued_cmd *qc);
			
 
				-	</programlisting>
			
 
				-
			
 
				-	<para>
			
 
				-	Higher-level hooks, these two hooks can potentially supersede
			
 
				-	several of the above taskfile/DMA engine hooks.  ->qc_prep is
			
 
				-	called after the buffers have been DMA-mapped, and is typically
			
 
				-	used to populate the hardware's DMA scatter-gather table.
			
 
				-	Most drivers use the standard ata_qc_prep() helper function, but
			
 
				-	more advanced drivers roll their own.
			
 
				-	</para>
			
 
				-	<para>
			
 
				-	->qc_issue is used to make a command active, once the hardware
			
 
				-	and S/G tables have been prepared.  IDE BMDMA drivers use the
			
 
				-	helper function ata_qc_issue_prot() for taskfile protocol-based
			
 
				-	dispatch.  More advanced drivers implement their own ->qc_issue.
			
 
				-	</para>
			
 
				-	<para>
			
 
				-	ata_qc_issue_prot() calls ->tf_load(), ->bmdma_setup(), and
			
 
				-	->bmdma_start() as necessary to initiate a transfer.
			
 
				-	</para>
			
 
				-
			
 
				-	</sect2>
			
 
				-
			
 
				-	<sect2><title>Exception and probe handling (EH)</title>
			
 
				-	<programlisting>
			
 
				-void (*eng_timeout) (struct ata_port *ap);
			
 
				-void (*phy_reset) (struct ata_port *ap);
			
 
				-	</programlisting>
			
 
				-
			
 
				-	<para>
			
 
				-Deprecated.  Use ->error_handler() instead.
			
 
				-	</para>
			
 
				-
			
 
				-	<programlisting>
			
 
				-void (*freeze) (struct ata_port *ap);
			
 
				-void (*thaw) (struct ata_port *ap);
			
 
				-	</programlisting>
			
 
				-
			
 
				-	<para>
			
 
				-ata_port_freeze() is called when HSM violations or some other
			
 
				-condition disrupts normal operation of the port.  A frozen port
			
 
				-is not allowed to perform any operation until the port is
			
 
				-thawed, which usually follows a successful reset.
			
 
				-	</para>
			
 
				-
			
 
				-	<para>
			
 
				-The optional ->freeze() callback can be used for freezing the port
			
 
				-hardware-wise (e.g. mask interrupt and stop DMA engine).  If a
			
 
				-port cannot be frozen hardware-wise, the interrupt handler
			
 
				-must ack and clear interrupts unconditionally while the port
			
 
				-is frozen.
			
 
				-	</para>
			
 
				-	<para>
			
 
				-The optional ->thaw() callback is called to perform the opposite of ->freeze():
			
 
				-prepare the port for normal operation once again.  Unmask interrupts,
			
 
				-start DMA engine, etc.
			
 
				-	</para>
			
 
				-
			
 
				-	<programlisting>
			
 
				-void (*error_handler) (struct ata_port *ap);
			
 
				-	</programlisting>
			
 
				-
			
 
				-	<para>
			
 
				-->error_handler() is a driver's hook into probe, hotplug, and recovery
			
 
				-and other exceptional conditions.  The primary responsibility of an
			
 
				-implementation is to call ata_do_eh() or ata_bmdma_drive_eh() with a set
			
 
				-of EH hooks as arguments:
			
 
				-	</para>
			
 
				-
			
 
				-	<para>
			
 
				-'prereset' hook (may be NULL) is called during an EH reset, before any other actions
			
 
				-are taken.
			
 
				-	</para>
			
 
				-
			
 
				-	<para>
			
 
				-'postreset' hook (may be NULL) is called after the EH reset is performed.  Based on
			
 
				-existing conditions, severity of the problem, and hardware capabilities,
			
 
				-	</para>
			
 
				-
			
 
				-	<para>
			
 
				-Either 'softreset' (may be NULL) or 'hardreset' (may be NULL) will be
			
 
				-called to perform the low-level EH reset.
			
 
				-	</para>
			
 
				-
			
 
				-	<programlisting>
			
 
				-void (*post_internal_cmd) (struct ata_queued_cmd *qc);
			
 
				-	</programlisting>
			
 
				-
			
 
				-	<para>
			
 
				-Perform any hardware-specific actions necessary to finish processing
			
 
				-after executing a probe-time or EH-time command via ata_exec_internal().
			
 
				-	</para>
			
 
				-
			
 
				-	</sect2>
			
 
				-
			
 
				-	<sect2><title>Hardware interrupt handling</title>
			
 
				-	<programlisting>
			
 
				-irqreturn_t (*irq_handler)(int, void *, struct pt_regs *);
			
 
				-void (*irq_clear) (struct ata_port *);
			
 
				-	</programlisting>
			
 
				-
			
 
				-	<para>
			
 
				-	->irq_handler is the interrupt handling routine registered with
			
 
				-	the system, by libata.  ->irq_clear is called during probe just
			
 
				-	before the interrupt handler is registered, to be sure hardware
			
 
				-	is quiet.
			
 
				-	</para>
			
 
				-	<para>
			
 
				-	The second argument, dev_instance, should be cast to a pointer
			
 
				-	to struct ata_host_set.
			
 
				-	</para>
			
 
				-	<para>
			
 
				-	Most legacy IDE drivers use ata_sff_interrupt() for the
			
 
				-	irq_handler hook, which scans all ports in the host_set,
			
 
				-	determines which queued command was active (if any), and calls
			
 
				-	ata_sff_host_intr(ap,qc).
			
 
				-	</para>
			
 
				-	<para>
			
 
				-	Most legacy IDE drivers use ata_sff_irq_clear() for the
			
 
				-	irq_clear() hook, which simply clears the interrupt and error
			
 
				-	flags in the DMA status register.
			
 
				-	</para>
			
 
				-
			
 
				-	</sect2>
			
 
				-
			
 
				-	<sect2><title>SATA phy read/write</title>
			
 
				-	<programlisting>
			
 
				-int (*scr_read) (struct ata_port *ap, unsigned int sc_reg,
			
 
				-		 u32 *val);
			
 
				-int (*scr_write) (struct ata_port *ap, unsigned int sc_reg,
			
 
				-                   u32 val);
			
 
				-	</programlisting>
			
 
				-
			
 
				-	<para>
			
 
				-	Read and write standard SATA phy registers.  Currently only used
			
 
				-	if ->phy_reset hook called the sata_phy_reset() helper function.
			
 
				-	sc_reg is one of SCR_STATUS, SCR_CONTROL, SCR_ERROR, or SCR_ACTIVE.
			
 
				-	</para>
			
 
				-
			
 
				-	</sect2>
			
 
				-
			
 
				-	<sect2><title>Init and shutdown</title>
			
 
				-	<programlisting>
			
 
				-int (*port_start) (struct ata_port *ap);
			
 
				-void (*port_stop) (struct ata_port *ap);
			
 
				-void (*host_stop) (struct ata_host_set *host_set);
			
 
				-	</programlisting>
			
 
				-
			
 
				-	<para>
			
 
				-	->port_start() is called just after the data structures for each
			
 
				-	port are initialized.  Typically this is used to alloc per-port
			
 
				-	DMA buffers / tables / rings, enable DMA engines, and similar
			
 
				-	tasks.  Some drivers also use this entry point as a chance to
			
 
				-	allocate driver-private memory for ap->private_data.
			
 
				-	</para>
			
 
				-	<para>
			
 
				-	Many drivers use ata_port_start() as this hook or call
			
 
				-	it from their own port_start() hooks.  ata_port_start()
			
 
				-	allocates space for a legacy IDE PRD table and returns.
			
 
				-	</para>
			
 
				-	<para>
			
 
				-	->port_stop() is called after ->host_stop().  Its sole function
			
 
				-	is to release DMA/memory resources, now that they are no longer
			
 
				-	actively being used.  Many drivers also free driver-private
			
 
				-	data from port at this time.
			
 
				-	</para>
			
 
				-	<para>
			
 
				-	->host_stop() is called after all ->port_stop() calls
			
 
				-have completed.  The hook must finalize hardware shutdown, release DMA
			
 
				-and other resources, etc.
			
 
				-	This hook may be specified as NULL, in which case it is not called.
			
 
				-	</para>
			
 
				-
			
 
				-	</sect2>
			
 
				-
			
 
				-     </sect1>
			
 
				-  </chapter>
			
 
				-
			
 
				-  <chapter id="libataEH">
			
 
				-        <title>Error handling</title>
			
 
				-
			
 
				-	<para>
			
 
				-	This chapter describes how errors are handled under libata.
			
 
				-	Readers are advised to read SCSI EH
			
 
				-	(Documentation/scsi/scsi_eh.txt) and ATA exceptions doc first.
			
 
				-	</para>
			
 
				-
			
 
				-	<sect1><title>Origins of commands</title>
			
 
				-	<para>
			
 
				-	In libata, a command is represented with struct ata_queued_cmd
			
 
				-	or qc.  qc's are preallocated during port initialization and
			
 
				-	repetitively used for command executions.  Currently only one
			
 
				-	qc is allocated per port but yet-to-be-merged NCQ branch
			
 
				-	allocates one for each tag and maps each qc to NCQ tag 1-to-1.
			
 
				-	</para>
			
 
				-	<para>
			
 
				-	libata commands can originate from two sources - libata itself
			
 
				-	and SCSI midlayer.  libata internal commands are used for
			
 
				-	initialization and error handling.  All normal blk requests
			
 
				-	and commands for SCSI emulation are passed as SCSI commands
			
 
				-	through queuecommand callback of SCSI host template.
			
 
				-	</para>
			
 
				-	</sect1>
			
 
				-
			
 
				-	<sect1><title>How commands are issued</title>
			
 
				-
			
 
				-	<variablelist>
			
 
				-
			
 
				-	<varlistentry><term>Internal commands</term>
			
 
				-	<listitem>
			
 
				-	<para>
			
 
				-	First, qc is allocated and initialized using
			
 
				-	ata_qc_new_init().  Although ata_qc_new_init() doesn't
			
 
				-	implement any wait or retry mechanism when qc is not
			
 
				-	available, internal commands are currently issued only during
			
 
				-	initialization and error recovery, so no other command is
			
 
				-	active and allocation is guaranteed to succeed.
			
 
				-	</para>
			
 
				-	<para>
			
 
				-	Once allocated, qc's taskfile is initialized for the command to
			
 
				-	be executed.  qc currently has two mechanisms to notify
			
 
				-	completion.  One is via qc->complete_fn() callback and the
			
 
				-	other is completion qc->waiting.  qc->complete_fn() callback
			
 
				-	is the asynchronous path used by normal SCSI translated
			
 
				-	commands and qc->waiting is the synchronous (issuer sleeps in
			
 
				-	process context) path used by internal commands.
			
 
				-	</para>
			
 
				-	<para>
			
 
				-	Once initialization is complete, host_set lock is acquired
			
 
				-	and the qc is issued.
			
 
				-	</para>
			
 
				-	</listitem>
			
 
				-	</varlistentry>
			
 
				-
			
 
				-	<varlistentry><term>SCSI commands</term>
			
 
				-	<listitem>
			
 
				-	<para>
			
 
				-	All libata drivers use ata_scsi_queuecmd() as
			
 
				-	hostt->queuecommand callback.  scmds can either be simulated
			
 
				-	or translated.  No qc is involved in processing a simulated
			
 
				-	scmd.  The result is computed right away and the scmd is
			
 
				-	completed.
			
 
				-	</para>
			
 
				-	<para>
			
 
				-	For a translated scmd, ata_qc_new_init() is invoked to
			
 
				-	allocate a qc and the scmd is translated into the qc.  SCSI
			
 
				-	midlayer's completion notification function pointer is stored
			
 
				-	into qc->scsidone.
			
 
				-	</para>
			
 
				-	<para>
			
 
				-	qc->complete_fn() callback is used for completion
			
 
				-	notification.  ATA commands use ata_scsi_qc_complete() while
			
 
				-	ATAPI commands use atapi_qc_complete().  Both functions end up
			
 
				-	calling qc->scsidone to notify upper layer when the qc is
			
 
				-	finished.  After translation is completed, the qc is issued
			
 
				-	with ata_qc_issue().
			
 
				-	</para>
			
 
				-	<para>
			
 
				-	Note that SCSI midlayer invokes hostt->queuecommand while
			
 
				-	holding host_set lock, so all above occur while holding
			
 
				-	host_set lock.
			
 
				-	</para>
			
 
				-	</listitem>
			
 
				-	</varlistentry>
			
 
				-
			
 
				-	</variablelist>
			
 
				-	</sect1>
			
 
				-
			
 
				-	<sect1><title>How commands are processed</title>
			
 
				-	<para>
			
 
				-	Depending on which protocol and which controller are used,
			
 
				-	commands are processed differently.  For the purpose of
			
 
				-	discussion, a controller which uses taskfile interface and all
			
 
				-	standard callbacks is assumed.
			
 
				-	</para>
			
 
				-	<para>
			
 
				-	Currently 6 ATA command protocols are used.  They can be
			
 
				-	sorted into the following four categories according to how
			
 
				-	they are processed.
			
 
				-	</para>
			
 
				-
			
 
				-	<variablelist>
			
 
				-	   <varlistentry><term>ATA NO DATA or DMA</term>
			
 
				-	   <listitem>
			
 
				-	   <para>
			
 
				-	   ATA_PROT_NODATA and ATA_PROT_DMA fall into this category.
			
 
				-	   These types of commands don't require any software
			
 
				-	   intervention once issued.  Device will raise interrupt on
			
 
				-	   completion.
			
 
				-	   </para>
			
 
				-	   </listitem>
			
 
				-	   </varlistentry>
			
 
				-
			
 
				-	   <varlistentry><term>ATA PIO</term>
			
 
				-	   <listitem>
			
 
				-	   <para>
			
 
				-	   ATA_PROT_PIO is in this category.  libata currently
			
 
				-	   implements PIO with polling.  ATA_NIEN bit is set to turn
			
 
				-	   off interrupt and pio_task on ata_wq performs polling and
			
 
				-	   IO.
			
 
				-	   </para>
			
 
				-	   </listitem>
			
 
				-	   </varlistentry>
			
 
				-
			
 
				-	   <varlistentry><term>ATAPI NODATA or DMA</term>
			
 
				-	   <listitem>
			
 
				-	   <para>
			
 
				-	   ATA_PROT_ATAPI_NODATA and ATA_PROT_ATAPI_DMA are in this
			
 
				-	   category.  packet_task is used to poll BSY bit after
			
 
				-	   issuing PACKET command.  Once BSY is turned off by the
			
 
				-	   device, packet_task transfers CDB and hands off processing
			
 
				-	   to interrupt handler.
			
 
				-	   </para>
			
 
				-	   </listitem>
			
 
				-	   </varlistentry>
			
 
				-
			
 
				-	   <varlistentry><term>ATAPI PIO</term>
			
 
				-	   <listitem>
			
 
				-	   <para>
			
 
				-	   ATA_PROT_ATAPI is in this category.  ATA_NIEN bit is set
			
 
				-	   and, as in ATAPI NODATA or DMA, packet_task submits cdb.
			
 
				-	   However, after submitting cdb, further processing (data
			
 
				-	   transfer) is handed off to pio_task.
			
 
				-	   </para>
			
 
				-	   </listitem>
			
 
				-	   </varlistentry>
			
 
				-	</variablelist>
			
 
				-        </sect1>
			
 
				-
			
 
				-	<sect1><title>How commands are completed</title>
			
 
				-	<para>
			
 
				-	Once issued, all qc's are either completed with
			
 
				-	ata_qc_complete() or time out.  For commands which are handled
			
 
				-	by interrupts, ata_host_intr() invokes ata_qc_complete(), and,
			
 
				-	for PIO tasks, pio_task invokes ata_qc_complete().  In error
			
 
				-	cases, packet_task may also complete commands.
			
 
				-	</para>
			
 
				-	<para>
			
 
				-	ata_qc_complete() does the following.
			
 
				-	</para>
			
 
				-
			
 
				-	<orderedlist>
			
 
				-
			
 
				-	<listitem>
			
 
				-	<para>
			
 
				-	DMA memory is unmapped.
			
 
				-	</para>
			
 
				-	</listitem>
			
 
				-
			
 
				-	<listitem>
			
 
				-	<para>
			
 
				-	ATA_QCFLAG_ACTIVE is cleared from qc->flags.
			
 
				-	</para>
			
 
				-	</listitem>
			
 
				-
			
 
				-	<listitem>
			
 
				-	<para>
			
 
				-	qc->complete_fn() callback is invoked.  If the return value of
			
 
				-	the callback is not zero, completion is short circuited and
			
 
				-	ata_qc_complete() returns.
			
 
				-	</para>
			
 
				-	</listitem>
			
 
				-
			
 
				-	<listitem>
			
 
				-	<para>
			
 
				-	__ata_qc_complete() is called, which does
			
 
				-	   <orderedlist>
			
 
				-
			
 
				-	   <listitem>
			
 
				-	   <para>
			
 
				-	   qc->flags is cleared to zero.
			
 
				-	   </para>
			
 
				-	   </listitem>
			
 
				-
			
 
				-	   <listitem>
			
 
				-	   <para>
			
 
				-	   ap->active_tag and qc->tag are poisoned.
			
 
				-	   </para>
			
 
				-	   </listitem>
			
 
				-
			
 
				-	   <listitem>
			
 
				-	   <para>
			
 
				-	   qc->waiting is cleared &amp; completed (in that order).
			
 
				-	   </para>
			
 
				-	   </listitem>
			
 
				-
			
 
				-	   <listitem>
			
 
				-	   <para>
			
 
				-	   qc is deallocated by clearing appropriate bit in ap->qactive.
			
 
				-	   </para>
			
 
				-	   </listitem>
			
 
				-
			
 
				-	   </orderedlist>
			
 
				-	</para>
			
 
				-	</listitem>
			
 
				-
			
 
				-	</orderedlist>
			
 
				-
			
 
				-	<para>
			
 
				-	So, it basically notifies upper layer and deallocates qc.  One
			
 
				-	exception is short-circuit path in #3 which is used by
			
 
				-	atapi_qc_complete().
			
 
				-	</para>
			
 
				-	<para>
			
 
				-	For all non-ATAPI commands, whether it fails or not, almost
			
 
				-	the same code path is taken and very little error handling
			
 
				-	takes place.  A qc is completed with success status if it
			
 
				-	succeeded, with failed status otherwise.
			
 
				-	</para>
			
 
				-	<para>
			
 
				-	However, failed ATAPI commands require more handling as
			
 
				-	REQUEST SENSE is needed to acquire sense data.  If an ATAPI
			
 
				-	command fails, ata_qc_complete() is invoked with error status,
			
 
				-	which in turn invokes atapi_qc_complete() via
			
 
				-	qc->complete_fn() callback.
			
 
				-	</para>
			
 
				-	<para>
			
 
				-	This makes atapi_qc_complete() set scmd->result to
			
 
				-	SAM_STAT_CHECK_CONDITION, complete the scmd and return 1.  As
			
 
				-	the sense data is empty but scmd->result is CHECK CONDITION,
			
 
				-	SCSI midlayer will invoke EH for the scmd, and returning 1
			
 
				-	makes ata_qc_complete() return without deallocating the qc.
			
 
				-	This leads us to ata_scsi_error() with partially completed qc.
			
 
				-	</para>
			
 
				-
			
 
				-	</sect1>
			
 
				-
			
 
				-	<sect1><title>ata_scsi_error()</title>
			
 
				-	<para>
			
 
				-	ata_scsi_error() is the current transportt->eh_strategy_handler()
			
 
				-	for libata.  As discussed above, this will be entered in two
			
 
				-	cases - timeout and ATAPI error completion.  This function
			
 
				-	calls low level libata driver's eng_timeout() callback, the
			
 
				-	standard callback for which is ata_eng_timeout().  It checks
			
 
				-	if a qc is active and calls ata_qc_timeout() on the qc if so.
			
 
				-	Actual error handling occurs in ata_qc_timeout().
			
 
				-	</para>
			
 
				-	<para>
			
 
				-	If EH is invoked for timeout, ata_qc_timeout() stops BMDMA and
			
 
				-	completes the qc.  Note that as we're currently in EH, we
			
 
				-	cannot call scsi_done.  As described in SCSI EH doc, a
			
 
				-	recovered scmd should be either retried with
			
 
				-	scsi_queue_insert() or finished with scsi_finish_command().
			
 
				-	Here, we override qc->scsidone with scsi_finish_command() and
			
 
				-	call ata_qc_complete().
			
 
				-	</para>
			
 
				-	<para>
			
 
				-	If EH is invoked due to a failed ATAPI qc, the qc here is
			
 
				-	completed but not deallocated.  The purpose of this
			
 
				-	half-completion is to use the qc as place holder to make EH
			
 
				-	code reach this place.  This is a bit hackish, but it works.
			
 
				-	</para>
			
 
				-	<para>
			
 
				-	Once control reaches here, the qc is deallocated by invoking
			
 
				-	__ata_qc_complete() explicitly.  Then, internal qc for REQUEST
			
 
				-	SENSE is issued.  Once sense data is acquired, scmd is
			
 
				-	finished by directly invoking scsi_finish_command() on the
			
 
				-	scmd.  Note that as we already have completed and deallocated
			
 
				-	the qc which was associated with the scmd, we don't need
			
 
				-	to/cannot call ata_qc_complete() again.
			
 
				-	</para>
			
 
				-
			
 
				-	</sect1>
			
 
				-
			
 
				-	<sect1><title>Problems with the current EH</title>
			
 
				-
			
 
				-	<itemizedlist>
			
 
				-
			
 
				-	<listitem>
			
 
				-	<para>
			
 
				-	Error representation is too crude.  Currently any and all
			
 
				-	error conditions are represented with ATA STATUS and ERROR
			
 
				-	registers.  Errors which aren't ATA device errors are treated
			
 
				-	as ATA device errors by setting ATA_ERR bit.  Better error
			
 
				-	descriptor which can properly represent ATA and other
			
 
				-	errors/exceptions is needed.
			
 
				-	</para>
			
 
				-	</listitem>
			
 
				-
			
 
				-	<listitem>
			
 
				-	<para>
			
 
				-	When handling timeouts, no action is taken to make device
			
 
				-	forget about the timed out command and ready for new commands.
			
 
				-	</para>
			
 
				-	</listitem>
			
 
				-
			
 
				-	<listitem>
			
 
				-	<para>
			
 
				-	EH handling via ata_scsi_error() is not properly protected
			
 
				-	from usual command processing.  On EH entrance, the device is
			
 
				-	not in quiescent state.  Timed out commands may succeed or
			
 
				-	fail any time.  pio_task and atapi_task may still be running.
			
 
				-	</para>
			
 
				-	</listitem>
			
 
				-
			
 
				-	<listitem>
			
 
				-	<para>
			
 
				-	Too weak error recovery.  Devices / controllers causing HSM
			
 
				-	mismatch errors and other errors quite often require reset to
			
 
				-	return to known state.  Also, advanced error handling is
			
 
				-	necessary to support features like NCQ and hotplug.
			
 
				-	</para>
			
 
				-	</listitem>
			
 
				-
			
 
				-	<listitem>
			
 
				-	<para>
			
 
				-	ATA errors are directly handled in the interrupt handler and
			
 
				-	PIO errors in pio_task.  This is problematic for advanced
			
 
				-	error handling for the following reasons.
			
 
				-	</para>
			
 
				-	<para>
			
 
				-	First, advanced error handling often requires context and
			
 
				-	internal qc execution.
			
 
				-	</para>
			
 
				-	<para>
			
 
				-	Second, even a simple failure (say, CRC error) needs
			
 
				-	information gathering and could trigger complex error handling
			
 
				-	(say, resetting &amp; reconfiguring).  Having multiple code
			
 
				-	paths to gather information, enter EH and trigger actions
			
 
				-	makes life painful.
			
 
				-	</para>
			
 
				-	<para>
			
 
				-	Third, scattered EH code makes implementing low level drivers
			
 
				-	difficult.  Low level drivers override libata callbacks.  If
			
 
				-	EH is scattered over several places, each affected callback
			
 
				-	should perform its part of error handling.  This can be error
			
 
				-	prone and painful.
			
 
				-	</para>
			
 
				-	</listitem>
			
 
				-
			
 
				-	</itemizedlist>
			
 
				-	</sect1>
			
 
				-  </chapter>
			
 
				-
			
 
				-  <chapter id="libataExt">
			
 
				-     <title>libata Library</title>
			
 
				-!Edrivers/ata/libata-core.c
			
 
				-  </chapter>
			
 
				-
			
 
				-  <chapter id="libataInt">
			
 
				-     <title>libata Core Internals</title>
			
 
				-!Idrivers/ata/libata-core.c
			
 
				-  </chapter>
			
 
				-
			
 
				-  <chapter id="libataScsiInt">
			
 
				-     <title>libata SCSI translation/emulation</title>
			
 
				-!Edrivers/ata/libata-scsi.c
			
 
				-!Idrivers/ata/libata-scsi.c
			
 
				-  </chapter>
			
 
				-
			
 
				-  <chapter id="ataExceptions">
			
 
				-     <title>ATA errors and exceptions</title>
			
 
				-
			
 
				-  <para>
			
 
				-  This chapter tries to identify what error/exception conditions exist
			
 
				-  for ATA/ATAPI devices and describe how they should be handled in
			
 
				-  implementation-neutral way.
			
 
				-  </para>
			
 
				-
			
 
				-  <para>
			
 
				-  The term 'error' is used to describe conditions where either an
			
 
				-  explicit error condition is reported from device or a command has
			
 
				-  timed out.
			
 
				-  </para>
			
 
				-
			
 
				-  <para>
			
 
				-  The term 'exception' is either used to describe exceptional
			
 
				-  conditions which are not errors (say, power or hotplug events), or
			
 
				-  to describe both errors and non-error exceptional conditions.  Where
			
 
				-  explicit distinction between error and exception is necessary, the
			
 
				-  term 'non-error exception' is used.
			
 
				-  </para>
			
 
				-
			
 
				-  <sect1 id="excat">
			
 
				-     <title>Exception categories</title>
			
 
				-     <para>
			
 
				-     Exceptions are described primarily with respect to legacy
			
 
				-     taskfile + bus master IDE interface.  If a controller provides
			
 
				-     other better mechanism for error reporting, mapping those into
			
 
				-     categories described below shouldn't be difficult.
			
 
				-     </para>
			
 
				-
			
 
				-     <para>
			
 
				-     In the following sections, two recovery actions - reset and
			
 
				-     reconfiguring transport - are mentioned.  These are described
			
 
				-     further in <xref linkend="exrec"/>.
			
 
				-     </para>
			
 
				-
			
 
				-     <sect2 id="excatHSMviolation">
			
 
				-        <title>HSM violation</title>
			
 
				-        <para>
			
 
				-        This error is indicated when STATUS value doesn't match HSM
			
 
				-        requirement during issuing or execution of any ATA/ATAPI command.
			
 
				-        </para>
			
 
				-
			
 
				-	<itemizedlist>
			
 
				-	<title>Examples</title>
			
 
				-
			
 
				-        <listitem>
			
 
				-	<para>
			
 
				-	ATA_STATUS doesn't contain !BSY &amp;&amp; DRDY &amp;&amp; !DRQ while trying
			
 
				-	to issue a command.
			
 
				-        </para>
			
 
				-	</listitem>
			
 
				-
			
 
				-        <listitem>
			
 
				-	<para>
			
 
				-	!BSY &amp;&amp; !DRQ during PIO data transfer.
			
 
				-        </para>
			
 
				-	</listitem>
			
 
				-
			
 
				-        <listitem>
			
 
				-	<para>
			
 
				-	DRQ on command completion.
			
 
				-        </para>
			
 
				-	</listitem>
			
 
				-
			
 
				-        <listitem>
			
 
				-	<para>
			
 
				-	!BSY &amp;&amp; ERR after CDB transfer starts but before the
			
 
				-        last byte of CDB is transferred.  ATA/ATAPI standard states
			
 
				-        that &quot;The device shall not terminate the PACKET command
			
 
				-        with an error before the last byte of the command packet has
			
 
				-        been written&quot; in the error outputs description of PACKET
			
 
				-        command and the state diagram doesn't include such
			
 
				-        transitions.
			
 
				-	</para>
			
 
				-	</listitem>
			
 
				-
			
 
				-	</itemizedlist>
			
 
				-
			
 
				-	<para>
			
 
				-	In these cases, HSM is violated and not much information
			
 
				-	regarding the error can be acquired from STATUS or ERROR
			
 
				-	register.  IOW, this error can be anything - driver bug,
			
 
				-	faulty device, controller and/or cable.
			
 
				-	</para>
			
 
				-
			
 
				-	<para>
			
 
				-	As HSM is violated, reset is necessary to restore known state.
			
 
				-	Reconfiguring transport for lower speed might be helpful too
			
 
				-	as transmission errors sometimes cause this kind of errors.
			
 
				-	</para>
			
 
				-     </sect2>
			
 
				-     
			
 
				-     <sect2 id="excatDevErr">
			
 
				-        <title>ATA/ATAPI device error (non-NCQ / non-CHECK CONDITION)</title>
			
 
				-
			
 
				-	<para>
			
 
				-	These are errors detected and reported by ATA/ATAPI devices
			
 
				-	indicating device problems.  For this type of errors, STATUS
			
 
				-	and ERROR register values are valid and describe error
			
 
				-	condition.  Note that some of ATA bus errors are detected by
			
 
				-	ATA/ATAPI devices and reported using the same mechanism as
			
 
				-	device errors.  Those cases are described later in this
			
 
				-	section.
			
 
				-	</para>
			
 
				-
			
 
				-	<para>
			
 
				-	For ATA commands, this type of errors are indicated by !BSY
			
 
				-	&amp;&amp; ERR during command execution and on completion.
			
 
				-	</para>
			
 
				-
			
 
				-	<para>For ATAPI commands,</para>
			
 
				-
			
 
				-	<itemizedlist>
			
 
				-
			
 
				-	<listitem>
			
 
				-	<para>
			
 
				-	!BSY &amp;&amp; ERR &amp;&amp; ABRT right after issuing PACKET
			
 
				-	indicates that PACKET command is not supported and falls in
			
 
				-	this category.
			
 
				-	</para>
			
 
				-	</listitem>
			
 
				-
			
 
				-	<listitem>
			
 
				-	<para>
			
 
				-	!BSY &amp;&amp; ERR(==CHK) &amp;&amp; !ABRT after the last
			
 
				-	byte of CDB is transferred indicates CHECK CONDITION and
			
 
				-	doesn't fall in this category.
			
 
				-	</para>
			
 
				-	</listitem>
			
 
				-
			
 
				-	<listitem>
			
 
				-	<para>
			
 
				-	!BSY &amp;&amp; ERR(==CHK) &amp;&amp; ABRT after the last byte
			
 
				-        of CDB is transferred *probably* indicates CHECK CONDITION and
			
 
				-        doesn't fall in this category.
			
 
				-	</para>
			
 
				-	</listitem>
			
 
				-
			
 
				-	</itemizedlist>
			
 
				-
			
 
				-	<para>
			
 
				-	Of errors detected as above, the following are not ATA/ATAPI
			
 
				-	device errors but ATA bus errors and should be handled
			
 
				-	according to <xref linkend="excatATAbusErr"/>.
			
 
				-	</para>
			
 
				-
			
 
				-	<variablelist>
			
 
				-
			
 
				-	   <varlistentry>
			
 
				-	   <term>CRC error during data transfer</term>
			
 
				-	   <listitem>
			
 
				-	   <para>
			
 
				-	   This is indicated by ICRC bit in the ERROR register and
			
 
				-	   means that corruption occurred during data transfer.  Up to
			
 
				-	   ATA/ATAPI-7, the standard specifies that this bit is only
			
 
				-	   applicable to UDMA transfers but ATA/ATAPI-8 draft revision
			
 
				-	   1f says that the bit may be applicable to multiword DMA and
			
 
				-	   PIO.
			
 
				-	   </para>
			
 
				-	   </listitem>
			
 
				-	   </varlistentry>
			
 
				-
			
 
				-	   <varlistentry>
			
 
				-	   <term>ABRT error during data transfer or on completion</term>
			
 
				-	   <listitem>
			
 
				-	   <para>
			
 
				-	   Up to ATA/ATAPI-7, the standard specifies that ABRT could be
			
 
				-	   set on ICRC errors and on cases where a device is not able
			
 
				-	   to complete a command.  Combined with the fact that MWDMA
			
 
				-	   and PIO transfer errors aren't allowed to use ICRC bit up to
			
 
				-	   ATA/ATAPI-7, it seems to imply that ABRT bit alone could
			
 
				-	   indicate transfer errors.
			
 
				-	   </para>
			
 
				-	   <para>
			
 
				-	   However, ATA/ATAPI-8 draft revision 1f removes the part
			
 
				-	   that ICRC errors can turn on ABRT.  So, this is kind of
			
 
				-	   gray area.  Some heuristics are needed here.
			
 
				-	   </para>
			
 
				-	   </listitem>
			
 
				-	   </varlistentry>
			
 
				-
			
 
				-	</variablelist>
			
 
				-
			
 
				-	<para>
			
 
				-	ATA/ATAPI device errors can be further categorized as follows.
			
 
				-	</para>
			
 
				-
			
 
				-	<variablelist>
			
 
				-
			
 
				-	   <varlistentry>
			
 
				-	   <term>Media errors</term>
			
 
				-	   <listitem>
			
 
				-	   <para>
			
 
				-	   This is indicated by UNC bit in the ERROR register.  ATA
			
 
				-	   devices report UNC error only after a certain number of
			
 
				-	   retries cannot recover the data, so there's nothing much
			
 
				-	   else to do other than notifying upper layer.
			
 
				-	   </para>
			
 
				-	   <para>
			
 
				-	   READ and WRITE commands report CHS or LBA of the first
			
 
				-	   failed sector but ATA/ATAPI standard specifies that the
			
 
				-	   amount of transferred data on error completion is
			
 
				-	   indeterminate, so we cannot assume that sectors preceding
			
 
				-	   the failed sector have been transferred and thus cannot
			
 
				-	   complete those sectors successfully as SCSI does.
			
 
				-	   </para>
			
 
				-	   </listitem>
			
 
				-	   </varlistentry>
			
 
				-
			
 
				-	   <varlistentry>
			
 
				-	   <term>Media changed / media change requested error</term>
			
 
				-	   <listitem>
			
 
				-	   <para>
			
 
				-	   &lt;&lt;TODO: fill here&gt;&gt;
			
 
				-	   </para>
			
 
				-	   </listitem>
			
 
				-	   </varlistentry>
			
 
				-
			
 
				-	   <varlistentry><term>Address error</term>
			
 
				-	   <listitem>
			
 
				-	   <para>
			
 
				-	   This is indicated by IDNF bit in the ERROR register.
			
 
				-	   Report to upper layer.
			
 
				-	   </para>
			
 
				-	   </listitem>
			
 
				-	   </varlistentry>
			
 
				-
			
 
				-	   <varlistentry><term>Other errors</term>
			
 
				-	   <listitem>
			
 
				-	   <para>
			
 
				-	   This can be invalid command or parameter indicated by ABRT
			
 
				-	   ERROR bit or some other error condition.  Note that ABRT
			
 
				-	   bit can indicate a lot of things including ICRC and Address
			
 
				-	   errors.  Heuristics needed.
			
 
				-	   </para>
			
 
				-	   </listitem>
			
 
				-	   </varlistentry>
			
 
				-
			
 
				-	</variablelist>
			
 
				-
			
 
				-	<para>
			
 
				-	Depending on commands, not all STATUS/ERROR bits are
			
 
				-	applicable.  These non-applicable bits are marked with
			
 
				-	&quot;na&quot; in the output descriptions but up to ATA/ATAPI-7
			
 
				-	no definition of &quot;na&quot; can be found.  However,
			
 
				-	ATA/ATAPI-8 draft revision 1f describes &quot;N/A&quot; as
			
 
				-	follows.
			
 
				-	</para>
			
 
				-
			
 
				-	<blockquote>
			
 
				-	<variablelist>
			
 
				-	   <varlistentry><term>3.2.3.3a N/A</term>
			
 
				-	   <listitem>
			
 
				-	   <para>
			
 
				-	   A keyword that indicates a field has no defined value in
			
 
				-	   this standard and should not be checked by the host or
			
 
				-	   device. N/A fields should be cleared to zero.
			
 
				-	   </para>
			
 
				-	   </listitem>
			
 
				-	   </varlistentry>
			
 
				-	</variablelist>
			
 
				-	</blockquote>
			
 
				-
			
 
				-	<para>
			
 
				-	So, it seems reasonable to assume that &quot;na&quot; bits are
			
 
				-	cleared to zero by devices and thus need no explicit masking.
			
 
				-	</para>
			
 
				-
			
 
				-     </sect2>
			
 
				-
			
 
				-     <sect2 id="excatATAPIcc">
			
 
				-        <title>ATAPI device CHECK CONDITION</title>
			
 
				-
			
 
				-	<para>
			
 
				-	ATAPI device CHECK CONDITION error is indicated by set CHK bit
			
 
				-	(ERR bit) in the STATUS register after the last byte of CDB is
			
 
				-	transferred for a PACKET command.  For this kind of errors,
			
 
				-	sense data should be acquired to gather information regarding
			
 
				-	the errors.  REQUEST SENSE packet command should be used to
			
 
				-	acquire sense data.
			
 
				-	</para>
			
 
				-
			
 
				-	<para>
			
 
				-	Once sense data is acquired, this type of errors can be
			
 
				-	handled similarly to other SCSI errors.  Note that sense data
			
 
				-	may indicate ATA bus error (e.g. Sense Key 04h HARDWARE ERROR
			
 
				-	&amp;&amp; ASC/ASCQ 47h/00h SCSI PARITY ERROR).  In such
			
 
				-	cases, the error should be considered as an ATA bus error and
			
 
				-	handled according to <xref linkend="excatATAbusErr"/>.
			
 
				-	</para>
			
 
				-
			
 
				-     </sect2>
			
 
				-
			
 
				-     <sect2 id="excatNCQerr">
			
 
				-        <title>ATA device error (NCQ)</title>
			
 
				-
			
 
				-	<para>
			
 
				-	NCQ command error is indicated by cleared BSY and set ERR bit
			
 
				-	during NCQ command phase (one or more NCQ commands
			
 
				-	outstanding).  Although STATUS and ERROR registers will
			
 
				-	contain valid values describing the error, READ LOG EXT is
			
 
				-	required to clear the error condition, determine which command
			
 
				-	has failed and acquire more information.
			
 
				-	</para>
			
 
				-
			
 
				-	<para>
			
 
				-	READ LOG EXT Log Page 10h reports which tag has failed and
			
 
				-	taskfile register values describing the error.  With this
			
 
				-	information the failed command can be handled as a normal ATA
			
 
				-	command error as in <xref linkend="excatDevErr"/> and all
			
 
				-	other in-flight commands must be retried.  Note that this
			
 
				-	retry should not be counted - it's likely that commands
			
 
				-	retried this way would have completed normally if it were not
			
 
				-	for the failed command.
			
 
				-	</para>
			
 
				-
			
 
				-	<para>
			
 
				-	Note that ATA bus errors can be reported as ATA device NCQ
			
 
				-	errors.  This should be handled as described in <xref
			
 
				-	linkend="excatATAbusErr"/>.
			
 
				-	</para>
			
 
				-
			
 
				-	<para>
			
 
				-	If READ LOG EXT Log Page 10h fails or reports NQ, we're
			
 
				-	thoroughly screwed.  This condition should be treated
			
 
				-	according to <xref linkend="excatHSMviolation"/>.
			
 
				-	</para>
			
 
				-
			
 
				-     </sect2>
			
 
				-
			
 
				-     <sect2 id="excatATAbusErr">
			
 
				-        <title>ATA bus error</title>
			
 
				-
			
 
				-	<para>
			
 
				-	ATA bus error means that data corruption occurred during
			
 
				-	transmission over ATA bus (SATA or PATA).  This type of errors
			
 
				-	can be indicated by
			
 
				-	</para>
			
 
				-
			
 
				-	<itemizedlist>
			
 
				-
			
 
				-	<listitem>
			
 
				-	<para>
			
 
				-	ICRC or ABRT error as described in <xref linkend="excatDevErr"/>.
			
 
				-	</para>
			
 
				-	</listitem>
			
 
				-
			
 
				-	<listitem>
			
 
				-	<para>
			
 
				-	Controller-specific error completion with error information
			
 
				-	indicating transmission error.
			
 
				-	</para>
			
 
				-	</listitem>
			
 
				-
			
 
				-	<listitem>
			
 
				-	<para>
			
 
				-	On some controllers, command timeout.  In this case, there may
			
 
				-	be a mechanism to determine that the timeout is due to
			
 
				-	transmission error.
			
 
				-	</para>
			
 
				-	</listitem>
			
 
				-
			
 
				-	<listitem>
			
 
				-	<para>
			
 
				-	Unknown/random errors, timeouts and all sorts of weirdities.
			
 
				-	</para>
			
 
				-	</listitem>
			
 
				-
			
 
				-	</itemizedlist>
			
 
				-
			
 
				-	<para>
			
 
				-	As described above, transmission errors can cause wide variety
			
 
				-	of symptoms ranging from device ICRC error to random device
			
 
				-	lockup, and, for many cases, there is no way to tell if an
			
 
				-	error condition is due to transmission error or not;
			
 
				-	therefore, it's necessary to employ some kind of heuristic
			
 
				-	when dealing with errors and timeouts.  For example,
			
 
				-	encountering repetitive ABRT errors for known supported
			
 
				-	command is likely to indicate ATA bus error.
			
 
				-	</para>
			
 
				-
			
 
				-	<para>
			
 
				-	Once it's determined that ATA bus errors have possibly
			
 
				-	occurred, lowering ATA bus transmission speed is one of
			
 
				-	actions which may alleviate the problem.  See <xref
			
 
				-	linkend="exrecReconf"/> for more information.
			
 
				-	</para>
			
 
				-
			
 
				-     </sect2>
			
 
				-
			
 
				-     <sect2 id="excatPCIbusErr">
			
 
				-        <title>PCI bus error</title>
			
 
				-
			
 
				-	<para>
			
 
				-	Data corruption or other failures during transmission over PCI
			
 
				-	(or other system bus).  For standard BMDMA, this is indicated
			
 
				-	by Error bit in the BMDMA Status register.  This type of
			
 
				-	errors must be logged as it indicates something is very wrong
			
 
				-	with the system.  Resetting host controller is recommended.
			
 
				-	</para>
			
 
				-
			
 
				-     </sect2>
			
 
				-
			
 
				-     <sect2 id="excatLateCompletion">
			
 
				-        <title>Late completion</title>
			
 
				-
			
 
				-	<para>
			
 
				-	This occurs when timeout occurs and the timeout handler finds
			
 
				-	out that the timed out command has completed successfully or
			
 
				-	with error.  This is usually caused by lost interrupts.  This
			
 
				-	type of errors must be logged.  Resetting host controller is
			
 
				-	recommended.
			
 
				-	</para>
			
 
				-
			
 
				-     </sect2>
			
 
				-
			
 
				-     <sect2 id="excatUnknown">
			
 
				-        <title>Unknown error (timeout)</title>
			
 
				-
			
 
				-	<para>
			
 
				-	This is when timeout occurs and the command is still
			
 
				-	processing or the host and device are in unknown state.  When
			
 
				-	this occurs, HSM could be in any valid or invalid state.  To
			
 
				-	bring the device to known state and make it forget about the
			
 
				-	timed out command, resetting is necessary.  The timed out
			
 
				-	command may be retried.
			
 
				-	</para>
			
 
				-
			
 
				-	<para>
			
 
				-	Timeouts can also be caused by transmission errors.  Refer to
			
 
				-	<xref linkend="excatATAbusErr"/> for more details.
			
 
				-	</para>
			
 
				-
			
 
				-     </sect2>
			
 
				-
			
 
				-     <sect2 id="excatHoplugPM">
			
 
				-        <title>Hotplug and power management exceptions</title>
			
 
				-
			
 
				-	<para>
			
 
				-	&lt;&lt;TODO: fill here&gt;&gt;
			
 
				-	</para>
			
 
				-
			
 
				-     </sect2>
			
 
				-
			
 
				-  </sect1>
			
 
				-
			
 
				-  <sect1 id="exrec">
			
 
				-     <title>EH recovery actions</title>
			
 
				-
			
 
				-     <para>
			
 
				-     This section discusses several important recovery actions.
			
 
				-     </para>
			
 
				-
			
 
				-     <sect2 id="exrecClr">
			
 
				-        <title>Clearing error condition</title>
			
 
				-
			
 
				-	<para>
			
 
				-	Many controllers require their error registers to be cleared by
			
 
				-	error handler.  Different controllers may have different
			
 
				-	requirements.
			
 
				-	</para>
			
 
				-
			
 
				-	<para>
			
 
				-	For SATA, it's strongly recommended to clear at least SError
			
 
				-	register during error handling.
			
 
				-	</para>
			
 
				-     </sect2>
			
 
				-
			
 
				-     <sect2 id="exrecRst">
			
 
				-        <title>Reset</title>
			
 
				-
			
 
				-	<para>
			
 
				-	During EH, resetting is necessary in the following cases.
			
 
				-	</para>
			
 
				-
			
 
				-	<itemizedlist>
			
 
				-
			
 
				-	<listitem>
			
 
				-	<para>
			
 
				-	HSM is in unknown or invalid state
			
 
				-	</para>
			
 
				-	</listitem>
			
 
				-
			
 
				-	<listitem>
			
 
				-	<para>
			
 
				-	HBA is in unknown or invalid state
			
 
				-	</para>
			
 
				-	</listitem>
			
 
				-
			
 
				-	<listitem>
			
 
				-	<para>
			
 
				-	EH needs to make HBA/device forget about in-flight commands
			
 
				-	</para>
			
 
				-	</listitem>
			
 
				-
			
 
				-	<listitem>
			
 
				-	<para>
			
 
				-	HBA/device behaves weirdly
			
 
				-	</para>
			
 
				-	</listitem>
			
 
				-
			
 
				-	</itemizedlist>
			
 
				-
			
 
				-	<para>
			
 
				-	Resetting during EH might be a good idea regardless of error
			
 
				-	condition to improve EH robustness.  Whether to reset both or
			
 
				-	either one of HBA and device depends on situation but the
			
 
				-	following scheme is recommended.
			
 
				-	</para>
			
 
				-
			
 
				-	<itemizedlist>
			
 
				-
			
 
				-	<listitem>
			
 
				-	<para>
			
 
				-	When it's known that HBA is in ready state but ATA/ATAPI
			
 
				-	device is in unknown state, reset only device.
			
 
				-	</para>
			
 
				-	</listitem>
			
 
				-
			
 
				-	<listitem>
			
 
				-	<para>
			
 
				-	If HBA is in unknown state, reset both HBA and device.
			
 
				-	</para>
			
 
				-	</listitem>
			
 
				-
			
 
				-	</itemizedlist>
			
 
				-
			
 
				-	<para>
			
 
				-	HBA resetting is implementation specific.  For a controller
			
 
				-	complying to taskfile/BMDMA PCI IDE, stopping active DMA
			
 
				-	transaction may be sufficient iff BMDMA state is the only HBA
			
 
				-	context.  But even mostly taskfile/BMDMA PCI IDE complying
			
 
				-	controllers may have implementation specific requirements and
			
 
				-	mechanism to reset themselves.  This must be addressed by
			
 
				-	specific drivers.
			
 
				-	</para>
			
 
				-
			
 
				-	<para>
			
 
				-	OTOH, ATA/ATAPI standard describes in detail ways to reset
			
 
				-	ATA/ATAPI devices.
			
 
				-	</para>
			
 
				-
			
 
				-	<variablelist>
			
 
				-
			
 
				-	   <varlistentry><term>PATA hardware reset</term>
			
 
				-	   <listitem>
			
 
				-	   <para>
			
 
				-	   This is hardware initiated device reset signalled with
			
 
				-	   asserted PATA RESET- signal.  There is no standard way to
			
 
				-	   initiate hardware reset from software although some
			
 
				-	   hardware provides registers that allow driver to directly
			
 
				-	   tweak the RESET- signal.
			
 
				-	   </para>
			
 
				-	   </listitem>
			
 
				-	   </varlistentry>
			
 
				-
			
 
				-	   <varlistentry><term>Software reset</term>
			
 
				-	   <listitem>
			
 
				-	   <para>
			
 
				-	   This is achieved by turning CONTROL SRST bit on for at
			
 
				-	   least 5us.  Both PATA and SATA support it but, in case of
			
 
				-	   SATA, this may require controller-specific support as the
			
 
				-	   second Register FIS to clear SRST should be transmitted
			
 
				-	   while BSY bit is still set.  Note that on PATA, this resets
			
 
				-	   both master and slave devices on a channel.
			
 
				-	   </para>
			
 
				-	   </listitem>
			
 
				-	   </varlistentry>
			
 
				-
			
 
				-	   <varlistentry><term>EXECUTE DEVICE DIAGNOSTIC command</term>
			
 
				-	   <listitem>
			
 
				-	   <para>
			
 
				-	   Although ATA/ATAPI standard doesn't describe exactly, EDD
			
 
				-	   implies some level of resetting, possibly similar level
			
 
				-	   with software reset.  Host-side EDD protocol can be handled
			
 
				-	   with normal command processing and most SATA controllers
			
 
				-	   should be able to handle EDD's just like other commands.
			
 
				-	   As in software reset, EDD affects both devices on a PATA
			
 
				-	   bus.
			
 
				-	   </para>
			
 
				-	   <para>
			
 
				-	   Although EDD does reset devices, this doesn't suit error
			
 
				-	   handling as EDD cannot be issued while BSY is set and it's
			
 
				-	   unclear how it will act when device is in unknown/weird
			
 
				-	   state.
			
 
				-	   </para>
			
 
				-	   </listitem>
			
 
				-	   </varlistentry>
			
 
				-
			
 
				-	   <varlistentry><term>ATAPI DEVICE RESET command</term>
			
 
				-	   <listitem>
			
 
				-	   <para>
			
 
				-	   This is very similar to software reset except that reset
			
 
				-	   can be restricted to the selected device without affecting
			
 
				-	   the other device sharing the cable.
			
 
				-	   </para>
			
 
				-	   </listitem>
			
 
				-	   </varlistentry>
			
 
				-
			
 
				-	   <varlistentry><term>SATA phy reset</term>
			
 
				-	   <listitem>
			
 
				-	   <para>
			
 
				-	   This is the preferred way of resetting a SATA device.  In
			
 
				-	   effect, it's identical to PATA hardware reset.  Note that
			
 
				-	   this can be done with the standard SCR Control register.
			
 
				-	   As such, it's usually easier to implement than software
			
 
				-	   reset.
			
 
				-	   </para>
			
 
				-	   </listitem>
			
 
				-	   </varlistentry>
			
 
				-
			
 
				-	</variablelist>
			
 
				-
			
 
				-	<para>
			
 
				-	One more thing to consider when resetting devices is that
			
 
				-	resetting clears certain configuration parameters and they
			
 
				-	need to be set to their previous or newly adjusted values
			
 
				-	after reset.
			
 
				-	</para>
			
 
				-
			
 
				-	<para>
			
 
				-	Parameters affected are.
			
 
				-	</para>
			
 
				-
			
 
				-	<itemizedlist>
			
 
				-
			
 
				-	<listitem>
			
 
				-	<para>
			
 
				-	CHS set up with INITIALIZE DEVICE PARAMETERS (seldom used)
			
 
				-	</para>
			
 
				-	</listitem>
			
 
				-
			
 
				-	<listitem>
			
 
				-	<para>
			
 
				-	Parameters set with SET FEATURES including transfer mode setting
			
 
				-	</para>
			
 
				-	</listitem>
			
 
				-
			
 
				-	<listitem>
			
 
				-	<para>
			
 
				-	Block count set with SET MULTIPLE MODE
			
 
				-	</para>
			
 
				-	</listitem>
			
 
				-
			
 
				-	<listitem>
			
 
				-	<para>
			
 
				-	Other parameters (SET MAX, MEDIA LOCK...)
			
 
				-	</para>
			
 
				-	</listitem>
			
 
				-
			
 
				-	</itemizedlist>
			
 
				-
			
 
				-	<para>
			
 
				-	ATA/ATAPI standard specifies that some parameters must be
			
 
				-	maintained across hardware or software reset, but doesn't
			
 
				-	strictly specify all of them.  Always reconfiguring needed
			
 
				-	parameters after reset is required for robustness.  Note that
			
 
				-	this also applies when resuming from deep sleep (power-off).
			
 
				-	</para>
			
 
				-
			
 
				-	<para>
			
 
				-	Also, ATA/ATAPI standard requires that IDENTIFY DEVICE /
			
 
				-	IDENTIFY PACKET DEVICE is issued after any configuration
			
 
				-	parameter is updated or a hardware reset and the result used
			
 
				-	for further operation.  OS driver is required to implement
			
 
				-	revalidation mechanism to support this.
			
 
				-	</para>
			
 
				-
			
 
				-     </sect2>
			
 
				-
			
 
				-     <sect2 id="exrecReconf">
			
 
				-        <title>Reconfigure transport</title>
			
 
				-
			
 
				-	<para>
			
 
				-	For both PATA and SATA, a lot of corners are cut for cheap
			
 
				-	connectors, cables or controllers and it's quite common to see
			
 
				-	high transmission error rate.  This can be mitigated by
			
 
				-	lowering transmission speed.
			
 
				-	</para>
			
 
				-
			
 
				-	<para>
			
 
				-	The following is a possible scheme Jeff Garzik suggested.
			
 
				-	</para>
			
 
				-
			
 
				-	<blockquote>
			
 
				-	<para>
			
 
				-	If more than $N (3?) transmission errors happen in 15 minutes,
			
 
				-	</para>	
			
 
				-	<itemizedlist>
			
 
				-	<listitem>
			
 
				-	<para>
			
 
				-	if SATA, decrease SATA PHY speed.  if speed cannot be decreased,
			
 
				-	</para>
			
 
				-	</listitem>
			
 
				-	<listitem>
			
 
				-	<para>
			
 
				-	decrease UDMA xfer speed.  if at UDMA0, switch to PIO4,
			
 
				-	</para>
			
 
				-	</listitem>
			
 
				-	<listitem>
			
 
				-	<para>
			
 
				-	decrease PIO xfer speed.  if at PIO3, complain, but continue
			
 
				-	</para>
			
 
				-	</listitem>
			
 
				-	</itemizedlist>
			
 
				-	</blockquote>
			
 
				-
			
 
				-     </sect2>
			
 
				-
			
 
				-  </sect1>
			
 
				-
			
 
				-  </chapter>
			
 
				-
			
 
				-  <chapter id="PiixInt">
			
 
				-     <title>ata_piix Internals</title>
			
 
				-!Idrivers/ata/ata_piix.c
			
 
				-  </chapter>
			
 
				-
			
 
				-  <chapter id="SILInt">
			
 
				-     <title>sata_sil Internals</title>
			
 
				-!Idrivers/ata/sata_sil.c
			
 
				-  </chapter>
			
 
				-
			
 
				-  <chapter id="libataThanks">
			
 
				-     <title>Thanks</title>
			
 
				-  <para>
			
 
				-  The bulk of the ATA knowledge comes thanks to long conversations with
			
 
				-  Andre Hedrick (www.linux-ide.org), and long hours pondering the ATA
			
 
				-  and SCSI specifications.
			
 
				-  </para>
			
 
				-  <para>
			
 
				-  Thanks to Alan Cox for pointing out similarities 
			
 
				-  between SATA and SCSI, and in general for motivation to hack on
			
 
				-  libata.
			
 
				-  </para>
			
 
				-  <para>
			
 
				-  libata's device detection
			
 
				-  method, ata_pio_devchk, and in general all the early probing was
			
 
				-  based on extensive study of Hale Landis's probe/reset code in his
			
 
				-  ATADRVR driver (www.ata-atapi.com).
			
 
				-  </para>
			
 
				-  </chapter>
			
 
				-
			
 
				-</book>
			
--- a/Documentation/DocBook/librs.tmpl
+++ b/Documentation/DocBook/librs.tmpl
@@ -1,289 +0,0 @@
 
				-<?xml version="1.0" encoding="UTF-8"?>
			
 
				-<!DOCTYPE book PUBLIC "-//OASIS//DTD DocBook XML V4.1.2//EN"
			
 
				-	"http://www.oasis-open.org/docbook/xml/4.1.2/docbookx.dtd" []>
			
 
				-
			
 
				-<book id="Reed-Solomon-Library-Guide">
			
 
				- <bookinfo>
			
 
				-  <title>Reed-Solomon Library Programming Interface</title>
			
 
				-  
			
 
				-  <authorgroup>
			
 
				-   <author>
			
 
				-    <firstname>Thomas</firstname>
			
 
				-    <surname>Gleixner</surname>
			
 
				-    <affiliation>
			
 
				-     <address>
			
 
				-      <email>tglx@linutronix.de</email>
			
 
				-     </address>
			
 
				-    </affiliation>
			
 
				-   </author>
			
 
				-  </authorgroup>
			
 
				-
			
 
				-  <copyright>
			
 
				-   <year>2004</year>
			
 
				-   <holder>Thomas Gleixner</holder>
			
 
				-  </copyright>
			
 
				-
			
 
				-  <legalnotice>
			
 
				-   <para>
			
 
				-     This documentation is free software; you can redistribute
			
 
				-     it and/or modify it under the terms of the GNU General Public
			
 
				-     License version 2 as published by the Free Software Foundation.
			
 
				-   </para>
			
 
				-      
			
 
				-   <para>
			
 
				-     This program is distributed in the hope that it will be
			
 
				-     useful, but WITHOUT ANY WARRANTY; without even the implied
			
 
				-     warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
			
 
				-     See the GNU General Public License for more details.
			
 
				-   </para>
			
 
				-      
			
 
				-   <para>
			
 
				-     You should have received a copy of the GNU General Public
			
 
				-     License along with this program; if not, write to the Free
			
 
				-     Software Foundation, Inc., 59 Temple Place, Suite 330, Boston,
			
 
				-     MA 02111-1307 USA
			
 
				-   </para>
			
 
				-      
			
 
				-   <para>
			
 
				-     For more details see the file COPYING in the source
			
 
				-     distribution of Linux.
			
 
				-   </para>
			
 
				-  </legalnotice>
			
 
				- </bookinfo>
			
 
				-
			
 
				-<toc></toc>
			
 
				-
			
 
				-  <chapter id="intro">
			
 
				-      <title>Introduction</title>
			
 
				-  <para>
			
 
				-  	The generic Reed-Solomon Library provides encoding, decoding
			
 
				-	and error correction functions.
			
 
				-  </para>
			
 
				-  <para>
			
 
				-  	Reed-Solomon codes are used in communication and storage
			
 
				-	applications to ensure data integrity. 
			
 
				-  </para>
			
 
				-  <para>
			
 
				-  	This documentation is provided for developers who want to utilize
			
 
				-	the functions provided by the library.
			
 
				-  </para>
			
 
				-  </chapter>
			
 
				-  
			
 
				-  <chapter id="bugs">
			
 
				-     <title>Known Bugs And Assumptions</title>
			
 
				-  <para>
			
 
				-	None.	
			
 
				-  </para>
			
 
				-  </chapter>
			
 
				-
			
 
				-  <chapter id="usage">
			
 
				-     	<title>Usage</title>
			
 
				-	<para>
			
 
				-		This chapter provides examples of how to use the library.
			
 
				-	</para>
			
 
				-	<sect1>
			
 
				-		<title>Initializing</title>
			
 
				-		<para>
			
 
				-			The init function init_rs returns a pointer to an
			
 
				-			rs decoder structure, which holds the necessary
			
 
				-			information for encoding, decoding and error correction
			
 
				-			with the given polynomial. It either uses an existing
			
 
				-			matching decoder or creates a new one. On creation all
			
 
				-			the lookup tables for fast en/decoding are created.
			
 
				-			The function may take a while, so make sure not to 
			
 
				-			call it in critical code paths.
			
 
				-		</para>
			
 
				-		<programlisting>
			
 
				-/* the Reed Solomon control structure */
			
 
				-static struct rs_control *rs_decoder;
			
 
				-
			
 
				-/* Symbolsize is 10 (bits)
			
 
				- * Primitive polynomial is x^10+x^3+1
			
 
				- * first consecutive root is 0
			
 
				- * primitive element to generate roots = 1
			
 
				- * generator polynomial degree (number of roots) = 6
			
 
				- */
			
 
				-rs_decoder = init_rs (10, 0x409, 0, 1, 6);
			
 
				-		</programlisting>
			
 
				-	</sect1>
			
 
				-	<sect1>
			
 
				-		<title>Encoding</title>
			
 
				-		<para>
			
 
				-			The encoder calculates the Reed-Solomon code over
			
 
				-			the given data length and stores the result in 
			
 
				-			the parity buffer. Note that the parity buffer must
			
 
				-			be initialized before calling the encoder.
			
 
				-		</para>
			
 
				-		<para>
			
 
				-			The expanded data can be inverted on the fly by
			
 
				-			providing a non-zero inversion mask. The expanded data is
			
 
				-			XOR'ed with the mask. This is used e.g. for FLASH
			
 
				-			ECC, where the all 0xFF is inverted to an all 0x00.
			
 
				-			The Reed-Solomon code for all 0x00 is all 0x00. The
			
 
				-			code is inverted before storing to FLASH so it is 0xFF
			
 
				-			too. This prevents reads from an erased FLASH
			
 
				-			from resulting in ECC errors.
			
 
				-		</para>
			
 
				-		<para>
			
 
				-			The databytes are expanded to the given symbol size
			
 
				-			on the fly. There is no support for encoding continuous
			
 
				-			bitstreams with a symbol size != 8 at the moment. If
			
 
				-			it is necessary it should not be a big deal to implement
			
 
				-			such functionality.
			
 
				-		</para>
			
 
				-		<programlisting>
			
 
				-/* Parity buffer. Size = number of roots */
			
 
				-uint16_t par[6];
			
 
				-/* Initialize the parity buffer */
			
 
				-memset(par, 0, sizeof(par));
			
 
				-/* Encode 512 byte in data8. Store parity in buffer par */
			
 
				-encode_rs8 (rs_decoder, data8, 512, par, 0);
			
 
				-		</programlisting>
			
 
				-	</sect1>
			
 
				-	<sect1>
			
 
				-		<title>Decoding</title>
			
 
				-		<para>
			
 
				-			The decoder calculates the syndrome over
			
 
				-			the given data length and the received parity symbols
			
 
				-			and corrects errors in the data.
			
 
				-		</para>
			
 
				-		<para>
			
 
				-			If a syndrome is available from a hardware decoder
			
 
				-			then the syndrome calculation is skipped.
			
 
				-		</para>
			
 
				-		<para>
			
 
				-			The correction of the data buffer can be suppressed
			
 
				-			by providing a correction pattern buffer and an error
			
 
				-			location buffer to the decoder. The decoder stores the
			
 
				-			calculated error location and the correction bitmask
			
 
				-			in the given buffers. This is useful for hardware
			
 
				-			decoders which use a weird bit ordering scheme.
			
 
				-		</para>
			
 
				-		<para>
			
 
				-			The databytes are expanded to the given symbol size
			
 
				-			on the fly. There is no support for decoding continuous
			
 
				-			bitstreams with a symbolsize != 8 at the moment. If
			
 
				-			it is necessary it should not be a big deal to implement
			
 
				-			such functionality.
			
 
				-		</para>
			
 
				-		
			
 
				-		<sect2>
			
 
				-		<title>
			
 
				-			Decoding with syndrome calculation, direct data correction
			
 
				-		</title>
			
 
				-		<programlisting>
			
 
				-/* Parity buffer. Size = number of roots */
			
 
				-uint16_t par[6];
			
 
				-uint8_t  data8[512];
			
 
				-int numerr;
			
 
				-/* Receive data */
			
 
				-.....
			
 
				-/* Receive parity */
			
 
				-.....
			
 
				-/* Decode 512 byte in data8.*/
			
 
				-numerr = decode_rs8 (rs_decoder, data8, par, 512, NULL, 0, NULL, 0, NULL);
			
 
				-		</programlisting>
			
 
				-		</sect2>
			
 
				-
			
 
				-		<sect2>
			
 
				-		<title>
			
 
				-			Decoding with syndrome given by hardware decoder, direct data correction
			
 
				-		</title>
			
 
				-		<programlisting>
			
 
				-/* Parity buffer. Size = number of roots */
			
 
				-uint16_t par[6], syn[6];
			
 
				-uint8_t  data8[512];
			
 
				-int numerr;
			
 
				-/* Receive data */
			
 
				-.....
			
 
				-/* Receive parity */
			
 
				-.....
			
 
				-/* Get syndrome from hardware decoder */
			
 
				-.....
			
 
				-/* Decode 512 byte in data8.*/
			
 
				-numerr = decode_rs8 (rs_decoder, data8, par, 512, syn, 0, NULL, 0, NULL);
			
 
				-		</programlisting>
			
 
				-		</sect2>
			
 
				-
			
 
				-		<sect2>
			
 
				-		<title>
			
 
				-			Decoding with syndrome given by hardware decoder, no direct data correction.
			
 
				-		</title>
			
 
				-		<para>
			
 
				-			Note: It's not necessary to give data and received parity to the decoder.
			
 
				-		</para>
			
 
				-		<programlisting>
			
 
				-/* Parity buffer. Size = number of roots */
			
 
				-uint16_t par[6], syn[6], corr[8];
			
 
				-uint8_t  data[512];
			
 
				-int numerr, errpos[8];
			
 
				-/* Receive data */
			
 
				-.....
			
 
				-/* Receive parity */
			
 
				-.....
			
 
				-/* Get syndrome from hardware decoder */
			
 
				-.....
			
 
				-/* Decode 512 byte in data8.*/
			
 
				-numerr = decode_rs8 (rs_decoder, NULL, NULL, 512, syn, 0, errpos, 0, corr);
			
 
				-for (i = 0; i &lt; numerr; i++) {
			
 
				-	do_error_correction_in_your_buffer(errpos[i], corr[i]);
			
 
				-}
			
 
				-		</programlisting>
			
 
				-		</sect2>
			
 
				-	</sect1>
			
 
				-	<sect1>
			
 
				-		<title>Cleanup</title>
			
 
				-		<para>
			
 
				-			The function free_rs frees the allocated resources,
			
 
				-			if the caller is the last user of the decoder.
			
 
				-		</para>
			
 
				-		<programlisting>
			
 
				-/* Release resources */
			
 
				-free_rs(rs_decoder);
			
 
				-		</programlisting>
			
 
				-	</sect1>
			
 
				-
			
 
				-  </chapter>
			
 
				-	
			
 
				-  <chapter id="structs">
			
 
				-     <title>Structures</title>
			
 
				-     <para>
			
 
				-     This chapter contains the autogenerated documentation of the structures which are
			
 
				-     used in the Reed-Solomon Library and are relevant for a developer.
			
 
				-     </para>
			
 
				-!Iinclude/linux/rslib.h
			
 
				-  </chapter>
			
 
				-
			
 
				-  <chapter id="pubfunctions">
			
 
				-     <title>Public Functions Provided</title>
			
 
				-     <para>
			
 
				-     This chapter contains the autogenerated documentation of the Reed-Solomon functions
			
 
				-     which are exported.
			
 
				-     </para>
			
 
				-!Elib/reed_solomon/reed_solomon.c
			
 
				-  </chapter>
			
 
				-  
			
 
				-  <chapter id="credits">
			
 
				-     <title>Credits</title>
			
 
				-	<para>
			
 
				-		The library code for encoding and decoding was written by Phil Karn.
			
 
				-	</para>
			
 
				-	<programlisting>
			
 
				-		Copyright 2002, Phil Karn, KA9Q
			
 
				- 		May be used under the terms of the GNU General Public License (GPL)
			
 
				-	</programlisting>
			
 
				-	<para>
			
 
				-		The wrapper functions and interfaces are written by Thomas Gleixner.
			
 
				-	</para>
			
 
				-	<para>
			
 
				-		Many users have provided bugfixes, improvements and helping hands for testing.
			
 
				-		Thanks a lot.
			
 
				-	</para>
			
 
				-	<para>
			
 
				-		The following people have contributed to this document:
			
 
				-	</para>
			
 
				-	<para>
			
 
				-		Thomas Gleixner <email>tglx@linutronix.de</email>
			
 
				-	</para>
			
 
				-  </chapter>
			
 
				-</book>
			
--- a/Documentation/DocBook/lsm.tmpl
+++ b/Documentation/DocBook/lsm.tmpl
@@ -1,265 +0,0 @@
 
				-<?xml version="1.0" encoding="UTF-8"?>
			
 
				-<!DOCTYPE article PUBLIC "-//OASIS//DTD DocBook XML V4.1.2//EN"
			
 
				-	"http://www.oasis-open.org/docbook/xml/4.1.2/docbookx.dtd" []>
			
 
				-
			
 
				-<article class="whitepaper" id="LinuxSecurityModule" lang="en">
			
 
				- <articleinfo>
			
 
				- <title>Linux Security Modules:  General Security Hooks for Linux</title>
			
 
				- <authorgroup>
			
 
				- <author>
			
 
				- <firstname>Stephen</firstname> 
			
 
				- <surname>Smalley</surname>
			
 
				- <affiliation>
			
 
				- <orgname>NAI Labs</orgname>
			
 
				- <address><email>ssmalley@nai.com</email></address>
			
 
				- </affiliation>
			
 
				- </author>
			
 
				- <author>
			
 
				- <firstname>Timothy</firstname> 
			
 
				- <surname>Fraser</surname>
			
 
				- <affiliation>
			
 
				- <orgname>NAI Labs</orgname>
			
 
				- <address><email>tfraser@nai.com</email></address>
			
 
				- </affiliation>
			
 
				- </author>
			
 
				- <author>
			
 
				- <firstname>Chris</firstname> 
			
 
				- <surname>Vance</surname>
			
 
				- <affiliation>
			
 
				- <orgname>NAI Labs</orgname>
			
 
				- <address><email>cvance@nai.com</email></address>
			
 
				- </affiliation>
			
 
				- </author>
			
 
				- </authorgroup>
			
 
				- </articleinfo>
			
 
				-
			
 
				-<sect1 id="Introduction"><title>Introduction</title>
			
 
				-
			
 
				-<para>
			
 
				-In March 2001, the National Security Agency (NSA) gave a presentation
			
 
				-about Security-Enhanced Linux (SELinux) at the 2.5 Linux Kernel
			
 
				-Summit.  SELinux is an implementation of flexible and fine-grained
			
 
				-nondiscretionary access controls in the Linux kernel, originally
			
 
				-implemented as its own particular kernel patch.  Several other
			
 
				-security projects (e.g. RSBAC, Medusa) have also developed flexible
			
 
				-access control architectures for the Linux kernel, and various
			
 
				-projects have developed particular access control models for Linux
			
 
				-(e.g. LIDS, DTE, SubDomain).  Each project has developed and
			
 
				-maintained its own kernel patch to support its security needs.
			
 
				-</para>
			
 
				-
			
 
				-<para>
			
 
				-In response to the NSA presentation, Linus Torvalds made a set of
			
 
				-remarks that described a security framework he would be willing to
			
 
				-consider for inclusion in the mainstream Linux kernel.  He described a
			
 
				-general framework that would provide a set of security hooks to
			
 
				-control operations on kernel objects and a set of opaque security
			
 
				-fields in kernel data structures for maintaining security attributes.
			
 
				-This framework could then be used by loadable kernel modules to
			
 
				-implement any desired model of security.  Linus also suggested the
			
 
				-possibility of migrating the Linux capabilities code into such a
			
 
				-module.
			
 
				-</para>
			
 
				-
			
 
				-<para>
			
 
				-The Linux Security Modules (LSM) project was started by WireX to
			
 
				-develop such a framework.  LSM is a joint development effort by
			
 
				-several security projects, including Immunix, SELinux, SGI and Janus,
			
 
				-and several individuals, including Greg Kroah-Hartman and James
			
 
				-Morris, to develop a Linux kernel patch that implements this
			
 
				-framework.  The patch is currently tracking the 2.4 series and is
			
 
				-targeted for integration into the 2.5 development series.  This
			
 
				-technical report provides an overview of the framework and the example
			
 
				-capabilities security module provided by the LSM kernel patch.
			
 
				-</para>
			
 
				-
			
 
				-</sect1>
			
 
				-
			
 
				-<sect1 id="framework"><title>LSM Framework</title>
			
 
				-
			
 
				-<para>
			
 
				-The LSM kernel patch provides a general kernel framework to support
			
 
				-security modules.  In particular, the LSM framework is primarily
			
 
				-focused on supporting access control modules, although future
			
 
				-development is likely to address other security needs such as
			
 
				-auditing.  By itself, the framework does not provide any additional
			
 
				-security; it merely provides the infrastructure to support security
			
 
				-modules.  The LSM kernel patch also moves most of the capabilities
			
 
				-logic into an optional security module, with the system defaulting
			
 
				-to the traditional superuser logic.  This capabilities module
			
 
				-is discussed further in <xref linkend="cap"/>.
			
 
				-</para>
			
 
				-
			
 
				-<para>
			
 
				-The LSM kernel patch adds security fields to kernel data structures
			
 
				-and inserts calls to hook functions at critical points in the kernel
			
 
				-code to manage the security fields and to perform access control.  It
			
 
				-also adds functions for registering and unregistering security
			
 
				-modules, and adds a general <function>security</function> system call
			
 
				-to support new system calls for security-aware applications.
			
 
				-</para>
			
 
				-
			
 
				-<para>
			
 
				-The LSM security fields are simply <type>void*</type> pointers.  For
			
 
				-process and program execution security information, security fields
			
 
				-were added to <structname>struct task_struct</structname> and 
			
 
				-<structname>struct linux_binprm</structname>.  For filesystem security
			
 
				-information, a security field was added to 
			
 
				-<structname>struct super_block</structname>.  For pipe, file, and socket
			
 
				-security information, security fields were added to 
			
 
				-<structname>struct inode</structname> and 
			
 
				-<structname>struct file</structname>.  For packet and network device security
			
 
				-information, security fields were added to
			
 
				-<structname>struct sk_buff</structname> and
			
 
				-<structname>struct net_device</structname>.  For System V IPC security
			
 
				-information, security fields were added to
			
 
				-<structname>struct kern_ipc_perm</structname> and
			
 
				-<structname>struct msg_msg</structname>; additionally, the definitions
			
 
				-for <structname>struct msg_msg</structname>, <structname>struct 
			
 
				-msg_queue</structname>, and <structname>struct 
			
 
				-shmid_kernel</structname> were moved to header files
			
 
				-(<filename>include/linux/msg.h</filename> and
			
 
				-<filename>include/linux/shm.h</filename> as appropriate) to allow
			
 
				-the security modules to use these definitions.
			
 
				-</para>
			
 
				-
			
 
				-<para>
			
 
				-Each LSM hook is a function pointer in a global table,
			
 
				-security_ops. This table is a
			
 
				-<structname>security_operations</structname> structure as defined by
			
 
				-<filename>include/linux/security.h</filename>.  Detailed documentation
			
 
				-for each hook is included in this header file.  At present, this
			
 
				-structure consists of a collection of substructures that group related
			
 
				-hooks based on the kernel object (e.g. task, inode, file, sk_buff,
			
 
				-etc) as well as some top-level hook function pointers for system
			
 
				-operations.  This structure is likely to be flattened in the future
			
 
				-for performance.  The placement of the hook calls in the kernel code
			
 
				-is described by the "called:" lines in the per-hook documentation in
			
 
				-the header file.  The hook calls can also be easily found in the
			
 
				-kernel code by looking for the string "security_ops->".
			
 
				-
			
 
				-</para>
			
 
				-
			
 
				-<para>
			
 
				-Linus mentioned per-process security hooks in his original remarks as a
			
 
				-possible alternative to global security hooks.  However, if LSM were
			
 
				-to start from the perspective of per-process hooks, then the base
			
 
				-framework would have to deal with how to handle operations that
			
 
				-involve multiple processes (e.g. kill), since each process might have
			
 
				-its own hook for controlling the operation.  This would require a
			
 
				-general mechanism for composing hooks in the base framework.
			
 
				-Additionally, LSM would still need global hooks for operations that
			
 
				-have no process context (e.g. network input operations).
			
 
				-Consequently, LSM provides global security hooks, but a security
			
 
				-module is free to implement per-process hooks (where that makes sense)
			
 
				-by storing a security_ops table in each process' security field and
			
 
				-then invoking these per-process hooks from the global hooks.
			
 
				-The problem of composition is thus deferred to the module.
			
 
				-</para>
			
 
				-
			
 
				-<para>
			
 
				-The global security_ops table is initialized to a set of hook
			
 
				-functions provided by a dummy security module that provides
			
 
				-traditional superuser logic.  A <function>register_security</function>
			
 
				-function (in <filename>security/security.c</filename>) is provided to
			
 
				-allow a security module to set security_ops to refer to its own hook
			
 
				-functions, and an <function>unregister_security</function> function is
			
 
				-provided to revert security_ops to the dummy module hooks.  This
			
 
				-mechanism is used to set the primary security module, which is
			
 
				-responsible for making the final decision for each hook.
			
 
				-</para>
			
 
				-
			
 
				-<para>
			
 
				-LSM also provides a simple mechanism for stacking additional security
			
 
				-modules with the primary security module.  It defines
			
 
				-<function>register_security</function> and
			
 
				-<function>unregister_security</function> hooks in the
			
 
				-<structname>security_operations</structname> structure and provides
			
 
				-<function>mod_reg_security</function> and
			
 
				-<function>mod_unreg_security</function> functions that invoke these
			
 
				-hooks after performing some sanity checking.  A security module can
			
 
				-call these functions in order to stack with other modules.  However,
			
 
				-the actual details of how this stacking is handled are deferred to the
			
 
				-module, which can implement these hooks in any way it wishes
			
 
				-(including always returning an error if it does not wish to support
			
 
				-stacking).  In this manner, LSM again defers the problem of
			
 
				-composition to the module.
			
 
				-</para>
			
 
				-
			
 
				-<para>
			
 
				-Although the LSM hooks are organized into substructures based on
			
 
				-kernel object, all of the hooks can be viewed as falling into two
			
 
				-major categories: hooks that are used to manage the security fields
			
 
				-and hooks that are used to perform access control.  Examples of the
			
 
				-first category of hooks include the
			
 
				-<function>alloc_security</function> and
			
 
				-<function>free_security</function> hooks defined for each kernel data
			
 
				-structure that has a security field.  These hooks are used to allocate
			
 
				-and free security structures for kernel objects.  The first category
			
 
				-of hooks also includes hooks that set information in the security
			
 
				-field after allocation, such as the <function>post_lookup</function>
			
 
				-hook in <structname>struct inode_security_ops</structname>.  This hook
			
 
				-is used to set security information for inodes after successful lookup
			
 
				-operations.  An example of the second category of hooks is the
			
 
				-<function>permission</function> hook in 
			
 
				-<structname>struct inode_security_ops</structname>.  This hook checks
			
 
				-permission when accessing an inode.
			
 
				-</para>
			
 
				-
			
 
				-</sect1>
			
 
				-
			
 
				-<sect1 id="cap"><title>LSM Capabilities Module</title>
			
 
				-
			
 
				-<para>
			
 
				-The LSM kernel patch moves most of the existing POSIX.1e capabilities
			
 
				-logic into an optional security module stored in the file
			
 
				-<filename>security/capability.c</filename>.  This change allows
			
 
				-users who do not want to use capabilities to omit this code entirely
			
 
				-from their kernel, instead using the dummy module for traditional
			
 
				-superuser logic or any other module that they desire.  This change
			
 
				-also allows the developers of the capabilities logic to maintain and
			
 
				-enhance their code more freely, without needing to integrate patches
			
 
				-back into the base kernel.
			
 
				-</para>
			
 
				-
			
 
				-<para>
			
 
				-In addition to moving the capabilities logic, the LSM kernel patch
			
 
				-could move the capability-related fields from the kernel data
			
 
				-structures into the new security fields managed by the security
			
 
				-modules.  However, at present, the LSM kernel patch leaves the
			
 
				-capability fields in the kernel data structures.  In his original
			
 
				-remarks, Linus suggested that this might be preferable so that other
			
 
				-security modules can be easily stacked with the capabilities module
			
 
				-without needing to chain multiple security structures on the security field.
			
 
				-It also avoids imposing extra overhead on the capabilities module
			
 
				-to manage the security fields.  However, the LSM framework could
			
 
				-certainly support such a move if it is determined to be desirable,
			
 
				-with only a few additional changes described below.
			
 
				-</para>
			
 
				-
			
 
				-<para>
			
 
				-At present, the capabilities logic for computing process capabilities
			
 
				-on <function>execve</function> and <function>set*uid</function>,
			
 
				-checking capabilities for a particular process, saving and checking
			
 
				-capabilities for netlink messages, and handling the
			
 
				-<function>capget</function> and <function>capset</function> system
			
 
				-calls have been moved into the capabilities module.  There are still a
			
 
				-few locations in the base kernel where capability-related fields are
			
 
				-directly examined or modified, but the current version of the LSM
			
 
				-patch does allow a security module to completely replace the
			
 
				-assignment and testing of capabilities.  These few locations would
			
 
				-need to be changed if the capability-related fields were moved into
			
 
				-the security field.  The following is a list of known locations that
			
 
				-still perform such direct examination or modification of
			
 
				-capability-related fields:
			
 
				-<itemizedlist>
			
 
				-<listitem><para><filename>fs/open.c</filename>:<function>sys_access</function></para></listitem>
			
 
				-<listitem><para><filename>fs/lockd/host.c</filename>:<function>nlm_bind_host</function></para></listitem>
			
 
				-<listitem><para><filename>fs/nfsd/auth.c</filename>:<function>nfsd_setuser</function></para></listitem>
			
 
				-<listitem><para><filename>fs/proc/array.c</filename>:<function>task_cap</function></para></listitem>
			
 
				-</itemizedlist>
			
 
				-</para>
			
 
				-
			
 
				-</sect1>
			
 
				-
			
 
				-</article>
			
--- a/Documentation/DocBook/mtdnand.tmpl
+++ b/Documentation/DocBook/mtdnand.tmpl
@@ -1,1291 +0,0 @@
 
				-<?xml version="1.0" encoding="UTF-8"?>
			
 
				-<!DOCTYPE book PUBLIC "-//OASIS//DTD DocBook XML V4.1.2//EN"
			
 
				-	"http://www.oasis-open.org/docbook/xml/4.1.2/docbookx.dtd" []>
			
 
				-
			
 
				-<book id="MTD-NAND-Guide">
			
 
				- <bookinfo>
			
 
				-  <title>MTD NAND Driver Programming Interface</title>
			
 
				-  
			
 
				-  <authorgroup>
			
 
				-   <author>
			
 
				-    <firstname>Thomas</firstname>
			
 
				-    <surname>Gleixner</surname>
			
 
				-    <affiliation>
			
 
				-     <address>
			
 
				-      <email>tglx@linutronix.de</email>
			
 
				-     </address>
			
 
				-    </affiliation>
			
 
				-   </author>
			
 
				-  </authorgroup>
			
 
				-
			
 
				-  <copyright>
			
 
				-   <year>2004</year>
			
 
				-   <holder>Thomas Gleixner</holder>
			
 
				-  </copyright>
			
 
				-
			
 
				-  <legalnotice>
			
 
				-   <para>
			
 
				-     This documentation is free software; you can redistribute
			
 
				-     it and/or modify it under the terms of the GNU General Public
			
 
				-     License version 2 as published by the Free Software Foundation.
			
 
				-   </para>
			
 
				-      
			
 
				-   <para>
			
 
				-     This program is distributed in the hope that it will be
			
 
				-     useful, but WITHOUT ANY WARRANTY; without even the implied
			
 
				-     warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
			
 
				-     See the GNU General Public License for more details.
			
 
				-   </para>
			
 
				-      
			
 
				-   <para>
			
 
				-     You should have received a copy of the GNU General Public
			
 
				-     License along with this program; if not, write to the Free
			
 
				-     Software Foundation, Inc., 59 Temple Place, Suite 330, Boston,
			
 
				-     MA 02111-1307 USA
			
 
				-   </para>
			
 
				-      
			
 
				-   <para>
			
 
				-     For more details see the file COPYING in the source
			
 
				-     distribution of Linux.
			
 
				-   </para>
			
 
				-  </legalnotice>
			
 
				- </bookinfo>
			
 
				-
			
 
				-<toc></toc>
			
 
				-
			
 
				-  <chapter id="intro">
			
 
				-      <title>Introduction</title>
			
 
				-  <para>
			
 
				-  	The generic NAND driver supports almost all NAND and AG-AND based
			
 
				-	chips and connects them to the Memory Technology Devices (MTD)
			
 
				-	subsystem of the Linux Kernel.
			
 
				-  </para>
			
 
				-  <para>
			
 
				-  	This documentation is provided for developers who want to implement
			
 
				-	board drivers or filesystem drivers suitable for NAND devices.
			
 
				-  </para>
			
 
				-  </chapter>
			
 
				-  
			
 
				-  <chapter id="bugs">
			
 
				-     <title>Known Bugs And Assumptions</title>
			
 
				-  <para>
			
 
				-	None.	
			
 
				-  </para>
			
 
				-  </chapter>
			
 
				-
			
 
				-  <chapter id="dochints">
			
 
				-     <title>Documentation hints</title>
			
 
				-     <para>
			
 
				-     The function and structure docs are autogenerated. Each function and 
			
 
				-     struct member has a short description which is marked with an [XXX] identifier.
			
 
				-     The following chapters explain the meaning of those identifiers.
			
 
				-     </para>
			
 
				-     <sect1 id="Function_identifiers_XXX">
			
 
				-	<title>Function identifiers [XXX]</title>
			
 
				-     	<para>
			
 
				-	The functions are marked with [XXX] identifiers in the short
			
 
				-	comment. The identifiers explain the usage and scope of the
			
 
				-	functions. Following identifiers are used:
			
 
				-     	</para>
			
 
				-	<itemizedlist>
			
 
				-		<listitem><para>
			
 
				-	  	[MTD Interface]</para><para>
			
 
				-		These functions provide the interface to the MTD kernel API. 
			
 
				-		They are not replaceable and provide functionality
			
 
				-		which is completely hardware independent.
			
 
				-		</para></listitem>
			
 
				-		<listitem><para>
			
 
				-	  	[NAND Interface]</para><para>
			
 
				-		These functions are exported and provide the interface to the NAND kernel API. 
			
 
				-		</para></listitem>
			
 
				-		<listitem><para>
			
 
				-	  	[GENERIC]</para><para>
			
 
				-		Generic functions are not replaceable and provide functionality
			
 
				-		which is completely hardware independent.
			
 
				-		</para></listitem>
			
 
				-		<listitem><para>
			
 
				-	  	[DEFAULT]</para><para>
			
 
				-		Default functions provide hardware related functionality which is suitable
			
 
				-		for most of the implementations. These functions can be replaced by the
			
 
				-		board driver if necessary. Those functions are called via pointers in the
			
 
				-		NAND chip description structure. The board driver can set the functions which
			
 
				-		should be replaced by board dependent functions before calling nand_scan().
			
 
				-		If the function pointer is NULL on entry to nand_scan() then the pointer
			
 
				-		is set to the default function which is suitable for the detected chip type.
			
 
				-		</para></listitem>
			
 
				-	</itemizedlist>
			
 
				-     </sect1>
			
 
				-     <sect1 id="Struct_member_identifiers_XXX">
			
 
				-	<title>Struct member identifiers [XXX]</title>
			
 
				-     	<para>
			
 
				-	The struct members are marked with [XXX] identifiers in the 
			
 
				-	comment. The identifiers explain the usage and scope of the
			
 
				-	members. Following identifiers are used:
			
 
				-     	</para>
			
 
				-	<itemizedlist>
			
 
				-		<listitem><para>
			
 
				-	  	[INTERN]</para><para>
			
 
				-		These members are for NAND driver internal use only and must not be
			
 
				-		modified. Most of these values are calculated from the chip geometry
			
 
				-		information which is evaluated during nand_scan().
			
 
				-		</para></listitem>
			
 
				-		<listitem><para>
			
 
				-	  	[REPLACEABLE]</para><para>
			
 
				-		Replaceable members hold hardware related functions which can be 
			
 
				-		provided by the board driver. The board driver can set the functions which
			
 
				-		should be replaced by board dependent functions before calling nand_scan().
			
 
				-		If the function pointer is NULL on entry to nand_scan() then the pointer
			
 
				-		is set to the default function which is suitable for the detected chip type.
			
 
				-		</para></listitem>
			
 
				-		<listitem><para>
			
 
				-	  	[BOARDSPECIFIC]</para><para>
			
 
				-		Board specific members hold hardware related information which must
			
 
				-		be provided by the board driver. The board driver must set the function
			
 
				-		pointers and datafields before calling nand_scan().
			
 
				-		</para></listitem>
			
 
				-		<listitem><para>
			
 
				-	  	[OPTIONAL]</para><para>
			
 
				-		Optional members can hold information relevant for the board driver. The
			
 
				-		generic NAND driver code does not use this information.
			
 
				-		</para></listitem>
			
 
				-	</itemizedlist>
			
 
				-     </sect1>
			
 
				-  </chapter>   
			
 
				-
			
 
				-  <chapter id="basicboarddriver">
			
 
				-     	<title>Basic board driver</title>
			
 
				-	<para>
			
 
				-		For most boards it will be sufficient to provide just the
			
 
				-		basic functions and fill out some really board dependent
			
 
				-		members in the nand chip description structure.
			
 
				-	</para>
			
 
				-	<sect1 id="Basic_defines">
			
 
				-		<title>Basic defines</title>
			
 
				-		<para>
			
 
				-			At least you have to provide a nand_chip structure
			
 
				-			and a storage for the ioremap'ed chip address.
			
 
				-			You can allocate the nand_chip structure using
			
 
				-			kmalloc or you can allocate it statically.
			
 
				-			The NAND chip structure embeds an mtd structure
			
 
				-			which will be registered to the MTD subsystem.
			
 
				-			You can extract a pointer to the mtd structure
			
 
				-			from a nand_chip pointer using the nand_to_mtd()
			
 
				-			helper.
			
 
				-		</para>
			
 
				-		<para>
			
 
				-			Kmalloc based example
			
 
				-		</para>
			
 
				-		<programlisting>
			
 
				-static struct mtd_info *board_mtd;
			
 
				-static void __iomem *baseaddr;
			
 
				-		</programlisting>
			
 
				-		<para>
			
 
				-			Static example
			
 
				-		</para>
			
 
				-		<programlisting>
			
 
				-static struct nand_chip board_chip;
			
 
				-static void __iomem *baseaddr;
			
 
				-		</programlisting>
			
 
				-	</sect1>
			
 
				-	<sect1 id="Partition_defines">
			
 
				-		<title>Partition defines</title>
			
 
				-		<para>
			
 
				-			If you want to divide your device into partitions, then
			
 
				-			define a partitioning scheme suitable to your board.
			
 
				-		</para>
			
 
				-		<programlisting>
			
 
				-#define NUM_PARTITIONS 2
			
 
				-static struct mtd_partition partition_info[] = {
			
 
				-	{ .name = "Flash partition 1",
			
 
				-	  .offset =  0,
			
 
				-	  .size =    8 * 1024 * 1024 },
			
 
				-	{ .name = "Flash partition 2",
			
 
				-	  .offset =  MTDPART_OFS_NEXT,
			
 
				-	  .size =    MTDPART_SIZ_FULL },
			
 
				-};
			
 
				-		</programlisting>
			
 
				-	</sect1>
			
 
				-	<sect1 id="Hardware_control_functions">
			
 
				-		<title>Hardware control function</title>
			
 
				-		<para>
			
 
				-			The hardware control function provides access to the 
			
 
				-			control pins of the NAND chip(s). 
			
 
				-			The access can be done by GPIO pins or by address lines.
			
 
				-			If you use address lines, make sure that the timing
			
 
				-			requirements are met.
			
 
				-		</para>
			
 
				-		<para>
			
 
				-			<emphasis>GPIO based example</emphasis>
			
 
				-		</para>
			
 
				-		<programlisting>
			
 
				-static void board_hwcontrol(struct mtd_info *mtd, int cmd)
			
 
				-{
			
 
				-	switch(cmd){
			
 
				-		case NAND_CTL_SETCLE: /* Set CLE pin high */ break;
			
 
				-		case NAND_CTL_CLRCLE: /* Set CLE pin low */ break;
			
 
				-		case NAND_CTL_SETALE: /* Set ALE pin high */ break;
			
 
				-		case NAND_CTL_CLRALE: /* Set ALE pin low */ break;
			
 
				-		case NAND_CTL_SETNCE: /* Set nCE pin low */ break;
			
 
				-		case NAND_CTL_CLRNCE: /* Set nCE pin high */ break;
			
 
				-	}
			
 
				-}
			
 
				-		</programlisting>
			
 
				-		<para>
			
 
				-			<emphasis>Address lines based example.</emphasis> It's assumed that the
			
 
				-			nCE pin is driven by a chip select decoder.
			
 
				-		</para>
			
 
				-		<programlisting>
			
 
				-static void board_hwcontrol(struct mtd_info *mtd, int cmd)
			
 
				-{
			
 
				-	struct nand_chip *this = mtd_to_nand(mtd);
			
 
				-	switch(cmd){
			
 
				-		case NAND_CTL_SETCLE: this->IO_ADDR_W |= CLE_ADRR_BIT;  break;
			
 
				-		case NAND_CTL_CLRCLE: this->IO_ADDR_W &amp;= ~CLE_ADRR_BIT; break;
			
 
				-		case NAND_CTL_SETALE: this->IO_ADDR_W |= ALE_ADRR_BIT;  break;
			
 
				-		case NAND_CTL_CLRALE: this->IO_ADDR_W &amp;= ~ALE_ADRR_BIT; break;
			
 
				-	}
			
 
				-}
			
 
				-		</programlisting>
			
 
				-	</sect1>
			
 
				-	<sect1 id="Device_ready_function">
			
 
				-		<title>Device ready function</title>
			
 
				-		<para>
			
 
				-			If the hardware interface has the ready busy pin of the NAND chip connected to a
			
 
				-			GPIO or other accessible I/O pin, this function is used to read back the state of the
			
 
				-			pin. The function has no arguments and should return 0, if the device is busy (R/B pin 
			
 
				-			is low) and 1, if the device is ready (R/B pin is high).
			
 
				-			If the hardware interface does not give access to the ready busy pin, then
			
 
				-			the function must not be defined and the function pointer this->dev_ready is set to NULL.		
			
 
				-		</para>
			
 
				-	</sect1>
			
 
				-	<sect1 id="Init_function">
			
 
				-		<title>Init function</title>
			
 
				-		<para>
			
 
				-			The init function allocates memory and sets up all the board
			
 
				-			specific parameters and function pointers. When everything
			
 
				-			is set up nand_scan() is called. This function tries to
			
 
				-			detect and identify the chip. If a chip is found all the
			
 
				-			internal data fields are initialized accordingly.
			
 
				-			The structure(s) have to be zeroed out first and then filled with the necessary
			
 
				-			information about the device.
			
 
				-		</para>
			
 
				-		<programlisting>
			
 
				-static int __init board_init (void)
			
 
				-{
			
 
				-	struct nand_chip *this;
			
 
				-	int err = 0;
			
 
				-
			
 
				-	/* Allocate memory for MTD device structure and private data */
			
 
				-	this = kzalloc(sizeof(struct nand_chip), GFP_KERNEL);
			
 
				-	if (!this) {
			
 
				-		printk ("Unable to allocate NAND MTD device structure.\n");
			
 
				-		err = -ENOMEM;
			
 
				-		goto out;
			
 
				-	}
			
 
				-
			
 
				-	board_mtd = nand_to_mtd(this);
			
 
				-
			
 
				-	/* map physical address */
			
 
				-	baseaddr = ioremap(CHIP_PHYSICAL_ADDRESS, 1024);
			
 
				-	if (!baseaddr) {
			
 
				-		printk("Ioremap to access NAND chip failed\n");
			
 
				-		err = -EIO;
			
 
				-		goto out_mtd;
			
 
				-	}
			
 
				-
			
 
				-	/* Set address of NAND IO lines */
			
 
				-	this->IO_ADDR_R = baseaddr;
			
 
				-	this->IO_ADDR_W = baseaddr;
			
 
				-	/* Reference hardware control function */
			
 
				-	this->hwcontrol = board_hwcontrol;
			
 
				-	/* Set command delay time, see datasheet for correct value */
			
 
				-	this->chip_delay = CHIP_DEPENDEND_COMMAND_DELAY;
			
 
				-	/* Assign the device ready function, if available */
			
 
				-	this->dev_ready = board_dev_ready;
			
 
				-	this->eccmode = NAND_ECC_SOFT;
			
 
				-
			
 
				-	/* Scan to find existence of the device */
			
 
				-	if (nand_scan (board_mtd, 1)) {
			
 
				-		err = -ENXIO;
			
 
				-		goto out_ior;
			
 
				-	}
			
 
				-	
			
 
				-	add_mtd_partitions(board_mtd, partition_info, NUM_PARTITIONS);
			
 
				-	goto out;
			
 
				-
			
 
				-out_ior:
			
 
				-	iounmap(baseaddr);
			
 
				-out_mtd:
			
 
				-	kfree (this);
			
 
				-out:
			
 
				-	return err;
			
 
				-}
			
 
				-module_init(board_init);
			
 
				-		</programlisting>
			
 
				-	</sect1>
			
 
				-	<sect1 id="Exit_function">
			
 
				-		<title>Exit function</title>
			
 
				-		<para>
			
 
				-			The exit function is only necessary if the driver is
			
 
				-			compiled as a module. It releases all resources which
			
 
				-			are held by the chip driver and unregisters the partitions
			
 
				-			in the MTD layer.
			
 
				-		</para>
			
 
				-		<programlisting>
			
 
				-#ifdef MODULE
			
 
				-static void __exit board_cleanup (void)
			
 
				-{
			
 
				-	/* Release resources, unregister device */
			
 
				-	nand_release (board_mtd);
			
 
				-
			
 
				-	/* unmap physical address */
			
 
				-	iounmap(baseaddr);
			
 
				-	
			
 
				-	/* Free the MTD device structure */
			
 
				-	kfree (mtd_to_nand(board_mtd));
			
 
				-}
			
 
				-module_exit(board_cleanup);
			
 
				-#endif
			
 
				-		</programlisting>
			
 
				-	</sect1>
			
 
				-  </chapter>
			
 
				-
			
 
				-  <chapter id="boarddriversadvanced">
			
 
				-     	<title>Advanced board driver functions</title>
			
 
				-	<para>
			
 
				-		This chapter describes the advanced functionality of the NAND
			
 
				-		driver. For a list of functions which can be overridden by the board
			
 
				-		driver see the documentation of the nand_chip structure.
			
 
				-	</para>
			
 
				-	<sect1 id="Multiple_chip_control">
			
 
				-		<title>Multiple chip control</title>
			
 
				-		<para>
			
 
				-			The nand driver can control chip arrays. Therefore the
			
 
				-			board driver must provide its own select_chip function. This
			
 
				-			function must (de)select the requested chip.
			
 
				-			The function pointer in the nand_chip structure must
			
 
				-			be set before calling nand_scan(). The maxchip parameter
			
 
				-			of nand_scan() defines the maximum number of chips to
			
 
				-			scan for. Make sure that the select_chip function can
			
 
				-			handle the requested number of chips.
			
 
				-		</para>
			
 
				-		<para>
			
 
				-			The nand driver concatenates the chips to one virtual
			
 
				-			chip and provides this virtual chip to the MTD layer.
			
 
				-		</para>
			
 
				-		<para>
			
 
				-			<emphasis>Note: The driver can only handle linear chip arrays
			
 
				-			of equally sized chips. There is no support for
			
 
				-			parallel arrays which extend the buswidth.</emphasis>
			
 
				-		</para>
			
 
				-		<para>
			
 
				-			<emphasis>GPIO based example</emphasis>
			
 
				-		</para>
			
 
				-		<programlisting>
			
 
				-static void board_select_chip (struct mtd_info *mtd, int chip)
			
 
				-{
			
 
				-	/* Deselect all chips, set all nCE pins high */
			
 
				-	GPIO(BOARD_NAND_NCE) |= 0xff;	
			
 
				-	if (chip >= 0)
			
 
				-		GPIO(BOARD_NAND_NCE) &amp;= ~ (1 &lt;&lt; chip);
			
 
				-}
			
 
				-		</programlisting>
			
 
				-		<para>
			
 
				-			<emphasis>Address lines based example.</emphasis>
			
 
				-			It's assumed that the nCE pins are connected to an
			
 
				-			address decoder.
			
 
				-		</para>
			
 
				-		<programlisting>
			
 
				-static void board_select_chip (struct mtd_info *mtd, int chip)
			
 
				-{
			
 
				-	struct nand_chip *this = mtd_to_nand(mtd);
			
 
				-	
			
 
				-	/* Deselect all chips */
			
 
				-	this->IO_ADDR_R &amp;= ~BOARD_NAND_ADDR_MASK;
			
 
				-	this->IO_ADDR_W &amp;= ~BOARD_NAND_ADDR_MASK;
			
 
				-	switch (chip) {
			
 
				-	case 0:
			
 
				-		this->IO_ADDR_R |= BOARD_NAND_ADDR_CHIP0;
			
 
				-		this->IO_ADDR_W |= BOARD_NAND_ADDR_CHIP0;
			
 
				-		break;
			
 
				-	....	
			
 
				-	case n:
			
 
				-		this->IO_ADDR_R |= BOARD_NAND_ADDR_CHIPn;
			
 
				-		this->IO_ADDR_W |= BOARD_NAND_ADDR_CHIPn;
			
 
				-		break;
			
 
				-	}	
			
 
				-}
			
 
				-		</programlisting>
			
 
				-	</sect1>
			
 
				-	<sect1 id="Hardware_ECC_support">
			
 
				-		<title>Hardware ECC support</title>
			
 
				-		<sect2 id="Functions_and_constants">
			
 
				-			<title>Functions and constants</title>
			
 
				-			<para>
			
 
				-				The nand driver supports four different types of
			
 
				-				hardware ECC.
			
 
				-				<itemizedlist>
			
 
				-				<listitem><para>NAND_ECC_HW3_256</para><para>
			
 
				-				Hardware ECC generator providing 3 bytes ECC per
			
 
				-				256 byte.
			
 
				-				</para>	</listitem>
			
 
				-				<listitem><para>NAND_ECC_HW3_512</para><para>
			
 
				-				Hardware ECC generator providing 3 bytes ECC per
			
 
				-				512 byte.
			
 
				-				</para>	</listitem>
			
 
				-				<listitem><para>NAND_ECC_HW6_512</para><para>
			
 
				-				Hardware ECC generator providing 6 bytes ECC per
			
 
				-				512 byte.
			
 
				-				</para>	</listitem>
			
 
				-				<listitem><para>NAND_ECC_HW8_512</para><para>
			
 
				-				Hardware ECC generator providing 8 bytes ECC per
			
 
				-				512 byte.
			
 
				-				</para>	</listitem>
			
 
				-				</itemizedlist>
			
 
				-				If your hardware generator has a different functionality
			
 
				-				add it at the appropriate place in nand_base.c
			
 
				-			</para>
			
 
				-			<para>
			
 
				-				The board driver must provide following functions:
			
 
				-				<itemizedlist>
			
 
				-				<listitem><para>enable_hwecc</para><para>
			
 
				-				This function is called before reading / writing to
			
 
				-				the chip. Reset or initialize the hardware generator
			
 
				-				in this function. The function is called with an
			
 
				-				argument which lets you distinguish between read
			
 
				-				and write operations.
			
 
				-				</para>	</listitem>
			
 
				-				<listitem><para>calculate_ecc</para><para>
			
 
				-				This function is called after read / write from / to
			
 
				-				the chip. Transfer the ECC from the hardware to
			
 
				-				the buffer. If the option NAND_HWECC_SYNDROME is set
			
 
				-				then the function is only called on write. See below.
			
 
				-				</para>	</listitem>
			
 
				-				<listitem><para>correct_data</para><para>
			
 
				-				In case of an ECC error this function is called for
			
 
				-				error detection and correction. Return 1 respectively 2
			
 
				-				in case the error can be corrected. If the error is
			
 
				-				not correctable return -1. If your hardware generator
			
 
				-				matches the default algorithm of the nand_ecc software
			
 
				-				generator then use the correction function provided
			
 
				-				by nand_ecc instead of implementing duplicated code.
			
 
				-				</para>	</listitem>
			
 
				-				</itemizedlist>
			
 
				-			</para>
			
 
				-		</sect2>
			
 
				-		<sect2 id="Hardware_ECC_with_syndrome_calculation">
			
 
				-		<title>Hardware ECC with syndrome calculation</title>
			
 
				-			<para>
			
 
				-				Many hardware ECC implementations provide Reed-Solomon
			
 
				-				codes and calculate an error syndrome on read. The syndrome
			
 
				-				must be converted to a standard Reed-Solomon syndrome
			
 
				-				before calling the error correction code in the generic
			
 
				-				Reed-Solomon library.
			
 
				-			</para>
			
 
				-			<para>
			
 
				-				The ECC bytes must be placed immediately after the data
			
 
				-				bytes in order to make the syndrome generator work. This
			
 
				-				is contrary to the usual layout used by software ECC. The
			
 
				-				separation of data and out of band area is no longer
			
 
				-				possible. The nand driver code handles this layout and
			
 
				-				the remaining free bytes in the oob area are managed by 
			
 
				-				the autoplacement code. Provide a matching oob-layout
			
 
				-				in this case. See rts_from4.c and diskonchip.c for 
			
 
				-				implementation reference. In those cases we must also
			
 
				-				use bad block tables on FLASH, because the ECC layout is
			
 
				-				interfering with the bad block marker positions.
			
 
				-				See bad block table support for details.
			
 
				-			</para>
			
 
				-		</sect2>
			
 
				-	</sect1>
			
 
				-	<sect1 id="Bad_Block_table_support">
			
 
				-		<title>Bad block table support</title>
			
 
				-		<para>
			
 
				-			Most NAND chips mark the bad blocks at a defined
			
 
				-			position in the spare area. Those blocks must 
			
 
				-			not be erased under any circumstances as the bad 
			
 
				-			block information would be lost.
			
 
				-			It is possible to check the bad block mark each
			
 
				-			time when the blocks are accessed by reading the
			
 
				-			spare area of the first page in the block. This
			
 
				-			is time consuming so a bad block table is used.
			
 
				-		</para>
			
 
				-		<para>
			
 
				-			The nand driver supports various types of bad block
			
 
				-			tables.
			
 
				-			<itemizedlist>
			
 
				-			<listitem><para>Per device</para><para>
			
 
				-			The bad block table contains all bad block information
			
 
				-			of the device which can consist of multiple chips.
			
 
				-			</para>	</listitem>
			
 
				-			<listitem><para>Per chip</para><para>
			
 
				-			A bad block table is used per chip and contains the
			
 
				-			bad block information for this particular chip.
			
 
				-			</para>	</listitem>
			
 
				-			<listitem><para>Fixed offset</para><para>
			
 
				-			The bad block table is located at a fixed offset
			
 
				-			in the chip (device). This applies to various
			
 
				-			DiskOnChip devices.
			
 
				-			</para>	</listitem>
			
 
				-			<listitem><para>Automatically placed</para><para>
			
 
				-			The bad block table is automatically placed and
			
 
				-			detected either at the end or at the beginning
			
 
				-			of a chip (device)
			
 
				-			</para>	</listitem>
			
 
				-			<listitem><para>Mirrored tables</para><para>
			
 
				-			The bad block table is mirrored on the chip (device) to
			
 
				-			allow updates of the bad block table without data loss.
			
 
				-			</para>	</listitem>
			
 
				-			</itemizedlist>
			
 
				-		</para>
			
 
				-		<para>	
			
 
				-			nand_scan() calls the function nand_default_bbt(). 
			
 
				-			nand_default_bbt() selects appropriate default
			
 
				-			bad block table descriptors depending on the chip information
			
 
				-			which was retrieved by nand_scan().
			
 
				-		</para>
			
 
				-		<para>
			
 
				-			The standard policy is scanning the device for bad 
			
 
				-			blocks and build a ram based bad block table which
			
 
				-			allows faster access than always checking the
			
 
				-			bad block information on the flash chip itself.
			
 
				-		</para>
			
 
				-		<sect2 id="Flash_based_tables">
			
 
				-			<title>Flash based tables</title>
			
 
				-			<para>
			
 
				-				It may be desired or necessary to keep a bad block table in FLASH.
			
 
				-				For AG-AND chips this is mandatory, as they have no factory marked
			
 
				-				bad blocks. They have factory marked good blocks. The marker pattern
			
 
				-				is erased when the block is erased to be reused. So in case of
			
 
				-				powerloss before writing the pattern back to the chip this block 
			
 
				-				would be lost and added to the bad blocks. Therefore we scan the 
			
 
				-				chip(s) when we detect them the first time for good blocks and 
			
 
				-				store this information in a bad block table before erasing any 
			
 
				-				of the blocks.
			
 
				-			</para>
			
 
				-			<para>
			
 
				-				The blocks in which the tables are stored are protected against
			
 
				-				accidental access by marking them bad in the memory bad block
			
 
				-				table. The bad block table management functions are allowed
			
 
				-				to circumvent this protection.
			
 
				-			</para>
			
 
				-			<para>
			
 
				-				The simplest way to activate the FLASH based bad block table support 
			
 
				-				is to set the option NAND_BBT_USE_FLASH in the bbt_option field of
			
 
				-				the nand chip structure before calling nand_scan(). For AG-AND
			
 
				-				chips this is done by default.
			
 
				-				This activates the default FLASH based bad block table functionality 
			
 
				-				of the NAND driver. The default bad block table options are
			
 
				-				<itemizedlist>
			
 
				-				<listitem><para>Store bad block table per chip</para></listitem>
			
 
				-				<listitem><para>Use 2 bits per block</para></listitem>
			
 
				-				<listitem><para>Automatic placement at the end of the chip</para></listitem>
			
 
				-				<listitem><para>Use mirrored tables with version numbers</para></listitem>
			
 
				-				<listitem><para>Reserve 4 blocks at the end of the chip</para></listitem>
			
 
				-				</itemizedlist>
			
 
				-			</para>
			
 
				-		</sect2>
			
 
				-		<sect2 id="User_defined_tables">
			
 
				-			<title>User defined tables</title>
			
 
				-			<para>
			
 
				-				User defined tables are created by filling out a 
			
 
				-				nand_bbt_descr structure and storing the pointer in the
			
 
				-				nand_chip structure member bbt_td before calling nand_scan(). 
			
 
				-				If a mirror table is necessary a second structure must be
			
 
				-				created and a pointer to this structure must be stored
			
 
				-				in bbt_md inside the nand_chip structure. If the bbt_md 
			
 
				-				member is set to NULL then only the main table is used
			
 
				-				and no scan for the mirrored table is performed.
			
 
				-			</para>
			
 
				-			<para>
			
 
				-				The most important field in the nand_bbt_descr structure
			
 
				-				is the options field. The options define most of the 
			
 
				-				table properties. Use the predefined constants from
			
 
				-				nand.h to define the options.
			
 
				-				<itemizedlist>
			
 
				-				<listitem><para>Number of bits per block</para>
			
 
				-				<para>The supported number of bits is 1, 2, 4, 8.</para></listitem>
			
 
				-				<listitem><para>Table per chip</para>
			
 
				-				<para>Setting the constant NAND_BBT_PERCHIP selects that
			
 
				-				a bad block table is managed for each chip in a chip array.
			
 
				-				If this option is not set then a per device bad block table
			
 
				-				is used.</para></listitem>
			
 
				-				<listitem><para>Table location is absolute</para>
			
 
				-				<para>Use the option constant NAND_BBT_ABSPAGE and
			
 
				-				define the absolute page number where the bad block
			
 
				-				table starts in the field pages. If you have selected bad block
			
 
				-				tables per chip and you have a multi chip array then the start page
			
 
				-				must be given for each chip in the chip array. Note: there is no scan
			
 
				-				for a table ident pattern performed, so the fields 
			
 
				-				pattern, veroffs, offs, len can be left uninitialized</para></listitem>
			
 
				-				<listitem><para>Table location is automatically detected</para>
			
 
				-				<para>The table can either be located in the first or the last good
			
 
				-				blocks of the chip (device). Set NAND_BBT_LASTBLOCK to place
			
 
				-				the bad block table at the end of the chip (device). The
			
 
				-				bad block tables are marked and identified by a pattern which
			
 
				-				is stored in the spare area of the first page in the block which
			
 
				-				holds the bad block table. Store a pointer to the pattern  
			
 
				-				in the pattern field. Further the length of the pattern has to be 
			
 
				-				stored in len and the offset in the spare area must be given
			
 
				-				in the offs member of the nand_bbt_descr structure. For mirrored
			
 
				-				bad block tables different patterns are mandatory.</para></listitem>
			
 
				-				<listitem><para>Table creation</para>
			
 
				-				<para>Set the option NAND_BBT_CREATE to enable the table creation
			
 
				-				if no table can be found during the scan. Usually this is done only 
			
 
				-				once if a new chip is found. </para></listitem>
			
 
				-				<listitem><para>Table write support</para>
			
 
				-				<para>Set the option NAND_BBT_WRITE to enable the table write support.
			
 
				-				This allows the update of the bad block table(s) in case a block has
			
 
				-				to be marked bad due to wear. The MTD interface function block_markbad
			
 
				-				is calling the update function of the bad block table. If the write
			
 
				-				support is enabled then the table is updated on FLASH.</para>
			
 
				-				<para>
			
 
				-				Note: Write support should only be enabled for mirrored tables with
			
 
				-				version control.
			
 
				-				</para></listitem>
			
 
				-				<listitem><para>Table version control</para>
			
 
				-				<para>Set the option NAND_BBT_VERSION to enable the table version control.
			
 
				-				It's highly recommended to enable this for mirrored tables with write
			
 
				-				support. It makes sure that the risk of losing the bad block
			
 
				-				table information is reduced to the loss of the information about the
			
 
				-				one worn out block which should be marked bad. The version is stored in
			
 
				-				4 consecutive bytes in the spare area of the device. The position of
			
 
				-				the version number is defined by the member veroffs in the bad block table
			
 
				-				descriptor.</para></listitem>
			
 
				-				<listitem><para>Save block contents on write</para>
			
 
				-				<para>
			
 
				-				In case that the block which holds the bad block table does contain
			
 
				-				other useful information, set the option NAND_BBT_SAVECONTENT. When
			
 
				-				the bad block table is written then the whole block is read, the bad
			
 
				-				block table is updated and the block is erased and everything is 
			
 
				-				written back. If this option is not set only the bad block table
			
 
				-				is written and everything else in the block is ignored and erased.
			
 
				-				</para></listitem>
			
 
				-				<listitem><para>Number of reserved blocks</para>
			
 
				-				<para>
			
 
				-				For automatic placement some blocks must be reserved for
			
 
				-				bad block table storage. The number of reserved blocks is defined 
			
 
				-				in the maxblocks member of the bad block table description structure.
			
 
				-				Reserving 4 blocks for mirrored tables should be a reasonable number. 
			
 
				-				This also limits the number of blocks which are scanned for the bad
			
 
				-				block table ident pattern.
			
 
				-				</para></listitem>
			
 
				-				</itemizedlist>
			
 
				-			</para>
			
 
				-		</sect2>
			
 
				-	</sect1>
			
 
				-	<sect1 id="Spare_area_placement">
			
 
				-		<title>Spare area (auto)placement</title>
			
 
				-		<para>
			
 
				-			The nand driver implements different possibilities for
			
 
				-			placement of filesystem data in the spare area, 
			
 
				-			<itemizedlist>
			
 
				-			<listitem><para>Placement defined by fs driver</para></listitem>
			
 
				-			<listitem><para>Automatic placement</para></listitem>
			
 
				-			</itemizedlist>
			
 
				-			The default placement function is automatic placement. The
			
 
				-			nand driver has built in default placement schemes for the
			
 
				-			various chiptypes. If due to hardware ECC functionality the
			
 
				-			default placement does not fit then the board driver can
			
 
				-			provide its own placement scheme.
			
 
				-		</para>
			
 
				-		<para>
			
 
				-			File system drivers can provide their own placement scheme which
			
 
				-			is used instead of the default placement scheme.
			
 
				-		</para>
			
 
				-		<para>
			
 
				-			Placement schemes are defined by a nand_oobinfo structure
			
 
				-	     		<programlisting>
			
 
				-struct nand_oobinfo {
			
 
				-	int	useecc;
			
 
				-	int	eccbytes;
			
 
				-	int	eccpos[24];
			
 
				-	int	oobfree[8][2];
			
 
				-};
			
 
				-	     		</programlisting>
			
 
				-			<itemizedlist>
			
 
				-			<listitem><para>useecc</para><para>
			
 
				-				The useecc member controls the ecc and placement function. The header
			
 
				-				file include/mtd/mtd-abi.h contains constants to select ecc and
			
 
				-				placement. MTD_NANDECC_OFF switches off the ecc completely. This is
			
 
				-				not recommended and available for testing and diagnosis only.
			
 
				-				MTD_NANDECC_PLACE selects caller defined placement, MTD_NANDECC_AUTOPLACE
			
 
				-				selects automatic placement.
			
 
				-			</para></listitem>
			
 
				-			<listitem><para>eccbytes</para><para>
			
 
				-				The eccbytes member defines the number of ecc bytes per page.
			
 
				-			</para></listitem>
			
 
				-			<listitem><para>eccpos</para><para>
			
 
				-				The eccpos array holds the byte offsets in the spare area where
			
 
				-				the ecc codes are placed.
			
 
				-			</para></listitem>
			
 
				-			<listitem><para>oobfree</para><para>
			
 
				-				The oobfree array defines the areas in the spare area which can be
			
 
				-				used for automatic placement. The information is given in the format
			
 
				-				{offset, size}. offset defines the start of the usable area, size the
			
 
				-				length in bytes. More than one area can be defined. The list is terminated
			
 
				-				by an {0, 0} entry.
			
 
				-			</para></listitem>
			
 
				-			</itemizedlist>
			
 
				-		</para>
			
 
				-		<sect2 id="Placement_defined_by_fs_driver">
			
 
				-			<title>Placement defined by fs driver</title>
			
 
				-			<para>
			
 
				-				The calling function provides a pointer to a nand_oobinfo
			
 
				-				structure which defines the ecc placement. For writes the
			
 
				-				caller must provide a spare area buffer along with the
			
 
				-				data buffer. The spare area buffer size is (number of pages) *
			
 
				-				(size of spare area). For reads the buffer size is
			
 
				-				(number of pages) * ((size of spare area) + (number of ecc
			
 
				-				steps per page) * sizeof (int)). The driver stores the
			
 
				-				result of the ecc check for each tuple in the spare buffer.
			
 
				-				The storage sequence is 
			
 
				-			</para>
			
 
				-			<para>
			
 
				-				&lt;spare data page 0&gt;&lt;ecc result 0&gt;...&lt;ecc result n&gt;
			
 
				-			</para>
			
 
				-			<para>
			
 
				-				...
			
 
				-			</para>
			
 
				-			<para>
			
 
				-				&lt;spare data page n&gt;&lt;ecc result 0&gt;...&lt;ecc result n&gt;
			
 
				-			</para>
			
 
				-			<para>
			
 
				-				This is a legacy mode used by YAFFS1.
			
 
				-			</para>
			
 
				-			<para>
			
 
				-				If the spare area buffer is NULL then only the ECC placement is
			
 
				-				done according to the given scheme in the nand_oobinfo structure.
			
 
				-			</para>
			
 
				-		</sect2>
			
 
				-		<sect2 id="Automatic_placement">
			
 
				-			<title>Automatic placement</title>
			
 
				-			<para>
			
 
				-				Automatic placement uses the built in defaults to place the
			
 
				-				ecc bytes in the spare area. If filesystem data have to be stored /
			
 
				-				read into the spare area then the calling function must provide a
			
 
				-				buffer. The buffer size per page is determined by the oobfree array in
			
 
				-				the nand_oobinfo structure.
			
 
				-			</para>
			
 
				-			<para>
			
 
				-				If the spare area buffer is NULL then only the ECC placement is
			
 
				-				done according to the default builtin scheme.
			
 
				-			</para>
			
 
				-		</sect2>
			
 
				-	</sect1>	
			
 
				-	<sect1 id="Spare_area_autoplacement_default">
			
 
				-		<title>Spare area autoplacement default schemes</title>
			
 
				-		<sect2 id="pagesize_256">
			
 
				-			<title>256 byte pagesize</title>
			
 
				-<informaltable><tgroup cols="3"><tbody>
			
 
				-<row>
			
 
				-<entry>Offset</entry>
			
 
				-<entry>Content</entry>
			
 
				-<entry>Comment</entry>
			
 
				-</row>
			
 
				-<row>
			
 
				-<entry>0x00</entry>
			
 
				-<entry>ECC byte 0</entry>
			
 
				-<entry>Error correction code byte 0</entry>
			
 
				-</row>
			
 
				-<row>
			
 
				-<entry>0x01</entry>
			
 
				-<entry>ECC byte 1</entry>
			
 
				-<entry>Error correction code byte 1</entry>
			
 
				-</row>
			
 
				-<row>
			
 
				-<entry>0x02</entry>
			
 
				-<entry>ECC byte 2</entry>
			
 
				-<entry>Error correction code byte 2</entry>
			
 
				-</row>
			
 
				-<row>
			
 
				-<entry>0x03</entry>
			
 
				-<entry>Autoplace 0</entry>
			
 
				-<entry></entry>
			
 
				-</row>
			
 
				-<row>
			
 
				-<entry>0x04</entry>
			
 
				-<entry>Autoplace 1</entry>
			
 
				-<entry></entry>
			
 
				-</row>
			
 
				-<row>
			
 
				-<entry>0x05</entry>
			
 
				-<entry>Bad block marker</entry>
			
 
				-<entry>If any bit in this byte is zero, then this block is bad.
			
 
				-This applies only to the first page in a block. In the remaining
			
 
				-pages this byte is reserved</entry>
			
 
				-</row>
			
 
				-<row>
			
 
				-<entry>0x06</entry>
			
 
				-<entry>Autoplace 2</entry>
			
 
				-<entry></entry>
			
 
				-</row>
			
 
				-<row>
			
 
				-<entry>0x07</entry>
			
 
				-<entry>Autoplace 3</entry>
			
 
				-<entry></entry>
			
 
				-</row>
			
 
				-</tbody></tgroup></informaltable>
			
 
				-		</sect2>
			
 
				-		<sect2 id="pagesize_512">
			
 
				-			<title>512 byte pagesize</title>
			
 
				-<informaltable><tgroup cols="3"><tbody>
			
 
				-<row>
			
 
				-<entry>Offset</entry>
			
 
				-<entry>Content</entry>
			
 
				-<entry>Comment</entry>
			
 
				-</row>
			
 
				-<row>
			
 
				-<entry>0x00</entry>
			
 
				-<entry>ECC byte 0</entry>
			
 
				-<entry>Error correction code byte 0 of the lower 256 Byte data in
			
 
				-this page</entry>
			
 
				-</row>
			
 
				-<row>
			
 
				-<entry>0x01</entry>
			
 
				-<entry>ECC byte 1</entry>
			
 
				-<entry>Error correction code byte 1 of the lower 256 Bytes of data
			
 
				-in this page</entry>
			
 
				-</row>
			
 
				-<row>
			
 
				-<entry>0x02</entry>
			
 
				-<entry>ECC byte 2</entry>
			
 
				-<entry>Error correction code byte 2 of the lower 256 Bytes of data
			
 
				-in this page</entry>
			
 
				-</row>
			
 
				-<row>
			
 
				-<entry>0x03</entry>
			
 
				-<entry>ECC byte 3</entry>
			
 
				-<entry>Error correction code byte 0 of the upper 256 Bytes of data
			
 
				-in this page</entry>
			
 
				-</row>
			
 
				-<row>
			
 
				-<entry>0x04</entry>
			
 
				-<entry>reserved</entry>
			
 
				-<entry>reserved</entry>
			
 
				-</row>
			
 
				-<row>
			
 
				-<entry>0x05</entry>
			
 
				-<entry>Bad block marker</entry>
			
 
				-<entry>If any bit in this byte is zero, then this block is bad.
			
 
				-This applies only to the first page in a block. In the remaining
			
 
				-pages this byte is reserved</entry>
			
 
				-</row>
			
 
				-<row>
			
 
				-<entry>0x06</entry>
			
 
				-<entry>ECC byte 4</entry>
			
 
				-<entry>Error correction code byte 1 of the upper 256 Bytes of data
			
 
				-in this page</entry>
			
 
				-</row>
			
 
				-<row>
			
 
				-<entry>0x07</entry>
			
 
				-<entry>ECC byte 5</entry>
			
 
				-<entry>Error correction code byte 2 of the upper 256 Bytes of data
			
 
				-in this page</entry>
			
 
				-</row>
			
 
				-<row>
			
 
				-<entry>0x08 - 0x0F</entry>
			
 
				-<entry>Autoplace 0 - 7</entry>
			
 
				-<entry></entry>
			
 
				-</row>
			
 
				-</tbody></tgroup></informaltable>
			
 
				-		</sect2>
			
 
				-		<sect2 id="pagesize_2048">
			
 
				-			<title>2048 byte pagesize</title>
			
 
				-<informaltable><tgroup cols="3"><tbody>
			
 
				-<row>
			
 
				-<entry>Offset</entry>
			
 
				-<entry>Content</entry>
			
 
				-<entry>Comment</entry>
			
 
				-</row>
			
 
				-<row>
			
 
				-<entry>0x00</entry>
			
 
				-<entry>Bad block marker</entry>
			
 
				-<entry>If any bit in this byte is zero, then this block is bad.
			
 
				-This applies only to the first page in a block. In the remaining
			
 
				-pages this byte is reserved</entry>
			
 
				-</row>
			
 
				-<row>
			
 
				-<entry>0x01</entry>
			
 
				-<entry>Reserved</entry>
			
 
				-<entry>Reserved</entry>
			
 
				-</row>
			
 
				-<row>
			
 
				-<entry>0x02-0x27</entry>
			
 
				-<entry>Autoplace 0 - 37</entry>
			
 
				-<entry></entry>
			
 
				-</row>
			
 
				-<row>
			
 
				-<entry>0x28</entry>
			
 
				-<entry>ECC byte 0</entry>
			
 
				-<entry>Error correction code byte 0 of the first 256 Byte data in
			
 
				-this page</entry>
			
 
				-</row>
			
 
				-<row>
			
 
				-<entry>0x29</entry>
			
 
				-<entry>ECC byte 1</entry>
			
 
				-<entry>Error correction code byte 1 of the first 256 Bytes of data
			
 
				-in this page</entry>
			
 
				-</row>
			
 
				-<row>
			
 
				-<entry>0x2A</entry>
			
 
				-<entry>ECC byte 2</entry>
			
 
				-<entry>Error correction code byte 2 of the first 256 Bytes data in
			
 
				-this page</entry>
			
 
				-</row>
			
 
				-<row>
			
 
				-<entry>0x2B</entry>
			
 
				-<entry>ECC byte 3</entry>
			
 
				-<entry>Error correction code byte 0 of the second 256 Bytes of data
			
 
				-in this page</entry>
			
 
				-</row>
			
 
				-<row>
			
 
				-<entry>0x2C</entry>
			
 
				-<entry>ECC byte 4</entry>
			
 
				-<entry>Error correction code byte 1 of the second 256 Bytes of data
			
 
				-in this page</entry>
			
 
				-</row>
			
 
				-<row>
			
 
				-<entry>0x2D</entry>
			
 
				-<entry>ECC byte 5</entry>
			
 
				-<entry>Error correction code byte 2 of the second 256 Bytes of data
			
 
				-in this page</entry>
			
 
				-</row>
			
 
				-<row>
			
 
				-<entry>0x2E</entry>
			
 
				-<entry>ECC byte 6</entry>
			
 
				-<entry>Error correction code byte 0 of the third 256 Bytes of data
			
 
				-in this page</entry>
			
 
				-</row>
			
 
				-<row>
			
 
				-<entry>0x2F</entry>
			
 
				-<entry>ECC byte 7</entry>
			
 
				-<entry>Error correction code byte 1 of the third 256 Bytes of data
			
 
				-in this page</entry>
			
 
				-</row>
			
 
				-<row>
			
 
				-<entry>0x30</entry>
			
 
				-<entry>ECC byte 8</entry>
			
 
				-<entry>Error correction code byte 2 of the third 256 Bytes of data
			
 
				-in this page</entry>
			
 
				-</row>
			
 
				-<row>
			
 
				-<entry>0x31</entry>
			
 
				-<entry>ECC byte 9</entry>
			
 
				-<entry>Error correction code byte 0 of the fourth 256 Bytes of data
			
 
				-in this page</entry>
			
 
				-</row>
			
 
				-<row>
			
 
				-<entry>0x32</entry>
			
 
				-<entry>ECC byte 10</entry>
			
 
				-<entry>Error correction code byte 1 of the fourth 256 Bytes of data
			
 
				-in this page</entry>
			
 
				-</row>
			
 
				-<row>
			
 
				-<entry>0x33</entry>
			
 
				-<entry>ECC byte 11</entry>
			
 
				-<entry>Error correction code byte 2 of the fourth 256 Bytes of data
			
 
				-in this page</entry>
			
 
				-</row>
			
 
				-<row>
			
 
				-<entry>0x34</entry>
			
 
				-<entry>ECC byte 12</entry>
			
 
				-<entry>Error correction code byte 0 of the fifth 256 Bytes of data
			
 
				-in this page</entry>
			
 
				-</row>
			
 
				-<row>
			
 
				-<entry>0x35</entry>
			
 
				-<entry>ECC byte 13</entry>
			
 
				-<entry>Error correction code byte 1 of the fifth 256 Bytes of data
			
 
				-in this page</entry>
			
 
				-</row>
			
 
				-<row>
			
 
				-<entry>0x36</entry>
			
 
				-<entry>ECC byte 14</entry>
			
 
				-<entry>Error correction code byte 2 of the fifth 256 Bytes of data
			
 
				-in this page</entry>
			
 
				-</row>
			
 
				-<row>
			
 
				-<entry>0x37</entry>
			
 
				-<entry>ECC byte 15</entry>
			
 
				-<entry>Error correction code byte 0 of the sixth 256 Bytes of data
			
 
				-in this page</entry>
			
 
				-</row>
			
 
				-<row>
			
 
				-<entry>0x38</entry>
			
 
				-<entry>ECC byte 16</entry>
			
 
				-<entry>Error correction code byte 1 of the sixth 256 Bytes of data
			
 
				-in this page</entry>
			
 
				-</row>
			
 
				-<row>
			
 
				-<entry>0x39</entry>
			
 
				-<entry>ECC byte 17</entry>
			
 
				-<entry>Error correction code byte 2 of the sixth 256 Bytes of data
			
 
				-in this page</entry>
			
 
				-</row>
			
 
				-<row>
			
 
				-<entry>0x3A</entry>
			
 
				-<entry>ECC byte 18</entry>
			
 
				-<entry>Error correction code byte 0 of the seventh 256 Bytes of
			
 
				-data in this page</entry>
			
 
				-</row>
			
 
				-<row>
			
 
				-<entry>0x3B</entry>
			
 
				-<entry>ECC byte 19</entry>
			
 
				-<entry>Error correction code byte 1 of the seventh 256 Bytes of
			
 
				-data in this page</entry>
			
 
				-</row>
			
 
				-<row>
			
 
				-<entry>0x3C</entry>
			
 
				-<entry>ECC byte 20</entry>
			
 
				-<entry>Error correction code byte 2 of the seventh 256 Bytes of
			
 
				-data in this page</entry>
			
 
				-</row>
			
 
				-<row>
			
 
				-<entry>0x3D</entry>
			
 
				-<entry>ECC byte 21</entry>
			
 
				-<entry>Error correction code byte 0 of the eighth 256 Bytes of data
			
 
				-in this page</entry>
			
 
				-</row>
			
 
				-<row>
			
 
				-<entry>0x3E</entry>
			
 
				-<entry>ECC byte 22</entry>
			
 
				-<entry>Error correction code byte 1 of the eighth 256 Bytes of data
			
 
				-in this page</entry>
			
 
				-</row>
			
 
				-<row>
			
 
				-<entry>0x3F</entry>
			
 
				-<entry>ECC byte 23</entry>
			
 
				-<entry>Error correction code byte 2 of the eighth 256 Bytes of data
			
 
				-in this page</entry>
			
 
				-</row>
			
 
				-</tbody></tgroup></informaltable>
			
 
				-		</sect2>
			
 
				-     	</sect1>
			
 
				-  </chapter>
			
 
				-
			
 
				-  <chapter id="filesystems">
			
 
				-     	<title>Filesystem support</title>
			
 
				-	<para>
			
 
				-		The NAND driver provides all necessary functions for a
			
 
				-		filesystem via the MTD interface.
			
 
				-	</para>
			
 
				-	<para>
			
 
				-		Filesystems must be aware of the NAND peculiarities and
			
 
				-		restrictions. One major restriction of NAND Flash is that you cannot 
			
 
				-		write as often as you want to a page. The consecutive writes to a page, 
			
 
				-		before erasing it again, are restricted to 1-3 writes, depending on the 
			
 
				-		manufacturer's specifications. This applies similarly to the spare area. 
			
 
				-	</para>
			
 
				-	<para>
			
 
				-		Therefore NAND aware filesystems must either write in page size chunks
			
 
				-		or hold a writebuffer to collect smaller writes until they sum up to 
			
 
				-		pagesize. Available NAND aware filesystems: JFFS2, YAFFS. 		
			
 
				-	</para>
			
 
				-	<para>
			
 
				-		The spare area usage to store filesystem data is controlled by
			
 
				-		the spare area placement functionality which is described in one
			
 
				-		of the earlier chapters.
			
 
				-	</para>
			
 
				-  </chapter>	
			
 
				-  <chapter id="tools">
			
 
				-     	<title>Tools</title>
			
 
				-	<para>
			
 
				-		The MTD project provides a couple of helpful tools to handle NAND Flash.
			
 
				-		<itemizedlist>
			
 
				-		<listitem><para>flasherase, flasheraseall: Erase and format FLASH partitions</para></listitem>
			
 
				-		<listitem><para>nandwrite: write filesystem images to NAND FLASH</para></listitem>
			
 
				-		<listitem><para>nanddump: dump the contents of NAND FLASH partitions</para></listitem>
			
 
				-		</itemizedlist>
			
 
				-	</para>
			
 
				-	<para>
			
 
				-		These tools are aware of the NAND restrictions. Please use those tools
			
 
				-		instead of complaining about errors which are caused by non NAND aware
			
 
				-		access methods.
			
 
				-	</para>
			
 
				-  </chapter>	
			
 
				-
			
 
				-  <chapter id="defines">
			
 
				-     <title>Constants</title>
			
 
				-     <para>
			
 
				-     This chapter describes the constants which might be relevant for a driver developer.
			
 
				-     </para>
			
 
				-     <sect1 id="Chip_option_constants">
			
 
				-	<title>Chip option constants</title>
			
 
				-     	<sect2 id="Constants_for_chip_id_table">
			
 
				-		<title>Constants for chip id table</title>
			
 
				-     		<para>
			
 
				-		These constants are defined in nand.h. They are ored together to describe
			
 
				-		the chip functionality.
			
 
				-     		<programlisting>
			
 
				-/* Buswidth is 16 bit */
			
 
				-#define NAND_BUSWIDTH_16	0x00000002
			
 
				-/* Device supports partial programming without padding */
			
 
				-#define NAND_NO_PADDING		0x00000004
			
 
				-/* Chip has cache program function */
			
 
				-#define NAND_CACHEPRG		0x00000008
			
 
				-/* Chip has copy back function */
			
 
				-#define NAND_COPYBACK		0x00000010
			
 
				-/* AND Chip which has 4 banks and a confusing page / block 
			
 
				- * assignment. See Renesas datasheet for further information */
			
 
				-#define NAND_IS_AND		0x00000020
			
 
				-/* Chip has an array of 4 pages which can be read without
			
 
				- * additional ready /busy waits */
			
 
				-#define NAND_4PAGE_ARRAY	0x00000040 
			
 
				-		</programlisting>
			
 
				-     		</para>
			
 
				-     	</sect2>
			
 
				-     	<sect2 id="Constants_for_runtime_options">
			
 
				-		<title>Constants for runtime options</title>
			
 
				-     		<para>
			
 
				-		These constants are defined in nand.h. They are ored together to describe
			
 
				-		the functionality.
			
 
				-     		<programlisting>
			
 
				-/* The hw ecc generator provides a syndrome instead of an ecc value on read 
			
 
				- * This can only work if we have the ecc bytes directly behind the 
			
 
				- * data bytes. Applies for DOC and AG-AND Renesas HW Reed Solomon generators */
			
 
				-#define NAND_HWECC_SYNDROME	0x00020000
			
 
				-		</programlisting>
			
 
				-     		</para>
			
 
				-     	</sect2>
			
 
				-     </sect1>	
			
 
				-
			
 
				-     <sect1 id="EEC_selection_constants">
			
 
				-	<title>ECC selection constants</title>
			
 
				-	<para>
			
 
				-	Use these constants to select the ECC algorithm.
			
 
				-  	<programlisting>
			
 
				-/* No ECC. Usage is not recommended ! */
			
 
				-#define NAND_ECC_NONE		0
			
 
				-/* Software ECC 3 byte ECC per 256 Byte data */
			
 
				-#define NAND_ECC_SOFT		1
			
 
				-/* Hardware ECC 3 byte ECC per 256 Byte data */
			
 
				-#define NAND_ECC_HW3_256	2
			
 
				-/* Hardware ECC 3 byte ECC per 512 Byte data */
			
 
				-#define NAND_ECC_HW3_512	3
			
 
				-/* Hardware ECC 6 byte ECC per 512 Byte data */
			
 
				-#define NAND_ECC_HW6_512	4
			
 
				-/* Hardware ECC 8 byte ECC per 512 Byte data */
			
 
				-#define NAND_ECC_HW8_512	6
			
 
				-	</programlisting>
			
 
				-	</para>
			
 
				-     </sect1>	
			
 
				-
			
 
				-     <sect1 id="Hardware_control_related_constants">
			
 
				-	<title>Hardware control related constants</title>
			
 
				-	<para>
			
 
				-	These constants describe the requested hardware access function when
			
 
				-	the boardspecific hardware control function is called
			
 
				-  	<programlisting>
			
 
				-/* Select the chip by setting nCE to low */
			
 
				-#define NAND_CTL_SETNCE 	1
			
 
				-/* Deselect the chip by setting nCE to high */
			
 
				-#define NAND_CTL_CLRNCE		2
			
 
				-/* Select the command latch by setting CLE to high */
			
 
				-#define NAND_CTL_SETCLE		3
			
 
				-/* Deselect the command latch by setting CLE to low */
			
 
				-#define NAND_CTL_CLRCLE		4
			
 
				-/* Select the address latch by setting ALE to high */
			
 
				-#define NAND_CTL_SETALE		5
			
 
				-/* Deselect the address latch by setting ALE to low */
			
 
				-#define NAND_CTL_CLRALE		6
			
 
				-/* Set write protection by setting WP to high. Not used! */
			
 
				-#define NAND_CTL_SETWP		7
			
 
				-/* Clear write protection by setting WP to low. Not used! */
			
 
				-#define NAND_CTL_CLRWP		8
			
 
				-	</programlisting>
			
 
				-	</para>
			
 
				-     </sect1>	
			
 
				-
			
 
				-     <sect1 id="Bad_block_table_constants">
			
 
				-	<title>Bad block table related constants</title>
			
 
				-	<para>
			
 
				-	These constants describe the options used for bad block
			
 
				-	table descriptors.
			
 
				-  	<programlisting>
			
 
				-/* Options for the bad block table descriptors */
			
 
				-
			
 
				-/* The number of bits used per block in the bbt on the device */
			
 
				-#define NAND_BBT_NRBITS_MSK	0x0000000F
			
 
				-#define NAND_BBT_1BIT		0x00000001
			
 
				-#define NAND_BBT_2BIT		0x00000002
			
 
				-#define NAND_BBT_4BIT		0x00000004
			
 
				-#define NAND_BBT_8BIT		0x00000008
			
 
				-/* The bad block table is in the last good block of the device */
			
 
				-#define	NAND_BBT_LASTBLOCK	0x00000010
			
 
				-/* The bbt is at the given page, else we must scan for the bbt */
			
 
				-#define NAND_BBT_ABSPAGE	0x00000020
			
 
				-/* bbt is stored per chip on multichip devices */
			
 
				-#define NAND_BBT_PERCHIP	0x00000080
			
 
				-/* bbt has a version counter at offset veroffs */
			
 
				-#define NAND_BBT_VERSION	0x00000100
			
 
				-/* Create a bbt if none exists */
			
 
				-#define NAND_BBT_CREATE		0x00000200
			
 
				-/* Write bbt if necessary */
			
 
				-#define NAND_BBT_WRITE		0x00001000
			
 
				-/* Read and write back block contents when writing bbt */
			
 
				-#define NAND_BBT_SAVECONTENT	0x00002000
			
 
				-	</programlisting>
			
 
				-	</para>
			
 
				-     </sect1>	
			
 
				-
			
 
				-  </chapter>
			
 
				-  	
			
 
				-  <chapter id="structs">
			
 
				-     <title>Structures</title>
			
 
				-     <para>
			
 
				-     This chapter contains the autogenerated documentation of the structures which are
			
 
				-     used in the NAND driver and might be relevant for a driver developer. Each  
			
 
				-     struct member has a short description which is marked with an [XXX] identifier.
			
 
				-     See the chapter "Documentation hints" for an explanation.
			
 
				-     </para>
			
 
				-!Iinclude/linux/mtd/nand.h
			
 
				-  </chapter>
			
 
				-
			
 
				-  <chapter id="pubfunctions">
			
 
				-     <title>Public Functions Provided</title>
			
 
				-     <para>
			
 
				-     This chapter contains the autogenerated documentation of the NAND kernel API functions
			
 
				-      which are exported. Each function has a short description which is marked with an [XXX] identifier.
			
 
				-     See the chapter "Documentation hints" for an explanation.
			
 
				-     </para>
			
 
				-!Edrivers/mtd/nand/nand_base.c
			
 
				-!Edrivers/mtd/nand/nand_bbt.c
			
 
				-!Edrivers/mtd/nand/nand_ecc.c
			
 
				-  </chapter>
			
 
				-  
			
 
				-  <chapter id="intfunctions">
			
 
				-     <title>Internal Functions Provided</title>
			
 
				-     <para>
			
 
				-     This chapter contains the autogenerated documentation of the NAND driver internal functions.
			
 
				-     Each function has a short description which is marked with an [XXX] identifier.
			
 
				-     See the chapter "Documentation hints" for an explanation.
			
 
				-     The functions marked with [DEFAULT] might be relevant for a board driver developer.
			
 
				-     </para>
			
 
				-!Idrivers/mtd/nand/nand_base.c
			
 
				-!Idrivers/mtd/nand/nand_bbt.c
			
 
				-<!-- No internal functions for kernel-doc:
			
 
				-X!Idrivers/mtd/nand/nand_ecc.c
			
 
				--->
			
 
				-  </chapter>
			
 
				-
			
 
				-  <chapter id="credits">
			
 
				-     <title>Credits</title>
			
 
				-	<para>
			
 
				-		The following people have contributed to the NAND driver:
			
 
				-		<orderedlist>
			
 
				-			<listitem><para>Steven J. Hill<email>sjhill@realitydiluted.com</email></para></listitem>
			
 
				-			<listitem><para>David Woodhouse<email>dwmw2@infradead.org</email></para></listitem>
			
 
				-			<listitem><para>Thomas Gleixner<email>tglx@linutronix.de</email></para></listitem>
			
 
				-		</orderedlist>
			
 
				-		A lot of users have provided bugfixes, improvements and helping hands for testing.
			
 
				-		Thanks a lot.
			
 
				-	</para>
			
 
				-	<para>
			
 
				-		The following people have contributed to this document:
			
 
				-		<orderedlist>
			
 
				-			<listitem><para>Thomas Gleixner<email>tglx@linutronix.de</email></para></listitem>
			
 
				-		</orderedlist>
			
 
				-	</para>
			
 
				-  </chapter>
			
 
				-</book>
			
--- a/Documentation/DocBook/networking.tmpl
+++ b/Documentation/DocBook/networking.tmpl
@@ -1,111 +0,0 @@
 
				-<?xml version="1.0" encoding="UTF-8"?>
			
 
				-<!DOCTYPE book PUBLIC "-//OASIS//DTD DocBook XML V4.1.2//EN"
			
 
				-	"http://www.oasis-open.org/docbook/xml/4.1.2/docbookx.dtd" []>
			
 
				-
			
 
				-<book id="LinuxNetworking">
			
 
				- <bookinfo>
			
 
				-  <title>Linux Networking and Network Devices APIs</title>
			
 
				-
			
 
				-  <legalnotice>
			
 
				-   <para>
			
 
				-     This documentation is free software; you can redistribute
			
 
				-     it and/or modify it under the terms of the GNU General Public
			
 
				-     License as published by the Free Software Foundation; either
			
 
				-     version 2 of the License, or (at your option) any later
			
 
				-     version.
			
 
				-   </para>
			
 
				-
			
 
				-   <para>
			
 
				-     This program is distributed in the hope that it will be
			
 
				-     useful, but WITHOUT ANY WARRANTY; without even the implied
			
 
				-     warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
			
 
				-     See the GNU General Public License for more details.
			
 
				-   </para>
			
 
				-
			
 
				-   <para>
			
 
				-     You should have received a copy of the GNU General Public
			
 
				-     License along with this program; if not, write to the Free
			
 
				-     Software Foundation, Inc., 59 Temple Place, Suite 330, Boston,
			
 
				-     MA 02111-1307 USA
			
 
				-   </para>
			
 
				-
			
 
				-   <para>
			
 
				-     For more details see the file COPYING in the source
			
 
				-     distribution of Linux.
			
 
				-   </para>
			
 
				-  </legalnotice>
			
 
				- </bookinfo>
			
 
				-
			
 
				-<toc></toc>
			
 
				-
			
 
				-  <chapter id="netcore">
			
 
				-     <title>Linux Networking</title>
			
 
				-     <sect1><title>Networking Base Types</title>
			
 
				-!Iinclude/linux/net.h
			
 
				-     </sect1>
			
 
				-     <sect1><title>Socket Buffer Functions</title>
			
 
				-!Iinclude/linux/skbuff.h
			
 
				-!Iinclude/net/sock.h
			
 
				-!Enet/socket.c
			
 
				-!Enet/core/skbuff.c
			
 
				-!Enet/core/sock.c
			
 
				-!Enet/core/datagram.c
			
 
				-!Enet/core/stream.c
			
 
				-     </sect1>
			
 
				-     <sect1><title>Socket Filter</title>
			
 
				-!Enet/core/filter.c
			
 
				-     </sect1>
			
 
				-     <sect1><title>Generic Network Statistics</title>
			
 
				-!Iinclude/uapi/linux/gen_stats.h
			
 
				-!Enet/core/gen_stats.c
			
 
				-!Enet/core/gen_estimator.c
			
 
				-     </sect1>
			
 
				-     <sect1><title>SUN RPC subsystem</title>
			
 
				-<!-- The !D functionality is not perfect, garbage has to be protected by comments
			
 
				-!Dnet/sunrpc/sunrpc_syms.c
			
 
				--->
			
 
				-!Enet/sunrpc/xdr.c
			
 
				-!Enet/sunrpc/svc_xprt.c
			
 
				-!Enet/sunrpc/xprt.c
			
 
				-!Enet/sunrpc/sched.c
			
 
				-!Enet/sunrpc/socklib.c
			
 
				-!Enet/sunrpc/stats.c
			
 
				-!Enet/sunrpc/rpc_pipe.c
			
 
				-!Enet/sunrpc/rpcb_clnt.c
			
 
				-!Enet/sunrpc/clnt.c
			
 
				-     </sect1>
			
 
				-     <sect1><title>WiMAX</title>
			
 
				-!Enet/wimax/op-msg.c
			
 
				-!Enet/wimax/op-reset.c
			
 
				-!Enet/wimax/op-rfkill.c
			
 
				-!Enet/wimax/stack.c
			
 
				-!Iinclude/net/wimax.h
			
 
				-!Iinclude/uapi/linux/wimax.h
			
 
				-     </sect1>
			
 
				-  </chapter>
			
 
				-
			
 
				-  <chapter id="netdev">
			
 
				-     <title>Network device support</title>
			
 
				-     <sect1><title>Driver Support</title>
			
 
				-!Enet/core/dev.c
			
 
				-!Enet/ethernet/eth.c
			
 
				-!Enet/sched/sch_generic.c
			
 
				-!Iinclude/linux/etherdevice.h
			
 
				-!Iinclude/linux/netdevice.h
			
 
				-     </sect1>
			
 
				-     <sect1><title>PHY Support</title>
			
 
				-!Edrivers/net/phy/phy.c
			
 
				-!Idrivers/net/phy/phy.c
			
 
				-!Edrivers/net/phy/phy_device.c
			
 
				-!Idrivers/net/phy/phy_device.c
			
 
				-!Edrivers/net/phy/mdio_bus.c
			
 
				-!Idrivers/net/phy/mdio_bus.c
			
 
				-     </sect1>
			
 
				-<!-- FIXME: Removed for now since no structured comments in source
			
 
				-     <sect1><title>Wireless</title>
			
 
				-X!Enet/core/wireless.c
			
 
				-     </sect1>
			
 
				--->
			
 
				-  </chapter>
			
 
				-
			
 
				-</book>
			
--- a/Documentation/DocBook/rapidio.tmpl
+++ b/Documentation/DocBook/rapidio.tmpl
@@ -1,155 +0,0 @@
 
				-<?xml version="1.0" encoding="UTF-8"?>
			
 
				-<!DOCTYPE book PUBLIC "-//OASIS//DTD DocBook XML V4.1.2//EN"
			
 
				-        "http://www.oasis-open.org/docbook/xml/4.1.2/docbookx.dtd" [
			
 
				-	<!ENTITY rapidio SYSTEM "rapidio.xml">
			
 
				-	]>
			
 
				-
			
 
				-<book id="RapidIO-Guide">
			
 
				- <bookinfo>
			
 
				-  <title>RapidIO Subsystem Guide</title>
			
 
				-
			
 
				-  <authorgroup>
			
 
				-   <author>
			
 
				-    <firstname>Matt</firstname>
			
 
				-    <surname>Porter</surname>
			
 
				-    <affiliation>
			
 
				-     <address>
			
 
				-      <email>mporter@kernel.crashing.org</email>
			
 
				-      <email>mporter@mvista.com</email>
			
 
				-     </address>
			
 
				-    </affiliation>
			
 
				-   </author>
			
 
				-  </authorgroup>
			
 
				-
			
 
				-  <copyright>
			
 
				-   <year>2005</year>
			
 
				-   <holder>MontaVista Software, Inc.</holder>
			
 
				-  </copyright>
			
 
				-
			
 
				-  <legalnotice>
			
 
				-   <para>
			
 
				-     This documentation is free software; you can redistribute
			
 
				-     it and/or modify it under the terms of the GNU General Public
			
 
				-     License version 2 as published by the Free Software Foundation.
			
 
				-   </para>
			
 
				-
			
 
				-   <para>
			
 
				-     This program is distributed in the hope that it will be
			
 
				-     useful, but WITHOUT ANY WARRANTY; without even the implied
			
 
				-     warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
			
 
				-     See the GNU General Public License for more details.
			
 
				-   </para>
			
 
				-
			
 
				-   <para>
			
 
				-     You should have received a copy of the GNU General Public
			
 
				-     License along with this program; if not, write to the Free
			
 
				-     Software Foundation, Inc., 59 Temple Place, Suite 330, Boston,
			
 
				-     MA 02111-1307 USA
			
 
				-   </para>
			
 
				-
			
 
				-   <para>
			
 
				-     For more details see the file COPYING in the source
			
 
				-     distribution of Linux.
			
 
				-   </para>
			
 
				-  </legalnotice>
			
 
				- </bookinfo>
			
 
				-
			
 
				-<toc></toc>
			
 
				-
			
 
				-  <chapter id="intro">
			
 
				-      <title>Introduction</title>
			
 
				-  <para>
			
 
				-	RapidIO is a high speed switched fabric interconnect with
			
 
				-	features aimed at the embedded market.  RapidIO provides
			
 
				-	support for memory-mapped I/O as well as message-based
			
 
				-	transactions over the switched fabric network. RapidIO has
			
 
				-	a standardized discovery mechanism not unlike the PCI bus
			
 
				-	standard that allows simple detection of devices in a
			
 
				-	network.
			
 
				-  </para>
			
 
				-  <para>
			
 
				-  	This documentation is provided for developers intending
			
 
				-	to support RapidIO on new architectures, write new drivers,
			
 
				-	or to understand the subsystem internals.
			
 
				-  </para>
			
 
				-  </chapter>
			
 
				-
			
 
				-  <chapter id="bugs">
			
 
				-     <title>Known Bugs and Limitations</title>
			
 
				-
			
 
				-     <sect1 id="known_bugs">
			
 
				-     	<title>Bugs</title>
			
 
				-	  <para>None. ;)</para>
			
 
				-     </sect1>
			
 
				-     <sect1 id="Limitations">
			
 
				-     	<title>Limitations</title>
			
 
				-	  <para>
			
 
				-	    <orderedlist>
			
 
				-	      <listitem><para>Access/management of RapidIO memory regions is not supported</para></listitem>
			
 
				-	      <listitem><para>Multiple host enumeration is not supported</para></listitem>
			
 
				-	    </orderedlist>
			
 
				-	 </para>
			
 
				-     </sect1>
			
 
				-  </chapter>
			
 
				-
			
 
				-  <chapter id="drivers">
			
 
				-     	<title>RapidIO driver interface</title>
			
 
				-	<para>
			
 
				-		Drivers are provided a set of calls in order
			
 
				-		to interface with the subsystem to gather info
			
 
				-		on devices, request/map memory region resources,
			
 
				-		and manage mailboxes/doorbells.
			
 
				-	</para>
			
 
				-	<sect1 id="Functions">
			
 
				-		<title>Functions</title>
			
 
				-!Iinclude/linux/rio_drv.h
			
 
				-!Edrivers/rapidio/rio-driver.c
			
 
				-!Edrivers/rapidio/rio.c
			
 
				-	</sect1>
			
 
				-  </chapter>
			
 
				-
			
 
				-  <chapter id="internals">
			
 
				-     <title>Internals</title>
			
 
				-
			
 
				-     <para>
			
 
				-     This chapter contains the autogenerated documentation of the RapidIO
			
 
				-     subsystem.
			
 
				-     </para>
			
 
				-
			
 
				-     <sect1 id="Structures"><title>Structures</title>
			
 
				-!Iinclude/linux/rio.h
			
 
				-     </sect1>
			
 
				-     <sect1 id="Enumeration_and_Discovery"><title>Enumeration and Discovery</title>
			
 
				-!Idrivers/rapidio/rio-scan.c
			
 
				-     </sect1>
			
 
				-     <sect1 id="Driver_functionality"><title>Driver functionality</title>
			
 
				-!Idrivers/rapidio/rio.c
			
 
				-!Idrivers/rapidio/rio-access.c
			
 
				-     </sect1>
			
 
				-     <sect1 id="Device_model_support"><title>Device model support</title>
			
 
				-!Idrivers/rapidio/rio-driver.c
			
 
				-     </sect1>
			
 
				-     <sect1 id="PPC32_support"><title>PPC32 support</title>
			
 
				-!Iarch/powerpc/sysdev/fsl_rio.c
			
 
				-     </sect1>
			
 
				-  </chapter>
			
 
				-
			
 
				-  <chapter id="credits">
			
 
				-     <title>Credits</title>
			
 
				-	<para>
			
 
				-		The following people have contributed to the RapidIO
			
 
				-		subsystem directly or indirectly:
			
 
				-		<orderedlist>
			
 
				-			<listitem><para>Matt Porter<email>mporter@kernel.crashing.org</email></para></listitem>
			
 
				-			<listitem><para>Randy Vinson<email>rvinson@mvista.com</email></para></listitem>
			
 
				-			<listitem><para>Dan Malek<email>dan@embeddedalley.com</email></para></listitem>
			
 
				-		</orderedlist>
			
 
				-	</para>
			
 
				-	<para>
			
 
				-		The following people have contributed to this document:
			
 
				-		<orderedlist>
			
 
				-			<listitem><para>Matt Porter<email>mporter@kernel.crashing.org</email></para></listitem>
			
 
				-		</orderedlist>
			
 
				-	</para>
			
 
				-  </chapter>
			
 
				-</book>
			
--- a/Documentation/DocBook/s390-drivers.tmpl
+++ b/Documentation/DocBook/s390-drivers.tmpl
@@ -1,161 +0,0 @@
 
				-<?xml version="1.0" encoding="UTF-8"?>
			
 
				-<!DOCTYPE book PUBLIC "-//OASIS//DTD DocBook XML V4.1.2//EN"
			
 
				-	"http://www.oasis-open.org/docbook/xml/4.1.2/docbookx.dtd" []>
			
 
				-
			
 
				-<book id="s390drivers">
			
 
				- <bookinfo>
			
 
				-  <title>Writing s390 channel device drivers</title>
			
 
				-
			
 
				-  <authorgroup>
			
 
				-   <author>
			
 
				-    <firstname>Cornelia</firstname>
			
 
				-    <surname>Huck</surname>
			
 
				-    <affiliation>
			
 
				-     <address>
			
 
				-       <email>cornelia.huck@de.ibm.com</email>
			
 
				-     </address>
			
 
				-    </affiliation>
			
 
				-   </author>
			
 
				-  </authorgroup>
			
 
				-
			
 
				-  <copyright>
			
 
				-   <year>2007</year>
			
 
				-   <holder>IBM Corp.</holder>
			
 
				-  </copyright>
			
 
				-
			
 
				-  <legalnotice>
			
 
				-   <para>
			
 
				-     This documentation is free software; you can redistribute
			
 
				-     it and/or modify it under the terms of the GNU General Public
			
 
				-     License as published by the Free Software Foundation; either
			
 
				-     version 2 of the License, or (at your option) any later
			
 
				-     version.
			
 
				-   </para>
			
 
				-
			
 
				-   <para>
			
 
				-     This program is distributed in the hope that it will be
			
 
				-     useful, but WITHOUT ANY WARRANTY; without even the implied
			
 
				-     warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
			
 
				-     See the GNU General Public License for more details.
			
 
				-   </para>
			
 
				-
			
 
				-   <para>
			
 
				-     You should have received a copy of the GNU General Public
			
 
				-     License along with this program; if not, write to the Free
			
 
				-     Software Foundation, Inc., 59 Temple Place, Suite 330, Boston,
			
 
				-     MA 02111-1307 USA
			
 
				-   </para>
			
 
				-
			
 
				-   <para>
			
 
				-     For more details see the file COPYING in the source
			
 
				-     distribution of Linux.
			
 
				-   </para>
			
 
				-  </legalnotice>
			
 
				- </bookinfo>
			
 
				-
			
 
				-<toc></toc>
			
 
				-
			
 
				-  <chapter id="intro">
			
 
				-   <title>Introduction</title>
			
 
				-  <para>
			
 
				-    This document describes the interfaces available for device drivers that
			
 
				-    drive s390 based channel attached I/O devices. This includes interfaces for
			
 
				-    interaction with the hardware and interfaces for interacting with the
			
 
				-    common driver core. Those interfaces are provided by the s390 common I/O
			
 
				-    layer.
			
 
				-  </para>
			
 
				-  <para>
			
 
				-    The document assumes a familiarity with the technical terms associated
			
 
				-    with the s390 channel I/O architecture. For a description of this
			
 
				-    architecture, please refer to the "z/Architecture: Principles of
			
 
				-    Operation", IBM publication no. SA22-7832.
			
 
				-  </para>
			
 
				-  <para>
			
 
				-    While most I/O devices on a s390 system are typically driven through the
			
 
				-    channel I/O mechanism described here, there are various other methods
			
 
				-    (like the diag interface). These are out of the scope of this document.
			
 
				-  </para>
			
 
				-  <para>
			
 
				-    Some additional information can also be found in the kernel source
			
 
				-    under Documentation/s390/driver-model.txt.
			
 
				-  </para>
			
 
				-  </chapter>
			
 
				-  <chapter id="ccw">
			
 
				-   <title>The ccw bus</title>
			
 
				-  <para>
			
 
				-	The ccw bus typically contains the majority of devices available to
			
 
				-	a s390 system. Named after the channel command word (ccw), the basic
			
 
				-	command structure used to address its devices, the ccw bus contains
			
 
				-	so-called channel attached devices. They are addressed via I/O
			
 
				-	subchannels, visible on the css bus. A device driver for
			
 
				-	channel-attached devices, however, will never interact with the
			
 
				-	subchannel directly, but only via the I/O device on the ccw bus,
			
 
				-	the ccw device.
			
 
				-  </para>
			
 
				-    <sect1 id="channelIO">
			
 
				-     <title>I/O functions for channel-attached devices</title>
			
 
				-    <para>
			
 
				-      Some hardware structures have been translated into C structures for use
			
 
				-      by the common I/O layer and device drivers. For more information on
			
 
				-      the hardware structures represented here, please consult the Principles
			
 
				-      of Operation.
			
 
				-    </para>
			
 
				-!Iarch/s390/include/asm/cio.h
			
 
				-    </sect1>
			
 
				-    <sect1 id="ccwdev">
			
 
				-     <title>ccw devices</title>
			
 
				-    <para>
			
 
				-      Devices that want to initiate channel I/O need to attach to the ccw bus.
			
 
				-      Interaction with the driver core is done via the common I/O layer, which
			
 
				-      provides the abstractions of ccw devices and ccw device drivers.
			
 
				-    </para>
			
 
				-    <para>
			
 
				-      The functions that initiate or terminate channel I/O all act upon a
			
 
				-      ccw device structure. Device drivers must not bypass those functions
			
 
				-      or strange side effects may happen.
			
 
				-    </para>
			
 
				-!Iarch/s390/include/asm/ccwdev.h
			
 
				-!Edrivers/s390/cio/device.c
			
 
				-!Edrivers/s390/cio/device_ops.c
			
 
				-    </sect1>
			
 
				-    <sect1 id="cmf">
			
 
				-     <title>The channel-measurement facility</title>
			
 
				-  <para>
			
 
				-	The channel-measurement facility provides a means to collect
			
 
				-	measurement data which is made available by the channel subsystem
			
 
				-	for each channel attached device.
			
 
				-  </para>
			
 
				-!Iarch/s390/include/asm/cmb.h
			
 
				-!Edrivers/s390/cio/cmf.c
			
 
				-    </sect1>
			
 
				-  </chapter>
			
 
				-
			
 
				-  <chapter id="ccwgroup">
			
 
				-   <title>The ccwgroup bus</title>
			
 
				-  <para>
			
 
				-	The ccwgroup bus only contains artificial devices, created by the user.
			
 
				-	Many networking devices (e.g. qeth) are in fact composed of several
			
 
				-	ccw devices (like read, write and data channel for qeth). The
			
 
				-	ccwgroup bus provides a mechanism to create a meta-device which
			
 
				-	contains those ccw devices as slave devices and can be associated
			
 
				-	with the netdevice.
			
 
				-  </para>
			
 
				-   <sect1 id="ccwgroupdevices">
			
 
				-    <title>ccw group devices</title>
			
 
				-!Iarch/s390/include/asm/ccwgroup.h
			
 
				-!Edrivers/s390/cio/ccwgroup.c
			
 
				-   </sect1>
			
 
				-  </chapter>
			
 
				-
			
 
				-  <chapter id="genericinterfaces">
			
 
				-   <title>Generic interfaces</title>
			
 
				-  <para>
			
 
				-	Some interfaces are available to other drivers that do not necessarily
			
 
				-	have anything to do with the busses described above, but still are
			
 
				-	indirectly using basic infrastructure in the common I/O layer.
			
 
				-	One example is the support for adapter interrupts.
			
 
				-  </para>
			
 
				-!Edrivers/s390/cio/airq.c
			
 
				-  </chapter>
			
 
				-
			
 
				-</book>
			
--- a/Documentation/DocBook/scsi.tmpl
+++ b/Documentation/DocBook/scsi.tmpl
@@ -1,409 +0,0 @@
 
				-<?xml version="1.0" encoding="UTF-8"?>
			
 
				-<!DOCTYPE book PUBLIC "-//OASIS//DTD DocBook XML V4.1.2//EN"
			
 
				-	"http://www.oasis-open.org/docbook/xml/4.1.2/docbookx.dtd" []>
			
 
				-
			
 
				-<book id="scsimid">
			
 
				-  <bookinfo>
			
 
				-    <title>SCSI Interfaces Guide</title>
			
 
				-
			
 
				-    <authorgroup>
			
 
				-      <author>
			
 
				-        <firstname>James</firstname>
			
 
				-        <surname>Bottomley</surname>
			
 
				-        <affiliation>
			
 
				-          <address>
			
 
				-            <email>James.Bottomley@hansenpartnership.com</email>
			
 
				-          </address>
			
 
				-        </affiliation>
			
 
				-      </author>
			
 
				-
			
 
				-      <author>
			
 
				-        <firstname>Rob</firstname>
			
 
				-        <surname>Landley</surname>
			
 
				-        <affiliation>
			
 
				-          <address>
			
 
				-            <email>rob@landley.net</email>
			
 
				-          </address>
			
 
				-        </affiliation>
			
 
				-      </author>
			
 
				-
			
 
				-    </authorgroup>
			
 
				-
			
 
				-    <copyright>
			
 
				-      <year>2007</year>
			
 
				-      <holder>Linux Foundation</holder>
			
 
				-    </copyright>
			
 
				-
			
 
				-    <legalnotice>
			
 
				-      <para>
			
 
				-        This documentation is free software; you can redistribute
			
 
				-        it and/or modify it under the terms of the GNU General Public
			
 
				-        License version 2.
			
 
				-      </para>
			
 
				-
			
 
				-      <para>
			
 
				-        This program is distributed in the hope that it will be
			
 
				-        useful, but WITHOUT ANY WARRANTY; without even the implied
			
 
				-        warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
			
 
				-        For more details see the file COPYING in the source
			
 
				-        distribution of Linux.
			
 
				-      </para>
			
 
				-    </legalnotice>
			
 
				-  </bookinfo>
			
 
				-
			
 
				-  <toc></toc>
			
 
				-
			
 
				-  <chapter id="intro">
			
 
				-    <title>Introduction</title>
			
 
				-    <sect1 id="protocol_vs_bus">
			
 
				-      <title>Protocol vs bus</title>
			
 
				-      <para>
			
 
				-        Once upon a time, the Small Computer Systems Interface defined both
			
 
				-        a parallel I/O bus and a data protocol to connect a wide variety of
			
 
				-        peripherals (disk drives, tape drives, modems, printers, scanners,
			
 
				-        optical drives, test equipment, and medical devices) to a host
			
 
				-        computer.
			
 
				-      </para>
			
 
				-      <para>
			
 
				-        Although the old parallel (fast/wide/ultra) SCSI bus has largely
			
 
				-        fallen out of use, the SCSI command set is more widely used than ever
			
 
				-        to communicate with devices over a number of different busses.
			
 
				-      </para>
			
 
				-      <para>
			
 
				-        The <ulink url='http://www.t10.org/scsi-3.htm'>SCSI protocol</ulink>
			
 
				-        is a big-endian peer-to-peer packet based protocol.  SCSI commands
			
 
				-        are 6, 10, 12, or 16 bytes long, often followed by an associated data
			
 
				-        payload.
			
 
				-      </para>
			
 
				-      <para>
			
 
				-        SCSI commands can be transported over just about any kind of bus, and
			
 
				-        are the default protocol for storage devices attached to USB, SATA,
			
 
				-        SAS, Fibre Channel, FireWire, and ATAPI devices.  SCSI packets are
			
 
				-        also commonly exchanged over Infiniband,
			
 
				-        <ulink url='http://i2o.shadowconnect.com/faq.php'>I2O</ulink>, TCP/IP
			
 
				-        (<ulink url='https://en.wikipedia.org/wiki/ISCSI'>iSCSI</ulink>), even
			
 
				-        <ulink url='http://cyberelk.net/tim/parport/parscsi.html'>Parallel
			
 
				-        ports</ulink>.
			
 
				-      </para>
			
 
				-    </sect1>
			
 
				-    <sect1 id="subsystem_design">
			
 
				-      <title>Design of the Linux SCSI subsystem</title>
			
 
				-      <para>
			
 
				-        The SCSI subsystem uses a three layer design, with upper, mid, and low
			
 
				-        layers.  Every operation involving the SCSI subsystem (such as reading
			
 
				-        a sector from a disk) uses one driver at each of the 3 levels: one
			
 
				-        upper layer driver, one lower layer driver, and the SCSI midlayer.
			
 
				-      </para>
			
 
				-      <para>
			
 
				-        The SCSI upper layer provides the interface between userspace and the
			
 
				-        kernel, in the form of block and char device nodes for I/O and
			
 
				-        ioctl().  The SCSI lower layer contains drivers for specific hardware
			
 
				-        devices.
			
 
				-      </para>
			
 
				-      <para>
			
 
				-        In between is the SCSI mid-layer, analogous to a network routing
			
 
				-        layer such as the IPv4 stack.  The SCSI mid-layer routes a packet
			
 
				-        based data protocol between the upper layer's /dev nodes and the
			
 
				-        corresponding devices in the lower layer.  It manages command queues,
			
 
				-        provides error handling and power management functions, and responds
			
 
				-        to ioctl() requests.
			
 
				-      </para>
			
 
				-    </sect1>
			
 
				-  </chapter>
			
 
				-
			
 
				-  <chapter id="upper_layer">
			
 
				-    <title>SCSI upper layer</title>
			
 
				-    <para>
			
 
				-      The upper layer supports the user-kernel interface by providing
			
 
				-      device nodes.
			
 
				-    </para>
			
 
				-    <sect1 id="sd">
			
 
				-      <title>sd (SCSI Disk)</title>
			
 
				-      <para>sd (sd_mod.o)</para>
			
 
				-<!-- !Idrivers/scsi/sd.c -->
			
 
				-    </sect1>
			
 
				-    <sect1 id="sr">
			
 
				-      <title>sr (SCSI CD-ROM)</title>
			
 
				-      <para>sr (sr_mod.o)</para>
			
 
				-    </sect1>
			
 
				-    <sect1 id="st">
			
 
				-      <title>st (SCSI Tape)</title>
			
 
				-      <para>st (st.o)</para>
			
 
				-    </sect1>
			
 
				-    <sect1 id="sg">
			
 
				-      <title>sg (SCSI Generic)</title>
			
 
				-      <para>sg (sg.o)</para>
			
 
				-    </sect1>
			
 
				-    <sect1 id="ch">
			
 
				-      <title>ch (SCSI Media Changer)</title>
			
 
				-      <para>ch (ch.c)</para>
			
 
				-    </sect1>
			
 
				-  </chapter>
			
 
				-
			
 
				-  <chapter id="mid_layer">
			
 
				-    <title>SCSI mid layer</title>
			
 
				-
			
 
				-    <sect1 id="midlayer_implementation">
			
 
				-      <title>SCSI midlayer implementation</title>
			
 
				-      <sect2 id="scsi_device.h">
			
 
				-        <title>include/scsi/scsi_device.h</title>
			
 
				-        <para>
			
 
				-        </para>
			
 
				-!Iinclude/scsi/scsi_device.h
			
 
				-      </sect2>
			
 
				-
			
 
				-      <sect2 id="scsi.c">
			
 
				-        <title>drivers/scsi/scsi.c</title>
			
 
				-        <para>Main file for the SCSI midlayer.</para>
			
 
				-!Edrivers/scsi/scsi.c
			
 
				-      </sect2>
			
 
				-      <sect2 id="scsicam.c">
			
 
				-        <title>drivers/scsi/scsicam.c</title>
			
 
				-        <para>
			
 
				-          <ulink url='http://www.t10.org/ftp/t10/drafts/cam/cam-r12b.pdf'>SCSI
			
 
				-          Common Access Method</ulink> support functions, for use with
			
 
				-          HDIO_GETGEO, etc.
			
 
				-        </para>
			
 
				-!Edrivers/scsi/scsicam.c
			
 
				-      </sect2>
			
 
				-      <sect2 id="scsi_error.c">
			
 
				-        <title>drivers/scsi/scsi_error.c</title>
			
 
				-        <para>Common SCSI error/timeout handling routines.</para>
			
 
				-!Edrivers/scsi/scsi_error.c
			
 
				-      </sect2>
			
 
				-      <sect2 id="scsi_devinfo.c">
			
 
				-        <title>drivers/scsi/scsi_devinfo.c</title>
			
 
				-        <para>
			
 
				-          Manage scsi_dev_info_list, which tracks blacklisted and whitelisted
			
 
				-          devices.
			
 
				-        </para>
			
 
				-!Idrivers/scsi/scsi_devinfo.c
			
 
				-      </sect2>
			
 
				-      <sect2 id="scsi_ioctl.c">
			
 
				-        <title>drivers/scsi/scsi_ioctl.c</title>
			
 
				-        <para>
			
 
				-          Handle ioctl() calls for SCSI devices.
			
 
				-        </para>
			
 
				-!Edrivers/scsi/scsi_ioctl.c
			
 
				-      </sect2>
			
 
				-      <sect2 id="scsi_lib.c">
			
 
				-        <title>drivers/scsi/scsi_lib.c</title>
			
 
				-        <para>
			
 
				-          SCSI queuing library.
			
 
				-        </para>
			
 
				-!Edrivers/scsi/scsi_lib.c
			
 
				-      </sect2>
			
 
				-      <sect2 id="scsi_lib_dma.c">
			
 
				-        <title>drivers/scsi/scsi_lib_dma.c</title>
			
 
				-        <para>
			
 
				-          SCSI library functions depending on DMA
			
 
				-          (map and unmap scatter-gather lists).
			
 
				-        </para>
			
 
				-!Edrivers/scsi/scsi_lib_dma.c
			
 
				-      </sect2>
			
 
				-      <sect2 id="scsi_module.c">
			
 
				-        <title>drivers/scsi/scsi_module.c</title>
			
 
				-        <para>
			
 
				-          The file drivers/scsi/scsi_module.c contains legacy support for
			
 
				-          old-style host templates.  It should never be used by any new driver.
			
 
				-        </para>
			
 
				-      </sect2>
			
 
				-      <sect2 id="scsi_proc.c">
			
 
				-        <title>drivers/scsi/scsi_proc.c</title>
			
 
				-        <para>
			
 
				-          The functions in this file provide an interface between
			
 
				-          the PROC file system and the SCSI device drivers.
			
 
				-          It is mainly used for debugging, statistics and to pass
			
 
				-          information directly to the lowlevel driver.
			
 
				-
			
 
				-          I.E. plumbing to manage /proc/scsi/*
			
 
				-        </para>
			
 
				-!Idrivers/scsi/scsi_proc.c
			
 
				-      </sect2>
			
 
				-      <sect2 id="scsi_netlink.c">
			
 
				-        <title>drivers/scsi/scsi_netlink.c</title>
			
 
				-        <para>
			
 
				-          Infrastructure to provide async events from transports to userspace
			
 
				-          via netlink, using a single NETLINK_SCSITRANSPORT protocol for all
			
 
				-          transports.
			
 
				-
			
 
				-          See <ulink url='http://marc.info/?l=linux-scsi&amp;m=115507374832500&amp;w=2'>the
			
 
				-          original patch submission</ulink> for more details.
			
 
				-        </para>
			
 
				-!Idrivers/scsi/scsi_netlink.c
			
 
				-      </sect2>
			
 
				-      <sect2 id="scsi_scan.c">
			
 
				-        <title>drivers/scsi/scsi_scan.c</title>
			
 
				-        <para>
			
 
				-          Scan a host to determine which (if any) devices are attached.
			
 
				-
			
 
				-          The general scanning/probing algorithm is as follows, exceptions are
			
 
				-          made to it depending on device specific flags, compilation options,
			
 
				-          and global variable (boot or module load time) settings.
			
 
				-
			
 
				-          A specific LUN is scanned via an INQUIRY command; if the LUN has a
			
 
				-          device attached, a scsi_device is allocated and setup for it.
			
 
				-
			
 
				-          For every id of every channel on the given host, start by scanning
			
 
				-          LUN 0.  Skip hosts that don't respond at all to a scan of LUN 0.
			
 
				-          Otherwise, if LUN 0 has a device attached, allocate and setup a
			
 
				-          scsi_device for it.  If target is SCSI-3 or up, issue a REPORT LUN,
			
 
				-          and scan all of the LUNs returned by the REPORT LUN; else,
			
 
				-          sequentially scan LUNs up until some maximum is reached, or a LUN is
			
 
				-          seen that cannot have a device attached to it.
			
 
				-        </para>
			
 
				-!Idrivers/scsi/scsi_scan.c
			
 
				-      </sect2>
			
 
				-      <sect2 id="scsi_sysctl.c">
			
 
				-        <title>drivers/scsi/scsi_sysctl.c</title>
			
 
				-        <para>
			
 
				-          Set up the sysctl entry: "/dev/scsi/logging_level"
			
 
				-          (DEV_SCSI_LOGGING_LEVEL) which sets/returns scsi_logging_level.
			
 
				-        </para>
			
 
				-      </sect2>
			
 
				-      <sect2 id="scsi_sysfs.c">
			
 
				-        <title>drivers/scsi/scsi_sysfs.c</title>
			
 
				-        <para>
			
 
				-          SCSI sysfs interface routines.
			
 
				-        </para>
			
 
				-!Edrivers/scsi/scsi_sysfs.c
			
 
				-      </sect2>
			
 
				-      <sect2 id="hosts.c">
			
 
				-        <title>drivers/scsi/hosts.c</title>
			
 
				-        <para>
			
 
				-          mid to lowlevel SCSI driver interface
			
 
				-        </para>
			
 
				-!Edrivers/scsi/hosts.c
			
 
				-      </sect2>
			
 
				-      <sect2 id="constants.c">
			
 
				-        <title>drivers/scsi/constants.c</title>
			
 
				-        <para>
			
 
				-          mid to lowlevel SCSI driver interface
			
 
				-        </para>
			
 
				-!Edrivers/scsi/constants.c
			
 
				-      </sect2>
			
 
				-    </sect1>
			
 
				-
			
 
				-    <sect1 id="Transport_classes">
			
 
				-      <title>Transport classes</title>
			
 
				-      <para>
			
 
				-        Transport classes are service libraries for drivers in the SCSI
			
 
				-        lower layer, which expose transport attributes in sysfs.
			
 
				-      </para>
			
 
				-      <sect2 id="Fibre_Channel_transport">
			
 
				-        <title>Fibre Channel transport</title>
			
 
				-        <para>
			
 
				-          The file drivers/scsi/scsi_transport_fc.c defines transport attributes
			
 
				-          for Fibre Channel.
			
 
				-        </para>
			
 
				-!Edrivers/scsi/scsi_transport_fc.c
			
 
				-      </sect2>
			
 
				-      <sect2 id="iSCSI_transport">
			
 
				-        <title>iSCSI transport class</title>
			
 
				-        <para>
			
 
				-          The file drivers/scsi/scsi_transport_iscsi.c defines transport
			
 
				-          attributes for the iSCSI class, which sends SCSI packets over TCP/IP
			
 
				-          connections.
			
 
				-        </para>
			
 
				-!Edrivers/scsi/scsi_transport_iscsi.c
			
 
				-      </sect2>
			
 
				-      <sect2 id="SAS_transport">
			
 
				-        <title>Serial Attached SCSI (SAS) transport class</title>
			
 
				-        <para>
			
 
				-          The file drivers/scsi/scsi_transport_sas.c defines transport
			
 
				-          attributes for Serial Attached SCSI, a variant of SATA aimed at
			
 
				-          large high-end systems.
			
 
				-        </para>
			
 
				-        <para>
			
 
				-          The SAS transport class contains common code to deal with SAS HBAs,
			
 
				-          an approximated representation of SAS topologies in the driver model,
			
 
				-          and various sysfs attributes to expose these topologies and management
			
 
				-          interfaces to userspace.
			
 
				-        </para>
			
 
				-        <para>
			
 
				-          In addition to the basic SCSI core objects this transport class
			
 
				-          introduces two additional intermediate objects:  The SAS PHY
			
 
				-          as represented by struct sas_phy defines an "outgoing" PHY on
			
 
				-          a SAS HBA or Expander, and the SAS remote PHY represented by
			
 
				-          struct sas_rphy defines an "incoming" PHY on a SAS Expander or
			
 
				-          end device.  Note that this is purely a software concept, the
			
 
				-          underlying hardware for a PHY and a remote PHY is exactly
			
 
				-          the same.
			
 
				-        </para>
			
 
				-        <para>
			
 
				-          There is no concept of a SAS port in this code, users can see
			
 
				-          what PHYs form a wide port based on the port_identifier attribute,
			
 
				-          which is the same for all PHYs in a port.
			
 
				-        </para>
			
 
				-!Edrivers/scsi/scsi_transport_sas.c
			
 
				-      </sect2>
			
 
				-      <sect2 id="SATA_transport">
			
 
				-        <title>SATA transport class</title>
			
 
				-        <para>
			
 
				-          The SATA transport is handled by libata, which has its own book of
			
 
				-          documentation in this directory.
			
 
				-        </para>
			
 
				-      </sect2>
			
 
				-      <sect2 id="SPI_transport">
			
 
				-        <title>Parallel SCSI (SPI) transport class</title>
			
 
				-        <para>
			
 
				-          The file drivers/scsi/scsi_transport_spi.c defines transport
			
 
				-          attributes for traditional (fast/wide/ultra) SCSI busses.
			
 
				-        </para>
			
 
				-!Edrivers/scsi/scsi_transport_spi.c
			
 
				-      </sect2>
			
 
				-      <sect2 id="SRP_transport">
			
 
				-        <title>SCSI RDMA (SRP) transport class</title>
			
 
				-        <para>
			
 
				-          The file drivers/scsi/scsi_transport_srp.c defines transport
			
 
				-          attributes for SCSI over Remote Direct Memory Access.
			
 
				-        </para>
			
 
				-!Edrivers/scsi/scsi_transport_srp.c
			
 
				-      </sect2>
			
 
				-    </sect1>
			
 
				-
			
 
				-  </chapter>
			
 
				-
			
 
				-  <chapter id="lower_layer">
			
 
				-    <title>SCSI lower layer</title>
			
 
				-    <sect1 id="hba_drivers">
			
 
				-      <title>Host Bus Adapter transport types</title>
			
 
				-      <para>
			
 
				-        Many modern device controllers use the SCSI command set as a protocol to
			
 
				-        communicate with their devices through many different types of physical
			
 
				-        connections.
			
 
				-      </para>
			
 
				-      <para>
			
 
				-        In SCSI language a bus capable of carrying SCSI commands is
			
 
				-        called a "transport", and a controller connecting to such a bus is
			
 
				-        called a "host bus adapter" (HBA).
			
 
				-      </para>
			
 
				-      <sect2 id="scsi_debug.c">
			
 
				-        <title>Debug transport</title>
			
 
				-        <para>
			
 
				-          The file drivers/scsi/scsi_debug.c simulates a host adapter with a
			
 
				-          variable number of disks (or disk like devices) attached, sharing a
			
 
				-          common amount of RAM.  Does a lot of checking to make sure that we are
			
 
				-          not getting blocks mixed up, and panics the kernel if anything out of
			
 
				-          the ordinary is seen.
			
 
				-        </para>
			
 
				-        <para>
			
 
				-          To be more realistic, the simulated devices have the transport
			
 
				-          attributes of SAS disks.
			
 
				-        </para>
			
 
				-        <para>
			
 
				-          For documentation see
			
 
				-          <ulink url='http://sg.danny.cz/sg/sdebug26.html'>http://sg.danny.cz/sg/sdebug26.html</ulink>
			
 
				-        </para>
			
 
				-<!-- !Edrivers/scsi/scsi_debug.c -->
			
 
				-      </sect2>
			
 
				-      <sect2 id="todo">
			
 
				-        <title>todo</title>
			
 
				-        <para>Parallel (fast/wide/ultra) SCSI, USB, SATA,
			
 
				-        SAS, Fibre Channel, FireWire, ATAPI devices, Infiniband,
			
 
				-        I2O, iSCSI, Parallel ports, netlink...
			
 
				-        </para>
			
 
				-      </sect2>
			
 
				-    </sect1>
			
 
				-  </chapter>
			
 
				-</book>
			
--- a/Documentation/DocBook/sh.tmpl
+++ b/Documentation/DocBook/sh.tmpl
@@ -1,105 +0,0 @@
 
				-<?xml version="1.0" encoding="UTF-8"?>
			
 
				-<!DOCTYPE book PUBLIC "-//OASIS//DTD DocBook XML V4.1.2//EN"
			
 
				-	"http://www.oasis-open.org/docbook/xml/4.1.2/docbookx.dtd" []>
			
 
				-
			
 
				-<book id="sh-drivers">
			
 
				- <bookinfo>
			
 
				-  <title>SuperH Interfaces Guide</title>
			
 
				-  
			
 
				-  <authorgroup>
			
 
				-   <author>
			
 
				-    <firstname>Paul</firstname>
			
 
				-    <surname>Mundt</surname>
			
 
				-    <affiliation>
			
 
				-     <address>
			
 
				-      <email>lethal@linux-sh.org</email>
			
 
				-     </address>
			
 
				-    </affiliation>
			
 
				-   </author>
			
 
				-  </authorgroup>
			
 
				-
			
 
				-  <copyright>
			
 
				-   <year>2008-2010</year>
			
 
				-   <holder>Paul Mundt</holder>
			
 
				-  </copyright>
			
 
				-  <copyright>
			
 
				-   <year>2008-2010</year>
			
 
				-   <holder>Renesas Technology Corp.</holder>
			
 
				-  </copyright>
			
 
				-  <copyright>
			
 
				-   <year>2010</year>
			
 
				-   <holder>Renesas Electronics Corp.</holder>
			
 
				-  </copyright>
			
 
				-
			
 
				-  <legalnotice>
			
 
				-   <para>
			
 
				-     This documentation is free software; you can redistribute
			
 
				-     it and/or modify it under the terms of the GNU General Public
			
 
				-     License version 2 as published by the Free Software Foundation.
			
 
				-   </para>
			
 
				-      
			
 
				-   <para>
			
 
				-     This program is distributed in the hope that it will be
			
 
				-     useful, but WITHOUT ANY WARRANTY; without even the implied
			
 
				-     warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
			
 
				-     See the GNU General Public License for more details.
			
 
				-   </para>
			
 
				-      
			
 
				-   <para>
			
 
				-     You should have received a copy of the GNU General Public
			
 
				-     License along with this program; if not, write to the Free
			
 
				-     Software Foundation, Inc., 59 Temple Place, Suite 330, Boston,
			
 
				-     MA 02111-1307 USA
			
 
				-   </para>
			
 
				-      
			
 
				-   <para>
			
 
				-     For more details see the file COPYING in the source
			
 
				-     distribution of Linux.
			
 
				-   </para>
			
 
				-  </legalnotice>
			
 
				- </bookinfo>
			
 
				-
			
 
				-<toc></toc>
			
 
				-
			
 
				-  <chapter id="mm">
			
 
				-    <title>Memory Management</title>
			
 
				-    <sect1 id="sh4">
			
 
				-    <title>SH-4</title>
			
 
				-      <sect2 id="sq">
			
 
				-        <title>Store Queue API</title>
			
 
				-!Earch/sh/kernel/cpu/sh4/sq.c
			
 
				-      </sect2>
			
 
				-    </sect1>
			
 
				-    <sect1 id="sh5">
			
 
				-      <title>SH-5</title>
			
 
				-      <sect2 id="tlb">
			
 
				-	<title>TLB Interfaces</title>
			
 
				-!Iarch/sh/mm/tlb-sh5.c
			
 
				-!Iarch/sh/include/asm/tlb_64.h
			
 
				-      </sect2>
			
 
				-    </sect1>
			
 
				-  </chapter>
			
 
				-  <chapter id="mach">
			
 
				-    <title>Machine Specific Interfaces</title>
			
 
				-    <sect1 id="dreamcast">
			
 
				-      <title>mach-dreamcast</title>
			
 
				-!Iarch/sh/boards/mach-dreamcast/rtc.c
			
 
				-    </sect1>
			
 
				-    <sect1 id="x3proto">
			
 
				-      <title>mach-x3proto</title>
			
 
				-!Earch/sh/boards/mach-x3proto/ilsel.c
			
 
				-    </sect1>
			
 
				-  </chapter>
			
 
				-  <chapter id="busses">
			
 
				-    <title>Busses</title>
			
 
				-    <sect1 id="superhyway">
			
 
				-      <title>SuperHyway</title>
			
 
				-!Edrivers/sh/superhyway/superhyway.c
			
 
				-    </sect1>
			
 
				-
			
 
				-    <sect1 id="maple">
			
 
				-      <title>Maple</title>
			
 
				-!Edrivers/sh/maple/maple.c
			
 
				-    </sect1>
			
 
				-  </chapter>
			
 
				-</book>
			
--- a/Documentation/DocBook/stylesheet.xsl
+++ b/Documentation/DocBook/stylesheet.xsl
@@ -1,11 +0,0 @@
 
				-<?xml version="1.0" encoding="UTF-8"?>
			
 
				-<stylesheet xmlns="http://www.w3.org/1999/XSL/Transform" version="1.0">
			
 
				-<param name="chunk.quietly">1</param>
			
 
				-<param name="funcsynopsis.style">ansi</param>
			
 
				-<param name="funcsynopsis.tabular.threshold">80</param>
			
 
				-<param name="callout.graphics">0</param>
			
 
				-<!-- <param name="paper.type">A4</param> -->
			
 
				-<param name="generate.consistent.ids">1</param>
			
 
				-<param name="generate.section.toc.level">2</param>
			
 
				-<param name="use.id.as.filename">1</param>
			
 
				-</stylesheet>
			
--- a/Documentation/DocBook/w1.tmpl
+++ b/Documentation/DocBook/w1.tmpl
@@ -1,101 +0,0 @@
 
				-<?xml version="1.0" encoding="UTF-8"?>
			
 
				-<!DOCTYPE book PUBLIC "-//OASIS//DTD DocBook XML V4.1.2//EN"
			
 
				-	"http://www.oasis-open.org/docbook/xml/4.1.2/docbookx.dtd" []>
			
 
				-
			
 
				-<book id="w1id">
			
 
				-  <bookinfo>
			
 
				-    <title>W1: Dallas' 1-wire bus</title>
			
 
				-
			
 
				-    <authorgroup>
			
 
				-      <author>
			
 
				-        <firstname>David</firstname>
			
 
				-        <surname>Fries</surname>
			
 
				-        <affiliation>
			
 
				-          <address>
			
 
				-            <email>David@Fries.net</email>
			
 
				-          </address>
			
 
				-        </affiliation>
			
 
				-      </author>
			
 
				-
			
 
				-    </authorgroup>
			
 
				-
			
 
				-    <copyright>
			
 
				-      <year>2013</year>
			
 
				-      <!--
			
 
				-      <holder></holder>
			
 
				-      -->
			
 
				-    </copyright>
			
 
				-
			
 
				-    <legalnotice>
			
 
				-      <para>
			
 
				-        This documentation is free software; you can redistribute
			
 
				-        it and/or modify it under the terms of the GNU General Public
			
 
				-        License version 2.
			
 
				-      </para>
			
 
				-
			
 
				-      <para>
			
 
				-        This program is distributed in the hope that it will be
			
 
				-        useful, but WITHOUT ANY WARRANTY; without even the implied
			
 
				-        warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
			
 
				-        For more details see the file COPYING in the source
			
 
				-        distribution of Linux.
			
 
				-      </para>
			
 
				-    </legalnotice>
			
 
				-  </bookinfo>
			
 
				-
			
 
				-  <toc></toc>
			
 
				-
			
 
				-  <chapter id="w1_internal">
			
 
				-    <title>W1 API internal to the kernel</title>
			
 
				-
			
 
				-    <sect1 id="w1_internal_api">
			
 
				-      <title>W1 API internal to the kernel</title>
			
 
				-      <sect2 id="w1.h">
			
 
				-        <title>drivers/w1/w1.h</title>
			
 
				-        <para>W1 core functions.</para>
			
 
				-!Idrivers/w1/w1.h
			
 
				-      </sect2>
			
 
				-
			
 
				-      <sect2 id="w1.c">
			
 
				-        <title>drivers/w1/w1.c</title>
			
 
				-        <para>W1 core functions.</para>
			
 
				-!Idrivers/w1/w1.c
			
 
				-      </sect2>
			
 
				-
			
 
				-      <sect2 id="w1_family.h">
			
 
				-        <title>drivers/w1/w1_family.h</title>
			
 
				-        <para>Allows registering device family operations.</para>
			
 
				-!Idrivers/w1/w1_family.h
			
 
				-      </sect2>
			
 
				-
			
 
				-      <sect2 id="w1_family.c">
			
 
				-        <title>drivers/w1/w1_family.c</title>
			
 
				-        <para>Allows registering device family operations.</para>
			
 
				-!Edrivers/w1/w1_family.c
			
 
				-      </sect2>
			
 
				-
			
 
				-      <sect2 id="w1_int.c">
			
 
				-        <title>drivers/w1/w1_int.c</title>
			
 
				-        <para>W1 internal initialization for master devices.</para>
			
 
				-!Edrivers/w1/w1_int.c
			
 
				-      </sect2>
			
 
				-
			
 
				-      <sect2 id="w1_netlink.h">
			
 
				-        <title>drivers/w1/w1_netlink.h</title>
			
 
				-        <para>W1 external netlink API structures and commands.</para>
			
 
				-!Idrivers/w1/w1_netlink.h
			
 
				-      </sect2>
			
 
				-
			
 
				-      <sect2 id="w1_io.c">
			
 
				-        <title>drivers/w1/w1_io.c</title>
			
 
				-        <para>W1 input/output.</para>
			
 
				-!Edrivers/w1/w1_io.c
			
 
				-!Idrivers/w1/w1_io.c
			
 
				-      </sect2>
			
 
				-
			
 
				-    </sect1>
			
 
				-
			
 
				-
			
 
				-  </chapter>
			
 
				-
			
 
				-</book>
			
--- a/Documentation/DocBook/z8530book.tmpl
+++ b/Documentation/DocBook/z8530book.tmpl
@@ -1,371 +0,0 @@
 
				-<?xml version="1.0" encoding="UTF-8"?>
			
 
				-<!DOCTYPE book PUBLIC "-//OASIS//DTD DocBook XML V4.1.2//EN"
			
 
				-	"http://www.oasis-open.org/docbook/xml/4.1.2/docbookx.dtd" []>
			
 
				-
			
 
				-<book id="Z85230Guide">
			
 
				- <bookinfo>
			
 
				-  <title>Z8530 Programming Guide</title>
			
 
				-  
			
 
				-  <authorgroup>
			
 
				-   <author>
			
 
				-    <firstname>Alan</firstname>
			
 
				-    <surname>Cox</surname>
			
 
				-    <affiliation>
			
 
				-     <address>
			
 
				-      <email>alan@lxorguk.ukuu.org.uk</email>
			
 
				-     </address>
			
 
				-    </affiliation>
			
 
				-   </author>
			
 
				-  </authorgroup>
			
 
				-
			
 
				-  <copyright>
			
 
				-   <year>2000</year>
			
 
				-   <holder>Alan Cox</holder>
			
 
				-  </copyright>
			
 
				-
			
 
				-  <legalnotice>
			
 
				-   <para>
			
 
				-     This documentation is free software; you can redistribute
			
 
				-     it and/or modify it under the terms of the GNU General Public
			
 
				-     License as published by the Free Software Foundation; either
			
 
				-     version 2 of the License, or (at your option) any later
			
 
				-     version.
			
 
				-   </para>
			
 
				-      
			
 
				-   <para>
			
 
				-     This program is distributed in the hope that it will be
			
 
				-     useful, but WITHOUT ANY WARRANTY; without even the implied
			
 
				-     warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
			
 
				-     See the GNU General Public License for more details.
			
 
				-   </para>
			
 
				-      
			
 
				-   <para>
			
 
				-     You should have received a copy of the GNU General Public
			
 
				-     License along with this program; if not, write to the Free
			
 
				-     Software Foundation, Inc., 59 Temple Place, Suite 330, Boston,
			
 
				-     MA 02111-1307 USA
			
 
				-   </para>
			
 
				-      
			
 
				-   <para>
			
 
				-     For more details see the file COPYING in the source
			
 
				-     distribution of Linux.
			
 
				-   </para>
			
 
				-  </legalnotice>
			
 
				- </bookinfo>
			
 
				-
			
 
				-<toc></toc>
			
 
				-
			
 
				-  <chapter id="intro">
			
 
				-      <title>Introduction</title>
			
 
				-  <para>
			
 
				-	The Z85x30 family synchronous/asynchronous controller chips are
			
 
				-	used on a large number of cheap network interface cards. The
			
 
				-	kernel provides a core interface layer that is designed to make
			
 
				-	it easy to provide WAN services using this chip.
			
 
				-  </para>
			
 
				-  <para>
			
 
				-	The current driver only support synchronous operation. Merging the
			
 
				-	asynchronous driver support into this code to allow any Z85x30
			
 
				-	device to be used as both a tty interface and as a synchronous 
			
 
				-	controller is a project for Linux post the 2.4 release
			
 
				-  </para>
			
 
				-  </chapter>
			
 
				-  
			
 
				-  <chapter id="Driver_Modes">
			
 
				- 	<title>Driver Modes</title>
			
 
				-  <para>
			
 
				-	The Z85230 driver layer can drive Z8530, Z85C30 and Z85230 devices
			
 
				-	in three different modes. Each mode can be applied to an individual
			
 
				-	channel on the chip (each chip has two channels).
			
 
				-  </para>
			
 
				-  <para>
			
 
				-	The PIO synchronous mode supports the most common Z8530 wiring. Here
			
 
				-	the chip is interface to the I/O and interrupt facilities of the
			
 
				-	host machine but not to the DMA subsystem. When running PIO the
			
 
				-	Z8530 has extremely tight timing requirements. Doing high speeds,
			
 
				-	even with a Z85230 will be tricky. Typically you should expect to
			
 
				-	achieve at best 9600 baud with a Z8C530 and 64Kbits with a Z85230.
			
 
				-  </para>
			
 
				-  <para>
			
 
				-	The DMA mode supports the chip when it is configured to use dual DMA
			
 
				-	channels on an ISA bus. The better cards tend to support this mode
			
 
				-	of operation for a single channel. With DMA running the Z85230 tops
			
 
				-	out when it starts to hit ISA DMA constraints at about 512Kbits. It
			
 
				-	is worth noting here that many PC machines hang or crash when the
			
 
				-	chip is driven fast enough to hold the ISA bus solid.
			
 
				-  </para>
			
 
				-  <para>
			
 
				-	Transmit DMA mode uses a single DMA channel. The DMA channel is used
			
 
				-	for transmission as the transmit FIFO is smaller than the receive
			
 
				-	FIFO. it gives better performance than pure PIO mode but is nowhere
			
 
				-	near as ideal as pure DMA mode. 
			
 
				-  </para>
			
 
				-  </chapter>
			
 
				-
			
 
				-  <chapter id="Using_the_Z85230_driver">
			
 
				- 	<title>Using the Z85230 driver</title>
			
 
				-  <para>
			
 
				-	The Z85230 driver provides the back end interface to your board. To
			
 
				-	configure a Z8530 interface you need to detect the board and to 
			
 
				-	identify its ports and interrupt resources. It is also your problem
			
 
				-	to verify the resources are available.
			
 
				-  </para>
			
 
				-  <para>
			
 
				-	Having identified the chip you need to fill in a struct z8530_dev,
			
 
				-	which describes each chip. This object must exist until you finally
			
 
				-	shutdown the board. Firstly zero the active field. This ensures 
			
 
				-	nothing goes off without you intending it. The irq field should
			
 
				-	be set to the interrupt number of the chip. (Each chip has a single
			
 
				-	interrupt source rather than each channel). You are responsible
			
 
				-	for allocating the interrupt line. The interrupt handler should be
			
 
				-	set to <function>z8530_interrupt</function>. The device id should
			
 
				-	be set to the z8530_dev structure pointer. Whether the interrupt can
			
 
				-	be shared or not is board dependent, and up to you to initialise.
			
 
				-  </para>
			
 
				-  <para>
			
 
				-	The structure holds two channel structures. 
			
 
				-	Initialise chanA.ctrlio and chanA.dataio with the address of the
			
 
				-	control and data ports. You can or this with Z8530_PORT_SLEEP to
			
 
				-	indicate your interface needs the 5uS delay for chip settling done
			
 
				-	in software. The PORT_SLEEP option is architecture specific. Other
			
 
				-	flags may become available on future platforms, eg for MMIO.
			
 
				-	Initialise the chanA.irqs to &amp;z8530_nop to start the chip up
			
 
				-	as disabled and discarding interrupt events. This ensures that
			
 
				-	stray interrupts will be mopped up and not hang the bus. Set
			
 
				-	chanA.dev to point to the device structure itself. The
			
 
				-	private and name field you may use as you wish. The private field
			
 
				-	is unused by the Z85230 layer. The name is used for error reporting
			
 
				-	and it may thus make sense to make it match the network name.
			
 
				-  </para>
			
 
				-  <para>
			
 
				-	Repeat the same operation with the B channel if your chip has
			
 
				-	both channels wired to something useful. This isn't always the
			
 
				-	case. If it is not wired then the I/O values do not matter, but
			
 
				-	you must initialise chanB.dev.
			
 
				-  </para>
			
 
				-  <para>
			
 
				-	If your board has DMA facilities then initialise the txdma and
			
 
				-	rxdma fields for the relevant channels. You must also allocate the
			
 
				-	ISA DMA channels and do any necessary board level initialisation
			
 
				-	to configure them. The low level driver will do the Z8530 and
			
 
				-	DMA controller programming but not board specific magic.
			
 
				-  </para>
			
 
				-  <para>
			
 
				-	Having initialised the device you can then call
			
 
				-	<function>z8530_init</function>. This will probe the chip and 
			
 
				-	reset it into a known state. An identification sequence is then
			
 
				-	run to identify the chip type. If the checks fail to pass the
			
 
				-	function returns a non zero error code. Typically this indicates
			
 
				-	that the port given is not valid. After this call the
			
 
				-	type field of the z8530_dev structure is initialised to either
			
 
				-	Z8530, Z85C30 or Z85230 according to the chip found.
			
 
				-  </para>
			
 
				-  <para>
			
 
				-	Once you have called z8530_init you can also make use of the utility
			
 
				-	function <function>z8530_describe</function>. This provides a 
			
 
				-	consistent reporting format for the Z8530 devices, and allows all
			
 
				-	the drivers to provide consistent reporting.
			
 
				-  </para>
			
 
				-  </chapter>
			
 
				-
			
 
				-  <chapter id="Attaching_Network_Interfaces">
			
 
				- 	<title>Attaching Network Interfaces</title>
			
 
				-  <para>
			
 
				-	If you wish to use the network interface facilities of the driver,
			
 
				-	then you need to attach a network device to each channel that is
			
 
				-	present and in use. In addition to use the generic HDLC
			
 
				-	you need to follow some additional plumbing rules. They may seem 
			
 
				-	complex but a look at the example hostess_sv11 driver should
			
 
				-	reassure you.
			
 
				-  </para>
			
 
				-  <para>
			
 
				-	The network device used for each channel should be pointed to by
			
 
				-	the netdevice field of each channel. The hdlc-&gt; priv field of the
			
 
				-	network device points to your private data - you will need to be
			
 
				-	able to find your private data from this.
			
 
				-  </para>
			
 
				-  <para>
			
 
				-	The way most drivers approach this particular problem is to
			
 
				-	create a structure holding the Z8530 device definition and
			
 
				-	put that into the private field of the network device. The
			
 
				-	network device fields of the channels then point back to the
			
 
				-	network devices.
			
 
				-  </para>
			
 
				-  <para>
			
 
				-	If you wish to use the generic HDLC then you need to register
			
 
				-	the HDLC device.
			
 
				-  </para>
			
 
				-  <para>
			
 
				-	Before you register your network device you will also need to
			
 
				-	provide suitable handlers for most of the network device callbacks. 
			
 
				-	See the network device documentation for more details on this.
			
 
				-  </para>
			
 
				-  </chapter>
			
 
				-
			
 
				-  <chapter id="Configuring_And_Activating_The_Port">
			
 
				- 	<title>Configuring And Activating The Port</title>
			
 
				-  <para>
			
 
				-	The Z85230 driver provides helper functions and tables to load the
			
 
				-	port registers on the Z8530 chips. When programming the register
			
 
				-	settings for a channel be aware that the documentation recommends
			
 
				-	initialisation orders. Strange things happen when these are not
			
 
				-	followed. 
			
 
				-  </para>
			
 
				-  <para>
			
 
				-	<function>z8530_channel_load</function> takes an array of
			
 
				-	pairs of initialisation values in an array of u8 type. The first
			
 
				-	value is the Z8530 register number. Add 16 to indicate the alternate
			
 
				-	register bank on the later chips. The array is terminated by a 255.
			
 
				-  </para>
			
 
				-  <para>
			
 
				-	The driver provides a pair of public tables. The
			
 
				-	z8530_hdlc_kilostream table is for the UK 'Kilostream' service and
			
 
				-	also happens to cover most other end host configurations. The
			
 
				-	z8530_hdlc_kilostream_85230 table is the same configuration using
			
 
				-	the enhancements of the 85230 chip. The configuration loaded is
			
 
				-	standard NRZ encoded synchronous data with HDLC bitstuffing. All
			
 
				-	of the timing is taken from the other end of the link.
			
 
				-  </para>
			
 
				-  <para>
			
 
				-	When writing your own tables be aware that the driver internally
			
 
				-	tracks register values. It may need to reload values. You should
			
 
				-	therefore be sure to set registers 1-7, 9-11, 14 and 15 in all
			
 
				-	configurations. Where the register settings depend on DMA selection
			
 
				-	the driver will update the bits itself when you open or close.
			
 
				-	Loading a new table with the interface open is not recommended.
			
 
				-  </para>
			
 
				-  <para>
			
 
				-	There are three standard configurations supported by the core
			
 
				-	code. In PIO mode the interface is programmed up to use
			
 
				-	interrupt driven PIO. This places high demands on the host processor
			
 
				-	to avoid latency. The driver is written to take account of latency
			
 
				-	issues but it cannot avoid latencies caused by other drivers,
			
 
				-	notably IDE in PIO mode. Because the drivers allocate buffers you
			
 
				-	must also prevent MTU changes while the port is open.
			
 
				-  </para>
			
 
				-  <para>
			
 
				-	Once the port is open it will call the rx_function of each channel
			
 
				-	whenever a completed packet arrived. This is invoked from
			
 
				-	interrupt context and passes you the channel and a network	
			
 
				-	buffer (struct sk_buff) holding the data. The data includes
			
 
				-	the CRC bytes so most users will want to trim the last two
			
 
				-	bytes before processing the data. This function is very timing
			
 
				-	critical. When you wish to simply discard data the support
			
 
				-	code provides the function <function>z8530_null_rx</function>
			
 
				-	to discard the data.
			
 
				-  </para>
			
 
				-  <para>
			
 
				-	To active PIO mode sending and receiving the <function>
			
 
				-	z8530_sync_open</function> is called. This expects to be passed
			
 
				-	the network device and the channel. Typically this is called from
			
 
				-	your network device open callback. On a failure a non zero error
			
 
				-	status is returned. The <function>z8530_sync_close</function> 
			
 
				-	function shuts down a PIO channel. This must be done before the 
			
 
				-	channel is opened again	and before the driver shuts down 
			
 
				-	and unloads.
			
 
				-  </para>
			
 
				-  <para>
			
 
				-	The ideal mode of operation is dual channel DMA mode. Here the
			
 
				-	kernel driver will configure the board for DMA in both directions.
			
 
				-	The driver also handles ISA DMA issues such as controller
			
 
				-	programming and the memory range limit for you. This mode is
			
 
				-	activated by calling the <function>z8530_sync_dma_open</function>
			
 
				-	function. On failure a non zero error value is returned.
			
 
				-	Once this mode is activated it can be shut down by calling the
			
 
				-	<function>z8530_sync_dma_close</function>. You must call the close
			
 
				-	function matching the open mode you used.
			
 
				-  </para>
			
 
				-  <para>
			
 
				-	The final supported mode uses a single DMA channel to drive the
			
 
				-	transmit side. As the Z85C30 has a larger FIFO on the receive
			
 
				-	channel	this tends to increase the maximum speed a little. 
			
 
				-	This is activated by calling the <function>z8530_sync_txdma_open
			
 
				-	</function>. This returns a non zero error code on failure. The
			
 
				-	<function>z8530_sync_txdma_close</function> function closes down
			
 
				-	the Z8530 interface from this mode.
			
 
				-  </para>
			
 
				-  </chapter>
			
 
				-
			
 
				-  <chapter id="Network_Layer_Functions">
			
 
				- 	<title>Network Layer Functions</title>
			
 
				-  <para>
			
 
				-	The Z8530 layer provides functions to queue packets for
			
 
				-	transmission. The driver internally buffers the frame currently
			
 
				-	being transmitted and one further frame (in order to keep back
			
 
				-	to back transmission running). Any further buffering is up to
			
 
				-	the caller.
			
 
				-  </para>
			
 
				-  <para>
			
 
				-	The function <function>z8530_queue_xmit</function> takes a network
			
 
				-	buffer in sk_buff format and queues it for transmission. The
			
 
				-	caller must provide the entire packet with the exception of the
			
 
				-	bitstuffing and CRC. This is normally done by the caller via
			
 
				-	the generic HDLC interface layer. It returns 0 if the buffer has been
			
 
				-	queued and non zero values for queue full. If the function accepts
			
 
				-	the buffer it becomes property of the Z8530 layer and the caller
			
 
				-	should not free it.
			
 
				-  </para>
			
 
				-  <para>
			
 
				-	The function <function>z8530_get_stats</function> returns a pointer
			
 
				-	to an internally maintained per interface statistics block. This
			
 
				-	provides most of the interface code needed to implement the network
			
 
				-	layer get_stats callback.
			
 
				-  </para>
			
 
				-  </chapter>
			
 
				-
			
 
				-  <chapter id="Porting_The_Z8530_Driver">
			
 
				-     <title>Porting The Z8530 Driver</title>
			
 
				-  <para>
			
 
				-	The Z8530 driver is written to be portable. In DMA mode it makes
			
 
				-	assumptions about the use of ISA DMA. These are probably warranted
			
 
				-	in most cases as the Z85230 in particular was designed to glue to PC
			
 
				-	type machines. The PIO mode makes no real assumptions.
			
 
				-  </para>
			
 
				-  <para>
			
 
				-	Should you need to retarget the Z8530 driver to another architecture
			
 
				-	the only code that should need changing are the port I/O functions.
			
 
				-	At the moment these assume PC I/O port accesses. This may not be
			
 
				-	appropriate for all platforms. Replacing 
			
 
				-	<function>z8530_read_port</function> and <function>z8530_write_port
			
 
				-	</function> is intended to be all that is required to port this
			
 
				-	driver layer.
			
 
				-  </para>
			
 
				-  </chapter>
			
 
				-
			
 
				-  <chapter id="bugs">
			
 
				-     <title>Known Bugs And Assumptions</title>
			
 
				-  <para>
			
 
				-  <variablelist>
			
 
				-    <varlistentry><term>Interrupt Locking</term>
			
 
				-    <listitem>
			
 
				-    <para>
			
 
				-	The locking in the driver is done via the global cli/sti lock. This
			
 
				-	makes for relatively poor SMP performance. Switching this to use a
			
 
				-	per device spin lock would probably materially improve performance.
			
 
				-    </para>
			
 
				-    </listitem></varlistentry>
			
 
				-
			
 
				-    <varlistentry><term>Occasional Failures</term>
			
 
				-    <listitem>
			
 
				-    <para>
			
 
				-	We have reports of occasional failures when run for very long
			
 
				-	periods of time and the driver starts to receive junk frames. At
			
 
				-	the moment the cause of this is not clear.
			
 
				-    </para>
			
 
				-    </listitem></varlistentry>
			
 
				-  </variablelist>
			
 
				-	
			
 
				-  </para>
			
 
				-  </chapter>
			
 
				-
			
 
				-  <chapter id="pubfunctions">
			
 
				-     <title>Public Functions Provided</title>
			
 
				-!Edrivers/net/wan/z85230.c
			
 
				-  </chapter>
			
 
				-
			
 
				-  <chapter id="intfunctions">
			
 
				-     <title>Internal Functions</title>
			
 
				-!Idrivers/net/wan/z85230.c
			
 
				-  </chapter>
			
 
				-
			
 
				-</book>
			
--- a/Documentation/IPMI.txt
+++ b/Documentation/IPMI.txt
@@ -1,9 +1,8 @@
 
				+=====================
			
 
				+The Linux IPMI Driver
			
 
				+=====================
			
 
				 
			
 
				-                          The Linux IPMI Driver
			
 
				-			  ---------------------
			
 
				-			      Corey Minyard
			
 
				-			  <minyard@mvista.com>
			
 
				-			    <minyard@acm.org>
			
 
				+:Author: Corey Minyard <minyard@mvista.com> / <minyard@acm.org>
			
 
				 
			
 
				 The Intelligent Platform Management Interface, or IPMI, is a
			
 
				 standard for controlling intelligent devices that monitor a system.
			
@@ -141,7 +140,7 @@ Addressing
 
				 ----------
			
 
				 
			
 
				 The IPMI addressing works much like IP addresses, you have an overlay
			
 
				-to handle the different address types.  The overlay is:
			
 
				+to handle the different address types.  The overlay is::
			
 
				 
			
 
				   struct ipmi_addr
			
 
				   {
			
@@ -153,7 +152,7 @@ to handle the different address types.  The overlay is:
 
				 The addr_type determines what the address really is.  The driver
			
 
				 currently understands two different types of addresses.
			
 
				 
			
 
				-"System Interface" addresses are defined as:
			
 
				+"System Interface" addresses are defined as::
			
 
				 
			
 
				   struct ipmi_system_interface_addr
			
 
				   {
			
@@ -166,7 +165,7 @@ straight to the BMC on the current card.  The channel must be
 
				 IPMI_BMC_CHANNEL.
			
 
				 
			
 
				 Messages that are destined to go out on the IPMB bus use the
			
 
				-IPMI_IPMB_ADDR_TYPE address type.  The format is
			
 
				+IPMI_IPMB_ADDR_TYPE address type.  The format is::
			
 
				 
			
 
				   struct ipmi_ipmb_addr
			
 
				   {
			
@@ -184,16 +183,16 @@ spec.
 
				 Messages
			
 
				 --------
			
 
				 
			
 
				-Messages are defined as:
			
 
				+Messages are defined as::
			
 
				 
			
 
				-struct ipmi_msg
			
 
				-{
			
 
				+  struct ipmi_msg
			
 
				+  {
			
 
				 	unsigned char netfn;
			
 
				 	unsigned char lun;
			
 
				 	unsigned char cmd;
			
 
				 	unsigned char *data;
			
 
				 	int           data_len;
			
 
				-};
			
 
				+  };
			
 
				 
			
 
				 The driver takes care of adding/stripping the header information.  The
			
 
				 data portion is just the data to be send (do NOT put addressing info
			
@@ -208,7 +207,7 @@ block of data, even when receiving messages.  Otherwise the driver
 
				 will have no place to put the message.
			
 
				 
			
 
				 Messages coming up from the message handler in kernelland will come in
			
 
				-as:
			
 
				+as::
			
 
				 
			
 
				   struct ipmi_recv_msg
			
 
				   {
			
@@ -246,6 +245,7 @@ and the user should not have to care what type of SMI is below them.
 
				 
			
 
				 
			
 
				 Watching For Interfaces
			
 
				+^^^^^^^^^^^^^^^^^^^^^^^
			
 
				 
			
 
				 When your code comes up, the IPMI driver may or may not have detected
			
 
				 if IPMI devices exist.  So you might have to defer your setup until
			
@@ -256,6 +256,7 @@ and tell you when they come and go.
 
				 
			
 
				 
			
 
				 Creating the User
			
 
				+^^^^^^^^^^^^^^^^^
			
 
				 
			
 
				 To use the message handler, you must first create a user using
			
 
				 ipmi_create_user.  The interface number specifies which SMI you want
			
@@ -272,6 +273,7 @@ closing the device automatically destroys the user.
 
				 
			
 
				 
			
 
				 Messaging
			
 
				+^^^^^^^^^
			
 
				 
			
 
				 To send a message from kernel-land, the ipmi_request_settime() call does
			
 
				 pretty much all message handling.  Most of the parameter are
			
@@ -321,6 +323,7 @@ though, since it is tricky to manage your own buffers.
 
				 
			
 
				 
			
 
				 Events and Incoming Commands
			
 
				+^^^^^^^^^^^^^^^^^^^^^^^^^^^^
			
 
				 
			
 
				 The driver takes care of polling for IPMI events and receiving
			
 
				 commands (commands are messages that are not responses, they are
			
@@ -367,7 +370,7 @@ in the system.  It discovers interfaces through a host of different
 
				 methods, depending on the system.
			
 
				 
			
 
				 You can specify up to four interfaces on the module load line and
			
 
				-control some module parameters:
			
 
				+control some module parameters::
			
 
				 
			
 
				   modprobe ipmi_si.o type=<type1>,<type2>....
			
 
				        ports=<port1>,<port2>... addrs=<addr1>,<addr2>...
			
@@ -437,7 +440,7 @@ default is one.  Setting to 0 is useful with the hotmod, but is
 
				 obviously only useful for modules.
			
 
				 
			
 
				 When compiled into the kernel, the parameters can be specified on the
			
 
				-kernel command line as:
			
 
				+kernel command line as::
			
 
				 
			
 
				   ipmi_si.type=<type1>,<type2>...
			
 
				        ipmi_si.ports=<port1>,<port2>... ipmi_si.addrs=<addr1>,<addr2>...
			
@@ -474,16 +477,22 @@ The driver supports a hot add and remove of interfaces.  This way,
 
				 interfaces can be added or removed after the kernel is up and running.
			
 
				 This is done using /sys/modules/ipmi_si/parameters/hotmod, which is a
			
 
				 write-only parameter.  You write a string to this interface.  The string
			
 
				-has the format:
			
 
				+has the format::
			
 
				+
			
 
				    <op1>[:op2[:op3...]]
			
 
				-The "op"s are:
			
 
				+
			
 
				+The "op"s are::
			
 
				+
			
 
				    add|remove,kcs|bt|smic,mem|i/o,<address>[,<opt1>[,<opt2>[,...]]]
			
 
				-You can specify more than one interface on the line.  The "opt"s are:
			
 
				+
			
 
				+You can specify more than one interface on the line.  The "opt"s are::
			
 
				+
			
 
				    rsp=<regspacing>
			
 
				    rsi=<regsize>
			
 
				    rsh=<regshift>
			
 
				    irq=<irq>
			
 
				    ipmb=<ipmb slave addr>
			
 
				+
			
 
				 and these have the same meanings as discussed above.  Note that you
			
 
				 can also use this on the kernel command line for a more compact format
			
 
				 for specifying an interface.  Note that when removing an interface,
			
@@ -496,7 +505,7 @@ The SMBus Driver (SSIF)
 
				 The SMBus driver allows up to 4 SMBus devices to be configured in the
			
 
				 system.  By default, the driver will only register with something it
			
 
				 finds in DMI or ACPI tables.  You can change this
			
 
				-at module load time (for a module) with:
			
 
				+at module load time (for a module) with::
			
 
				 
			
 
				   modprobe ipmi_ssif.o
			
 
				 	addr=<i2caddr1>[,<i2caddr2>[,...]]
			
@@ -535,7 +544,7 @@ the smb_addr parameter unless you have DMI or ACPI data to tell the
 
				 driver what to use.
			
 
				 
			
 
				 When compiled into the kernel, the addresses can be specified on the
			
 
				-kernel command line as:
			
 
				+kernel command line as::
			
 
				 
			
 
				   ipmi_ssif.addr=<i2caddr1>[,<i2caddr2>[...]]
			
 
				 	ipmi_ssif.adapter=<adapter1>[,<adapter2>[...]]
			
@@ -565,9 +574,9 @@ Some users need more detailed information about a device, like where
 
				 the address came from or the raw base device for the IPMI interface.
			
 
				 You can use the IPMI smi_watcher to catch the IPMI interfaces as they
			
 
				 come or go, and to grab the information, you can use the function
			
 
				-ipmi_get_smi_info(), which returns the following structure:
			
 
				+ipmi_get_smi_info(), which returns the following structure::
			
 
				 
			
 
				-struct ipmi_smi_info {
			
 
				+  struct ipmi_smi_info {
			
 
				 	enum ipmi_addr_src addr_src;
			
 
				 	struct device *dev;
			
 
				 	union {
			
@@ -575,7 +584,7 @@ struct ipmi_smi_info {
 
				 			void *acpi_handle;
			
 
				 		} acpi_info;
			
 
				 	} addr_info;
			
 
				-};
			
 
				+  };
			
 
				 
			
 
				 Currently special info for only SI_ACPI address sources is
			
 
				 returned.  Others may be added as necessary.
			
@@ -590,7 +599,7 @@ Watchdog
 
				 
			
 
				 A watchdog timer is provided that implements the Linux-standard
			
 
				 watchdog timer interface.  It has three module parameters that can be
			
 
				-used to control it:
			
 
				+used to control it::
			
 
				 
			
 
				   modprobe ipmi_watchdog timeout=<t> pretimeout=<t> action=<action type>
			
 
				       preaction=<preaction type> preop=<preop type> start_now=x
			
@@ -635,7 +644,7 @@ watchdog device is closed.  The default value of nowayout is true
 
				 if the CONFIG_WATCHDOG_NOWAYOUT option is enabled, or false if not.
			
 
				 
			
 
				 When compiled into the kernel, the kernel command line is available
			
 
				-for configuring the watchdog:
			
 
				+for configuring the watchdog::
			
 
				 
			
 
				   ipmi_watchdog.timeout=<t> ipmi_watchdog.pretimeout=<t>
			
 
				 	ipmi_watchdog.action=<action type>
			
@@ -675,6 +684,7 @@ also get a bunch of OEM events holding the panic string.
 
				 
			
 
				 
			
 
				 The field settings of the events are:
			
 
				+
			
 
				 * Generator ID: 0x21 (kernel)
			
 
				 * EvM Rev: 0x03 (this event is formatted in IPMI 1.0 format)
			
 
				 * Sensor Type: 0x20 (OS critical stop sensor)
			
@@ -683,18 +693,20 @@ The field settings of the events are:
 
				 * Event Data 1: 0xa1 (Runtime stop in OEM bytes 2 and 3)
			
 
				 * Event data 2: second byte of panic string
			
 
				 * Event data 3: third byte of panic string
			
 
				+
			
 
				 See the IPMI spec for the details of the event layout.  This event is
			
 
				 always sent to the local management controller.  It will handle routing
			
 
				 the message to the right place.
			
 
				 
			
 
				 Other OEM events have the following format:
			
 
				-Record ID (bytes 0-1): Set by the SEL.
			
 
				-Record type (byte 2): 0xf0 (OEM non-timestamped)
			
 
				-byte 3: The slave address of the card saving the panic
			
 
				-byte 4: A sequence number (starting at zero)
			
 
				-The rest of the bytes (11 bytes) are the panic string.  If the panic string
			
 
				-is longer than 11 bytes, multiple messages will be sent with increasing
			
 
				-sequence numbers.
			
 
				+
			
 
				+* Record ID (bytes 0-1): Set by the SEL.
			
 
				+* Record type (byte 2): 0xf0 (OEM non-timestamped)
			
 
				+* byte 3: The slave address of the card saving the panic
			
 
				+* byte 4: A sequence number (starting at zero)
			
 
				+  The rest of the bytes (11 bytes) are the panic string.  If the panic string
			
 
				+  is longer than 11 bytes, multiple messages will be sent with increasing
			
 
				+  sequence numbers.
			
 
				 
			
 
				 Because you cannot send OEM events using the standard interface, this
			
 
				 function will attempt to find an SEL and add the events there.  It
			
--- a/Documentation/IRQ-affinity.txt
+++ b/Documentation/IRQ-affinity.txt
@@ -1,8 +1,11 @@
 
				+================
			
 
				+SMP IRQ affinity
			
 
				+================
			
 
				+
			
 
				 ChangeLog:
			
 
				-	Started by Ingo Molnar <mingo@redhat.com>
			
 
				-	Update by Max Krasnyansky <maxk@qualcomm.com>
			
 
				+	- Started by Ingo Molnar <mingo@redhat.com>
			
 
				+	- Update by Max Krasnyansky <maxk@qualcomm.com>
			
 
				 
			
 
				-SMP IRQ affinity
			
 
				 
			
 
				 /proc/irq/IRQ#/smp_affinity and /proc/irq/IRQ#/smp_affinity_list specify
			
 
				 which target CPUs are permitted for a given IRQ source.  It's a bitmask
			
@@ -16,50 +19,52 @@ will be set to the default mask. It can then be changed as described above.
 
				 Default mask is 0xffffffff.
			
 
				 
			
 
				 Here is an example of restricting IRQ44 (eth1) to CPU0-3 then restricting
			
 
				-it to CPU4-7 (this is an 8-CPU SMP box):
			
 
				+it to CPU4-7 (this is an 8-CPU SMP box)::
			
 
				 
			
 
				-[root@moon 44]# cd /proc/irq/44
			
 
				-[root@moon 44]# cat smp_affinity
			
 
				-ffffffff
			
 
				+	[root@moon 44]# cd /proc/irq/44
			
 
				+	[root@moon 44]# cat smp_affinity
			
 
				+	ffffffff
			
 
				 
			
 
				-[root@moon 44]# echo 0f > smp_affinity
			
 
				-[root@moon 44]# cat smp_affinity
			
 
				-0000000f
			
 
				-[root@moon 44]# ping -f h
			
 
				-PING hell (195.4.7.3): 56 data bytes
			
 
				-...
			
 
				---- hell ping statistics ---
			
 
				-6029 packets transmitted, 6027 packets received, 0% packet loss
			
 
				-round-trip min/avg/max = 0.1/0.1/0.4 ms
			
 
				-[root@moon 44]# cat /proc/interrupts | grep 'CPU\|44:'
			
 
				-           CPU0       CPU1       CPU2       CPU3      CPU4       CPU5        CPU6       CPU7
			
 
				- 44:       1068       1785       1785       1783         0          0           0         0    IO-APIC-level  eth1
			
 
				+	[root@moon 44]# echo 0f > smp_affinity
			
 
				+	[root@moon 44]# cat smp_affinity
			
 
				+	0000000f
			
 
				+	[root@moon 44]# ping -f h
			
 
				+	PING hell (195.4.7.3): 56 data bytes
			
 
				+	...
			
 
				+	--- hell ping statistics ---
			
 
				+	6029 packets transmitted, 6027 packets received, 0% packet loss
			
 
				+	round-trip min/avg/max = 0.1/0.1/0.4 ms
			
 
				+	[root@moon 44]# cat /proc/interrupts | grep 'CPU\|44:'
			
 
				+		CPU0       CPU1       CPU2       CPU3      CPU4       CPU5        CPU6       CPU7
			
 
				+	44:       1068       1785       1785       1783         0          0           0         0    IO-APIC-level  eth1
			
 
				 
			
 
				 As can be seen from the line above IRQ44 was delivered only to the first four
			
 
				 processors (0-3).
			
 
				 Now let's restrict that IRQ to CPU(4-7).
			
 
				 
			
 
				-[root@moon 44]# echo f0 > smp_affinity
			
 
				-[root@moon 44]# cat smp_affinity
			
 
				-000000f0
			
 
				-[root@moon 44]# ping -f h
			
 
				-PING hell (195.4.7.3): 56 data bytes
			
 
				-..
			
 
				---- hell ping statistics ---
			
 
				-2779 packets transmitted, 2777 packets received, 0% packet loss
			
 
				-round-trip min/avg/max = 0.1/0.5/585.4 ms
			
 
				-[root@moon 44]# cat /proc/interrupts |  'CPU\|44:'
			
 
				-           CPU0       CPU1       CPU2       CPU3      CPU4       CPU5        CPU6       CPU7
			
 
				- 44:       1068       1785       1785       1783      1784       1069        1070       1069   IO-APIC-level  eth1
			
 
				+::
			
 
				+
			
 
				+	[root@moon 44]# echo f0 > smp_affinity
			
 
				+	[root@moon 44]# cat smp_affinity
			
 
				+	000000f0
			
 
				+	[root@moon 44]# ping -f h
			
 
				+	PING hell (195.4.7.3): 56 data bytes
			
 
				+	..
			
 
				+	--- hell ping statistics ---
			
 
				+	2779 packets transmitted, 2777 packets received, 0% packet loss
			
 
				+	round-trip min/avg/max = 0.1/0.5/585.4 ms
			
 
				+	[root@moon 44]# cat /proc/interrupts | grep 'CPU\|44:'
			
 
				+		CPU0       CPU1       CPU2       CPU3      CPU4       CPU5        CPU6       CPU7
			
 
				+	44:       1068       1785       1785       1783      1784       1069        1070       1069   IO-APIC-level  eth1
			
 
				 
			
 
				 This time around IRQ44 was delivered only to the last four processors.
			
 
				 i.e. counters for the CPU0-3 did not change.
			
 
				 
			
 
				-Here is an example of limiting that same irq (44) to cpus 1024 to 1031:
			
 
				+Here is an example of limiting that same irq (44) to cpus 1024 to 1031::
			
 
				 
			
 
				-[root@moon 44]# echo 1024-1031 > smp_affinity_list
			
 
				-[root@moon 44]# cat smp_affinity_list
			
 
				-1024-1031
			
 
				+	[root@moon 44]# echo 1024-1031 > smp_affinity_list
			
 
				+	[root@moon 44]# cat smp_affinity_list
			
 
				+	1024-1031
			
 
				 
			
 
				 Note that to do this with a bitmask would require 32 bitmasks of zero
			
 
				 to follow the pertinent one.
			
--- a/Documentation/IRQ-domain.txt
+++ b/Documentation/IRQ-domain.txt
@@ -1,4 +1,6 @@
 
				-irq_domain interrupt number mapping library
			
 
				+===============================================
			
 
				+The irq_domain interrupt number mapping library
			
 
				+===============================================
			
 
				 
			
 
				 The current design of the Linux kernel uses a single large number
			
 
				 space where each separate IRQ source is assigned a different number.
			
@@ -36,7 +38,9 @@ irq_domain also implements translation from an abstract irq_fwspec
 
				 structure to hwirq numbers (Device Tree and ACPI GSI so far), and can
			
 
				 be easily extended to support other IRQ topology data sources.
			
 
				 
			
 
				-=== irq_domain usage ===
			
 
				+irq_domain usage
			
 
				+================
			
 
				+
			
 
				 An interrupt controller driver creates and registers an irq_domain by
			
 
				 calling one of the irq_domain_add_*() functions (each mapping method
			
 
				 has a different allocator function, more on that later).  The function
			
@@ -62,15 +66,21 @@ If the driver has the Linux IRQ number or the irq_data pointer, and
 
				 needs to know the associated hwirq number (such as in the irq_chip
			
 
				 callbacks) then it can be directly obtained from irq_data->hwirq.
			
 
				 
			
 
				-=== Types of irq_domain mappings ===
			
 
				+Types of irq_domain mappings
			
 
				+============================
			
 
				+
			
 
				 There are several mechanisms available for reverse mapping from hwirq
			
 
				 to Linux irq, and each mechanism uses a different allocation function.
			
 
				 Which reverse map type should be used depends on the use case.  Each
			
 
				 of the reverse map types are described below:
			
 
				 
			
 
				-==== Linear ====
			
 
				-irq_domain_add_linear()
			
 
				-irq_domain_create_linear()
			
 
				+Linear
			
 
				+------
			
 
				+
			
 
				+::
			
 
				+
			
 
				+	irq_domain_add_linear()
			
 
				+	irq_domain_create_linear()
			
 
				 
			
 
				 The linear reverse map maintains a fixed size table indexed by the
			
 
				 hwirq number.  When a hwirq is mapped, an irq_desc is allocated for
			
@@ -89,9 +99,13 @@ accepts a more general abstraction 'struct fwnode_handle'.
 
				 
			
 
				 The majority of drivers should use the linear map.
			
 
				 
			
 
				-==== Tree ====
			
 
				-irq_domain_add_tree()
			
 
				-irq_domain_create_tree()
			
 
				+Tree
			
 
				+----
			
 
				+
			
 
				+::
			
 
				+
			
 
				+	irq_domain_add_tree()
			
 
				+	irq_domain_create_tree()
			
 
				 
			
 
				 The irq_domain maintains a radix tree map from hwirq numbers to Linux
			
 
				 IRQs.  When an hwirq is mapped, an irq_desc is allocated and the
			
@@ -109,8 +123,12 @@ accepts a more general abstraction 'struct fwnode_handle'.
 
				 
			
 
				 Very few drivers should need this mapping.
			
 
				 
			
 
				-==== No Map ===-
			
 
				-irq_domain_add_nomap()
			
 
				+No Map
			
 
				+------
			
 
				+
			
 
				+::
			
 
				+
			
 
				+	irq_domain_add_nomap()
			
 
				 
			
 
				 The No Map mapping is to be used when the hwirq number is
			
 
				 programmable in the hardware.  In this case it is best to program the
			
@@ -121,10 +139,14 @@ Linux IRQ number into the hardware.
 
				 
			
 
				 Most drivers cannot use this mapping.
			
 
				 
			
 
				-==== Legacy ====
			
 
				-irq_domain_add_simple()
			
 
				-irq_domain_add_legacy()
			
 
				-irq_domain_add_legacy_isa()
			
 
				+Legacy
			
 
				+------
			
 
				+
			
 
				+::
			
 
				+
			
 
				+	irq_domain_add_simple()
			
 
				+	irq_domain_add_legacy()
			
 
				+	irq_domain_add_legacy_isa()
			
 
				 
			
 
				 The Legacy mapping is a special case for drivers that already have a
			
 
				 range of irq_descs allocated for the hwirqs.  It is used when the
			
@@ -163,14 +185,17 @@ that the driver using the simple domain call irq_create_mapping()
 
				 before any irq_find_mapping() since the latter will actually work
			
 
				 for the static IRQ assignment case.
			
 
				 
			
 
				-==== Hierarchy IRQ domain ====
			
 
				+Hierarchy IRQ domain
			
 
				+--------------------
			
 
				+
			
 
				 On some architectures, there may be multiple interrupt controllers
			
 
				 involved in delivering an interrupt from the device to the target CPU.
			
 
				-Let's look at a typical interrupt delivering path on x86 platforms:
			
 
				+Let's look at a typical interrupt delivering path on x86 platforms::
			
 
				 
			
 
				-Device --> IOAPIC -> Interrupt remapping Controller -> Local APIC -> CPU
			
 
				+  Device --> IOAPIC -> Interrupt remapping Controller -> Local APIC -> CPU
			
 
				 
			
 
				 There are three interrupt controllers involved:
			
 
				+
			
 
				 1) IOAPIC controller
			
 
				 2) Interrupt remapping controller
			
 
				 3) Local APIC controller
			
@@ -180,7 +205,8 @@ hardware architecture, an irq_domain data structure is built for each
 
				 interrupt controller and those irq_domains are organized into hierarchy.
			
 
				 When building irq_domain hierarchy, the irq_domain near to the device is
			
 
				 child and the irq_domain near to CPU is parent. So a hierarchy structure
			
 
				-as below will be built for the example above.
			
 
				+as below will be built for the example above::
			
 
				+
			
 
				 	CPU Vector irq_domain (root irq_domain to manage CPU vectors)
			
 
				 		^
			
 
				 		|
			
@@ -190,6 +216,7 @@ as below will be built for the example above.
 
				 	IOAPIC irq_domain (manage IOAPIC delivery entries/pins)
			
 
				 
			
 
				 There are four major interfaces to use hierarchy irq_domain:
			
 
				+
			
 
				 1) irq_domain_alloc_irqs(): allocate IRQ descriptors and interrupt
			
 
				    controller related resources to deliver these interrupts.
			
 
				 2) irq_domain_free_irqs(): free IRQ descriptors and interrupt controller
			
@@ -199,7 +226,8 @@ There are four major interfaces to use hierarchy irq_domain:
 
				 4) irq_domain_deactivate_irq(): deactivate interrupt controller hardware
			
 
				    to stop delivering the interrupt.
			
 
				 
			
 
				-Following changes are needed to support hierarchy irq_domain.
			
 
				+Following changes are needed to support hierarchy irq_domain:
			
 
				+
			
 
				 1) a new field 'parent' is added to struct irq_domain; it's used to
			
 
				    maintain irq_domain hierarchy information.
			
 
				 2) a new field 'parent_data' is added to struct irq_data; it's used to
			
@@ -223,6 +251,7 @@ software architecture.
 
				 
			
 
				 For an interrupt controller driver to support hierarchy irq_domain, it
			
 
				 needs to:
			
 
				+
			
 
				 1) Implement irq_domain_ops.alloc and irq_domain_ops.free
			
 
				 2) Optionally implement irq_domain_ops.activate and
			
 
				    irq_domain_ops.deactivate.
			
@@ -231,5 +260,42 @@ needs to:
 
				 4) No need to implement irq_domain_ops.map and irq_domain_ops.unmap,
			
 
				    they are unused with hierarchy irq_domain.
			
 
				 
			
 
				-Hierarchy irq_domain may also be used to support other architectures,
			
 
				-such as ARM, ARM64 etc.
			
 
				+Hierarchy irq_domain is in no way x86 specific, and is heavily used to
			
 
				+support other architectures, such as ARM, ARM64 etc.
			
 
				+
			
 
				+=== Debugging ===
			
 
				+
			
 
				+If you switch on CONFIG_IRQ_DOMAIN_DEBUG (which depends on
			
 
				+CONFIG_IRQ_DOMAIN and CONFIG_DEBUG_FS), you will find a new file in
			
 
				+your debugfs mount point, called irq_domain_mapping. This file
			
 
				+contains a live snapshot of all the IRQ domains in the system:
			
 
				+
			
 
				+ name              mapped  linear-max  direct-max  devtree-node
			
 
				+ pl061                  8           8           0  /smb/gpio@e0080000
			
 
				+ pl061                  8           8           0  /smb/gpio@e1050000
			
 
				+ pMSI                   0           0           0  /interrupt-controller@e1101000/v2m@e0080000
			
 
				+ MSI                   37           0           0  /interrupt-controller@e1101000/v2m@e0080000
			
 
				+ GICv2m                37           0           0  /interrupt-controller@e1101000/v2m@e0080000
			
 
				+ GICv2                448         448           0  /interrupt-controller@e1101000
			
 
				+
			
 
				+it also iterates over the interrupts to display their mapping in the
			
 
				+domains, and makes the domain stacking visible:
			
 
				+
			
 
				+
			
 
				+irq    hwirq    chip name        chip data           active  type            domain
			
 
				+    1  0x00019  GICv2            0xffff00000916bfd8     *    LINEAR          GICv2
			
 
				+    2  0x0001d  GICv2            0xffff00000916bfd8          LINEAR          GICv2
			
 
				+    3  0x0001e  GICv2            0xffff00000916bfd8     *    LINEAR          GICv2
			
 
				+    4  0x0001b  GICv2            0xffff00000916bfd8     *    LINEAR          GICv2
			
 
				+    5  0x0001a  GICv2            0xffff00000916bfd8          LINEAR          GICv2
			
 
				+[...]
			
 
				+   96  0x81808  MSI              0x          (null)           RADIX          MSI
			
 
				+   96+ 0x00063  GICv2m           0xffff8003ee116980           RADIX          GICv2m
			
 
				+   96+ 0x00063  GICv2            0xffff00000916bfd8          LINEAR          GICv2
			
 
				+   97  0x08800  MSI              0x          (null)     *     RADIX          MSI
			
 
				+   97+ 0x00064  GICv2m           0xffff8003ee116980     *     RADIX          GICv2m
			
 
				+   97+ 0x00064  GICv2            0xffff00000916bfd8     *    LINEAR          GICv2
			
 
				+
			
 
				+Here, interrupts 1-5 are only using a single domain, while 96 and 97
			
 
				+are built out of a stack of three domains, each level performing a
			
 
				+particular function.
			
--- a/Documentation/IRQ.txt
+++ b/Documentation/IRQ.txt
@@ -1,4 +1,6 @@
 
				+===============
			
 
				 What is an IRQ?
			
 
				+===============
			
 
				 
			
 
				 An IRQ is an interrupt request from a device.
			
 
				 Currently they can come in over a pin, or over a packet.
			
--- a/Documentation/Intel-IOMMU.txt
+++ b/Documentation/Intel-IOMMU.txt
@@ -1,3 +1,4 @@
 
				+===================
			
 
				 Linux IOMMU Support
			
 
				 ===================
			
 
				 
			
@@ -9,11 +10,11 @@ This guide gives a quick cheat sheet for some basic understanding.
 
				 
			
 
				 Some Keywords
			
 
				 
			
 
				-DMAR - DMA remapping
			
 
				-DRHD - DMA Remapping Hardware Unit Definition
			
 
				-RMRR - Reserved memory Region Reporting Structure
			
 
				-ZLR  - Zero length reads from PCI devices
			
 
				-IOVA - IO Virtual address.
			
 
				+- DMAR - DMA remapping
			
 
				+- DRHD - DMA Remapping Hardware Unit Definition
			
 
				+- RMRR - Reserved memory Region Reporting Structure
			
 
				+- ZLR  - Zero length reads from PCI devices
			
 
				+- IOVA - IO Virtual address.
			
 
				 
			
 
				 Basic stuff
			
 
				 -----------
			
@@ -33,7 +34,7 @@ devices that need to access these regions. OS is expected to setup
 
				 unity mappings for these regions for these devices to access these regions.
			
 
				 
			
 
				 How is IOVA generated?
			
 
				----------------------
			
 
				+----------------------
			
 
				 
			
 
				 Well behaved drivers call pci_map_*() calls before sending command to device
			
 
				 that needs to perform DMA. Once DMA is completed and mapping is no longer
			
@@ -82,14 +83,14 @@ in ACPI.
 
				 ACPI: DMAR (v001 A M I  OEMDMAR  0x00000001 MSFT 0x00000097) @ 0x000000007f5b5ef0
			
 
				 
			
 
				 When DMAR is being processed and initialized by ACPI, prints DMAR locations
			
 
				-and any RMRR's processed.
			
 
				+and any RMRR's processed::
			
 
				 
			
 
				-ACPI DMAR:Host address width 36
			
 
				-ACPI DMAR:DRHD (flags: 0x00000000)base: 0x00000000fed90000
			
 
				-ACPI DMAR:DRHD (flags: 0x00000000)base: 0x00000000fed91000
			
 
				-ACPI DMAR:DRHD (flags: 0x00000001)base: 0x00000000fed93000
			
 
				-ACPI DMAR:RMRR base: 0x00000000000ed000 end: 0x00000000000effff
			
 
				-ACPI DMAR:RMRR base: 0x000000007f600000 end: 0x000000007fffffff
			
 
				+	ACPI DMAR:Host address width 36
			
 
				+	ACPI DMAR:DRHD (flags: 0x00000000)base: 0x00000000fed90000
			
 
				+	ACPI DMAR:DRHD (flags: 0x00000000)base: 0x00000000fed91000
			
 
				+	ACPI DMAR:DRHD (flags: 0x00000001)base: 0x00000000fed93000
			
 
				+	ACPI DMAR:RMRR base: 0x00000000000ed000 end: 0x00000000000effff
			
 
				+	ACPI DMAR:RMRR base: 0x000000007f600000 end: 0x000000007fffffff
			
 
				 
			
 
				 When DMAR is enabled for use, you will notice..
			
 
				 
			
@@ -98,10 +99,12 @@ PCI-DMA: Using DMAR IOMMU
 
				 Fault reporting
			
 
				 ---------------
			
 
				 
			
 
				-DMAR:[DMA Write] Request device [00:02.0] fault addr 6df084000
			
 
				-DMAR:[fault reason 05] PTE Write access is not set
			
 
				-DMAR:[DMA Write] Request device [00:02.0] fault addr 6df084000
			
 
				-DMAR:[fault reason 05] PTE Write access is not set
			
 
				+::
			
 
				+
			
 
				+	DMAR:[DMA Write] Request device [00:02.0] fault addr 6df084000
			
 
				+	DMAR:[fault reason 05] PTE Write access is not set
			
 
				+	DMAR:[DMA Write] Request device [00:02.0] fault addr 6df084000
			
 
				+	DMAR:[fault reason 05] PTE Write access is not set
			
 
				 
			
 
				 TBD
			
 
				 ----
			
--- a/Documentation/Makefile
+++ b/Documentation/Makefile
@@ -1 +1,126 @@
 
				+# -*- makefile -*-
			
 
				+# Makefile for Sphinx documentation
			
 
				+#
			
 
				+
			
 
				 subdir-y :=
			
 
				+
			
 
				+# You can set these variables from the command line.
			
 
				+SPHINXBUILD   = sphinx-build
			
 
				+SPHINXOPTS    =
			
 
				+SPHINXDIRS    = .
			
 
				+_SPHINXDIRS   = $(patsubst $(srctree)/Documentation/%/conf.py,%,$(wildcard $(srctree)/Documentation/*/conf.py))
			
 
				+SPHINX_CONF   = conf.py
			
 
				+PAPER         =
			
 
				+BUILDDIR      = $(obj)/output
			
 
				+PDFLATEX      = xelatex
			
 
				+LATEXOPTS     = -interaction=batchmode
			
 
				+
			
 
				+# User-friendly check for sphinx-build
			
 
				+HAVE_SPHINX := $(shell if which $(SPHINXBUILD) >/dev/null 2>&1; then echo 1; else echo 0; fi)
			
 
				+
			
 
				+ifeq ($(HAVE_SPHINX),0)
			
 
				+
			
 
				+.DEFAULT:
			
 
				+	$(warning The '$(SPHINXBUILD)' command was not found. Make sure you have Sphinx installed and in PATH, or set the SPHINXBUILD make variable to point to the full path of the '$(SPHINXBUILD)' executable.)
			
 
				+	@echo "  SKIP    Sphinx $@ target."
			
 
				+
			
 
				+else # HAVE_SPHINX
			
 
				+
			
 
				+# User-friendly check for pdflatex
			
 
				+HAVE_PDFLATEX := $(shell if which $(PDFLATEX) >/dev/null 2>&1; then echo 1; else echo 0; fi)
			
 
				+
			
 
				+# Internal variables.
			
 
				+PAPEROPT_a4     = -D latex_paper_size=a4
			
 
				+PAPEROPT_letter = -D latex_paper_size=letter
			
 
				+KERNELDOC       = $(srctree)/scripts/kernel-doc
			
 
				+KERNELDOC_CONF  = -D kerneldoc_srctree=$(srctree) -D kerneldoc_bin=$(KERNELDOC)
			
 
				+ALLSPHINXOPTS   =  $(KERNELDOC_CONF) $(PAPEROPT_$(PAPER)) $(SPHINXOPTS)
			
 
				+# the i18n builder cannot share the environment and doctrees with the others
			
 
				+I18NSPHINXOPTS  = $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) .
			
 
				+
			
 
				+# commands; the 'cmd' from scripts/Kbuild.include is not *loopable*
			
 
				+loop_cmd = $(echo-cmd) $(cmd_$(1)) || exit;
			
 
				+
			
 
				+# $2 sphinx builder e.g. "html"
			
 
				+# $3 name of the build subfolder / e.g. "media", used as:
			
 
				+#    * dest folder relative to $(BUILDDIR) and
			
 
				+#    * cache folder relative to $(BUILDDIR)/.doctrees
			
 
				+# $4 dest subfolder e.g. "man" for man pages at media/man
			
 
				+# $5 reST source folder relative to $(srctree)/$(src),
			
 
				+#    e.g. "media" for the linux-tv book-set at ./Documentation/media
			
 
				+
			
 
				+quiet_cmd_sphinx = SPHINX  $@ --> file://$(abspath $(BUILDDIR)/$3/$4)
			
 
				+      cmd_sphinx = $(MAKE) BUILDDIR=$(abspath $(BUILDDIR)) $(build)=Documentation/media $2 && \
			
 
				+	PYTHONDONTWRITEBYTECODE=1 \
			
 
				+	BUILDDIR=$(abspath $(BUILDDIR)) SPHINX_CONF=$(abspath $(srctree)/$(src)/$5/$(SPHINX_CONF)) \
			
 
				+	$(SPHINXBUILD) \
			
 
				+	-b $2 \
			
 
				+	-c $(abspath $(srctree)/$(src)) \
			
 
				+	-d $(abspath $(BUILDDIR)/.doctrees/$3) \
			
 
				+	-D version=$(KERNELVERSION) -D release=$(KERNELRELEASE) \
			
 
				+	$(ALLSPHINXOPTS) \
			
 
				+	$(abspath $(srctree)/$(src)/$5) \
			
 
				+	$(abspath $(BUILDDIR)/$3/$4)
			
 
				+
			
 
				+htmldocs:
			
 
				+	@+$(foreach var,$(SPHINXDIRS),$(call loop_cmd,sphinx,html,$(var),,$(var)))
			
 
				+
			
 
				+linkcheckdocs:
			
 
				+	@$(foreach var,$(SPHINXDIRS),$(call loop_cmd,sphinx,linkcheck,$(var),,$(var)))
			
 
				+
			
 
				+latexdocs:
			
 
				+	@+$(foreach var,$(SPHINXDIRS),$(call loop_cmd,sphinx,latex,$(var),latex,$(var)))
			
 
				+
			
 
				+ifeq ($(HAVE_PDFLATEX),0)
			
 
				+
			
 
				+pdfdocs:
			
 
				+	$(warning The '$(PDFLATEX)' command was not found. Make sure you have it installed and in PATH to produce PDF output.)
			
 
				+	@echo "  SKIP    Sphinx $@ target."
			
 
				+
			
 
				+else # HAVE_PDFLATEX
			
 
				+
			
 
				+pdfdocs: latexdocs
			
 
				+	$(foreach var,$(SPHINXDIRS), $(MAKE) PDFLATEX=$(PDFLATEX) LATEXOPTS="$(LATEXOPTS)" -C $(BUILDDIR)/$(var)/latex || exit;)
			
 
				+
			
 
				+endif # HAVE_PDFLATEX
			
 
				+
			
 
				+epubdocs:
			
 
				+	@+$(foreach var,$(SPHINXDIRS),$(call loop_cmd,sphinx,epub,$(var),epub,$(var)))
			
 
				+
			
 
				+xmldocs:
			
 
				+	@+$(foreach var,$(SPHINXDIRS),$(call loop_cmd,sphinx,xml,$(var),xml,$(var)))
			
 
				+
			
 
				+endif # HAVE_SPHINX
			
 
				+
			
 
				+# The following targets are independent of HAVE_SPHINX, and the rules should
			
 
				+# work or silently pass without Sphinx.
			
 
				+
			
 
				+# no-ops for the Sphinx toolchain
			
 
				+sgmldocs:
			
 
				+	@:
			
 
				+psdocs:
			
 
				+	@:
			
 
				+mandocs:
			
 
				+	@:
			
 
				+installmandocs:
			
 
				+	@:
			
 
				+
			
 
				+cleandocs:
			
 
				+	$(Q)rm -rf $(BUILDDIR)
			
 
				+	$(Q)$(MAKE) BUILDDIR=$(abspath $(BUILDDIR)) $(build)=Documentation/media clean
			
 
				+
			
 
				+dochelp:
			
 
				+	@echo  ' Linux kernel internal documentation in different formats from ReST:'
			
 
				+	@echo  '  htmldocs        - HTML'
			
 
				+	@echo  '  latexdocs       - LaTeX'
			
 
				+	@echo  '  pdfdocs         - PDF'
			
 
				+	@echo  '  epubdocs        - EPUB'
			
 
				+	@echo  '  xmldocs         - XML'
			
 
				+	@echo  '  linkcheckdocs   - check for broken external links (will connect to external hosts)'
			
 
				+	@echo  '  cleandocs       - clean all generated files'
			
 
				+	@echo
			
 
				+	@echo  '  make SPHINXDIRS="s1 s2" [target] Generate only docs of folder s1, s2'
			
 
				+	@echo  '  valid values for SPHINXDIRS are: $(_SPHINXDIRS)'
			
 
				+	@echo
			
 
				+	@echo  '  make SPHINX_CONF={conf-file} [target] use *additional* sphinx-build'
			
 
				+	@echo  '  configuration. This is e.g. useful to build with nit-picking config.'
			
--- a/Documentation/Makefile.sphinx
+++ b/Documentation/Makefile.sphinx
@@ -1,130 +0,0 @@
 
				-# -*- makefile -*-
			
 
				-# Makefile for Sphinx documentation
			
 
				-#
			
 
				-
			
 
				-# You can set these variables from the command line.
			
 
				-SPHINXBUILD   = sphinx-build
			
 
				-SPHINXOPTS    =
			
 
				-SPHINXDIRS    = .
			
 
				-_SPHINXDIRS   = $(patsubst $(srctree)/Documentation/%/conf.py,%,$(wildcard $(srctree)/Documentation/*/conf.py))
			
 
				-SPHINX_CONF   = conf.py
			
 
				-PAPER         =
			
 
				-BUILDDIR      = $(obj)/output
			
 
				-PDFLATEX      = xelatex
			
 
				-LATEXOPTS     = -interaction=batchmode
			
 
				-
			
 
				-# User-friendly check for sphinx-build
			
 
				-HAVE_SPHINX := $(shell if which $(SPHINXBUILD) >/dev/null 2>&1; then echo 1; else echo 0; fi)
			
 
				-
			
 
				-ifeq ($(HAVE_SPHINX),0)
			
 
				-
			
 
				-.DEFAULT:
			
 
				-	$(warning The '$(SPHINXBUILD)' command was not found. Make sure you have Sphinx installed and in PATH, or set the SPHINXBUILD make variable to point to the full path of the '$(SPHINXBUILD)' executable.)
			
 
				-	@echo "  SKIP    Sphinx $@ target."
			
 
				-
			
 
				-else ifneq ($(DOCBOOKS),)
			
 
				-
			
 
				-# Skip Sphinx build if the user explicitly requested DOCBOOKS.
			
 
				-.DEFAULT:
			
 
				-	@echo "  SKIP    Sphinx $@ target (DOCBOOKS specified)."
			
 
				-
			
 
				-else # HAVE_SPHINX
			
 
				-
			
 
				-# User-friendly check for pdflatex
			
 
				-HAVE_PDFLATEX := $(shell if which $(PDFLATEX) >/dev/null 2>&1; then echo 1; else echo 0; fi)
			
 
				-
			
 
				-# Internal variables.
			
 
				-PAPEROPT_a4     = -D latex_paper_size=a4
			
 
				-PAPEROPT_letter = -D latex_paper_size=letter
			
 
				-KERNELDOC       = $(srctree)/scripts/kernel-doc
			
 
				-KERNELDOC_CONF  = -D kerneldoc_srctree=$(srctree) -D kerneldoc_bin=$(KERNELDOC)
			
 
				-ALLSPHINXOPTS   =  $(KERNELDOC_CONF) $(PAPEROPT_$(PAPER)) $(SPHINXOPTS)
			
 
				-# the i18n builder cannot share the environment and doctrees with the others
			
 
				-I18NSPHINXOPTS  = $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) .
			
 
				-
			
 
				-# commands; the 'cmd' from scripts/Kbuild.include is not *loopable*
			
 
				-loop_cmd = $(echo-cmd) $(cmd_$(1)) || exit;
			
 
				-
			
 
				-# $2 sphinx builder e.g. "html"
			
 
				-# $3 name of the build subfolder / e.g. "media", used as:
			
 
				-#    * dest folder relative to $(BUILDDIR) and
			
 
				-#    * cache folder relative to $(BUILDDIR)/.doctrees
			
 
				-# $4 dest subfolder e.g. "man" for man pages at media/man
			
 
				-# $5 reST source folder relative to $(srctree)/$(src),
			
 
				-#    e.g. "media" for the linux-tv book-set at ./Documentation/media
			
 
				-
			
 
				-quiet_cmd_sphinx = SPHINX  $@ --> file://$(abspath $(BUILDDIR)/$3/$4)
			
 
				-      cmd_sphinx = $(MAKE) BUILDDIR=$(abspath $(BUILDDIR)) $(build)=Documentation/media $2 && \
			
 
				-	PYTHONDONTWRITEBYTECODE=1 \
			
 
				-	BUILDDIR=$(abspath $(BUILDDIR)) SPHINX_CONF=$(abspath $(srctree)/$(src)/$5/$(SPHINX_CONF)) \
			
 
				-	$(SPHINXBUILD) \
			
 
				-	-b $2 \
			
 
				-	-c $(abspath $(srctree)/$(src)) \
			
 
				-	-d $(abspath $(BUILDDIR)/.doctrees/$3) \
			
 
				-	-D version=$(KERNELVERSION) -D release=$(KERNELRELEASE) \
			
 
				-	$(ALLSPHINXOPTS) \
			
 
				-	$(abspath $(srctree)/$(src)/$5) \
			
 
				-	$(abspath $(BUILDDIR)/$3/$4)
			
 
				-
			
 
				-htmldocs:
			
 
				-	@+$(foreach var,$(SPHINXDIRS),$(call loop_cmd,sphinx,html,$(var),,$(var)))
			
 
				-
			
 
				-linkcheckdocs:
			
 
				-	@$(foreach var,$(SPHINXDIRS),$(call loop_cmd,sphinx,linkcheck,$(var),,$(var)))
			
 
				-
			
 
				-latexdocs:
			
 
				-	@+$(foreach var,$(SPHINXDIRS),$(call loop_cmd,sphinx,latex,$(var),latex,$(var)))
			
 
				-
			
 
				-ifeq ($(HAVE_PDFLATEX),0)
			
 
				-
			
 
				-pdfdocs:
			
 
				-	$(warning The '$(PDFLATEX)' command was not found. Make sure you have it installed and in PATH to produce PDF output.)
			
 
				-	@echo "  SKIP    Sphinx $@ target."
			
 
				-
			
 
				-else # HAVE_PDFLATEX
			
 
				-
			
 
				-pdfdocs: latexdocs
			
 
				-	$(foreach var,$(SPHINXDIRS), $(MAKE) PDFLATEX=$(PDFLATEX) LATEXOPTS="$(LATEXOPTS)" -C $(BUILDDIR)/$(var)/latex || exit;)
			
 
				-
			
 
				-endif # HAVE_PDFLATEX
			
 
				-
			
 
				-epubdocs:
			
 
				-	@+$(foreach var,$(SPHINXDIRS),$(call loop_cmd,sphinx,epub,$(var),epub,$(var)))
			
 
				-
			
 
				-xmldocs:
			
 
				-	@+$(foreach var,$(SPHINXDIRS),$(call loop_cmd,sphinx,xml,$(var),xml,$(var)))
			
 
				-
			
 
				-endif # HAVE_SPHINX
			
 
				-
			
 
				-# The following targets are independent of HAVE_SPHINX, and the rules should
			
 
				-# work or silently pass without Sphinx.
			
 
				-
			
 
				-# no-ops for the Sphinx toolchain
			
 
				-sgmldocs:
			
 
				-	@:
			
 
				-psdocs:
			
 
				-	@:
			
 
				-mandocs:
			
 
				-	@:
			
 
				-installmandocs:
			
 
				-	@:
			
 
				-
			
 
				-cleandocs:
			
 
				-	$(Q)rm -rf $(BUILDDIR)
			
 
				-	$(Q)$(MAKE) BUILDDIR=$(abspath $(BUILDDIR)) $(build)=Documentation/media clean
			
 
				-
			
 
				-dochelp:
			
 
				-	@echo  ' Linux kernel internal documentation in different formats (Sphinx):'
			
 
				-	@echo  '  htmldocs        - HTML'
			
 
				-	@echo  '  latexdocs       - LaTeX'
			
 
				-	@echo  '  pdfdocs         - PDF'
			
 
				-	@echo  '  epubdocs        - EPUB'
			
 
				-	@echo  '  xmldocs         - XML'
			
 
				-	@echo  '  linkcheckdocs   - check for broken external links (will connect to external hosts)'
			
 
				-	@echo  '  cleandocs       - clean all generated files'
			
 
				-	@echo
			
 
				-	@echo  '  make SPHINXDIRS="s1 s2" [target] Generate only docs of folder s1, s2'
			
 
				-	@echo  '  valid values for SPHINXDIRS are: $(_SPHINXDIRS)'
			
 
				-	@echo
			
 
				-	@echo  '  make SPHINX_CONF={conf-file} [target] use *additional* sphinx-build'
			
 
				-	@echo  '  configuration. This is e.g. useful to build with nit-picking config.'
			
--- a/Documentation/PCI/MSI-HOWTO.txt
+++ b/Documentation/PCI/MSI-HOWTO.txt
@@ -186,7 +186,7 @@ must disable interrupts while the lock is held.  If the device sends
 
				 a different interrupt, the driver will deadlock trying to recursively
			
 
				 acquire the spinlock.  Such deadlocks can be avoided by using
			
 
				 spin_lock_irqsave() or spin_lock_irq() which disable local interrupts
			
 
				-and acquire the lock (see Documentation/DocBook/kernel-locking).
			
 
				+and acquire the lock (see Documentation/kernel-hacking/locking.rst).
			
 
				 
			
 
				 4.5 How to tell whether MSI/MSI-X is enabled on a device
			
 
				 
			
--- a/Documentation/RCU/00-INDEX
+++ b/Documentation/RCU/00-INDEX
@@ -28,8 +28,6 @@ stallwarn.txt
 
				 	- RCU CPU stall warnings (module parameter rcu_cpu_stall_suppress)
			
 
				 torture.txt
			
 
				 	- RCU Torture Test Operation (CONFIG_RCU_TORTURE_TEST)
			
 
				-trace.txt
			
 
				-	- CONFIG_RCU_TRACE debugfs files and formats
			
 
				 UP.txt
			
 
				 	- RCU on Uniprocessor Systems
			
 
				 whatisRCU.txt
			
--- a/Documentation/RCU/Design/Requirements/Requirements.html
+++ b/Documentation/RCU/Design/Requirements/Requirements.html
@@ -559,9 +559,7 @@ The <tt>rcu_access_pointer()</tt> on line&nbsp;6 is similar to
 
				 	For <tt>remove_gp_synchronous()</tt>, as long as all modifications
			
 
				 	to <tt>gp</tt> are carried out while holding <tt>gp_lock</tt>,
			
 
				 	the above optimizations are harmless.
			
 
				-	However,
			
 
				-	with <tt>CONFIG_SPARSE_RCU_POINTER=y</tt>,
			
 
				-	<tt>sparse</tt> will complain if you
			
 
				+	However, <tt>sparse</tt> will complain if you
			
 
				 	define <tt>gp</tt> with <tt>__rcu</tt> and then
			
 
				 	access it without using
			
 
				 	either <tt>rcu_access_pointer()</tt> or <tt>rcu_dereference()</tt>.
			
@@ -1849,7 +1847,8 @@ mass storage, or user patience, whichever comes first.
 
				 If the nesting is not visible to the compiler, as is the case with
			
 
				 mutually recursive functions each in its own translation unit,
			
 
				 stack overflow will result.
			
 
				-If the nesting takes the form of loops, either the control variable
			
 
				+If the nesting takes the form of loops, perhaps in the guise of tail
			
 
				+recursion, either the control variable
			
 
				 will overflow or (in the Linux kernel) you will get an RCU CPU stall warning.
			
 
				 Nevertheless, this class of RCU implementations is one
			
 
				 of the most composable constructs in existence.
			
@@ -1977,9 +1976,8 @@ guard against mishaps and misuse:
 
				 	and <tt>rcu_dereference()</tt>, perhaps (incorrectly)
			
 
				 	substituting a simple assignment.
			
 
				 	To catch this sort of error, a given RCU-protected pointer may be
			
 
				-	tagged with <tt>__rcu</tt>, after which running sparse
			
 
				-	with <tt>CONFIG_SPARSE_RCU_POINTER=y</tt> will complain
			
 
				-	about simple-assignment accesses to that pointer.
			
 
				+	tagged with <tt>__rcu</tt>, after which sparse
			
 
				+	will complain about simple-assignment accesses to that pointer.
			
 
				 	Arnd Bergmann made me aware of this requirement, and also
			
 
				 	supplied the needed
			
 
				 	<a href="https://lwn.net/Articles/376011/">patch series</a>.
			
@@ -2036,7 +2034,7 @@ guard against mishaps and misuse:
 
				 	some other synchronization mechanism, for example, reference
			
 
				 	counting.
			
 
				 <li>	In kernels built with <tt>CONFIG_RCU_TRACE=y</tt>, RCU-related
			
 
				-	information is provided via both debugfs and event tracing.
			
 
				+	information is provided via event tracing.
			
 
				 <li>	Open-coded use of <tt>rcu_assign_pointer()</tt> and
			
 
				 	<tt>rcu_dereference()</tt> to create typical linked
			
 
				 	data structures can be surprisingly error-prone.
			
@@ -2519,11 +2517,7 @@ It is similarly socially unacceptable to interrupt an
 
				 <tt>nohz_full</tt> CPU running in userspace.
			
 
				 RCU must therefore track <tt>nohz_full</tt> userspace
			
 
				 execution.
			
 
				-And in
			
 
				-<a href="https://lwn.net/Articles/558284/"><tt>CONFIG_NO_HZ_FULL_SYSIDLE=y</tt></a>
			
 
				-kernels, RCU must separately track idle CPUs on the one hand and
			
 
				-CPUs that are either idle or executing in userspace on the other.
			
 
				-In both cases, RCU must be able to sample state at two points in
			
 
				+RCU must therefore be able to sample state at two points in
			
 
				 time, and be able to determine whether or not some other CPU spent
			
 
				 any time idle and/or executing in userspace.
			
 
				 
			
@@ -2935,6 +2929,20 @@ The reason that this is possible is that SRCU is insensitive
 
				 to whether or not a CPU is online, which means that <tt>srcu_barrier()</tt>
			
 
				 need not exclude CPU-hotplug operations.
			
 
				 
			
 
				+<p>
			
 
				+SRCU also differs from other RCU flavors in that SRCU's expedited and
			
 
				+non-expedited grace periods are implemented by the same mechanism.
			
 
				+This means that in the current SRCU implementation, expediting a
			
 
				+future grace period has the side effect of expediting all prior
			
 
				+grace periods that have not yet completed.
			
 
				+(But please note that this is a property of the current implementation,
			
 
				+not necessarily of future implementations.)
			
 
				+In addition, if SRCU has been idle for longer than the interval
			
 
				+specified by the <tt>srcutree.exp_holdoff</tt> kernel boot parameter
			
 
				+(25&nbsp;microseconds by default),
			
 
				+and if a <tt>synchronize_srcu()</tt> invocation ends this idle period,
			
 
				+that invocation will be automatically expedited.
			
 
				+
			
 
				 <p>
			
 
				 As of v4.12, SRCU's callbacks are maintained per-CPU, eliminating
			
 
				 a locking bottleneck present in prior kernel versions.
			
--- a/Documentation/RCU/checklist.txt
+++ b/Documentation/RCU/checklist.txt
@@ -413,11 +413,11 @@ over a rather long period of time, but improvements are always welcome!
 
				 	read-side critical sections.  It is the responsibility of the
			
 
				 	RCU update-side primitives to deal with this.
			
 
				 
			
 
				-17.	Use CONFIG_PROVE_RCU, CONFIG_DEBUG_OBJECTS_RCU_HEAD, and the
			
 
				-	__rcu sparse checks (enabled by CONFIG_SPARSE_RCU_POINTER) to
			
 
				-	validate your RCU code.  These can help find problems as follows:
			
 
				+17.	Use CONFIG_PROVE_LOCKING, CONFIG_DEBUG_OBJECTS_RCU_HEAD, and the
			
 
				+	__rcu sparse checks to validate your RCU code.	These can help
			
 
				+	find problems as follows:
			
 
				 
			
 
				-	CONFIG_PROVE_RCU: check that accesses to RCU-protected data
			
 
				+	CONFIG_PROVE_LOCKING: check that accesses to RCU-protected data
			
 
				 		structures are carried out under the proper RCU
			
 
				 		read-side critical section, while holding the right
			
 
				 		combination of locks, or whatever other conditions
			
--- a/Documentation/RCU/trace.txt
+++ b/Documentation/RCU/trace.txt
@@ -1,535 +0,0 @@
 
				-CONFIG_RCU_TRACE debugfs Files and Formats
			
 
				-
			
 
				-
			
 
				-The rcutree and rcutiny implementations of RCU provide debugfs trace
			
 
				-output that summarizes counters and state.  This information is useful for
			
 
				-debugging RCU itself, and can sometimes also help to debug abuses of RCU.
			
 
				-The following sections describe the debugfs files and formats, first
			
 
				-for rcutree and next for rcutiny.
			
 
				-
			
 
				-
			
 
				-CONFIG_TREE_RCU and CONFIG_PREEMPT_RCU debugfs Files and Formats
			
 
				-
			
 
				-These implementations of RCU provide several debugfs directories under the
			
 
				-top-level directory "rcu":
			
 
				-
			
 
				-rcu/rcu_bh
			
 
				-rcu/rcu_preempt
			
 
				-rcu/rcu_sched
			
 
				-
			
 
				-Each directory contains files for the corresponding flavor of RCU.
			
 
				-Note that rcu/rcu_preempt is only present for CONFIG_PREEMPT_RCU.
			
 
				-For CONFIG_TREE_RCU, the RCU flavor maps onto the RCU-sched flavor,
			
 
				-so that activity for both appears in rcu/rcu_sched.
			
 
				-
			
 
				-In addition, the following file appears in the top-level directory:
			
 
				-rcu/rcutorture.  This file displays rcutorture test progress.  The output
			
 
				-of "cat rcu/rcutorture" looks as follows:
			
 
				-
			
 
				-rcutorture test sequence: 0 (test in progress)
			
 
				-rcutorture update version number: 615
			
 
				-
			
 
				-The first line shows the number of rcutorture tests that have completed
			
 
				-since boot.  If a test is currently running, the "(test in progress)"
			
 
				-string will appear as shown above.  The second line shows the number of
			
 
				-update cycles that the current test has started, or zero if there is
			
 
				-no test in progress.
			
 
				-
			
 
				-
			
 
				-Within each flavor directory (rcu/rcu_bh, rcu/rcu_sched, and possibly
			
 
				-also rcu/rcu_preempt) the following files will be present:
			
 
				-
			
 
				-rcudata:
			
 
				-	Displays fields in struct rcu_data.
			
 
				-rcuexp:
			
 
				-	Displays statistics for expedited grace periods.
			
 
				-rcugp:
			
 
				-	Displays grace-period counters.
			
 
				-rcuhier:
			
 
				-	Displays the struct rcu_node hierarchy.
			
 
				-rcu_pending:
			
 
				-	Displays counts of the reasons rcu_pending() decided that RCU had
			
 
				-	work to do.
			
 
				-rcuboost:
			
 
				-	Displays RCU boosting statistics.  Only present if
			
 
				-	CONFIG_RCU_BOOST=y.
			
 
				-
			
 
				-The output of "cat rcu/rcu_preempt/rcudata" looks as follows:
			
 
				-
			
 
				-  0!c=30455 g=30456 cnq=1/0:1 dt=126535/140000000000000/0 df=2002 of=4 ql=0/0 qs=N... b=10 ci=74572 nci=0 co=1131 ca=716
			
 
				-  1!c=30719 g=30720 cnq=1/0:0 dt=132007/140000000000000/0 df=1874 of=10 ql=0/0 qs=N... b=10 ci=123209 nci=0 co=685 ca=982
			
 
				-  2!c=30150 g=30151 cnq=1/1:1 dt=138537/140000000000000/0 df=1707 of=8 ql=0/0 qs=N... b=10 ci=80132 nci=0 co=1328 ca=1458
			
 
				-  3 c=31249 g=31250 cnq=1/1:0 dt=107255/140000000000000/0 df=1749 of=6 ql=0/450 qs=NRW. b=10 ci=151700 nci=0 co=509 ca=622
			
 
				-  4!c=29502 g=29503 cnq=1/0:1 dt=83647/140000000000000/0 df=965 of=5 ql=0/0 qs=N... b=10 ci=65643 nci=0 co=1373 ca=1521
			
 
				-  5 c=31201 g=31202 cnq=1/0:1 dt=70422/0/0 df=535 of=7 ql=0/0 qs=.... b=10 ci=58500 nci=0 co=764 ca=698
			
 
				-  6!c=30253 g=30254 cnq=1/0:1 dt=95363/140000000000000/0 df=780 of=5 ql=0/0 qs=N... b=10 ci=100607 nci=0 co=1414 ca=1353
			
 
				-  7 c=31178 g=31178 cnq=1/0:0 dt=91536/0/0 df=547 of=4 ql=0/0 qs=.... b=10 ci=109819 nci=0 co=1115 ca=969
			
 
				-
			
 
				-This file has one line per CPU, or eight for this 8-CPU system.
			
 
				-The fields are as follows:
			
 
				-
			
 
				-o	The number at the beginning of each line is the CPU number.
			
 
				-	CPUs numbers followed by an exclamation mark are offline,
			
 
				-	but have been online at least once since boot.	There will be
			
 
				-	no output for CPUs that have never been online, which can be
			
 
				-	a good thing in the surprisingly common case where NR_CPUS is
			
 
				-	substantially larger than the number of actual CPUs.
			
 
				-
			
 
				-o	"c" is the count of grace periods that this CPU believes have
			
 
				-	completed.  Offlined CPUs and CPUs in dynticks idle mode may lag
			
 
				-	quite a ways behind, for example, CPU 4 under "rcu_sched" above,
			
 
				-	which has been offline through 16 RCU grace periods.  It is not
			
 
				-	unusual to see offline CPUs lagging by thousands of grace periods.
			
 
				-	Note that although the grace-period number is an unsigned long,
			
 
				-	it is printed out as a signed long to allow more human-friendly
			
 
				-	representation near boot time.
			
 
				-
			
 
				-o	"g" is the count of grace periods that this CPU believes have
			
 
				-	started.  Again, offlined CPUs and CPUs in dynticks idle mode
			
 
				-	may lag behind.  If the "c" and "g" values are equal, this CPU
			
 
				-	has already reported a quiescent state for the last RCU grace
			
 
				-	period that it is aware of, otherwise, the CPU believes that it
			
 
				-	owes RCU a quiescent state.
			
 
				-
			
 
				-o	"pq" indicates that this CPU has passed through a quiescent state
			
 
				-	for the current grace period.  It is possible for "pq" to be
			
 
				-	"1" and "c" different than "g", which indicates that although
			
 
				-	the CPU has passed through a quiescent state, either (1) this
			
 
				-	CPU has not yet reported that fact, (2) some other CPU has not
			
 
				-	yet reported for this grace period, or (3) both.
			
 
				-
			
 
				-o	"qp" indicates that RCU still expects a quiescent state from
			
 
				-	this CPU.  Offlined CPUs and CPUs in dyntick idle mode might
			
 
				-	well have qp=1, which is OK: RCU is still ignoring them.
			
 
				-
			
 
				-o	"dt" is the current value of the dyntick counter that is incremented
			
 
				-	when entering or leaving idle, either due to a context switch or
			
 
				-	due to an interrupt.  This number is even if the CPU is in idle
			
 
				-	from RCU's viewpoint and odd otherwise.  The number after the
			
 
				-	first "/" is the interrupt nesting depth when in idle state,
			
 
				-	or a large number added to the interrupt-nesting depth when
			
 
				-	running a non-idle task.  Some architectures do not accurately
			
 
				-	count interrupt nesting when running in non-idle kernel context,
			
 
				-	which can result in interesting anomalies such as negative
			
 
				-	interrupt-nesting levels.  The number after the second "/"
			
 
				-	is the NMI nesting depth.
			
 
				-
			
 
				-o	"df" is the number of times that some other CPU has forced a
			
 
				-	quiescent state on behalf of this CPU due to this CPU being in
			
 
				-	idle state.
			
 
				-
			
 
				-o	"of" is the number of times that some other CPU has forced a
			
 
				-	quiescent state on behalf of this CPU due to this CPU being
			
 
				-	offline.  In a perfect world, this might never happen, but it
			
 
				-	turns out that offlining and onlining a CPU can take several grace
			
 
				-	periods, and so there is likely to be an extended period of time
			
 
				-	when RCU believes that the CPU is online when it really is not.
			
 
				-	Please note that erring in the other direction (RCU believing a
			
 
				-	CPU is offline when it is really alive and kicking) is a fatal
			
 
				-	error, so it makes sense to err conservatively.
			
 
				-
			
 
				-o	"ql" is the number of RCU callbacks currently residing on
			
 
				-	this CPU.  The first number is the number of "lazy" callbacks
			
 
				-	that are known to RCU to only be freeing memory, and the number
			
 
				-	after the "/" is the total number of callbacks, lazy or not.
			
 
				-	These counters count callbacks regardless of what phase of
			
 
				-	grace-period processing that they are in (new, waiting for
			
 
				-	grace period to start, waiting for grace period to end, ready
			
 
				-	to invoke).
			
 
				-
			
 
				-o	"qs" gives an indication of the state of the callback queue
			
 
				-	with four characters:
			
 
				-
			
 
				-	"N"	Indicates that there are callbacks queued that are not
			
 
				-		ready to be handled by the next grace period, and thus
			
 
				-		will be handled by the grace period following the next
			
 
				-		one.
			
 
				-
			
 
				-	"R"	Indicates that there are callbacks queued that are
			
 
				-		ready to be handled by the next grace period.
			
 
				-
			
 
				-	"W"	Indicates that there are callbacks queued that are
			
 
				-		waiting on the current grace period.
			
 
				-
			
 
				-	"D"	Indicates that there are callbacks queued that have
			
 
				-		already been handled by a prior grace period, and are
			
 
				-		thus waiting to be invoked.  Note that callbacks in
			
 
				-		the process of being invoked are not counted here.
			
 
				-		Callbacks in the process of being invoked are those
			
 
				-		that have been removed from the rcu_data structures
			
 
				-		queues by rcu_do_batch(), but which have not yet been
			
 
				-		invoked.
			
 
				-
			
 
				-	If there are no callbacks in a given one of the above states,
			
 
				-	the corresponding character is replaced by ".".
			
 
				-
			
 
				-o	"b" is the batch limit for this CPU.  If more than this number
			
 
				-	of RCU callbacks is ready to invoke, then the remainder will
			
 
				-	be deferred.
			
 
				-
			
 
				-o	"ci" is the number of RCU callbacks that have been invoked for
			
 
				-	this CPU.  Note that ci+nci+ql is the number of callbacks that have
			
 
				-	been registered in absence of CPU-hotplug activity.
			
 
				-
			
 
				-o	"nci" is the number of RCU callbacks that have been offloaded from
			
 
				-	this CPU.  This will always be zero unless the kernel was built
			
 
				-	with CONFIG_RCU_NOCB_CPU=y and the "rcu_nocbs=" kernel boot
			
 
				-	parameter was specified.
			
 
				-
			
 
				-o	"co" is the number of RCU callbacks that have been orphaned due to
			
 
				-	this CPU going offline.  These orphaned callbacks have been moved
			
 
				-	to an arbitrarily chosen online CPU.
			
 
				-
			
 
				-o	"ca" is the number of RCU callbacks that have been adopted by this
			
 
				-	CPU due to other CPUs going offline.  Note that ci+co-ca+ql is
			
 
				-	the number of RCU callbacks registered on this CPU.
			
 
				-
			
 
				-
			
 
				-Kernels compiled with CONFIG_RCU_BOOST=y display the following from
			
 
				-/debug/rcu/rcu_preempt/rcudata:
			
 
				-
			
 
				-  0!c=12865 g=12866 cnq=1/0:1 dt=83113/140000000000000/0 df=288 of=11 ql=0/0 qs=N... kt=0/O ktl=944 b=10 ci=60709 nci=0 co=748 ca=871
			
 
				-  1 c=14407 g=14408 cnq=1/0:0 dt=100679/140000000000000/0 df=378 of=7 ql=0/119 qs=NRW. kt=0/W ktl=9b6 b=10 ci=109740 nci=0 co=589 ca=485
			
 
				-  2 c=14407 g=14408 cnq=1/0:0 dt=105486/0/0 df=90 of=9 ql=0/89 qs=NRW. kt=0/W ktl=c0c b=10 ci=83113 nci=0 co=533 ca=490
			
 
				-  3 c=14407 g=14408 cnq=1/0:0 dt=107138/0/0 df=142 of=8 ql=0/188 qs=NRW. kt=0/W ktl=b96 b=10 ci=121114 nci=0 co=426 ca=290
			
 
				-  4 c=14405 g=14406 cnq=1/0:1 dt=50238/0/0 df=706 of=7 ql=0/0 qs=.... kt=0/W ktl=812 b=10 ci=34929 nci=0 co=643 ca=114
			
 
				-  5!c=14168 g=14169 cnq=1/0:0 dt=45465/140000000000000/0 df=161 of=11 ql=0/0 qs=N... kt=0/O ktl=b4d b=10 ci=47712 nci=0 co=677 ca=722
			
 
				-  6 c=14404 g=14405 cnq=1/0:0 dt=59454/0/0 df=94 of=6 ql=0/0 qs=.... kt=0/W ktl=e57 b=10 ci=55597 nci=0 co=701 ca=811
			
 
				-  7 c=14407 g=14408 cnq=1/0:1 dt=68850/0/0 df=31 of=8 ql=0/0 qs=.... kt=0/W ktl=14bd b=10 ci=77475 nci=0 co=508 ca=1042
			
 
				-
			
 
				-This is similar to the output discussed above, but contains the following
			
 
				-additional fields:
			
 
				-
			
 
				-o	"kt" is the per-CPU kernel-thread state.  The digit preceding
			
 
				-	the first slash is zero if there is no work pending and 1
			
 
				-	otherwise.  The character between the first pair of slashes is
			
 
				-	as follows:
			
 
				-
			
 
				-	"S"	The kernel thread is stopped, in other words, all
			
 
				-		CPUs corresponding to this rcu_node structure are
			
 
				-		offline.
			
 
				-
			
 
				-	"R"	The kernel thread is running.
			
 
				-
			
 
				-	"W"	The kernel thread is waiting because there is no work
			
 
				-		for it to do.
			
 
				-
			
 
				-	"O"	The kernel thread is waiting because it has been
			
 
				-		forced off of its designated CPU or because its
			
 
				-		->cpus_allowed mask permits it to run on other than
			
 
				-		its designated CPU.
			
 
				-
			
 
				-	"Y"	The kernel thread is yielding to avoid hogging CPU.
			
 
				-
			
 
				-	"?"	Unknown value, indicates a bug.
			
 
				-
			
 
				-	The number after the final slash is the CPU that the kthread
			
 
				-	is actually running on.
			
 
				-
			
 
				-	This field is displayed only for CONFIG_RCU_BOOST kernels.
			
 
				-
			
 
				-o	"ktl" is the low-order 16 bits (in hexadecimal) of the count of
			
 
				-	the number of times that this CPU's per-CPU kthread has gone
			
 
				-	through its loop servicing invoke_rcu_cpu_kthread() requests.
			
 
				-
			
 
				-	This field is displayed only for CONFIG_RCU_BOOST kernels.
			
 
				-
			
 
				-
			
 
				-The output of "cat rcu/rcu_preempt/rcuexp" looks as follows:
			
 
				-
			
 
				-s=21872 wd1=0 wd2=0 wd3=5 enq=0 sc=21872
			
 
				-
			
 
				-These fields are as follows:
			
 
				-
			
 
				-o	"s" is the sequence number, with an odd number indicating that
			
 
				-	an expedited grace period is in progress.
			
 
				-
			
 
				-o	"wd1", "wd2", and "wd3" are the number of times that an attempt
			
 
				-	to start an expedited grace period found that someone else had
			
 
				-	completed an expedited grace period that satisfies the attempted
			
 
				-	request.  "Our work is done."
			
 
				-
			
 
				-o	"enq" is the number of quiescent states still outstanding.
			
 
				-
			
 
				-o	"sc" is the number of times that the attempt to start a
			
 
				-	new expedited grace period succeeded.
			
 
				-
			
 
				-
			
 
				-The output of "cat rcu/rcu_preempt/rcugp" looks as follows:
			
 
				-
			
 
				-completed=31249  gpnum=31250  age=1  max=18
			
 
				-
			
 
				-These fields are taken from the rcu_state structure, and are as follows:
			
 
				-
			
 
				-o	"completed" is the number of grace periods that have completed.
			
 
				-	It is comparable to the "c" field from rcu/rcudata in that a
			
 
				-	CPU whose "c" field matches the value of "completed" is aware
			
 
				-	that the corresponding RCU grace period has completed.
			
 
				-
			
 
				-o	"gpnum" is the number of grace periods that have started.  It is
			
 
				-	similarly comparable to the "g" field from rcu/rcudata in that
			
 
				-	a CPU whose "g" field matches the value of "gpnum" is aware that
			
 
				-	the corresponding RCU grace period has started.
			
 
				-
			
 
				-	If these two fields are equal, then there is no grace period
			
 
				-	in progress, in other words, RCU is idle.  On the other hand,
			
 
				-	if the two fields differ (as they are above), then an RCU grace
			
 
				-	period is in progress.
			
 
				-
			
 
				-o	"age" is the number of jiffies that the current grace period
			
 
				-	has extended for, or zero if there is no grace period currently
			
 
				-	in effect.
			
 
				-
			
 
				-o	"max" is the age in jiffies of the longest-duration grace period
			
 
				-	thus far.
			
 
				-
			
 
				-The output of "cat rcu/rcu_preempt/rcuhier" looks as follows:
			
 
				-
			
 
				-c=14407 g=14408 s=0 jfq=2 j=c863 nfqs=12040/nfqsng=0(12040) fqlh=1051 oqlen=0/0
			
 
				-3/3 ..>. 0:7 ^0
			
 
				-e/e ..>. 0:3 ^0    d/d ..>. 4:7 ^1
			
 
				-
			
 
				-The fields are as follows:
			
 
				-
			
 
				-o	"c" is exactly the same as "completed" under rcu/rcu_preempt/rcugp.
			
 
				-
			
 
				-o	"g" is exactly the same as "gpnum" under rcu/rcu_preempt/rcugp.
			
 
				-
			
 
				-o	"s" is the current state of the force_quiescent_state()
			
 
				-	state machine.
			
 
				-
			
 
				-o	"jfq" is the number of jiffies remaining for this grace period
			
 
				-	before force_quiescent_state() is invoked to help push things
			
 
				-	along.	Note that CPUs in idle mode throughout the grace period
			
 
				-	will not report on their own, but rather must be check by some
			
 
				-	other CPU via force_quiescent_state().
			
 
				-
			
 
				-o	"j" is the low-order four hex digits of the jiffies counter.
			
 
				-	Yes, Paul did run into a number of problems that turned out to
			
 
				-	be due to the jiffies counter no longer counting.  Why do you ask?
			
 
				-
			
 
				-o	"nfqs" is the number of calls to force_quiescent_state() since
			
 
				-	boot.
			
 
				-
			
 
				-o	"nfqsng" is the number of useless calls to force_quiescent_state(),
			
 
				-	where there wasn't actually a grace period active.  This can
			
 
				-	no longer happen due to grace-period processing being pushed
			
 
				-	into a kthread.  The number in parentheses is the difference
			
 
				-	between "nfqs" and "nfqsng", or the number of times that
			
 
				-	force_quiescent_state() actually did some real work.
			
 
				-
			
 
				-o	"fqlh" is the number of calls to force_quiescent_state() that
			
 
				-	exited immediately (without even being counted in nfqs above)
			
 
				-	due to contention on ->fqslock.
			
 
				-
			
 
				-o	Each element of the form "3/3 ..>. 0:7 ^0" represents one rcu_node
			
 
				-	structure.  Each line represents one level of the hierarchy,
			
 
				-	from root to leaves.  It is best to think of the rcu_data
			
 
				-	structures as forming yet another level after the leaves.
			
 
				-	Note that there might be either one, two, three, or even four
			
 
				-	levels of rcu_node structures, depending on the relationship
			
 
				-	between CONFIG_RCU_FANOUT, CONFIG_RCU_FANOUT_LEAF (possibly
			
 
				-	adjusted using the rcu_fanout_leaf kernel boot parameter), and
			
 
				-	CONFIG_NR_CPUS (possibly adjusted using the nr_cpu_ids count of
			
 
				-	possible CPUs for the booting hardware).
			
 
				-
			
 
				-	o	The numbers separated by the "/" are the qsmask followed
			
 
				-		by the qsmaskinit.  The qsmask will have one bit
			
 
				-		set for each entity in the next lower level that has
			
 
				-		not yet checked in for the current grace period ("e"
			
 
				-		indicating CPUs 5, 6, and 7 in the example above).
			
 
				-		The qsmaskinit will have one bit for each entity that is
			
 
				-		currently expected to check in during each grace period.
			
 
				-		The value of qsmaskinit is assigned to that of qsmask
			
 
				-		at the beginning of each grace period.
			
 
				-
			
 
				-	o	The characters separated by the ">" indicate the state
			
 
				-		of the blocked-tasks lists.  A "G" preceding the ">"
			
 
				-		indicates that at least one task blocked in an RCU
			
 
				-		read-side critical section blocks the current grace
			
 
				-		period, while a "E" preceding the ">" indicates that
			
 
				-		at least one task blocked in an RCU read-side critical
			
 
				-		section blocks the current expedited grace period.
			
 
				-		A "T" character following the ">" indicates that at
			
 
				-		least one task is blocked within an RCU read-side
			
 
				-		critical section, regardless of whether any current
			
 
				-		grace period (expedited or normal) is inconvenienced.
			
 
				-		A "." character appears if the corresponding condition
			
 
				-		does not hold, so that "..>." indicates that no tasks
			
 
				-		are blocked.  In contrast, "GE>T" indicates maximal
			
 
				-		inconvenience from blocked tasks.  CONFIG_TREE_RCU
			
 
				-		builds of the kernel will always show "..>.".
			
 
				-
			
 
				-	o	The numbers separated by the ":" are the range of CPUs
			
 
				-		served by this struct rcu_node.  This can be helpful
			
 
				-		in working out how the hierarchy is wired together.
			
 
				-
			
 
				-		For example, the example rcu_node structure shown above
			
 
				-		has "0:7", indicating that it covers CPUs 0 through 7.
			
 
				-
			
 
				-	o	The number after the "^" indicates the bit in the
			
 
				-		next higher level rcu_node structure that this rcu_node
			
 
				-		structure corresponds to.  For example, the "d/d ..>. 4:7
			
 
				-		^1" has a "1" in this position, indicating that it
			
 
				-		corresponds to the "1" bit in the "3" shown in the
			
 
				-		"3/3 ..>. 0:7 ^0" entry on the next level up.
			
 
				-
			
 
				-
			
 
				-The output of "cat rcu/rcu_sched/rcu_pending" looks as follows:
			
 
				-
			
 
				-  0!np=26111 qsp=29 rpq=5386 cbr=1 cng=570 gpc=3674 gps=577 nn=15903 ndw=0
			
 
				-  1!np=28913 qsp=35 rpq=6097 cbr=1 cng=448 gpc=3700 gps=554 nn=18113 ndw=0
			
 
				-  2!np=32740 qsp=37 rpq=6202 cbr=0 cng=476 gpc=4627 gps=546 nn=20889 ndw=0
			
 
				-  3 np=23679 qsp=22 rpq=5044 cbr=1 cng=415 gpc=3403 gps=347 nn=14469 ndw=0
			
 
				-  4!np=30714 qsp=4 rpq=5574 cbr=0 cng=528 gpc=3931 gps=639 nn=20042 ndw=0
			
 
				-  5 np=28910 qsp=2 rpq=5246 cbr=0 cng=428 gpc=4105 gps=709 nn=18422 ndw=0
			
 
				-  6!np=38648 qsp=5 rpq=7076 cbr=0 cng=840 gpc=4072 gps=961 nn=25699 ndw=0
			
 
				-  7 np=37275 qsp=2 rpq=6873 cbr=0 cng=868 gpc=3416 gps=971 nn=25147 ndw=0
			
 
				-
			
 
				-The fields are as follows:
			
 
				-
			
 
				-o	The leading number is the CPU number, with "!" indicating
			
 
				-	an offline CPU.
			
 
				-
			
 
				-o	"np" is the number of times that __rcu_pending() has been invoked
			
 
				-	for the corresponding flavor of RCU.
			
 
				-
			
 
				-o	"qsp" is the number of times that the RCU was waiting for a
			
 
				-	quiescent state from this CPU.
			
 
				-
			
 
				-o	"rpq" is the number of times that the CPU had passed through
			
 
				-	a quiescent state, but not yet reported it to RCU.
			
 
				-
			
 
				-o	"cbr" is the number of times that this CPU had RCU callbacks
			
 
				-	that had passed through a grace period, and were thus ready
			
 
				-	to be invoked.
			
 
				-
			
 
				-o	"cng" is the number of times that this CPU needed another
			
 
				-	grace period while RCU was idle.
			
 
				-
			
 
				-o	"gpc" is the number of times that an old grace period had
			
 
				-	completed, but this CPU was not yet aware of it.
			
 
				-
			
 
				-o	"gps" is the number of times that a new grace period had started,
			
 
				-	but this CPU was not yet aware of it.
			
 
				-
			
 
				-o	"ndw" is the number of times that a wakeup of an rcuo
			
 
				-	callback-offload kthread had to be deferred in order to avoid
			
 
				-	deadlock.
			
 
				-
			
 
				-o	"nn" is the number of times that this CPU needed nothing.
			
 
				-
			
 
				-
			
 
				-The output of "cat rcu/rcuboost" looks as follows:
			
 
				-
			
 
				-0:3 tasks=.... kt=W ntb=0 neb=0 nnb=0 j=c864 bt=c894
			
 
				-    balk: nt=0 egt=4695 bt=0 nb=0 ny=56 nos=0
			
 
				-4:7 tasks=.... kt=W ntb=0 neb=0 nnb=0 j=c864 bt=c894
			
 
				-    balk: nt=0 egt=6541 bt=0 nb=0 ny=126 nos=0
			
 
				-
			
 
				-This information is output only for rcu_preempt.  Each two-line entry
			
 
				-corresponds to a leaf rcu_node structure.  The fields are as follows:
			
 
				-
			
 
				-o	"n:m" is the CPU-number range for the corresponding two-line
			
 
				-	entry.  In the sample output above, the first entry covers
			
 
				-	CPUs zero through three and the second entry covers CPUs four
			
 
				-	through seven.
			
 
				-
			
 
				-o	"tasks=TNEB" gives the state of the various segments of the
			
 
				-	rnp->blocked_tasks list:
			
 
				-
			
 
				-	"T"	This indicates that there are some tasks that blocked
			
 
				-		while running on one of the corresponding CPUs while
			
 
				-		in an RCU read-side critical section.
			
 
				-
			
 
				-	"N"	This indicates that some of the blocked tasks are preventing
			
 
				-		the current normal (non-expedited) grace period from
			
 
				-		completing.
			
 
				-
			
 
				-	"E"	This indicates that some of the blocked tasks are preventing
			
 
				-		the current expedited grace period from completing.
			
 
				-
			
 
				-	"B"	This indicates that some of the blocked tasks are in
			
 
				-		need of RCU priority boosting.
			
 
				-
			
 
				-	Each character is replaced with "." if the corresponding
			
 
				-	condition does not hold.
			
 
				-
			
 
				-o	"kt" is the state of the RCU priority-boosting kernel
			
 
				-	thread associated with the corresponding rcu_node structure.
			
 
				-	The state can be one of the following:
			
 
				-
			
 
				-	"S"	The kernel thread is stopped, in other words, all
			
 
				-		CPUs corresponding to this rcu_node structure are
			
 
				-		offline.
			
 
				-
			
 
				-	"R"	The kernel thread is running.
			
 
				-
			
 
				-	"W"	The kernel thread is waiting because there is no work
			
 
				-		for it to do.
			
 
				-
			
 
				-	"Y"	The kernel thread is yielding to avoid hogging CPU.
			
 
				-
			
 
				-	"?"	Unknown value, indicates a bug.
			
 
				-
			
 
				-o	"ntb" is the number of tasks boosted.
			
 
				-
			
 
				-o	"neb" is the number of tasks boosted in order to complete an
			
 
				-	expedited grace period.
			
 
				-
			
 
				-o	"nnb" is the number of tasks boosted in order to complete a
			
 
				-	normal (non-expedited) grace period.  When boosting a task
			
 
				-	that was blocking both an expedited and a normal grace period,
			
 
				-	it is counted against the expedited total above.
			
 
				-
			
 
				-o	"j" is the low-order 16 bits of the jiffies counter in
			
 
				-	hexadecimal.
			
 
				-
			
 
				-o	"bt" is the low-order 16 bits of the value that the jiffies
			
 
				-	counter will have when we next start boosting, assuming that
			
 
				-	the current grace period does not end beforehand.  This is
			
 
				-	also in hexadecimal.
			
 
				-
			
 
				-o	"balk: nt" counts the number of times we didn't boost (in
			
 
				-	other words, we balked) even though it was time to boost because
			
 
				-	there were no blocked tasks to boost.  This situation occurs
			
 
				-	when there is one blocked task on one rcu_node structure and
			
 
				-	none on some other rcu_node structure.
			
 
				-
			
 
				-o	"egt" counts the number of times we balked because although
			
 
				-	there were blocked tasks, none of them were blocking the
			
 
				-	current grace period, whether expedited or otherwise.
			
 
				-
			
 
				-o	"bt" counts the number of times we balked because boosting
			
 
				-	had already been initiated for the current grace period.
			
 
				-
			
 
				-o	"nb" counts the number of times we balked because there
			
 
				-	was at least one task blocking the current non-expedited grace
			
 
				-	period that never had blocked.  If it is already running, it
			
 
				-	just won't help to boost its priority!
			
 
				-
			
 
				-o	"ny" counts the number of times we balked because it was
			
 
				-	not yet time to start boosting.
			
 
				-
			
 
				-o	"nos" counts the number of times we balked for other
			
 
				-	reasons, e.g., the grace period ended first.
			
 
				-
			
 
				-
			
 
				-CONFIG_TINY_RCU debugfs Files and Formats
			
 
				-
			
 
				-These implementations of RCU provides a single debugfs file under the
			
 
				-top-level directory RCU, namely rcu/rcudata, which displays fields in
			
 
				-rcu_bh_ctrlblk and rcu_sched_ctrlblk.
			
 
				-
			
 
				-The output of "cat rcu/rcudata" is as follows:
			
 
				-
			
 
				-rcu_sched: qlen: 0
			
 
				-rcu_bh: qlen: 0
			
 
				-
			
 
				-This is split into rcu_sched and rcu_bh sections.  The field is as
			
 
				-follows:
			
 
				-
			
 
				-o	"qlen" is the number of RCU callbacks currently waiting either
			
 
				-	for an RCU grace period or waiting to be invoked.  This is the
			
 
				-	only field present for rcu_sched and rcu_bh, due to the
			
 
				-	short-circuiting of grace period in those two cases.
			
--- a/Documentation/SAK.txt
+++ b/Documentation/SAK.txt
@@ -1,5 +1,9 @@
 
				-Linux 2.4.2 Secure Attention Key (SAK) handling
			
 
				-18 March 2001, Andrew Morton
			
 
				+=========================================
			
 
				+Linux Secure Attention Key (SAK) handling
			
 
				+=========================================
			
 
				+
			
 
				+:Date: 18 March 2001
			
 
				+:Author: Andrew Morton
			
 
				 
			
 
				 An operating system's Secure Attention Key is a security tool which is
			
 
				 provided as protection against trojan password capturing programs.  It
			
@@ -13,7 +17,7 @@ this sequence.  It is only available if the kernel was compiled with
 
				 sysrq support.
			
 
				 
			
 
				 The proper way of generating a SAK is to define the key sequence using
			
 
				-`loadkeys'.  This will work whether or not sysrq support is compiled
			
 
				+``loadkeys``.  This will work whether or not sysrq support is compiled
			
 
				 into the kernel.
			
 
				 
			
 
				 SAK works correctly when the keyboard is in raw mode.  This means that
			
@@ -25,64 +29,63 @@ What key sequence should you use? Well, CTRL-ALT-DEL is used to reboot
 
				 the machine.  CTRL-ALT-BACKSPACE is magical to the X server.  We'll
			
 
				 choose CTRL-ALT-PAUSE.
			
 
				 
			
 
				-In your rc.sysinit (or rc.local) file, add the command
			
 
				+In your rc.sysinit (or rc.local) file, add the command::
			
 
				 
			
 
				 	echo "control alt keycode 101 = SAK" | /bin/loadkeys
			
 
				 
			
 
				 And that's it!  Only the superuser may reprogram the SAK key.
			
 
				 
			
 
				 
			
 
				-NOTES
			
 
				-=====
			
 
				+.. note::
			
 
				 
			
 
				-1: Linux SAK is said to be not a "true SAK" as is required by
			
 
				-   systems which implement C2 level security.  This author does not
			
 
				-   know why.
			
 
				+  1. Linux SAK is said to be not a "true SAK" as is required by
			
 
				+     systems which implement C2 level security.  This author does not
			
 
				+     know why.
			
 
				 
			
 
				 
			
 
				-2: On the PC keyboard, SAK kills all applications which have
			
 
				-   /dev/console opened.
			
 
				+  2. On the PC keyboard, SAK kills all applications which have
			
 
				+     /dev/console opened.
			
 
				 
			
 
				-   Unfortunately this includes a number of things which you don't
			
 
				-   actually want killed.  This is because these applications are
			
 
				-   incorrectly holding /dev/console open.  Be sure to complain to your
			
 
				-   Linux distributor about this!
			
 
				+     Unfortunately this includes a number of things which you don't
			
 
				+     actually want killed.  This is because these applications are
			
 
				+     incorrectly holding /dev/console open.  Be sure to complain to your
			
 
				+     Linux distributor about this!
			
 
				 
			
 
				-   You can identify processes which will be killed by SAK with the
			
 
				-   command
			
 
				+     You can identify processes which will be killed by SAK with the
			
 
				+     command::
			
 
				 
			
 
				 	# ls -l /proc/[0-9]*/fd/* | grep console
			
 
				 	l-wx------    1 root     root           64 Mar 18 00:46 /proc/579/fd/0 -> /dev/console
			
 
				 
			
 
				-   Then:
			
 
				+     Then::
			
 
				 
			
 
				 	# ps aux|grep 579
			
 
				 	root       579  0.0  0.1  1088  436 ?        S    00:43   0:00 gpm -t ps/2
			
 
				 
			
 
				-   So `gpm' will be killed by SAK.  This is a bug in gpm.  It should
			
 
				-   be closing standard input.  You can work around this by finding the
			
 
				-   initscript which launches gpm and changing it thusly:
			
 
				+     So ``gpm`` will be killed by SAK.  This is a bug in gpm.  It should
			
 
				+     be closing standard input.  You can work around this by finding the
			
 
				+     initscript which launches gpm and changing it thusly:
			
 
				 
			
 
				-   Old:
			
 
				+     Old::
			
 
				 
			
 
				 	daemon gpm
			
 
				 
			
 
				-   New:
			
 
				+     New::
			
 
				 
			
 
				 	daemon gpm < /dev/null
			
 
				 
			
 
				-   Vixie cron also seems to have this problem, and needs the same treatment.
			
 
				+     Vixie cron also seems to have this problem, and needs the same treatment.
			
 
				 
			
 
				-   Also, one prominent Linux distribution has the following three
			
 
				-   lines in its rc.sysinit and rc scripts:
			
 
				+     Also, one prominent Linux distribution has the following three
			
 
				+     lines in its rc.sysinit and rc scripts::
			
 
				 
			
 
				 	exec 3<&0
			
 
				 	exec 4>&1
			
 
				 	exec 5>&2
			
 
				 
			
 
				-   These commands cause *all* daemons which are launched by the
			
 
				-   initscripts to have file descriptors 3, 4 and 5 attached to
			
 
				-   /dev/console.  So SAK kills them all.  A workaround is to simply
			
 
				-   delete these lines, but this may cause system management
			
 
				-   applications to malfunction - test everything well.
			
 
				+     These commands cause **all** daemons which are launched by the
			
 
				+     initscripts to have file descriptors 3, 4 and 5 attached to
			
 
				+     /dev/console.  So SAK kills them all.  A workaround is to simply
			
 
				+     delete these lines, but this may cause system management
			
 
				+     applications to malfunction - test everything well.
			
 
				 
			
--- a/Documentation/SM501.txt
+++ b/Documentation/SM501.txt
@@ -1,7 +1,10 @@
 
				-			SM501 Driver
			
 
				-			============
			
 
				+.. include:: <isonum.txt>
			
 
				 
			
 
				-Copyright 2006, 2007 Simtec Electronics
			
 
				+============
			
 
				+SM501 Driver
			
 
				+============
			
 
				+
			
 
				+:Copyright: |copy| 2006, 2007 Simtec Electronics
			
 
				 
			
 
				 The Silicon Motion SM501 multimedia companion chip is a multifunction device
			
 
				 which may provide numerous interfaces including USB host controller USB gadget,
			
--- a/Documentation/acpi/gpio-properties.txt
+++ b/Documentation/acpi/gpio-properties.txt
@@ -156,3 +156,68 @@ pointed to by its first argument.  That should be done in the driver's .probe()
 
				 routine.  On removal, the driver should unregister its GPIO mapping table by
			
 
				 calling acpi_dev_remove_driver_gpios() on the ACPI device object where that
			
 
				 table was previously registered.
			
 
				+
			
 
				+Using the _CRS fallback
			
 
				+-----------------------
			
 
				+
			
 
				+If a device does not have _DSD or the driver does not create ACPI GPIO
			
 
				+mapping, the Linux GPIO framework refuses to return any GPIOs. This is
			
 
				+because the driver does not know what it actually gets. For example if we
			
 
				+have a device like below:
			
 
				+
			
 
				+  Device (BTH)
			
 
				+  {
			
 
				+      Name (_HID, ...)
			
 
				+
			
 
				+      Name (_CRS, ResourceTemplate () {
			
 
				+          GpioIo (Exclusive, PullNone, 0, 0, IoRestrictionNone,
			
 
				+                  "\\_SB.GPO0", 0, ResourceConsumer) {15}
			
 
				+          GpioIo (Exclusive, PullNone, 0, 0, IoRestrictionNone,
			
 
				+                  "\\_SB.GPO0", 0, ResourceConsumer) {27}
			
 
				+      })
			
 
				+  }
			
 
				+
			
 
				+The driver might expect to get the right GPIO when it does:
			
 
				+
			
 
				+  desc = gpiod_get(dev, "reset", GPIOD_OUT_LOW);
			
 
				+
			
 
				+but since there is no way to know the mapping between "reset" and
			
 
				+the GpioIo() in _CRS, desc will hold ERR_PTR(-ENOENT).
			
 
				+
			
 
				+The driver author can solve this by passing the mapping explicitly
			
 
				+(the recommended way and documented in the above chapter).
			
 
				+
			
 
				+The ACPI GPIO mapping tables should not contaminate drivers that are not
			
 
				+knowing about which exact device they are servicing on. It implies that
			
 
				+the ACPI GPIO mapping tables are tightly linked to ACPI ID and certain
			
 
				+objects, as listed in the above chapter, of the device in question.
			
 
				+
			
 
				+Getting GPIO descriptor
			
 
				+-----------------------
			
 
				+
			
 
				+There are two main approaches to get GPIO resource from ACPI:
			
 
				+	desc = gpiod_get(dev, connection_id, flags);
			
 
				+	desc = gpiod_get_index(dev, connection_id, index, flags);
			
 
				+
			
 
				+We may consider two different cases here, i.e. when connection ID is
			
 
				+provided and otherwise.
			
 
				+
			
 
				+Case 1:
			
 
				+	desc = gpiod_get(dev, "non-null-connection-id", flags);
			
 
				+	desc = gpiod_get_index(dev, "non-null-connection-id", index, flags);
			
 
				+
			
 
				+Case 2:
			
 
				+	desc = gpiod_get(dev, NULL, flags);
			
 
				+	desc = gpiod_get_index(dev, NULL, index, flags);
			
 
				+
			
 
				+Case 1 assumes that corresponding ACPI device description must have
			
 
				+defined device properties and will prevent getting any GPIO resources
			
 
				+otherwise.
			
 
				+
			
 
				+Case 2 explicitly tells GPIO core to look for resources in _CRS.
			
 
				+
			
 
				+Be aware that gpiod_get_index() in cases 1 and 2, assuming that there
			
 
				+are two versions of ACPI device description provided and no mapping is
			
 
				+present in the driver, will return different resources. That's why a
			
 
				+certain driver has to handle them carefully as explained in previous
			
 
				+chapter.
			
--- a/Documentation/admin-guide/LSM/LoadPin.rst
+++ b/Documentation/admin-guide/LSM/LoadPin.rst
@@ -1,3 +1,7 @@
 
				+=======
			
 
				+LoadPin
			
 
				+=======
			
 
				+
			
 
				 LoadPin is a Linux Security Module that ensures all kernel-loaded files
			
 
				 (modules, firmware, etc) all originate from the same filesystem, with
			
 
				 the expectation that such a filesystem is backed by a read-only device
			
@@ -5,13 +9,13 @@ such as dm-verity or CDROM. This allows systems that have a verified
 
				 and/or unchangeable filesystem to enforce module and firmware loading
			
 
				 restrictions without needing to sign the files individually.
			
 
				 
			
 
				-The LSM is selectable at build-time with CONFIG_SECURITY_LOADPIN, and
			
 
				+The LSM is selectable at build-time with ``CONFIG_SECURITY_LOADPIN``, and
			
 
				 can be controlled at boot-time with the kernel command line option
			
 
				-"loadpin.enabled". By default, it is enabled, but can be disabled at
			
 
				-boot ("loadpin.enabled=0").
			
 
				+"``loadpin.enabled``". By default, it is enabled, but can be disabled at
			
 
				+boot ("``loadpin.enabled=0``").
			
 
				 
			
 
				 LoadPin starts pinning when it sees the first file loaded. If the
			
 
				 block device backing the filesystem is not read-only, a sysctl is
			
 
				-created to toggle pinning: /proc/sys/kernel/loadpin/enabled. (Having
			
 
				+created to toggle pinning: ``/proc/sys/kernel/loadpin/enabled``. (Having
			
 
				 a mutable filesystem means pinning is mutable too, but having the
			
 
				 sysctl allows for easy testing on systems with a mutable filesystem.)
			
--- a/Documentation/admin-guide/LSM/SELinux.rst
+++ b/Documentation/admin-guide/LSM/SELinux.rst
@@ -1,27 +1,33 @@
 
				+=======
			
 
				+SELinux
			
 
				+=======
			
 
				+
			
 
				 If you want to use SELinux, chances are you will want
			
 
				 to use the distro-provided policies, or install the
			
 
				 latest reference policy release from
			
 
				+
			
 
				 	http://oss.tresys.com/projects/refpolicy
			
 
				 
			
 
				 However, if you want to install a dummy policy for
			
 
				-testing, you can do using 'mdp' provided under
			
 
				+testing, you can do so using ``mdp`` provided under
			
 
				 scripts/selinux.  Note that this requires the selinux
			
 
				 userspace to be installed - in particular you will
			
 
				 need checkpolicy to compile a kernel, and setfiles and
			
 
				 fixfiles to label the filesystem.
			
 
				 
			
 
				 	1. Compile the kernel with selinux enabled.
			
 
				-	2. Type 'make' to compile mdp.
			
 
				+	2. Type ``make`` to compile ``mdp``.
			
 
				 	3. Make sure that you are not running with
			
 
				 	   SELinux enabled and a real policy.  If
			
 
				 	   you are, reboot with selinux disabled
			
 
				 	   before continuing.
			
 
				-	4. Run install_policy.sh:
			
 
				+	4. Run install_policy.sh::
			
 
				+
			
 
				 		cd scripts/selinux
			
 
				 		sh install_policy.sh
			
 
				 
			
 
				 Step 4 will create a new dummy policy valid for your
			
 
				 kernel, with a single selinux user, role, and type.
			
 
				-It will compile the policy, will set your SELINUXTYPE to
			
 
				-dummy in /etc/selinux/config, install the compiled policy
			
 
				-as 'dummy', and relabel your filesystem.
			
 
				+It will compile the policy, will set your ``SELINUXTYPE`` to
			
 
				+``dummy`` in ``/etc/selinux/config``, install the compiled policy
			
 
				+as ``dummy``, and relabel your filesystem.
			
--- a/Documentation/admin-guide/LSM/Smack.rst
+++ b/Documentation/admin-guide/LSM/Smack.rst
@@ -1,3 +1,6 @@
 
				+=====
			
 
				+Smack
			
 
				+=====
			
 
				 
			
 
				 
			
 
				     "Good for you, you've decided to clean the elevator!"
			
@@ -14,6 +17,7 @@ available to determine which is best suited to the problem
 
				 at hand.
			
 
				 
			
 
				 Smack consists of three major components:
			
 
				+
			
 
				     - The kernel
			
 
				     - Basic utilities, which are helpful but not required
			
 
				     - Configuration data
			
@@ -39,16 +43,24 @@ The current git repository for Smack user space is:
 
				 This should make and install on most modern distributions.
			
 
				 There are five commands included in smackutil:
			
 
				 
			
 
				-chsmack    - display or set Smack extended attribute values
			
 
				-smackctl   - load the Smack access rules
			
 
				-smackaccess - report if a process with one label has access
			
 
				-              to an object with another
			
 
				+chsmack:
			
 
				+	display or set Smack extended attribute values
			
 
				+
			
 
				+smackctl:
			
 
				+	load the Smack access rules
			
 
				+
			
 
				+smackaccess:
			
 
				+	report if a process with one label has access
			
 
				+	to an object with another
			
 
				 
			
 
				 These two commands are obsolete with the introduction of
			
 
				 the smackfs/load2 and smackfs/cipso2 interfaces.
			
 
				 
			
 
				-smackload  - properly formats data for writing to smackfs/load
			
 
				-smackcipso - properly formats data for writing to smackfs/cipso
			
 
				+smackload:
			
 
				+	properly formats data for writing to smackfs/load
			
 
				+
			
 
				+smackcipso:
			
 
				+	properly formats data for writing to smackfs/cipso
			
 
				 
			
 
				 In keeping with the intent of Smack, configuration data is
			
 
				 minimal and not strictly required. The most important
			
@@ -56,15 +68,15 @@ configuration step is mounting the smackfs pseudo filesystem.
 
				 If smackutil is installed the startup script will take care
			
 
				 of this, but it can be manually as well.
			
 
				 
			
 
				-Add this line to /etc/fstab:
			
 
				+Add this line to ``/etc/fstab``::
			
 
				 
			
 
				     smackfs /sys/fs/smackfs smackfs defaults 0 0
			
 
				 
			
 
				-The /sys/fs/smackfs directory is created by the kernel.
			
 
				+The ``/sys/fs/smackfs`` directory is created by the kernel.
			
 
				 
			
 
				 Smack uses extended attributes (xattrs) to store labels on filesystem
			
 
				 objects. The attributes are stored in the extended attribute security
			
 
				-name space. A process must have CAP_MAC_ADMIN to change any of these
			
 
				+name space. A process must have ``CAP_MAC_ADMIN`` to change any of these
			
 
				 attributes.
			
 
				 
			
 
				 The extended attributes that Smack uses are:
			
@@ -73,14 +85,17 @@ SMACK64
 
				 	Used to make access control decisions. In almost all cases
			
 
				 	the label given to a new filesystem object will be the label
			
 
				 	of the process that created it.
			
 
				+
			
 
				 SMACK64EXEC
			
 
				 	The Smack label of a process that execs a program file with
			
 
				 	this attribute set will run with this attribute's value.
			
 
				+
			
 
				 SMACK64MMAP
			
 
				 	Don't allow the file to be mmapped by a process whose Smack
			
 
				 	label does not allow all of the access permitted to a process
			
 
				 	with the label contained in this attribute. This is a very
			
 
				 	specific use case for shared libraries.
			
 
				+
			
 
				 SMACK64TRANSMUTE
			
 
				 	Can only have the value "TRUE". If this attribute is present
			
 
				 	on a directory when an object is created in the directory and
			
@@ -89,27 +104,29 @@ SMACK64TRANSMUTE
 
				 	gets the label of the directory instead of the label of the
			
 
				 	creating process. If the object being created is a directory
			
 
				 	the SMACK64TRANSMUTE attribute is set as well.
			
 
				+
			
 
				 SMACK64IPIN
			
 
				 	This attribute is only available on file descriptors for sockets.
			
 
				 	Use the Smack label in this attribute for access control
			
 
				 	decisions on packets being delivered to this socket.
			
 
				+
			
 
				 SMACK64IPOUT
			
 
				 	This attribute is only available on file descriptors for sockets.
			
 
				 	Use the Smack label in this attribute for access control
			
 
				 	decisions on packets coming from this socket.
			
 
				 
			
 
				-There are multiple ways to set a Smack label on a file:
			
 
				+There are multiple ways to set a Smack label on a file::
			
 
				 
			
 
				     # attr -S -s SMACK64 -V "value" path
			
 
				     # chsmack -a value path
			
 
				 
			
 
				 A process can see the Smack label it is running with by
			
 
				-reading /proc/self/attr/current. A process with CAP_MAC_ADMIN
			
 
				+reading ``/proc/self/attr/current``. A process with ``CAP_MAC_ADMIN``
			
 
				 can set the process Smack by writing there.
			
 
				 
			
 
				 Most Smack configuration is accomplished by writing to files
			
 
				 in the smackfs filesystem. This pseudo-filesystem is mounted
			
 
				-on /sys/fs/smackfs.
			
 
				+on ``/sys/fs/smackfs``.
			
 
				 
			
 
				 access
			
 
				 	Provided for backward compatibility. The access2 interface
			
@@ -120,6 +137,7 @@ access
 
				 	this file. The next read will indicate whether the access
			
 
				 	would be permitted. The text will be either "1" indicating
			
 
				 	access, or "0" indicating denial.
			
 
				+
			
 
				 access2
			
 
				 	This interface reports whether a subject with the specified
			
 
				 	Smack label has a particular access to an object with a
			
@@ -127,13 +145,17 @@ access2
 
				 	this file. The next read will indicate whether the access
			
 
				 	would be permitted. The text will be either "1" indicating
			
 
				 	access, or "0" indicating denial.
			
 
				+
			
 
				 ambient
			
 
				 	This contains the Smack label applied to unlabeled network
			
 
				 	packets.
			
 
				+
			
 
				 change-rule
			
 
				 	This interface allows modification of existing access control rules.
			
 
				-	The format accepted on write is:
			
 
				+	The format accepted on write is::
			
 
				+
			
 
				 		"%s %s %s %s"
			
 
				+
			
 
				 	where the first string is the subject label, the second the
			
 
				 	object label, the third the access to allow and the fourth the
			
 
				 	access to deny. The access strings may contain only the characters
			
@@ -141,47 +163,63 @@ change-rule
 
				 	modified by enabling the permissions in the third string and disabling
			
 
				 	those in the fourth string. If there is no such rule it will be
			
 
				 	created using the access specified in the third and the fourth strings.
			
 
				+
			
 
				 cipso
			
 
				 	Provided for backward compatibility. The cipso2 interface
			
 
				 	is preferred and should be used instead.
			
 
				 	This interface allows a specific CIPSO header to be assigned
			
 
				-	to a Smack label. The format accepted on write is:
			
 
				+	to a Smack label. The format accepted on write is::
			
 
				+
			
 
				 		"%24s%4d%4d"["%4d"]...
			
 
				+
			
 
				 	The first string is a fixed Smack label. The first number is
			
 
				 	the level to use. The second number is the number of categories.
			
 
				-	The following numbers are the categories.
			
 
				-	"level-3-cats-5-19          3   2   5  19"
			
 
				+	The following numbers are the categories::
			
 
				+
			
 
				+		"level-3-cats-5-19          3   2   5  19"
			
 
				+
			
 
				 cipso2
			
 
				 	This interface allows a specific CIPSO header to be assigned
			
 
				-	to a Smack label. The format accepted on write is:
			
 
				-	"%s%4d%4d"["%4d"]...
			
 
				+	to a Smack label. The format accepted on write is::
			
 
				+
			
 
				+		"%s%4d%4d"["%4d"]...
			
 
				+
			
 
				 	The first string is a long Smack label. The first number is
			
 
				 	the level to use. The second number is the number of categories.
			
 
				-	The following numbers are the categories.
			
 
				-	"level-3-cats-5-19   3   2   5  19"
			
 
				+	The following numbers are the categories::
			
 
				+
			
 
				+		"level-3-cats-5-19   3   2   5  19"
			
 
				+
			
 
				 direct
			
 
				 	This contains the CIPSO level used for Smack direct label
			
 
				 	representation in network packets.
			
 
				+
			
 
				 doi
			
 
				 	This contains the CIPSO domain of interpretation used in
			
 
				 	network packets.
			
 
				+
			
 
				 ipv6host
			
 
				 	This interface allows specific IPv6 internet addresses to be
			
 
				 	treated as single label hosts. Packets are sent to single
			
 
				 	label hosts only from processes that have Smack write access
			
 
				 	to the host label. All packets received from single label hosts
			
 
				-	are given the specified label. The format accepted on write is:
			
 
				+	are given the specified label. The format accepted on write is::
			
 
				+
			
 
				 		"%h:%h:%h:%h:%h:%h:%h:%h label" or
			
 
				 		"%h:%h:%h:%h:%h:%h:%h:%h/%d label".
			
 
				+
			
 
				 	The "::" address shortcut is not supported.
			
 
				 	If label is "-DELETE" a matched entry will be deleted.
			
 
				+
			
 
				 load
			
 
				 	Provided for backward compatibility. The load2 interface
			
 
				 	is preferred and should be used instead.
			
 
				 	This interface allows access control rules in addition to
			
 
				 	the system defined rules to be specified. The format accepted
			
 
				-	on write is:
			
 
				+	on write is::
			
 
				+
			
 
				 		"%24s%24s%5s"
			
 
				+
			
 
				 	where the first string is the subject label, the second the
			
 
				 	object label, and the third the requested access. The access
			
 
				 	string may contain only the characters "rwxat-", and specifies
			
@@ -189,17 +227,21 @@ load
 
				 	permissions that are not allowed. The string "r-x--" would
			
 
				 	specify read and execute access. Labels are limited to 23
			
 
				 	characters in length.
			
 
				+
			
 
				 load2
			
 
				 	This interface allows access control rules in addition to
			
 
				 	the system defined rules to be specified. The format accepted
			
 
				-	on write is:
			
 
				+	on write is::
			
 
				+
			
 
				 		"%s %s %s"
			
 
				+
			
 
				 	where the first string is the subject label, the second the
			
 
				 	object label, and the third the requested access. The access
			
 
				 	string may contain only the characters "rwxat-", and specifies
			
 
				 	which sort of access is allowed. The "-" is a placeholder for
			
 
				 	permissions that are not allowed. The string "r-x--" would
			
 
				 	specify read and execute access.
			
 
				+
			
 
				 load-self
			
 
				 	Provided for backward compatibility. The load-self2 interface
			
 
				 	is preferred and should be used instead.
			
@@ -208,66 +250,83 @@ load-self
 
				 	otherwise be permitted, and are intended to provide additional
			
 
				 	restrictions on the process. The format is the same as for
			
 
				 	the load interface.
			
 
				+
			
 
				 load-self2
			
 
				 	This interface allows process specific access rules to be
			
 
				 	defined. These rules are only consulted if access would
			
 
				 	otherwise be permitted, and are intended to provide additional
			
 
				 	restrictions on the process. The format is the same as for
			
 
				 	the load2 interface.
			
 
				+
			
 
				 logging
			
 
				 	This contains the Smack logging state.
			
 
				+
			
 
				 mapped
			
 
				 	This contains the CIPSO level used for Smack mapped label
			
 
				 	representation in network packets.
			
 
				+
			
 
				 netlabel
			
 
				 	This interface allows specific internet addresses to be
			
 
				 	treated as single label hosts. Packets are sent to single
			
 
				 	label hosts without CIPSO headers, but only from processes
			
 
				 	that have Smack write access to the host label. All packets
			
 
				 	received from single label hosts are given the specified
			
 
				-	label. The format accepted on write is:
			
 
				+	label. The format accepted on write is::
			
 
				+
			
 
				 		"%d.%d.%d.%d label" or "%d.%d.%d.%d/%d label".
			
 
				+
			
 
				 	If the label specified is "-CIPSO" the address is treated
			
 
				 	as a host that supports CIPSO headers.
			
 
				+
			
 
				 onlycap
			
 
				 	This contains labels processes must have for CAP_MAC_ADMIN
			
 
				-	and CAP_MAC_OVERRIDE to be effective. If this file is empty
			
 
				+	and ``CAP_MAC_OVERRIDE`` to be effective. If this file is empty
			
 
				 	these capabilities are effective for processes with any
			
 
				 	label. The values are set by writing the desired labels, separated
			
 
				 	by spaces, to the file or cleared by writing "-" to the file.
			
 
				+
			
 
				 ptrace
			
 
				 	This is used to define the current ptrace policy
			
 
				-	0 - default: this is the policy that relies on Smack access rules.
			
 
				-	    For the PTRACE_READ a subject needs to have a read access on
			
 
				-	    object. For the PTRACE_ATTACH a read-write access is required.
			
 
				-	1 - exact: this is the policy that limits PTRACE_ATTACH. Attach is
			
 
				+
			
 
				+	0 - default:
			
 
				+	    this is the policy that relies on Smack access rules.
			
 
				+	    For the ``PTRACE_READ`` a subject needs to have a read access on
			
 
				+	    object. For the ``PTRACE_ATTACH`` a read-write access is required.
			
 
				+
			
 
				+	1 - exact:
			
 
				+	    this is the policy that limits ``PTRACE_ATTACH``. Attach is
			
 
				 	    only allowed when subject's and object's labels are equal.
			
 
				-	    PTRACE_READ is not affected. Can be overridden with CAP_SYS_PTRACE.
			
 
				-	2 - draconian: this policy behaves like the 'exact' above with an
			
 
				-	    exception that it can't be overridden with CAP_SYS_PTRACE.
			
 
				+	    ``PTRACE_READ`` is not affected. Can be overridden with ``CAP_SYS_PTRACE``.
			
 
				+
			
 
				+	2 - draconian:
			
 
				+	    this policy behaves like the 'exact' above with an
			
 
				+	    exception that it can't be overridden with ``CAP_SYS_PTRACE``.
			
 
				+
			
 
				 revoke-subject
			
 
				 	Writing a Smack label here sets the access to '-' for all access
			
 
				 	rules with that subject label.
			
 
				+
			
 
				 unconfined
			
 
				-	If the kernel is configured with CONFIG_SECURITY_SMACK_BRINGUP
			
 
				-	a process with CAP_MAC_ADMIN can write a label into this interface.
			
 
				+	If the kernel is configured with ``CONFIG_SECURITY_SMACK_BRINGUP``
			
 
				+	a process with ``CAP_MAC_ADMIN`` can write a label into this interface.
			
 
				 	Thereafter, accesses that involve that label will be logged and
			
 
				 	the access permitted if it wouldn't be otherwise. Note that this
			
 
				 	is dangerous and can ruin the proper labeling of your system.
			
 
				 	It should never be used in production.
			
 
				+
			
 
				 relabel-self
			
 
				 	This interface contains a list of labels to which the process can
			
 
				-	transition to, by writing to /proc/self/attr/current.
			
 
				+	transition to, by writing to ``/proc/self/attr/current``.
			
 
				 	Normally a process can change its own label to any legal value, but only
			
 
				-	if it has CAP_MAC_ADMIN. This interface allows a process without
			
 
				-	CAP_MAC_ADMIN to relabel itself to one of labels from predefined list.
			
 
				-	A process without CAP_MAC_ADMIN can change its label only once. When it
			
 
				+	if it has ``CAP_MAC_ADMIN``. This interface allows a process without
			
 
				+	``CAP_MAC_ADMIN`` to relabel itself to one of labels from predefined list.
			
 
				+	A process without ``CAP_MAC_ADMIN`` can change its label only once. When it
			
 
				 	does, this list will be cleared.
			
 
				 	The values are set by writing the desired labels, separated
			
 
				 	by spaces, to the file or cleared by writing "-" to the file.
			
 
				 
			
 
				 If you are using the smackload utility
			
 
				-you can add access rules in /etc/smack/accesses. They take the form:
			
 
				+you can add access rules in ``/etc/smack/accesses``. They take the form::
			
 
				 
			
 
				     subjectlabel objectlabel access
			
 
				 
			
@@ -277,14 +336,14 @@ object with objectlabel. If there is no rule no access is allowed.
 
				 
			
 
				 Look for additional programs on http://schaufler-ca.com
			
 
				 
			
 
				-From the Smack Whitepaper:
			
 
				-
			
 
				-The Simplified Mandatory Access Control Kernel
			
 
				+The Simplified Mandatory Access Control Kernel (Whitepaper)
			
 
				+===========================================================
			
 
				 
			
 
				 Casey Schaufler
			
 
				 casey@schaufler-ca.com
			
 
				 
			
 
				 Mandatory Access Control
			
 
				+------------------------
			
 
				 
			
 
				 Computer systems employ a variety of schemes to constrain how information is
			
 
				 shared among the people and services using the machine. Some of these schemes
			
@@ -297,6 +356,7 @@ access control mechanisms because you don't have a choice regarding the users
 
				 or programs that have access to pieces of data.
			
 
				 
			
 
				 Bell & LaPadula
			
 
				+---------------
			
 
				 
			
 
				 From the middle of the 1980's until the turn of the century Mandatory Access
			
 
				 Control (MAC) was very closely associated with the Bell & LaPadula security
			
@@ -306,6 +366,7 @@ within the Capital Beltway and Scandinavian supercomputer centers but was
 
				 often cited as failing to address general needs.
			
 
				 
			
 
				 Domain Type Enforcement
			
 
				+-----------------------
			
 
				 
			
 
				 Around the turn of the century Domain Type Enforcement (DTE) became popular.
			
 
				 This scheme organizes users, programs, and data into domains that are
			
@@ -316,6 +377,7 @@ necessary to provide a secure domain mapping leads to the scheme being
 
				 disabled or used in limited ways in the majority of cases.
			
 
				 
			
 
				 Smack
			
 
				+-----
			
 
				 
			
 
				 Smack is a Mandatory Access Control mechanism designed to provide useful MAC
			
 
				 while avoiding the pitfalls of its predecessors. The limitations of Bell &
			
@@ -326,46 +388,55 @@ Enforcement and avoided by defining access controls in terms of the access
 
				 modes already in use.
			
 
				 
			
 
				 Smack Terminology
			
 
				+-----------------
			
 
				 
			
 
				 The jargon used to talk about Smack will be familiar to those who have dealt
			
 
				 with other MAC systems and shouldn't be too difficult for the uninitiated to
			
 
				 pick up. There are four terms that are used in a specific way and that are
			
 
				 especially important:
			
 
				 
			
 
				-	Subject: A subject is an active entity on the computer system.
			
 
				+  Subject:
			
 
				+	A subject is an active entity on the computer system.
			
 
				 	On Smack a subject is a task, which is in turn the basic unit
			
 
				 	of execution.
			
 
				 
			
 
				-	Object: An object is a passive entity on the computer system.
			
 
				+  Object:
			
 
				+	An object is a passive entity on the computer system.
			
 
				 	On Smack files of all types, IPC, and tasks can be objects.
			
 
				 
			
 
				-	Access: Any attempt by a subject to put information into or get
			
 
				+  Access:
			
 
				+	Any attempt by a subject to put information into or get
			
 
				 	information from an object is an access.
			
 
				 
			
 
				-	Label: Data that identifies the Mandatory Access Control
			
 
				+  Label:
			
 
				+	Data that identifies the Mandatory Access Control
			
 
				 	characteristics of a subject or an object.
			
 
				 
			
 
				 These definitions are consistent with the traditional use in the security
			
 
				 community. There are also some terms from Linux that are likely to crop up:
			
 
				 
			
 
				-	Capability: A task that possesses a capability has permission to
			
 
				+  Capability:
			
 
				+	A task that possesses a capability has permission to
			
 
				 	violate an aspect of the system security policy, as identified by
			
 
				 	the specific capability. A task that possesses one or more
			
 
				 	capabilities is a privileged task, whereas a task with no
			
 
				 	capabilities is an unprivileged task.
			
 
				 
			
 
				-	Privilege: A task that is allowed to violate the system security
			
 
				+  Privilege:
			
 
				+	A task that is allowed to violate the system security
			
 
				 	policy is said to have privilege. As of this writing a task can
			
 
				 	have privilege either by possessing capabilities or by having an
			
 
				 	effective user of root.
			
 
				 
			
 
				 Smack Basics
			
 
				+------------
			
 
				 
			
 
				 Smack is an extension to a Linux system. It enforces additional restrictions
			
 
				 on what subjects can access which objects, based on the labels attached to
			
 
				 each of the subject and the object.
			
 
				 
			
 
				 Labels
			
 
				+~~~~~~
			
 
				 
			
 
				 Smack labels are ASCII character strings. They can be up to 255 characters
			
 
				 long, but keeping them to twenty-three characters is recommended.
			
@@ -377,7 +448,7 @@ contain unprintable characters, the "/" (slash), the "\" (backslash), the "'"
 
				 (quote) and '"' (double-quote) characters.
			
 
				 Smack labels cannot begin with a '-'. This is reserved for special options.
			
 
				 
			
 
				-There are some predefined labels:
			
 
				+There are some predefined labels::
			
 
				 
			
 
				 	_ 	Pronounced "floor", a single underscore character.
			
 
				 	^ 	Pronounced "hat", a single circumflex character.
			
@@ -390,14 +461,18 @@ of a process will usually be assigned by the system initialization
 
				 mechanism.
			
 
				 
			
 
				 Access Rules
			
 
				+~~~~~~~~~~~~
			
 
				 
			
 
				 Smack uses the traditional access modes of Linux. These modes are read,
			
 
				 execute, write, and occasionally append. There are a few cases where the
			
 
				 access mode may not be obvious. These include:
			
 
				 
			
 
				-	Signals: A signal is a write operation from the subject task to
			
 
				+  Signals:
			
 
				+	A signal is a write operation from the subject task to
			
 
				 	the object task.
			
 
				-	Internet Domain IPC: Transmission of a packet is considered a
			
 
				+
			
 
				+  Internet Domain IPC:
			
 
				+	Transmission of a packet is considered a
			
 
				 	write operation from the source task to the destination task.
			
 
				 
			
 
				 Smack restricts access based on the label attached to a subject and the label
			
@@ -417,6 +492,7 @@ order:
 
				 	7. Any other access is denied.
			
 
				 
			
 
				 Smack Access Rules
			
 
				+~~~~~~~~~~~~~~~~~~
			
 
				 
			
 
				 With the isolation provided by Smack access separation is simple. There are
			
 
				 many interesting cases where limited access by subjects to objects with
			
@@ -427,8 +503,9 @@ be "born" highly classified. To accommodate such schemes Smack includes a
 
				 mechanism for specifying rules allowing access between labels.
			
 
				 
			
 
				 Access Rule Format
			
 
				+~~~~~~~~~~~~~~~~~~
			
 
				 
			
 
				-The format of an access rule is:
			
 
				+The format of an access rule is::
			
 
				 
			
 
				 	subject-label object-label access
			
 
				 
			
@@ -446,7 +523,7 @@ describe access modes:
 
				 
			
 
				 Uppercase values for the specification letters are allowed as well.
			
 
				 Access mode specifications can be in any order. Examples of acceptable rules
			
 
				-are:
			
 
				+are::
			
 
				 
			
 
				 	TopSecret Secret  rx
			
 
				 	Secret    Unclass R
			
@@ -456,7 +533,7 @@ are:
 
				 	New       Old     rRrRr
			
 
				 	Closed    Off     -
			
 
				 
			
 
				-Examples of unacceptable rules are:
			
 
				+Examples of unacceptable rules are::
			
 
				 
			
 
				 	Top Secret Secret     rx
			
 
				 	Ace        Ace        r
			
@@ -469,6 +546,7 @@ access specifications. The dash is a placeholder, so "a-r" is the same
 
				 as "ar". A lone dash is used to specify that no access should be allowed.
			
 
				 
			
 
				 Applying Access Rules
			
 
				+~~~~~~~~~~~~~~~~~~~~~
			
 
				 
			
 
				 The developers of Linux rarely define new sorts of things, usually importing
			
 
				 schemes and concepts from other systems. Most often, the other systems are
			
@@ -511,6 +589,7 @@ one process to another requires that the sender have write access to the
 
				 receiver. The receiver is not required to have read access to the sender.
			
 
				 
			
 
				 Setting Access Rules
			
 
				+~~~~~~~~~~~~~~~~~~~~
			
 
				 
			
 
				 The configuration file /etc/smack/accesses contains the rules to be set at
			
 
				 system startup. The contents are written to the special file
			
@@ -520,6 +599,7 @@ one rule, with the most recently specified overriding any earlier
 
				 specification.
			
 
				 
			
 
				 Task Attribute
			
 
				+~~~~~~~~~~~~~~
			
 
				 
			
 
				 The Smack label of a process can be read from /proc/<pid>/attr/current. A
			
 
				 process can read its own Smack label from /proc/self/attr/current. A
			
@@ -527,12 +607,14 @@ privileged process can change its own Smack label by writing to
 
				 /proc/self/attr/current but not the label of another process.
			
 
				 
			
 
				 File Attribute
			
 
				+~~~~~~~~~~~~~~
			
 
				 
			
 
				 The Smack label of a filesystem object is stored as an extended attribute
			
 
				 named SMACK64 on the file. This attribute is in the security namespace. It can
			
 
				 only be changed by a process with privilege.
			
 
				 
			
 
				 Privilege
			
 
				+~~~~~~~~~
			
 
				 
			
 
				 A process with CAP_MAC_OVERRIDE or CAP_MAC_ADMIN is privileged.
			
 
				 CAP_MAC_OVERRIDE allows the process access to objects it would
			
@@ -540,6 +622,7 @@ be denied otherwise. CAP_MAC_ADMIN allows a process to change
 
				 Smack data, including rules and attributes.
			
 
				 
			
 
				 Smack Networking
			
 
				+~~~~~~~~~~~~~~~~
			
 
				 
			
 
				 As mentioned before, Smack enforces access control on network protocol
			
 
				 transmissions. Every packet sent by a Smack process is tagged with its Smack
			
@@ -551,6 +634,7 @@ packet has write access to the receiving process and if that is not the case
 
				 the packet is dropped.
			
 
				 
			
 
				 CIPSO Configuration
			
 
				+~~~~~~~~~~~~~~~~~~~
			
 
				 
			
 
				 It is normally unnecessary to specify the CIPSO configuration. The default
			
 
				 values used by the system handle all internal cases. Smack will compose CIPSO
			
@@ -571,13 +655,13 @@ discarded. The DOI is 3 by default. The value can be read from
 
				 The label and category set are mapped to a Smack label as defined in
			
 
				 /etc/smack/cipso.
			
 
				 
			
 
				-A Smack/CIPSO mapping has the form:
			
 
				+A Smack/CIPSO mapping has the form::
			
 
				 
			
 
				 	smack level [category [category]*]
			
 
				 
			
 
				 Smack does not expect the level or category sets to be related in any
			
 
				 particular way and does not assume or assign accesses based on them. Some
			
 
				-examples of mappings:
			
 
				+examples of mappings::
			
 
				 
			
 
				 	TopSecret 7
			
 
				 	TS:A,B    7 1 2
			
@@ -597,25 +681,30 @@ value can be read from /sys/fs/smackfs/direct and changed by writing to
 
				 /sys/fs/smackfs/direct.
			
 
				 
			
 
				 Socket Attributes
			
 
				+~~~~~~~~~~~~~~~~~
			
 
				 
			
 
				 There are two attributes that are associated with sockets. These attributes
			
 
				 can only be set by privileged tasks, but any task can read them for their own
			
 
				 sockets.
			
 
				 
			
 
				-	SMACK64IPIN: The Smack label of the task object. A privileged
			
 
				+  SMACK64IPIN:
			
 
				+	The Smack label of the task object. A privileged
			
 
				 	program that will enforce policy may set this to the star label.
			
 
				 
			
 
				-	SMACK64IPOUT: The Smack label transmitted with outgoing packets.
			
 
				+  SMACK64IPOUT:
			
 
				+	The Smack label transmitted with outgoing packets.
			
 
				 	A privileged program may set this to match the label of another
			
 
				 	task with which it hopes to communicate.
			
 
				 
			
 
				 Smack Netlabel Exceptions
			
 
				+~~~~~~~~~~~~~~~~~~~~~~~~~
			
 
				 
			
 
				 You will often find that your labeled application has to talk to the outside,
			
 
				 unlabeled world. To do this there's a special file /sys/fs/smackfs/netlabel
			
 
				-where you can add some exceptions in the form of :
			
 
				-@IP1	   LABEL1 or
			
 
				-@IP2/MASK  LABEL2
			
 
				+where you can add some exceptions in the form of::
			
 
				+
			
 
				+	@IP1	   LABEL1 or
			
 
				+	@IP2/MASK  LABEL2
			
 
				 
			
 
				 It means that your application will have unlabeled access to @IP1 if it has
			
 
				 write access on LABEL1, and access to the subnet @IP2/MASK if it has write
			
@@ -624,28 +713,32 @@ access on LABEL2.
 
				 Entries in the /sys/fs/smackfs/netlabel file are matched by longest mask
			
 
				 first, like in classless IPv4 routing.
			
 
				 
			
 
				-A special label '@' and an option '-CIPSO' can be used there :
			
 
				-@      means Internet, any application with any label has access to it
			
 
				--CIPSO means standard CIPSO networking
			
 
				+A special label '@' and an option '-CIPSO' can be used there::
			
 
				 
			
 
				-If you don't know what CIPSO is and don't plan to use it, you can just do :
			
 
				-echo 127.0.0.1 -CIPSO > /sys/fs/smackfs/netlabel
			
 
				-echo 0.0.0.0/0 @      > /sys/fs/smackfs/netlabel
			
 
				+	@      means Internet, any application with any label has access to it
			
 
				+	-CIPSO means standard CIPSO networking
			
 
				+
			
 
				+If you don't know what CIPSO is and don't plan to use it, you can just do::
			
 
				+
			
 
				+	echo 127.0.0.1 -CIPSO > /sys/fs/smackfs/netlabel
			
 
				+	echo 0.0.0.0/0 @      > /sys/fs/smackfs/netlabel
			
 
				 
			
 
				 If you use CIPSO on your 192.168.0.0/16 local network and need also unlabeled
			
 
				-Internet access, you can have :
			
 
				-echo 127.0.0.1      -CIPSO > /sys/fs/smackfs/netlabel
			
 
				-echo 192.168.0.0/16 -CIPSO > /sys/fs/smackfs/netlabel
			
 
				-echo 0.0.0.0/0      @      > /sys/fs/smackfs/netlabel
			
 
				+Internet access, you can have::
			
 
				 
			
 
				+	echo 127.0.0.1      -CIPSO > /sys/fs/smackfs/netlabel
			
 
				+	echo 192.168.0.0/16 -CIPSO > /sys/fs/smackfs/netlabel
			
 
				+	echo 0.0.0.0/0      @      > /sys/fs/smackfs/netlabel
			
 
				 
			
 
				 Writing Applications for Smack
			
 
				+------------------------------
			
 
				 
			
 
				 There are three sorts of applications that will run on a Smack system. How an
			
 
				 application interacts with Smack will determine what it will have to do to
			
 
				 work properly under Smack.
			
 
				 
			
 
				 Smack Ignorant Applications
			
 
				+---------------------------
			
 
				 
			
 
				 By far the majority of applications have no reason whatever to care about the
			
 
				 unique properties of Smack. Since invoking a program has no impact on the
			
@@ -653,12 +746,14 @@ Smack label associated with the process the only concern likely to arise is
 
				 whether the process has execute access to the program.
			
 
				 
			
 
				 Smack Relevant Applications
			
 
				+---------------------------
			
 
				 
			
 
				 Some programs can be improved by teaching them about Smack, but do not make
			
 
				 any security decisions themselves. The utility ls(1) is one example of such a
			
 
				 program.
			
 
				 
			
 
				 Smack Enforcing Applications
			
 
				+----------------------------
			
 
				 
			
 
				 These are special programs that not only know about Smack, but participate in
			
 
				 the enforcement of system policy. In most cases these are the programs that
			
@@ -666,15 +761,16 @@ set up user sessions. There are also network services that provide information
 
				 to processes running with various labels.
			
 
				 
			
 
				 File System Interfaces
			
 
				+----------------------
			
 
				 
			
 
				 Smack maintains labels on file system objects using extended attributes. The
			
 
				 Smack label of a file, directory, or other file system object can be obtained
			
 
				-using getxattr(2).
			
 
				+using getxattr(2)::
			
 
				 
			
 
				 	len = getxattr("/", "security.SMACK64", value, sizeof (value));
			
 
				 
			
 
				 will put the Smack label of the root directory into value. A privileged
			
 
				-process can set the Smack label of a file system object with setxattr(2).
			
 
				+process can set the Smack label of a file system object with setxattr(2)::
			
 
				 
			
 
				 	len = strlen("Rubble");
			
 
				 	rc = setxattr("/foo", "security.SMACK64", "Rubble", len, 0);
			
@@ -683,17 +779,18 @@ will set the Smack label of /foo to "Rubble" if the program has appropriate
 
				 privilege.
			
 
				 
			
 
				 Socket Interfaces
			
 
				+-----------------
			
 
				 
			
 
				 The socket attributes can be read using fgetxattr(2).
			
 
				 
			
 
				 A privileged process can set the Smack label of outgoing packets with
			
 
				-fsetxattr(2).
			
 
				+fsetxattr(2)::
			
 
				 
			
 
				 	len = strlen("Rubble");
			
 
				 	rc = fsetxattr(fd, "security.SMACK64IPOUT", "Rubble", len, 0);
			
 
				 
			
 
				 will set the Smack label "Rubble" on packets going out from the socket if the
			
 
				-program has appropriate privilege.
			
 
				+program has appropriate privilege::
			
 
				 
			
 
				 	rc = fsetxattr(fd, "security.SMACK64IPIN", "*", strlen("*"), 0);
			
 
				 
			
@@ -701,33 +798,40 @@ will set the Smack label "*" as the object label against which incoming
 
				 packets will be checked if the program has appropriate privilege.
			
 
				 
			
 
				 Administration
			
 
				+--------------
			
 
				 
			
 
				 Smack supports some mount options:
			
 
				 
			
 
				-	smackfsdef=label: specifies the label to give files that lack
			
 
				+  smackfsdef=label:
			
 
				+	specifies the label to give files that lack
			
 
				 	the Smack label extended attribute.
			
 
				 
			
 
				-	smackfsroot=label: specifies the label to assign the root of the
			
 
				+  smackfsroot=label:
			
 
				+	specifies the label to assign the root of the
			
 
				 	file system if it lacks the Smack extended attribute.
			
 
				 
			
 
				-	smackfshat=label: specifies a label that must have read access to
			
 
				+  smackfshat=label:
			
 
				+	specifies a label that must have read access to
			
 
				 	all labels set on the filesystem. Not yet enforced.
			
 
				 
			
 
				-	smackfsfloor=label: specifies a label to which all labels set on the
			
 
				+  smackfsfloor=label:
			
 
				+	specifies a label to which all labels set on the
			
 
				 	filesystem must have read access. Not yet enforced.
			
 
				 
			
 
				 These mount options apply to all file system types.
			
 
				 
			
 
				 Smack auditing
			
 
				+--------------
			
 
				 
			
 
				 If you want Smack auditing of security events, you need to set CONFIG_AUDIT
			
 
				 in your kernel configuration.
			
 
				 By default, all denied events will be audited. You can change this behavior by
			
 
				-writing a single character to the /sys/fs/smackfs/logging file :
			
 
				-0 : no logging
			
 
				-1 : log denied (default)
			
 
				-2 : log accepted
			
 
				-3 : log denied & accepted
			
 
				+writing a single character to the /sys/fs/smackfs/logging file::
			
 
				+
			
 
				+	0 : no logging
			
 
				+	1 : log denied (default)
			
 
				+	2 : log accepted
			
 
				+	3 : log denied & accepted
			
 
				 
			
 
				 Events are logged as 'key=value' pairs, for each event you at least will get
			
 
				 the subject, the object, the rights requested, the action, the kernel function
			
@@ -735,6 +839,7 @@ that triggered the event, plus other pairs depending on the type of event
 
				 audited.
			
 
				 
			
 
				 Bringup Mode
			
 
				+------------
			
 
				 
			
 
				 Bringup mode provides logging features that can make application
			
 
				 configuration and system bringup easier. Configure the kernel with
			
--- a/Documentation/admin-guide/LSM/Yama.rst
+++ b/Documentation/admin-guide/LSM/Yama.rst
@@ -1,13 +1,14 @@
 
				+====
			
 
				+Yama
			
 
				+====
			
 
				+
			
 
				 Yama is a Linux Security Module that collects system-wide DAC security
			
 
				 protections that are not handled by the core kernel itself. This is
			
 
				-selectable at build-time with CONFIG_SECURITY_YAMA, and can be controlled
			
 
				-at run-time through sysctls in /proc/sys/kernel/yama:
			
 
				-
			
 
				-- ptrace_scope
			
 
				+selectable at build-time with ``CONFIG_SECURITY_YAMA``, and can be controlled
			
 
				+at run-time through sysctls in ``/proc/sys/kernel/yama``:
			
 
				 
			
 
				-==============================================================
			
 
				-
			
 
				-ptrace_scope:
			
 
				+ptrace_scope
			
 
				+============
			
 
				 
			
 
				 As Linux grows in popularity, it will become a larger target for
			
 
				 malware. One particularly troubling weakness of the Linux process
			
@@ -25,47 +26,49 @@ exist and remain possible if ptrace is allowed to operate as before.
 
				 Since ptrace is not commonly used by non-developers and non-admins, system
			
 
				 builders should be allowed the option to disable this debugging system.
			
 
				 
			
 
				-For a solution, some applications use prctl(PR_SET_DUMPABLE, ...) to
			
 
				+For a solution, some applications use ``prctl(PR_SET_DUMPABLE, ...)`` to
			
 
				 specifically disallow such ptrace attachment (e.g. ssh-agent), but many
			
 
				 do not. A more general solution is to only allow ptrace directly from a
			
 
				 parent to a child process (i.e. direct "gdb EXE" and "strace EXE" still
			
 
				-work), or with CAP_SYS_PTRACE (i.e. "gdb --pid=PID", and "strace -p PID"
			
 
				+work), or with ``CAP_SYS_PTRACE`` (i.e. "gdb --pid=PID", and "strace -p PID"
			
 
				 still work as root).
			
 
				 
			
 
				 In mode 1, software that has defined application-specific relationships
			
 
				 between a debugging process and its inferior (crash handlers, etc),
			
 
				-prctl(PR_SET_PTRACER, pid, ...) can be used. An inferior can declare which
			
 
				-other process (and its descendants) are allowed to call PTRACE_ATTACH
			
 
				+``prctl(PR_SET_PTRACER, pid, ...)`` can be used. An inferior can declare which
			
 
				+other process (and its descendants) are allowed to call ``PTRACE_ATTACH``
			
 
				 against it. Only one such declared debugging process can exist for
			
 
				 each inferior at a time. For example, this is used by KDE, Chromium, and
			
 
				 Firefox's crash handlers, and by Wine for allowing only Wine processes
			
 
				 to ptrace each other. If a process wishes to entirely disable these ptrace
			
 
				-restrictions, it can call prctl(PR_SET_PTRACER, PR_SET_PTRACER_ANY, ...)
			
 
				+restrictions, it can call ``prctl(PR_SET_PTRACER, PR_SET_PTRACER_ANY, ...)``
			
 
				 so that any otherwise allowed process (even those in external pid namespaces)
			
 
				 may attach.
			
 
				 
			
 
				-The sysctl settings (writable only with CAP_SYS_PTRACE) are:
			
 
				+The sysctl settings (writable only with ``CAP_SYS_PTRACE``) are:
			
 
				 
			
 
				-0 - classic ptrace permissions: a process can PTRACE_ATTACH to any other
			
 
				+0 - classic ptrace permissions:
			
 
				+    a process can ``PTRACE_ATTACH`` to any other
			
 
				     process running under the same uid, as long as it is dumpable (i.e.
			
 
				     did not transition uids, start privileged, or have called
			
 
				-    prctl(PR_SET_DUMPABLE...) already). Similarly, PTRACE_TRACEME is
			
 
				+    ``prctl(PR_SET_DUMPABLE...)`` already). Similarly, ``PTRACE_TRACEME`` is
			
 
				     unchanged.
			
 
				 
			
 
				-1 - restricted ptrace: a process must have a predefined relationship
			
 
				-    with the inferior it wants to call PTRACE_ATTACH on. By default,
			
 
				+1 - restricted ptrace:
			
 
				+    a process must have a predefined relationship
			
 
				+    with the inferior it wants to call ``PTRACE_ATTACH`` on. By default,
			
 
				     this relationship is that of only its descendants when the above
			
 
				     classic criteria is also met. To change the relationship, an
			
 
				-    inferior can call prctl(PR_SET_PTRACER, debugger, ...) to declare
			
 
				-    an allowed debugger PID to call PTRACE_ATTACH on the inferior.
			
 
				-    Using PTRACE_TRACEME is unchanged.
			
 
				+    inferior can call ``prctl(PR_SET_PTRACER, debugger, ...)`` to declare
			
 
				+    an allowed debugger PID to call ``PTRACE_ATTACH`` on the inferior.
			
 
				+    Using ``PTRACE_TRACEME`` is unchanged.
			
 
				 
			
 
				-2 - admin-only attach: only processes with CAP_SYS_PTRACE may use ptrace
			
 
				-    with PTRACE_ATTACH, or through children calling PTRACE_TRACEME.
			
 
				+2 - admin-only attach:
			
 
				+    only processes with ``CAP_SYS_PTRACE`` may use ptrace
			
 
				+    with ``PTRACE_ATTACH``, or through children calling ``PTRACE_TRACEME``.
			
 
				 
			
 
				-3 - no attach: no processes may use ptrace with PTRACE_ATTACH nor via
			
 
				-    PTRACE_TRACEME. Once set, this sysctl value cannot be changed.
			
 
				+3 - no attach:
			
 
				+    no processes may use ptrace with ``PTRACE_ATTACH`` nor via
			
 
				+    ``PTRACE_TRACEME``. Once set, this sysctl value cannot be changed.
			
 
				 
			
 
				 The original children-only logic was based on the restrictions in grsecurity.
			
 
				-
			
 
				-==============================================================
			
--- a/Documentation/admin-guide/LSM/apparmor.rst
+++ b/Documentation/admin-guide/LSM/apparmor.rst
@@ -1,4 +1,9 @@
 
				---- What is AppArmor? ---
			
 
				+========
			
 
				+AppArmor
			
 
				+========
			
 
				+
			
 
				+What is AppArmor?
			
 
				+=================
			
 
				 
			
 
				 AppArmor is MAC style security extension for the Linux kernel.  It implements
			
 
				 a task centered policy, with task "profiles" being created and loaded
			
@@ -6,34 +11,41 @@ from user space.  Tasks on the system that do not have a profile defined for
 
				 them run in an unconfined state which is equivalent to standard Linux DAC
			
 
				 permissions.
			
 
				 
			
 
				---- How to enable/disable ---
			
 
				+How to enable/disable
			
 
				+=====================
			
 
				+
			
 
				+set ``CONFIG_SECURITY_APPARMOR=y``
			
 
				 
			
 
				-set CONFIG_SECURITY_APPARMOR=y
			
 
				+If AppArmor should be selected as the default security module then set::
			
 
				 
			
 
				-If AppArmor should be selected as the default security module then
			
 
				-   set CONFIG_DEFAULT_SECURITY="apparmor"
			
 
				-   and CONFIG_SECURITY_APPARMOR_BOOTPARAM_VALUE=1
			
 
				+   CONFIG_DEFAULT_SECURITY="apparmor"
			
 
				+   CONFIG_SECURITY_APPARMOR_BOOTPARAM_VALUE=1
			
 
				 
			
 
				 Build the kernel
			
 
				 
			
 
				 If AppArmor is not the default security module it can be enabled by passing
			
 
				-security=apparmor on the kernel's command line.
			
 
				+``security=apparmor`` on the kernel's command line.
			
 
				 
			
 
				 If AppArmor is the default security module it can be disabled by passing
			
 
				-apparmor=0, security=XXXX (where XXX is valid security module), on the
			
 
				-kernel's command line
			
 
				+``apparmor=0, security=XXXX`` (where ``XXXX`` is a valid security module), on the
			
 
				+kernel's command line.
			
 
				 
			
 
				 For AppArmor to enforce any restrictions beyond standard Linux DAC permissions
			
 
				 policy must be loaded into the kernel from user space (see the Documentation
			
 
				 and tools links).
			
 
				 
			
 
				---- Documentation ---
			
 
				+Documentation
			
 
				+=============
			
 
				 
			
 
				-Documentation can be found on the wiki.
			
 
				+Documentation can be found on the wiki, linked below.
			
 
				 
			
 
				---- Links ---
			
 
				+Links
			
 
				+=====
			
 
				 
			
 
				 Mailing List - apparmor@lists.ubuntu.com
			
 
				+
			
 
				 Wiki - http://apparmor.wiki.kernel.org/
			
 
				+
			
 
				 User space tools - https://launchpad.net/apparmor
			
 
				+
			
 
				 Kernel module - git://git.kernel.org/pub/scm/linux/kernel/git/jj/apparmor-dev.git
			
--- a/Documentation/admin-guide/LSM/index.rst
+++ b/Documentation/admin-guide/LSM/index.rst
@@ -1,12 +1,13 @@
 
				-Linux Security Module framework
			
 
				--------------------------------
			
 
				+===========================
			
 
				+Linux Security Module Usage
			
 
				+===========================
			
 
				 
			
 
				 The Linux Security Module (LSM) framework provides a mechanism for
			
 
				 various security checks to be hooked by new kernel extensions. The name
			
 
				 "module" is a bit of a misnomer since these extensions are not actually
			
 
				 loadable kernel modules. Instead, they are selectable at build-time via
			
 
				 CONFIG_DEFAULT_SECURITY and can be overridden at boot-time via the
			
 
				-"security=..." kernel command line argument, in the case where multiple
			
 
				+``"security=..."`` kernel command line argument, in the case where multiple
			
 
				 LSMs were built into a given kernel.
			
 
				 
			
 
				 The primary users of the LSM interface are Mandatory Access Control
			
@@ -19,23 +20,22 @@ in the core functionality of Linux itself.
 
				 Without a specific LSM built into the kernel, the default LSM will be the
			
 
				 Linux capabilities system. Most LSMs choose to extend the capabilities
			
 
				 system, building their checks on top of the defined capability hooks.
			
 
				-For more details on capabilities, see capabilities(7) in the Linux
			
 
				+For more details on capabilities, see ``capabilities(7)`` in the Linux
			
 
				 man-pages project.
			
 
				 
			
 
				 A list of the active security modules can be found by reading
			
 
				-/sys/kernel/security/lsm. This is a comma separated list, and
			
 
				+``/sys/kernel/security/lsm``. This is a comma separated list, and
			
 
				 will always include the capability module. The list reflects the
			
 
				 order in which checks are made. The capability module will always
			
 
				 be first, followed by any "minor" modules (e.g. Yama) and then
			
 
				 the one "major" module (e.g. SELinux) if there is one configured.
			
 
				 
			
 
				-Based on https://lkml.org/lkml/2007/10/26/215,
			
 
				-a new LSM is accepted into the kernel when its intent (a description of
			
 
				-what it tries to protect against and in what cases one would expect to
			
 
				-use it) has been appropriately documented in Documentation/security/.
			
 
				-This allows an LSM's code to be easily compared to its goals, and so
			
 
				-that end users and distros can make a more informed decision about which
			
 
				-LSMs suit their requirements.
			
 
				+.. toctree::
			
 
				+   :maxdepth: 1
			
 
				 
			
 
				-For extensive documentation on the available LSM hook interfaces, please
			
 
				-see include/linux/security.h.
			
 
				+   apparmor
			
 
				+   LoadPin
			
 
				+   SELinux
			
 
				+   Smack
			
 
				+   tomoyo
			
 
				+   Yama
			
--- a/Documentation/admin-guide/LSM/tomoyo.rst
+++ b/Documentation/admin-guide/LSM/tomoyo.rst
@@ -1,21 +1,30 @@
 
				---- What is TOMOYO? ---
			
 
				+======
			
 
				+TOMOYO
			
 
				+======
			
 
				+
			
 
				+What is TOMOYO?
			
 
				+===============
			
 
				 
			
 
				 TOMOYO is a name-based MAC extension (LSM module) for the Linux kernel.
			
 
				 
			
 
				 LiveCD-based tutorials are available at
			
 
				+
			
 
				 http://tomoyo.sourceforge.jp/1.7/1st-step/ubuntu10.04-live/
			
 
				-http://tomoyo.sourceforge.jp/1.7/1st-step/centos5-live/ .
			
 
				+http://tomoyo.sourceforge.jp/1.7/1st-step/centos5-live/
			
 
				+
			
 
				 Though these tutorials use non-LSM version of TOMOYO, they are useful for you
			
 
				 to know what TOMOYO is.
			
 
				 
			
 
				---- How to enable TOMOYO? ---
			
 
				+How to enable TOMOYO?
			
 
				+=====================
			
 
				 
			
 
				-Build the kernel with CONFIG_SECURITY_TOMOYO=y and pass "security=tomoyo" on
			
 
				+Build the kernel with ``CONFIG_SECURITY_TOMOYO=y`` and pass ``security=tomoyo`` on
			
 
				 kernel's command line.
			
 
				 
			
 
				 Please see http://tomoyo.sourceforge.jp/2.3/ for details.
			
 
				 
			
 
				---- Where is documentation? ---
			
 
				+Where is documentation?
			
 
				+=======================
			
 
				 
			
 
				 User <-> Kernel interface documentation is available at
			
 
				 http://tomoyo.sourceforge.jp/2.3/policy-reference.html .
			
@@ -42,7 +51,8 @@ History of TOMOYO?
 
				   Realities of Mainlining
			
 
				     http://sourceforge.jp/projects/tomoyo/docs/lfj2008.pdf
			
 
				 
			
 
				---- What is future plan? ---
			
 
				+What is future plan?
			
 
				+====================
			
 
				 
			
 
				 We believe that inode based security and name based security are complementary
			
 
				 and both should be used together. But unfortunately, so far, we cannot enable
			
--- a/Documentation/admin-guide/README.rst
+++ b/Documentation/admin-guide/README.rst
@@ -55,12 +55,6 @@ Documentation
 
				    contains information about the problems, which may result by upgrading
			
 
				    your kernel.
			
 
				 
			
 
				- - The Documentation/DocBook/ subdirectory contains several guides for
			
 
				-   kernel developers and users.  These guides can be rendered in a
			
 
				-   number of formats:  PostScript (.ps), PDF, HTML, & man-pages, among others.
			
 
				-   After installation, ``make psdocs``, ``make pdfdocs``, ``make htmldocs``,
			
 
				-   or ``make mandocs`` will render the documentation in the requested format.
			
 
				-
			
 
				 Installing the kernel source
			
 
				 ----------------------------
			
 
				 
			
--- a/Documentation/admin-guide/devices.txt
+++ b/Documentation/admin-guide/devices.txt
@@ -369,8 +369,10 @@
 
				 		237 = /dev/loop-control Loopback control device
			
 
				 		238 = /dev/vhost-net	Host kernel accelerator for virtio net
			
 
				 		239 = /dev/uhid		User-space I/O driver support for HID subsystem
			
 
				+		240 = /dev/userio	Serio driver testing device
			
 
				+		241 = /dev/vhost-vsock	Host kernel driver for virtio vsock
			
 
				 
			
 
				-		240-254			Reserved for local use
			
 
				+		242-254			Reserved for local use
			
 
				 		255			Reserved for MISC_DYNAMIC_MINOR
			
 
				 
			
 
				   11 char	Raw keyboard device	(Linux/SPARC only)
			
--- a/Documentation/admin-guide/index.rst
+++ b/Documentation/admin-guide/index.rst
@@ -61,6 +61,8 @@ configure specific aspects of kernel behavior to your liking.
 
				    java
			
 
				    ras
			
 
				    pm/index
			
 
				+   thunderbolt
			
 
				+   LSM/index
			
 
				 
			
 
				 .. only::  subproject and html
			
 
				 
			
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -649,6 +649,13 @@
 
				 			/proc/<pid>/coredump_filter.
			
 
				 			See also Documentation/filesystems/proc.txt.
			
 
				 
			
 
				+	coresight_cpu_debug.enable
			
 
				+			[ARM,ARM64]
			
 
				+			Format: <bool>
			
 
				+			Enable/disable the CPU sampling based debugging.
			
 
				+			0: default value, disable debugging
			
 
				+			1: enable debugging at boot time
			
 
				+
			
 
				 	cpuidle.off=1	[CPU_IDLE]
			
 
				 			disable the cpuidle sub-system
			
 
				 
			
@@ -720,7 +727,8 @@
 
				 			See also Documentation/input/joystick-parport.txt
			
 
				 
			
 
				 	ddebug_query=   [KNL,DYNAMIC_DEBUG] Enable debug messages at early boot
			
 
				-			time. See Documentation/dynamic-debug-howto.txt for
			
 
				+			time. See
			
 
				+			Documentation/admin-guide/dynamic-debug-howto.rst for
			
 
				 			details.  Deprecated, see dyndbg.
			
 
				 
			
 
				 	debug		[KNL] Enable kernel debugging (events log level).
			
@@ -883,7 +891,8 @@
 
				 	dyndbg[="val"]		[KNL,DYNAMIC_DEBUG]
			
 
				 	module.dyndbg[="val"]
			
 
				 			Enable debug messages at boot time.  See
			
 
				-			Documentation/dynamic-debug-howto.txt for details.
			
 
				+			Documentation/admin-guide/dynamic-debug-howto.rst
			
 
				+			for details.
			
 
				 
			
 
				 	nompx		[X86] Disables Intel Memory Protection Extensions.
			
 
				 			See Documentation/x86/intel_mpx.txt for more
			
@@ -954,6 +963,12 @@
 
				 			must already be setup and configured. Options are not
			
 
				 			yet supported.
			
 
				 
			
 
				+		owl,<addr>
			
 
				+			Start an early, polled-mode console on a serial port
			
 
				+			of an Actions Semi SoC, such as S500 or S900, at the
			
 
				+			specified address. The serial port must already be
			
 
				+			setup and configured. Options are not yet supported.
			
 
				+
			
 
				 		smh	Use ARM semihosting calls for early console.
			
 
				 
			
 
				 		s3c2410,<addr>
			
@@ -1486,12 +1501,21 @@
 
				 			in crypto/hash_info.h.
			
 
				 
			
 
				 	ima_policy=	[IMA]
			
 
				-			The builtin measurement policy to load during IMA
			
 
				-			setup.  Specyfing "tcb" as the value, measures all
			
 
				-			programs exec'd, files mmap'd for exec, and all files
			
 
				-			opened with the read mode bit set by either the
			
 
				-			effective uid (euid=0) or uid=0.
			
 
				-			Format: "tcb"
			
 
				+			The builtin policies to load during IMA setup.
			
 
				+			Format: "tcb | appraise_tcb | secure_boot"
			
 
				+
			
 
				+			The "tcb" policy measures all programs exec'd, files
			
 
				+			mmap'd for exec, and all files opened with the read
			
 
				+			mode bit set by either the effective uid (euid=0) or
			
 
				+			uid=0.
			
 
				+
			
 
				+			The "appraise_tcb" policy appraises the integrity of
			
 
				+			all files owned by root. (This is the equivalent
			
 
				+			of ima_appraise_tcb.)
			
 
				+
			
 
				+			The "secure_boot" policy appraises the integrity
			
 
				+			of files (e.g. kexec kernel image, kernel modules,
			
 
				+			firmware, policy, etc) based on file signatures.
			
 
				 
			
 
				 	ima_tcb		[IMA] Deprecated.  Use ima_policy= instead.
			
 
				 			Load a policy which meets the needs of the Trusted
			
@@ -1838,6 +1862,18 @@
 
				 			for all guests.
			
 
				 			Default is 1 (enabled) if in 64-bit or 32-bit PAE mode.
			
 
				 
			
 
				+	kvm-arm.vgic_v3_group0_trap=
			
 
				+			[KVM,ARM] Trap guest accesses to GICv3 group-0
			
 
				+			system registers
			
 
				+
			
 
				+	kvm-arm.vgic_v3_group1_trap=
			
 
				+			[KVM,ARM] Trap guest accesses to GICv3 group-1
			
 
				+			system registers
			
 
				+
			
 
				+	kvm-arm.vgic_v3_common_trap=
			
 
				+			[KVM,ARM] Trap guest accesses to GICv3 common
			
 
				+			system registers
			
 
				+
			
 
				 	kvm-intel.ept=	[KVM,Intel] Disable extended page tables
			
 
				 			(virtualized MMU) support on capable Intel chips.
			
 
				 			Default is 1 (enabled)
			
@@ -2136,6 +2172,12 @@
 
				 	memmap=nn[KMG]@ss[KMG]
			
 
				 			[KNL] Force usage of a specific region of memory.
			
 
				 			Region of memory to be used is from ss to ss+nn.
			
 
				+			If @ss[KMG] is omitted, it is equivalent to mem=nn[KMG],
			
 
				+			which limits max address to nn[KMG].
			
 
				+			Multiple different regions can be specified,
			
 
				+			comma delimited.
			
 
				+			Example:
			
 
				+				memmap=100M@2G,100M#3G,1G!1024G
			
 
				 
			
 
				 	memmap=nn[KMG]#ss[KMG]
			
 
				 			[KNL,ACPI] Mark specific memory as ACPI data.
			
@@ -2148,6 +2190,9 @@
 
				 			         memmap=64K$0x18690000
			
 
				 			         or
			
 
				 			         memmap=0x10000$0x18690000
			
 
				+			Some bootloaders may need an escape character before '$',
			
 
				+			like Grub2, otherwise '$' and the following number
			
 
				+			will be eaten.
			
 
				 
			
 
				 	memmap=nn[KMG]!ss[KMG]
			
 
				 			[KNL,X86] Mark specific memory as protected.
			
@@ -2270,8 +2315,11 @@
 
				 			that the amount of memory usable for all allocations
			
 
				 			is not too small.
			
 
				 
			
 
				-	movable_node	[KNL] Boot-time switch to enable the effects
			
 
				-			of CONFIG_MOVABLE_NODE=y. See mm/Kconfig for details.
			
 
				+	movable_node	[KNL] Boot-time switch to make hotpluggable memory
			
 
				+			NUMA nodes to be movable. This means that the memory
			
 
				+			of such nodes will be usable only for movable
			
 
				+			allocations which rules out almost all kernel
			
 
				+			allocations. Use with caution!
			
 
				 
			
 
				 	MTD_Partition=	[MTD]
			
 
				 			Format: <name>,<region-number>,<size>,<offset>
			
@@ -3238,21 +3286,17 @@
 
				 
			
 
				 	rcutree.gp_cleanup_delay=	[KNL]
			
 
				 			Set the number of jiffies to delay each step of
			
 
				-			RCU grace-period cleanup.  This only has effect
			
 
				-			when CONFIG_RCU_TORTURE_TEST_SLOW_CLEANUP is set.
			
 
				+			RCU grace-period cleanup.
			
 
				 
			
 
				 	rcutree.gp_init_delay=	[KNL]
			
 
				 			Set the number of jiffies to delay each step of
			
 
				-			RCU grace-period initialization.  This only has
			
 
				-			effect when CONFIG_RCU_TORTURE_TEST_SLOW_INIT
			
 
				-			is set.
			
 
				+			RCU grace-period initialization.
			
 
				 
			
 
				 	rcutree.gp_preinit_delay=	[KNL]
			
 
				 			Set the number of jiffies to delay each step of
			
 
				 			RCU grace-period pre-initialization, that is,
			
 
				 			the propagation of recent CPU-hotplug changes up
			
 
				-			the rcu_node combining tree.  This only has effect
			
 
				-			when CONFIG_RCU_TORTURE_TEST_SLOW_PREINIT is set.
			
 
				+			the rcu_node combining tree.
			
 
				 
			
 
				 	rcutree.rcu_fanout_exact= [KNL]
			
 
				 			Disable autobalancing of the rcu_node combining
			
@@ -3328,6 +3372,17 @@
 
				 			This wake_up() will be accompanied by a
			
 
				 			WARN_ONCE() splat and an ftrace_dump().
			
 
				 
			
 
				+	rcuperf.gp_async= [KNL]
			
 
				+			Measure performance of asynchronous
			
 
				+			grace-period primitives such as call_rcu().
			
 
				+
			
 
				+	rcuperf.gp_async_max= [KNL]
			
 
				+			Specify the maximum number of outstanding
			
 
				+			callbacks per writer thread.  When a writer
			
 
				+			thread exceeds this limit, it invokes the
			
 
				+			corresponding flavor of rcu_barrier() to allow
			
 
				+			previously posted callbacks to drain.
			
 
				+
			
 
				 	rcuperf.gp_exp= [KNL]
			
 
				 			Measure performance of expedited synchronous
			
 
				 			grace-period primitives.
			
@@ -3355,17 +3410,22 @@
 
				 	rcuperf.perf_runnable= [BOOT]
			
 
				 			Start rcuperf running at boot time.
			
 
				 
			
 
				+	rcuperf.perf_type= [KNL]
			
 
				+			Specify the RCU implementation to test.
			
 
				+
			
 
				 	rcuperf.shutdown= [KNL]
			
 
				 			Shut the system down after performance tests
			
 
				 			complete.  This is useful for hands-off automated
			
 
				 			testing.
			
 
				 
			
 
				-	rcuperf.perf_type= [KNL]
			
 
				-			Specify the RCU implementation to test.
			
 
				-
			
 
				 	rcuperf.verbose= [KNL]
			
 
				 			Enable additional printk() statements.
			
 
				 
			
 
				+	rcuperf.writer_holdoff= [KNL]
			
 
				+			Write-side holdoff between grace periods,
			
 
				+			in microseconds.  The default of zero says
			
 
				+			no holdoff.
			
 
				+
			
 
				 	rcutorture.cbflood_inter_holdoff= [KNL]
			
 
				 			Set holdoff time (jiffies) between successive
			
 
				 			callback-flood tests.
			
@@ -3715,8 +3775,14 @@
 
				 	slab_nomerge	[MM]
			
 
				 			Disable merging of slabs with similar size. May be
			
 
				 			necessary if there is some reason to distinguish
			
 
				-			allocs to different slabs. Debug options disable
			
 
				-			merging on their own.
			
 
				+			allocs to different slabs, especially in hardened
			
 
				+			environments where the risk of heap overflows and
			
 
				+			layout control by attackers can usually be
			
 
				+			frustrated by disabling merging. This will reduce
			
 
				+			most of the exposure of a heap attack to a single
			
 
				+			cache (risks via metadata attacks are mostly
			
 
				+			unchanged). Debug options disable merging on their
			
 
				+			own.
			
 
				 			For more information see Documentation/vm/slub.txt.
			
 
				 
			
 
				 	slab_max_order=	[MM, SLAB]
			
@@ -3803,6 +3869,15 @@
 
				 	spia_pedr=
			
 
				 	spia_peddr=
			
 
				 
			
 
				+	srcutree.counter_wrap_check [KNL]
			
 
				+			Specifies how frequently to check for
			
 
				+			grace-period sequence counter wrap for the
			
 
				+			srcu_data structure's ->srcu_gp_seq_needed field.
			
 
				+			The greater the number of bits set in this kernel
			
 
				+			parameter, the less frequently counter wrap will
			
 
				+			be checked for.  Note that the bottom two bits
			
 
				+			are ignored.
			
 
				+
			
 
				 	srcutree.exp_holdoff [KNL]
			
 
				 			Specifies how many nanoseconds must elapse
			
 
				 			since the end of the last SRCU grace period for
			
--- a/Documentation/admin-guide/pm/cpufreq.rst
+++ b/Documentation/admin-guide/pm/cpufreq.rst
@@ -269,16 +269,16 @@ are the following:
 
				 ``scaling_cur_freq``
			
 
				 	Current frequency of all of the CPUs belonging to this policy (in kHz).
			
 
				 
			
 
				-	For the majority of scaling drivers, this is the frequency of the last
			
 
				-	P-state requested by the driver from the hardware using the scaling
			
 
				+	In the majority of cases, this is the frequency of the last P-state
			
 
				+	requested by the scaling driver from the hardware using the scaling
			
 
				 	interface provided by it, which may or may not reflect the frequency
			
 
				 	the CPU is actually running at (due to hardware design and other
			
 
				 	limitations).
			
 
				 
			
 
				-	Some scaling drivers (e.g. |intel_pstate|) attempt to provide
			
 
				-	information more precisely reflecting the current CPU frequency through
			
 
				-	this attribute, but that still may not be the exact current CPU
			
 
				-	frequency as seen by the hardware at the moment.
			
 
				+	Some architectures (e.g. ``x86``) may attempt to provide information
			
 
				+	more precisely reflecting the current CPU frequency through this
			
 
				+	attribute, but that still may not be the exact current CPU frequency as
			
 
				+	seen by the hardware at the moment.
			
 
				 
			
 
				 ``scaling_driver``
			
 
				 	The scaling driver currently in use.
			
--- a/Documentation/admin-guide/pm/intel_pstate.rst
+++ b/Documentation/admin-guide/pm/intel_pstate.rst
@@ -157,10 +157,8 @@ Without HWP, this P-state selection algorithm is always the same regardless of
 
				 the processor model and platform configuration.
			
 
				 
			
 
				 It selects the maximum P-state it is allowed to use, subject to limits set via
			
 
				-``sysfs``, every time the P-state selection computations are carried out by the
			
 
				-driver's utilization update callback for the given CPU (that does not happen
			
 
				-more often than every 10 ms), but the hardware configuration will not be changed
			
 
				-if the new P-state is the same as the current one.
			
 
				+``sysfs``, every time the driver configuration for the given CPU is updated
			
 
				+(e.g. via ``sysfs``).
			
 
				 
			
 
				 This is the default P-state selection algorithm if the
			
 
				 :c:macro:`CONFIG_CPU_FREQ_DEFAULT_GOV_PERFORMANCE` kernel configuration option
			
--- a/Documentation/admin-guide/ras.rst
+++ b/Documentation/admin-guide/ras.rst
@@ -344,9 +344,9 @@ for more than 2 channels, like Fully Buffered DIMMs (FB-DIMMs) memory
 
				 controllers. The following example will assume 2 channels:
			
 
				 
			
 
				 	+------------+-----------------------+
			
 
				-	| Chip       |       Channels        |
			
 
				-	| Select     +-----------+-----------+
			
 
				-	| rows       |  ``ch0``  |  ``ch1``  |
			
 
				+	| CS Rows    |       Channels        |
			
 
				+	+------------+-----------+-----------+
			
 
				+	|            |  ``ch0``  |  ``ch1``  |
			
 
				 	+============+===========+===========+
			
 
				 	| ``csrow0`` |  DIMM_A0  |  DIMM_B0  |
			
 
				 	+------------+           |           |
			
@@ -698,7 +698,7 @@ information indicating that errors have been detected::
 
				 The structure of the message is:
			
 
				 
			
 
				 	+---------------------------------------+-------------+
			
 
				-	| Content                               + Example     |
			
 
				+	| Content                               | Example     |
			
 
				 	+=======================================+=============+
			
 
				 	| The memory controller                 | MC0         |
			
 
				 	+---------------------------------------+-------------+
			
@@ -713,7 +713,7 @@ The structure of the message is:
 
				 	+---------------------------------------+-------------+
			
 
				 	| The error syndrome                    | 0xb741      |
			
 
				 	+---------------------------------------+-------------+
			
 
				-	| Memory row                            | row 0       +
			
 
				+	| Memory row                            | row 0       |
			
 
				 	+---------------------------------------+-------------+
			
 
				 	| Memory channel                        | channel 1   |
			
 
				 	+---------------------------------------+-------------+
			
--- a/Documentation/admin-guide/thunderbolt.rst
+++ b/Documentation/admin-guide/thunderbolt.rst
@@ -0,0 +1,199 @@
 
				+=============
			
 
				+ Thunderbolt
			
 
				+=============
			
 
				+The interface presented here is not meant for end users. Instead there
			
 
				+should be a userspace tool that handles all the low-level details, keeps
			
 
				+database of the authorized devices and prompts user for new connections.
			
 
				+
			
 
				+More details about the sysfs interface for Thunderbolt devices can be
			
 
				+found in ``Documentation/ABI/testing/sysfs-bus-thunderbolt``.
			
 
				+
			
 
				+Those users who just want to connect any device without any sort of
			
 
				+manual work, can add the following line to
			
 
				+``/etc/udev/rules.d/99-local.rules``::
			
 
				+
			
 
				+  ACTION=="add", SUBSYSTEM=="thunderbolt", ATTR{authorized}=="0", ATTR{authorized}="1"
			
 
				+
			
 
				+This will authorize all devices automatically when they appear. However,
			
 
				+keep in mind that this bypasses the security levels and makes the system
			
 
				+vulnerable to DMA attacks.
			
 
				+
			
 
				+Security levels and how to use them
			
 
				+-----------------------------------
			
 
				+Starting from Intel Falcon Ridge Thunderbolt controller there are 4
			
 
				+security levels available. The reason for these is the fact that the
			
 
				+connected devices can be DMA masters and thus read contents of the host
			
 
				+memory without CPU and OS knowing about it. There are ways to prevent
			
 
				+this by setting up an IOMMU but it is not always available for various
			
 
				+reasons.
			
 
				+
			
 
				+The security levels are as follows:
			
 
				+
			
 
				+  none
			
 
				+    All devices are automatically connected by the firmware. No user
			
 
				+    approval is needed. In BIOS settings this is typically called
			
 
				+    *Legacy mode*.
			
 
				+
			
 
				+  user
			
 
				+    User is asked whether the device is allowed to be connected.
			
 
				+    Based on the device identification information available through
			
 
				+    ``/sys/bus/thunderbolt/devices``, the user can then make the decision.
			
 
				+    In BIOS settings this is typically called *Unique ID*.
			
 
				+
			
 
				+  secure
			
 
				+    User is asked whether the device is allowed to be connected. In
			
 
				+    addition to UUID the device (if it supports secure connect) is sent
			
 
				+    a challenge that should match the expected one based on a random key
			
 
				+    written to ``key`` sysfs attribute. In BIOS settings this is
			
 
				+    typically called *One time saved key*.
			
 
				+
			
 
				+  dponly
			
 
				+    The firmware automatically creates tunnels for Display Port and
			
 
				+    USB. No PCIe tunneling is done. In BIOS settings this is
			
 
				+    typically called *Display Port Only*.
			
 
				+
			
 
				+The current security level can be read from
			
 
				+``/sys/bus/thunderbolt/devices/domainX/security`` where ``domainX`` is
			
 
				+the Thunderbolt domain the host controller manages. There is typically
			
 
				+one domain per Thunderbolt host controller.
			
 
				+
			
 
				+If the security level reads as ``user`` or ``secure`` the connected
			
 
				+device must be authorized by the user before PCIe tunnels are created
			
 
				+(i.e. the PCIe device appears).
			
 
				+
			
 
				+Each Thunderbolt device plugged in will appear in sysfs under
			
 
				+``/sys/bus/thunderbolt/devices``. The device directory carries
			
 
				+information that can be used to identify the particular device,
			
 
				+including its name and UUID.
			
 
				+
			
 
				+Authorizing devices when security level is ``user`` or ``secure``
			
 
				+-----------------------------------------------------------------
			
 
				+When a device is plugged in it will appear in sysfs as follows::
			
 
				+
			
 
				+  /sys/bus/thunderbolt/devices/0-1/authorized	- 0
			
 
				+  /sys/bus/thunderbolt/devices/0-1/device	- 0x8004
			
 
				+  /sys/bus/thunderbolt/devices/0-1/device_name	- Thunderbolt to FireWire Adapter
			
 
				+  /sys/bus/thunderbolt/devices/0-1/vendor	- 0x1
			
 
				+  /sys/bus/thunderbolt/devices/0-1/vendor_name	- Apple, Inc.
			
 
				+  /sys/bus/thunderbolt/devices/0-1/unique_id	- e0376f00-0300-0100-ffff-ffffffffffff
			
 
				+
			
 
				+The ``authorized`` attribute reads 0 which means no PCIe tunnels are
			
 
				+created yet. The user can authorize the device by simply::
			
 
				+
			
 
				+  # echo 1 > /sys/bus/thunderbolt/devices/0-1/authorized
			
 
				+
			
 
				+This will create the PCIe tunnels and the device is now connected.
			
 
				+
			
 
				+If the device supports secure connect, and the domain security level is
			
 
				+set to ``secure``, it has an additional attribute ``key`` which can hold
			
 
				+a random 32 byte value used for authorization and challenging the device in
			
 
				+future connects::
			
 
				+
			
 
				+  /sys/bus/thunderbolt/devices/0-3/authorized	- 0
			
 
				+  /sys/bus/thunderbolt/devices/0-3/device	- 0x305
			
 
				+  /sys/bus/thunderbolt/devices/0-3/device_name	- AKiTiO Thunder3 PCIe Box
			
 
				+  /sys/bus/thunderbolt/devices/0-3/key		-
			
 
				+  /sys/bus/thunderbolt/devices/0-3/vendor	- 0x41
			
 
				+  /sys/bus/thunderbolt/devices/0-3/vendor_name	- inXtron
			
 
				+  /sys/bus/thunderbolt/devices/0-3/unique_id	- dc010000-0000-8508-a22d-32ca6421cb16
			
 
				+
			
 
				+Notice the key is empty by default.
			
 
				+
			
 
				+If the user does not want to use secure connect it can just ``echo 1``
			
 
				+to the ``authorized`` attribute and the PCIe tunnels will be created in
			
 
				+the same way as in the ``user`` security level.
			
 
				+
			
 
				+If the user wants to use secure connect, the first time the device is
			
 
				+plugged in, a key needs to be created and sent to the device::
			
 
				+
			
 
				+  # key=$(openssl rand -hex 32)
			
 
				+  # echo $key > /sys/bus/thunderbolt/devices/0-3/key
			
 
				+  # echo 1 > /sys/bus/thunderbolt/devices/0-3/authorized
			
 
				+
			
 
				+Now the device is connected (PCIe tunnels are created) and in addition
			
 
				+the key is stored on the device NVM.
			
 
				+
			
 
				+Next time the device is plugged in the user can verify (challenge) the
			
 
				+device using the same key::
			
 
				+
			
 
				+  # echo $key > /sys/bus/thunderbolt/devices/0-3/key
			
 
				+  # echo 2 > /sys/bus/thunderbolt/devices/0-3/authorized
			
 
				+
			
 
				+If the challenge the device returns back matches the one we expect based
			
 
				+on the key, the device is connected and the PCIe tunnels are created.
			
 
				+However, if the challenge fails, no tunnels are created and an error is
			
 
				+returned to the user.
			
 
				+
			
 
				+If the user still wants to connect the device it can either approve
			
 
				+the device without a key or write new key and write 1 to the
			
 
				+``authorized`` file to get the new key stored on the device NVM.
			
 
				+
			
 
				+Upgrading NVM on Thunderbolt device or host
			
 
				+-------------------------------------------
			
 
				+Since most of the functionality is handled in a firmware running on a
			
 
				+host controller or a device, it is important that the firmware can be
			
 
				+upgraded to the latest where possible bugs in it have been fixed.
			
 
				+Typically OEMs provide this firmware from their support site.
			
 
				+
			
 
				+There is also a central site which has links where to download firmwares
			
 
				+for some machines:
			
 
				+
			
 
				+  `Thunderbolt Updates <https://thunderbolttechnology.net/updates>`_
			
 
				+
			
 
				+Before you upgrade firmware on a device or host, please make sure it is
			
 
				+a suitable upgrade. Failing to do that may render the device (or host) in a
			
 
				+state where it cannot be used properly anymore without special tools!
			
 
				+
			
 
				+Host NVM upgrade on Apple Macs is not supported.
			
 
				+
			
 
				+Once the NVM image has been downloaded, you need to plug in a
			
 
				+Thunderbolt device so that the host controller appears. It does not
			
 
				+matter which device is connected (unless you are upgrading NVM on a
			
 
				+device - then you need to connect that particular device).
			
 
				+
			
 
				+Note OEM-specific method to power the controller up ("force power") may
			
 
				+be available for your system in which case there is no need to plug in a
			
 
				+Thunderbolt device.
			
 
				+
			
 
				+After that we can write the firmware to the non-active parts of the NVM
			
 
				+of the host or device. As an example here is how Intel NUC6i7KYK (Skull
			
 
				+Canyon) Thunderbolt controller NVM is upgraded::
			
 
				+
			
 
				+  # dd if=KYK_TBT_FW_0018.bin of=/sys/bus/thunderbolt/devices/0-0/nvm_non_active0/nvmem
			
 
				+
			
 
				+Once the operation completes we can trigger NVM authentication and
			
 
				+upgrade process as follows::
			
 
				+
			
 
				+  # echo 1 > /sys/bus/thunderbolt/devices/0-0/nvm_authenticate
			
 
				+
			
 
				+If no errors are returned, the host controller shortly disappears. Once
			
 
				+it comes back the driver notices it and initiates a full power cycle.
			
 
				+After a while the host controller appears again and this time it should
			
 
				+be fully functional.
			
 
				+
			
 
				+We can verify that the new NVM firmware is active by running following
			
 
				+commands::
			
 
				+
			
 
				+  # cat /sys/bus/thunderbolt/devices/0-0/nvm_authenticate
			
 
				+  0x0
			
 
				+  # cat /sys/bus/thunderbolt/devices/0-0/nvm_version
			
 
				+  18.0
			
 
				+
			
 
				+If ``nvm_authenticate`` contains anything other than 0x0 it is the error
			
 
				+code from the last authentication cycle, which means the authentication
			
 
				+of the NVM image failed.
			
 
				+
			
 
				+Note names of the NVMem devices ``nvm_activeN`` and ``nvm_non_activeN``
			
 
				+depend on the order they are registered in the NVMem subsystem. N in
			
 
				+the name is the identifier added by the NVMem subsystem.
			
 
				+
			
 
				+Upgrading NVM when host controller is in safe mode
			
 
				+--------------------------------------------------
			
 
				+If the existing NVM is not properly authenticated (or is missing) the
			
 
				+host controller goes into safe mode which means that the only available
			
 
				+functionality is flashing a new NVM image. When in this mode, reading
			
 
				+``nvm_version`` fails with ``ENODATA`` and the device identification
			
 
				+information is missing.
			
 
				+
			
 
				+To recover from this mode, one needs to flash a valid NVM image to the
			
 
				+host controller in the same way it is done in the previous chapter.
			
--- a/Documentation/arm/Atmel/README
+++ b/Documentation/arm/Atmel/README
@@ -16,7 +16,7 @@ git branches/tags and email subject always contain this "at91" sub-string.
 
				 
			
 
				 AT91 SoCs
			
 
				 ---------
			
 
				-Documentation and detailled datasheet for each product are available on
			
 
				+Documentation and detailed datasheet for each product are available on
			
 
				 the Atmel website: http://www.atmel.com.
			
 
				 
			
 
				   Flavors:
			
@@ -101,6 +101,42 @@ the Atmel website: http://www.atmel.com.
 
				         + Datasheet
			
 
				           http://www.atmel.com/Images/Atmel-11267-32-bit-Cortex-A5-Microcontroller-SAMA5D2_Datasheet.pdf
			
 
				 
			
 
				+    * ARM Cortex-M7 MCUs
			
 
				+      - sams70 family
			
 
				+        - sams70j19
			
 
				+        - sams70j20
			
 
				+        - sams70j21
			
 
				+        - sams70n19
			
 
				+        - sams70n20
			
 
				+        - sams70n21
			
 
				+        - sams70q19
			
 
				+        - sams70q20
			
 
				+        - sams70q21
			
 
				+        + Datasheet
			
 
				+          http://www.atmel.com/Images/Atmel-11242-32-bit-Cortex-M7-Microcontroller-SAM-S70Q-SAM-S70N-SAM-S70J_Datasheet.pdf
			
 
				+
			
 
				+      - samv70 family
			
 
				+        - samv70j19
			
 
				+        - samv70j20
			
 
				+        - samv70n19
			
 
				+        - samv70n20
			
 
				+        - samv70q19
			
 
				+        - samv70q20
			
 
				+        + Datasheet
			
 
				+          http://www.atmel.com/Images/Atmel-11297-32-bit-Cortex-M7-Microcontroller-SAM-V70Q-SAM-V70N-SAM-V70J_Datasheet.pdf
			
 
				+
			
 
				+      - samv71 family
			
 
				+        - samv71j19
			
 
				+        - samv71j20
			
 
				+        - samv71j21
			
 
				+        - samv71n19
			
 
				+        - samv71n20
			
 
				+        - samv71n21
			
 
				+        - samv71q19
			
 
				+        - samv71q20
			
 
				+        - samv71q21
			
 
				+        + Datasheet
			
 
				+          http://www.atmel.com/Images/Atmel-44003-32-bit-Cortex-M7-Microcontroller-SAM-V71Q-SAM-V71N-SAM-V71J_Datasheet.pdf
			
 
				 
			
 
				 Linux kernel information
			
 
				 ------------------------
			
--- a/Documentation/arm64/silicon-errata.txt
+++ b/Documentation/arm64/silicon-errata.txt
@@ -61,11 +61,15 @@ stable kernels.
 
				 | Cavium         | ThunderX ITS    | #23144          | CAVIUM_ERRATUM_23144        |
			
 
				 | Cavium         | ThunderX GICv3  | #23154          | CAVIUM_ERRATUM_23154        |
			
 
				 | Cavium         | ThunderX Core   | #27456          | CAVIUM_ERRATUM_27456        |
			
 
				+| Cavium         | ThunderX Core   | #30115          | CAVIUM_ERRATUM_30115        |
			
 
				 | Cavium         | ThunderX SMMUv2 | #27704          | N/A                         |
			
 
				+| Cavium         | ThunderX2 SMMUv3| #74             | N/A                         |
			
 
				+| Cavium         | ThunderX2 SMMUv3| #126            | N/A                         |
			
 
				 |                |                 |                 |                             |
			
 
				 | Freescale/NXP  | LS2080A/LS1043A | A-008585        | FSL_ERRATUM_A008585         |
			
 
				 |                |                 |                 |                             |
			
 
				 | Hisilicon      | Hip0{5,6,7}     | #161010101      | HISILICON_ERRATUM_161010101 |
			
 
				+| Hisilicon      | Hip0{6,7}       | #161010701      | N/A                         |
			
 
				 |                |                 |                 |                             |
			
 
				 | Qualcomm Tech. | Falkor v1       | E1003           | QCOM_FALKOR_ERRATUM_1003    |
			
 
				 | Qualcomm Tech. | Falkor v1       | E1009           | QCOM_FALKOR_ERRATUM_1009    |
			
--- a/Documentation/bcache.txt
+++ b/Documentation/bcache.txt
@@ -1,10 +1,15 @@
 
				+============================
			
 
				+A block layer cache (bcache)
			
 
				+============================
			
 
				+
			
 
				 Say you've got a big slow raid 6, and an ssd or three. Wouldn't it be
			
 
				 nice if you could use them as cache... Hence bcache.
			
 
				 
			
 
				 Wiki and git repositories are at:
			
 
				-  http://bcache.evilpiepirate.org
			
 
				-  http://evilpiepirate.org/git/linux-bcache.git
			
 
				-  http://evilpiepirate.org/git/bcache-tools.git
			
 
				+
			
 
				+  - http://bcache.evilpiepirate.org
			
 
				+  - http://evilpiepirate.org/git/linux-bcache.git
			
 
				+  - http://evilpiepirate.org/git/bcache-tools.git
			
 
				 
			
 
				 It's designed around the performance characteristics of SSDs - it only allocates
			
 
				 in erase block sized buckets, and it uses a hybrid btree/log to track cached
			
@@ -37,17 +42,19 @@ to be flushed.
 
				 
			
 
				 Getting started:
			
 
				 You'll need make-bcache from the bcache-tools repository. Both the cache device
			
 
				-and backing device must be formatted before use.
			
 
				+and backing device must be formatted before use::
			
 
				+
			
 
				   make-bcache -B /dev/sdb
			
 
				   make-bcache -C /dev/sdc
			
 
				 
			
 
				 make-bcache has the ability to format multiple devices at the same time - if
			
 
				 you format your backing devices and cache device at the same time, you won't
			
 
				-have to manually attach:
			
 
				+have to manually attach::
			
 
				+
			
 
				   make-bcache -B /dev/sda /dev/sdb -C /dev/sdc
			
 
				 
			
 
				 bcache-tools now ships udev rules, and bcache devices are known to the kernel
			
 
				-immediately.  Without udev, you can manually register devices like this:
			
 
				+immediately.  Without udev, you can manually register devices like this::
			
 
				 
			
 
				   echo /dev/sdb > /sys/fs/bcache/register
			
 
				   echo /dev/sdc > /sys/fs/bcache/register
			
@@ -60,16 +67,16 @@ slow devices as bcache backing devices without a cache, and you can choose to ad
 
				 a caching device later.
			
 
				 See 'ATTACHING' section below.
			
 
				 
			
 
				-The devices show up as:
			
 
				+The devices show up as::
			
 
				 
			
 
				   /dev/bcache<N>
			
 
				 
			
 
				-As well as (with udev):
			
 
				+As well as (with udev)::
			
 
				 
			
 
				   /dev/bcache/by-uuid/<uuid>
			
 
				   /dev/bcache/by-label/<label>
			
 
				 
			
 
				-To get started:
			
 
				+To get started::
			
 
				 
			
 
				   mkfs.ext4 /dev/bcache0
			
 
				   mount /dev/bcache0 /mnt
			
@@ -81,13 +88,13 @@ Cache devices are managed as sets; multiple caches per set isn't supported yet
 
				 but will allow for mirroring of metadata and dirty data in the future. Your new
			
 
				 cache set shows up as /sys/fs/bcache/<UUID>
			
 
				 
			
 
				-ATTACHING
			
 
				+Attaching
			
 
				 ---------
			
 
				 
			
 
				 After your cache device and backing device are registered, the backing device
			
 
				 must be attached to your cache set to enable caching. Attaching a backing
			
 
				 device to a cache set is done thusly, with the UUID of the cache set in
			
 
				-/sys/fs/bcache:
			
 
				+/sys/fs/bcache::
			
 
				 
			
 
				   echo <CSET-UUID> > /sys/block/bcache0/bcache/attach
			
 
				 
			
@@ -97,7 +104,7 @@ your bcache devices. If a backing device has data in a cache somewhere, the
 
				 important if you have writeback caching turned on.
			
 
				 
			
 
				 If you're booting up and your cache device is gone and never coming back, you
			
 
				-can force run the backing device:
			
 
				+can force run the backing device::
			
 
				 
			
 
				   echo 1 > /sys/block/sdb/bcache/running
			
 
				 
			
@@ -110,7 +117,7 @@ but all the cached data will be invalidated. If there was dirty data in the
 
				 cache, don't expect the filesystem to be recoverable - you will have massive
			
 
				 filesystem corruption, though ext4's fsck does work miracles.
			
 
				 
			
 
				-ERROR HANDLING
			
 
				+Error Handling
			
 
				 --------------
			
 
				 
			
 
				 Bcache tries to transparently handle IO errors to/from the cache device without
			
@@ -134,25 +141,27 @@ the backing devices to passthrough mode.
 
				    read some of the dirty data, though.
			
 
				 
			
 
				 
			
 
				-HOWTO/COOKBOOK
			
 
				+Howto/cookbook
			
 
				 --------------
			
 
				 
			
 
				 A) Starting a bcache with a missing caching device
			
 
				 
			
 
				 If registering the backing device doesn't help, it's already there, you just need
			
 
				-to force it to run without the cache:
			
 
				+to force it to run without the cache::
			
 
				+
			
 
				 	host:~# echo /dev/sdb1 > /sys/fs/bcache/register
			
 
				 	[  119.844831] bcache: register_bcache() error opening /dev/sdb1: device already registered
			
 
				 
			
 
				 Next, you try to register your caching device if it's present. However
			
 
				 if it's absent, or registration fails for some reason, you can still
			
 
				-start your bcache without its cache, like so:
			
 
				+start your bcache without its cache, like so::
			
 
				+
			
 
				 	host:/sys/block/sdb/sdb1/bcache# echo 1 > running
			
 
				 
			
 
				 Note that this may cause data loss if you were running in writeback mode.
			
 
				 
			
 
				 
			
 
				-B) Bcache does not find its cache
			
 
				+B) Bcache does not find its cache::
			
 
				 
			
 
				 	host:/sys/block/md5/bcache# echo 0226553a-37cf-41d5-b3ce-8b1e944543a8 > attach
			
 
				 	[ 1933.455082] bcache: bch_cached_dev_attach() Couldn't find uuid for md5 in set
			
@@ -160,7 +169,8 @@ B) Bcache does not find its cache
 
				 	[ 1933.478179] : cache set not found
			
 
				 
			
 
				 In this case, the caching device was simply not registered at boot
			
 
				-or disappeared and came back, and needs to be (re-)registered:
			
 
				+or disappeared and came back, and needs to be (re-)registered::
			
 
				+
			
 
				 	host:/sys/block/md5/bcache# echo /dev/sdh2 > /sys/fs/bcache/register
			
 
				 
			
 
				 
			
@@ -180,7 +190,8 @@ device is still available at an 8KiB offset. So either via a loopdev
 
				 of the backing device created with --offset 8K, or any value defined by
			
 
				 --data-offset when you originally formatted bcache with `make-bcache`.
			
 
				 
			
 
				-For example:
			
 
				+For example::
			
 
				+
			
 
				 	losetup -o 8192 /dev/loop0 /dev/your_bcache_backing_dev
			
 
				 
			
 
				 This should present your unmodified backing device data in /dev/loop0
			
@@ -191,33 +202,38 @@ cache device without loosing data.
 
				 
			
 
				 E) Wiping a cache device
			
 
				 
			
 
				-host:~# wipefs -a /dev/sdh2
			
 
				-16 bytes were erased at offset 0x1018 (bcache)
			
 
				-they were: c6 85 73 f6 4e 1a 45 ca 82 65 f5 7f 48 ba 6d 81
			
 
				+::
			
 
				+
			
 
				+	host:~# wipefs -a /dev/sdh2
			
 
				+	16 bytes were erased at offset 0x1018 (bcache)
			
 
				+	they were: c6 85 73 f6 4e 1a 45 ca 82 65 f5 7f 48 ba 6d 81
			
 
				+
			
 
				+After you boot back with bcache enabled, you recreate the cache and attach it::
			
 
				 
			
 
				-After you boot back with bcache enabled, you recreate the cache and attach it:
			
 
				-host:~# make-bcache -C /dev/sdh2
			
 
				-UUID:                   7be7e175-8f4c-4f99-94b2-9c904d227045
			
 
				-Set UUID:               5bc072a8-ab17-446d-9744-e247949913c1
			
 
				-version:                0
			
 
				-nbuckets:               106874
			
 
				-block_size:             1
			
 
				-bucket_size:            1024
			
 
				-nr_in_set:              1
			
 
				-nr_this_dev:            0
			
 
				-first_bucket:           1
			
 
				-[  650.511912] bcache: run_cache_set() invalidating existing data
			
 
				-[  650.549228] bcache: register_cache() registered cache device sdh2
			
 
				+	host:~# make-bcache -C /dev/sdh2
			
 
				+	UUID:                   7be7e175-8f4c-4f99-94b2-9c904d227045
			
 
				+	Set UUID:               5bc072a8-ab17-446d-9744-e247949913c1
			
 
				+	version:                0
			
 
				+	nbuckets:               106874
			
 
				+	block_size:             1
			
 
				+	bucket_size:            1024
			
 
				+	nr_in_set:              1
			
 
				+	nr_this_dev:            0
			
 
				+	first_bucket:           1
			
 
				+	[  650.511912] bcache: run_cache_set() invalidating existing data
			
 
				+	[  650.549228] bcache: register_cache() registered cache device sdh2
			
 
				 
			
 
				-start backing device with missing cache:
			
 
				-host:/sys/block/md5/bcache# echo 1 > running
			
 
				+start backing device with missing cache::
			
 
				 
			
 
				-attach new cache:
			
 
				-host:/sys/block/md5/bcache# echo 5bc072a8-ab17-446d-9744-e247949913c1 > attach
			
 
				-[  865.276616] bcache: bch_cached_dev_attach() Caching md5 as bcache0 on set 5bc072a8-ab17-446d-9744-e247949913c1
			
 
				+	host:/sys/block/md5/bcache# echo 1 > running
			
 
				 
			
 
				+attach new cache::
			
 
				 
			
 
				-F) Remove or replace a caching device
			
 
				+	host:/sys/block/md5/bcache# echo 5bc072a8-ab17-446d-9744-e247949913c1 > attach
			
 
				+	[  865.276616] bcache: bch_cached_dev_attach() Caching md5 as bcache0 on set 5bc072a8-ab17-446d-9744-e247949913c1
			
 
				+
			
 
				+
			
 
				+F) Remove or replace a caching device::
			
 
				 
			
 
				 	host:/sys/block/sda/sda7/bcache# echo 1 > detach
			
 
				 	[  695.872542] bcache: cached_dev_detach_finish() Caching disabled for sda7
			
@@ -226,13 +242,15 @@ F) Remove or replace a caching device
 
				 	wipefs: error: /dev/nvme0n1p4: probing initialization failed: Device or resource busy
			
 
				 	Ooops, it's disabled, but not unregistered, so it's still protected
			
 
				 
			
 
				-We need to go and unregister it:
			
 
				+We need to go and unregister it::
			
 
				+
			
 
				 	host:/sys/fs/bcache/b7ba27a1-2398-4649-8ae3-0959f57ba128# ls -l cache0
			
 
				 	lrwxrwxrwx 1 root root 0 Feb 25 18:33 cache0 -> ../../../devices/pci0000:00/0000:00:1d.0/0000:70:00.0/nvme/nvme0/nvme0n1/nvme0n1p4/bcache/
			
 
				 	host:/sys/fs/bcache/b7ba27a1-2398-4649-8ae3-0959f57ba128# echo 1 > stop
			
 
				 	kernel: [  917.041908] bcache: cache_set_free() Cache set b7ba27a1-2398-4649-8ae3-0959f57ba128 unregistered
			
 
				 
			
 
				-Now we can wipe it:
			
 
				+Now we can wipe it::
			
 
				+
			
 
				 	host:~# wipefs -a /dev/nvme0n1p4
			
 
				 	/dev/nvme0n1p4: 16 bytes were erased at offset 0x00001018 (bcache): c6 85 73 f6 4e 1a 45 ca 82 65 f5 7f 48 ba 6d 81
			
 
				 
			
@@ -252,40 +270,44 @@ if there are any active backing or caching devices left on it:
 
				 
			
 
				 1) Is it present in /dev/bcache* ? (there are times where it won't be)
			
 
				 
			
 
				-If so, it's easy:
			
 
				+   If so, it's easy::
			
 
				+
			
 
				 	host:/sys/block/bcache0/bcache# echo 1 > stop
			
 
				 
			
 
				-2) But if your backing device is gone, this won't work:
			
 
				+2) But if your backing device is gone, this won't work::
			
 
				+
			
 
				 	host:/sys/block/bcache0# cd bcache
			
 
				 	bash: cd: bcache: No such file or directory
			
 
				 
			
 
				-In this case, you may have to unregister the dmcrypt block device that
			
 
				-references this bcache to free it up:
			
 
				+   In this case, you may have to unregister the dmcrypt block device that
			
 
				+   references this bcache to free it up::
			
 
				+
			
 
				 	host:~# dmsetup remove oldds1
			
 
				 	bcache: bcache_device_free() bcache0 stopped
			
 
				 	bcache: cache_set_free() Cache set 5bc072a8-ab17-446d-9744-e247949913c1 unregistered
			
 
				 
			
 
				-This causes the backing bcache to be removed from /sys/fs/bcache and
			
 
				-then it can be reused.  This would be true of any block device stacking
			
 
				-where bcache is a lower device.
			
 
				+   This causes the backing bcache to be removed from /sys/fs/bcache and
			
 
				+   then it can be reused.  This would be true of any block device stacking
			
 
				+   where bcache is a lower device.
			
 
				+
			
 
				+3) In other cases, you can also look in /sys/fs/bcache/::
			
 
				 
			
 
				-3) In other cases, you can also look in /sys/fs/bcache/:
			
 
				+	host:/sys/fs/bcache# ls -l */{cache?,bdev?}
			
 
				+	lrwxrwxrwx 1 root root 0 Mar  5 09:39 0226553a-37cf-41d5-b3ce-8b1e944543a8/bdev1 -> ../../../devices/virtual/block/dm-1/bcache/
			
 
				+	lrwxrwxrwx 1 root root 0 Mar  5 09:39 0226553a-37cf-41d5-b3ce-8b1e944543a8/cache0 -> ../../../devices/virtual/block/dm-4/bcache/
			
 
				+	lrwxrwxrwx 1 root root 0 Mar  5 09:39 5bc072a8-ab17-446d-9744-e247949913c1/cache0 -> ../../../devices/pci0000:00/0000:00:01.0/0000:01:00.0/ata10/host9/target9:0:0/9:0:0:0/block/sdl/sdl2/bcache/
			
 
				 
			
 
				-host:/sys/fs/bcache# ls -l */{cache?,bdev?}
			
 
				-lrwxrwxrwx 1 root root 0 Mar  5 09:39 0226553a-37cf-41d5-b3ce-8b1e944543a8/bdev1 -> ../../../devices/virtual/block/dm-1/bcache/
			
 
				-lrwxrwxrwx 1 root root 0 Mar  5 09:39 0226553a-37cf-41d5-b3ce-8b1e944543a8/cache0 -> ../../../devices/virtual/block/dm-4/bcache/
			
 
				-lrwxrwxrwx 1 root root 0 Mar  5 09:39 5bc072a8-ab17-446d-9744-e247949913c1/cache0 -> ../../../devices/pci0000:00/0000:00:01.0/0000:01:00.0/ata10/host9/target9:0:0/9:0:0:0/block/sdl/sdl2/bcache/
			
 
				+   The device names will show which UUID is relevant, cd in that directory
			
 
				+   and stop the cache::
			
 
				 
			
 
				-The device names will show which UUID is relevant, cd in that directory
			
 
				-and stop the cache:
			
 
				 	host:/sys/fs/bcache/5bc072a8-ab17-446d-9744-e247949913c1# echo 1 > stop
			
 
				 
			
 
				-This will free up bcache references and let you reuse the partition for
			
 
				-other purposes.
			
 
				+   This will free up bcache references and let you reuse the partition for
			
 
				+   other purposes.
			
 
				 
			
 
				 
			
 
				 
			
 
				-TROUBLESHOOTING PERFORMANCE
			
 
				+Troubleshooting performance
			
 
				 ---------------------------
			
 
				 
			
 
				 Bcache has a bunch of config options and tunables. The defaults are intended to
			
@@ -301,11 +323,13 @@ want for getting the best possible numbers when benchmarking.
 
				    raid stripe size to get the disk multiples that you would like.
			
 
				 
			
 
				    For example:  If you have a 64k stripe size, then the following offset
			
 
				-   would provide alignment for many common RAID5 data spindle counts:
			
 
				+   would provide alignment for many common RAID5 data spindle counts::
			
 
				+
			
 
				 	64k * 2*2*2*3*3*5*7 bytes = 161280k
			
 
				 
			
 
				    That space is wasted, but for only 157.5MB you can grow your RAID 5
			
 
				-   volume to the following data-spindle counts without re-aligning:
			
 
				+   volume to the following data-spindle counts without re-aligning::
			
 
				+
			
 
				 	3,4,5,6,7,8,9,10,12,14,15,18,20,21 ...
			
 
				 
			
 
				  - Bad write performance
			
@@ -313,9 +337,9 @@ want for getting the best possible numbers when benchmarking.
 
				    If write performance is not what you expected, you probably wanted to be
			
 
				    running in writeback mode, which isn't the default (not due to a lack of
			
 
				    maturity, but simply because in writeback mode you'll lose data if something
			
 
				-   happens to your SSD)
			
 
				+   happens to your SSD)::
			
 
				 
			
 
				-   # echo writeback > /sys/block/bcache0/bcache/cache_mode
			
 
				+	# echo writeback > /sys/block/bcache0/bcache/cache_mode
			
 
				 
			
 
				  - Bad performance, or traffic not going to the SSD that you'd expect
			
 
				 
			
@@ -325,13 +349,13 @@ want for getting the best possible numbers when benchmarking.
 
				    accessed data out of your cache.
			
 
				 
			
 
				    But if you want to benchmark reads from cache, and you start out with fio
			
 
				-   writing an 8 gigabyte test file - so you want to disable that.
			
 
				+   writing an 8 gigabyte test file - so you want to disable that::
			
 
				 
			
 
				-   # echo 0 > /sys/block/bcache0/bcache/sequential_cutoff
			
 
				+	# echo 0 > /sys/block/bcache0/bcache/sequential_cutoff
			
 
				 
			
 
				-   To set it back to the default (4 mb), do
			
 
				+   To set it back to the default (4 mb), do::
			
 
				 
			
 
				-   # echo 4M > /sys/block/bcache0/bcache/sequential_cutoff
			
 
				+	# echo 4M > /sys/block/bcache0/bcache/sequential_cutoff
			
 
				 
			
 
				  - Traffic's still going to the spindle/still getting cache misses
			
 
				 
			
@@ -344,10 +368,10 @@ want for getting the best possible numbers when benchmarking.
 
				    throttles traffic if the latency exceeds a threshold (it does this by
			
 
				    cranking down the sequential bypass).
			
 
				 
			
 
				-   You can disable this if you need to by setting the thresholds to 0:
			
 
				+   You can disable this if you need to by setting the thresholds to 0::
			
 
				 
			
 
				-   # echo 0 > /sys/fs/bcache/<cache set>/congested_read_threshold_us
			
 
				-   # echo 0 > /sys/fs/bcache/<cache set>/congested_write_threshold_us
			
 
				+	# echo 0 > /sys/fs/bcache/<cache set>/congested_read_threshold_us
			
 
				+	# echo 0 > /sys/fs/bcache/<cache set>/congested_write_threshold_us
			
 
				 
			
 
				    The default is 2000 us (2 milliseconds) for reads, and 20000 for writes.
			
 
				 
			
@@ -369,7 +393,7 @@ want for getting the best possible numbers when benchmarking.
 
				    a fix for the issue there).
			
 
				 
			
 
				 
			
 
				-SYSFS - BACKING DEVICE
			
 
				+Sysfs - backing device
			
 
				 ----------------------
			
 
				 
			
 
				 Available at /sys/block/<bdev>/bcache, /sys/block/bcache*/bcache and
			
@@ -454,7 +478,8 @@ writeback_running
 
				   still be added to the cache until it is mostly full; only meant for
			
 
				   benchmarking. Defaults to on.
			
 
				 
			
 
				-SYSFS - BACKING DEVICE STATS:
			
 
				+Sysfs - backing device stats
			
 
				+~~~~~~~~~~~~~~~~~~~~~~~~~~~~
			
 
				 
			
 
				 There are directories with these numbers for a running total, as well as
			
 
				 versions that decay over the past day, hour and 5 minutes; they're also
			
@@ -463,14 +488,11 @@ aggregated in the cache set directory as well.
 
				 bypassed
			
 
				   Amount of IO (both reads and writes) that has bypassed the cache
			
 
				 
			
 
				-cache_hits
			
 
				-cache_misses
			
 
				-cache_hit_ratio
			
 
				+cache_hits, cache_misses, cache_hit_ratio
			
 
				   Hits and misses are counted per individual IO as bcache sees them; a
			
 
				   partial hit is counted as a miss.
			
 
				 
			
 
				-cache_bypass_hits
			
 
				-cache_bypass_misses
			
 
				+cache_bypass_hits, cache_bypass_misses
			
 
				   Hits and misses for IO that is intended to skip the cache are still counted,
			
 
				   but broken out here.
			
 
				 
			
@@ -482,7 +504,8 @@ cache_miss_collisions
 
				 cache_readaheads
			
 
				   Count of times readahead occurred.
			
 
				 
			
 
				-SYSFS - CACHE SET:
			
 
				+Sysfs - cache set
			
 
				+~~~~~~~~~~~~~~~~~
			
 
				 
			
 
				 Available at /sys/fs/bcache/<cset-uuid>
			
 
				 
			
@@ -520,8 +543,7 @@ flash_vol_create
 
				   Echoing a size to this file (in human readable units, k/M/G) creates a thinly
			
 
				   provisioned volume backed by the cache set.
			
 
				 
			
 
				-io_error_halflife
			
 
				-io_error_limit
			
 
				+io_error_halflife, io_error_limit
			
 
				   These determines how many errors we accept before disabling the cache.
			
 
				   Each error is decayed by the half life (in # ios).  If the decaying count
			
 
				   reaches io_error_limit dirty data is written out and the cache is disabled.
			
@@ -545,7 +567,8 @@ unregister
 
				   Detaches all backing devices and closes the cache devices; if dirty data is
			
 
				   present it will disable writeback caching and wait for it to be flushed.
			
 
				 
			
 
				-SYSFS - CACHE SET INTERNAL:
			
 
				+Sysfs - cache set internal
			
 
				+~~~~~~~~~~~~~~~~~~~~~~~~~~
			
 
				 
			
 
				 This directory also exposes timings for a number of internal operations, with
			
 
				 separate files for average duration, average frequency, last occurrence and max
			
@@ -574,7 +597,8 @@ cache_read_races
 
				 trigger_gc
			
 
				   Writing to this file forces garbage collection to run.
			
 
				 
			
 
				-SYSFS - CACHE DEVICE:
			
 
				+Sysfs - Cache device
			
 
				+~~~~~~~~~~~~~~~~~~~~
			
 
				 
			
 
				 Available at /sys/block/<cdev>/bcache
			
 
				 
			
--- a/Documentation/block/biodoc.txt
+++ b/Documentation/block/biodoc.txt
@@ -632,7 +632,7 @@ to i/o submission, if the bio fields are likely to be accessed after the
 
				 i/o is issued (since the bio may otherwise get freed in case i/o completion
			
 
				 happens in the meantime).
			
 
				 
			
 
				-The bio_clone() routine may be used to duplicate a bio, where the clone
			
 
				+The bio_clone_fast() routine may be used to duplicate a bio, where the clone
			
 
				 shares the bio_vec_list with the original bio (i.e. both point to the
			
 
				 same bio_vec_list). This would typically be used for splitting i/o requests
			
 
				 in lvm or md.
			
--- a/Documentation/block/data-integrity.txt
+++ b/Documentation/block/data-integrity.txt
@@ -192,7 +192,7 @@ will require extra work due to the application tag.
 
				     supported by the block device.
			
 
				 
			
 
				 
			
 
				-    int bio_integrity_prep(bio);
			
 
				+    bool bio_integrity_prep(bio);
			
 
				 
			
 
				       To generate IMD for WRITE and to set up buffers for READ, the
			
 
				       filesystem must call bio_integrity_prep(bio).
			
@@ -201,9 +201,7 @@ will require extra work due to the application tag.
 
				       sector must be set, and the bio should have all data pages
			
 
				       added.  It is up to the caller to ensure that the bio does not
			
 
				       change while I/O is in progress.
			
 
				-
			
 
				-      bio_integrity_prep() should only be called if
			
 
				-      bio_integrity_enabled() returned 1.
			
 
				+      Complete bio with error if prepare failed for some reason.
			
 
				 
			
 
				 
			
 
				 5.3 PASSING EXISTING INTEGRITY METADATA
			
--- a/Documentation/bt8xxgpio.txt
+++ b/Documentation/bt8xxgpio.txt
@@ -1,12 +1,8 @@
 
				-===============================================================
			
 
				-==  BT8XXGPIO driver                                         ==
			
 
				-==                                                           ==
			
 
				-==  A driver for a selfmade cheap BT8xx based PCI GPIO-card  ==
			
 
				-==                                                           ==
			
 
				-==  For advanced documentation, see                          ==
			
 
				-==  http://www.bu3sch.de/btgpio.php                          ==
			
 
				-===============================================================
			
 
				+===================================================================
			
 
				+A driver for a selfmade cheap BT8xx based PCI GPIO-card (bt8xxgpio)
			
 
				+===================================================================
			
 
				 
			
 
				+For advanced documentation, see http://www.bu3sch.de/btgpio.php
			
 
				 
			
 
				 A generic digital 24-port PCI GPIO card can be built out of an ordinary
			
 
				 Brooktree bt848, bt849, bt878 or bt879 based analog TV tuner card. The
			
@@ -17,9 +13,8 @@ The bt8xx chip does have 24 digital GPIO ports.
 
				 These ports are accessible via 24 pins on the SMD chip package.
			
 
				 
			
 
				 
			
 
				-==============================================
			
 
				-==  How to physically access the GPIO pins  ==
			
 
				-==============================================
			
 
				+How to physically access the GPIO pins
			
 
				+======================================
			
 
				 
			
 
				 The are several ways to access these pins. One might unsolder the whole chip
			
 
				 and put it on a custom PCI board, or one might only unsolder each individual
			
@@ -27,7 +22,7 @@ GPIO pin and solder that to some tiny wire. As the chip package really is tiny
 
				 there are some advanced soldering skills needed in any case.
			
 
				 
			
 
				 The physical pinouts are drawn in the following ASCII art.
			
 
				-The GPIO pins are marked with G00-G23
			
 
				+The GPIO pins are marked with G00-G23::
			
 
				 
			
 
				                                            G G G G G G G G G G G G     G G G G G G
			
 
				                                            0 0 0 0 0 0 0 0 0 0 1 1     1 1 1 1 1 1
			
--- a/Documentation/btmrvl.txt
+++ b/Documentation/btmrvl.txt
@@ -1,18 +1,16 @@
 
				-=======================================================================
			
 
				-		README for btmrvl driver
			
 
				-=======================================================================
			
 
				-
			
 
				+=============
			
 
				+btmrvl driver
			
 
				+=============
			
 
				 
			
 
				 All commands are used via debugfs interface.
			
 
				 
			
 
				-=====================
			
 
				-Set/get driver configurations:
			
 
				+Set/get driver configurations
			
 
				+=============================
			
 
				 
			
 
				 Path:	/debug/btmrvl/config/
			
 
				 
			
 
				-gpiogap=[n]
			
 
				-hscfgcmd
			
 
				-	These commands are used to configure the host sleep parameters.
			
 
				+gpiogap=[n], hscfgcmd
			
 
				+	These commands are used to configure the host sleep parameters::
			
 
				 	bit 8:0  -- Gap
			
 
				 	bit 16:8 -- GPIO
			
 
				 
			
@@ -23,7 +21,8 @@ hscfgcmd
 
				 	where Gap is the gap in milli seconds between wakeup signal and
			
 
				 	wakeup event, or 0xff for special host sleep setting.
			
 
				 
			
 
				-	Usage:
			
 
				+	Usage::
			
 
				+
			
 
				 		# Use SDIO interface to wake up the host and set GAP to 0x80:
			
 
				 		echo 0xff80 > /debug/btmrvl/config/gpiogap
			
 
				 		echo 1 > /debug/btmrvl/config/hscfgcmd
			
@@ -32,15 +31,16 @@ hscfgcmd
 
				 		echo 0x03ff >  /debug/btmrvl/config/gpiogap
			
 
				 		echo 1 > /debug/btmrvl/config/hscfgcmd
			
 
				 
			
 
				-psmode=[n]
			
 
				-pscmd
			
 
				+psmode=[n], pscmd
			
 
				 	These commands are used to enable/disable auto sleep mode
			
 
				 
			
 
				-	where the option is:
			
 
				+	where the option is::
			
 
				+
			
 
				 			1 	-- Enable auto sleep mode
			
 
				 			0 	-- Disable auto sleep mode
			
 
				 
			
 
				-	Usage:
			
 
				+	Usage::
			
 
				+
			
 
				 		# Enable auto sleep mode
			
 
				 		echo 1 > /debug/btmrvl/config/psmode
			
 
				 		echo 1 > /debug/btmrvl/config/pscmd
			
@@ -50,15 +50,16 @@ pscmd
 
				 		echo 1 > /debug/btmrvl/config/pscmd
			
 
				 
			
 
				 
			
 
				-hsmode=[n]
			
 
				-hscmd
			
 
				+hsmode=[n], hscmd
			
 
				 	These commands are used to enable host sleep or wake up firmware
			
 
				 
			
 
				-	where the option is:
			
 
				+	where the option is::
			
 
				+
			
 
				 			1	-- Enable host sleep
			
 
				 			0	-- Wake up firmware
			
 
				 
			
 
				-	Usage:
			
 
				+	Usage::
			
 
				+
			
 
				 		# Enable host sleep
			
 
				 		echo 1 > /debug/btmrvl/config/hsmode
			
 
				 		echo 1 > /debug/btmrvl/config/hscmd
			
@@ -68,12 +69,13 @@ hscmd
 
				 		echo 1 > /debug/btmrvl/config/hscmd
			
 
				 
			
 
				 
			
 
				-======================
			
 
				-Get driver status:
			
 
				+Get driver status
			
 
				+=================
			
 
				 
			
 
				 Path:	/debug/btmrvl/status/
			
 
				 
			
 
				-Usage:
			
 
				+Usage::
			
 
				+
			
 
				 	cat /debug/btmrvl/status/<args>
			
 
				 
			
 
				 where the args are:
			
@@ -90,14 +92,17 @@ hsstate
 
				 txdnldrdy
			
 
				 	This command displays the value of Tx download ready flag.
			
 
				 
			
 
				-
			
 
				-=====================
			
 
				+Issuing a raw hci command
			
 
				+=========================
			
 
				 
			
 
				 Use hcitool to issue raw hci command, refer to hcitool manual
			
 
				 
			
 
				-	Usage: Hcitool cmd <ogf> <ocf> [Parameters]
			
 
				+Usage::
			
 
				+
			
 
				+	Hcitool cmd <ogf> <ocf> [Parameters]
			
 
				+
			
 
				+Interface Control Command::
			
 
				 
			
 
				-	Interface Control Command
			
 
				 	hcitool cmd 0x3f 0x5b 0xf5 0x01 0x00    --Enable All interface
			
 
				 	hcitool cmd 0x3f 0x5b 0xf5 0x01 0x01    --Enable Wlan interface
			
 
				 	hcitool cmd 0x3f 0x5b 0xf5 0x01 0x02    --Enable BT interface
			
@@ -105,13 +110,13 @@ Use hcitool to issue raw hci command, refer to hcitool manual
 
				 	hcitool cmd 0x3f 0x5b 0xf5 0x00 0x01    --Disable Wlan interface
			
 
				 	hcitool cmd 0x3f 0x5b 0xf5 0x00 0x02    --Disable BT interface
			
 
				 
			
 
				-=======================================================================
			
 
				-
			
 
				+SD8688 firmware
			
 
				+===============
			
 
				 
			
 
				-SD8688 firmware:
			
 
				+Images:
			
 
				 
			
 
				-/lib/firmware/sd8688_helper.bin
			
 
				-/lib/firmware/sd8688.bin
			
 
				+- /lib/firmware/sd8688_helper.bin
			
 
				+- /lib/firmware/sd8688.bin
			
 
				 
			
 
				 
			
 
				 The images can be downloaded from:
			
--- a/Documentation/bus-virt-phys-mapping.txt
+++ b/Documentation/bus-virt-phys-mapping.txt
@@ -1,17 +1,27 @@
 
				-[ NOTE: The virt_to_bus() and bus_to_virt() functions have been
			
 
				+==========================================================
			
 
				+How to access I/O mapped memory from within device drivers
			
 
				+==========================================================
			
 
				+
			
 
				+:Author: Linus
			
 
				+
			
 
				+.. warning::
			
 
				+
			
 
				+	The virt_to_bus() and bus_to_virt() functions have been
			
 
				 	superseded by the functionality provided by the PCI DMA interface
			
 
				 	(see Documentation/DMA-API-HOWTO.txt).  They continue
			
 
				 	to be documented below for historical purposes, but new code
			
 
				-	must not use them. --davidm 00/12/12 ]
			
 
				+	must not use them. --davidm 00/12/12
			
 
				 
			
 
				-[ This is a mail message in response to a query on IO mapping, thus the
			
 
				-  strange format for a "document" ]
			
 
				+::
			
 
				+
			
 
				+  [ This is a mail message in response to a query on IO mapping, thus the
			
 
				+    strange format for a "document" ]
			
 
				 
			
 
				 The AHA-1542 is a bus-master device, and your patch makes the driver give the
			
 
				 controller the physical address of the buffers, which is correct on x86
			
 
				 (because all bus master devices see the physical memory mappings directly). 
			
 
				 
			
 
				-However, on many setups, there are actually _three_ different ways of looking
			
 
				+However, on many setups, there are actually **three** different ways of looking
			
 
				 at memory addresses, and in this case we actually want the third, the
			
 
				 so-called "bus address". 
			
 
				 
			
@@ -38,7 +48,7 @@ because the memory and the devices share the same address space, and that is
 
				 not generally necessarily true on other PCI/ISA setups. 
			
 
				 
			
 
				 Now, just as an example, on the PReP (PowerPC Reference Platform), the 
			
 
				-CPU sees a memory map something like this (this is from memory):
			
 
				+CPU sees a memory map something like this (this is from memory)::
			
 
				 
			
 
				 	0-2 GB		"real memory"
			
 
				 	2 GB-3 GB	"system IO" (inb/out and similar accesses on x86)
			
@@ -52,7 +62,7 @@ So when the CPU wants any bus master to write to physical memory 0, it
 
				 has to give the master address 0x80000000 as the memory address.
			
 
				 
			
 
				 So, for example, depending on how the kernel is actually mapped on the 
			
 
				-PPC, you can end up with a setup like this:
			
 
				+PPC, you can end up with a setup like this::
			
 
				 
			
 
				  physical address:	0
			
 
				  virtual address:	0xC0000000
			
@@ -61,7 +71,7 @@ PPC, you can end up with a setup like this:
 
				 where all the addresses actually point to the same thing.  It's just seen 
			
 
				 through different translations..
			
 
				 
			
 
				-Similarly, on the Alpha, the normal translation is
			
 
				+Similarly, on the Alpha, the normal translation is::
			
 
				 
			
 
				  physical address:	0
			
 
				  virtual address:	0xfffffc0000000000
			
@@ -70,7 +80,7 @@ Similarly, on the Alpha, the normal translation is
 
				 (but there are also Alphas where the physical address and the bus address
			
 
				 are the same). 
			
 
				 
			
 
				-Anyway, the way to look up all these translations, you do
			
 
				+Anyway, the way to look up all these translations, you do::
			
 
				 
			
 
				 	#include <asm/io.h>
			
 
				 
			
@@ -81,8 +91,8 @@ Anyway, the way to look up all these translations, you do
 
				 
			
 
				 Now, when do you need these?
			
 
				 
			
 
				-You want the _virtual_ address when you are actually going to access that 
			
 
				-pointer from the kernel. So you can have something like this:
			
 
				+You want the **virtual** address when you are actually going to access that
			
 
				+pointer from the kernel. So you can have something like this::
			
 
				 
			
 
				 	/*
			
 
				 	 * this is the hardware "mailbox" we use to communicate with
			
@@ -104,7 +114,7 @@ pointer from the kernel. So you can have something like this:
 
				 				...
			
 
				 
			
 
				 on the other hand, you want the bus address when you have a buffer that 
			
 
				-you want to give to the controller:
			
 
				+you want to give to the controller::
			
 
				 
			
 
				 	/* ask the controller to read the sense status into "sense_buffer" */
			
 
				 	mbox.bufstart = virt_to_bus(&sense_buffer);
			
@@ -112,7 +122,7 @@ you want to give to the controller:
 
				 	mbox.status = 0;
			
 
				 	notify_controller(&mbox);
			
 
				 
			
 
				-And you generally _never_ want to use the physical address, because you can't
			
 
				+And you generally **never** want to use the physical address, because you can't
			
 
				 use that from the CPU (the CPU only uses translated virtual addresses), and
			
 
				 you can't use it from the bus master. 
			
 
				 
			
@@ -124,8 +134,10 @@ be remapped as measured in units of pages, a.k.a. the pfn (the memory
 
				 management layer doesn't know about devices outside the CPU, so it
			
 
				 shouldn't need to know about "bus addresses" etc).
			
 
				 
			
 
				-NOTE NOTE NOTE! The above is only one part of the whole equation. The above
			
 
				-only talks about "real memory", that is, CPU memory (RAM). 
			
 
				+.. note::
			
 
				+
			
 
				+	The above is only one part of the whole equation. The above
			
 
				+	only talks about "real memory", that is, CPU memory (RAM).
			
 
				 
			
 
				 There is a completely different type of memory too, and that's the "shared
			
 
				 memory" on the PCI or ISA bus. That's generally not RAM (although in the case
			
@@ -137,20 +149,22 @@ whatever, and there is only one way to access it: the readb/writeb and
 
				 related functions. You should never take the address of such memory, because
			
 
				 there is really nothing you can do with such an address: it's not
			
 
				 conceptually in the same memory space as "real memory" at all, so you cannot
			
 
				-just dereference a pointer. (Sadly, on x86 it _is_ in the same memory space,
			
 
				+just dereference a pointer. (Sadly, on x86 it **is** in the same memory space,
			
 
				 so on x86 it actually works to just deference a pointer, but it's not
			
 
				 portable). 
			
 
				 
			
 
				-For such memory, you can do things like
			
 
				+For such memory, you can do things like:
			
 
				+
			
 
				+ - reading::
			
 
				 
			
 
				- - reading:
			
 
				 	/*
			
 
				 	 * read first 32 bits from ISA memory at 0xC0000, aka
			
 
				 	 * C000:0000 in DOS terms
			
 
				 	 */
			
 
				 	unsigned int signature = isa_readl(0xC0000);
			
 
				 
			
 
				- - remapping and writing:
			
 
				+ - remapping and writing::
			
 
				+
			
 
				 	/*
			
 
				 	 * remap framebuffer PCI memory area at 0xFC000000,
			
 
				 	 * size 1MB, so that we can access it: We can directly
			
@@ -165,7 +179,8 @@ For such memory, you can do things like
 
				 	/* unmap when we unload the driver */
			
 
				 	iounmap(baseptr);
			
 
				 
			
 
				- - copying and clearing:
			
 
				+ - copying and clearing::
			
 
				+
			
 
				 	/* get the 6-byte Ethernet address at ISA address E000:0040 */
			
 
				 	memcpy_fromio(kernel_buffer, 0xE0040, 6);
			
 
				 	/* write a packet to the driver */
			
@@ -181,10 +196,10 @@ happy that your driver works ;)
 
				 Note that kernel versions 2.0.x (and earlier) mistakenly called the
			
 
				 ioremap() function "vremap()".  ioremap() is the proper name, but I
			
 
				 didn't think straight when I wrote it originally.  People who have to
			
 
				-support both can do something like:
			
 
				+support both can do something like::
			
 
				  
			
 
				 	/* support old naming silliness */
			
 
				-	#if LINUX_VERSION_CODE < 0x020100                                     
			
 
				+	#if LINUX_VERSION_CODE < 0x020100
			
 
				 	#define ioremap vremap
			
 
				 	#define iounmap vfree                                                     
			
 
				 	#endif
			
@@ -196,13 +211,10 @@ And the above sounds worse than it really is.  Most real drivers really
 
				 don't do all that complex things (or rather: the complexity is not so
			
 
				 much in the actual IO accesses as in error handling and timeouts etc). 
			
 
				 It's generally not hard to fix drivers, and in many cases the code
			
 
				-actually looks better afterwards:
			
 
				+actually looks better afterwards::
			
 
				 
			
 
				 	unsigned long signature = *(unsigned int *) 0xC0000;
			
 
				 		vs
			
 
				 	unsigned long signature = readl(0xC0000);
			
 
				 
			
 
				 I think the second version actually is more readable, no?
			
 
				-
			
 
				-		Linus
			
 
				-
			
--- a/Documentation/cachetlb.txt
+++ b/Documentation/cachetlb.txt
@@ -1,7 +1,8 @@
 
				-		Cache and TLB Flushing
			
 
				-		     Under Linux
			
 
				+==================================
			
 
				+Cache and TLB Flushing Under Linux
			
 
				+==================================
			
 
				 
			
 
				-	    David S. Miller <davem@redhat.com>
			
 
				+:Author: David S. Miller <davem@redhat.com>
			
 
				 
			
 
				 This document describes the cache/tlb flushing interfaces called
			
 
				 by the Linux VM subsystem.  It enumerates over each interface,
			
@@ -28,7 +29,7 @@ Therefore when software page table changes occur, the kernel will
 
				 invoke one of the following flush methods _after_ the page table
			
 
				 changes occur:
			
 
				 
			
 
				-1) void flush_tlb_all(void)
			
 
				+1) ``void flush_tlb_all(void)``
			
 
				 
			
 
				 	The most severe flush of all.  After this interface runs,
			
 
				 	any previous page table modification whatsoever will be
			
@@ -37,7 +38,7 @@ changes occur:
 
				 	This is usually invoked when the kernel page tables are
			
 
				 	changed, since such translations are "global" in nature.
			
 
				 
			
 
				-2) void flush_tlb_mm(struct mm_struct *mm)
			
 
				+2) ``void flush_tlb_mm(struct mm_struct *mm)``
			
 
				 
			
 
				 	This interface flushes an entire user address space from
			
 
				 	the TLB.  After running, this interface must make sure that
			
@@ -49,8 +50,8 @@ changes occur:
 
				 	page table operations such as what happens during
			
 
				 	fork, and exec.
			
 
				 
			
 
				-3) void flush_tlb_range(struct vm_area_struct *vma,
			
 
				-			unsigned long start, unsigned long end)
			
 
				+3) ``void flush_tlb_range(struct vm_area_struct *vma,
			
 
				+   unsigned long start, unsigned long end)``
			
 
				 
			
 
				 	Here we are flushing a specific range of (user) virtual
			
 
				 	address translations from the TLB.  After running, this
			
@@ -69,7 +70,7 @@ changes occur:
 
				 	call flush_tlb_page (see below) for each entry which may be
			
 
				 	modified.
			
 
				 
			
 
				-4) void flush_tlb_page(struct vm_area_struct *vma, unsigned long addr)
			
 
				+4) ``void flush_tlb_page(struct vm_area_struct *vma, unsigned long addr)``
			
 
				 
			
 
				 	This time we need to remove the PAGE_SIZE sized translation
			
 
				 	from the TLB.  The 'vma' is the backing structure used by
			
@@ -87,8 +88,8 @@ changes occur:
 
				 
			
 
				 	This is used primarily during fault processing.
			
 
				 
			
 
				-5) void update_mmu_cache(struct vm_area_struct *vma,
			
 
				-			 unsigned long address, pte_t *ptep)
			
 
				+5) ``void update_mmu_cache(struct vm_area_struct *vma,
			
 
				+   unsigned long address, pte_t *ptep)``
			
 
				 
			
 
				 	At the end of every page fault, this routine is invoked to
			
 
				 	tell the architecture specific code that a translation
			
@@ -100,7 +101,7 @@ changes occur:
 
				 	translations for software managed TLB configurations.
			
 
				 	The sparc64 port currently does this.
			
 
				 
			
 
				-6) void tlb_migrate_finish(struct mm_struct *mm)
			
 
				+6) ``void tlb_migrate_finish(struct mm_struct *mm)``
			
 
				 
			
 
				 	This interface is called at the end of an explicit
			
 
				 	process migration. This interface provides a hook
			
@@ -112,7 +113,7 @@ changes occur:
 
				 
			
 
				 Next, we have the cache flushing interfaces.  In general, when Linux
			
 
				 is changing an existing virtual-->physical mapping to a new value,
			
 
				-the sequence will be in one of the following forms:
			
 
				+the sequence will be in one of the following forms::
			
 
				 
			
 
				 	1) flush_cache_mm(mm);
			
 
				 	   change_all_page_tables_of(mm);
			
@@ -143,7 +144,7 @@ and have no dependency on translation information.
 
				 
			
 
				 Here are the routines, one by one:
			
 
				 
			
 
				-1) void flush_cache_mm(struct mm_struct *mm)
			
 
				+1) ``void flush_cache_mm(struct mm_struct *mm)``
			
 
				 
			
 
				 	This interface flushes an entire user address space from
			
 
				 	the caches.  That is, after running, there will be no cache
			
@@ -152,7 +153,7 @@ Here are the routines, one by one:
 
				 	This interface is used to handle whole address space
			
 
				 	page table operations such as what happens during exit and exec.
			
 
				 
			
 
				-2) void flush_cache_dup_mm(struct mm_struct *mm)
			
 
				+2) ``void flush_cache_dup_mm(struct mm_struct *mm)``
			
 
				 
			
 
				 	This interface flushes an entire user address space from
			
 
				 	the caches.  That is, after running, there will be no cache
			
@@ -164,8 +165,8 @@ Here are the routines, one by one:
 
				 	This option is separate from flush_cache_mm to allow some
			
 
				 	optimizations for VIPT caches.
			
 
				 
			
 
				-3) void flush_cache_range(struct vm_area_struct *vma,
			
 
				-			  unsigned long start, unsigned long end)
			
 
				+3) ``void flush_cache_range(struct vm_area_struct *vma,
			
 
				+   unsigned long start, unsigned long end)``
			
 
				 
			
 
				 	Here we are flushing a specific range of (user) virtual
			
 
				 	addresses from the cache.  After running, there will be no
			
@@ -181,7 +182,7 @@ Here are the routines, one by one:
 
				 	call flush_cache_page (see below) for each entry which may be
			
 
				 	modified.
			
 
				 
			
 
				-4) void flush_cache_page(struct vm_area_struct *vma, unsigned long addr, unsigned long pfn)
			
 
				+4) ``void flush_cache_page(struct vm_area_struct *vma, unsigned long addr, unsigned long pfn)``
			
 
				 
			
 
				 	This time we need to remove a PAGE_SIZE sized range
			
 
				 	from the cache.  The 'vma' is the backing structure used by
			
@@ -202,7 +203,7 @@ Here are the routines, one by one:
 
				 
			
 
				 	This is used primarily during fault processing.
			
 
				 
			
 
				-5) void flush_cache_kmaps(void)
			
 
				+5) ``void flush_cache_kmaps(void)``
			
 
				 
			
 
				 	This routine need only be implemented if the platform utilizes
			
 
				 	highmem.  It will be called right before all of the kmaps
			
@@ -214,8 +215,8 @@ Here are the routines, one by one:
 
				 
			
 
				 	This routing should be implemented in asm/highmem.h
			
 
				 
			
 
				-6) void flush_cache_vmap(unsigned long start, unsigned long end)
			
 
				-   void flush_cache_vunmap(unsigned long start, unsigned long end)
			
 
				+6) ``void flush_cache_vmap(unsigned long start, unsigned long end)``
			
 
				+   ``void flush_cache_vunmap(unsigned long start, unsigned long end)``
			
 
				 
			
 
				 	Here in these two interfaces we are flushing a specific range
			
 
				 	of (kernel) virtual addresses from the cache.  After running,
			
@@ -243,8 +244,10 @@ size).  This setting will force the SYSv IPC layer to only allow user
 
				 processes to mmap shared memory at address which are a multiple of
			
 
				 this value.
			
 
				 
			
 
				-NOTE: This does not fix shared mmaps, check out the sparc64 port for
			
 
				-one way to solve this (in particular SPARC_FLAG_MMAPSHARED).
			
 
				+.. note::
			
 
				+
			
 
				+  This does not fix shared mmaps, check out the sparc64 port for
			
 
				+  one way to solve this (in particular SPARC_FLAG_MMAPSHARED).
			
 
				 
			
 
				 Next, you have to solve the D-cache aliasing issue for all
			
 
				 other cases.  Please keep in mind that fact that, for a given page
			
@@ -255,8 +258,8 @@ physical page into its address space, by implication the D-cache
 
				 aliasing problem has the potential to exist since the kernel already
			
 
				 maps this page at its virtual address.
			
 
				 
			
 
				-  void copy_user_page(void *to, void *from, unsigned long addr, struct page *page)
			
 
				-  void clear_user_page(void *to, unsigned long addr, struct page *page)
			
 
				+  ``void copy_user_page(void *to, void *from, unsigned long addr, struct page *page)``
			
 
				+  ``void clear_user_page(void *to, unsigned long addr, struct page *page)``
			
 
				 
			
 
				 	These two routines store data in user anonymous or COW
			
 
				 	pages.  It allows a port to efficiently avoid D-cache alias
			
@@ -276,14 +279,16 @@ maps this page at its virtual address.
 
				 	If D-cache aliasing is not an issue, these two routines may
			
 
				 	simply call memcpy/memset directly and do nothing more.
			
 
				 
			
 
				-  void flush_dcache_page(struct page *page)
			
 
				+  ``void flush_dcache_page(struct page *page)``
			
 
				 
			
 
				 	Any time the kernel writes to a page cache page, _OR_
			
 
				 	the kernel is about to read from a page cache page and
			
 
				 	user space shared/writable mappings of this page potentially
			
 
				 	exist, this routine is called.
			
 
				 
			
 
				-	NOTE: This routine need only be called for page cache pages
			
 
				+	.. note::
			
 
				+
			
 
				+	      This routine need only be called for page cache pages
			
 
				 	      which can potentially ever be mapped into the address
			
 
				 	      space of a user process.  So for example, VFS layer code
			
 
				 	      handling vfs symlinks in the page cache need not call
			
@@ -322,18 +327,19 @@ maps this page at its virtual address.
 
				 	made of this flag bit, and if set the flush is done and the flag
			
 
				 	bit is cleared.
			
 
				 
			
 
				-	IMPORTANT NOTE: It is often important, if you defer the flush,
			
 
				+	.. important::
			
 
				+
			
 
				+			It is often important, if you defer the flush,
			
 
				 			that the actual flush occurs on the same CPU
			
 
				 			as did the cpu stores into the page to make it
			
 
				 			dirty.  Again, see sparc64 for examples of how
			
 
				 			to deal with this.
			
 
				 
			
 
				-  void copy_to_user_page(struct vm_area_struct *vma, struct page *page,
			
 
				-                         unsigned long user_vaddr,
			
 
				-                         void *dst, void *src, int len)
			
 
				-  void copy_from_user_page(struct vm_area_struct *vma, struct page *page,
			
 
				-                           unsigned long user_vaddr,
			
 
				-                           void *dst, void *src, int len)
			
 
				+  ``void copy_to_user_page(struct vm_area_struct *vma, struct page *page,
			
 
				+  unsigned long user_vaddr, void *dst, void *src, int len)``
			
 
				+  ``void copy_from_user_page(struct vm_area_struct *vma, struct page *page,
			
 
				+  unsigned long user_vaddr, void *dst, void *src, int len)``
			
 
				+
			
 
				 	When the kernel needs to copy arbitrary data in and out
			
 
				 	of arbitrary user pages (f.e. for ptrace()) it will use
			
 
				 	these two routines.
			
@@ -344,8 +350,9 @@ maps this page at its virtual address.
 
				 	likely that you will need to flush the instruction cache
			
 
				 	for copy_to_user_page().
			
 
				 
			
 
				-  void flush_anon_page(struct vm_area_struct *vma, struct page *page,
			
 
				-                       unsigned long vmaddr)
			
 
				+  ``void flush_anon_page(struct vm_area_struct *vma, struct page *page,
			
 
				+  unsigned long vmaddr)``
			
 
				+
			
 
				   	When the kernel needs to access the contents of an anonymous
			
 
				 	page, it calls this function (currently only
			
 
				 	get_user_pages()).  Note: flush_dcache_page() deliberately
			
@@ -354,7 +361,8 @@ maps this page at its virtual address.
 
				 	architectures).  For incoherent architectures, it should flush
			
 
				 	the cache of the page at vmaddr.
			
 
				 
			
 
				-  void flush_kernel_dcache_page(struct page *page)
			
 
				+  ``void flush_kernel_dcache_page(struct page *page)``
			
 
				+
			
 
				 	When the kernel needs to modify a user page is has obtained
			
 
				 	with kmap, it calls this function after all modifications are
			
 
				 	complete (but before kunmapping it) to bring the underlying
			
@@ -366,14 +374,16 @@ maps this page at its virtual address.
 
				 	the kernel cache for page (using page_address(page)).
			
 
				 
			
 
				 
			
 
				-  void flush_icache_range(unsigned long start, unsigned long end)
			
 
				+  ``void flush_icache_range(unsigned long start, unsigned long end)``
			
 
				+
			
 
				   	When the kernel stores into addresses that it will execute
			
 
				 	out of (eg when loading modules), this function is called.
			
 
				 
			
 
				 	If the icache does not snoop stores then this routine will need
			
 
				 	to flush it.
			
 
				 
			
 
				-  void flush_icache_page(struct vm_area_struct *vma, struct page *page)
			
 
				+  ``void flush_icache_page(struct vm_area_struct *vma, struct page *page)``
			
 
				+
			
 
				 	All the functionality of flush_icache_page can be implemented in
			
 
				 	flush_dcache_page and update_mmu_cache. In the future, the hope
			
 
				 	is to remove this interface completely.
			
@@ -387,7 +397,8 @@ the kernel trying to do I/O to vmap areas must manually manage
 
				 coherency.  It must do this by flushing the vmap range before doing
			
 
				 I/O and invalidating it after the I/O returns.
			
 
				 
			
 
				-  void flush_kernel_vmap_range(void *vaddr, int size)
			
 
				+  ``void flush_kernel_vmap_range(void *vaddr, int size)``
			
 
				+
			
 
				        flushes the kernel cache for a given virtual address range in
			
 
				        the vmap area.  This is to make sure that any data the kernel
			
 
				        modified in the vmap range is made visible to the physical
			
@@ -395,7 +406,8 @@ I/O and invalidating it after the I/O returns.
 
				        Note that this API does *not* also flush the offset map alias
			
 
				        of the area.
			
 
				 
			
 
				-  void invalidate_kernel_vmap_range(void *vaddr, int size) invalidates
			
 
				+  ``void invalidate_kernel_vmap_range(void *vaddr, int size)`` invalidates
			
 
				+
			
 
				        the cache for a given virtual address range in the vmap area
			
 
				        which prevents the processor from making the cache stale by
			
 
				        speculatively reading data while the I/O was occurring to the
			
--- a/Documentation/cgroup-v1/memory.txt
+++ b/Documentation/cgroup-v1/memory.txt
@@ -789,23 +789,46 @@ way to trigger. Applications should do whatever they can to help the
 
				 system. It might be too late to consult with vmstat or any other
			
 
				 statistics, so it's advisable to take an immediate action.
			
 
				 
			
 
				-The events are propagated upward until the event is handled, i.e. the
			
 
				-events are not pass-through. Here is what this means: for example you have
			
 
				-three cgroups: A->B->C. Now you set up an event listener on cgroups A, B
			
 
				-and C, and suppose group C experiences some pressure. In this situation,
			
 
				-only group C will receive the notification, i.e. groups A and B will not
			
 
				-receive it. This is done to avoid excessive "broadcasting" of messages,
			
 
				-which disturbs the system and which is especially bad if we are low on
			
 
				-memory or thrashing. So, organize the cgroups wisely, or propagate the
			
 
				-events manually (or, ask us to implement the pass-through events,
			
 
				-explaining why would you need them.)
			
 
				+By default, events are propagated upward until the event is handled, i.e. the
			
 
				+events are not pass-through. For example, you have three cgroups: A->B->C. Now
			
 
				+you set up an event listener on cgroups A, B and C, and suppose group C
			
 
				+experiences some pressure. In this situation, only group C will receive the
			
 
				+notification, i.e. groups A and B will not receive it. This is done to avoid
			
 
				+excessive "broadcasting" of messages, which disturbs the system and which is
			
 
				+especially bad if we are low on memory or thrashing. Group B will receive
			
 
				+notification only if there are no event listeners for group C.
			
 
				+
			
 
				+There are three optional modes that specify different propagation behavior:
			
 
				+
			
 
				+ - "default": this is the default behavior specified above. This mode is the
			
 
				+   same as omitting the optional mode parameter, preserved for backwards
			
 
				+   compatibility.
			
 
				+
			
 
				+ - "hierarchy": events always propagate up to the root, similar to the default
			
 
				+   behavior, except that propagation continues regardless of whether there are
			
 
				+   event listeners at each level, with the "hierarchy" mode. In the above
			
 
				+   example, groups A, B, and C will receive notification of memory pressure.
			
 
				+
			
 
				+ - "local": events are pass-through, i.e. they only receive notifications when
			
 
				+   memory pressure is experienced in the memcg for which the notification is
			
 
				+   registered. In the above example, group C will receive notification if
			
 
				+   registered for "local" notification and the group experiences memory
			
 
				+   pressure. However, group B will never receive notification, regardless if
			
 
				+   there is an event listener for group C or not, if group B is registered for
			
 
				+   local notification.
			
 
				+
			
 
				+The level and event notification mode ("hierarchy" or "local", if necessary) are
			
 
				+specified by a comma-delimited string, e.g. "low,hierarchy" specifies
			
 
				+hierarchical, pass-through, notification for all ancestor memcgs. Notification
			
 
				+that is the default, non pass-through behavior, does not specify a mode.
			
 
				+"medium,local" specifies pass-through notification for the medium level.
			
 
				 
			
 
				 The file memory.pressure_level is only used to setup an eventfd. To
			
 
				 register a notification, an application must:
			
 
				 
			
 
				 - create an eventfd using eventfd(2);
			
 
				 - open memory.pressure_level;
			
 
				-- write string like "<event_fd> <fd of memory.pressure_level> <level>"
			
 
				+- write string as "<event_fd> <fd of memory.pressure_level> <level[,mode]>"
			
 
				   to cgroup.event_control.
			
 
				 
			
 
				 Application will be notified through eventfd when memory pressure is at
			
@@ -821,7 +844,7 @@ Test:
 
				    # cd /sys/fs/cgroup/memory/
			
 
				    # mkdir foo
			
 
				    # cd foo
			
 
				-   # cgroup_event_listener memory.pressure_level low &
			
 
				+   # cgroup_event_listener memory.pressure_level low,hierarchy &
			
 
				    # echo 8000000 > memory.limit_in_bytes
			
 
				    # echo 8000000 > memory.memsw.limit_in_bytes
			
 
				    # echo $$ > tasks
			
--- a/Documentation/cgroup-v2.txt
+++ b/Documentation/cgroup-v2.txt
@@ -1,7 +1,9 @@
 
				-
			
 
				+================
			
 
				 Control Group v2
			
 
				+================
			
 
				 
			
 
				-October, 2015		Tejun Heo <tj@kernel.org>
			
 
				+:Date: October, 2015
			
 
				+:Author: Tejun Heo <tj@kernel.org>
			
 
				 
			
 
				 This is the authoritative documentation on the design, interface and
			
 
				 conventions of cgroup v2.  It describes all userland-visible aspects
			
@@ -9,70 +11,72 @@ of cgroup including core and specific controller behaviors.  All
 
				 future changes must be reflected in this document.  Documentation for
			
 
				 v1 is available under Documentation/cgroup-v1/.
			
 
				 
			
 
				-CONTENTS
			
 
				-
			
 
				-1. Introduction
			
 
				-  1-1. Terminology
			
 
				-  1-2. What is cgroup?
			
 
				-2. Basic Operations
			
 
				-  2-1. Mounting
			
 
				-  2-2. Organizing Processes
			
 
				-  2-3. [Un]populated Notification
			
 
				-  2-4. Controlling Controllers
			
 
				-    2-4-1. Enabling and Disabling
			
 
				-    2-4-2. Top-down Constraint
			
 
				-    2-4-3. No Internal Process Constraint
			
 
				-  2-5. Delegation
			
 
				-    2-5-1. Model of Delegation
			
 
				-    2-5-2. Delegation Containment
			
 
				-  2-6. Guidelines
			
 
				-    2-6-1. Organize Once and Control
			
 
				-    2-6-2. Avoid Name Collisions
			
 
				-3. Resource Distribution Models
			
 
				-  3-1. Weights
			
 
				-  3-2. Limits
			
 
				-  3-3. Protections
			
 
				-  3-4. Allocations
			
 
				-4. Interface Files
			
 
				-  4-1. Format
			
 
				-  4-2. Conventions
			
 
				-  4-3. Core Interface Files
			
 
				-5. Controllers
			
 
				-  5-1. CPU
			
 
				-    5-1-1. CPU Interface Files
			
 
				-  5-2. Memory
			
 
				-    5-2-1. Memory Interface Files
			
 
				-    5-2-2. Usage Guidelines
			
 
				-    5-2-3. Memory Ownership
			
 
				-  5-3. IO
			
 
				-    5-3-1. IO Interface Files
			
 
				-    5-3-2. Writeback
			
 
				-  5-4. PID
			
 
				-    5-4-1. PID Interface Files
			
 
				-  5-5. RDMA
			
 
				-    5-5-1. RDMA Interface Files
			
 
				-  5-6. Misc
			
 
				-    5-6-1. perf_event
			
 
				-6. Namespace
			
 
				-  6-1. Basics
			
 
				-  6-2. The Root and Views
			
 
				-  6-3. Migration and setns(2)
			
 
				-  6-4. Interaction with Other Namespaces
			
 
				-P. Information on Kernel Programming
			
 
				-  P-1. Filesystem Support for Writeback
			
 
				-D. Deprecated v1 Core Features
			
 
				-R. Issues with v1 and Rationales for v2
			
 
				-  R-1. Multiple Hierarchies
			
 
				-  R-2. Thread Granularity
			
 
				-  R-3. Competition Between Inner Nodes and Threads
			
 
				-  R-4. Other Interface Issues
			
 
				-  R-5. Controller Issues and Remedies
			
 
				-    R-5-1. Memory
			
 
				-
			
 
				-
			
 
				-1. Introduction
			
 
				-
			
 
				-1-1. Terminology
			
 
				+.. CONTENTS
			
 
				+
			
 
				+   1. Introduction
			
 
				+     1-1. Terminology
			
 
				+     1-2. What is cgroup?
			
 
				+   2. Basic Operations
			
 
				+     2-1. Mounting
			
 
				+     2-2. Organizing Processes
			
 
				+     2-3. [Un]populated Notification
			
 
				+     2-4. Controlling Controllers
			
 
				+       2-4-1. Enabling and Disabling
			
 
				+       2-4-2. Top-down Constraint
			
 
				+       2-4-3. No Internal Process Constraint
			
 
				+     2-5. Delegation
			
 
				+       2-5-1. Model of Delegation
			
 
				+       2-5-2. Delegation Containment
			
 
				+     2-6. Guidelines
			
 
				+       2-6-1. Organize Once and Control
			
 
				+       2-6-2. Avoid Name Collisions
			
 
				+   3. Resource Distribution Models
			
 
				+     3-1. Weights
			
 
				+     3-2. Limits
			
 
				+     3-3. Protections
			
 
				+     3-4. Allocations
			
 
				+   4. Interface Files
			
 
				+     4-1. Format
			
 
				+     4-2. Conventions
			
 
				+     4-3. Core Interface Files
			
 
				+   5. Controllers
			
 
				+     5-1. CPU
			
 
				+       5-1-1. CPU Interface Files
			
 
				+     5-2. Memory
			
 
				+       5-2-1. Memory Interface Files
			
 
				+       5-2-2. Usage Guidelines
			
 
				+       5-2-3. Memory Ownership
			
 
				+     5-3. IO
			
 
				+       5-3-1. IO Interface Files
			
 
				+       5-3-2. Writeback
			
 
				+     5-4. PID
			
 
				+       5-4-1. PID Interface Files
			
 
				+     5-5. RDMA
			
 
				+       5-5-1. RDMA Interface Files
			
 
				+     5-6. Misc
			
 
				+       5-6-1. perf_event
			
 
				+   6. Namespace
			
 
				+     6-1. Basics
			
 
				+     6-2. The Root and Views
			
 
				+     6-3. Migration and setns(2)
			
 
				+     6-4. Interaction with Other Namespaces
			
 
				+   P. Information on Kernel Programming
			
 
				+     P-1. Filesystem Support for Writeback
			
 
				+   D. Deprecated v1 Core Features
			
 
				+   R. Issues with v1 and Rationales for v2
			
 
				+     R-1. Multiple Hierarchies
			
 
				+     R-2. Thread Granularity
			
 
				+     R-3. Competition Between Inner Nodes and Threads
			
 
				+     R-4. Other Interface Issues
			
 
				+     R-5. Controller Issues and Remedies
			
 
				+       R-5-1. Memory
			
 
				+
			
 
				+
			
 
				+Introduction
			
 
				+============
			
 
				+
			
 
				+Terminology
			
 
				+-----------
			
 
				 
			
 
				 "cgroup" stands for "control group" and is never capitalized.  The
			
 
				 singular form is used to designate the whole feature and also as a
			
@@ -80,7 +84,8 @@ qualifier as in "cgroup controllers".  When explicitly referring to
 
				 multiple individual control groups, the plural form "cgroups" is used.
			
 
				 
			
 
				 
			
 
				-1-2. What is cgroup?
			
 
				+What is cgroup?
			
 
				+---------------
			
 
				 
			
 
				 cgroup is a mechanism to organize processes hierarchically and
			
 
				 distribute system resources along the hierarchy in a controlled and
			
@@ -110,12 +115,14 @@ restrictions set closer to the root in the hierarchy can not be
 
				 overridden from further away.
			
 
				 
			
 
				 
			
 
				-2. Basic Operations
			
 
				+Basic Operations
			
 
				+================
			
 
				 
			
 
				-2-1. Mounting
			
 
				+Mounting
			
 
				+--------
			
 
				 
			
 
				 Unlike v1, cgroup v2 has only single hierarchy.  The cgroup v2
			
 
				-hierarchy can be mounted with the following mount command.
			
 
				+hierarchy can be mounted with the following mount command::
			
 
				 
			
 
				   # mount -t cgroup2 none $MOUNT_POINT
			
 
				 
			
@@ -149,11 +156,22 @@ during boot, before manual intervention is possible. To make testing
 
				 and experimenting easier, the kernel parameter cgroup_no_v1= allows
			
 
				 disabling controllers in v1 and make them always available in v2.
			
 
				 
			
 
				+cgroup v2 currently supports the following mount options.
			
 
				+
			
 
				+  nsdelegate
			
 
				+
			
 
				+	Consider cgroup namespaces as delegation boundaries.  This
			
 
				+	option is system wide and can only be set on mount or modified
			
 
				+	through remount from the init namespace.  The mount option is
			
 
				+	ignored on non-init namespace mounts.  Please refer to the
			
 
				+	Delegation section for details.
			
 
				 
			
 
				-2-2. Organizing Processes
			
 
				+
			
 
				+Organizing Processes
			
 
				+--------------------
			
 
				 
			
 
				 Initially, only the root cgroup exists to which all processes belong.
			
 
				-A child cgroup can be created by creating a sub-directory.
			
 
				+A child cgroup can be created by creating a sub-directory::
			
 
				 
			
 
				   # mkdir $CGROUP_NAME
			
 
				 
			
@@ -180,28 +198,29 @@ moved to another cgroup.
 
				 A cgroup which doesn't have any children or live processes can be
			
 
				 destroyed by removing the directory.  Note that a cgroup which doesn't
			
 
				 have any children and is associated only with zombie processes is
			
 
				-considered empty and can be removed.
			
 
				+considered empty and can be removed::
			
 
				 
			
 
				   # rmdir $CGROUP_NAME
			
 
				 
			
 
				 "/proc/$PID/cgroup" lists a process's cgroup membership.  If legacy
			
 
				 cgroup is in use in the system, this file may contain multiple lines,
			
 
				 one for each hierarchy.  The entry for cgroup v2 is always in the
			
 
				-format "0::$PATH".
			
 
				+format "0::$PATH"::
			
 
				 
			
 
				   # cat /proc/842/cgroup
			
 
				   ...
			
 
				   0::/test-cgroup/test-cgroup-nested
			
 
				 
			
 
				 If the process becomes a zombie and the cgroup it was associated with
			
 
				-is removed subsequently, " (deleted)" is appended to the path.
			
 
				+is removed subsequently, " (deleted)" is appended to the path::
			
 
				 
			
 
				   # cat /proc/842/cgroup
			
 
				   ...
			
 
				   0::/test-cgroup/test-cgroup-nested (deleted)
			
 
				 
			
 
				 
			
 
				-2-3. [Un]populated Notification
			
 
				+[Un]populated Notification
			
 
				+--------------------------
			
 
				 
			
 
				 Each non-root cgroup has a "cgroup.events" file which contains
			
 
				 "populated" field indicating whether the cgroup's sub-hierarchy has
			
@@ -212,7 +231,7 @@ example, to start a clean-up operation after all processes of a given
 
				 sub-hierarchy have exited.  The populated state updates and
			
 
				 notifications are recursive.  Consider the following sub-hierarchy
			
 
				 where the numbers in the parentheses represent the numbers of processes
			
 
				-in each cgroup.
			
 
				+in each cgroup::
			
 
				 
			
 
				   A(4) - B(0) - C(1)
			
 
				               \ D(0)
			
@@ -223,18 +242,20 @@ file modified events will be generated on the "cgroup.events" files of
 
				 both cgroups.
			
 
				 
			
 
				 
			
 
				-2-4. Controlling Controllers
			
 
				+Controlling Controllers
			
 
				+-----------------------
			
 
				 
			
 
				-2-4-1. Enabling and Disabling
			
 
				+Enabling and Disabling
			
 
				+~~~~~~~~~~~~~~~~~~~~~~
			
 
				 
			
 
				 Each cgroup has a "cgroup.controllers" file which lists all
			
 
				-controllers available for the cgroup to enable.
			
 
				+controllers available for the cgroup to enable::
			
 
				 
			
 
				   # cat cgroup.controllers
			
 
				   cpu io memory
			
 
				 
			
 
				 No controller is enabled by default.  Controllers can be enabled and
			
 
				-disabled by writing to the "cgroup.subtree_control" file.
			
 
				+disabled by writing to the "cgroup.subtree_control" file::
			
 
				 
			
 
				   # echo "+cpu +memory -io" > cgroup.subtree_control
			
 
				 
			
@@ -246,7 +267,7 @@ are specified, the last one is effective.
 
				 Enabling a controller in a cgroup indicates that the distribution of
			
 
				 the target resource across its immediate children will be controlled.
			
 
				 Consider the following sub-hierarchy.  The enabled controllers are
			
 
				-listed in parentheses.
			
 
				+listed in parentheses::
			
 
				 
			
 
				   A(cpu,memory) - B(memory) - C()
			
 
				                             \ D()
			
@@ -266,7 +287,8 @@ controller interface files - anything which doesn't start with
 
				 "cgroup." are owned by the parent rather than the cgroup itself.
			
 
				 
			
 
				 
			
 
				-2-4-2. Top-down Constraint
			
 
				+Top-down Constraint
			
 
				+~~~~~~~~~~~~~~~~~~~
			
 
				 
			
 
				 Resources are distributed top-down and a cgroup can further distribute
			
 
				 a resource only if the resource has been distributed to it from the
			
@@ -277,7 +299,8 @@ the parent has the controller enabled and a controller can't be
 
				 disabled if one or more children have it enabled.
			
 
				 
			
 
				 
			
 
				-2-4-3. No Internal Process Constraint
			
 
				+No Internal Process Constraint
			
 
				+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
			
 
				 
			
 
				 Non-root cgroups can only distribute resources to their children when
			
 
				 they don't have any processes of their own.  In other words, only
			
@@ -304,35 +327,49 @@ children before enabling controllers in its "cgroup.subtree_control"
 
				 file.
			
 
				 
			
 
				 
			
 
				-2-5. Delegation
			
 
				+Delegation
			
 
				+----------
			
 
				+
			
 
				+Model of Delegation
			
 
				+~~~~~~~~~~~~~~~~~~~
			
 
				 
			
 
				-2-5-1. Model of Delegation
			
 
				+A cgroup can be delegated in two ways.  First, to a less privileged
			
 
				+user by granting write access of the directory and its "cgroup.procs"
			
 
				+and "cgroup.subtree_control" files to the user.  Second, if the
			
 
				+"nsdelegate" mount option is set, automatically to a cgroup namespace
			
 
				+on namespace creation.
			
 
				 
			
 
				-A cgroup can be delegated to a less privileged user by granting write
			
 
				-access of the directory and its "cgroup.procs" file to the user.  Note
			
 
				-that resource control interface files in a given directory control the
			
 
				-distribution of the parent's resources and thus must not be delegated
			
 
				-along with the directory.
			
 
				+Because the resource control interface files in a given directory
			
 
				+control the distribution of the parent's resources, the delegatee
			
 
				+shouldn't be allowed to write to them.  For the first method, this is
			
 
				+achieved by not granting access to these files.  For the second, the
			
 
				+kernel rejects writes to all files other than "cgroup.procs" and
			
 
				+"cgroup.subtree_control" on a namespace root from inside the
			
 
				+namespace.
			
 
				 
			
 
				-Once delegated, the user can build sub-hierarchy under the directory,
			
 
				-organize processes as it sees fit and further distribute the resources
			
 
				-it received from the parent.  The limits and other settings of all
			
 
				-resource controllers are hierarchical and regardless of what happens
			
 
				-in the delegated sub-hierarchy, nothing can escape the resource
			
 
				-restrictions imposed by the parent.
			
 
				+The end results are equivalent for both delegation types.  Once
			
 
				+delegated, the user can build sub-hierarchy under the directory,
			
 
				+organize processes inside it as it sees fit and further distribute the
			
 
				+resources it received from the parent.  The limits and other settings
			
 
				+of all resource controllers are hierarchical and regardless of what
			
 
				+happens in the delegated sub-hierarchy, nothing can escape the
			
 
				+resource restrictions imposed by the parent.
			
 
				 
			
 
				 Currently, cgroup doesn't impose any restrictions on the number of
			
 
				 cgroups in or nesting depth of a delegated sub-hierarchy; however,
			
 
				 this may be limited explicitly in the future.
			
 
				 
			
 
				 
			
 
				-2-5-2. Delegation Containment
			
 
				+Delegation Containment
			
 
				+~~~~~~~~~~~~~~~~~~~~~~
			
 
				 
			
 
				 A delegated sub-hierarchy is contained in the sense that processes
			
 
				-can't be moved into or out of the sub-hierarchy by the delegatee.  For
			
 
				-a process with a non-root euid to migrate a target process into a
			
 
				-cgroup by writing its PID to the "cgroup.procs" file, the following
			
 
				-conditions must be met.
			
 
				+can't be moved into or out of the sub-hierarchy by the delegatee.
			
 
				+
			
 
				+For delegations to a less privileged user, this is achieved by
			
 
				+requiring the following conditions for a process with a non-root euid
			
 
				+to migrate a target process into a cgroup by writing its PID to the
			
 
				+"cgroup.procs" file.
			
 
				 
			
 
				 - The writer must have write access to the "cgroup.procs" file.
			
 
				 
			
@@ -345,7 +382,7 @@ in from or push out to outside the sub-hierarchy.
 
				 
			
 
				 For an example, let's assume cgroups C0 and C1 have been delegated to
			
 
				 user U0 who created C00, C01 under C0 and C10 under C1 as follows and
			
 
				-all processes under C0 and C1 belong to U0.
			
 
				+all processes under C0 and C1 belong to U0::
			
 
				 
			
 
				   ~~~~~~~~~~~~~ - C0 - C00
			
 
				   ~ cgroup    ~      \ C01
			
@@ -359,10 +396,17 @@ destination cgroup C00 is above the points of delegation and U0 would
 
				 not have write access to its "cgroup.procs" files and thus the write
			
 
				 will be denied with -EACCES.
			
 
				 
			
 
				+For delegations to namespaces, containment is achieved by requiring
			
 
				+that both the source and destination cgroups are reachable from the
			
 
				+namespace of the process which is attempting the migration.  If either
			
 
				+is not reachable, the migration is rejected with -ENOENT.
			
 
				+
			
 
				 
			
 
				-2-6. Guidelines
			
 
				+Guidelines
			
 
				+----------
			
 
				 
			
 
				-2-6-1. Organize Once and Control
			
 
				+Organize Once and Control
			
 
				+~~~~~~~~~~~~~~~~~~~~~~~~~
			
 
				 
			
 
				 Migrating a process across cgroups is a relatively expensive operation
			
 
				 and stateful resources such as memory are not moved together with the
			
@@ -378,7 +422,8 @@ distribution can be made by changing controller configuration through
 
				 the interface files.
			
 
				 
			
 
				 
			
 
				-2-6-2. Avoid Name Collisions
			
 
				+Avoid Name Collisions
			
 
				+~~~~~~~~~~~~~~~~~~~~~
			
 
				 
			
 
				 Interface files for a cgroup and its children cgroups occupy the same
			
 
				 directory and it is possible to create children cgroups which collide
			
@@ -396,14 +441,16 @@ cgroup doesn't do anything to prevent name collisions and it's the
 
				 user's responsibility to avoid them.
			
 
				 
			
 
				 
			
 
				-3. Resource Distribution Models
			
 
				+Resource Distribution Models
			
 
				+============================
			
 
				 
			
 
				 cgroup controllers implement several resource distribution schemes
			
 
				 depending on the resource type and expected use cases.  This section
			
 
				 describes major schemes in use along with their expected behaviors.
			
 
				 
			
 
				 
			
 
				-3-1. Weights
			
 
				+Weights
			
 
				+-------
			
 
				 
			
 
				 A parent's resource is distributed by adding up the weights of all
			
 
				 active children and giving each the fraction matching the ratio of its
			
@@ -424,7 +471,8 @@ process migrations.
 
				 and is an example of this type.
			
 
				 
			
 
				 
			
 
				-3-2. Limits
			
 
				+Limits
			
 
				+------
			
 
				 
			
 
				 A child can only consume upto the configured amount of the resource.
			
 
				 Limits can be over-committed - the sum of the limits of children can
			
@@ -440,7 +488,8 @@ process migrations.
 
				 on an IO device and is an example of this type.
			
 
				 
			
 
				 
			
 
				-3-3. Protections
			
 
				+Protections
			
 
				+-----------
			
 
				 
			
 
				 A cgroup is protected to be allocated upto the configured amount of
			
 
				 the resource if the usages of all its ancestors are under their
			
@@ -460,7 +509,8 @@ process migrations.
 
				 example of this type.
			
 
				 
			
 
				 
			
 
				-3-4. Allocations
			
 
				+Allocations
			
 
				+-----------
			
 
				 
			
 
				 A cgroup is exclusively allocated a certain amount of a finite
			
 
				 resource.  Allocations can't be over-committed - the sum of the
			
@@ -479,12 +529,14 @@ may be rejected.
 
				 type.
			
 
				 
			
 
				 
			
 
				-4. Interface Files
			
 
				+Interface Files
			
 
				+===============
			
 
				 
			
 
				-4-1. Format
			
 
				+Format
			
 
				+------
			
 
				 
			
 
				 All interface files should be in one of the following formats whenever
			
 
				-possible.
			
 
				+possible::
			
 
				 
			
 
				   New-line separated values
			
 
				   (when only one value can be written at once)
			
@@ -519,7 +571,8 @@ can be written at a time.  For nested keyed files, the sub key pairs
 
				 may be specified in any order and not all pairs have to be specified.
			
 
				 
			
 
				 
			
 
				-4-2. Conventions
			
 
				+Conventions
			
 
				+-----------
			
 
				 
			
 
				 - Settings for a single feature should be contained in a single file.
			
 
				 
			
@@ -555,25 +608,25 @@ may be specified in any order and not all pairs have to be specified.
 
				   with "default" as the value must not appear when read.
			
 
				 
			
 
				   For example, a setting which is keyed by major:minor device numbers
			
 
				-  with integer values may look like the following.
			
 
				+  with integer values may look like the following::
			
 
				 
			
 
				     # cat cgroup-example-interface-file
			
 
				     default 150
			
 
				     8:0 300
			
 
				 
			
 
				-  The default value can be updated by
			
 
				+  The default value can be updated by::
			
 
				 
			
 
				     # echo 125 > cgroup-example-interface-file
			
 
				 
			
 
				-  or
			
 
				+  or::
			
 
				 
			
 
				     # echo "default 125" > cgroup-example-interface-file
			
 
				 
			
 
				-  An override can be set by
			
 
				+  An override can be set by::
			
 
				 
			
 
				     # echo "8:16 170" > cgroup-example-interface-file
			
 
				 
			
 
				-  and cleared by
			
 
				+  and cleared by::
			
 
				 
			
 
				     # echo "8:0 default" > cgroup-example-interface-file
			
 
				     # cat cgroup-example-interface-file
			
@@ -586,12 +639,12 @@ may be specified in any order and not all pairs have to be specified.
 
				   generated on the file.
			
 
				 
			
 
				 
			
 
				-4-3. Core Interface Files
			
 
				+Core Interface Files
			
 
				+--------------------
			
 
				 
			
 
				 All cgroup core files are prefixed with "cgroup."
			
 
				 
			
 
				   cgroup.procs
			
 
				-
			
 
				 	A read-write new-line separated values file which exists on
			
 
				 	all cgroups.
			
 
				 
			
@@ -617,7 +670,6 @@ All cgroup core files are prefixed with "cgroup."
 
				 	should be granted along with the containing directory.
			
 
				 
			
 
				   cgroup.controllers
			
 
				-
			
 
				 	A read-only space separated values file which exists on all
			
 
				 	cgroups.
			
 
				 
			
@@ -625,7 +677,6 @@ All cgroup core files are prefixed with "cgroup."
 
				 	the cgroup.  The controllers are not ordered.
			
 
				 
			
 
				   cgroup.subtree_control
			
 
				-
			
 
				 	A read-write space separated values file which exists on all
			
 
				 	cgroups.  Starts out empty.
			
 
				 
			
@@ -641,23 +692,25 @@ All cgroup core files are prefixed with "cgroup."
 
				 	operations are specified, either all succeed or all fail.
			
 
				 
			
 
				   cgroup.events
			
 
				-
			
 
				 	A read-only flat-keyed file which exists on non-root cgroups.
			
 
				 	The following entries are defined.  Unless specified
			
 
				 	otherwise, a value change in this file generates a file
			
 
				 	modified event.
			
 
				 
			
 
				 	  populated
			
 
				-
			
 
				 		1 if the cgroup or its descendants contains any live
			
 
				 		processes; otherwise, 0.
			
 
				 
			
 
				 
			
 
				-5. Controllers
			
 
				+Controllers
			
 
				+===========
			
 
				+
			
 
				+CPU
			
 
				+---
			
 
				 
			
 
				-5-1. CPU
			
 
				+.. note::
			
 
				 
			
 
				-[NOTE: The interface for the cpu controller hasn't been merged yet]
			
 
				+	The interface for the cpu controller hasn't been merged yet
			
 
				 
			
 
				 The "cpu" controllers regulates distribution of CPU cycles.  This
			
 
				 controller implements weight and absolute bandwidth limit models for
			
@@ -665,36 +718,34 @@ normal scheduling policy and absolute bandwidth allocation model for
 
				 realtime scheduling policy.
			
 
				 
			
 
				 
			
 
				-5-1-1. CPU Interface Files
			
 
				+CPU Interface Files
			
 
				+~~~~~~~~~~~~~~~~~~~
			
 
				 
			
 
				 All time durations are in microseconds.
			
 
				 
			
 
				   cpu.stat
			
 
				-
			
 
				 	A read-only flat-keyed file which exists on non-root cgroups.
			
 
				 
			
 
				-	It reports the following six stats.
			
 
				+	It reports the following six stats:
			
 
				 
			
 
				-	  usage_usec
			
 
				-	  user_usec
			
 
				-	  system_usec
			
 
				-	  nr_periods
			
 
				-	  nr_throttled
			
 
				-	  throttled_usec
			
 
				+	- usage_usec
			
 
				+	- user_usec
			
 
				+	- system_usec
			
 
				+	- nr_periods
			
 
				+	- nr_throttled
			
 
				+	- throttled_usec
			
 
				 
			
 
				   cpu.weight
			
 
				-
			
 
				 	A read-write single value file which exists on non-root
			
 
				 	cgroups.  The default is "100".
			
 
				 
			
 
				 	The weight in the range [1, 10000].
			
 
				 
			
 
				   cpu.max
			
 
				-
			
 
				 	A read-write two value file which exists on non-root cgroups.
			
 
				 	The default is "max 100000".
			
 
				 
			
 
				-	The maximum bandwidth limit.  It's in the following format.
			
 
				+	The maximum bandwidth limit.  It's in the following format::
			
 
				 
			
 
				 	  $MAX $PERIOD
			
 
				 
			
@@ -703,9 +754,10 @@ All time durations are in microseconds.
 
				 	one number is written, $MAX is updated.
			
 
				 
			
 
				   cpu.rt.max
			
 
				+	.. note::
			
 
				 
			
 
				-  [NOTE: The semantics of this file is still under discussion and the
			
 
				-   interface hasn't been merged yet]
			
 
				+	   The semantics of this file is still under discussion and the
			
 
				+	   interface hasn't been merged yet
			
 
				 
			
 
				 	A read-write two value file which exists on all cgroups.
			
 
				 	The default is "0 100000".
			
@@ -713,7 +765,7 @@ All time durations are in microseconds.
 
				 	The maximum realtime runtime allocation.  Over-committing
			
 
				 	configurations are disallowed and process migrations are
			
 
				 	rejected if not enough bandwidth is available.  It's in the
			
 
				-	following format.
			
 
				+	following format::
			
 
				 
			
 
				 	  $MAX $PERIOD
			
 
				 
			
@@ -722,7 +774,8 @@ All time durations are in microseconds.
 
				 	updated.
			
 
				 
			
 
				 
			
 
				-5-2. Memory
			
 
				+Memory
			
 
				+------
			
 
				 
			
 
				 The "memory" controller regulates distribution of memory.  Memory is
			
 
				 stateful and implements both limit and protection models.  Due to the
			
@@ -744,14 +797,14 @@ following types of memory usages are tracked.
 
				 The above list may expand in the future for better coverage.
			
 
				 
			
 
				 
			
 
				-5-2-1. Memory Interface Files
			
 
				+Memory Interface Files
			
 
				+~~~~~~~~~~~~~~~~~~~~~~
			
 
				 
			
 
				 All memory amounts are in bytes.  If a value which is not aligned to
			
 
				 PAGE_SIZE is written, the value may be rounded up to the closest
			
 
				 PAGE_SIZE multiple when read back.
			
 
				 
			
 
				   memory.current
			
 
				-
			
 
				 	A read-only single value file which exists on non-root
			
 
				 	cgroups.
			
 
				 
			
@@ -759,7 +812,6 @@ PAGE_SIZE multiple when read back.
 
				 	and its descendants.
			
 
				 
			
 
				   memory.low
			
 
				-
			
 
				 	A read-write single value file which exists on non-root
			
 
				 	cgroups.  The default is "0".
			
 
				 
			
@@ -772,7 +824,6 @@ PAGE_SIZE multiple when read back.
 
				 	protection is discouraged.
			
 
				 
			
 
				   memory.high
			
 
				-
			
 
				 	A read-write single value file which exists on non-root
			
 
				 	cgroups.  The default is "max".
			
 
				 
			
@@ -785,7 +836,6 @@ PAGE_SIZE multiple when read back.
 
				 	under extreme conditions the limit may be breached.
			
 
				 
			
 
				   memory.max
			
 
				-
			
 
				 	A read-write single value file which exists on non-root
			
 
				 	cgroups.  The default is "max".
			
 
				 
			
@@ -800,21 +850,18 @@ PAGE_SIZE multiple when read back.
 
				 	utility is limited to providing the final safety net.
			
 
				 
			
 
				   memory.events
			
 
				-
			
 
				 	A read-only flat-keyed file which exists on non-root cgroups.
			
 
				 	The following entries are defined.  Unless specified
			
 
				 	otherwise, a value change in this file generates a file
			
 
				 	modified event.
			
 
				 
			
 
				 	  low
			
 
				-
			
 
				 		The number of times the cgroup is reclaimed due to
			
 
				 		high memory pressure even though its usage is under
			
 
				 		the low boundary.  This usually indicates that the low
			
 
				 		boundary is over-committed.
			
 
				 
			
 
				 	  high
			
 
				-
			
 
				 		The number of times processes of the cgroup are
			
 
				 		throttled and routed to perform direct memory reclaim
			
 
				 		because the high memory boundary was exceeded.  For a
			
@@ -823,19 +870,27 @@ PAGE_SIZE multiple when read back.
 
				 		occurrences are expected.
			
 
				 
			
 
				 	  max
			
 
				-
			
 
				 		The number of times the cgroup's memory usage was
			
 
				 		about to go over the max boundary.  If direct reclaim
			
 
				-		fails to bring it down, the OOM killer is invoked.
			
 
				+		fails to bring it down, the cgroup goes to OOM state.
			
 
				 
			
 
				 	  oom
			
 
				+		The number of times the cgroup's memory usage
			
 
				+		reached the limit and allocation was about to fail.
			
 
				 
			
 
				-		The number of times the OOM killer has been invoked in
			
 
				-		the cgroup.  This may not exactly match the number of
			
 
				-		processes killed but should generally be close.
			
 
				+		Depending on context result could be invocation of OOM
			
 
				+		killer and retrying allocation or failing allocation.
			
 
				 
			
 
				-  memory.stat
			
 
				+		Failed allocation in its turn could be returned into
			
 
				+		userspace as -ENOMEM or silently ignored in cases like
			
 
				+		disk readahead.  For now OOM in memory cgroup kills
			
 
				+		tasks iff shortage has happened inside page fault.
			
 
				 
			
 
				+	  oom_kill
			
 
				+		The number of processes belonging to this cgroup
			
 
				+		killed by any kind of OOM killer.
			
 
				+
			
 
				+  memory.stat
			
 
				 	A read-only flat-keyed file which exists on non-root cgroups.
			
 
				 
			
 
				 	This breaks down the cgroup's memory footprint into different
			
@@ -849,73 +904,55 @@ PAGE_SIZE multiple when read back.
 
				 	fixed position; use the keys to look up specific values!
			
 
				 
			
 
				 	  anon
			
 
				-
			
 
				 		Amount of memory used in anonymous mappings such as
			
 
				 		brk(), sbrk(), and mmap(MAP_ANONYMOUS)
			
 
				 
			
 
				 	  file
			
 
				-
			
 
				 		Amount of memory used to cache filesystem data,
			
 
				 		including tmpfs and shared memory.
			
 
				 
			
 
				 	  kernel_stack
			
 
				-
			
 
				 		Amount of memory allocated to kernel stacks.
			
 
				 
			
 
				 	  slab
			
 
				-
			
 
				 		Amount of memory used for storing in-kernel data
			
 
				 		structures.
			
 
				 
			
 
				 	  sock
			
 
				-
			
 
				 		Amount of memory used in network transmission buffers
			
 
				 
			
 
				 	  shmem
			
 
				-
			
 
				 		Amount of cached filesystem data that is swap-backed,
			
 
				 		such as tmpfs, shm segments, shared anonymous mmap()s
			
 
				 
			
 
				 	  file_mapped
			
 
				-
			
 
				 		Amount of cached filesystem data mapped with mmap()
			
 
				 
			
 
				 	  file_dirty
			
 
				-
			
 
				 		Amount of cached filesystem data that was modified but
			
 
				 		not yet written back to disk
			
 
				 
			
 
				 	  file_writeback
			
 
				-
			
 
				 		Amount of cached filesystem data that was modified and
			
 
				 		is currently being written back to disk
			
 
				 
			
 
				-	  inactive_anon
			
 
				-	  active_anon
			
 
				-	  inactive_file
			
 
				-	  active_file
			
 
				-	  unevictable
			
 
				-
			
 
				+	  inactive_anon, active_anon, inactive_file, active_file, unevictable
			
 
				 		Amount of memory, swap-backed and filesystem-backed,
			
 
				 		on the internal memory management lists used by the
			
 
				 		page reclaim algorithm
			
 
				 
			
 
				 	  slab_reclaimable
			
 
				-
			
 
				 		Part of "slab" that might be reclaimed, such as
			
 
				 		dentries and inodes.
			
 
				 
			
 
				 	  slab_unreclaimable
			
 
				-
			
 
				 		Part of "slab" that cannot be reclaimed on memory
			
 
				 		pressure.
			
 
				 
			
 
				 	  pgfault
			
 
				-
			
 
				 		Total number of page faults incurred
			
 
				 
			
 
				 	  pgmajfault
			
 
				-
			
 
				 		Number of major page faults incurred
			
 
				 
			
 
				 	  workingset_refault
			
@@ -930,8 +967,35 @@ PAGE_SIZE multiple when read back.
 
				 
			
 
				 		Number of times a shadow node has been reclaimed
			
 
				 
			
 
				-  memory.swap.current
			
 
				+	  pgrefill
			
 
				+
			
 
				+		Amount of scanned pages (in an active LRU list)
			
 
				+
			
 
				+	  pgscan
			
 
				+
			
 
				+		Amount of scanned pages (in an inactive LRU list)
			
 
				+
			
 
				+	  pgsteal
			
 
				+
			
 
				+		Amount of reclaimed pages
			
 
				+
			
 
				+	  pgactivate
			
 
				+
			
 
				+		Amount of pages moved to the active LRU list
			
 
				+
			
 
				+	  pgdeactivate
			
 
				 
			
 
				+		Amount of pages moved to the inactive LRU list
			
 
				+
			
 
				+	  pglazyfree
			
 
				+
			
 
				+		Amount of pages postponed to be freed under memory pressure
			
 
				+
			
 
				+	  pglazyfreed
			
 
				+
			
 
				+		Amount of reclaimed lazyfree pages
			
 
				+
			
 
				+  memory.swap.current
			
 
				 	A read-only single value file which exists on non-root
			
 
				 	cgroups.
			
 
				 
			
@@ -939,7 +1003,6 @@ PAGE_SIZE multiple when read back.
 
				 	and its descendants.
			
 
				 
			
 
				   memory.swap.max
			
 
				-
			
 
				 	A read-write single value file which exists on non-root
			
 
				 	cgroups.  The default is "max".
			
 
				 
			
@@ -947,7 +1010,8 @@ PAGE_SIZE multiple when read back.
 
				 	limit, anonymous memory of the cgroup will not be swapped out.
			
 
				 
			
 
				 
			
 
				-5-2-2. Usage Guidelines
			
 
				+Usage Guidelines
			
 
				+~~~~~~~~~~~~~~~~
			
 
				 
			
 
				 "memory.high" is the main mechanism to control memory usage.
			
 
				 Over-committing on high limit (sum of high limits > available memory)
			
@@ -970,7 +1034,8 @@ memory; unfortunately, memory pressure monitoring mechanism isn't
 
				 implemented yet.
			
 
				 
			
 
				 
			
 
				-5-2-3. Memory Ownership
			
 
				+Memory Ownership
			
 
				+~~~~~~~~~~~~~~~~
			
 
				 
			
 
				 A memory area is charged to the cgroup which instantiated it and stays
			
 
				 charged to the cgroup until the area is released.  Migrating a process
			
@@ -988,7 +1053,8 @@ POSIX_FADV_DONTNEED to relinquish the ownership of memory areas
 
				 belonging to the affected files to ensure correct memory ownership.
			
 
				 
			
 
				 
			
 
				-5-3. IO
			
 
				+IO
			
 
				+--
			
 
				 
			
 
				 The "io" controller regulates the distribution of IO resources.  This
			
 
				 controller implements both weight based and absolute bandwidth or IOPS
			
@@ -997,28 +1063,29 @@ only if cfq-iosched is in use and neither scheme is available for
 
				 blk-mq devices.
			
 
				 
			
 
				 
			
 
				-5-3-1. IO Interface Files
			
 
				+IO Interface Files
			
 
				+~~~~~~~~~~~~~~~~~~
			
 
				 
			
 
				   io.stat
			
 
				-
			
 
				 	A read-only nested-keyed file which exists on non-root
			
 
				 	cgroups.
			
 
				 
			
 
				 	Lines are keyed by $MAJ:$MIN device numbers and not ordered.
			
 
				 	The following nested keys are defined.
			
 
				 
			
 
				+	  ======	===================
			
 
				 	  rbytes	Bytes read
			
 
				 	  wbytes	Bytes written
			
 
				 	  rios		Number of read IOs
			
 
				 	  wios		Number of write IOs
			
 
				+	  ======	===================
			
 
				 
			
 
				-	An example read output follows.
			
 
				+	An example read output follows::
			
 
				 
			
 
				 	  8:16 rbytes=1459200 wbytes=314773504 rios=192 wios=353
			
 
				 	  8:0 rbytes=90430464 wbytes=299008000 rios=8950 wios=1252
			
 
				 
			
 
				   io.weight
			
 
				-
			
 
				 	A read-write flat-keyed file which exists on non-root cgroups.
			
 
				 	The default is "default 100".
			
 
				 
			
@@ -1032,14 +1099,13 @@ blk-mq devices.
 
				 	$WEIGHT" or simply "$WEIGHT".  Overrides can be set by writing
			
 
				 	"$MAJ:$MIN $WEIGHT" and unset by writing "$MAJ:$MIN default".
			
 
				 
			
 
				-	An example read output follows.
			
 
				+	An example read output follows::
			
 
				 
			
 
				 	  default 100
			
 
				 	  8:16 200
			
 
				 	  8:0 50
			
 
				 
			
 
				   io.max
			
 
				-
			
 
				 	A read-write nested-keyed file which exists on non-root
			
 
				 	cgroups.
			
 
				 
			
@@ -1047,10 +1113,12 @@ blk-mq devices.
 
				 	device numbers and not ordered.  The following nested keys are
			
 
				 	defined.
			
 
				 
			
 
				+	  =====		==================================
			
 
				 	  rbps		Max read bytes per second
			
 
				 	  wbps		Max write bytes per second
			
 
				 	  riops		Max read IO operations per second
			
 
				 	  wiops		Max write IO operations per second
			
 
				+	  =====		==================================
			
 
				 
			
 
				 	When writing, any number of nested key-value pairs can be
			
 
				 	specified in any order.  "max" can be specified as the value
			
@@ -1060,24 +1128,25 @@ blk-mq devices.
 
				 	BPS and IOPS are measured in each IO direction and IOs are
			
 
				 	delayed if limit is reached.  Temporary bursts are allowed.
			
 
				 
			
 
				-	Setting read limit at 2M BPS and write at 120 IOPS for 8:16.
			
 
				+	Setting read limit at 2M BPS and write at 120 IOPS for 8:16::
			
 
				 
			
 
				 	  echo "8:16 rbps=2097152 wiops=120" > io.max
			
 
				 
			
 
				-	Reading returns the following.
			
 
				+	Reading returns the following::
			
 
				 
			
 
				 	  8:16 rbps=2097152 wbps=max riops=max wiops=120
			
 
				 
			
 
				-	Write IOPS limit can be removed by writing the following.
			
 
				+	Write IOPS limit can be removed by writing the following::
			
 
				 
			
 
				 	  echo "8:16 wiops=max" > io.max
			
 
				 
			
 
				-	Reading now returns the following.
			
 
				+	Reading now returns the following::
			
 
				 
			
 
				 	  8:16 rbps=2097152 wbps=max riops=max wiops=max
			
 
				 
			
 
				 
			
 
				-5-3-2. Writeback
			
 
				+Writeback
			
 
				+~~~~~~~~~
			
 
				 
			
 
				 Page cache is dirtied through buffered writes and shared mmaps and
			
 
				 written asynchronously to the backing filesystem by the writeback
			
@@ -1125,22 +1194,19 @@ patterns.
 
				 The sysctl knobs which affect writeback behavior are applied to cgroup
			
 
				 writeback as follows.
			
 
				 
			
 
				-  vm.dirty_background_ratio
			
 
				-  vm.dirty_ratio
			
 
				-
			
 
				+  vm.dirty_background_ratio, vm.dirty_ratio
			
 
				 	These ratios apply the same to cgroup writeback with the
			
 
				 	amount of available memory capped by limits imposed by the
			
 
				 	memory controller and system-wide clean memory.
			
 
				 
			
 
				-  vm.dirty_background_bytes
			
 
				-  vm.dirty_bytes
			
 
				-
			
 
				+  vm.dirty_background_bytes, vm.dirty_bytes
			
 
				 	For cgroup writeback, this is calculated into ratio against
			
 
				 	total available memory and applied the same way as
			
 
				 	vm.dirty[_background]_ratio.
			
 
				 
			
 
				 
			
 
				-5-4. PID
			
 
				+PID
			
 
				+---
			
 
				 
			
 
				 The process number controller is used to allow a cgroup to stop any
			
 
				 new tasks from being fork()'d or clone()'d after a specified limit is
			
@@ -1155,17 +1221,16 @@ Note that PIDs used in this controller refer to TIDs, process IDs as
 
				 used by the kernel.
			
 
				 
			
 
				 
			
 
				-5-4-1. PID Interface Files
			
 
				+PID Interface Files
			
 
				+~~~~~~~~~~~~~~~~~~~
			
 
				 
			
 
				   pids.max
			
 
				-
			
 
				 	A read-write single value file which exists on non-root
			
 
				 	cgroups.  The default is "max".
			
 
				 
			
 
				 	Hard limit of number of processes.
			
 
				 
			
 
				   pids.current
			
 
				-
			
 
				 	A read-only single value file which exists on all cgroups.
			
 
				 
			
 
				 	The number of processes currently in the cgroup and its
			
@@ -1180,12 +1245,14 @@ through fork() or clone(). These will return -EAGAIN if the creation
 
				 of a new process would cause a cgroup policy to be violated.
			
 
				 
			
 
				 
			
 
				-5-5. RDMA
			
 
				+RDMA
			
 
				+----
			
 
				 
			
 
				 The "rdma" controller regulates the distribution and accounting of
			
 
				 RDMA resources.
			
 
				 
			
 
				-5-5-1. RDMA Interface Files
			
 
				+RDMA Interface Files
			
 
				+~~~~~~~~~~~~~~~~~~~~
			
 
				 
			
 
				   rdma.max
			
 
				 	A read-write nested-keyed file that exists for all the cgroups
			
@@ -1198,10 +1265,12 @@ of RDMA resources.
 
				 
			
 
				 	The following nested keys are defined.
			
 
				 
			
 
				+	  ==========	=============================
			
 
				 	  hca_handle	Maximum number of HCA Handles
			
 
				 	  hca_object 	Maximum number of HCA Objects
			
 
				+	  ==========	=============================
			
 
				 
			
 
				-	An example for mlx4 and ocrdma device follows.
			
 
				+	An example for mlx4 and ocrdma device follows::
			
 
				 
			
 
				 	  mlx4_0 hca_handle=2 hca_object=2000
			
 
				 	  ocrdma1 hca_handle=3 hca_object=max
			
@@ -1210,15 +1279,17 @@ of RDMA resources.
 
				 	A read-only file that describes current resource usage.
			
 
				 	It exists for all the cgroups except root.
			
 
				 
			
 
				-	An example for mlx4 and ocrdma device follows.
			
 
				+	An example for mlx4 and ocrdma device follows::
			
 
				 
			
 
				 	  mlx4_0 hca_handle=1 hca_object=20
			
 
				 	  ocrdma1 hca_handle=1 hca_object=23
			
 
				 
			
 
				 
			
 
				-5-6. Misc
			
 
				+Misc
			
 
				+----
			
 
				 
			
 
				-5-6-1. perf_event
			
 
				+perf_event
			
 
				+~~~~~~~~~~
			
 
				 
			
 
				 perf_event controller, if not mounted on a legacy hierarchy, is
			
 
				 automatically enabled on the v2 hierarchy so that perf events can
			
@@ -1226,9 +1297,11 @@ always be filtered by cgroup v2 path.  The controller can still be
 
				 moved to a legacy hierarchy after v2 hierarchy is populated.
			
 
				 
			
 
				 
			
 
				-6. Namespace
			
 
				+Namespace
			
 
				+=========
			
 
				 
			
 
				-6-1. Basics
			
 
				+Basics
			
 
				+------
			
 
				 
			
 
				 cgroup namespace provides a mechanism to virtualize the view of the
			
 
				 "/proc/$PID/cgroup" file and cgroup mounts.  The CLONE_NEWCGROUP clone
			
@@ -1242,7 +1315,7 @@ Without cgroup namespace, the "/proc/$PID/cgroup" file shows the
 
				 complete path of the cgroup of a process.  In a container setup where
			
 
				 a set of cgroups and namespaces are intended to isolate processes the
			
 
				 "/proc/$PID/cgroup" file may leak potential system level information
			
 
				-to the isolated processes.  For Example:
			
 
				+to the isolated processes.  For Example::
			
 
				 
			
 
				   # cat /proc/self/cgroup
			
 
				   0::/batchjobs/container_id1
			
@@ -1250,14 +1323,14 @@ to the isolated processes.  For Example:
 
				 The path '/batchjobs/container_id1' can be considered as system-data
			
 
				 and undesirable to expose to the isolated processes.  cgroup namespace
			
 
				 can be used to restrict visibility of this path.  For example, before
			
 
				-creating a cgroup namespace, one would see:
			
 
				+creating a cgroup namespace, one would see::
			
 
				 
			
 
				   # ls -l /proc/self/ns/cgroup
			
 
				   lrwxrwxrwx 1 root root 0 2014-07-15 10:37 /proc/self/ns/cgroup -> cgroup:[4026531835]
			
 
				   # cat /proc/self/cgroup
			
 
				   0::/batchjobs/container_id1
			
 
				 
			
 
				-After unsharing a new namespace, the view changes.
			
 
				+After unsharing a new namespace, the view changes::
			
 
				 
			
 
				   # ls -l /proc/self/ns/cgroup
			
 
				   lrwxrwxrwx 1 root root 0 2014-07-15 10:35 /proc/self/ns/cgroup -> cgroup:[4026532183]
			
@@ -1275,7 +1348,8 @@ namespace is destroyed.  The cgroupns root and the actual cgroups
 
				 remain.
			
 
				 
			
 
				 
			
 
				-6-2. The Root and Views
			
 
				+The Root and Views
			
 
				+------------------
			
 
				 
			
 
				 The 'cgroupns root' for a cgroup namespace is the cgroup in which the
			
 
				 process calling unshare(2) is running.  For example, if a process in
			
@@ -1284,7 +1358,7 @@ process calling unshare(2) is running.  For example, if a process in
 
				 init_cgroup_ns, this is the real root ('/') cgroup.
			
 
				 
			
 
				 The cgroupns root cgroup does not change even if the namespace creator
			
 
				-process later moves to a different cgroup.
			
 
				+process later moves to a different cgroup::
			
 
				 
			
 
				   # ~/unshare -c # unshare cgroupns in some cgroup
			
 
				   # cat /proc/self/cgroup
			
@@ -1298,7 +1372,7 @@ Each process gets its namespace-specific view of "/proc/$PID/cgroup"
 
				 
			
 
				 Processes running inside the cgroup namespace will be able to see
			
 
				 cgroup paths (in /proc/self/cgroup) only inside their root cgroup.
			
 
				-From within an unshared cgroupns:
			
 
				+From within an unshared cgroupns::
			
 
				 
			
 
				   # sleep 100000 &
			
 
				   [1] 7353
			
@@ -1307,7 +1381,7 @@ From within an unshared cgroupns:
 
				   0::/sub_cgrp_1
			
 
				 
			
 
				 From the initial cgroup namespace, the real cgroup path will be
			
 
				-visible:
			
 
				+visible::
			
 
				 
			
 
				   $ cat /proc/7353/cgroup
			
 
				   0::/batchjobs/container_id1/sub_cgrp_1
			
@@ -1315,7 +1389,7 @@ visible:
 
				 From a sibling cgroup namespace (that is, a namespace rooted at a
			
 
				 different cgroup), the cgroup path relative to its own cgroup
			
 
				 namespace root will be shown.  For instance, if PID 7353's cgroup
			
 
				-namespace root is at '/batchjobs/container_id2', then it will see
			
 
				+namespace root is at '/batchjobs/container_id2', then it will see::
			
 
				 
			
 
				   # cat /proc/7353/cgroup
			
 
				   0::/../container_id2/sub_cgrp_1
			
@@ -1324,13 +1398,14 @@ Note that the relative path always starts with '/' to indicate that
 
				 it's relative to the cgroup namespace root of the caller.
			
 
				 
			
 
				 
			
 
				-6-3. Migration and setns(2)
			
 
				+Migration and setns(2)
			
 
				+----------------------
			
 
				 
			
 
				 Processes inside a cgroup namespace can move into and out of the
			
 
				 namespace root if they have proper access to external cgroups.  For
			
 
				 example, from inside a namespace with cgroupns root at
			
 
				 /batchjobs/container_id1, and assuming that the global hierarchy is
			
 
				-still accessible inside cgroupns:
			
 
				+still accessible inside cgroupns::
			
 
				 
			
 
				   # cat /proc/7353/cgroup
			
 
				   0::/sub_cgrp_1
			
@@ -1352,10 +1427,11 @@ namespace.  It is expected that the someone moves the attaching
 
				 process under the target cgroup namespace root.
			
 
				 
			
 
				 
			
 
				-6-4. Interaction with Other Namespaces
			
 
				+Interaction with Other Namespaces
			
 
				+---------------------------------
			
 
				 
			
 
				 Namespace specific cgroup hierarchy can be mounted by a process
			
 
				-running inside a non-init cgroup namespace.
			
 
				+running inside a non-init cgroup namespace::
			
 
				 
			
 
				   # mount -t cgroup2 none $MOUNT_POINT
			
 
				 
			
@@ -1368,27 +1444,27 @@ the view of cgroup hierarchy by namespace-private cgroupfs mount
 
				 provides a properly isolated cgroup view inside the container.
			
 
				 
			
 
				 
			
 
				-P. Information on Kernel Programming
			
 
				+Information on Kernel Programming
			
 
				+=================================
			
 
				 
			
 
				 This section contains kernel programming information in the areas
			
 
				 where interacting with cgroup is necessary.  cgroup core and
			
 
				 controllers are not covered.
			
 
				 
			
 
				 
			
 
				-P-1. Filesystem Support for Writeback
			
 
				+Filesystem Support for Writeback
			
 
				+--------------------------------
			
 
				 
			
 
				 A filesystem can support cgroup writeback by updating
			
 
				 address_space_operations->writepage[s]() to annotate bio's using the
			
 
				 following two functions.
			
 
				 
			
 
				   wbc_init_bio(@wbc, @bio)
			
 
				-
			
 
				 	Should be called for each bio carrying writeback data and
			
 
				 	associates the bio with the inode's owner cgroup.  Can be
			
 
				 	called anytime between bio allocation and submission.
			
 
				 
			
 
				   wbc_account_io(@wbc, @page, @bytes)
			
 
				-
			
 
				 	Should be called for each data segment being written out.
			
 
				 	While this function doesn't care exactly when it's called
			
 
				 	during the writeback session, it's the easiest and most
			
@@ -1409,11 +1485,12 @@ cases by skipping wbc_init_bio() or using bio_associate_blkcg()
 
				 directly.
			
 
				 
			
 
				 
			
 
				-D. Deprecated v1 Core Features
			
 
				+Deprecated v1 Core Features
			
 
				+===========================
			
 
				 
			
 
				 - Multiple hierarchies including named ones are not supported.
			
 
				 
			
 
				-- All mount options and remounting are not supported.
			
 
				+- All v1 mount options are not supported.
			
 
				 
			
 
				 - The "tasks" file is removed and "cgroup.procs" is not sorted.
			
 
				 
			
@@ -1423,9 +1500,11 @@ D. Deprecated v1 Core Features
 
				   at the root instead.
			
 
				 
			
 
				 
			
 
				-R. Issues with v1 and Rationales for v2
			
 
				+Issues with v1 and Rationales for v2
			
 
				+====================================
			
 
				 
			
 
				-R-1. Multiple Hierarchies
			
 
				+Multiple Hierarchies
			
 
				+--------------------
			
 
				 
			
 
				 cgroup v1 allowed an arbitrary number of hierarchies and each
			
 
				 hierarchy could host any number of controllers.  While this seemed to
			
@@ -1477,7 +1556,8 @@ how memory is distributed beyond a certain level while still wanting
 
				 to control how CPU cycles are distributed.
			
 
				 
			
 
				 
			
 
				-R-2. Thread Granularity
			
 
				+Thread Granularity
			
 
				+------------------
			
 
				 
			
 
				 cgroup v1 allowed threads of a process to belong to different cgroups.
			
 
				 This didn't make sense for some controllers and those controllers
			
@@ -1520,7 +1600,8 @@ misbehaving and poorly abstracted interfaces and kernel exposing and
 
				 locked into constructs inadvertently.
			
 
				 
			
 
				 
			
 
				-R-3. Competition Between Inner Nodes and Threads
			
 
				+Competition Between Inner Nodes and Threads
			
 
				+-------------------------------------------
			
 
				 
			
 
				 cgroup v1 allowed threads to be in any cgroups which created an
			
 
				 interesting problem where threads belonging to a parent cgroup and its
			
@@ -1539,7 +1620,7 @@ simply weren't available for threads.
 
				 
			
 
				 The io controller implicitly created a hidden leaf node for each
			
 
				 cgroup to host the threads.  The hidden leaf had its own copies of all
			
 
				-the knobs with "leaf_" prefixed.  While this allowed equivalent
			
 
				+the knobs with ``leaf_`` prefixed.  While this allowed equivalent
			
 
				 control over internal threads, it was with serious drawbacks.  It
			
 
				 always added an extra layer of nesting which wouldn't be necessary
			
 
				 otherwise, made the interface messy and significantly complicated the
			
@@ -1560,7 +1641,8 @@ This clearly is a problem which needs to be addressed from cgroup core
 
				 in a uniform way.
			
 
				 
			
 
				 
			
 
				-R-4. Other Interface Issues
			
 
				+Other Interface Issues
			
 
				+----------------------
			
 
				 
			
 
				 cgroup v1 grew without oversight and developed a large number of
			
 
				 idiosyncrasies and inconsistencies.  One issue on the cgroup core side
			
@@ -1588,9 +1670,11 @@ cgroup v2 establishes common conventions where appropriate and updates
 
				 controllers so that they expose minimal and consistent interfaces.
			
 
				 
			
 
				 
			
 
				-R-5. Controller Issues and Remedies
			
 
				+Controller Issues and Remedies
			
 
				+------------------------------
			
 
				 
			
 
				-R-5-1. Memory
			
 
				+Memory
			
 
				+~~~~~~
			
 
				 
			
 
				 The original lower boundary, the soft limit, is defined as a limit
			
 
				 that is per default unset.  As a result, the set of cgroups that
			
--- a/Documentation/circular-buffers.txt
+++ b/Documentation/circular-buffers.txt
@@ -1,9 +1,9 @@
 
				-			       ================
			
 
				-			       CIRCULAR BUFFERS
			
 
				-			       ================
			
 
				+================
			
 
				+Circular Buffers
			
 
				+================
			
 
				 
			
 
				-By: David Howells <dhowells@redhat.com>
			
 
				-    Paul E. McKenney <paulmck@linux.vnet.ibm.com>
			
 
				+:Author: David Howells <dhowells@redhat.com>
			
 
				+:Author: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
			
 
				 
			
 
				 
			
 
				 Linux provides a number of features that can be used to implement circular
			
@@ -20,7 +20,7 @@ producer and just one consumer.  It is possible to handle multiple producers by
 
				 serialising them, and to handle multiple consumers by serialising them.
			
 
				 
			
 
				 
			
 
				-Contents:
			
 
				+.. Contents:
			
 
				 
			
 
				  (*) What is a circular buffer?
			
 
				 
			
@@ -31,8 +31,8 @@ Contents:
 
				      - The consumer.
			
 
				 
			
 
				 
			
 
				-==========================
			
 
				-WHAT IS A CIRCULAR BUFFER?
			
 
				+
			
 
				+What is a circular buffer?
			
 
				 ==========================
			
 
				 
			
 
				 First of all, what is a circular buffer?  A circular buffer is a buffer of
			
@@ -60,9 +60,7 @@ buffer, provided that neither index overtakes the other.  The implementer must
 
				 be careful, however, as a region more than one unit in size may wrap the end of
			
 
				 the buffer and be broken into two segments.
			
 
				 
			
 
				-
			
 
				-============================
			
 
				-MEASURING POWER-OF-2 BUFFERS
			
 
				+Measuring power-of-2 buffers
			
 
				 ============================
			
 
				 
			
 
				 Calculation of the occupancy or the remaining capacity of an arbitrarily sized
			
@@ -71,13 +69,13 @@ modulus (divide) instruction.  However, if the buffer is of a power-of-2 size,
 
				 then a much quicker bitwise-AND instruction can be used instead.
			
 
				 
			
 
				 Linux provides a set of macros for handling power-of-2 circular buffers.  These
			
 
				-can be made use of by:
			
 
				+can be made use of by::
			
 
				 
			
 
				 	#include <linux/circ_buf.h>
			
 
				 
			
 
				 The macros are:
			
 
				 
			
 
				- (*) Measure the remaining capacity of a buffer:
			
 
				+ (#) Measure the remaining capacity of a buffer::
			
 
				 
			
 
				 	CIRC_SPACE(head_index, tail_index, buffer_size);
			
 
				 
			
@@ -85,7 +83,7 @@ The macros are:
 
				      can be inserted.
			
 
				 
			
 
				 
			
 
				- (*) Measure the maximum consecutive immediate space in a buffer:
			
 
				+ (#) Measure the maximum consecutive immediate space in a buffer::
			
 
				 
			
 
				 	CIRC_SPACE_TO_END(head_index, tail_index, buffer_size);
			
 
				 
			
@@ -94,14 +92,14 @@ The macros are:
 
				      beginning of the buffer.
			
 
				 
			
 
				 
			
 
				- (*) Measure the occupancy of a buffer:
			
 
				+ (#) Measure the occupancy of a buffer::
			
 
				 
			
 
				 	CIRC_CNT(head_index, tail_index, buffer_size);
			
 
				 
			
 
				      This returns the number of items currently occupying a buffer[2].
			
 
				 
			
 
				 
			
 
				- (*) Measure the non-wrapping occupancy of a buffer:
			
 
				+ (#) Measure the non-wrapping occupancy of a buffer::
			
 
				 
			
 
				 	CIRC_CNT_TO_END(head_index, tail_index, buffer_size);
			
 
				 
			
@@ -112,7 +110,7 @@ The macros are:
 
				 Each of these macros will nominally return a value between 0 and buffer_size-1,
			
 
				 however:
			
 
				 
			
 
				- [1] CIRC_SPACE*() are intended to be used in the producer.  To the producer
			
 
				+ (1) CIRC_SPACE*() are intended to be used in the producer.  To the producer
			
 
				      they will return a lower bound as the producer controls the head index,
			
 
				      but the consumer may still be depleting the buffer on another CPU and
			
 
				      moving the tail index.
			
@@ -120,7 +118,7 @@ however:
 
				      To the consumer it will show an upper bound as the producer may be busy
			
 
				      depleting the space.
			
 
				 
			
 
				- [2] CIRC_CNT*() are intended to be used in the consumer.  To the consumer they
			
 
				+ (2) CIRC_CNT*() are intended to be used in the consumer.  To the consumer they
			
 
				      will return a lower bound as the consumer controls the tail index, but the
			
 
				      producer may still be filling the buffer on another CPU and moving the
			
 
				      head index.
			
@@ -128,14 +126,12 @@ however:
 
				      To the producer it will show an upper bound as the consumer may be busy
			
 
				      emptying the buffer.
			
 
				 
			
 
				- [3] To a third party, the order in which the writes to the indices by the
			
 
				+ (3) To a third party, the order in which the writes to the indices by the
			
 
				      producer and consumer become visible cannot be guaranteed as they are
			
 
				      independent and may be made on different CPUs - so the result in such a
			
 
				      situation will merely be a guess, and may even be negative.
			
 
				 
			
 
				-
			
 
				-===========================================
			
 
				-USING MEMORY BARRIERS WITH CIRCULAR BUFFERS
			
 
				+Using memory barriers with circular buffers
			
 
				 ===========================================
			
 
				 
			
 
				 By using memory barriers in conjunction with circular buffers, you can avoid
			
@@ -152,10 +148,10 @@ time, and only one thing should be emptying a buffer at any one time, but the
 
				 two sides can operate simultaneously.
			
 
				 
			
 
				 
			
 
				-THE PRODUCER
			
 
				+The producer
			
 
				 ------------
			
 
				 
			
 
				-The producer will look something like this:
			
 
				+The producer will look something like this::
			
 
				 
			
 
				 	spin_lock(&producer_lock);
			
 
				 
			
@@ -193,10 +189,10 @@ ordering between the read of the index indicating that the consumer has
 
				 vacated a given element and the write by the producer to that same element.
			
 
				 
			
 
				 
			
 
				-THE CONSUMER
			
 
				+The consumer
			
 
				 ------------
			
 
				 
			
 
				-The consumer will look something like this:
			
 
				+The consumer will look something like this::
			
 
				 
			
 
				 	spin_lock(&consumer_lock);
			
 
				 
			
@@ -235,8 +231,7 @@ prevents the compiler from tearing the store, and enforces ordering
 
				 against previous accesses.
			
 
				 
			
 
				 
			
 
				-===============
			
 
				-FURTHER READING
			
 
				+Further reading
			
 
				 ===============
			
 
				 
			
 
				 See also Documentation/memory-barriers.txt for a description of Linux's memory
			
--- a/Documentation/clk.txt
+++ b/Documentation/clk.txt
@@ -1,12 +1,16 @@
 
				-		The Common Clk Framework
			
 
				-		Mike Turquette <mturquette@ti.com>
			
 
				+========================
			
 
				+The Common Clk Framework
			
 
				+========================
			
 
				+
			
 
				+:Author: Mike Turquette <mturquette@ti.com>
			
 
				 
			
 
				 This document endeavours to explain the common clk framework details,
			
 
				 and how to port a platform over to this framework.  It is not yet a
			
 
				 detailed explanation of the clock api in include/linux/clk.h, but
			
 
				 perhaps someday it will include that information.
			
 
				 
			
 
				-	Part 1 - introduction and interface split
			
 
				+Introduction and interface split
			
 
				+================================
			
 
				 
			
 
				 The common clk framework is an interface to control the clock nodes
			
 
				 available on various devices today.  This may come in the form of clock
			
@@ -35,10 +39,11 @@ is defined in struct clk_foo and pointed to within struct clk_core.  This
 
				 allows for easy navigation between the two discrete halves of the common
			
 
				 clock interface.
			
 
				 
			
 
				-	Part 2 - common data structures and api
			
 
				+Common data structures and api
			
 
				+==============================
			
 
				 
			
 
				 Below is the common struct clk_core definition from
			
 
				-drivers/clk/clk.c, modified for brevity:
			
 
				+drivers/clk/clk.c, modified for brevity::
			
 
				 
			
 
				 	struct clk_core {
			
 
				 		const char		*name;
			
@@ -59,7 +64,7 @@ struct clk.  That api is documented in include/linux/clk.h.
 
				 
			
 
				 Platforms and devices utilizing the common struct clk_core use the struct
			
 
				 clk_ops pointer in struct clk_core to perform the hardware-specific parts of
			
 
				-the operations defined in clk-provider.h:
			
 
				+the operations defined in clk-provider.h::
			
 
				 
			
 
				 	struct clk_ops {
			
 
				 		int		(*prepare)(struct clk_hw *hw);
			
@@ -95,19 +100,20 @@ the operations defined in clk-provider.h:
 
				 					      struct dentry *dentry);
			
 
				 	};
			
 
				 
			
 
				-	Part 3 - hardware clk implementations
			
 
				+Hardware clk implementations
			
 
				+============================
			
 
				 
			
 
				 The strength of the common struct clk_core comes from its .ops and .hw pointers
			
 
				 which abstract the details of struct clk from the hardware-specific bits, and
			
 
				 vice versa.  To illustrate consider the simple gateable clk implementation in
			
 
				-drivers/clk/clk-gate.c:
			
 
				+drivers/clk/clk-gate.c::
			
 
				 
			
 
				-struct clk_gate {
			
 
				-	struct clk_hw	hw;
			
 
				-	void __iomem    *reg;
			
 
				-	u8              bit_idx;
			
 
				-	...
			
 
				-};
			
 
				+	struct clk_gate {
			
 
				+		struct clk_hw	hw;
			
 
				+		void __iomem    *reg;
			
 
				+		u8              bit_idx;
			
 
				+		...
			
 
				+	};
			
 
				 
			
 
				 struct clk_gate contains struct clk_hw hw as well as hardware-specific
			
 
				 knowledge about which register and bit controls this clk's gating.
			
@@ -115,7 +121,7 @@ Nothing about clock topology or accounting, such as enable_count or
 
				 notifier_count, is needed here.  That is all handled by the common
			
 
				 framework code and struct clk_core.
			
 
				 
			
 
				-Let's walk through enabling this clk from driver code:
			
 
				+Let's walk through enabling this clk from driver code::
			
 
				 
			
 
				 	struct clk *clk;
			
 
				 	clk = clk_get(NULL, "my_gateable_clk");
			
@@ -123,70 +129,71 @@ Let's walk through enabling this clk from driver code:
 
				 	clk_prepare(clk);
			
 
				 	clk_enable(clk);
			
 
				 
			
 
				-The call graph for clk_enable is very simple:
			
 
				+The call graph for clk_enable is very simple::
			
 
				 
			
 
				-clk_enable(clk);
			
 
				-	clk->ops->enable(clk->hw);
			
 
				-	[resolves to...]
			
 
				-		clk_gate_enable(hw);
			
 
				-		[resolves struct clk gate with to_clk_gate(hw)]
			
 
				-			clk_gate_set_bit(gate);
			
 
				+	clk_enable(clk);
			
 
				+		clk->ops->enable(clk->hw);
			
 
				+		[resolves to...]
			
 
				+			clk_gate_enable(hw);
			
 
				+			[resolves struct clk gate with to_clk_gate(hw)]
			
 
				+				clk_gate_set_bit(gate);
			
 
				 
			
 
				-And the definition of clk_gate_set_bit:
			
 
				+And the definition of clk_gate_set_bit::
			
 
				 
			
 
				-static void clk_gate_set_bit(struct clk_gate *gate)
			
 
				-{
			
 
				-	u32 reg;
			
 
				+	static void clk_gate_set_bit(struct clk_gate *gate)
			
 
				+	{
			
 
				+		u32 reg;
			
 
				 
			
 
				-	reg = __raw_readl(gate->reg);
			
 
				-	reg |= BIT(gate->bit_idx);
			
 
				-	writel(reg, gate->reg);
			
 
				-}
			
 
				+		reg = __raw_readl(gate->reg);
			
 
				+		reg |= BIT(gate->bit_idx);
			
 
				+		writel(reg, gate->reg);
			
 
				+	}
			
 
				 
			
 
				-Note that to_clk_gate is defined as:
			
 
				+Note that to_clk_gate is defined as::
			
 
				 
			
 
				-#define to_clk_gate(_hw) container_of(_hw, struct clk_gate, hw)
			
 
				+	#define to_clk_gate(_hw) container_of(_hw, struct clk_gate, hw)
			
 
				 
			
 
				 This pattern of abstraction is used for every clock hardware
			
 
				 representation.
			
 
				 
			
 
				-	Part 4 - supporting your own clk hardware
			
 
				+Supporting your own clk hardware
			
 
				+================================
			
 
				 
			
 
				 When implementing support for a new type of clock it is only necessary to
			
 
				-include the following header:
			
 
				+include the following header::
			
 
				 
			
 
				-#include <linux/clk-provider.h>
			
 
				+	#include <linux/clk-provider.h>
			
 
				 
			
 
				 To construct a clk hardware structure for your platform you must define
			
 
				-the following:
			
 
				+the following::
			
 
				 
			
 
				-struct clk_foo {
			
 
				-	struct clk_hw hw;
			
 
				-	... hardware specific data goes here ...
			
 
				-};
			
 
				+	struct clk_foo {
			
 
				+		struct clk_hw hw;
			
 
				+		... hardware specific data goes here ...
			
 
				+	};
			
 
				 
			
 
				 To take advantage of your data you'll need to support valid operations
			
 
				-for your clk:
			
 
				+for your clk::
			
 
				 
			
 
				-struct clk_ops clk_foo_ops {
			
 
				-	.enable		= &clk_foo_enable;
			
 
				-	.disable	= &clk_foo_disable;
			
 
				-};
			
 
				+	struct clk_ops clk_foo_ops {
			
 
				+		.enable		= &clk_foo_enable;
			
 
				+		.disable	= &clk_foo_disable;
			
 
				+	};
			
 
				 
			
 
				-Implement the above functions using container_of:
			
 
				+Implement the above functions using container_of::
			
 
				 
			
 
				-#define to_clk_foo(_hw) container_of(_hw, struct clk_foo, hw)
			
 
				+	#define to_clk_foo(_hw) container_of(_hw, struct clk_foo, hw)
			
 
				 
			
 
				-int clk_foo_enable(struct clk_hw *hw)
			
 
				-{
			
 
				-	struct clk_foo *foo;
			
 
				+	int clk_foo_enable(struct clk_hw *hw)
			
 
				+	{
			
 
				+		struct clk_foo *foo;
			
 
				 
			
 
				-	foo = to_clk_foo(hw);
			
 
				+		foo = to_clk_foo(hw);
			
 
				 
			
 
				-	... perform magic on foo ...
			
 
				+		... perform magic on foo ...
			
 
				 
			
 
				-	return 0;
			
 
				-};
			
 
				+		return 0;
			
 
				+	};
			
 
				 
			
 
				 Below is a matrix detailing which clk_ops are mandatory based upon the
			
 
				 hardware capabilities of that clock.  A cell marked as "y" means
			
@@ -194,41 +201,56 @@ mandatory, a cell marked as "n" implies that either including that
 
				 callback is invalid or otherwise unnecessary.  Empty cells are either
			
 
				 optional or must be evaluated on a case-by-case basis.
			
 
				 
			
 
				-                              clock hardware characteristics
			
 
				-                -----------------------------------------------------------
			
 
				-                | gate | change rate | single parent | multiplexer | root |
			
 
				-                |------|-------------|---------------|-------------|------|
			
 
				-.prepare        |      |             |               |             |      |
			
 
				-.unprepare      |      |             |               |             |      |
			
 
				-                |      |             |               |             |      |
			
 
				-.enable         | y    |             |               |             |      |
			
 
				-.disable        | y    |             |               |             |      |
			
 
				-.is_enabled     | y    |             |               |             |      |
			
 
				-                |      |             |               |             |      |
			
 
				-.recalc_rate    |      | y           |               |             |      |
			
 
				-.round_rate     |      | y [1]       |               |             |      |
			
 
				-.determine_rate |      | y [1]       |               |             |      |
			
 
				-.set_rate       |      | y           |               |             |      |
			
 
				-                |      |             |               |             |      |
			
 
				-.set_parent     |      |             | n             | y           | n    |
			
 
				-.get_parent     |      |             | n             | y           | n    |
			
 
				-                |      |             |               |             |      |
			
 
				-.recalc_accuracy|      |             |               |             |      |
			
 
				-                |      |             |               |             |      |
			
 
				-.init           |      |             |               |             |      |
			
 
				-                -----------------------------------------------------------
			
 
				-[1] either one of round_rate or determine_rate is required.
			
 
				+.. table:: clock hardware characteristics
			
 
				+
			
 
				+   +----------------+------+-------------+---------------+-------------+------+
			
 
				+   |                | gate | change rate | single parent | multiplexer | root |
			
 
				+   +================+======+=============+===============+=============+======+
			
 
				+   |.prepare        |      |             |               |             |      |
			
 
				+   +----------------+------+-------------+---------------+-------------+------+
			
 
				+   |.unprepare      |      |             |               |             |      |
			
 
				+   +----------------+------+-------------+---------------+-------------+------+
			
 
				+   +----------------+------+-------------+---------------+-------------+------+
			
 
				+   |.enable         | y    |             |               |             |      |
			
 
				+   +----------------+------+-------------+---------------+-------------+------+
			
 
				+   |.disable        | y    |             |               |             |      |
			
 
				+   +----------------+------+-------------+---------------+-------------+------+
			
 
				+   |.is_enabled     | y    |             |               |             |      |
			
 
				+   +----------------+------+-------------+---------------+-------------+------+
			
 
				+   +----------------+------+-------------+---------------+-------------+------+
			
 
				+   |.recalc_rate    |      | y           |               |             |      |
			
 
				+   +----------------+------+-------------+---------------+-------------+------+
			
 
				+   |.round_rate     |      | y [1]_      |               |             |      |
			
 
				+   +----------------+------+-------------+---------------+-------------+------+
			
 
				+   |.determine_rate |      | y [1]_      |               |             |      |
			
 
				+   +----------------+------+-------------+---------------+-------------+------+
			
 
				+   |.set_rate       |      | y           |               |             |      |
			
 
				+   +----------------+------+-------------+---------------+-------------+------+
			
 
				+   +----------------+------+-------------+---------------+-------------+------+
			
 
				+   |.set_parent     |      |             | n             | y           | n    |
			
 
				+   +----------------+------+-------------+---------------+-------------+------+
			
 
				+   |.get_parent     |      |             | n             | y           | n    |
			
 
				+   +----------------+------+-------------+---------------+-------------+------+
			
 
				+   +----------------+------+-------------+---------------+-------------+------+
			
 
				+   |.recalc_accuracy|      |             |               |             |      |
			
 
				+   +----------------+------+-------------+---------------+-------------+------+
			
 
				+   +----------------+------+-------------+---------------+-------------+------+
			
 
				+   |.init           |      |             |               |             |      |
			
 
				+   +----------------+------+-------------+---------------+-------------+------+
			
 
				+
			
 
				+.. [1] either one of round_rate or determine_rate is required.
			
 
				 
			
 
				 Finally, register your clock at run-time with a hardware-specific
			
 
				 registration function.  This function simply populates struct clk_foo's
			
 
				 data and then passes the common struct clk parameters to the framework
			
 
				-with a call to:
			
 
				+with a call to::
			
 
				 
			
 
				-clk_register(...)
			
 
				+	clk_register(...)
			
 
				 
			
 
				-See the basic clock types in drivers/clk/clk-*.c for examples.
			
 
				+See the basic clock types in ``drivers/clk/clk-*.c`` for examples.
			
 
				 
			
 
				-	Part 5 - Disabling clock gating of unused clocks
			
 
				+Disabling clock gating of unused clocks
			
 
				+=======================================
			
 
				 
			
 
				 Sometimes during development it can be useful to be able to bypass the
			
 
				 default disabling of unused clocks. For example, if drivers aren't enabling
			
@@ -239,7 +261,8 @@ are sorted out.
 
				 To bypass this disabling, include "clk_ignore_unused" in the bootargs to the
			
 
				 kernel.
			
 
				 
			
 
				-	Part 6 - Locking
			
 
				+Locking
			
 
				+=======
			
 
				 
			
 
				 The common clock framework uses two global locks, the prepare lock and the
			
 
				 enable lock.
			
--- a/Documentation/conf.py
+++ b/Documentation/conf.py
@@ -271,8 +271,7 @@ latex_elements = {
 
				 
			
 
				 # Additional stuff for the LaTeX preamble.
			
 
				     'preamble': '''
			
 
				-	% Adjust margins
			
 
				-	\\usepackage[margin=0.5in, top=1in, bottom=1in]{geometry}
			
 
				+        \\usepackage{ifthen}
			
 
				 
			
 
				         % Allow generate some pages in landscape
			
 
				         \\usepackage{lscape}
			
@@ -281,6 +280,7 @@ latex_elements = {
 
				 	\\definecolor{NoteColor}{RGB}{204,255,255}
			
 
				 	\\definecolor{WarningColor}{RGB}{255,204,204}
			
 
				 	\\definecolor{AttentionColor}{RGB}{255,255,204}
			
 
				+	\\definecolor{ImportantColor}{RGB}{192,255,204}
			
 
				 	\\definecolor{OtherColor}{RGB}{204,204,204}
			
 
				         \\newlength{\\mynoticelength}
			
 
				         \\makeatletter\\newenvironment{coloredbox}[1]{%
			
@@ -301,7 +301,12 @@ latex_elements = {
 
				 	            \\ifthenelse%
			
 
				 	            {\\equal{\\py@noticetype}{attention}}%
			
 
				 	            {\\colorbox{AttentionColor}{\\usebox{\\@tempboxa}}}%
			
 
				-	            {\\colorbox{OtherColor}{\\usebox{\\@tempboxa}}}%
			
 
				+		    {%
			
 
				+	               \\ifthenelse%
			
 
				+	               {\\equal{\\py@noticetype}{important}}%
			
 
				+	               {\\colorbox{ImportantColor}{\\usebox{\\@tempboxa}}}%
			
 
				+	               {\\colorbox{OtherColor}{\\usebox{\\@tempboxa}}}%
			
 
				+		    }%
			
 
				 		 }%
			
 
				 	      }%
			
 
				         }\\makeatother
			
@@ -336,30 +341,51 @@ latex_elements = {
 
				 if major == 1 and minor > 3:
			
 
				     latex_elements['preamble']  += '\\renewcommand*{\\DUrole}[2]{ #2 }\n'
			
 
				 
			
 
				+if major == 1 and minor <= 4:
			
 
				+    latex_elements['preamble']  += '\\usepackage[margin=0.5in, top=1in, bottom=1in]{geometry}'
			
 
				+elif major == 1 and (minor > 5 or (minor == 5 and patch >= 3)):
			
 
				+    latex_elements['sphinxsetup'] = 'hmargin=0.5in, vmargin=0.5in'
			
 
				+
			
 
				+
			
 
				 # Grouping the document tree into LaTeX files. List of tuples
			
 
				 # (source start file, target name, title,
			
 
				 #  author, documentclass [howto, manual, or own class]).
			
 
				+# Sorted in alphabetical order
			
 
				 latex_documents = [
			
 
				-    ('doc-guide/index', 'kernel-doc-guide.tex', 'Linux Kernel Documentation Guide',
			
 
				-     'The kernel development community', 'manual'),
			
 
				     ('admin-guide/index', 'linux-user.tex', 'Linux Kernel User Documentation',
			
 
				      'The kernel development community', 'manual'),
			
 
				     ('core-api/index', 'core-api.tex', 'The kernel core API manual',
			
 
				      'The kernel development community', 'manual'),
			
 
				-    ('driver-api/index', 'driver-api.tex', 'The kernel driver API manual',
			
 
				+    ('crypto/index', 'crypto-api.tex', 'Linux Kernel Crypto API manual',
			
 
				      'The kernel development community', 'manual'),
			
 
				-    ('input/index', 'linux-input.tex', 'The Linux input driver subsystem',
			
 
				+    ('dev-tools/index', 'dev-tools.tex', 'Development tools for the Kernel',
			
 
				      'The kernel development community', 'manual'),
			
 
				-    ('kernel-documentation', 'kernel-documentation.tex', 'The Linux Kernel Documentation',
			
 
				+    ('doc-guide/index', 'kernel-doc-guide.tex', 'Linux Kernel Documentation Guide',
			
 
				      'The kernel development community', 'manual'),
			
 
				-    ('process/index', 'development-process.tex', 'Linux Kernel Development Documentation',
			
 
				+    ('driver-api/index', 'driver-api.tex', 'The kernel driver API manual',
			
 
				+     'The kernel development community', 'manual'),
			
 
				+    ('filesystems/index', 'filesystems.tex', 'Linux Filesystems API',
			
 
				      'The kernel development community', 'manual'),
			
 
				     ('gpu/index', 'gpu.tex', 'Linux GPU Driver Developer\'s Guide',
			
 
				      'The kernel development community', 'manual'),
			
 
				+    ('input/index', 'linux-input.tex', 'The Linux input driver subsystem',
			
 
				+     'The kernel development community', 'manual'),
			
 
				+    ('kernel-hacking/index', 'kernel-hacking.tex', 'Unreliable Guide To Hacking The Linux Kernel',
			
 
				+     'The kernel development community', 'manual'),
			
 
				     ('media/index', 'media.tex', 'Linux Media Subsystem Documentation',
			
 
				      'The kernel development community', 'manual'),
			
 
				+    ('networking/index', 'networking.tex', 'Linux Networking Documentation',
			
 
				+     'The kernel development community', 'manual'),
			
 
				+    ('process/index', 'development-process.tex', 'Linux Kernel Development Documentation',
			
 
				+     'The kernel development community', 'manual'),
			
 
				     ('security/index', 'security.tex', 'The kernel security subsystem manual',
			
 
				      'The kernel development community', 'manual'),
			
 
				+    ('sh/index', 'sh.tex', 'SuperH architecture implementation manual',
			
 
				+     'The kernel development community', 'manual'),
			
 
				+    ('sound/index', 'sound.tex', 'Linux Sound Subsystem Documentation',
			
 
				+     'The kernel development community', 'manual'),
			
 
				+    ('userspace-api/index', 'userspace-api.tex', 'The Linux kernel user-space API guide',
			
 
				+     'The kernel development community', 'manual'),
			
 
				 ]
			
 
				 
			
 
				 # The name of an image file (relative to this directory) to place at the top of
			
--- a/Documentation/core-api/assoc_array.rst
+++ b/Documentation/core-api/assoc_array.rst
@@ -10,7 +10,10 @@ properties:
 
				 
			
 
				 1. Objects are opaque pointers.  The implementation does not care where they
			
 
				    point (if anywhere) or what they point to (if anything).
			
 
				-.. note:: Pointers to objects _must_ be zero in the least significant bit.
			
 
				+
			
 
				+   .. note::
			
 
				+
			
 
				+      Pointers to objects _must_ be zero in the least significant bit.
			
 
				 
			
 
				 2. Objects do not need to contain linkage blocks for use by the array.  This
			
 
				    permits an object to be located in multiple arrays simultaneously.
			
--- a/Documentation/core-api/atomic_ops.rst
+++ b/Documentation/core-api/atomic_ops.rst
@@ -303,6 +303,11 @@ defined which accomplish this::
 
				 	void smp_mb__before_atomic(void);
			
 
				 	void smp_mb__after_atomic(void);
			
 
				 
			
 
				+Preceding a non-value-returning read-modify-write atomic operation with
			
 
				+smp_mb__before_atomic() and following it with smp_mb__after_atomic()
			
 
				+provides the same full ordering that is provided by value-returning
			
 
				+read-modify-write atomic operations.
			
 
				+
			
 
				 For example, smp_mb__before_atomic() can be used like so::
			
 
				 
			
 
				 	obj->dead = 1;
			
--- a/Documentation/core-api/index.rst
+++ b/Documentation/core-api/index.rst
@@ -19,6 +19,7 @@ Core utilities
 
				    workqueue
			
 
				    genericirq
			
 
				    flexible-arrays
			
 
				+   librs
			
 
				 
			
 
				 Interfaces for kernel debugging
			
 
				 ===============================
			
--- a/Documentation/core-api/kernel-api.rst
+++ b/Documentation/core-api/kernel-api.rst
@@ -114,7 +114,7 @@ The Slab Cache
 
				 User Space Memory Access
			
 
				 ------------------------
			
 
				 
			
 
				-.. kernel-doc:: arch/x86/include/asm/uaccess_32.h
			
 
				+.. kernel-doc:: arch/x86/include/asm/uaccess.h
			
 
				    :internal:
			
 
				 
			
 
				 .. kernel-doc:: arch/x86/lib/usercopy_32.c
			
--- a/Documentation/core-api/librs.rst
+++ b/Documentation/core-api/librs.rst
@@ -0,0 +1,212 @@
 
				+==========================================
			
 
				+Reed-Solomon Library Programming Interface
			
 
				+==========================================
			
 
				+
			
 
				+:Author: Thomas Gleixner
			
 
				+
			
 
				+Introduction
			
 
				+============
			
 
				+
			
 
				+The generic Reed-Solomon Library provides encoding, decoding and error
			
 
				+correction functions.
			
 
				+
			
 
				+Reed-Solomon codes are used in communication and storage applications to
			
 
				+ensure data integrity.
			
 
				+
			
 
				+This documentation is provided for developers who want to utilize the
			
 
				+functions provided by the library.
			
 
				+
			
 
				+Known Bugs And Assumptions
			
 
				+==========================
			
 
				+
			
 
				+None.
			
 
				+
			
 
				+Usage
			
 
				+=====
			
 
				+
			
 
				+This chapter provides examples of how to use the library.
			
 
				+
			
 
				+Initializing
			
 
				+------------
			
 
				+
			
 
				+The init function init_rs returns a pointer to an rs decoder structure,
			
 
				+which holds the necessary information for encoding, decoding and error
			
 
				+correction with the given polynomial. It either uses an existing
			
 
				+matching decoder or creates a new one. On creation all the lookup tables
			
 
				+for fast en/decoding are created. The function may take a while, so make
			
 
				+sure not to call it in critical code paths.
			
 
				+
			
 
				+::
			
 
				+
			
 
				+    /* the Reed Solomon control structure */
			
 
				+    static struct rs_control *rs_decoder;
			
 
				+
			
 
				+    /* Symbolsize is 10 (bits)
			
 
				+     * Primitive polynomial is x^10+x^3+1
			
 
				+     * first consecutive root is 0
			
 
				+     * primitive element to generate roots = 1
			
 
				+     * generator polynomial degree (number of roots) = 6
			
 
				+     */
			
 
				+    rs_decoder = init_rs (10, 0x409, 0, 1, 6);
			
 
				+
			
 
				+
			
 
				+Encoding
			
 
				+--------
			
 
				+
			
 
				+The encoder calculates the Reed-Solomon code over the given data length
			
 
				+and stores the result in the parity buffer. Note that the parity buffer
			
 
				+must be initialized before calling the encoder.
			
 
				+
			
 
				+The expanded data can be inverted on the fly by providing a non-zero
			
 
				+inversion mask. The expanded data is XOR'ed with the mask. This is used
			
 
				+e.g. for FLASH ECC, where the all 0xFF is inverted to an all 0x00. The
			
 
				+Reed-Solomon code for all 0x00 is all 0x00. The code is inverted before
			
 
				+storing to FLASH so it is 0xFF too. This prevents reads from an
			
 
				+erased FLASH from resulting in ECC errors.
			
 
				+
			
 
				+The databytes are expanded to the given symbol size on the fly. There is
			
 
				+no support for encoding continuous bitstreams with a symbol size != 8 at
			
 
				+the moment. If it is necessary it should not be a big deal to implement
			
 
				+such functionality.
			
 
				+
			
 
				+::
			
 
				+
			
 
				+    /* Parity buffer. Size = number of roots */
			
 
				+    uint16_t par[6];
			
 
				+    /* Initialize the parity buffer */
			
 
				+    memset(par, 0, sizeof(par));
			
 
				+    /* Encode 512 byte in data8. Store parity in buffer par */
			
 
				+    encode_rs8 (rs_decoder, data8, 512, par, 0);
			
 
				+
			
 
				+
			
 
				+Decoding
			
 
				+--------
			
 
				+
			
 
				+The decoder calculates the syndrome over the given data length and the
			
 
				+received parity symbols and corrects errors in the data.
			
 
				+
			
 
				+If a syndrome is available from a hardware decoder then the syndrome
			
 
				+calculation is skipped.
			
 
				+
			
 
				+The correction of the data buffer can be suppressed by providing a
			
 
				+correction pattern buffer and an error location buffer to the decoder.
			
 
				+The decoder stores the calculated error location and the correction
			
 
				+bitmask in the given buffers. This is useful for hardware decoders which
			
 
				+use a weird bit ordering scheme.
			
 
				+
			
 
				+The databytes are expanded to the given symbol size on the fly. There is
			
 
				+no support for decoding continuous bitstreams with a symbolsize != 8 at
			
 
				+the moment. If it is necessary it should not be a big deal to implement
			
 
				+such functionality.
			
 
				+
			
 
				+Decoding with syndrome calculation, direct data correction
			
 
				+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
			
 
				+
			
 
				+::
			
 
				+
			
 
				+    /* Parity buffer. Size = number of roots */
			
 
				+    uint16_t par[6];
			
 
				+    uint8_t  data[512];
			
 
				+    int numerr;
			
 
				+    /* Receive data */
			
 
				+    .....
			
 
				+    /* Receive parity */
			
 
				+    .....
			
 
				+    /* Decode 512 byte in data8.*/
			
 
				+    numerr = decode_rs8 (rs_decoder, data8, par, 512, NULL, 0, NULL, 0, NULL);
			
 
				+
			
 
				+
			
 
				+Decoding with syndrome given by hardware decoder, direct data correction
			
 
				+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
			
 
				+
			
 
				+::
			
 
				+
			
 
				+    /* Parity buffer. Size = number of roots */
			
 
				+    uint16_t par[6], syn[6];
			
 
				+    uint8_t  data[512];
			
 
				+    int numerr;
			
 
				+    /* Receive data */
			
 
				+    .....
			
 
				+    /* Receive parity */
			
 
				+    .....
			
 
				+    /* Get syndrome from hardware decoder */
			
 
				+    .....
			
 
				+    /* Decode 512 byte in data8.*/
			
 
				+    numerr = decode_rs8 (rs_decoder, data8, par, 512, syn, 0, NULL, 0, NULL);
			
 
				+
			
 
				+
			
 
				+Decoding with syndrome given by hardware decoder, no direct data correction.
			
 
				+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
			
 
				+
			
 
				+Note: It's not necessary to give data and received parity to the
			
 
				+decoder.
			
 
				+
			
 
				+::
			
 
				+
			
 
				+    /* Parity buffer. Size = number of roots */
			
 
				+    uint16_t par[6], syn[6], corr[8];
			
 
				+    uint8_t  data[512];
			
 
				+    int numerr, errpos[8];
			
 
				+    /* Receive data */
			
 
				+    .....
			
 
				+    /* Receive parity */
			
 
				+    .....
			
 
				+    /* Get syndrome from hardware decoder */
			
 
				+    .....
			
 
				+    /* Decode 512 byte in data8.*/
			
 
				+    numerr = decode_rs8 (rs_decoder, NULL, NULL, 512, syn, 0, errpos, 0, corr);
			
 
				+    for (i = 0; i < numerr; i++) {
			
 
				+        do_error_correction_in_your_buffer(errpos[i], corr[i]);
			
 
				+    }
			
 
				+
			
 
				+
			
 
				+Cleanup
			
 
				+-------
			
 
				+
			
 
				+The function free_rs frees the allocated resources, if the caller is
			
 
				+the last user of the decoder.
			
 
				+
			
 
				+::
			
 
				+
			
 
				+    /* Release resources */
			
 
				+    free_rs(rs_decoder);
			
 
				+
			
 
				+
			
 
				+Structures
			
 
				+==========
			
 
				+
			
 
				+This chapter contains the autogenerated documentation of the structures
			
 
				+which are used in the Reed-Solomon Library and are relevant for a
			
 
				+developer.
			
 
				+
			
 
				+.. kernel-doc:: include/linux/rslib.h
			
 
				+   :internal:
			
 
				+
			
 
				+Public Functions Provided
			
 
				+=========================
			
 
				+
			
 
				+This chapter contains the autogenerated documentation of the
			
 
				+Reed-Solomon functions which are exported.
			
 
				+
			
 
				+.. kernel-doc:: lib/reed_solomon/reed_solomon.c
			
 
				+   :export:
			
 
				+
			
 
				+Credits
			
 
				+=======
			
 
				+
			
 
				+The library code for encoding and decoding was written by Phil Karn.
			
 
				+
			
 
				+::
			
 
				+
			
 
				+            Copyright 2002, Phil Karn, KA9Q
			
 
				+            May be used under the terms of the GNU General Public License (GPL)
			
 
				+
			
 
				+
			
 
				+The wrapper functions and interfaces are written by Thomas Gleixner.
			
 
				+
			
 
				+Many users have provided bugfixes, improvements and helping hands for
			
 
				+testing. Thanks a lot.
			
 
				+
			
 
				+The following people have contributed to this document:
			
 
				+
			
 
				+Thomas Gleixner\ tglx@linutronix.de
			
--- a/Documentation/cpu-load.txt
+++ b/Documentation/cpu-load.txt
@@ -1,9 +1,10 @@
 
				+========
			
 
				 CPU load
			
 
				---------
			
 
				+========
			
 
				 
			
 
				-Linux exports various bits of information via `/proc/stat' and
			
 
				-`/proc/uptime' that userland tools, such as top(1), use to calculate
			
 
				-the average time system spent in a particular state, for example:
			
 
				+Linux exports various bits of information via ``/proc/stat`` and
			
 
				+``/proc/uptime`` that userland tools, such as top(1), use to calculate
			
 
				+the average time system spent in a particular state, for example::
			
 
				 
			
 
				     $ iostat
			
 
				     Linux 2.6.18.3-exp (linmac)     02/20/2007
			
@@ -17,7 +18,7 @@ Here the system thinks that over the default sampling period the
 
				 system spent 10.01% of the time doing work in user space, 2.92% in the
			
 
				 kernel, and was overall 81.63% of the time idle.
			
 
				 
			
 
				-In most cases the `/proc/stat' information reflects the reality quite
			
 
				+In most cases the ``/proc/stat`` information reflects the reality quite
			
 
				 closely, however due to the nature of how/when the kernel collects
			
 
				 this data sometimes it can not be trusted at all.
			
 
				 
			
@@ -33,78 +34,78 @@ Example
 
				 -------
			
 
				 
			
 
				 If we imagine the system with one task that periodically burns cycles
			
 
				-in the following manner:
			
 
				+in the following manner::
			
 
				 
			
 
				- time line between two timer interrupts
			
 
				-|--------------------------------------|
			
 
				- ^                                    ^
			
 
				- |_ something begins working          |
			
 
				-                                      |_ something goes to sleep
			
 
				-                                     (only to be awaken quite soon)
			
 
				+     time line between two timer interrupts
			
 
				+    |--------------------------------------|
			
 
				+     ^                                    ^
			
 
				+     |_ something begins working          |
			
 
				+                                          |_ something goes to sleep
			
 
				+                                         (only to be awakened quite soon)
			
 
				 
			
 
				 In the above situation the system will be 0% loaded according to the
			
 
				-`/proc/stat' (since the timer interrupt will always happen when the
			
 
				+``/proc/stat`` (since the timer interrupt will always happen when the
			
 
				 system is executing the idle handler), but in reality the load is
			
 
				 closer to 99%.
			
 
				 
			
 
				 One can imagine many more situations where this behavior of the kernel
			
 
				-will lead to quite erratic information inside `/proc/stat'.
			
 
				-
			
 
				-
			
 
				-/* gcc -o hog smallhog.c */
			
 
				-#include <time.h>
			
 
				-#include <limits.h>
			
 
				-#include <signal.h>
			
 
				-#include <sys/time.h>
			
 
				-#define HIST 10
			
 
				-
			
 
				-static volatile sig_atomic_t stop;
			
 
				-
			
 
				-static void sighandler (int signr)
			
 
				-{
			
 
				-     (void) signr;
			
 
				-     stop = 1;
			
 
				-}
			
 
				-static unsigned long hog (unsigned long niters)
			
 
				-{
			
 
				-     stop = 0;
			
 
				-     while (!stop && --niters);
			
 
				-     return niters;
			
 
				-}
			
 
				-int main (void)
			
 
				-{
			
 
				-     int i;
			
 
				-     struct itimerval it = { .it_interval = { .tv_sec = 0, .tv_usec = 1 },
			
 
				-                             .it_value = { .tv_sec = 0, .tv_usec = 1 } };
			
 
				-     sigset_t set;
			
 
				-     unsigned long v[HIST];
			
 
				-     double tmp = 0.0;
			
 
				-     unsigned long n;
			
 
				-     signal (SIGALRM, &sighandler);
			
 
				-     setitimer (ITIMER_REAL, &it, NULL);
			
 
				-
			
 
				-     hog (ULONG_MAX);
			
 
				-     for (i = 0; i < HIST; ++i) v[i] = ULONG_MAX - hog (ULONG_MAX);
			
 
				-     for (i = 0; i < HIST; ++i) tmp += v[i];
			
 
				-     tmp /= HIST;
			
 
				-     n = tmp - (tmp / 3.0);
			
 
				-
			
 
				-     sigemptyset (&set);
			
 
				-     sigaddset (&set, SIGALRM);
			
 
				-
			
 
				-     for (;;) {
			
 
				-         hog (n);
			
 
				-         sigwait (&set, &i);
			
 
				-     }
			
 
				-     return 0;
			
 
				-}
			
 
				+will lead to quite erratic information inside ``/proc/stat``::
			
 
				+
			
 
				+
			
 
				+	/* gcc -o hog smallhog.c */
			
 
				+	#include <time.h>
			
 
				+	#include <limits.h>
			
 
				+	#include <signal.h>
			
 
				+	#include <sys/time.h>
			
 
				+	#define HIST 10
			
 
				+
			
 
				+	static volatile sig_atomic_t stop;
			
 
				+
			
 
				+	static void sighandler (int signr)
			
 
				+	{
			
 
				+	(void) signr;
			
 
				+	stop = 1;
			
 
				+	}
			
 
				+	static unsigned long hog (unsigned long niters)
			
 
				+	{
			
 
				+	stop = 0;
			
 
				+	while (!stop && --niters);
			
 
				+	return niters;
			
 
				+	}
			
 
				+	int main (void)
			
 
				+	{
			
 
				+	int i;
			
 
				+	struct itimerval it = { .it_interval = { .tv_sec = 0, .tv_usec = 1 },
			
 
				+				.it_value = { .tv_sec = 0, .tv_usec = 1 } };
			
 
				+	sigset_t set;
			
 
				+	unsigned long v[HIST];
			
 
				+	double tmp = 0.0;
			
 
				+	unsigned long n;
			
 
				+	signal (SIGALRM, &sighandler);
			
 
				+	setitimer (ITIMER_REAL, &it, NULL);
			
 
				+
			
 
				+	hog (ULONG_MAX);
			
 
				+	for (i = 0; i < HIST; ++i) v[i] = ULONG_MAX - hog (ULONG_MAX);
			
 
				+	for (i = 0; i < HIST; ++i) tmp += v[i];
			
 
				+	tmp /= HIST;
			
 
				+	n = tmp - (tmp / 3.0);
			
 
				+
			
 
				+	sigemptyset (&set);
			
 
				+	sigaddset (&set, SIGALRM);
			
 
				+
			
 
				+	for (;;) {
			
 
				+		hog (n);
			
 
				+		sigwait (&set, &i);
			
 
				+	}
			
 
				+	return 0;
			
 
				+	}
			
 
				 
			
 
				 
			
 
				 References
			
 
				 ----------
			
 
				 
			
 
				-http://lkml.org/lkml/2007/2/12/6
			
 
				-Documentation/filesystems/proc.txt (1.8)
			
 
				+- http://lkml.org/lkml/2007/2/12/6
			
 
				+- Documentation/filesystems/proc.txt (1.8)
			
 
				 
			
 
				 
			
 
				 Thanks
			
--- a/Documentation/cputopology.txt
+++ b/Documentation/cputopology.txt
@@ -1,3 +1,6 @@
 
				+===========================================
			
 
				+How CPU topology info is exported via sysfs
			
 
				+===========================================
			
 
				 
			
 
				 Export CPU topology info via sysfs. Items (attributes) are similar
			
 
				 to /proc/cpuinfo output of some architectures:
			
@@ -75,24 +78,26 @@ CONFIG_SCHED_BOOK and CONFIG_DRAWER are currently only used on s390, where
 
				 they reflect the cpu and cache hierarchy.
			
 
				 
			
 
				 For an architecture to support this feature, it must define some of
			
 
				-these macros in include/asm-XXX/topology.h:
			
 
				-#define topology_physical_package_id(cpu)
			
 
				-#define topology_core_id(cpu)
			
 
				-#define topology_book_id(cpu)
			
 
				-#define topology_drawer_id(cpu)
			
 
				-#define topology_sibling_cpumask(cpu)
			
 
				-#define topology_core_cpumask(cpu)
			
 
				-#define topology_book_cpumask(cpu)
			
 
				-#define topology_drawer_cpumask(cpu)
			
 
				-
			
 
				-The type of **_id macros is int.
			
 
				-The type of **_cpumask macros is (const) struct cpumask *. The latter
			
 
				-correspond with appropriate **_siblings sysfs attributes (except for
			
 
				+these macros in include/asm-XXX/topology.h::
			
 
				+
			
 
				+	#define topology_physical_package_id(cpu)
			
 
				+	#define topology_core_id(cpu)
			
 
				+	#define topology_book_id(cpu)
			
 
				+	#define topology_drawer_id(cpu)
			
 
				+	#define topology_sibling_cpumask(cpu)
			
 
				+	#define topology_core_cpumask(cpu)
			
 
				+	#define topology_book_cpumask(cpu)
			
 
				+	#define topology_drawer_cpumask(cpu)
			
 
				+
			
 
				+The type of ``**_id`` macros is int.
			
 
				+The type of ``**_cpumask`` macros is ``(const) struct cpumask *``. The latter
			
 
				+correspond with appropriate ``**_siblings`` sysfs attributes (except for
			
 
				 topology_sibling_cpumask() which corresponds with thread_siblings).
			
 
				 
			
 
				 To be consistent on all architectures, include/linux/topology.h
			
 
				 provides default definitions for any of the above macros that are
			
 
				 not defined by include/asm-XXX/topology.h:
			
 
				+
			
 
				 1) physical_package_id: -1
			
 
				 2) core_id: 0
			
 
				 3) sibling_cpumask: just the given CPU
			
@@ -107,6 +112,7 @@ Additionally, CPU topology information is provided under
 
				 /sys/devices/system/cpu and includes these files.  The internal
			
 
				 source for the output is in brackets ("[]").
			
 
				 
			
 
				+    =========== ==========================================================
			
 
				     kernel_max: the maximum CPU index allowed by the kernel configuration.
			
 
				 		[NR_CPUS-1]
			
 
				 
			
@@ -122,6 +128,7 @@ source for the output is in brackets ("[]").
 
				 
			
 
				     present:	CPUs that have been identified as being present in the
			
 
				 		system. [cpu_present_mask]
			
 
				+    =========== ==========================================================
			
 
				 
			
 
				 The format for the above output is compatible with cpulist_parse()
			
 
				 [see <linux/cpumask.h>].  Some examples follow.
			
@@ -129,7 +136,7 @@ The format for the above output is compatible with cpulist_parse()
 
				 In this example, there are 64 CPUs in the system but cpus 32-63 exceed
			
 
				 the kernel max which is limited to 0..31 by the NR_CPUS config option
			
 
				 being 32.  Note also that CPUs 2 and 4-31 are not online but could be
			
 
				-brought online as they are both present and possible.
			
 
				+brought online as they are both present and possible::
			
 
				 
			
 
				      kernel_max: 31
			
 
				         offline: 2,4-31,32-63
			
@@ -140,7 +147,7 @@ brought online as they are both present and possible.
 
				 In this example, the NR_CPUS config option is 128, but the kernel was
			
 
				 started with possible_cpus=144.  There are 4 CPUs in the system and cpu2
			
 
				 was manually taken offline (and is the only CPU that can be brought
			
 
				-online.)
			
 
				+online.)::
			
 
				 
			
 
				      kernel_max: 127
			
 
				         offline: 2,4-127,128-143
			
--- a/Documentation/crc32.txt
+++ b/Documentation/crc32.txt
@@ -1,4 +1,6 @@
 
				-A brief CRC tutorial.
			
 
				+=================================
			
 
				+brief tutorial on CRC computation
			
 
				+=================================
			
 
				 
			
 
				 A CRC is a long-division remainder.  You add the CRC to the message,
			
 
				 and the whole thing (message+CRC) is a multiple of the given
			
@@ -8,7 +10,8 @@ remainder computed on the message+CRC is 0.  This latter approach
 
				 is used by a lot of hardware implementations, and is why so many
			
 
				 protocols put the end-of-frame flag after the CRC.
			
 
				 
			
 
				-It's actually the same long division you learned in school, except that
			
 
				+It's actually the same long division you learned in school, except that:
			
 
				+
			
 
				 - We're working in binary, so the digits are only 0 and 1, and
			
 
				 - When dividing polynomials, there are no carries.  Rather than add and
			
 
				   subtract, we just xor.  Thus, we tend to get a bit sloppy about
			
@@ -40,11 +43,12 @@ throw the quotient bit away, but subtract the appropriate multiple of
 
				 the polynomial from the remainder and we're back to where we started,
			
 
				 ready to process the next bit.
			
 
				 
			
 
				-A big-endian CRC written this way would be coded like:
			
 
				-for (i = 0; i < input_bits; i++) {
			
 
				-	multiple = remainder & 0x80000000 ? CRCPOLY : 0;
			
 
				-	remainder = (remainder << 1 | next_input_bit()) ^ multiple;
			
 
				-}
			
 
				+A big-endian CRC written this way would be coded like::
			
 
				+
			
 
				+	for (i = 0; i < input_bits; i++) {
			
 
				+		multiple = remainder & 0x80000000 ? CRCPOLY : 0;
			
 
				+		remainder = (remainder << 1 | next_input_bit()) ^ multiple;
			
 
				+	}
			
 
				 
			
 
				 Notice how, to get at bit 32 of the shifted remainder, we look
			
 
				 at bit 31 of the remainder *before* shifting it.
			
@@ -54,25 +58,26 @@ the remainder don't actually affect any decision-making until
 
				 32 bits later.  Thus, the first 32 cycles of this are pretty boring.
			
 
				 Also, to add the CRC to a message, we need a 32-bit-long hole for it at
			
 
				 the end, so we have to add 32 extra cycles shifting in zeros at the
			
 
				-end of every message,
			
 
				+end of every message.
			
 
				 
			
 
				 These details lead to a standard trick: rearrange merging in the
			
 
				 next_input_bit() until the moment it's needed.  Then the first 32 cycles
			
 
				 can be precomputed, and merging in the final 32 zero bits to make room
			
 
				-for the CRC can be skipped entirely.  This changes the code to:
			
 
				+for the CRC can be skipped entirely.  This changes the code to::
			
 
				 
			
 
				-for (i = 0; i < input_bits; i++) {
			
 
				-	remainder ^= next_input_bit() << 31;
			
 
				-	multiple = (remainder & 0x80000000) ? CRCPOLY : 0;
			
 
				-	remainder = (remainder << 1) ^ multiple;
			
 
				-}
			
 
				+	for (i = 0; i < input_bits; i++) {
			
 
				+		remainder ^= next_input_bit() << 31;
			
 
				+		multiple = (remainder & 0x80000000) ? CRCPOLY : 0;
			
 
				+		remainder = (remainder << 1) ^ multiple;
			
 
				+	}
			
 
				 
			
 
				-With this optimization, the little-endian code is particularly simple:
			
 
				-for (i = 0; i < input_bits; i++) {
			
 
				-	remainder ^= next_input_bit();
			
 
				-	multiple = (remainder & 1) ? CRCPOLY : 0;
			
 
				-	remainder = (remainder >> 1) ^ multiple;
			
 
				-}
			
 
				+With this optimization, the little-endian code is particularly simple::
			
 
				+
			
 
				+	for (i = 0; i < input_bits; i++) {
			
 
				+		remainder ^= next_input_bit();
			
 
				+		multiple = (remainder & 1) ? CRCPOLY : 0;
			
 
				+		remainder = (remainder >> 1) ^ multiple;
			
 
				+	}
			
 
				 
			
 
				 The most significant coefficient of the remainder polynomial is stored
			
 
				 in the least significant bit of the binary "remainder" variable.
			
@@ -81,23 +86,25 @@ be bit-reversed) and next_input_bit().
 
				 
			
 
				 As long as next_input_bit is returning the bits in a sensible order, we don't
			
 
				 *have* to wait until the last possible moment to merge in additional bits.
			
 
				-We can do it 8 bits at a time rather than 1 bit at a time:
			
 
				-for (i = 0; i < input_bytes; i++) {
			
 
				-	remainder ^= next_input_byte() << 24;
			
 
				-	for (j = 0; j < 8; j++) {
			
 
				-		multiple = (remainder & 0x80000000) ? CRCPOLY : 0;
			
 
				-		remainder = (remainder << 1) ^ multiple;
			
 
				+We can do it 8 bits at a time rather than 1 bit at a time::
			
 
				+
			
 
				+	for (i = 0; i < input_bytes; i++) {
			
 
				+		remainder ^= next_input_byte() << 24;
			
 
				+		for (j = 0; j < 8; j++) {
			
 
				+			multiple = (remainder & 0x80000000) ? CRCPOLY : 0;
			
 
				+			remainder = (remainder << 1) ^ multiple;
			
 
				+		}
			
 
				 	}
			
 
				-}
			
 
				 
			
 
				-Or in little-endian:
			
 
				-for (i = 0; i < input_bytes; i++) {
			
 
				-	remainder ^= next_input_byte();
			
 
				-	for (j = 0; j < 8; j++) {
			
 
				-		multiple = (remainder & 1) ? CRCPOLY : 0;
			
 
				-		remainder = (remainder >> 1) ^ multiple;
			
 
				+Or in little-endian::
			
 
				+
			
 
				+	for (i = 0; i < input_bytes; i++) {
			
 
				+		remainder ^= next_input_byte();
			
 
				+		for (j = 0; j < 8; j++) {
			
 
				+			multiple = (remainder & 1) ? CRCPOLY : 0;
			
 
				+			remainder = (remainder >> 1) ^ multiple;
			
 
				+		}
			
 
				 	}
			
 
				-}
			
 
				 
			
 
				 If the input is a multiple of 32 bits, you can even XOR in a 32-bit
			
 
				 word at a time and increase the inner loop count to 32.
			
--- a/Documentation/crypto/api-samples.rst
+++ b/Documentation/crypto/api-samples.rst
@@ -155,9 +155,9 @@ Code Example For Use of Operational State Memory With SHASH
 
				         char ctx[];
			
 
				     };
			
 
				 
			
 
				-    static struct sdesc init_sdesc(struct crypto_shash *alg)
			
 
				+    static struct sdesc *init_sdesc(struct crypto_shash *alg)
			
 
				     {
			
 
				-        struct sdesc sdesc;
			
 
				+        struct sdesc *sdesc;
			
 
				         int size;
			
 
				 
			
 
				         size = sizeof(struct shash_desc) + crypto_shash_descsize(alg);
			
@@ -169,15 +169,16 @@ Code Example For Use of Operational State Memory With SHASH
 
				         return sdesc;
			
 
				     }
			
 
				 
			
 
				-    static int calc_hash(struct crypto_shashalg,
			
 
				-                 const unsigned chardata, unsigned int datalen,
			
 
				-                 unsigned chardigest) {
			
 
				-        struct sdesc sdesc;
			
 
				+    static int calc_hash(struct crypto_shash *alg,
			
 
				+                 const unsigned char *data, unsigned int datalen,
			
 
				+                 unsigned char *digest)
			
 
				+    {
			
 
				+        struct sdesc *sdesc;
			
 
				         int ret;
			
 
				 
			
 
				         sdesc = init_sdesc(alg);
			
 
				         if (IS_ERR(sdesc)) {
			
 
				-            pr_info("trusted_key: can't alloc %s\n", hash_alg);
			
 
				+            pr_info("can't alloc sdesc\n");
			
 
				             return PTR_ERR(sdesc);
			
 
				         }
			
 
				 
			
@@ -186,6 +187,23 @@ Code Example For Use of Operational State Memory With SHASH
 
				         return ret;
			
 
				     }
			
 
				 
			
 
				+    static int test_hash(const unsigned char *data, unsigned int datalen,
			
 
				+                 unsigned char *digest)
			
 
				+    {
			
 
				+        struct crypto_shash *alg;
			
 
				+        char *hash_alg_name = "sha1-padlock-nano";
			
 
				+        int ret;
			
 
				+
			
 
				+        alg = crypto_alloc_shash(hash_alg_name, CRYPTO_ALG_TYPE_SHASH, 0);
			
 
				+        if (IS_ERR(alg)) {
			
 
				+                pr_info("can't alloc alg %s\n", hash_alg_name);
			
 
				+                return PTR_ERR(alg);
			
 
				+        }
			
 
				+        ret = calc_hash(alg, data, datalen, digest);
			
 
				+        crypto_free_shash(alg);
			
 
				+        return ret;
			
 
				+    }
			
 
				+
			
 
				 
			
 
				 Code Example For Random Number Generator Usage
			
 
				 ----------------------------------------------
			
@@ -195,8 +213,8 @@ Code Example For Random Number Generator Usage
 
				 
			
 
				     static int get_random_numbers(u8 *buf, unsigned int len)
			
 
				     {
			
 
				-        struct crypto_rngrng = NULL;
			
 
				-        chardrbg = "drbg_nopr_sha256"; /* Hash DRBG with SHA-256, no PR */
			
 
				+        struct crypto_rng *rng = NULL;
			
 
				+        char *drbg = "drbg_nopr_sha256"; /* Hash DRBG with SHA-256, no PR */
			
 
				         int ret;
			
 
				 
			
 
				         if (!buf || !len) {
			
@@ -207,7 +225,7 @@ Code Example For Random Number Generator Usage
 
				         rng = crypto_alloc_rng(drbg, 0, 0);
			
 
				         if (IS_ERR(rng)) {
			
 
				             pr_debug("could not allocate RNG handle for %s\n", drbg);
			
 
				-            return -PTR_ERR(rng);
			
 
				+            return PTR_ERR(rng);
			
 
				         }
			
 
				 
			
 
				         ret = crypto_rng_get_bytes(rng, buf, len);
			
--- a/Documentation/crypto/asymmetric-keys.txt
+++ b/Documentation/crypto/asymmetric-keys.txt
@@ -10,6 +10,7 @@ Contents:
 
				     - Signature verification.
			
 
				   - Asymmetric key subtypes.
			
 
				   - Instantiation data parsers.
			
 
				+  - Keyring link restrictions.
			
 
				 
			
 
				 
			
 
				 ========
			
@@ -265,7 +266,7 @@ mandatory:
 
				 
			
 
				      The caller passes a pointer to the following struct with all of the fields
			
 
				      cleared, except for data, datalen and quotalen [see
			
 
				-     Documentation/security/keys.txt].
			
 
				+     Documentation/security/keys/core.rst].
			
 
				 
			
 
				 	struct key_preparsed_payload {
			
 
				 		char		*description;
			
@@ -318,7 +319,8 @@ KEYRING LINK RESTRICTIONS
 
				 =========================
			
 
				 
			
 
				 Keyrings created from userspace using add_key can be configured to check the
			
 
				-signature of the key being linked.
			
 
				+signature of the key being linked.  Keys without a valid signature are not
			
 
				+allowed to link.
			
 
				 
			
 
				 Several restriction methods are available:
			
 
				 
			
@@ -327,9 +329,10 @@ Several restriction methods are available:
 
				      - Option string used with KEYCTL_RESTRICT_KEYRING:
			
 
				        - "builtin_trusted"
			
 
				 
			
 
				-     The kernel builtin trusted keyring will be searched for the signing
			
 
				-     key. The ca_keys kernel parameter also affects which keys are used for
			
 
				-     signature verification.
			
 
				+     The kernel builtin trusted keyring will be searched for the signing key.
			
 
				+     If the builtin trusted keyring is not configured, all links will be
			
 
				+     rejected.  The ca_keys kernel parameter also affects which keys are used
			
 
				+     for signature verification.
			
 
				 
			
 
				  (2) Restrict using the kernel builtin and secondary trusted keyrings
			
 
				 
			
@@ -337,8 +340,10 @@ Several restriction methods are available:
 
				        - "builtin_and_secondary_trusted"
			
 
				 
			
 
				      The kernel builtin and secondary trusted keyrings will be searched for the
			
 
				-     signing key. The ca_keys kernel parameter also affects which keys are used
			
 
				-     for signature verification.
			
 
				+     signing key.  If the secondary trusted keyring is not configured, this
			
 
				+     restriction will behave like the "builtin_trusted" option.  The ca_keys
			
 
				+     kernel parameter also affects which keys are used for signature
			
 
				+     verification.
			
 
				 
			
 
				  (3) Restrict using a separate key or keyring
			
 
				 
			
@@ -346,7 +351,7 @@ Several restriction methods are available:
 
				        - "key_or_keyring:<key or keyring serial number>[:chain]"
			
 
				 
			
 
				      Whenever a key link is requested, the link will only succeed if the key
			
 
				-     being linked is signed by one of the designated keys. This key may be
			
 
				+     being linked is signed by one of the designated keys.  This key may be
			
 
				      specified directly by providing a serial number for one asymmetric key, or
			
 
				      a group of keys may be searched for the signing key by providing the
			
 
				      serial number for a keyring.
			
@@ -354,7 +359,51 @@ Several restriction methods are available:
 
				      When the "chain" option is provided at the end of the string, the keys
			
 
				      within the destination keyring will also be searched for signing keys.
			
 
				      This allows for verification of certificate chains by adding each
			
 
				-     cert in order (starting closest to the root) to one keyring.
			
 
				+     certificate in order (starting closest to the root) to a keyring.  For
			
 
				+     instance, one keyring can be populated with links to a set of root
			
 
				+     certificates, with a separate, restricted keyring set up for each
			
 
				+     certificate chain to be validated:
			
 
				+
			
 
				+	# Create and populate a keyring for root certificates
			
 
				+	root_id=`keyctl add keyring root-certs "" @s`
			
 
				+	keyctl padd asymmetric "" $root_id < root1.cert
			
 
				+	keyctl padd asymmetric "" $root_id < root2.cert
			
 
				+
			
 
				+	# Create and restrict a keyring for the certificate chain
			
 
				+	chain_id=`keyctl add keyring chain "" @s`
			
 
				+	keyctl restrict_keyring $chain_id asymmetric key_or_keyring:$root_id:chain
			
 
				+
			
 
				+	# Attempt to add each certificate in the chain, starting with the
			
 
				+	# certificate closest to the root.
			
 
				+	keyctl padd asymmetric "" $chain_id < intermediateA.cert
			
 
				+	keyctl padd asymmetric "" $chain_id < intermediateB.cert
			
 
				+	keyctl padd asymmetric "" $chain_id < end-entity.cert
			
 
				+
			
 
				+     If the final end-entity certificate is successfully added to the "chain"
			
 
				+     keyring, we can be certain that it has a valid signing chain going back to
			
 
				+     one of the root certificates.
			
 
				+
			
 
				+     A single keyring can be used to verify a chain of signatures by
			
 
				+     restricting the keyring after linking the root certificate:
			
 
				+
			
 
				+	# Create a keyring for the certificate chain and add the root
			
 
				+	chain2_id=`keyctl add keyring chain2 "" @s`
			
 
				+	keyctl padd asymmetric "" $chain2_id < root1.cert
			
 
				+
			
 
				+	# Restrict the keyring that already has root1.cert linked.  The cert
			
 
				+	# will remain linked by the keyring.
			
 
				+	keyctl restrict_keyring $chain2_id asymmetric key_or_keyring:0:chain
			
 
				+
			
 
				+	# Attempt to add each certificate in the chain, starting with the
			
 
				+	# certificate closest to the root.
			
 
				+	keyctl padd asymmetric "" $chain2_id < intermediateA.cert
			
 
				+	keyctl padd asymmetric "" $chain2_id < intermediateB.cert
			
 
				+	keyctl padd asymmetric "" $chain2_id < end-entity.cert
			
 
				+
			
 
				+     If the final end-entity certificate is successfully added to the "chain2"
			
 
				+     keyring, we can be certain that there is a valid signing chain going back
			
 
				+     to the root certificate that was added before the keyring was restricted.
			
 
				+
			
 
				 
			
 
				 In all of these cases, if the signing key is found the signature of the key to
			
 
				 be linked will be verified using the signing key.  The requested key is added
			
--- a/Documentation/crypto/conf.py
+++ b/Documentation/crypto/conf.py
@@ -0,0 +1,10 @@
 
				+# -*- coding: utf-8; mode: python -*-
			
 
				+
			
 
				+project = 'Linux Kernel Crypto API'
			
 
				+
			
 
				+tags.add("subproject")
			
 
				+
			
 
				+latex_documents = [
			
 
				+    ('index', 'crypto-api.tex', 'Linux Kernel Crypto API manual',
			
 
				+     'The kernel development community', 'manual'),
			
 
				+]