7 år sedan · 7731b8bc94
--- a/.mailmap
+++ b/.mailmap
@@ -186,6 +186,9 @@ Uwe Kleine-König <ukleinek@informatik.uni-freiburg.de>
 
				 Uwe Kleine-König <ukl@pengutronix.de>
			
 
				 Uwe Kleine-König <Uwe.Kleine-Koenig@digi.com>
			
 
				 Valdis Kletnieks <Valdis.Kletnieks@vt.edu>
			
 
				+Vinod Koul <vkoul@kernel.org> <vinod.koul@intel.com>
			
 
				+Vinod Koul <vkoul@kernel.org> <vinod.koul@linux.intel.com>
			
 
				+Vinod Koul <vkoul@kernel.org> <vkoul@infradead.org>
			
 
				 Viresh Kumar <vireshk@kernel.org> <viresh.kumar@st.com>
			
 
				 Viresh Kumar <vireshk@kernel.org> <viresh.linux@gmail.com>
			
 
				 Viresh Kumar <vireshk@kernel.org> <viresh.kumar2@arm.com>
			
--- a/Documentation/00-INDEX
+++ b/Documentation/00-INDEX
@@ -64,8 +64,6 @@ auxdisplay/
 
				 	- misc. LCD driver documentation (cfag12864b, ks0108).
			
 
				 backlight/
			
 
				 	- directory with info on controlling backlights in flat panel displays
			
 
				-bcache.txt
			
 
				-	- Block-layer cache on fast SSDs to improve slow (raid) I/O performance.
			
 
				 block/
			
 
				 	- info on the Block I/O (BIO) layer.
			
 
				 blockdev/
			
@@ -78,18 +76,10 @@ bus-devices/
 
				 	- directory with info on TI GPMC (General Purpose Memory Controller)
			
 
				 bus-virt-phys-mapping.txt
			
 
				 	- how to access I/O mapped memory from within device drivers.
			
 
				-cachetlb.txt
			
 
				-	- describes the cache/TLB flushing interfaces Linux uses.
			
 
				 cdrom/
			
 
				 	- directory with information on the CD-ROM drivers that Linux has.
			
 
				 cgroup-v1/
			
 
				 	- cgroups v1 features, including cpusets and memory controller.
			
 
				-cgroup-v2.txt
			
 
				-	- cgroups v2 features, including cpusets and memory controller.
			
 
				-circular-buffers.txt
			
 
				-	- how to make use of the existing circular buffer infrastructure
			
 
				-clk.txt
			
 
				-	- info on the common clock framework
			
 
				 cma/
			
 
				 	- Continuous Memory Area (CMA) debugfs interface.
			
 
				 conf.py
			
--- a/Documentation/ABI/obsolete/sysfs-gpio
+++ b/Documentation/ABI/obsolete/sysfs-gpio
@@ -11,7 +11,7 @@ Description:
 
				   Kernel code may export it for complete or partial access.
			
 
				 
			
 
				   GPIOs are identified as they are inside the kernel, using integers in
			
 
				-  the range 0..INT_MAX.  See Documentation/gpio/gpio.txt for more information.
			
 
				+  the range 0..INT_MAX.  See Documentation/gpio for more information.
			
 
				 
			
 
				     /sys/class/gpio
			
 
				 	/export ... asks the kernel to export a GPIO to userspace
			
--- a/Documentation/ABI/removed/sysfs-bus-nfit
+++ b/Documentation/ABI/removed/sysfs-bus-nfit
@@ -0,0 +1,17 @@
 
				+What:		/sys/bus/nd/devices/regionX/nfit/ecc_unit_size
			
 
				+Date:		Aug, 2017
			
 
				+KernelVersion:	v4.14 (Removed v4.18)
			
 
				+Contact:	linux-nvdimm@lists.01.org
			
 
				+Description:
			
 
				+		(RO) Size of a write request to a DIMM that will not incur a
			
 
				+		read-modify-write cycle at the memory controller.
			
 
				+
			
 
				+		When the nfit driver initializes it runs an ARS (Address Range
			
 
				+		Scrub) operation across every pmem range. Part of that process
			
 
				+		involves determining the ARS capabilities of a given address
			
 
				+		range. One of the capabilities that is reported is the 'Clear
			
 
				+		Uncorrectable Error Range Length Unit Size' (see: ACPI 6.2
			
 
				+		section 9.20.7.4 Function Index 1 - Query ARS Capabilities).
			
 
				+		This property indicates the boundary at which the NVDIMM may
			
 
				+		need to perform read-modify-write cycles to maintain ECC (Error
			
 
				+		Correcting Code) blocks.
			
--- a/Documentation/ABI/stable/sysfs-bus-vmbus
+++ b/Documentation/ABI/stable/sysfs-bus-vmbus
@@ -1,25 +1,25 @@
 
				-What:		/sys/bus/vmbus/devices/vmbus_*/id
			
 
				+What:		/sys/bus/vmbus/devices/<UUID>/id
			
 
				 Date:		Jul 2009
			
 
				 KernelVersion:	2.6.31
			
 
				 Contact:	K. Y. Srinivasan <kys@microsoft.com>
			
 
				 Description:	The VMBus child_relid of the device's primary channel
			
 
				 Users:		tools/hv/lsvmbus
			
 
				 
			
 
				-What:		/sys/bus/vmbus/devices/vmbus_*/class_id
			
 
				+What:		/sys/bus/vmbus/devices/<UUID>/class_id
			
 
				 Date:		Jul 2009
			
 
				 KernelVersion:	2.6.31
			
 
				 Contact:	K. Y. Srinivasan <kys@microsoft.com>
			
 
				 Description:	The VMBus interface type GUID of the device
			
 
				 Users:		tools/hv/lsvmbus
			
 
				 
			
 
				-What:		/sys/bus/vmbus/devices/vmbus_*/device_id
			
 
				+What:		/sys/bus/vmbus/devices/<UUID>/device_id
			
 
				 Date:		Jul 2009
			
 
				 KernelVersion:	2.6.31
			
 
				 Contact:	K. Y. Srinivasan <kys@microsoft.com>
			
 
				 Description:	The VMBus interface instance GUID of the device
			
 
				 Users:		tools/hv/lsvmbus
			
 
				 
			
 
				-What:		/sys/bus/vmbus/devices/vmbus_*/channel_vp_mapping
			
 
				+What:		/sys/bus/vmbus/devices/<UUID>/channel_vp_mapping
			
 
				 Date:		Jul 2015
			
 
				 KernelVersion:	4.2.0
			
 
				 Contact:	K. Y. Srinivasan <kys@microsoft.com>
			
@@ -28,112 +28,112 @@ Description:	The mapping of which primary/sub channels are bound to which
 
				 		Format: <channel's child_relid:the bound cpu's number>
			
 
				 Users:		tools/hv/lsvmbus
			
 
				 
			
 
				-What:		/sys/bus/vmbus/devices/vmbus_*/device
			
 
				+What:		/sys/bus/vmbus/devices/<UUID>/device
			
 
				 Date:		Dec. 2015
			
 
				 KernelVersion:	4.5
			
 
				 Contact:	K. Y. Srinivasan <kys@microsoft.com>
			
 
				 Description:	The 16 bit device ID of the device
			
 
				 Users:		tools/hv/lsvmbus and user level RDMA libraries
			
 
				 
			
 
				-What:		/sys/bus/vmbus/devices/vmbus_*/vendor
			
 
				+What:		/sys/bus/vmbus/devices/<UUID>/vendor
			
 
				 Date:		Dec. 2015
			
 
				 KernelVersion:	4.5
			
 
				 Contact:	K. Y. Srinivasan <kys@microsoft.com>
			
 
				 Description:	The 16 bit vendor ID of the device
			
 
				 Users:		tools/hv/lsvmbus and user level RDMA libraries
			
 
				 
			
 
				-What:		/sys/bus/vmbus/devices/vmbus_*/channels/NN
			
 
				+What:		/sys/bus/vmbus/devices/<UUID>/channels/<N>
			
 
				 Date:		September. 2017
			
 
				 KernelVersion:	4.14
			
 
				 Contact:	Stephen Hemminger <sthemmin@microsoft.com>
			
 
				 Description:	Directory for per-channel information
			
 
				 		NN is the VMBUS relid associated with the channel.
			
 
				 
			
 
				-What:		/sys/bus/vmbus/devices/vmbus_*/channels/NN/cpu
			
 
				+What:		/sys/bus/vmbus/devices/<UUID>/channels/<N>/cpu
			
 
				 Date:		September. 2017
			
 
				 KernelVersion:	4.14
			
 
				 Contact:	Stephen Hemminger <sthemmin@microsoft.com>
			
 
				 Description:	VCPU (sub)channel is affinitized to
			
 
				 Users:		tools/hv/lsvmbus and other debugging tools
			
 
				 
			
 
				-What:		/sys/bus/vmbus/devices/vmbus_*/channels/NN/cpu
			
 
				+What:		/sys/bus/vmbus/devices/<UUID>/channels/<N>/cpu
			
 
				 Date:		September. 2017
			
 
				 KernelVersion:	4.14
			
 
				 Contact:	Stephen Hemminger <sthemmin@microsoft.com>
			
 
				 Description:	VCPU (sub)channel is affinitized to
			
 
				 Users:		tools/hv/lsvmbus and other debugging tools
			
 
				 
			
 
				-What:		/sys/bus/vmbus/devices/vmbus_*/channels/NN/in_mask
			
 
				+What:		/sys/bus/vmbus/devices/<UUID>/channels/<N>/in_mask
			
 
				 Date:		September. 2017
			
 
				 KernelVersion:	4.14
			
 
				 Contact:	Stephen Hemminger <sthemmin@microsoft.com>
			
 
				 Description:	Host to guest channel interrupt mask
			
 
				 Users:		Debugging tools
			
 
				 
			
 
				-What:		/sys/bus/vmbus/devices/vmbus_*/channels/NN/latency
			
 
				+What:		/sys/bus/vmbus/devices/<UUID>/channels/<N>/latency
			
 
				 Date:		September. 2017
			
 
				 KernelVersion:	4.14
			
 
				 Contact:	Stephen Hemminger <sthemmin@microsoft.com>
			
 
				 Description:	Channel signaling latency
			
 
				 Users:		Debugging tools
			
 
				 
			
 
				-What:		/sys/bus/vmbus/devices/vmbus_*/channels/NN/out_mask
			
 
				+What:		/sys/bus/vmbus/devices/<UUID>/channels/<N>/out_mask
			
 
				 Date:		September. 2017
			
 
				 KernelVersion:	4.14
			
 
				 Contact:	Stephen Hemminger <sthemmin@microsoft.com>
			
 
				 Description:	Guest to host channel interrupt mask
			
 
				 Users:		Debugging tools
			
 
				 
			
 
				-What:		/sys/bus/vmbus/devices/vmbus_*/channels/NN/pending
			
 
				+What:		/sys/bus/vmbus/devices/<UUID>/channels/<N>/pending
			
 
				 Date:		September. 2017
			
 
				 KernelVersion:	4.14
			
 
				 Contact:	Stephen Hemminger <sthemmin@microsoft.com>
			
 
				 Description:	Channel interrupt pending state
			
 
				 Users:		Debugging tools
			
 
				 
			
 
				-What:		/sys/bus/vmbus/devices/vmbus_*/channels/NN/read_avail
			
 
				+What:		/sys/bus/vmbus/devices/<UUID>/channels/<N>/read_avail
			
 
				 Date:		September. 2017
			
 
				 KernelVersion:	4.14
			
 
				 Contact:	Stephen Hemminger <sthemmin@microsoft.com>
			
 
				 Description:	Bytes available to read
			
 
				 Users:		Debugging tools
			
 
				 
			
 
				-What:		/sys/bus/vmbus/devices/vmbus_*/channels/NN/write_avail
			
 
				+What:		/sys/bus/vmbus/devices/<UUID>/channels/<N>/write_avail
			
 
				 Date:		September. 2017
			
 
				 KernelVersion:	4.14
			
 
				 Contact:	Stephen Hemminger <sthemmin@microsoft.com>
			
 
				 Description:	Bytes available to write
			
 
				 Users:		Debugging tools
			
 
				 
			
 
				-What:		/sys/bus/vmbus/devices/vmbus_*/channels/NN/events
			
 
				+What:		/sys/bus/vmbus/devices/<UUID>/channels/<N>/events
			
 
				 Date:		September. 2017
			
 
				 KernelVersion:	4.14
			
 
				 Contact:	Stephen Hemminger <sthemmin@microsoft.com>
			
 
				 Description:	Number of times we have signaled the host
			
 
				 Users:		Debugging tools
			
 
				 
			
 
				-What:		/sys/bus/vmbus/devices/vmbus_*/channels/NN/interrupts
			
 
				+What:		/sys/bus/vmbus/devices/<UUID>/channels/<N>/interrupts
			
 
				 Date:		September. 2017
			
 
				 KernelVersion:	4.14
			
 
				 Contact:	Stephen Hemminger <sthemmin@microsoft.com>
			
 
				 Description:	Number of times we have taken an interrupt (incoming)
			
 
				 Users:		Debugging tools
			
 
				 
			
 
				-What:		/sys/bus/vmbus/devices/vmbus_*/channels/NN/subchannel_id
			
 
				+What:		/sys/bus/vmbus/devices/<UUID>/channels/<N>/subchannel_id
			
 
				 Date:		January. 2018
			
 
				 KernelVersion:	4.16
			
 
				 Contact:	Stephen Hemminger <sthemmin@microsoft.com>
			
 
				 Description:	Subchannel ID associated with VMBUS channel
			
 
				 Users:		Debugging tools and userspace drivers
			
 
				 
			
 
				-What:		/sys/bus/vmbus/devices/vmbus_*/channels/NN/monitor_id
			
 
				+What:		/sys/bus/vmbus/devices/<UUID>/channels/<N>/monitor_id
			
 
				 Date:		January. 2018
			
 
				 KernelVersion:	4.16
			
 
				 Contact:	Stephen Hemminger <sthemmin@microsoft.com>
			
 
				 Description:	Monitor bit associated with channel
			
 
				 Users:		Debugging tools and userspace drivers
			
 
				 
			
 
				-What:		/sys/bus/vmbus/devices/vmbus_*/channels/NN/ring
			
 
				+What:		/sys/bus/vmbus/devices/<UUID>/channels/<N>/ring
			
 
				 Date:		January. 2018
			
 
				 KernelVersion:	4.16
			
 
				 Contact:	Stephen Hemminger <sthemmin@microsoft.com>
			
--- a/Documentation/ABI/stable/sysfs-devices-node
+++ b/Documentation/ABI/stable/sysfs-devices-node
@@ -90,4 +90,4 @@ Date:		December 2009
 
				 Contact:	Lee Schermerhorn <lee.schermerhorn@hp.com>
			
 
				 Description:
			
 
				 		The node's huge page size control/query attributes.
			
 
				-		See Documentation/vm/hugetlbpage.txt
			
 
				+		See Documentation/admin-guide/mm/hugetlbpage.rst
			
--- a/Documentation/ABI/testing/evm
+++ b/Documentation/ABI/testing/evm
@@ -57,3 +57,16 @@ Description:
 
				 		dracut (via 97masterkey and 98integrity) and systemd (via
			
 
				 		core/ima-setup) have support for loading keys at boot
			
 
				 		time.
			
 
				+
			
 
				+What:		security/integrity/evm/evm_xattrs
			
 
				+Date:		April 2018
			
 
				+Contact:	Matthew Garrett <mjg59@google.com>
			
 
				+Description:
			
 
				+		Shows the set of extended attributes used to calculate or
			
 
				+		validate the EVM signature, and allows additional attributes
			
 
				+		to be added at runtime. Any signatures generated after
			
 
				+		additional attributes are added (and on files possessing those
			
 
				+		additional attributes) will only be valid if the same
			
 
				+		additional attributes are configured on system boot. Writing
			
 
				+		a single period (.) will lock the xattr list from any further
			
 
				+		modification.
			
--- a/Documentation/ABI/testing/ima_policy
+++ b/Documentation/ABI/testing/ima_policy
@@ -21,7 +21,7 @@ Description:
 
				 			audit | hash | dont_hash
			
 
				 		condition:= base | lsm  [option]
			
 
				 			base:	[[func=] [mask=] [fsmagic=] [fsuuid=] [uid=]
			
 
				-				[euid=] [fowner=]]
			
 
				+				[euid=] [fowner=] [fsname=]]
			
 
				 			lsm:	[[subj_user=] [subj_role=] [subj_type=]
			
 
				 				 [obj_user=] [obj_role=] [obj_type=]]
			
 
				 			option:	[[appraise_type=]] [permit_directio]
			
--- a/Documentation/ABI/testing/sysfs-bus-iio
+++ b/Documentation/ABI/testing/sysfs-bus-iio
@@ -190,6 +190,13 @@ Description:
 
				 		but should match other such assignments on device).
			
 
				 		Units after application of scale and offset are m/s^2.
			
 
				 
			
 
				+What:		/sys/bus/iio/devices/iio:deviceX/in_angl_raw
			
 
				+KernelVersion:	4.17
			
 
				+Contact:	linux-iio@vger.kernel.org
			
 
				+Description:
			
 
				+		Angle of rotation. Units after application of scale and offset
			
 
				+		are radians.
			
 
				+
			
 
				 What:		/sys/bus/iio/devices/iio:deviceX/in_anglvel_x_raw
			
 
				 What:		/sys/bus/iio/devices/iio:deviceX/in_anglvel_y_raw
			
 
				 What:		/sys/bus/iio/devices/iio:deviceX/in_anglvel_z_raw
			
@@ -297,6 +304,7 @@ What:		/sys/bus/iio/devices/iio:deviceX/in_pressure_offset
 
				 What:		/sys/bus/iio/devices/iio:deviceX/in_humidityrelative_offset
			
 
				 What:		/sys/bus/iio/devices/iio:deviceX/in_magn_offset
			
 
				 What:		/sys/bus/iio/devices/iio:deviceX/in_rot_offset
			
 
				+What:		/sys/bus/iio/devices/iio:deviceX/in_angl_offset
			
 
				 KernelVersion:	2.6.35
			
 
				 Contact:	linux-iio@vger.kernel.org
			
 
				 Description:
			
@@ -350,6 +358,7 @@ What:		/sys/bus/iio/devices/iio:deviceX/in_humidityrelative_scale
 
				 What:		/sys/bus/iio/devices/iio:deviceX/in_velocity_sqrt(x^2+y^2+z^2)_scale
			
 
				 What:		/sys/bus/iio/devices/iio:deviceX/in_illuminance_scale
			
 
				 What:		/sys/bus/iio/devices/iio:deviceX/in_countY_scale
			
 
				+What:		/sys/bus/iio/devices/iio:deviceX/in_angl_scale
			
 
				 KernelVersion:	2.6.35
			
 
				 Contact:	linux-iio@vger.kernel.org
			
 
				 Description:
			
--- a/Documentation/ABI/testing/sysfs-bus-nfit
+++ b/Documentation/ABI/testing/sysfs-bus-nfit
@@ -212,22 +212,3 @@ Description:
 
				 		range. Used by NVDIMM Region Mapping Structure to uniquely refer
			
 
				 		to this structure. Value of 0 is reserved and not used as an
			
 
				 		index.
			
 
				-
			
 
				-
			
 
				-What:		/sys/bus/nd/devices/regionX/nfit/ecc_unit_size
			
 
				-Date:		Aug, 2017
			
 
				-KernelVersion:	v4.14
			
 
				-Contact:	linux-nvdimm@lists.01.org
			
 
				-Description:
			
 
				-		(RO) Size of a write request to a DIMM that will not incur a
			
 
				-		read-modify-write cycle at the memory controller.
			
 
				-
			
 
				-		When the nfit driver initializes it runs an ARS (Address Range
			
 
				-		Scrub) operation across every pmem range. Part of that process
			
 
				-		involves determining the ARS capabilities of a given address
			
 
				-		range. One of the capabilities that is reported is the 'Clear
			
 
				-		Uncorrectable Error Range Length Unit Size' (see: ACPI 6.2
			
 
				-		section 9.20.7.4 Function Index 1 - Query ARS Capabilities).
			
 
				-		This property indicates the boundary at which the NVDIMM may
			
 
				-		need to perform read-modify-write cycles to maintain ECC (Error
			
 
				-		Correcting Code) blocks.
			
--- a/Documentation/ABI/testing/sysfs-bus-rpmsg
+++ b/Documentation/ABI/testing/sysfs-bus-rpmsg
@@ -73,3 +73,23 @@ Description:
 
				 		This sysfs entry tells us whether the channel is a local
			
 
				 		server channel that is announced (values are either
			
 
				 		true or false).
			
 
				+
			
 
				+What:		/sys/bus/rpmsg/devices/.../driver_override
			
 
				+Date:		April 2018
			
 
				+KernelVersion:	4.18
			
 
				+Contact:	Bjorn Andersson <bjorn.andersson@linaro.org>
			
 
				+Description:
			
 
				+		Every rpmsg device is a communication channel with a remote
			
 
				+		processor. Channels are identified by a textual name (see
			
 
				+		/sys/bus/rpmsg/devices/.../name above) and have a local
			
 
				+		("source") rpmsg address, and remote ("destination") rpmsg
			
 
				+		address.
			
 
				+
			
 
				+		The listening entity (or client) which communicates with a
			
 
				+		remote processor is referred to as rpmsg driver. The rpmsg device
			
 
				+		and rpmsg driver are matched based on rpmsg device name and
			
 
				+		rpmsg driver ID table.
			
 
				+
			
 
				+		This sysfs entry allows the rpmsg driver for a rpmsg device
			
 
				+		to be specified which will override standard OF, ID table
			
 
				+		and name matching.
			
--- a/Documentation/ABI/testing/sysfs-bus-usb
+++ b/Documentation/ABI/testing/sysfs-bus-usb
@@ -189,6 +189,28 @@ Description:
 
				 		The file will read "hotplug", "wired" and "not used" if the
			
 
				 		information is available, and "unknown" otherwise.
			
 
				 
			
 
				+What:		/sys/bus/usb/devices/.../(hub interface)/portX/quirks
			
 
				+Date:		May 2018
			
 
				+Contact:	Nicolas Boichat <drinkcat@chromium.org>
			
 
				+Description:
			
 
				+		In some cases, we care about time-to-active for devices
			
 
				+		connected on a specific port (e.g. non-standard USB port like
			
 
				+		pogo pins), where the device to be connected is known in
			
 
				+		advance, and behaves well according to the specification.
			
 
				+		This attribute is a bit-field that controls the behavior of
			
 
				+		a specific port:
			
 
				+		 - Bit 0 of this field selects the "old" enumeration scheme,
			
 
				+		   as it is considerably faster (it only causes one USB reset
			
 
				+		   instead of 2).
			
 
				+		   The old enumeration scheme can also be selected globally
			
 
				+		   using /sys/module/usbcore/parameters/old_scheme_first, but
			
 
				+		   it is often not desirable as the new scheme was introduced to
			
 
				+		   increase compatibility with more devices.
			
 
				+		 - Bit 1 reduces TRSTRCY to the 10 ms that are required by the
			
 
				+		   USB 2.0 specification, instead of the 50 ms that are normally
			
 
				+		   used to help make enumeration work better on some high speed
			
 
				+		   devices.
			
 
				+
			
 
				 What:		/sys/bus/usb/devices/.../(hub interface)/portX/over_current_count
			
 
				 Date:		February 2018
			
 
				 Contact:	Richard Leitner <richard.leitner@skidata.com>
			
@@ -236,3 +258,21 @@ Description:
 
				 		Supported values are 0 - 15.
			
 
				 		More information on how besl values map to microseconds can be found in
			
 
				 		USB 2.0 ECN Errata for Link Power Management, section 4.10)
			
 
				+
			
 
				+What:		/sys/bus/usb/devices/.../rx_lanes
			
 
				+Date:		March 2018
			
 
				+Contact:	Mathias Nyman <mathias.nyman@linux.intel.com>
			
 
				+Description:
			
 
				+		Number of rx lanes the device is using.
			
 
				+		USB 3.2 adds Dual-lane support, 2 rx and 2 tx lanes over Type-C.
			
 
				+		Inter-Chip SSIC devices support asymmetric lanes up to 4 lanes per
			
 
				+		direction. Devices before USB 3.2 are single lane (rx_lanes = 1)
			
 
				+
			
 
				+What:		/sys/bus/usb/devices/.../tx_lanes
			
 
				+Date:		March 2018
			
 
				+Contact:	Mathias Nyman <mathias.nyman@linux.intel.com>
			
 
				+Description:
			
 
				+		Number of tx lanes the device is using.
			
 
				+		USB 3.2 adds Dual-lane support, 2 rx and 2 tx lanes over Type-C.
			
 
				+		Inter-Chip SSIC devices support asymmetric lanes up to 4 lanes per
			
 
				+		direction. Devices before USB 3.2 are single lane (tx_lanes = 1)
			
--- a/Documentation/ABI/testing/sysfs-class-cxl
+++ b/Documentation/ABI/testing/sysfs-class-cxl
@@ -69,7 +69,9 @@ Date:           September 2014
 
				 Contact:        linuxppc-dev@lists.ozlabs.org
			
 
				 Description:    read/write
			
 
				                 Set the mode for prefaulting in segments into the segment table
			
 
				-                when performing the START_WORK ioctl. Possible values:
			
 
				+                when performing the START_WORK ioctl. Only applicable when
			
 
				+                running under hashed page table mmu.
			
 
				+                Possible values:
			
 
				                         none: No prefaulting (default)
			
 
				                         work_element_descriptor: Treat the work element
			
 
				                                  descriptor as an effective address and
			
--- a/Documentation/ABI/testing/sysfs-class-mtd
+++ b/Documentation/ABI/testing/sysfs-class-mtd
@@ -232,3 +232,11 @@ Description:
 
				 		of the parent (another partition or a flash device) in bytes.
			
 
				 		This attribute is absent on flash devices, so it can be used
			
 
				 		to distinguish them from partitions.
			
 
				+
			
 
				+What:		/sys/class/mtd/mtdX/oobavail
			
 
				+Date:		April 2018
			
 
				+KernelVersion:	4.16
			
 
				+Contact:	linux-mtd@lists.infradead.org
			
 
				+Description:
			
 
				+		Number of bytes available for a client to place data into
			
 
				+		the out of band area.
			
--- a/Documentation/ABI/testing/sysfs-class-power
+++ b/Documentation/ABI/testing/sysfs-class-power
@@ -1,3 +1,458 @@
 
				+===== General Properties =====
			
 
				+
			
 
				+What:		/sys/class/power_supply/<supply_name>/manufacturer
			
 
				+Date:		May 2007
			
 
				+Contact:	linux-pm@vger.kernel.org
			
 
				+Description:
			
 
				+		Reports the name of the device manufacturer.
			
 
				+
			
 
				+		Access: Read
			
 
				+		Valid values: Represented as string
			
 
				+
			
 
				+What:		/sys/class/power_supply/<supply_name>/model_name
			
 
				+Date:		May 2007
			
 
				+Contact:	linux-pm@vger.kernel.org
			
 
				+Description:
			
 
				+		Reports the name of the device model.
			
 
				+
			
 
				+		Access: Read
			
 
				+		Valid values: Represented as string
			
 
				+
			
 
				+What:		/sys/class/power_supply/<supply_name>/serial_number
			
 
				+Date:		January 2008
			
 
				+Contact:	linux-pm@vger.kernel.org
			
 
				+Description:
			
 
				+		Reports the serial number of the device.
			
 
				+
			
 
				+		Access: Read
			
 
				+		Valid values: Represented as string
			
 
				+
			
 
				+What:		/sys/class/power_supply/<supply_name>/type
			
 
				+Date:		May 2010
			
 
				+Contact:	linux-pm@vger.kernel.org
			
 
				+Description:
			
 
				+		Describes the main type of the supply.
			
 
				+
			
 
				+		Access: Read
			
 
				+		Valid values: "Battery", "UPS", "Mains", "USB"
			
 
				+
			
 
				+===== Battery Properties =====
			
 
				+
			
 
				+What:		/sys/class/power_supply/<supply_name>/capacity
			
 
				+Date:		May 2007
			
 
				+Contact:	linux-pm@vger.kernel.org
			
 
				+Description:
			
 
				+		Fine grain representation of battery capacity.
			
 
				+		Access: Read
			
 
				+		Valid values: 0 - 100 (percent)
			
 
				+
			
 
				+What:		/sys/class/power_supply/<supply_name>/capacity_alert_max
			
 
				+Date:		July 2012
			
 
				+Contact:	linux-pm@vger.kernel.org
			
 
				+Description:
			
 
				+		Maximum battery capacity trip-wire value where the supply will
			
 
				+		notify user-space of the event. This is normally used for the
			
 
				+		battery discharging scenario where user-space needs to know the
			
 
				+		battery has dropped to an upper level so it can take
			
 
				+		appropriate action (e.g. warning user that battery level is
			
 
				+		low).
			
 
				+
			
 
				+		Access: Read, Write
			
 
				+		Valid values: 0 - 100 (percent)
			
 
				+
			
 
				+What:		/sys/class/power_supply/<supply_name>/capacity_alert_min
			
 
				+Date:		July 2012
			
 
				+Contact:	linux-pm@vger.kernel.org
			
 
				+Description:
			
 
				+		Minimum battery capacity trip-wire value where the supply will
			
 
				+		notify user-space of the event. This is normally used for the
			
 
				+		battery discharging scenario where user-space needs to know the
			
 
				+		battery has dropped to a lower level so it can take
			
 
				+		appropriate action (e.g. warning user that battery level is
			
 
				+		critically low).
			
 
				+
			
 
				+		Access: Read, Write
			
 
				+		Valid values: 0 - 100 (percent)
			
 
				+
			
 
				+What:		/sys/class/power_supply/<supply_name>/capacity_level
			
 
				+Date:		June 2009
			
 
				+Contact:	linux-pm@vger.kernel.org
			
 
				+Description:
			
 
				+		Coarse representation of battery capacity.
			
 
				+
			
 
				+		Access: Read
			
 
				+		Valid values: "Unknown", "Critical", "Low", "Normal", "High",
			
 
				+			      "Full"
			
 
				+
			
 
				+What:		/sys/class/power_supply/<supply_name>/current_avg
			
 
				+Date:		May 2007
			
 
				+Contact:	linux-pm@vger.kernel.org
			
 
				+Description:
			
 
				+		Reports an average IBAT current reading for the battery, over a
			
 
				+		fixed period. Normally devices will provide a fixed interval in
			
 
				+		which they average readings to smooth out the reported value.
			
 
				+
			
 
				+		Access: Read
			
 
				+		Valid values: Represented in microamps
			
 
				+
			
 
				+What:		/sys/class/power_supply/<supply_name>/current_max
			
 
				+Date:		October 2010
			
 
				+Contact:	linux-pm@vger.kernel.org
			
 
				+Description:
			
 
				+		Reports the maximum IBAT current allowed into the battery.
			
 
				+
			
 
				+		Access: Read
			
 
				+		Valid values: Represented in microamps
			
 
				+
			
 
				+What:		/sys/class/power_supply/<supply_name>/current_now
			
 
				+Date:		May 2007
			
 
				+Contact:	linux-pm@vger.kernel.org
			
 
				+Description:
			
 
				+		Reports an instant, single IBAT current reading for the battery.
			
 
				+		This value is not averaged/smoothed.
			
 
				+
			
 
				+		Access: Read
			
 
				+		Valid values: Represented in microamps
			
 
				+
			
 
				+What:		/sys/class/power_supply/<supply_name>/charge_type
			
 
				+Date:		July 2009
			
 
				+Contact:	linux-pm@vger.kernel.org
			
 
				+Description:
			
 
				+		Represents the type of charging currently being applied to the
			
 
				+		battery.
			
 
				+
			
 
				+		Access: Read
			
 
				+		Valid values: "Unknown", "N/A", "Trickle", "Fast"
			
 
				+
			
 
				+What:		/sys/class/power_supply/<supply_name>/charge_term_current
			
 
				+Date:		July 2014
			
 
				+Contact:	linux-pm@vger.kernel.org
			
 
				+Description:
			
 
				+		Reports the charging current value which is used to determine
			
 
				+		when the battery is considered full and charging should end.
			
 
				+
			
 
				+		Access: Read
			
 
				+		Valid values: Represented in microamps
			
 
				+
			
 
				+What:		/sys/class/power_supply/<supply_name>/health
			
 
				+Date:		May 2007
			
 
				+Contact:	linux-pm@vger.kernel.org
			
 
				+Description:
			
 
				+		Reports the health of the battery or battery side of charger
			
 
				+		functionality.
			
 
				+
			
 
				+		Access: Read
			
 
				+		Valid values: "Unknown", "Good", "Overheat", "Dead",
			
 
				+			      "Over voltage", "Unspecified failure", "Cold",
			
 
				+			      "Watchdog timer expire", "Safety timer expire"
			
 
				+
			
 
				+What:		/sys/class/power_supply/<supply_name>/precharge_current
			
 
				+Date:		June 2017
			
 
				+Contact:	linux-pm@vger.kernel.org
			
 
				+Description:
			
 
				+		Reports the charging current applied during pre-charging phase
			
 
				+		for a battery charge cycle.
			
 
				+
			
 
				+		Access: Read
			
 
				+		Valid values: Represented in microamps
			
 
				+
			
 
				+What:		/sys/class/power_supply/<supply_name>/present
			
 
				+Date:		May 2007
			
 
				+Contact:	linux-pm@vger.kernel.org
			
 
				+Description:
			
 
				+		Reports whether a battery is present or not in the system.
			
 
				+
			
 
				+		Access: Read
			
 
				+		Valid values:
			
 
				+			0: Absent
			
 
				+			1: Present
			
 
				+
			
 
				+What:		/sys/class/power_supply/<supply_name>/status
			
 
				+Date:		May 2007
			
 
				+Contact:	linux-pm@vger.kernel.org
			
 
				+Description:
			
 
				+		Represents the charging status of the battery. Normally this
			
 
				+		is read-only reporting although for some supplies this can be
			
 
				+		used to enable/disable charging to the battery.
			
 
				+
			
 
				+		Access: Read, Write
			
 
				+		Valid values: "Unknown", "Charging", "Discharging",
			
 
				+			      "Not charging", "Full"
			
 
				+
			
 
				+What:		/sys/class/power_supply/<supply_name>/technology
			
 
				+Date:		May 2007
			
 
				+Contact:	linux-pm@vger.kernel.org
			
 
				+Description:
			
 
				+		Describes the battery technology supported by the supply.
			
 
				+
			
 
				+		Access: Read
			
 
				+		Valid values: "Unknown", "NiMH", "Li-ion", "Li-poly", "LiFe",
			
 
				+			      "NiCd", "LiMn"
			
 
				+
			
 
				+What:		/sys/class/power_supply/<supply_name>/temp
			
 
				+Date:		May 2007
			
 
				+Contact:	linux-pm@vger.kernel.org
			
 
				+Description:
			
 
				+		Reports the current TBAT battery temperature reading.
			
 
				+
			
 
				+		Access: Read
			
 
				+		Valid values: Represented in 1/10 Degrees Celsius
			
 
				+
			
 
				+What:		/sys/class/power_supply/<supply_name>/temp_alert_max
			
 
				+Date:		July 2012
			
 
				+Contact:	linux-pm@vger.kernel.org
			
 
				+Description:
			
 
				+		Maximum TBAT temperature trip-wire value where the supply will
			
 
				+		notify user-space of the event. This is normally used for the
			
 
				+		battery charging scenario where user-space needs to know the
			
 
				+		battery temperature has crossed an upper threshold so it can
			
 
				+		take appropriate action (e.g. warning user that battery temperature is
			
 
				+		critically high, and charging has stopped).
			
 
				+
			
 
				+		Access: Read
			
 
				+		Valid values: Represented in 1/10 Degrees Celsius
			
 
				+
			
 
				+What:		/sys/class/power_supply/<supply_name>/temp_alert_min
			
 
				+Date:		July 2012
			
 
				+Contact:	linux-pm@vger.kernel.org
			
 
				+Description:
			
 
				+		Minimum TBAT temperature trip-wire value where the supply will
			
 
				+		notify user-space of the event. This is normally used for the
			
 
				+		battery charging scenario where user-space needs to know the
			
 
				+		battery temperature has crossed a lower threshold so it can take
			
 
				+		appropriate action (e.g. warning user that battery level is
			
 
				+		high, and charging current has been reduced accordingly to
			
 
				+		remedy the situation).
			
 
				+
			
 
				+		Access: Read
			
 
				+		Valid values: Represented in 1/10 Degrees Celsius
			
 
				+
			
 
				+What:		/sys/class/power_supply/<supply_name>/temp_max
			
 
				+Date:		July 2014
			
 
				+Contact:	linux-pm@vger.kernel.org
			
 
				+Description:
			
 
				+		Reports the maximum allowed TBAT battery temperature for
			
 
				+		charging.
			
 
				+
			
 
				+		Access: Read
			
 
				+		Valid values: Represented in 1/10 Degrees Celsius
			
 
				+
			
 
				+What:		/sys/class/power_supply/<supply_name>/temp_min
			
 
				+Date:		July 2014
			
 
				+Contact:	linux-pm@vger.kernel.org
			
 
				+Description:
			
 
				+		Reports the minimum allowed TBAT battery temperature for
			
 
				+		charging.
			
 
				+
			
 
				+		Access: Read
			
 
				+		Valid values: Represented in 1/10 Degrees Celsius
			
 
				+
			
 
				+What:		/sys/class/power_supply/<supply_name>/voltage_avg
			
 
				+Date:		May 2007
			
 
				+Contact:	linux-pm@vger.kernel.org
			
 
				+Description:
			
 
				+		Reports an average VBAT voltage reading for the battery, over a
			
 
				+		fixed period. Normally devices will provide a fixed interval in
			
 
				+		which they average readings to smooth out the reported value.
			
 
				+
			
 
				+		Access: Read
			
 
				+		Valid values: Represented in microvolts
			
 
				+
			
 
				+What:		/sys/class/power_supply/<supply_name>/voltage_max
			
 
				+Date:		January 2008
			
 
				+Contact:	linux-pm@vger.kernel.org
			
 
				+Description:
			
 
				+		Reports the maximum safe VBAT voltage permitted for the battery,
			
 
				+		during charging.
			
 
				+
			
 
				+		Access: Read
			
 
				+		Valid values: Represented in microvolts
			
 
				+
			
 
				+What:		/sys/class/power_supply/<supply_name>/voltage_min
			
 
				+Date:		January 2008
			
 
				+Contact:	linux-pm@vger.kernel.org
			
 
				+Description:
			
 
				+		Reports the minimum safe VBAT voltage permitted for the battery,
			
 
				+		during discharging.
			
 
				+
			
 
				+		Access: Read
			
 
				+		Valid values: Represented in microvolts
			
 
				+
			
 
				+What:		/sys/class/power_supply/<supply_name>/voltage_now
			
 
				+Date:		May 2007
			
 
				+Contact:	linux-pm@vger.kernel.org
			
 
				+Description:
			
 
				+		Reports an instant, single VBAT voltage reading for the battery.
			
 
				+		This value is not averaged/smoothed.
			
 
				+
			
 
				+		Access: Read
			
 
				+		Valid values: Represented in microvolts
			
 
				+
			
 
				+===== USB Properties =====
			
 
				+
			
 
				+What: 		/sys/class/power_supply/<supply_name>/current_avg
			
 
				+Date:		May 2007
			
 
				+Contact:	linux-pm@vger.kernel.org
			
 
				+Description:
			
 
				+		Reports an average IBUS current reading over a fixed period.
			
 
				+		Normally devices will provide a fixed interval in which they
			
 
				+		average readings to smooth out the reported value.
			
 
				+
			
 
				+		Access: Read
			
 
				+		Valid values: Represented in microamps
			
 
				+
			
 
				+
			
 
				+What: 		/sys/class/power_supply/<supply_name>/current_max
			
 
				+Date:		October 2010
			
 
				+Contact:	linux-pm@vger.kernel.org
			
 
				+Description:
			
 
				+		Reports the maximum IBUS current the supply can support.
			
 
				+
			
 
				+		Access: Read
			
 
				+		Valid values: Represented in microamps
			
 
				+
			
 
				+What: 		/sys/class/power_supply/<supply_name>/current_now
			
 
				+Date:		May 2007
			
 
				+Contact:	linux-pm@vger.kernel.org
			
 
				+Description:
			
 
				+		Reports the IBUS current supplied now. This value is generally
			
 
				+		read-only reporting, unless the 'online' state of the supply
			
 
				+		is set to be programmable, in which case this value can be set
			
 
				+		within the reported min/max range.
			
 
				+
			
 
				+		Access: Read, Write
			
 
				+		Valid values: Represented in microamps
			
 
				+
			
 
				+What:		/sys/class/power_supply/<supply_name>/input_current_limit
			
 
				+Date:		July 2014
			
 
				+Contact:	linux-pm@vger.kernel.org
			
 
				+Description:
			
 
				+		Details the incoming IBUS current limit currently set in the
			
 
				+		supply. Normally this is configured based on the type of
			
 
				+		connection made (e.g. A configured SDP should output a maximum
			
 
				+		of 500mA so the input current limit is set to the same value).
			
 
				+
			
 
				+		Access: Read, Write
			
 
				+		Valid values: Represented in microamps
			
 
				+
			
 
				+What:		/sys/class/power_supply/<supply_name>/online
			
 
				+Date:		May 2007
			
 
				+Contact:	linux-pm@vger.kernel.org
			
 
				+Description:
			
 
				+		Indicates if VBUS is present for the supply. When the supply is
			
 
				+		online, and the supply allows it, then it's possible to switch
			
 
				+		between online states (e.g. Fixed -> Programmable for a PD_PPS
			
 
				+		USB supply so voltage and current can be controlled).
			
 
				+
			
 
				+		Access: Read, Write
			
 
				+		Valid values:
			
 
				+			0: Offline
			
 
				+			1: Online Fixed - Fixed Voltage Supply
			
 
				+			2: Online Programmable - Programmable Voltage Supply
			
 
				+
			
 
				+What:		/sys/class/power_supply/<supply_name>/temp
			
 
				+Date:		May 2007
			
 
				+Contact:	linux-pm@vger.kernel.org
			
 
				+Description:
			
 
				+		Reports the current supply temperature reading. This would
			
 
				+		normally be the internal temperature of the device itself (e.g.
			
 
				+		TJUNC temperature of an IC).
			
 
				+
			
 
				+		Access: Read
			
 
				+		Valid values: Represented in 1/10 Degrees Celsius
			
 
				+
			
 
				+What:		/sys/class/power_supply/<supply_name>/temp_alert_max
			
 
				+Date:		July 2012
			
 
				+Contact:	linux-pm@vger.kernel.org
			
 
				+Description:
			
 
				+		Maximum supply temperature trip-wire value where the supply will
			
 
				+		notify user-space of the event. This is normally used for the
			
 
				+		charging scenario where user-space needs to know the supply
			
 
				+		temperature has crossed an upper threshold so it can take
			
 
				+		appropriate action (e.g. warning user that the supply
			
 
				+		temperature is critically high, and charging has stopped to
			
 
				+		remedy the situation).
			
 
				+
			
 
				+		Access: Read
			
 
				+		Valid values: Represented in 1/10 Degrees Celsius
			
 
				+
			
 
				+What:		/sys/class/power_supply/<supply_name>/temp_alert_min
			
 
				+Date:		July 2012
			
 
				+Contact:	linux-pm@vger.kernel.org
			
 
				+Description:
			
 
				+		Minimum supply temperature trip-wire value where the supply will
			
 
				+		notify user-space of the event. This is normally used for the
			
 
				+		charging scenario where user-space needs to know the supply
			
 
				+		temperature has crossed a lower threshold so it can take
			
 
				+		appropriate action (e.g. warning user that the supply
			
 
				+		temperature is high, and charging current has been reduced
			
 
				+		accordingly to remedy the situation).
			
 
				+
			
 
				+		Access: Read
			
 
				+		Valid values: Represented in 1/10 Degrees Celsius
			
 
				+
			
 
				+What:		/sys/class/power_supply/<supply_name>/temp_max
			
 
				+Date:		July 2014
			
 
				+Contact:	linux-pm@vger.kernel.org
			
 
				+Description:
			
 
				+		Reports the maximum allowed supply temperature for operation.
			
 
				+
			
 
				+		Access: Read
			
 
				+		Valid values: Represented in 1/10 Degrees Celsius
			
 
				+
			
 
				+What:		/sys/class/power_supply/<supply_name>/temp_min
			
 
				+Date:		July 2014
			
 
				+Contact:	linux-pm@vger.kernel.org
			
 
				+Description:
			
 
				+		Reports the minimum allowed supply temperature for operation.
			
 
				+
			
 
				+		Access: Read
			
 
				+		Valid values: Represented in 1/10 Degrees Celsius
			
 
				+
			
 
				+What: 		/sys/class/power_supply/<supply_name>/usb_type
			
 
				+Date:		March 2018
			
 
				+Contact:	linux-pm@vger.kernel.org
			
 
				+Description:
			
 
				+		Reports what type of USB connection is currently active for
			
 
				+		the supply, for example it can show if USB-PD capable source
			
 
				+		is attached.
			
 
				+
			
 
				+		Access: Read-Only
			
 
				+		Valid values: "Unknown", "SDP", "DCP", "CDP", "ACA", "C", "PD",
			
 
				+			      "PD_DRP", "PD_PPS", "BrickID"
			
 
				+
			
 
				+What: 		/sys/class/power_supply/<supply_name>/voltage_max
			
 
				+Date:		January 2008
			
 
				+Contact:	linux-pm@vger.kernel.org
			
 
				+Description:
			
 
				+		Reports the maximum VBUS voltage the supply can support.
			
 
				+
			
 
				+		Access: Read
			
 
				+		Valid values: Represented in microvolts
			
 
				+
			
 
				+What: 		/sys/class/power_supply/<supply_name>/voltage_min
			
 
				+Date:		January 2008
			
 
				+Contact:	linux-pm@vger.kernel.org
			
 
				+Description:
			
 
				+		Reports the minimum VBUS voltage the supply can support.
			
 
				+
			
 
				+		Access: Read
			
 
				+		Valid values: Represented in microvolts
			
 
				+
			
 
				+What: 		/sys/class/power_supply/<supply_name>/voltage_now
			
 
				+Date:		May 2007
			
 
				+Contact:	linux-pm@vger.kernel.org
			
 
				+Description:
			
 
				+		Reports the VBUS voltage supplied now. This value is generally
			
 
				+		read-only reporting, unless the 'online' state of the supply
			
 
				+		is set to be programmable, in which case this value can be set
			
 
				+		within the reported min/max range.
			
 
				+
			
 
				+		Access: Read, Write
			
 
				+		Valid values: Represented in microvolts
			
 
				+
			
 
				+===== Device Specific Properties =====
			
 
				+
			
 
				 What:		/sys/class/power/ds2760-battery.*/charge_now
			
 
				 Date:		May 2010
			
 
				 KernelVersion:	2.6.35
			
--- a/Documentation/ABI/testing/sysfs-class-rc
+++ b/Documentation/ABI/testing/sysfs-class-rc
@@ -1,7 +1,7 @@
 
				 What:		/sys/class/rc/
			
 
				 Date:		Apr 2010
			
 
				 KernelVersion:	2.6.35
			
 
				-Contact:	Mauro Carvalho Chehab <m.chehab@samsung.com>
			
 
				+Contact:	Mauro Carvalho Chehab <mchehab+samsung@kernel.org>
			
 
				 Description:
			
 
				 		The rc/ class sub-directory belongs to the Remote Controller
			
 
				 		core and provides a sysfs interface for configuring infrared
			
@@ -10,7 +10,7 @@ Description:
 
				 What:		/sys/class/rc/rcN/
			
 
				 Date:		Apr 2010
			
 
				 KernelVersion:	2.6.35
			
 
				-Contact:	Mauro Carvalho Chehab <m.chehab@samsung.com>
			
 
				+Contact:	Mauro Carvalho Chehab <mchehab+samsung@kernel.org>
			
 
				 Description:
			
 
				 		A /sys/class/rc/rcN directory is created for each remote
			
 
				 		control receiver device where N is the number of the receiver.
			
@@ -18,7 +18,7 @@ Description:
 
				 What:		/sys/class/rc/rcN/protocols
			
 
				 Date:		Jun 2010
			
 
				 KernelVersion:	2.6.36
			
 
				-Contact:	Mauro Carvalho Chehab <m.chehab@samsung.com>
			
 
				+Contact:	Mauro Carvalho Chehab <mchehab+samsung@kernel.org>
			
 
				 Description:
			
 
				 		Reading this file returns a list of available protocols,
			
 
				 		something like:
			
@@ -36,7 +36,7 @@ Description:
 
				 What:		/sys/class/rc/rcN/filter
			
 
				 Date:		Jan 2014
			
 
				 KernelVersion:	3.15
			
 
				-Contact:	Mauro Carvalho Chehab <m.chehab@samsung.com>
			
 
				+Contact:	Mauro Carvalho Chehab <mchehab+samsung@kernel.org>
			
 
				 Description:
			
 
				 		Sets the scancode filter expected value.
			
 
				 		Use in combination with /sys/class/rc/rcN/filter_mask to set the
			
@@ -49,7 +49,7 @@ Description:
 
				 What:		/sys/class/rc/rcN/filter_mask
			
 
				 Date:		Jan 2014
			
 
				 KernelVersion:	3.15
			
 
				-Contact:	Mauro Carvalho Chehab <m.chehab@samsung.com>
			
 
				+Contact:	Mauro Carvalho Chehab <mchehab+samsung@kernel.org>
			
 
				 Description:
			
 
				 		Sets the scancode filter mask of bits to compare.
			
 
				 		Use in combination with /sys/class/rc/rcN/filter to set the bits
			
@@ -64,7 +64,7 @@ Description:
 
				 What:		/sys/class/rc/rcN/wakeup_protocols
			
 
				 Date:		Feb 2017
			
 
				 KernelVersion:	4.11
			
 
				-Contact:	Mauro Carvalho Chehab <m.chehab@samsung.com>
			
 
				+Contact:	Mauro Carvalho Chehab <mchehab+samsung@kernel.org>
			
 
				 Description:
			
 
				 		Reading this file returns a list of available protocols to use
			
 
				 		for the wakeup filter, something like:
			
@@ -83,7 +83,7 @@ Description:
 
				 What:		/sys/class/rc/rcN/wakeup_filter
			
 
				 Date:		Jan 2014
			
 
				 KernelVersion:	3.15
			
 
				-Contact:	Mauro Carvalho Chehab <m.chehab@samsung.com>
			
 
				+Contact:	Mauro Carvalho Chehab <mchehab+samsung@kernel.org>
			
 
				 Description:
			
 
				 		Sets the scancode wakeup filter expected value.
			
 
				 		Use in combination with /sys/class/rc/rcN/wakeup_filter_mask to
			
@@ -98,7 +98,7 @@ Description:
 
				 What:		/sys/class/rc/rcN/wakeup_filter_mask
			
 
				 Date:		Jan 2014
			
 
				 KernelVersion:	3.15
			
 
				-Contact:	Mauro Carvalho Chehab <m.chehab@samsung.com>
			
 
				+Contact:	Mauro Carvalho Chehab <mchehab+samsung@kernel.org>
			
 
				 Description:
			
 
				 		Sets the scancode wakeup filter mask of bits to compare.
			
 
				 		Use in combination with /sys/class/rc/rcN/wakeup_filter to set
			
--- a/Documentation/ABI/testing/sysfs-class-rc-nuvoton
+++ b/Documentation/ABI/testing/sysfs-class-rc-nuvoton
@@ -1,7 +1,7 @@
 
				 What:		/sys/class/rc/rcN/wakeup_data
			
 
				 Date:		Mar 2016
			
 
				 KernelVersion:	4.6
			
 
				-Contact:	Mauro Carvalho Chehab <m.chehab@samsung.com>
			
 
				+Contact:	Mauro Carvalho Chehab <mchehab+samsung@kernel.org>
			
 
				 Description:
			
 
				 		Reading this file returns the stored CIR wakeup sequence.
			
 
				 		It starts with a pulse, followed by a space, pulse etc.
			
--- a/Documentation/ABI/testing/sysfs-devices-edac
+++ b/Documentation/ABI/testing/sysfs-devices-edac
@@ -77,7 +77,7 @@ Description:	Read/Write attribute file that controls memory scrubbing.
 
				 
			
 
				 What:		/sys/devices/system/edac/mc/mc*/max_location
			
 
				 Date:		April 2012
			
 
				-Contact:	Mauro Carvalho Chehab <m.chehab@samsung.com>
			
 
				+Contact:	Mauro Carvalho Chehab <mchehab+samsung@kernel.org>
			
 
				 		linux-edac@vger.kernel.org
			
 
				 Description:	This attribute file displays the information about the last
			
 
				 		available memory slot in this memory controller. It is used by
			
@@ -85,7 +85,7 @@ Description:	This attribute file displays the information about the last
 
				 
			
 
				 What:		/sys/devices/system/edac/mc/mc*/(dimm|rank)*/size
			
 
				 Date:		April 2012
			
 
				-Contact:	Mauro Carvalho Chehab <m.chehab@samsung.com>
			
 
				+Contact:	Mauro Carvalho Chehab <mchehab+samsung@kernel.org>
			
 
				 		linux-edac@vger.kernel.org
			
 
				 Description:	This attribute file will display the size of dimm or rank.
			
 
				 		For dimm*/size, this is the size, in MB of the DIMM memory
			
@@ -96,14 +96,14 @@ Description:	This attribute file will display the size of dimm or rank.
 
				 
			
 
				 What:		/sys/devices/system/edac/mc/mc*/(dimm|rank)*/dimm_dev_type
			
 
				 Date:		April 2012
			
 
				-Contact:	Mauro Carvalho Chehab <m.chehab@samsung.com>
			
 
				+Contact:	Mauro Carvalho Chehab <mchehab+samsung@kernel.org>
			
 
				 		linux-edac@vger.kernel.org
			
 
				 Description:	This attribute file will display what type of DRAM device is
			
 
				 		being utilized on this DIMM (x1, x2, x4, x8, ...).
			
 
				 
			
 
				 What:		/sys/devices/system/edac/mc/mc*/(dimm|rank)*/dimm_edac_mode
			
 
				 Date:		April 2012
			
 
				-Contact:	Mauro Carvalho Chehab <m.chehab@samsung.com>
			
 
				+Contact:	Mauro Carvalho Chehab <mchehab+samsung@kernel.org>
			
 
				 		linux-edac@vger.kernel.org
			
 
				 Description:	This attribute file will display what type of Error detection
			
 
				 		and correction is being utilized. For example: S4ECD4ED would
			
@@ -111,7 +111,7 @@ Description:	This attribute file will display what type of Error detection
 
				 
			
 
				 What:		/sys/devices/system/edac/mc/mc*/(dimm|rank)*/dimm_label
			
 
				 Date:		April 2012
			
 
				-Contact:	Mauro Carvalho Chehab <m.chehab@samsung.com>
			
 
				+Contact:	Mauro Carvalho Chehab <mchehab+samsung@kernel.org>
			
 
				 		linux-edac@vger.kernel.org
			
 
				 Description:	This control file allows this DIMM to have a label assigned
			
 
				 		to it. With this label in the module, when errors occur
			
@@ -126,14 +126,14 @@ Description:	This control file allows this DIMM to have a label assigned
 
				 
			
 
				 What:		/sys/devices/system/edac/mc/mc*/(dimm|rank)*/dimm_location
			
 
				 Date:		April 2012
			
 
				-Contact:	Mauro Carvalho Chehab <m.chehab@samsung.com>
			
 
				+Contact:	Mauro Carvalho Chehab <mchehab+samsung@kernel.org>
			
 
				 		linux-edac@vger.kernel.org
			
 
				 Description:	This attribute file will display the location (csrow/channel,
			
 
				 		branch/channel/slot or channel/slot) of the dimm or rank.
			
 
				 
			
 
				 What:		/sys/devices/system/edac/mc/mc*/(dimm|rank)*/dimm_mem_type
			
 
				 Date:		April 2012
			
 
				-Contact:	Mauro Carvalho Chehab <m.chehab@samsung.com>
			
 
				+Contact:	Mauro Carvalho Chehab <mchehab+samsung@kernel.org>
			
 
				 		linux-edac@vger.kernel.org
			
 
				 Description:	This attribute file will display what type of memory is
			
 
				 		currently on this csrow. Normally, either buffered or
			
--- a/Documentation/ABI/testing/sysfs-devices-system-cpu
+++ b/Documentation/ABI/testing/sysfs-devices-system-cpu
@@ -238,9 +238,6 @@ Description:	Discover and change clock speed of CPUs
 
				 
			
 
				 		See files in Documentation/cpu-freq/ for more information.
			
 
				 
			
 
				-		In particular, read Documentation/cpu-freq/user-guide.txt
			
 
				-		to learn how to control the knobs.
			
 
				-
			
 
				 
			
 
				 What:		/sys/devices/system/cpu/cpu#/cpufreq/freqdomain_cpus
			
 
				 Date:		June 2013
			
--- a/Documentation/ABI/testing/sysfs-fs-f2fs
+++ b/Documentation/ABI/testing/sysfs-fs-f2fs
@@ -101,6 +101,7 @@ Date:		February 2015
 
				 Contact:	"Jaegeuk Kim" <jaegeuk@kernel.org>
			
 
				 Description:
			
 
				 		 Controls the trimming rate in batch mode.
			
 
				+		 <deprecated>
			
 
				 
			
 
				 What:		/sys/fs/f2fs/<disk>/cp_interval
			
 
				 Date:		October 2015
			
@@ -140,7 +141,7 @@ Contact:	"Shuoran Liu" <liushuoran@huawei.com>
 
				 Description:
			
 
				 		 Shows total written kbytes issued to disk.
			
 
				 
			
 
				-What:		/sys/fs/f2fs/<disk>/feature
			
 
				+What:		/sys/fs/f2fs/<disk>/features
			
 
				 Date:		July 2017
			
 
				 Contact:	"Jaegeuk Kim" <jaegeuk@kernel.org>
			
 
				 Description:
			
--- a/Documentation/ABI/testing/sysfs-kernel-mm-hugepages
+++ b/Documentation/ABI/testing/sysfs-kernel-mm-hugepages
@@ -12,4 +12,4 @@ Description:
 
				 			free_hugepages
			
 
				 			surplus_hugepages
			
 
				 			resv_hugepages
			
 
				-		See Documentation/vm/hugetlbpage.txt for details.
			
 
				+		See Documentation/admin-guide/mm/hugetlbpage.rst for details.
			
--- a/Documentation/ABI/testing/sysfs-kernel-mm-ksm
+++ b/Documentation/ABI/testing/sysfs-kernel-mm-ksm
@@ -40,7 +40,7 @@ Description:	Kernel Samepage Merging daemon sysfs interface
 
				 		sleep_millisecs: how many milliseconds ksm should sleep between
			
 
				 		scans.
			
 
				 
			
 
				-		See Documentation/vm/ksm.txt for more information.
			
 
				+		See Documentation/vm/ksm.rst for more information.
			
 
				 
			
 
				 What:		/sys/kernel/mm/ksm/merge_across_nodes
			
 
				 Date:		January 2013
			
--- a/Documentation/ABI/testing/sysfs-kernel-slab
+++ b/Documentation/ABI/testing/sysfs-kernel-slab
@@ -37,7 +37,7 @@ Description:
 
				 		The alloc_calls file is read-only and lists the kernel code
			
 
				 		locations from which allocations for this cache were performed.
			
 
				 		The alloc_calls file only contains information if debugging is
			
 
				-		enabled for that cache (see Documentation/vm/slub.txt).
			
 
				+		enabled for that cache (see Documentation/vm/slub.rst).
			
 
				 
			
 
				 What:		/sys/kernel/slab/cache/alloc_fastpath
			
 
				 Date:		February 2008
			
@@ -219,7 +219,7 @@ Contact:	Pekka Enberg <penberg@cs.helsinki.fi>,
 
				 Description:
			
 
				 		The free_calls file is read-only and lists the locations of
			
 
				 		object frees if slab debugging is enabled (see
			
 
				-		Documentation/vm/slub.txt).
			
 
				+		Documentation/vm/slub.rst).
			
 
				 
			
 
				 What:		/sys/kernel/slab/cache/free_fastpath
			
 
				 Date:		February 2008
			
--- a/Documentation/ABI/testing/sysfs-platform-ideapad-laptop
+++ b/Documentation/ABI/testing/sysfs-platform-ideapad-laptop
@@ -25,3 +25,16 @@ Description:
 
				 		Control touchpad mode.
			
 
				 			* 1 -> Switched On
			
 
				 			* 0 -> Switched Off
			
 
				+
			
 
				+What:		/sys/bus/pci/devices/<bdf>/<device>/VPC2004:00/fn_lock
			
 
				+Date:		May 2018
			
 
				+KernelVersion:	4.18
			
 
				+Contact:	"Oleg Keri <ezhi99@gmail.com>"
			
 
				+Description:
			
 
				+		Control fn-lock mode.
			
 
				+			* 1 -> Switched On
			
 
				+			* 0 -> Switched Off
			
 
				+
			
 
				+		For example:
			
 
				+		# echo "0" >	\
			
 
				+		/sys/bus/pci/devices/0000:00:1f.0/PNP0C09:00/VPC2004:00/fn_lock
			
--- a/Documentation/PCI/pci-error-recovery.txt
+++ b/Documentation/PCI/pci-error-recovery.txt
@@ -110,7 +110,7 @@ The actual steps taken by a platform to recover from a PCI error
 
				 event will be platform-dependent, but will follow the general
			
 
				 sequence described below.
			
 
				 
			
 
				-STEP 0: Error Event
			
 
				+STEP 0: Error Event: ERR_NONFATAL
			
 
				 -------------------
			
 
				 A PCI bus error is detected by the PCI hardware.  On powerpc, the slot
			
 
				 is isolated, in that all I/O is blocked: all reads return 0xffffffff,
			
@@ -228,13 +228,7 @@ proceeds to either STEP3 (Link Reset) or to STEP 5 (Resume Operations).
 
				 If any driver returned PCI_ERS_RESULT_NEED_RESET, then the platform
			
 
				 proceeds to STEP 4 (Slot Reset)
			
 
				 
			
 
				-STEP 3: Link Reset
			
 
				-------------------
			
 
				-The platform resets the link.  This is a PCI-Express specific step
			
 
				-and is done whenever a fatal error has been detected that can be
			
 
				-"solved" by resetting the link.
			
 
				-
			
 
				-STEP 4: Slot Reset
			
 
				+STEP 3: Slot Reset
			
 
				 ------------------
			
 
				 
			
 
				 In response to a return value of PCI_ERS_RESULT_NEED_RESET, the
			
@@ -320,7 +314,7 @@ Failure).
 
				 >>> However, it probably should.
			
 
				 
			
 
				 
			
 
				-STEP 5: Resume Operations
			
 
				+STEP 4: Resume Operations
			
 
				 -------------------------
			
 
				 The platform will call the resume() callback on all affected device
			
 
				 drivers if all drivers on the segment have returned
			
@@ -332,7 +326,7 @@ a result code.
 
				 At this point, if a new error happens, the platform will restart
			
 
				 a new error recovery sequence.
			
 
				 
			
 
				-STEP 6: Permanent Failure
			
 
				+STEP 5: Permanent Failure
			
 
				 -------------------------
			
 
				 A "permanent failure" has occurred, and the platform cannot recover
			
 
				 the device.  The platform will call error_detected() with a
			
@@ -355,6 +349,27 @@ errors. See the discussion in powerpc/eeh-pci-error-recovery.txt
 
				 for additional detail on real-life experience of the causes of
			
 
				 software errors.
			
 
				 
			
 
				+STEP 0: Error Event: ERR_FATAL
			
 
				+-------------------
			
 
				+PCI bus error is detected by the PCI hardware. On powerpc, the slot is
			
 
				+isolated, in that all I/O is blocked: all reads return 0xffffffff, all
			
 
				+writes are ignored.
			
 
				+
			
 
				+STEP 1: Remove devices
			
 
				+--------------------
			
 
				+Platform removes the devices depending on the error agent, it could be
			
 
				+this port for all subordinates or upstream component (likely downstream
			
 
				+port)
			
 
				+
			
 
				+STEP 2: Reset link
			
 
				+--------------------
			
 
				+The platform resets the link.  This is a PCI-Express specific step and is
			
 
				+done whenever a fatal error has been detected that can be "solved" by
			
 
				+resetting the link.
			
 
				+
			
 
				+STEP 3: Re-enumerate the devices
			
 
				+--------------------
			
 
				+Initiates the re-enumeration.
			
 
				 
			
 
				 Conclusion; General Remarks
			
 
				 ---------------------------
			
--- a/Documentation/RCU/whatisRCU.txt
+++ b/Documentation/RCU/whatisRCU.txt
@@ -1,3 +1,5 @@
 
				+What is RCU?  --  "Read, Copy, Update"
			
 
				+
			
 
				 Please note that the "What is RCU?" LWN series is an excellent place
			
 
				 to start learning about RCU:
			
 
				 
			
--- a/Documentation/accelerators/ocxl.rst
+++ b/Documentation/accelerators/ocxl.rst
@@ -157,6 +157,17 @@ OCXL_IOCTL_GET_METADATA:
 
				   Obtains configuration information from the card, such at the size of
			
 
				   MMIO areas, the AFU version, and the PASID for the current context.
			
 
				 
			
 
				+OCXL_IOCTL_ENABLE_P9_WAIT:
			
 
				+
			
 
				+  Allows the AFU to wake a userspace thread executing 'wait'. Returns
			
 
				+  information to userspace to allow it to configure the AFU. Note that
			
 
				+  this is only available on POWER9.
			
 
				+
			
 
				+OCXL_IOCTL_GET_FEATURES:
			
 
				+
			
 
				+  Reports on which CPU features that affect OpenCAPI are usable from
			
 
				+  userspace.
			
 
				+
			
 
				 
			
 
				 mmap
			
 
				 ----
			
--- a/Documentation/acpi/cppc_sysfs.txt
+++ b/Documentation/acpi/cppc_sysfs.txt
@@ -0,0 +1,69 @@
 
				+
			
 
				+	Collaborative Processor Performance Control (CPPC)
			
 
				+
			
 
				+CPPC defined in the ACPI spec describes a mechanism for the OS to manage the
			
 
				+performance of a logical processor on a contiguous and abstract performance
			
 
				+scale. CPPC exposes a set of registers to describe abstract performance scale,
			
 
				+to request performance levels and to measure per-cpu delivered performance.
			
 
				+
			
 
				+For more details on CPPC please refer to the ACPI specification at:
			
 
				+
			
 
				+http://uefi.org/specifications
			
 
				+
			
 
				+Some of the CPPC registers are exposed via sysfs under:
			
 
				+
			
 
				+/sys/devices/system/cpu/cpuX/acpi_cppc/
			
 
				+
			
 
				+for each cpu X
			
 
				+
			
 
				+--------------------------------------------------------------------------------
			
 
				+
			
 
				+$ ls -lR  /sys/devices/system/cpu/cpu0/acpi_cppc/
			
 
				+/sys/devices/system/cpu/cpu0/acpi_cppc/:
			
 
				+total 0
			
 
				+-r--r--r-- 1 root root 65536 Mar  5 19:38 feedback_ctrs
			
 
				+-r--r--r-- 1 root root 65536 Mar  5 19:38 highest_perf
			
 
				+-r--r--r-- 1 root root 65536 Mar  5 19:38 lowest_freq
			
 
				+-r--r--r-- 1 root root 65536 Mar  5 19:38 lowest_nonlinear_perf
			
 
				+-r--r--r-- 1 root root 65536 Mar  5 19:38 lowest_perf
			
 
				+-r--r--r-- 1 root root 65536 Mar  5 19:38 nominal_freq
			
 
				+-r--r--r-- 1 root root 65536 Mar  5 19:38 nominal_perf
			
 
				+-r--r--r-- 1 root root 65536 Mar  5 19:38 reference_perf
			
 
				+-r--r--r-- 1 root root 65536 Mar  5 19:38 wraparound_time
			
 
				+
			
 
				+--------------------------------------------------------------------------------
			
 
				+
			
 
				+* highest_perf : Highest performance of this processor (abstract scale).
			
 
				+* nominal_perf : Highest sustained performance of this processor (abstract scale).
			
 
				+* lowest_nonlinear_perf : Lowest performance of this processor with nonlinear
			
 
				+  power savings (abstract scale).
			
 
				+* lowest_perf : Lowest performance of this processor (abstract scale).
			
 
				+
			
 
				+* lowest_freq : CPU frequency corresponding to lowest_perf (in MHz).
			
 
				+* nominal_freq : CPU frequency corresponding to nominal_perf (in MHz).
			
 
				+  The above frequencies should only be used to report processor performance in
			
 
				+  frequency instead of abstract scale. These values should not be used for any
			
 
				+  functional decisions.
			
 
				+
			
 
				+* feedback_ctrs : Includes both Reference and delivered performance counters.
			
 
				+  Reference counter ticks up proportional to processor's reference performance.
			
 
				+  Delivered counter ticks up proportional to processor's delivered performance.
			
 
				+* wraparound_time: Minimum time for the feedback counters to wraparound (seconds).
			
 
				+* reference_perf : Performance level at which reference performance counter
			
 
				+  accumulates (abstract scale).
			
 
				+
			
 
				+--------------------------------------------------------------------------------
			
 
				+
			
 
				+		Computing Average Delivered Performance
			
 
				+
			
 
				+Below describes the steps to compute the average performance delivered by taking
			
 
				+two different snapshots of feedback counters at time T1 and T2.
			
 
				+
			
 
				+T1: Read feedback_ctrs as fbc_t1
			
 
				+    Wait or run some workload
			
 
				+T2: Read feedback_ctrs as fbc_t2
			
 
				+
			
 
				+delivered_counter_delta = fbc_t2[del] - fbc_t1[del]
			
 
				+reference_counter_delta = fbc_t2[ref] - fbc_t1[ref]
			
 
				+
			
 
				+delivered_perf = (reference_perf x delivered_counter_delta) / reference_counter_delta
			
--- a/Documentation/acpi/method-customizing.txt
+++ b/Documentation/acpi/method-customizing.txt
@@ -16,7 +16,8 @@ control method rather than override the entire DSDT, because kernel
 
				 rebuild/reboot is not needed and test result can be got in minutes.
			
 
				 
			
 
				 Note: Only ACPI METHOD can be overridden, any other object types like
			
 
				-      "Device", "OperationRegion", are not recognized.
			
 
				+      "Device", "OperationRegion", are not recognized. Methods
			
 
				+      declared inside scope operators are also not supported.
			
 
				 Note: The same ACPI control method can be overridden for many times,
			
 
				       and it's always the latest one that used by Linux/kernel.
			
 
				 Note: To get the ACPI debug object output (Store (AAAA, Debug)),
			
@@ -32,8 +33,6 @@ Note: To get the ACPI debug object output (Store (AAAA, Debug)),
 
				 
			
 
				       DefinitionBlock ("", "SSDT", 1, "", "", 0x20080715)
			
 
				       {
			
 
				-	External (ACON)
			
 
				-
			
 
				 	Method (\_SB_.AC._PSR, 0, NotSerialized)
			
 
				 	{
			
 
				 		Store ("In AC _PSR", Debug)
			
@@ -42,9 +41,10 @@ Note: To get the ACPI debug object output (Store (AAAA, Debug)),
 
				       }
			
 
				       Note that the full pathname of the method in ACPI namespace
			
 
				       should be used.
			
 
				-      And remember to use "External" to declare external objects.
			
 
				    e) assemble the file to generate the AML code of the method.
			
 
				-      e.g. "iasl psr.asl" (psr.aml is generated as a result)
			
 
				+      e.g. "iasl -vw 6084 psr.asl" (psr.aml is generated as a result)
			
 
				+      If parameter "-vw 6084" is not supported by your iASL compiler,
			
 
				+      please try a newer version.
			
 
				    f) mount debugfs by "mount -t debugfs none /sys/kernel/debug"
			
 
				    g) override the old method via the debugfs by running
			
 
				       "cat /tmp/psr.aml > /sys/kernel/debug/acpi/custom_method"
			
--- a/Documentation/admin-guide/LSM/apparmor.rst
+++ b/Documentation/admin-guide/LSM/apparmor.rst
@@ -44,8 +44,8 @@ Links
 
				 
			
 
				 Mailing List - apparmor@lists.ubuntu.com
			
 
				 
			
 
				-Wiki - http://apparmor.wiki.kernel.org/
			
 
				+Wiki - http://wiki.apparmor.net
			
 
				 
			
 
				-User space tools - https://launchpad.net/apparmor
			
 
				+User space tools - https://gitlab.com/apparmor
			
 
				 
			
 
				-Kernel module - git://git.kernel.org/pub/scm/linux/kernel/git/jj/apparmor-dev.git
			
 
				+Kernel module - git://git.kernel.org/pub/scm/linux/kernel/git/jj/linux-apparmor
			
--- a/Documentation/admin-guide/bcache.rst
+++ b/Documentation/admin-guide/bcache.rst
--- a/Documentation/admin-guide/cgroup-v2.rst
+++ b/Documentation/admin-guide/cgroup-v2.rst
@@ -0,0 +1,2040 @@
 
				+================
			
 
				+Control Group v2
			
 
				+================
			
 
				+
			
 
				+:Date: October, 2015
			
 
				+:Author: Tejun Heo <tj@kernel.org>
			
 
				+
			
 
				+This is the authoritative documentation on the design, interface and
			
 
				+conventions of cgroup v2.  It describes all userland-visible aspects
			
 
				+of cgroup including core and specific controller behaviors.  All
			
 
				+future changes must be reflected in this document.  Documentation for
			
 
				+v1 is available under Documentation/cgroup-v1/.
			
 
				+
			
 
				+.. CONTENTS
			
 
				+
			
 
				+   1. Introduction
			
 
				+     1-1. Terminology
			
 
				+     1-2. What is cgroup?
			
 
				+   2. Basic Operations
			
 
				+     2-1. Mounting
			
 
				+     2-2. Organizing Processes and Threads
			
 
				+       2-2-1. Processes
			
 
				+       2-2-2. Threads
			
 
				+     2-3. [Un]populated Notification
			
 
				+     2-4. Controlling Controllers
			
 
				+       2-4-1. Enabling and Disabling
			
 
				+       2-4-2. Top-down Constraint
			
 
				+       2-4-3. No Internal Process Constraint
			
 
				+     2-5. Delegation
			
 
				+       2-5-1. Model of Delegation
			
 
				+       2-5-2. Delegation Containment
			
 
				+     2-6. Guidelines
			
 
				+       2-6-1. Organize Once and Control
			
 
				+       2-6-2. Avoid Name Collisions
			
 
				+   3. Resource Distribution Models
			
 
				+     3-1. Weights
			
 
				+     3-2. Limits
			
 
				+     3-3. Protections
			
 
				+     3-4. Allocations
			
 
				+   4. Interface Files
			
 
				+     4-1. Format
			
 
				+     4-2. Conventions
			
 
				+     4-3. Core Interface Files
			
 
				+   5. Controllers
			
 
				+     5-1. CPU
			
 
				+       5-1-1. CPU Interface Files
			
 
				+     5-2. Memory
			
 
				+       5-2-1. Memory Interface Files
			
 
				+       5-2-2. Usage Guidelines
			
 
				+       5-2-3. Memory Ownership
			
 
				+     5-3. IO
			
 
				+       5-3-1. IO Interface Files
			
 
				+       5-3-2. Writeback
			
 
				+     5-4. PID
			
 
				+       5-4-1. PID Interface Files
			
 
				+     5-5. Device
			
 
				+     5-6. RDMA
			
 
				+       5-6-1. RDMA Interface Files
			
 
				+     5-7. Misc
			
 
				+       5-7-1. perf_event
			
 
				+     5-N. Non-normative information
			
 
				+       5-N-1. CPU controller root cgroup process behaviour
			
 
				+       5-N-2. IO controller root cgroup process behaviour
			
 
				+   6. Namespace
			
 
				+     6-1. Basics
			
 
				+     6-2. The Root and Views
			
 
				+     6-3. Migration and setns(2)
			
 
				+     6-4. Interaction with Other Namespaces
			
 
				+   P. Information on Kernel Programming
			
 
				+     P-1. Filesystem Support for Writeback
			
 
				+   D. Deprecated v1 Core Features
			
 
				+   R. Issues with v1 and Rationales for v2
			
 
				+     R-1. Multiple Hierarchies
			
 
				+     R-2. Thread Granularity
			
 
				+     R-3. Competition Between Inner Nodes and Threads
			
 
				+     R-4. Other Interface Issues
			
 
				+     R-5. Controller Issues and Remedies
			
 
				+       R-5-1. Memory
			
 
				+
			
 
				+
			
 
				+Introduction
			
 
				+============
			
 
				+
			
 
				+Terminology
			
 
				+-----------
			
 
				+
			
 
				+"cgroup" stands for "control group" and is never capitalized.  The
			
 
				+singular form is used to designate the whole feature and also as a
			
 
				+qualifier as in "cgroup controllers".  When explicitly referring to
			
 
				+multiple individual control groups, the plural form "cgroups" is used.
			
 
				+
			
 
				+
			
 
				+What is cgroup?
			
 
				+---------------
			
 
				+
			
 
				+cgroup is a mechanism to organize processes hierarchically and
			
 
				+distribute system resources along the hierarchy in a controlled and
			
 
				+configurable manner.
			
 
				+
			
 
				+cgroup is largely composed of two parts - the core and controllers.
			
 
				+cgroup core is primarily responsible for hierarchically organizing
			
 
				+processes.  A cgroup controller is usually responsible for
			
 
				+distributing a specific type of system resource along the hierarchy
			
 
				+although there are utility controllers which serve purposes other than
			
 
				+resource distribution.
			
 
				+
			
 
				+cgroups form a tree structure and every process in the system belongs
			
 
				+to one and only one cgroup.  All threads of a process belong to the
			
 
				+same cgroup.  On creation, all processes are put in the cgroup that
			
 
				+the parent process belongs to at the time.  A process can be migrated
			
 
				+to another cgroup.  Migration of a process doesn't affect already
			
 
				+existing descendant processes.
			
 
				+
			
 
				+Following certain structural constraints, controllers may be enabled or
			
 
				+disabled selectively on a cgroup.  All controller behaviors are
			
 
				+hierarchical - if a controller is enabled on a cgroup, it affects all
			
 
				+processes which belong to the cgroups constituting the inclusive
			
 
				+sub-hierarchy of the cgroup.  When a controller is enabled on a nested
			
 
				+cgroup, it always restricts the resource distribution further.  The
			
 
				+restrictions set closer to the root in the hierarchy can not be
			
 
				+overridden from further away.
			
 
				+
			
 
				+
			
 
				+Basic Operations
			
 
				+================
			
 
				+
			
 
				+Mounting
			
 
				+--------
			
 
				+
			
 
				+Unlike v1, cgroup v2 has only a single hierarchy.  The cgroup v2
			
 
				+hierarchy can be mounted with the following mount command::
			
 
				+
			
 
				+  # mount -t cgroup2 none $MOUNT_POINT
			
 
				+
			
 
				+cgroup2 filesystem has the magic number 0x63677270 ("cgrp").  All
			
 
				+controllers which support v2 and are not bound to a v1 hierarchy are
			
 
				+automatically bound to the v2 hierarchy and show up at the root.
			
 
				+Controllers which are not in active use in the v2 hierarchy can be
			
 
				+bound to other hierarchies.  This allows mixing v2 hierarchy with the
			
 
				+legacy v1 multiple hierarchies in a fully backward compatible way.
			
 
				+
			
 
				+A controller can be moved across hierarchies only after the controller
			
 
				+is no longer referenced in its current hierarchy.  Because per-cgroup
			
 
				+controller states are destroyed asynchronously and controllers may
			
 
				+have lingering references, a controller may not show up immediately on
			
 
				+the v2 hierarchy after the final umount of the previous hierarchy.
			
 
				+Similarly, a controller should be fully disabled to be moved out of
			
 
				+the unified hierarchy and it may take some time for the disabled
			
 
				+controller to become available for other hierarchies; furthermore, due
			
 
				+to inter-controller dependencies, other controllers may need to be
			
 
				+disabled too.
			
 
				+
			
 
				+While useful for development and manual configurations, moving
			
 
				+controllers dynamically between the v2 and other hierarchies is
			
 
				+strongly discouraged for production use.  It is recommended to decide
			
 
				+the hierarchies and controller associations before starting using the
			
 
				+controllers after system boot.
			
 
				+
			
 
				+During transition to v2, system management software might still
			
 
				+automount the v1 cgroup filesystem and so hijack all controllers
			
 
				+during boot, before manual intervention is possible. To make testing
			
 
				+and experimenting easier, the kernel parameter cgroup_no_v1= allows
			
 
				+disabling controllers in v1 and making them always available in v2.
			
 
				+
			
 
				+cgroup v2 currently supports the following mount options.
			
 
				+
			
 
				+  nsdelegate
			
 
				+
			
 
				+	Consider cgroup namespaces as delegation boundaries.  This
			
 
				+	option is system wide and can only be set on mount or modified
			
 
				+	through remount from the init namespace.  The mount option is
			
 
				+	ignored on non-init namespace mounts.  Please refer to the
			
 
				+	Delegation section for details.
			
 
				+
			
 
				+
			
 
				+Organizing Processes and Threads
			
 
				+--------------------------------
			
 
				+
			
 
				+Processes
			
 
				+~~~~~~~~~
			
 
				+
			
 
				+Initially, only the root cgroup exists to which all processes belong.
			
 
				+A child cgroup can be created by creating a sub-directory::
			
 
				+
			
 
				+  # mkdir $CGROUP_NAME
			
 
				+
			
 
				+A given cgroup may have multiple child cgroups forming a tree
			
 
				+structure.  Each cgroup has a read-writable interface file
			
 
				+"cgroup.procs".  When read, it lists the PIDs of all processes which
			
 
				+belong to the cgroup one-per-line.  The PIDs are not ordered and the
			
 
				+same PID may show up more than once if the process got moved to
			
 
				+another cgroup and then back or the PID got recycled while reading.
			
 
				+
			
 
				+A process can be migrated into a cgroup by writing its PID to the
			
 
				+target cgroup's "cgroup.procs" file.  Only one process can be migrated
			
 
				+on a single write(2) call.  If a process is composed of multiple
			
 
				+threads, writing the PID of any thread migrates all threads of the
			
 
				+process.
			
 
				+
			
 
				+When a process forks a child process, the new process is born into the
			
 
				+cgroup that the forking process belongs to at the time of the
			
 
				+operation.  After exit, a process stays associated with the cgroup
			
 
				+that it belonged to at the time of exit until it's reaped; however, a
			
 
				+zombie process does not appear in "cgroup.procs" and thus can't be
			
 
				+moved to another cgroup.
			
 
				+
			
 
				+A cgroup which doesn't have any children or live processes can be
			
 
				+destroyed by removing the directory.  Note that a cgroup which doesn't
			
 
				+have any children and is associated only with zombie processes is
			
 
				+considered empty and can be removed::
			
 
				+
			
 
				+  # rmdir $CGROUP_NAME
			
 
				+
			
 
				+"/proc/$PID/cgroup" lists a process's cgroup membership.  If legacy
			
 
				+cgroup is in use in the system, this file may contain multiple lines,
			
 
				+one for each hierarchy.  The entry for cgroup v2 is always in the
			
 
				+format "0::$PATH"::
			
 
				+
			
 
				+  # cat /proc/842/cgroup
			
 
				+  ...
			
 
				+  0::/test-cgroup/test-cgroup-nested
			
 
				+
			
 
				+If the process becomes a zombie and the cgroup it was associated with
			
 
				+is removed subsequently, " (deleted)" is appended to the path::
			
 
				+
			
 
				+  # cat /proc/842/cgroup
			
 
				+  ...
			
 
				+  0::/test-cgroup/test-cgroup-nested (deleted)
			
 
				+
			
 
				+
			
 
				+Threads
			
 
				+~~~~~~~
			
 
				+
			
 
				+cgroup v2 supports thread granularity for a subset of controllers to
			
 
				+support use cases requiring hierarchical resource distribution across
			
 
				+the threads of a group of processes.  By default, all threads of a
			
 
				+process belong to the same cgroup, which also serves as the resource
			
 
				+domain to host resource consumptions which are not specific to a
			
 
				+process or thread.  The thread mode allows threads to be spread across
			
 
				+a subtree while still maintaining the common resource domain for them.
			
 
				+
			
 
				+Controllers which support thread mode are called threaded controllers.
			
 
				+The ones which don't are called domain controllers.
			
 
				+
			
 
				+Marking a cgroup threaded makes it join the resource domain of its
			
 
				+parent as a threaded cgroup.  The parent may be another threaded
			
 
				+cgroup whose resource domain is further up in the hierarchy.  The root
			
 
				+of a threaded subtree, that is, the nearest ancestor which is not
			
 
				+threaded, is called threaded domain or thread root interchangeably and
			
 
				+serves as the resource domain for the entire subtree.
			
 
				+
			
 
				+Inside a threaded subtree, threads of a process can be put in
			
 
				+different cgroups and are not subject to the no internal process
			
 
				+constraint - threaded controllers can be enabled on non-leaf cgroups
			
 
				+whether they have threads in them or not.
			
 
				+
			
 
				+As the threaded domain cgroup hosts all the domain resource
			
 
				+consumptions of the subtree, it is considered to have internal
			
 
				+resource consumptions whether there are processes in it or not and
			
 
				+can't have populated child cgroups which aren't threaded.  Because the
			
 
				+root cgroup is not subject to no internal process constraint, it can
			
 
				+serve both as a threaded domain and a parent to domain cgroups.
			
 
				+
			
 
				+The current operation mode or type of the cgroup is shown in the
			
 
				+"cgroup.type" file which indicates whether the cgroup is a normal
			
 
				+domain, a domain which is serving as the domain of a threaded subtree,
			
 
				+or a threaded cgroup.
			
 
				+
			
 
				+On creation, a cgroup is always a domain cgroup and can be made
			
 
				+threaded by writing "threaded" to the "cgroup.type" file.  The
			
 
				+operation is single direction::
			
 
				+
			
 
				+  # echo threaded > cgroup.type
			
 
				+
			
 
				+Once threaded, the cgroup can't be made a domain again.  To enable the
			
 
				+thread mode, the following conditions must be met.
			
 
				+
			
 
				+- As the cgroup will join the parent's resource domain, the parent
			
 
				+  must either be a valid (threaded) domain or a threaded cgroup.
			
 
				+
			
 
				+- When the parent is an unthreaded domain, it must not have any domain
			
 
				+  controllers enabled or populated domain children.  The root is
			
 
				+  exempt from this requirement.
			
 
				+
			
 
				+Topology-wise, a cgroup can be in an invalid state.  Please consider
			
 
				+the following topology::
			
 
				+
			
 
				+  A (threaded domain) - B (threaded) - C (domain, just created)
			
 
				+
			
 
				+C is created as a domain but isn't connected to a parent which can
			
 
				+host child domains.  C can't be used until it is turned into a
			
 
				+threaded cgroup.  "cgroup.type" file will report "domain (invalid)" in
			
 
				+these cases.  Operations which fail due to invalid topology use
			
 
				+EOPNOTSUPP as the errno.
			
 
				+
			
 
				+A domain cgroup is turned into a threaded domain when one of its child
			
 
				+cgroups becomes threaded or threaded controllers are enabled in the
			
 
				+"cgroup.subtree_control" file while there are processes in the cgroup.
			
 
				+A threaded domain reverts to a normal domain when the conditions
			
 
				+clear.
			
 
				+
			
 
				+When read, "cgroup.threads" contains the list of the thread IDs of all
			
 
				+threads in the cgroup.  Except that the operations are per-thread
			
 
				+instead of per-process, "cgroup.threads" has the same format and
			
 
				+behaves the same way as "cgroup.procs".  While "cgroup.threads" can be
			
 
				+written to in any cgroup, as it can only move threads inside the same
			
 
				+threaded domain, its operations are confined inside each threaded
			
 
				+subtree.
			
 
				+
			
 
				+The threaded domain cgroup serves as the resource domain for the whole
			
 
				+subtree, and, while the threads can be scattered across the subtree,
			
 
				+all the processes are considered to be in the threaded domain cgroup.
			
 
				+"cgroup.procs" in a threaded domain cgroup contains the PIDs of all
			
 
				+processes in the subtree and is not readable in the subtree proper.
			
 
				+However, "cgroup.procs" can be written to from anywhere in the subtree
			
 
				+to migrate all threads of the matching process to the cgroup.
			
 
				+
			
 
				+Only threaded controllers can be enabled in a threaded subtree.  When
			
 
				+a threaded controller is enabled inside a threaded subtree, it only
			
 
				+accounts for and controls resource consumptions associated with the
			
 
				+threads in the cgroup and its descendants.  All consumptions which
			
 
				+aren't tied to a specific thread belong to the threaded domain cgroup.
			
 
				+
			
 
				+Because a threaded subtree is exempt from no internal process
			
 
				+constraint, a threaded controller must be able to handle competition
			
 
				+between threads in a non-leaf cgroup and its child cgroups.  Each
			
 
				+threaded controller defines how such competitions are handled.
			
 
				+
			
 
				+
			
 
				+[Un]populated Notification
			
 
				+--------------------------
			
 
				+
			
 
				+Each non-root cgroup has a "cgroup.events" file which contains a
			
 
				+"populated" field indicating whether the cgroup's sub-hierarchy has
			
 
				+live processes in it.  Its value is 0 if there is no live process in
			
 
				+the cgroup and its descendants; otherwise, 1.  poll and [id]notify
			
 
				+events are triggered when the value changes.  This can be used, for
			
 
				+example, to start a clean-up operation after all processes of a given
			
 
				+sub-hierarchy have exited.  The populated state updates and
			
 
				+notifications are recursive.  Consider the following sub-hierarchy
			
 
				+where the numbers in the parentheses represent the numbers of processes
			
 
				+in each cgroup::
			
 
				+
			
 
				+  A(4) - B(0) - C(1)
			
 
				+              \ D(0)
			
 
				+
			
 
				+A, B and C's "populated" fields would be 1 while D's 0.  After the one
			
 
				+process in C exits, B and C's "populated" fields would flip to "0" and
			
 
				+file modified events will be generated on the "cgroup.events" files of
			
 
				+both cgroups.
			
 
				+
			
 
				+
			
 
				+Controlling Controllers
			
 
				+-----------------------
			
 
				+
			
 
				+Enabling and Disabling
			
 
				+~~~~~~~~~~~~~~~~~~~~~~
			
 
				+
			
 
				+Each cgroup has a "cgroup.controllers" file which lists all
			
 
				+controllers available for the cgroup to enable::
			
 
				+
			
 
				+  # cat cgroup.controllers
			
 
				+  cpu io memory
			
 
				+
			
 
				+No controller is enabled by default.  Controllers can be enabled and
			
 
				+disabled by writing to the "cgroup.subtree_control" file::
			
 
				+
			
 
				+  # echo "+cpu +memory -io" > cgroup.subtree_control
			
 
				+
			
 
				+Only controllers which are listed in "cgroup.controllers" can be
			
 
				+enabled.  When multiple operations are specified as above, either they
			
 
				+all succeed or all fail.  If multiple operations on the same controller
			
 
				+are specified, the last one is effective.
			
 
				+
			
 
				+Enabling a controller in a cgroup indicates that the distribution of
			
 
				+the target resource across its immediate children will be controlled.
			
 
				+Consider the following sub-hierarchy.  The enabled controllers are
			
 
				+listed in parentheses::
			
 
				+
			
 
				+  A(cpu,memory) - B(memory) - C()
			
 
				+                            \ D()
			
 
				+
			
 
				+As A has "cpu" and "memory" enabled, A will control the distribution
			
 
				+of CPU cycles and memory to its children, in this case, B.  As B has
			
 
				+"memory" enabled but not "cpu", C and D will compete freely on CPU
			
 
				+cycles but their division of memory available to B will be controlled.
			
 
				+
			
 
				+As a controller regulates the distribution of the target resource to
			
 
				+the cgroup's children, enabling it creates the controller's interface
			
 
				+files in the child cgroups.  In the above example, enabling "cpu" on B
			
 
				+would create the "cpu." prefixed controller interface files in C and
			
 
				+D.  Likewise, disabling "memory" from B would remove the "memory."
			
 
				+prefixed controller interface files from C and D.  This means that the
			
 
				+controller interface files - anything which doesn't start with
			
 
				+"cgroup." - are owned by the parent rather than the cgroup itself.
			
 
				+
			
 
				+
			
 
				+Top-down Constraint
			
 
				+~~~~~~~~~~~~~~~~~~~
			
 
				+
			
 
				+Resources are distributed top-down and a cgroup can further distribute
			
 
				+a resource only if the resource has been distributed to it from the
			
 
				+parent.  This means that all non-root "cgroup.subtree_control" files
			
 
				+can only contain controllers which are enabled in the parent's
			
 
				+"cgroup.subtree_control" file.  A controller can be enabled only if
			
 
				+the parent has the controller enabled and a controller can't be
			
 
				+disabled if one or more children have it enabled.
			
 
				+
			
 
				+
			
 
				+No Internal Process Constraint
			
 
				+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
			
 
				+
			
 
				+Non-root cgroups can distribute domain resources to their children
			
 
				+only when they don't have any processes of their own.  In other words,
			
 
				+only domain cgroups which don't contain any processes can have domain
			
 
				+controllers enabled in their "cgroup.subtree_control" files.
			
 
				+
			
 
				+This guarantees that, when a domain controller is looking at the part
			
 
				+of the hierarchy which has it enabled, processes are always only on
			
 
				+the leaves.  This rules out situations where child cgroups compete
			
 
				+against internal processes of the parent.
			
 
				+
			
 
				+The root cgroup is exempt from this restriction.  Root contains
			
 
				+processes and anonymous resource consumption which can't be associated
			
 
				+with any other cgroups and requires special treatment from most
			
 
				+controllers.  How resource consumption in the root cgroup is governed
			
 
				+is up to each controller (for more information on this topic please
			
 
				+refer to the Non-normative information section in the Controllers
			
 
				+chapter).
			
 
				+
			
 
				+Note that the restriction doesn't get in the way if there is no
			
 
				+enabled controller in the cgroup's "cgroup.subtree_control".  This is
			
 
				+important as otherwise it wouldn't be possible to create children of a
			
 
				+populated cgroup.  To control resource distribution of a cgroup, the
			
 
				+cgroup must create children and transfer all its processes to the
			
 
				+children before enabling controllers in its "cgroup.subtree_control"
			
 
				+file.
			
 
				+
			
 
				+
			
 
				+Delegation
			
 
				+----------
			
 
				+
			
 
				+Model of Delegation
			
 
				+~~~~~~~~~~~~~~~~~~~
			
 
				+
			
 
				+A cgroup can be delegated in two ways.  First, to a less privileged
			
 
				+user by granting write access of the directory and its "cgroup.procs",
			
 
				+"cgroup.threads" and "cgroup.subtree_control" files to the user.
			
 
				+Second, if the "nsdelegate" mount option is set, automatically to a
			
 
				+cgroup namespace on namespace creation.
			
 
				+
			
 
				+Because the resource control interface files in a given directory
			
 
				+control the distribution of the parent's resources, the delegatee
			
 
				+shouldn't be allowed to write to them.  For the first method, this is
			
 
				+achieved by not granting access to these files.  For the second, the
			
 
				+kernel rejects writes to all files other than "cgroup.procs" and
			
 
				+"cgroup.subtree_control" on a namespace root from inside the
			
 
				+namespace.
			
 
				+
			
 
				+The end results are equivalent for both delegation types.  Once
			
 
				+delegated, the user can build a sub-hierarchy under the directory,
			
 
				+organize processes inside it as it sees fit and further distribute the
			
 
				+resources it received from the parent.  The limits and other settings
			
 
				+of all resource controllers are hierarchical and regardless of what
			
 
				+happens in the delegated sub-hierarchy, nothing can escape the
			
 
				+resource restrictions imposed by the parent.
			
 
				+
			
 
				+Currently, cgroup doesn't impose any restrictions on the number of
			
 
				+cgroups in or nesting depth of a delegated sub-hierarchy; however,
			
 
				+this may be limited explicitly in the future.
			
 
				+
			
 
				+
			
 
				+Delegation Containment
			
 
				+~~~~~~~~~~~~~~~~~~~~~~
			
 
				+
			
 
				+A delegated sub-hierarchy is contained in the sense that processes
			
 
				+can't be moved into or out of the sub-hierarchy by the delegatee.
			
 
				+
			
 
				+For delegations to a less privileged user, this is achieved by
			
 
				+requiring the following conditions for a process with a non-root euid
			
 
				+to migrate a target process into a cgroup by writing its PID to the
			
 
				+"cgroup.procs" file.
			
 
				+
			
 
				+- The writer must have write access to the "cgroup.procs" file.
			
 
				+
			
 
				+- The writer must have write access to the "cgroup.procs" file of the
			
 
				+  common ancestor of the source and destination cgroups.
			
 
				+
			
 
				+The above two constraints ensure that while a delegatee may migrate
			
 
				+processes around freely in the delegated sub-hierarchy it can't pull
			
 
				+in from or push out to outside the sub-hierarchy.
			
 
				+
			
 
				+For an example, let's assume cgroups C0 and C1 have been delegated to
			
 
				+user U0 who created C00, C01 under C0 and C10 under C1 as follows and
			
 
				+all processes under C0 and C1 belong to U0::
			
 
				+
			
 
				+  ~~~~~~~~~~~~~ - C0 - C00
			
 
				+  ~ cgroup    ~      \ C01
			
 
				+  ~ hierarchy ~
			
 
				+  ~~~~~~~~~~~~~ - C1 - C10
			
 
				+
			
 
				+Let's also say U0 wants to write the PID of a process which is
			
 
				+currently in C10 into "C00/cgroup.procs".  U0 has write access to the
			
 
				+file; however, the common ancestor of the source cgroup C10 and the
			
 
				+destination cgroup C00 is above the points of delegation and U0 would
			
 
				+not have write access to its "cgroup.procs" files and thus the write
			
 
				+will be denied with -EACCES.
			
 
				+
			
 
				+For delegations to namespaces, containment is achieved by requiring
			
 
				+that both the source and destination cgroups are reachable from the
			
 
				+namespace of the process which is attempting the migration.  If either
			
 
				+is not reachable, the migration is rejected with -ENOENT.
			
 
				+
			
 
				+
			
 
				+Guidelines
			
 
				+----------
			
 
				+
			
 
				+Organize Once and Control
			
 
				+~~~~~~~~~~~~~~~~~~~~~~~~~
			
 
				+
			
 
				+Migrating a process across cgroups is a relatively expensive operation
			
 
				+and stateful resources such as memory are not moved together with the
			
 
				+process.  This is an explicit design decision as there often exist
			
 
				+inherent trade-offs between migration and various hot paths in terms
			
 
				+of synchronization cost.
			
 
				+
			
 
				+As such, migrating processes across cgroups frequently as a means to
			
 
				+apply different resource restrictions is discouraged.  A workload
			
 
				+should be assigned to a cgroup according to the system's logical and
			
 
				+resource structure once on start-up.  Dynamic adjustments to resource
			
 
				+distribution can be made by changing controller configuration through
			
 
				+the interface files.
			
 
				+
			
 
				+
			
 
				+Avoid Name Collisions
			
 
				+~~~~~~~~~~~~~~~~~~~~~
			
 
				+
			
 
				+Interface files for a cgroup and its children cgroups occupy the same
			
 
				+directory and it is possible to create children cgroups which collide
			
 
				+with interface files.
			
 
				+
			
 
				+All cgroup core interface files are prefixed with "cgroup." and each
			
 
				+controller's interface files are prefixed with the controller name and
			
 
				+a dot.  A controller's name is composed of lowercase letters and
			
 
				+'_'s but never begins with an '_' so it can be used as the prefix
			
 
				+character for collision avoidance.  Also, interface file names won't
			
 
				+start or end with terms which are often used in categorizing workloads
			
 
				+such as job, service, slice, unit or workload.
			
 
				+
			
 
				+cgroup doesn't do anything to prevent name collisions and it's the
			
 
				+user's responsibility to avoid them.
			
 
				+
			
 
				+
			
 
				+Resource Distribution Models
			
 
				+============================
			
 
				+
			
 
				+cgroup controllers implement several resource distribution schemes
			
 
				+depending on the resource type and expected use cases.  This section
			
 
				+describes major schemes in use along with their expected behaviors.
			
 
				+
			
 
				+
			
 
				+Weights
			
 
				+-------
			
 
				+
			
 
				+A parent's resource is distributed by adding up the weights of all
			
 
				+active children and giving each the fraction matching the ratio of its
			
 
				+weight against the sum.  As only children which can make use of the
			
 
				+resource at the moment participate in the distribution, this is
			
 
				+work-conserving.  Due to the dynamic nature, this model is usually
			
 
				+used for stateless resources.
			
 
				+
			
 
				+All weights are in the range [1, 10000] with the default at 100.  This
			
 
				+allows symmetric multiplicative biases in both directions at fine
			
 
				+enough granularity while staying in the intuitive range.
			
 
				+
			
 
				+As long as the weight is in range, all configuration combinations are
			
 
				+valid and there is no reason to reject configuration changes or
			
 
				+process migrations.
			
 
				+
			
 
				+"cpu.weight" proportionally distributes CPU cycles to active children
			
 
				+and is an example of this type.
			
 
				+
			
 
				+
			
 
				+Limits
			
 
				+------
			
 
				+
			
 
				+A child can only consume up to the configured amount of the resource.
			
 
				+Limits can be over-committed - the sum of the limits of children can
			
 
				+exceed the amount of resource available to the parent.
			
 
				+
			
 
				+Limits are in the range [0, max] and default to "max", which is noop.
			
 
				+
			
 
				+As limits can be over-committed, all configuration combinations are
			
 
				+valid and there is no reason to reject configuration changes or
			
 
				+process migrations.
			
 
				+
			
 
				+"io.max" limits the maximum BPS and/or IOPS that a cgroup can consume
			
 
				+on an IO device and is an example of this type.
			
 
				+
			
 
				+
			
 
				+Protections
			
 
				+-----------
			
 
				+
			
 
				+A cgroup is protected to be allocated up to the configured amount of
			
 
				+the resource if the usages of all its ancestors are under their
			
 
				+protected levels.  Protections can be hard guarantees or best effort
			
 
				+soft boundaries.  Protections can also be over-committed in which case
			
 
				+only up to the amount available to the parent is protected among
			
 
				+children.
			
 
				+
			
 
				+Protections are in the range [0, max] and default to 0, which is
			
 
				+noop.
			
 
				+
			
 
				+As protections can be over-committed, all configuration combinations
			
 
				+are valid and there is no reason to reject configuration changes or
			
 
				+process migrations.
			
 
				+
			
 
				+"memory.low" implements best-effort memory protection and is an
			
 
				+example of this type.
			
 
				+
			
 
				+
			
 
				+Allocations
			
 
				+-----------
			
 
				+
			
 
				+A cgroup is exclusively allocated a certain amount of a finite
			
 
				+resource.  Allocations can't be over-committed - the sum of the
			
 
				+allocations of children can not exceed the amount of resource
			
 
				+available to the parent.
			
 
				+
			
 
				+Allocations are in the range [0, max] and default to 0, which is no
			
 
				+resource.
			
 
				+
			
 
				+As allocations can't be over-committed, some configuration
			
 
				+combinations are invalid and should be rejected.  Also, if the
			
 
				+resource is mandatory for execution of processes, process migrations
			
 
				+may be rejected.
			
 
				+
			
 
				+"cpu.rt.max" hard-allocates realtime slices and is an example of this
			
 
				+type.
			
 
				+
			
 
				+
			
 
				+Interface Files
			
 
				+===============
			
 
				+
			
 
				+Format
			
 
				+------
			
 
				+
			
 
				+All interface files should be in one of the following formats whenever
			
 
				+possible::
			
 
				+
			
 
				+  New-line separated values
			
 
				+  (when only one value can be written at once)
			
 
				+
			
 
				+	VAL0\n
			
 
				+	VAL1\n
			
 
				+	...
			
 
				+
			
 
				+  Space separated values
			
 
				+  (when read-only or multiple values can be written at once)
			
 
				+
			
 
				+	VAL0 VAL1 ...\n
			
 
				+
			
 
				+  Flat keyed
			
 
				+
			
 
				+	KEY0 VAL0\n
			
 
				+	KEY1 VAL1\n
			
 
				+	...
			
 
				+
			
 
				+  Nested keyed
			
 
				+
			
 
				+	KEY0 SUB_KEY0=VAL00 SUB_KEY1=VAL01...
			
 
				+	KEY1 SUB_KEY0=VAL10 SUB_KEY1=VAL11...
			
 
				+	...
			
 
				+
			
 
				+For a writable file, the format for writing should generally match
			
 
				+reading; however, controllers may allow omitting later fields or
			
 
				+implement restricted shortcuts for most common use cases.
			
 
				+
			
 
				+For both flat and nested keyed files, only the values for a single key
			
 
				+can be written at a time.  For nested keyed files, the sub key pairs
			
 
				+may be specified in any order and not all pairs have to be specified.
			
 
				+
			
 
				+
			
 
				+Conventions
			
 
				+-----------
			
 
				+
			
 
				+- Settings for a single feature should be contained in a single file.
			
 
				+
			
 
				+- The root cgroup should be exempt from resource control and thus
			
 
				+  shouldn't have resource control interface files.  Also,
			
 
				+  informational files on the root cgroup which end up showing global
			
 
				+  information available elsewhere shouldn't exist.
			
 
				+
			
 
				+- If a controller implements weight based resource distribution, its
			
 
				+  interface file should be named "weight" and have the range [1,
			
 
				+  10000] with 100 as the default.  The values are chosen to allow
			
 
				+  enough and symmetric bias in both directions while keeping it
			
 
				+  intuitive (the default is 100%).
			
 
				+
			
 
				+- If a controller implements an absolute resource guarantee and/or
			
 
				+  limit, the interface files should be named "min" and "max"
			
 
				+  respectively.  If a controller implements best effort resource
			
 
				+  guarantee and/or limit, the interface files should be named "low"
			
 
				+  and "high" respectively.
			
 
				+
			
 
				+  In the above four control files, the special token "max" should be
			
 
				+  used to represent upward infinity for both reading and writing.
			
 
				+
			
 
				+- If a setting has a configurable default value and keyed specific
			
 
				+  overrides, the default entry should be keyed with "default" and
			
 
				+  appear as the first entry in the file.
			
 
				+
			
 
				+  The default value can be updated by writing either "default $VAL" or
			
 
				+  "$VAL".
			
 
				+
			
 
				+  When writing to update a specific override, "default" can be used as
			
 
				+  the value to indicate removal of the override.  Override entries
			
 
				+  with "default" as the value must not appear when read.
			
 
				+
			
 
				+  For example, a setting which is keyed by major:minor device numbers
			
 
				+  with integer values may look like the following::
			
 
				+
			
 
				+    # cat cgroup-example-interface-file
			
 
				+    default 150
			
 
				+    8:0 300
			
 
				+
			
 
				+  The default value can be updated by::
			
 
				+
			
 
				+    # echo 125 > cgroup-example-interface-file
			
 
				+
			
 
				+  or::
			
 
				+
			
 
				+    # echo "default 125" > cgroup-example-interface-file
			
 
				+
			
 
				+  An override can be set by::
			
 
				+
			
 
				+    # echo "8:16 170" > cgroup-example-interface-file
			
 
				+
			
 
				+  and cleared by::
			
 
				+
			
 
				+    # echo "8:0 default" > cgroup-example-interface-file
			
 
				+    # cat cgroup-example-interface-file
			
 
				+    default 125
			
 
				+    8:16 170
			
 
				+
			
 
				+- For events which are not very high frequency, an interface file
			
 
				+  "events" should be created which lists event key value pairs.
			
 
				+  Whenever a notifiable event happens, file modified event should be
			
 
				+  generated on the file.
			
 
				+
			
 
				+
			
 
				+Core Interface Files
			
 
				+--------------------
			
 
				+
			
 
				+All cgroup core files are prefixed with "cgroup."
			
 
				+
			
 
				+  cgroup.type
			
 
				+
			
 
				+	A read-write single value file which exists on non-root
			
 
				+	cgroups.
			
 
				+
			
 
				+	When read, it indicates the current type of the cgroup, which
			
 
				+	can be one of the following values.
			
 
				+
			
 
				+	- "domain" : A normal valid domain cgroup.
			
 
				+
			
 
				+	- "domain threaded" : A threaded domain cgroup which is
			
 
				+          serving as the root of a threaded subtree.
			
 
				+
			
 
				+	- "domain invalid" : A cgroup which is in an invalid state.
			
 
				+	  It can't be populated or have controllers enabled.  It may
			
 
				+	  be allowed to become a threaded cgroup.
			
 
				+
			
 
				+	- "threaded" : A threaded cgroup which is a member of a
			
 
				+          threaded subtree.
			
 
				+
			
 
				+	A cgroup can be turned into a threaded cgroup by writing
			
 
				+	"threaded" to this file.
			
 
				+
			
 
				+  cgroup.procs
			
 
				+	A read-write new-line separated values file which exists on
			
 
				+	all cgroups.
			
 
				+
			
 
				+	When read, it lists the PIDs of all processes which belong to
			
 
				+	the cgroup one-per-line.  The PIDs are not ordered and the
			
 
				+	same PID may show up more than once if the process got moved
			
 
				+	to another cgroup and then back or the PID got recycled while
			
 
				+	reading.
			
 
				+
			
 
				+	A PID can be written to migrate the process associated with
			
 
				+	the PID to the cgroup.  The writer should match all of the
			
 
				+	following conditions.
			
 
				+
			
 
				+	- It must have write access to the "cgroup.procs" file.
			
 
				+
			
 
				+	- It must have write access to the "cgroup.procs" file of the
			
 
				+	  common ancestor of the source and destination cgroups.
			
 
				+
			
 
				+	When delegating a sub-hierarchy, write access to this file
			
 
				+	should be granted along with the containing directory.
			
 
				+
			
 
				+	In a threaded cgroup, reading this file fails with EOPNOTSUPP
			
 
				+	as all the processes belong to the thread root.  Writing is
			
 
				+	supported and moves every thread of the process to the cgroup.
			
 
				+
			
 
				+  cgroup.threads
			
 
				+	A read-write new-line separated values file which exists on
			
 
				+	all cgroups.
			
 
				+
			
 
				+	When read, it lists the TIDs of all threads which belong to
			
 
				+	the cgroup one-per-line.  The TIDs are not ordered and the
			
 
				+	same TID may show up more than once if the thread got moved to
			
 
				+	another cgroup and then back or the TID got recycled while
			
 
				+	reading.
			
 
				+
			
 
				+	A TID can be written to migrate the thread associated with the
			
 
				+	TID to the cgroup.  The writer should match all of the
			
 
				+	following conditions.
			
 
				+
			
 
				+	- It must have write access to the "cgroup.threads" file.
			
 
				+
			
 
				+	- The cgroup that the thread is currently in must be in the
			
 
				+          same resource domain as the destination cgroup.
			
 
				+
			
 
				+	- It must have write access to the "cgroup.procs" file of the
			
 
				+	  common ancestor of the source and destination cgroups.
			
 
				+
			
 
				+	When delegating a sub-hierarchy, write access to this file
			
 
				+	should be granted along with the containing directory.
			
 
				+
			
 
				+  cgroup.controllers
			
 
				+	A read-only space separated values file which exists on all
			
 
				+	cgroups.
			
 
				+
			
 
				+	It shows space separated list of all controllers available to
			
 
				+	the cgroup.  The controllers are not ordered.
			
 
				+
			
 
				+  cgroup.subtree_control
			
 
				+	A read-write space separated values file which exists on all
			
 
				+	cgroups.  Starts out empty.
			
 
				+
			
 
				+	When read, it shows space separated list of the controllers
			
 
				+	which are enabled to control resource distribution from the
			
 
				+	cgroup to its children.
			
 
				+
			
 
				+	Space separated list of controllers prefixed with '+' or '-'
			
 
				+	can be written to enable or disable controllers.  A controller
			
 
				+	name prefixed with '+' enables the controller and '-'
			
 
				+	disables.  If a controller appears more than once on the list,
			
 
				+	the last one is effective.  When multiple enable and disable
			
 
				+	operations are specified, either all succeed or all fail.
			
 
				+
			
 
				+  cgroup.events
			
 
				+	A read-only flat-keyed file which exists on non-root cgroups.
			
 
				+	The following entries are defined.  Unless specified
			
 
				+	otherwise, a value change in this file generates a file
			
 
				+	modified event.
			
 
				+
			
 
				+	  populated
			
 
				+		1 if the cgroup or its descendants contains any live
			
 
				+		processes; otherwise, 0.
			
 
				+
			
 
				+  cgroup.max.descendants
			
 
				+	A read-write single value file.  The default is "max".
			
 
				+
			
 
				+	Maximum allowed number of descendant cgroups.
			
 
				+	If the actual number of descendants is equal or larger,
			
 
				+	an attempt to create a new cgroup in the hierarchy will fail.
			
 
				+
			
 
				+  cgroup.max.depth
			
 
				+	A read-write single value file.  The default is "max".
			
 
				+
			
 
				+	Maximum allowed descent depth below the current cgroup.
			
 
				+	If the actual descent depth is equal or larger,
			
 
				+	an attempt to create a new child cgroup will fail.
			
 
				+
			
 
				+  cgroup.stat
			
 
				+	A read-only flat-keyed file with the following entries:
			
 
				+
			
 
				+	  nr_descendants
			
 
				+		Total number of visible descendant cgroups.
			
 
				+
			
 
				+	  nr_dying_descendants
			
 
				+		Total number of dying descendant cgroups. A cgroup becomes
			
 
				+		dying after being deleted by a user. The cgroup will remain
			
 
				+		in dying state for some undefined time (which can depend
			
 
				+		on system load) before being completely destroyed.
			
 
				+
			
 
				+		A process can't enter a dying cgroup under any circumstances,
			
 
				+		and a dying cgroup can't revive.
			
 
				+
			
 
				+		A dying cgroup can consume system resources not exceeding
			
 
				+		limits, which were active at the moment of cgroup deletion.
			
 
				+
			
 
				+
			
 
				+Controllers
			
 
				+===========
			
 
				+
			
 
				+CPU
			
 
				+---
			
 
				+
			
 
				+The "cpu" controller regulates distribution of CPU cycles.  This
			
 
				+controller implements weight and absolute bandwidth limit models for
			
 
				+normal scheduling policy and absolute bandwidth allocation model for
			
 
				+realtime scheduling policy.
			
 
				+
			
 
				+WARNING: cgroup2 doesn't yet support control of realtime processes and
			
 
				+the cpu controller can only be enabled when all RT processes are in
			
 
				+the root cgroup.  Be aware that system management software may already
			
 
				+have placed RT processes into nonroot cgroups during the system boot
			
 
				+process, and these processes may need to be moved to the root cgroup
			
 
				+before the cpu controller can be enabled.
			
 
				+
			
 
				+
			
 
				+CPU Interface Files
			
 
				+~~~~~~~~~~~~~~~~~~~
			
 
				+
			
 
				+All time durations are in microseconds.
			
 
				+
			
 
				+  cpu.stat
			
 
				+	A read-only flat-keyed file which exists on non-root cgroups.
			
 
				+	This file exists whether the controller is enabled or not.
			
 
				+
			
 
				+	It always reports the following three stats:
			
 
				+
			
 
				+	- usage_usec
			
 
				+	- user_usec
			
 
				+	- system_usec
			
 
				+
			
 
				+	and the following three when the controller is enabled:
			
 
				+
			
 
				+	- nr_periods
			
 
				+	- nr_throttled
			
 
				+	- throttled_usec
			
 
				+
			
 
				+  cpu.weight
			
 
				+	A read-write single value file which exists on non-root
			
 
				+	cgroups.  The default is "100".
			
 
				+
			
 
				+	The weight in the range [1, 10000].
			
 
				+
			
 
				+  cpu.weight.nice
			
 
				+	A read-write single value file which exists on non-root
			
 
				+	cgroups.  The default is "0".
			
 
				+
			
 
				+	The nice value is in the range [-20, 19].
			
 
				+
			
 
				+	This interface file is an alternative interface for
			
 
				+	"cpu.weight" and allows reading and setting weight using the
			
 
				+	same values used by nice(2).  Because the range is smaller and
			
 
				+	granularity is coarser for the nice values, the read value is
			
 
				+	the closest approximation of the current weight.
			
 
				+
			
 
				+  cpu.max
			
 
				+	A read-write two value file which exists on non-root cgroups.
			
 
				+	The default is "max 100000".
			
 
				+
			
 
				+	The maximum bandwidth limit.  It's in the following format::
			
 
				+
			
 
				+	  $MAX $PERIOD
			
 
				+
			
 
				+	which indicates that the group may consume up to $MAX in each
			
 
				+	$PERIOD duration.  "max" for $MAX indicates no limit.  If only
			
 
				+	one number is written, $MAX is updated.
			
 
				+
			
 
				+
			
 
				+Memory
			
 
				+------
			
 
				+
			
 
				+The "memory" controller regulates distribution of memory.  Memory is
			
 
				+stateful and implements both limit and protection models.  Due to the
			
 
				+intertwining between memory usage and reclaim pressure and the
			
 
				+stateful nature of memory, the distribution model is relatively
			
 
				+complex.
			
 
				+
			
 
				+While not completely water-tight, all major memory usages by a given
			
 
				+cgroup are tracked so that the total memory consumption can be
			
 
				+accounted and controlled to a reasonable extent.  Currently, the
			
 
				+following types of memory usages are tracked.
			
 
				+
			
 
				+- Userland memory - page cache and anonymous memory.
			
 
				+
			
 
				+- Kernel data structures such as dentries and inodes.
			
 
				+
			
 
				+- TCP socket buffers.
			
 
				+
			
 
				+The above list may expand in the future for better coverage.
			
 
				+
			
 
				+
			
 
				+Memory Interface Files
			
 
				+~~~~~~~~~~~~~~~~~~~~~~
			
 
				+
			
 
				+All memory amounts are in bytes.  If a value which is not aligned to
			
 
				+PAGE_SIZE is written, the value may be rounded up to the closest
			
 
				+PAGE_SIZE multiple when read back.
			
 
				+
			
 
				+  memory.current
			
 
				+	A read-only single value file which exists on non-root
			
 
				+	cgroups.
			
 
				+
			
 
				+	The total amount of memory currently being used by the cgroup
			
 
				+	and its descendants.
			
 
				+
			
 
				+  memory.min
			
 
				+	A read-write single value file which exists on non-root
			
 
				+	cgroups.  The default is "0".
			
 
				+
			
 
				+	Hard memory protection.  If the memory usage of a cgroup
			
 
				+	is within its effective min boundary, the cgroup's memory
			
 
				+	won't be reclaimed under any conditions. If there is no
			
 
				+	unprotected reclaimable memory available, OOM killer
			
 
				+	is invoked.
			
 
				+
			
 
				+	Effective min boundary is limited by memory.min values of
			
 
				+	all ancestor cgroups. If there is memory.min overcommitment
			
 
				+	(child cgroup or cgroups are requiring more protected memory
			
 
				+	than parent will allow), then each child cgroup will get
			
 
				+	the part of parent's protection proportional to its
			
 
				+	actual memory usage below memory.min.
			
 
				+
			
 
				+	Putting more memory than generally available under this
			
 
				+	protection is discouraged and may lead to constant OOMs.
			
 
				+
			
 
				+	If a memory cgroup is not populated with processes,
			
 
				+	its memory.min is ignored.
			
 
				+
			
 
				+  memory.low
			
 
				+	A read-write single value file which exists on non-root
			
 
				+	cgroups.  The default is "0".
			
 
				+
			
 
				+	Best-effort memory protection.  If the memory usage of a
			
 
				+	cgroup is within its effective low boundary, the cgroup's
			
 
				+	memory won't be reclaimed unless memory can be reclaimed
			
 
				+	from unprotected cgroups.
			
 
				+
			
 
				+	Effective low boundary is limited by memory.low values of
			
 
				+	all ancestor cgroups. If there is memory.low overcommitment
			
 
				+	(child cgroup or cgroups are requiring more protected memory
			
 
				+	than parent will allow), then each child cgroup will get
			
 
				+	the part of parent's protection proportional to its
			
 
				+	actual memory usage below memory.low.
			
 
				+
			
 
				+	Putting more memory than generally available under this
			
 
				+	protection is discouraged.
			
 
				+
			
 
				+  memory.high
			
 
				+	A read-write single value file which exists on non-root
			
 
				+	cgroups.  The default is "max".
			
 
				+
			
 
				+	Memory usage throttle limit.  This is the main mechanism to
			
 
				+	control memory usage of a cgroup.  If a cgroup's usage goes
			
 
				+	over the high boundary, the processes of the cgroup are
			
 
				+	throttled and put under heavy reclaim pressure.
			
 
				+
			
 
				+	Going over the high limit never invokes the OOM killer and
			
 
				+	under extreme conditions the limit may be breached.
			
 
				+
			
 
				+  memory.max
			
 
				+	A read-write single value file which exists on non-root
			
 
				+	cgroups.  The default is "max".
			
 
				+
			
 
				+	Memory usage hard limit.  This is the final protection
			
 
				+	mechanism.  If a cgroup's memory usage reaches this limit and
			
 
				+	can't be reduced, the OOM killer is invoked in the cgroup.
			
 
				+	Under certain circumstances, the usage may go over the limit
			
 
				+	temporarily.
			
 
				+
			
 
				+	This is the ultimate protection mechanism.  As long as the
			
 
				+	high limit is used and monitored properly, this limit's
			
 
				+	utility is limited to providing the final safety net.
			
 
				+
			
 
				+  memory.events
			
 
				+	A read-only flat-keyed file which exists on non-root cgroups.
			
 
				+	The following entries are defined.  Unless specified
			
 
				+	otherwise, a value change in this file generates a file
			
 
				+	modified event.
			
 
				+
			
 
				+	  low
			
 
				+		The number of times the cgroup is reclaimed due to
			
 
				+		high memory pressure even though its usage is under
			
 
				+		the low boundary.  This usually indicates that the low
			
 
				+		boundary is over-committed.
			
 
				+
			
 
				+	  high
			
 
				+		The number of times processes of the cgroup are
			
 
				+		throttled and routed to perform direct memory reclaim
			
 
				+		because the high memory boundary was exceeded.  For a
			
 
				+		cgroup whose memory usage is capped by the high limit
			
 
				+		rather than global memory pressure, this event's
			
 
				+		occurrences are expected.
			
 
				+
			
 
				+	  max
			
 
				+		The number of times the cgroup's memory usage was
			
 
				+		about to go over the max boundary.  If direct reclaim
			
 
				+		fails to bring it down, the cgroup goes to OOM state.
			
 
				+
			
 
				+	  oom
			
 
				+		The number of times the cgroup's memory usage
			
 
				+		reached the limit and allocation was about to fail.
			
 
				+
			
 
				+		Depending on context result could be invocation of OOM
			
 
				+		killer and retrying allocation or failing allocation.
			
 
				+
			
 
				+		Failed allocation in its turn could be returned into
			
 
				+		userspace as -ENOMEM or silently ignored in cases like
			
 
				+		disk readahead.  For now OOM in memory cgroup kills
			
 
				+		tasks iff shortage has happened inside page fault.
			
 
				+
			
 
				+	  oom_kill
			
 
				+		The number of processes belonging to this cgroup
			
 
				+		killed by any kind of OOM killer.
			
 
				+
			
 
				+  memory.stat
			
 
				+	A read-only flat-keyed file which exists on non-root cgroups.
			
 
				+
			
 
				+	This breaks down the cgroup's memory footprint into different
			
 
				+	types of memory, type-specific details, and other information
			
 
				+	on the state and past events of the memory management system.
			
 
				+
			
 
				+	All memory amounts are in bytes.
			
 
				+
			
 
				+	The entries are ordered to be human readable, and new entries
			
 
				+	can show up in the middle. Don't rely on items remaining in a
			
 
				+	fixed position; use the keys to look up specific values!
			
 
				+
			
 
				+	  anon
			
 
				+		Amount of memory used in anonymous mappings such as
			
 
				+		brk(), sbrk(), and mmap(MAP_ANONYMOUS)
			
 
				+
			
 
				+	  file
			
 
				+		Amount of memory used to cache filesystem data,
			
 
				+		including tmpfs and shared memory.
			
 
				+
			
 
				+	  kernel_stack
			
 
				+		Amount of memory allocated to kernel stacks.
			
 
				+
			
 
				+	  slab
			
 
				+		Amount of memory used for storing in-kernel data
			
 
				+		structures.
			
 
				+
			
 
				+	  sock
			
 
				+		Amount of memory used in network transmission buffers
			
 
				+
			
 
				+	  shmem
			
 
				+		Amount of cached filesystem data that is swap-backed,
			
 
				+		such as tmpfs, shm segments, shared anonymous mmap()s
			
 
				+
			
 
				+	  file_mapped
			
 
				+		Amount of cached filesystem data mapped with mmap()
			
 
				+
			
 
				+	  file_dirty
			
 
				+		Amount of cached filesystem data that was modified but
			
 
				+		not yet written back to disk
			
 
				+
			
 
				+	  file_writeback
			
 
				+		Amount of cached filesystem data that was modified and
			
 
				+		is currently being written back to disk
			
 
				+
			
 
				+	  inactive_anon, active_anon, inactive_file, active_file, unevictable
			
 
				+		Amount of memory, swap-backed and filesystem-backed,
			
 
				+		on the internal memory management lists used by the
			
 
				+		page reclaim algorithm
			
 
				+
			
 
				+	  slab_reclaimable
			
 
				+		Part of "slab" that might be reclaimed, such as
			
 
				+		dentries and inodes.
			
 
				+
			
 
				+	  slab_unreclaimable
			
 
				+		Part of "slab" that cannot be reclaimed on memory
			
 
				+		pressure.
			
 
				+
			
 
				+	  pgfault
			
 
				+		Total number of page faults incurred
			
 
				+
			
 
				+	  pgmajfault
			
 
				+		Number of major page faults incurred
			
 
				+
			
 
				+	  workingset_refault
			
 
				+
			
 
				+		Number of refaults of previously evicted pages
			
 
				+
			
 
				+	  workingset_activate
			
 
				+
			
 
				+		Number of refaulted pages that were immediately activated
			
 
				+
			
 
				+	  workingset_nodereclaim
			
 
				+
			
 
				+		Number of times a shadow node has been reclaimed
			
 
				+
			
 
				+	  pgrefill
			
 
				+
			
 
				+		Amount of scanned pages (in an active LRU list)
			
 
				+
			
 
				+	  pgscan
			
 
				+
			
 
				+		Amount of scanned pages (in an inactive LRU list)
			
 
				+
			
 
				+	  pgsteal
			
 
				+
			
 
				+		Amount of reclaimed pages
			
 
				+
			
 
				+	  pgactivate
			
 
				+
			
 
				+		Amount of pages moved to the active LRU list
			
 
				+
			
 
				+	  pgdeactivate
			
 
				+
			
 
				+		Amount of pages moved to the inactive LRU list
			
 
				+
			
 
				+	  pglazyfree
			
 
				+
			
 
				+		Amount of pages postponed to be freed under memory pressure
			
 
				+
			
 
				+	  pglazyfreed
			
 
				+
			
 
				+		Amount of reclaimed lazyfree pages
			
 
				+
			
 
				+  memory.swap.current
			
 
				+	A read-only single value file which exists on non-root
			
 
				+	cgroups.
			
 
				+
			
 
				+	The total amount of swap currently being used by the cgroup
			
 
				+	and its descendants.
			
 
				+
			
 
				+  memory.swap.max
			
 
				+	A read-write single value file which exists on non-root
			
 
				+	cgroups.  The default is "max".
			
 
				+
			
 
				+	Swap usage hard limit.  If a cgroup's swap usage reaches this
			
 
				+	limit, anonymous memory of the cgroup will not be swapped out.
			
 
				+
			
 
				+  memory.swap.events
			
 
				+	A read-only flat-keyed file which exists on non-root cgroups.
			
 
				+	The following entries are defined.  Unless specified
			
 
				+	otherwise, a value change in this file generates a file
			
 
				+	modified event.
			
 
				+
			
 
				+	  max
			
 
				+		The number of times the cgroup's swap usage was about
			
 
				+		to go over the max boundary and swap allocation
			
 
				+		failed.
			
 
				+
			
 
				+	  fail
			
 
				+		The number of times swap allocation failed either
			
 
				+		because of running out of swap system-wide or max
			
 
				+		limit.
			
 
				+
			
 
				+	When reduced under the current usage, the existing swap
			
 
				+	entries are reclaimed gradually and the swap usage may stay
			
 
				+	higher than the limit for an extended period of time.  This
			
 
				+	reduces the impact on the workload and memory management.
			
 
				+
			
 
				+
			
 
				+Usage Guidelines
			
 
				+~~~~~~~~~~~~~~~~
			
 
				+
			
 
				+"memory.high" is the main mechanism to control memory usage.
			
 
				+Over-committing on high limit (sum of high limits > available memory)
			
 
				+and letting global memory pressure distribute memory according to
			
 
				+usage is a viable strategy.
			
 
				+
			
 
				+Because breach of the high limit doesn't trigger the OOM killer but
			
 
				+throttles the offending cgroup, a management agent has ample
			
 
				+opportunities to monitor and take appropriate actions such as granting
			
 
				+more memory or terminating the workload.
			
 
				+
			
 
				+Determining whether a cgroup has enough memory is not trivial as
			
 
				+memory usage doesn't indicate whether the workload can benefit from
			
 
				+more memory.  For example, a workload which writes data received from
			
 
				+network to a file can use all available memory but can also operate as
			
 
				+performant with a small amount of memory.  A measure of memory
			
 
				+pressure - how much the workload is being impacted due to lack of
			
 
				+memory - is necessary to determine whether a workload needs more
			
 
				+memory; unfortunately, memory pressure monitoring mechanism isn't
			
 
				+implemented yet.
			
 
				+
			
 
				+
			
 
				+Memory Ownership
			
 
				+~~~~~~~~~~~~~~~~
			
 
				+
			
 
				+A memory area is charged to the cgroup which instantiated it and stays
			
 
				+charged to the cgroup until the area is released.  Migrating a process
			
 
				+to a different cgroup doesn't move the memory usages that it
			
 
				+instantiated while in the previous cgroup to the new cgroup.
			
 
				+
			
 
				+A memory area may be used by processes belonging to different cgroups.
			
 
				+To which cgroup the area will be charged is indeterminate; however,
			
 
				+over time, the memory area is likely to end up in a cgroup which has
			
 
				+enough memory allowance to avoid high reclaim pressure.
			
 
				+
			
 
				+If a cgroup sweeps a considerable amount of memory which is expected
			
 
				+to be accessed repeatedly by other cgroups, it may make sense to use
			
 
				+POSIX_FADV_DONTNEED to relinquish the ownership of memory areas
			
 
				+belonging to the affected files to ensure correct memory ownership.
			
 
				+
			
 
				+
			
 
				+IO
			
 
				+--
			
 
				+
			
 
				+The "io" controller regulates the distribution of IO resources.  This
			
 
				+controller implements both weight based and absolute bandwidth or IOPS
			
 
				+limit distribution; however, weight based distribution is available
			
 
				+only if cfq-iosched is in use and neither scheme is available for
			
 
				+blk-mq devices.
			
 
				+
			
 
				+
			
 
				+IO Interface Files
			
 
				+~~~~~~~~~~~~~~~~~~
			
 
				+
			
 
				+  io.stat
			
 
				+	A read-only nested-keyed file which exists on non-root
			
 
				+	cgroups.
			
 
				+
			
 
				+	Lines are keyed by $MAJ:$MIN device numbers and not ordered.
			
 
				+	The following nested keys are defined.
			
 
				+
			
 
				+	  ======	===================
			
 
				+	  rbytes	Bytes read
			
 
				+	  wbytes	Bytes written
			
 
				+	  rios		Number of read IOs
			
 
				+	  wios		Number of write IOs
			
 
				+	  ======	===================
			
 
				+
			
 
				+	An example read output follows::
			
 
				+
			
 
				+	  8:16 rbytes=1459200 wbytes=314773504 rios=192 wios=353
			
 
				+	  8:0 rbytes=90430464 wbytes=299008000 rios=8950 wios=1252
			
 
				+
			
 
				+  io.weight
			
 
				+	A read-write flat-keyed file which exists on non-root cgroups.
			
 
				+	The default is "default 100".
			
 
				+
			
 
				+	The first line is the default weight applied to devices
			
 
				+	without specific override.  The rest are overrides keyed by
			
 
				+	$MAJ:$MIN device numbers and not ordered.  The weights are in
			
 
				+	the range [1, 10000] and specify the relative amount of IO time
			
 
				+	the cgroup can use in relation to its siblings.
			
 
				+
			
 
				+	The default weight can be updated by writing either "default
			
 
				+	$WEIGHT" or simply "$WEIGHT".  Overrides can be set by writing
			
 
				+	"$MAJ:$MIN $WEIGHT" and unset by writing "$MAJ:$MIN default".
			
 
				+
			
 
				+	An example read output follows::
			
 
				+
			
 
				+	  default 100
			
 
				+	  8:16 200
			
 
				+	  8:0 50
			
 
				+
			
 
				+  io.max
			
 
				+	A read-write nested-keyed file which exists on non-root
			
 
				+	cgroups.
			
 
				+
			
 
				+	BPS and IOPS based IO limit.  Lines are keyed by $MAJ:$MIN
			
 
				+	device numbers and not ordered.  The following nested keys are
			
 
				+	defined.
			
 
				+
			
 
				+	  =====		==================================
			
 
				+	  rbps		Max read bytes per second
			
 
				+	  wbps		Max write bytes per second
			
 
				+	  riops		Max read IO operations per second
			
 
				+	  wiops		Max write IO operations per second
			
 
				+	  =====		==================================
			
 
				+
			
 
				+	When writing, any number of nested key-value pairs can be
			
 
				+	specified in any order.  "max" can be specified as the value
			
 
				+	to remove a specific limit.  If the same key is specified
			
 
				+	multiple times, the outcome is undefined.
			
 
				+
			
 
				+	BPS and IOPS are measured in each IO direction and IOs are
			
 
				+	delayed if limit is reached.  Temporary bursts are allowed.
			
 
				+
			
 
				+	Setting read limit at 2M BPS and write at 120 IOPS for 8:16::
			
 
				+
			
 
				+	  echo "8:16 rbps=2097152 wiops=120" > io.max
			
 
				+
			
 
				+	Reading returns the following::
			
 
				+
			
 
				+	  8:16 rbps=2097152 wbps=max riops=max wiops=120
			
 
				+
			
 
				+	Write IOPS limit can be removed by writing the following::
			
 
				+
			
 
				+	  echo "8:16 wiops=max" > io.max
			
 
				+
			
 
				+	Reading now returns the following::
			
 
				+
			
 
				+	  8:16 rbps=2097152 wbps=max riops=max wiops=max
			
 
				+
			
 
				+
			
 
				+Writeback
			
 
				+~~~~~~~~~
			
 
				+
			
 
				+Page cache is dirtied through buffered writes and shared mmaps and
			
 
				+written asynchronously to the backing filesystem by the writeback
			
 
				+mechanism.  Writeback sits between the memory and IO domains and
			
 
				+regulates the proportion of dirty memory by balancing dirtying and
			
 
				+write IOs.
			
 
				+
			
 
				+The io controller, in conjunction with the memory controller,
			
 
				+implements control of page cache writeback IOs.  The memory controller
			
 
				+defines the memory domain that dirty memory ratio is calculated and
			
 
				+maintained for and the io controller defines the io domain which
			
 
				+writes out dirty pages for the memory domain.  Both system-wide and
			
 
				+per-cgroup dirty memory states are examined and the more restrictive
			
 
				+of the two is enforced.
			
 
				+
			
 
				+cgroup writeback requires explicit support from the underlying
			
 
				+filesystem.  Currently, cgroup writeback is implemented on ext2, ext4
			
 
				+and btrfs.  On other filesystems, all writeback IOs are attributed to
			
 
				+the root cgroup.
			
 
				+
			
 
				+There are inherent differences in memory and writeback management
			
 
				+which affect how cgroup ownership is tracked.  Memory is tracked per
			
 
				+page while writeback per inode.  For the purpose of writeback, an
			
 
				+inode is assigned to a cgroup and all IO requests to write dirty pages
			
 
				+from the inode are attributed to that cgroup.
			
 
				+
			
 
				+As cgroup ownership for memory is tracked per page, there can be pages
			
 
				+which are associated with different cgroups than the one the inode is
			
 
				+associated with.  These are called foreign pages.  The writeback
			
 
				+constantly keeps track of foreign pages and, if a particular foreign
			
 
				+cgroup becomes the majority over a certain period of time, switches
			
 
				+the ownership of the inode to that cgroup.
			
 
				+
			
 
				+While this model is enough for most use cases where a given inode is
			
 
				+mostly dirtied by a single cgroup even when the main writing cgroup
			
 
				+changes over time, use cases where multiple cgroups write to a single
			
 
				+inode simultaneously are not supported well.  In such circumstances, a
			
 
				+significant portion of IOs are likely to be attributed incorrectly.
			
 
				+As memory controller assigns page ownership on the first use and
			
 
				+doesn't update it until the page is released, even if writeback
			
 
				+strictly follows page ownership, multiple cgroups dirtying overlapping
			
 
				+areas wouldn't work as expected.  It's recommended to avoid such usage
			
 
				+patterns.
			
 
				+
			
 
				+The sysctl knobs which affect writeback behavior are applied to cgroup
			
 
				+writeback as follows.
			
 
				+
			
 
				+  vm.dirty_background_ratio, vm.dirty_ratio
			
 
				+	These ratios apply the same to cgroup writeback with the
			
 
				+	amount of available memory capped by limits imposed by the
			
 
				+	memory controller and system-wide clean memory.
			
 
				+
			
 
				+  vm.dirty_background_bytes, vm.dirty_bytes
			
 
				+	For cgroup writeback, this is calculated into ratio against
			
 
				+	total available memory and applied the same way as
			
 
				+	vm.dirty[_background]_ratio.
			
 
				+
			
 
				+
			
 
				+PID
			
 
				+---
			
 
				+
			
 
				+The process number controller is used to allow a cgroup to stop any
			
 
				+new tasks from being fork()'d or clone()'d after a specified limit is
			
 
				+reached.
			
 
				+
			
 
				+The number of tasks in a cgroup can be exhausted in ways which other
			
 
				+controllers cannot prevent, thus warranting its own controller.  For
			
 
				+example, a fork bomb is likely to exhaust the number of tasks before
			
 
				+hitting memory restrictions.
			
 
				+
			
 
				+Note that PIDs used in this controller refer to TIDs, process IDs as
			
 
				+used by the kernel.
			
 
				+
			
 
				+
			
 
				+PID Interface Files
			
 
				+~~~~~~~~~~~~~~~~~~~
			
 
				+
			
 
				+  pids.max
			
 
				+	A read-write single value file which exists on non-root
			
 
				+	cgroups.  The default is "max".
			
 
				+
			
 
				+	Hard limit of number of processes.
			
 
				+
			
 
				+  pids.current
			
 
				+	A read-only single value file which exists on all cgroups.
			
 
				+
			
 
				+	The number of processes currently in the cgroup and its
			
 
				+	descendants.
			
 
				+
			
 
				+Organisational operations are not blocked by cgroup policies, so it is
			
 
				+possible to have pids.current > pids.max.  This can be done by either
			
 
				+setting the limit to be smaller than pids.current, or attaching enough
			
 
				+processes to the cgroup such that pids.current is larger than
			
 
				+pids.max.  However, it is not possible to violate a cgroup PID policy
			
 
				+through fork() or clone(). These will return -EAGAIN if the creation
			
 
				+of a new process would cause a cgroup policy to be violated.
			
 
				+
			
 
				+
			
 
				+Device controller
			
 
				+-----------------
			
 
				+
			
 
				+Device controller manages access to device files. It includes both
			
 
				+creation of new device files (using mknod), and access to the
			
 
				+existing device files.
			
 
				+
			
 
				+Cgroup v2 device controller has no interface files and is implemented
			
 
				+on top of cgroup BPF. To control access to device files, a user may
			
 
				+create bpf programs of the BPF_CGROUP_DEVICE type and attach them
			
 
				+to cgroups. On an attempt to access a device file, corresponding
			
 
				+BPF programs will be executed, and depending on the return value
			
 
				+the attempt will succeed or fail with -EPERM.
			
 
				+
			
 
				+A BPF_CGROUP_DEVICE program takes a pointer to the bpf_cgroup_dev_ctx
			
 
				+structure, which describes the device access attempt: access type
			
 
				+(mknod/read/write) and device (type, major and minor numbers).
			
 
				+If the program returns 0, the attempt fails with -EPERM, otherwise
			
 
				+it succeeds.
			
 
				+
			
 
				+An example of BPF_CGROUP_DEVICE program may be found in the kernel
			
 
				+source tree in the tools/testing/selftests/bpf/dev_cgroup.c file.
			
 
				+
			
 
				+
			
 
				+RDMA
			
 
				+----
			
 
				+
			
 
				+The "rdma" controller regulates the distribution and accounting of
			
 
				+RDMA resources.
			
 
				+
			
 
				+RDMA Interface Files
			
 
				+~~~~~~~~~~~~~~~~~~~~
			
 
				+
			
 
				+  rdma.max
			
 
				+	A read-write nested-keyed file that exists for all the cgroups
			
 
				+	except root that describes current configured resource limit
			
 
				+	for a RDMA/IB device.
			
 
				+
			
 
				+	Lines are keyed by device name and are not ordered.
			
 
				+	Each line contains space separated resource name and its configured
			
 
				+	limit that can be distributed.
			
 
				+
			
 
				+	The following nested keys are defined.
			
 
				+
			
 
				+	  ==========	=============================
			
 
				+	  hca_handle	Maximum number of HCA Handles
			
 
				+	  hca_object 	Maximum number of HCA Objects
			
 
				+	  ==========	=============================
			
 
				+
			
 
				+	An example for mlx4 and ocrdma device follows::
			
 
				+
			
 
				+	  mlx4_0 hca_handle=2 hca_object=2000
			
 
				+	  ocrdma1 hca_handle=3 hca_object=max
			
 
				+
			
 
				+  rdma.current
			
 
				+	A read-only file that describes current resource usage.
			
 
				+	It exists for all the cgroups except root.
			
 
				+
			
 
				+	An example for mlx4 and ocrdma device follows::
			
 
				+
			
 
				+	  mlx4_0 hca_handle=1 hca_object=20
			
 
				+	  ocrdma1 hca_handle=1 hca_object=23
			
 
				+
			
 
				+
			
 
				+Misc
			
 
				+----
			
 
				+
			
 
				+perf_event
			
 
				+~~~~~~~~~~
			
 
				+
			
 
				+perf_event controller, if not mounted on a legacy hierarchy, is
			
 
				+automatically enabled on the v2 hierarchy so that perf events can
			
 
				+always be filtered by cgroup v2 path.  The controller can still be
			
 
				+moved to a legacy hierarchy after v2 hierarchy is populated.
			
 
				+
			
 
				+
			
 
				+Non-normative information
			
 
				+-------------------------
			
 
				+
			
 
				+This section contains information that isn't considered to be a part of
			
 
				+the stable kernel API and so is subject to change.
			
 
				+
			
 
				+
			
 
				+CPU controller root cgroup process behaviour
			
 
				+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
			
 
				+
			
 
				+When distributing CPU cycles in the root cgroup each thread in this
			
 
				+cgroup is treated as if it was hosted in a separate child cgroup of the
			
 
				+root cgroup. This child cgroup weight is dependent on its thread nice
			
 
				+level.
			
 
				+
			
 
				+For details of this mapping see sched_prio_to_weight array in
			
 
				+kernel/sched/core.c file (values from this array should be scaled
			
 
				+appropriately so the neutral - nice 0 - value is 100 instead of 1024).
			
 
				+
			
 
				+
			
 
				+IO controller root cgroup process behaviour
			
 
				+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
			
 
				+
			
 
				+Root cgroup processes are hosted in an implicit leaf child node.
			
 
				+When distributing IO resources this implicit child node is taken into
			
 
				+account as if it was a normal child cgroup of the root cgroup with a
			
 
				+weight value of 200.
			
 
				+
			
 
				+
			
 
				+Namespace
			
 
				+=========
			
 
				+
			
 
				+Basics
			
 
				+------
			
 
				+
			
 
				+cgroup namespace provides a mechanism to virtualize the view of the
			
 
				+"/proc/$PID/cgroup" file and cgroup mounts.  The CLONE_NEWCGROUP clone
			
 
				+flag can be used with clone(2) and unshare(2) to create a new cgroup
			
 
				+namespace.  The process running inside the cgroup namespace will have
			
 
				+its "/proc/$PID/cgroup" output restricted to cgroupns root.  The
			
 
				+cgroupns root is the cgroup of the process at the time of creation of
			
 
				+the cgroup namespace.
			
 
				+
			
 
				+Without cgroup namespace, the "/proc/$PID/cgroup" file shows the
			
 
				+complete path of the cgroup of a process.  In a container setup where
			
 
				+a set of cgroups and namespaces are intended to isolate processes the
			
 
				+"/proc/$PID/cgroup" file may leak potential system level information
			
 
				+to the isolated processes.  For example::
			
 
				+
			
 
				+  # cat /proc/self/cgroup
			
 
				+  0::/batchjobs/container_id1
			
 
				+
			
 
				+The path '/batchjobs/container_id1' can be considered as system-data
			
 
				+and undesirable to expose to the isolated processes.  cgroup namespace
			
 
				+can be used to restrict visibility of this path.  For example, before
			
 
				+creating a cgroup namespace, one would see::
			
 
				+
			
 
				+  # ls -l /proc/self/ns/cgroup
			
 
				+  lrwxrwxrwx 1 root root 0 2014-07-15 10:37 /proc/self/ns/cgroup -> cgroup:[4026531835]
			
 
				+  # cat /proc/self/cgroup
			
 
				+  0::/batchjobs/container_id1
			
 
				+
			
 
				+After unsharing a new namespace, the view changes::
			
 
				+
			
 
				+  # ls -l /proc/self/ns/cgroup
			
 
				+  lrwxrwxrwx 1 root root 0 2014-07-15 10:35 /proc/self/ns/cgroup -> cgroup:[4026532183]
			
 
				+  # cat /proc/self/cgroup
			
 
				+  0::/
			
 
				+
			
 
				+When some thread from a multi-threaded process unshares its cgroup
			
 
				+namespace, the new cgroupns gets applied to the entire process (all
			
 
				+the threads).  This is natural for the v2 hierarchy; however, for the
			
 
				+legacy hierarchies, this may be unexpected.
			
 
				+
			
 
				+A cgroup namespace is alive as long as there are processes inside or
			
 
				+mounts pinning it.  When the last usage goes away, the cgroup
			
 
				+namespace is destroyed.  The cgroupns root and the actual cgroups
			
 
				+remain.
			
 
				+
			
 
				+
			
 
				+The Root and Views
			
 
				+------------------
			
 
				+
			
 
				+The 'cgroupns root' for a cgroup namespace is the cgroup in which the
			
 
				+process calling unshare(2) is running.  For example, if a process in
			
 
				+/batchjobs/container_id1 cgroup calls unshare, cgroup
			
 
				+/batchjobs/container_id1 becomes the cgroupns root.  For the
			
 
				+init_cgroup_ns, this is the real root ('/') cgroup.
			
 
				+
			
 
				+The cgroupns root cgroup does not change even if the namespace creator
			
 
				+process later moves to a different cgroup::
			
 
				+
			
 
				+  # ~/unshare -c # unshare cgroupns in some cgroup
			
 
				+  # cat /proc/self/cgroup
			
 
				+  0::/
			
 
				+  # mkdir sub_cgrp_1
			
 
				+  # echo 0 > sub_cgrp_1/cgroup.procs
			
 
				+  # cat /proc/self/cgroup
			
 
				+  0::/sub_cgrp_1
			
 
				+
			
 
				+Each process gets its namespace-specific view of "/proc/$PID/cgroup"
			
 
				+
			
 
				+Processes running inside the cgroup namespace will be able to see
			
 
				+cgroup paths (in /proc/self/cgroup) only inside their root cgroup.
			
 
				+From within an unshared cgroupns::
			
 
				+
			
 
				+  # sleep 100000 &
			
 
				+  [1] 7353
			
 
				+  # echo 7353 > sub_cgrp_1/cgroup.procs
			
 
				+  # cat /proc/7353/cgroup
			
 
				+  0::/sub_cgrp_1
			
 
				+
			
 
				+From the initial cgroup namespace, the real cgroup path will be
			
 
				+visible::
			
 
				+
			
 
				+  $ cat /proc/7353/cgroup
			
 
				+  0::/batchjobs/container_id1/sub_cgrp_1
			
 
				+
			
 
				+From a sibling cgroup namespace (that is, a namespace rooted at a
			
 
				+different cgroup), the cgroup path relative to its own cgroup
			
 
				+namespace root will be shown.  For instance, if PID 7353's cgroup
			
 
				+namespace root is at '/batchjobs/container_id2', then it will see::
			
 
				+
			
 
				+  # cat /proc/7353/cgroup
			
 
				+  0::/../container_id2/sub_cgrp_1
			
 
				+
			
 
				+Note that the relative path always starts with '/' to indicate that
			
 
				+it's relative to the cgroup namespace root of the caller.
			
 
				+
			
 
				+
			
 
				+Migration and setns(2)
			
 
				+----------------------
			
 
				+
			
 
				+Processes inside a cgroup namespace can move into and out of the
			
 
				+namespace root if they have proper access to external cgroups.  For
			
 
				+example, from inside a namespace with cgroupns root at
			
 
				+/batchjobs/container_id1, and assuming that the global hierarchy is
			
 
				+still accessible inside cgroupns::
			
 
				+
			
 
				+  # cat /proc/7353/cgroup
			
 
				+  0::/sub_cgrp_1
			
 
				+  # echo 7353 > batchjobs/container_id2/cgroup.procs
			
 
				+  # cat /proc/7353/cgroup
			
 
				+  0::/../container_id2
			
 
				+
			
 
				+Note that this kind of setup is not encouraged.  A task inside cgroup
			
 
				+namespace should only be exposed to its own cgroupns hierarchy.
			
 
				+
			
 
				+setns(2) to another cgroup namespace is allowed when:
			
 
				+
			
 
				+(a) the process has CAP_SYS_ADMIN against its current user namespace
			
 
				+(b) the process has CAP_SYS_ADMIN against the target cgroup
			
 
				+    namespace's userns
			
 
				+
			
 
				+No implicit cgroup changes happen with attaching to another cgroup
			
 
				+namespace.  It is expected that someone moves the attaching
			
 
				+process under the target cgroup namespace root.
			
 
				+
			
 
				+
			
 
				+Interaction with Other Namespaces
			
 
				+---------------------------------
			
 
				+
			
 
				+Namespace specific cgroup hierarchy can be mounted by a process
			
 
				+running inside a non-init cgroup namespace::
			
 
				+
			
 
				+  # mount -t cgroup2 none $MOUNT_POINT
			
 
				+
			
 
				+This will mount the unified cgroup hierarchy with cgroupns root as the
			
 
				+filesystem root.  The process needs CAP_SYS_ADMIN against its user and
			
 
				+mount namespaces.
			
 
				+
			
 
				+The virtualization of /proc/self/cgroup file combined with restricting
			
 
				+the view of cgroup hierarchy by namespace-private cgroupfs mount
			
 
				+provides a properly isolated cgroup view inside the container.
			
 
				+
			
 
				+
			
 
				+Information on Kernel Programming
			
 
				+=================================
			
 
				+
			
 
				+This section contains kernel programming information in the areas
			
 
				+where interacting with cgroup is necessary.  cgroup core and
			
 
				+controllers are not covered.
			
 
				+
			
 
				+
			
 
				+Filesystem Support for Writeback
			
 
				+--------------------------------
			
 
				+
			
 
				+A filesystem can support cgroup writeback by updating
			
 
				+address_space_operations->writepage[s]() to annotate bio's using the
			
 
				+following two functions.
			
 
				+
			
 
				+  wbc_init_bio(@wbc, @bio)
			
 
				+	Should be called for each bio carrying writeback data and
			
 
				+	associates the bio with the inode's owner cgroup.  Can be
			
 
				+	called anytime between bio allocation and submission.
			
 
				+
			
 
				+  wbc_account_io(@wbc, @page, @bytes)
			
 
				+	Should be called for each data segment being written out.
			
 
				+	While this function doesn't care exactly when it's called
			
 
				+	during the writeback session, it's the easiest and most
			
 
				+	natural to call it as data segments are added to a bio.
			
 
				+
			
 
				+With writeback bio's annotated, cgroup support can be enabled per
			
 
				+super_block by setting SB_I_CGROUPWB in ->s_iflags.  This allows for
			
 
				+selective disabling of cgroup writeback support which is helpful when
			
 
				+certain filesystem features, e.g. journaled data mode, are
			
 
				+incompatible.
			
 
				+
			
 
				+wbc_init_bio() binds the specified bio to its cgroup.  Depending on
			
 
				+the configuration, the bio may be executed at a lower priority and if
			
 
				+the writeback session is holding shared resources, e.g. a journal
			
 
				+entry, may lead to priority inversion.  There is no one easy solution
			
 
				+for the problem.  Filesystems can try to work around specific problem
			
 
				+cases by skipping wbc_init_bio() or using bio_associate_blkcg()
			
 
				+directly.
			
 
				+
			
 
				+
			
 
				+Deprecated v1 Core Features
			
 
				+===========================
			
 
				+
			
 
				+- Multiple hierarchies including named ones are not supported.
			
 
				+
			
 
				+- All v1 mount options are not supported.
			
 
				+
			
 
				+- The "tasks" file is removed and "cgroup.procs" is not sorted.
			
 
				+
			
 
				+- "cgroup.clone_children" is removed.
			
 
				+
			
 
				+- /proc/cgroups is meaningless for v2.  Use "cgroup.controllers" file
			
 
				+  at the root instead.
			
 
				+
			
 
				+
			
 
				+Issues with v1 and Rationales for v2
			
 
				+====================================
			
 
				+
			
 
				+Multiple Hierarchies
			
 
				+--------------------
			
 
				+
			
 
				+cgroup v1 allowed an arbitrary number of hierarchies and each
			
 
				+hierarchy could host any number of controllers.  While this seemed to
			
 
				+provide a high level of flexibility, it wasn't useful in practice.
			
 
				+
			
 
				+For example, as there is only one instance of each controller, utility
			
 
				+type controllers such as freezer which can be useful in all
			
 
				+hierarchies could only be used in one.  The issue is exacerbated by
			
 
				+the fact that controllers couldn't be moved to another hierarchy once
			
 
				+hierarchies were populated.  Another issue was that all controllers
			
 
				+bound to a hierarchy were forced to have exactly the same view of the
			
 
				+hierarchy.  It wasn't possible to vary the granularity depending on
			
 
				+the specific controller.
			
 
				+
			
 
				+In practice, these issues heavily limited which controllers could be
			
 
				+put on the same hierarchy and most configurations resorted to putting
			
 
				+each controller on its own hierarchy.  Only closely related ones, such
			
 
				+as the cpu and cpuacct controllers, made sense to be put on the same
			
 
				+hierarchy.  This often meant that userland ended up managing multiple
			
 
				+similar hierarchies repeating the same steps on each hierarchy
			
 
				+whenever a hierarchy management operation was necessary.
			
 
				+
			
 
				+Furthermore, support for multiple hierarchies came at a steep cost.
			
 
				+It greatly complicated cgroup core implementation but more importantly
			
 
				+the support for multiple hierarchies restricted how cgroup could be
			
 
				+used in general and what controllers were able to do.
			
 
				+
			
 
				+There was no limit on how many hierarchies there might be, which meant
			
 
				+that a thread's cgroup membership couldn't be described in finite
			
 
				+length.  The key might contain any number of entries and was unlimited
			
 
				+in length, which made it highly awkward to manipulate and led to
			
 
				+addition of controllers which existed only to identify membership,
			
 
				+which in turn exacerbated the original problem of proliferating number
			
 
				+of hierarchies.
			
 
				+
			
 
				+Also, as a controller couldn't have any expectation regarding the
			
 
				+topologies of hierarchies other controllers might be on, each
			
 
				+controller had to assume that all other controllers were attached to
			
 
				+completely orthogonal hierarchies.  This made it impossible, or at
			
 
				+least very cumbersome, for controllers to cooperate with each other.
			
 
				+
			
 
				+In most use cases, putting controllers on hierarchies which are
			
 
				+completely orthogonal to each other isn't necessary.  What usually is
			
 
				+called for is the ability to have differing levels of granularity
			
 
				+depending on the specific controller.  In other words, hierarchy may
			
 
				+be collapsed from leaf towards root when viewed from specific
			
 
				+controllers.  For example, a given configuration might not care about
			
 
				+how memory is distributed beyond a certain level while still wanting
			
 
				+to control how CPU cycles are distributed.
			
 
				+
			
 
				+
			
 
				+Thread Granularity
			
 
				+------------------
			
 
				+
			
 
				+cgroup v1 allowed threads of a process to belong to different cgroups.
			
 
				+This didn't make sense for some controllers and those controllers
			
 
				+ended up implementing different ways to ignore such situations but
			
 
				+much more importantly it blurred the line between API exposed to
			
 
				+individual applications and system management interface.
			
 
				+
			
 
				+Generally, in-process knowledge is available only to the process
			
 
				+itself; thus, unlike service-level organization of processes,
			
 
				+categorizing threads of a process requires active participation from
			
 
				+the application which owns the target process.
			
 
				+
			
 
				+cgroup v1 had an ambiguously defined delegation model which got abused
			
 
				+in combination with thread granularity.  cgroups were delegated to
			
 
				+individual applications so that they can create and manage their own
			
 
				+sub-hierarchies and control resource distributions along them.  This
			
 
				+effectively raised cgroup to the status of a syscall-like API exposed
			
 
				+to lay programs.
			
 
				+
			
 
				+First of all, cgroup has a fundamentally inadequate interface to be
			
 
				+exposed this way.  For a process to access its own knobs, it has to
			
 
				+extract the path on the target hierarchy from /proc/self/cgroup,
			
 
				+construct the path by appending the name of the knob to the path, open
			
 
				+and then read and/or write to it.  This is not only extremely clunky
			
 
				+and unusual but also inherently racy.  There is no conventional way to
			
 
				+define transaction across the required steps and nothing can guarantee
			
 
				+that the process would actually be operating on its own sub-hierarchy.
			
 
				+
			
 
				+cgroup controllers implemented a number of knobs which would never be
			
 
				+accepted as public APIs because they were just adding control knobs to
			
 
				+system-management pseudo filesystem.  cgroup ended up with interface
			
 
				+knobs which were not properly abstracted or refined and directly
			
 
				+revealed kernel internal details.  These knobs got exposed to
			
 
				+individual applications through the ill-defined delegation mechanism
			
 
				+effectively abusing cgroup as a shortcut to implementing public APIs
			
 
				+without going through the required scrutiny.
			
 
				+
			
 
				+This was painful for both userland and kernel.  Userland ended up with
			
 
				+misbehaving and poorly abstracted interfaces and kernel exposing and
			
 
				+locked into constructs inadvertently.
			
 
				+
			
 
				+
			
 
				+Competition Between Inner Nodes and Threads
			
 
				+-------------------------------------------
			
 
				+
			
 
				+cgroup v1 allowed threads to be in any cgroups which created an
			
 
				+interesting problem where threads belonging to a parent cgroup and its
			
 
				+children cgroups competed for resources.  This was nasty as two
			
 
				+different types of entities competed and there was no obvious way to
			
 
				+settle it.  Different controllers did different things.
			
 
				+
			
 
				+The cpu controller considered threads and cgroups as equivalents and
			
 
				+mapped nice levels to cgroup weights.  This worked for some cases but
			
 
				+fell flat when children wanted to be allocated specific ratios of CPU
			
 
				+cycles and the number of internal threads fluctuated - the ratios
			
 
				+constantly changed as the number of competing entities fluctuated.
			
 
				+There also were other issues.  The mapping from nice level to weight
			
 
				+wasn't obvious or universal, and there were various other knobs which
			
 
				+simply weren't available for threads.
			
 
				+
			
 
				+The io controller implicitly created a hidden leaf node for each
			
 
				+cgroup to host the threads.  The hidden leaf had its own copies of all
			
 
				+the knobs with ``leaf_`` prefixed.  While this allowed equivalent
			
 
				+control over internal threads, it was with serious drawbacks.  It
			
 
				+always added an extra layer of nesting which wouldn't be necessary
			
 
				+otherwise, made the interface messy and significantly complicated the
			
 
				+implementation.
			
 
				+
			
 
				+The memory controller didn't have a way to control what happened
			
 
				+between internal tasks and child cgroups and the behavior was not
			
 
				+clearly defined.  There were attempts to add ad-hoc behaviors and
			
 
				+knobs to tailor the behavior to specific workloads which would have
			
 
				+led to problems extremely difficult to resolve in the long term.
			
 
				+
			
 
				+Multiple controllers struggled with internal tasks and came up with
			
 
				+different ways to deal with it; unfortunately, all the approaches were
			
 
				+severely flawed and, furthermore, the widely different behaviors
			
 
				+made cgroup as a whole highly inconsistent.
			
 
				+
			
 
				+This clearly is a problem which needs to be addressed from cgroup core
			
 
				+in a uniform way.
			
 
				+
			
 
				+
			
 
				+Other Interface Issues
			
 
				+----------------------
			
 
				+
			
 
				+cgroup v1 grew without oversight and developed a large number of
			
 
				+idiosyncrasies and inconsistencies.  One issue on the cgroup core side
			
 
				+was how an empty cgroup was notified - a userland helper binary was
			
 
				+forked and executed for each event.  The event delivery wasn't
			
 
				+recursive or delegatable.  The limitations of the mechanism also led
			
 
				+to in-kernel event delivery filtering mechanism further complicating
			
 
				+the interface.
			
 
				+
			
 
				+Controller interfaces were problematic too.  An extreme example is
			
 
				+controllers completely ignoring hierarchical organization and treating
			
 
				+all cgroups as if they were all located directly under the root
			
 
				+cgroup.  Some controllers exposed a large amount of inconsistent
			
 
				+implementation details to userland.
			
 
				+
			
 
				+There also was no consistency across controllers.  When a new cgroup
			
 
				+was created, some controllers defaulted to not imposing extra
			
 
				+restrictions while others disallowed any resource usage until
			
 
				+explicitly configured.  Configuration knobs for the same type of
			
 
				+control used widely differing naming schemes and formats.  Statistics
			
 
				+and information knobs were named arbitrarily and used different
			
 
				+formats and units even in the same controller.
			
 
				+
			
 
				+cgroup v2 establishes common conventions where appropriate and updates
			
 
				+controllers so that they expose minimal and consistent interfaces.
			
 
				+
			
 
				+
			
 
				+Controller Issues and Remedies
			
 
				+------------------------------
			
 
				+
			
 
				+Memory
			
 
				+~~~~~~
			
 
				+
			
 
				+The original lower boundary, the soft limit, is defined as a limit
			
 
				+that is per default unset.  As a result, the set of cgroups that
			
 
				+global reclaim prefers is opt-in, rather than opt-out.  The costs for
			
 
				+optimizing these mostly negative lookups are so high that the
			
 
				+implementation, despite its enormous size, does not even provide the
			
 
				+basic desirable behavior.  First off, the soft limit has no
			
 
				+hierarchical meaning.  All configured groups are organized in a global
			
 
				+rbtree and treated like equal peers, regardless where they are located
			
 
				+in the hierarchy.  This makes subtree delegation impossible.  Second,
			
 
				+the soft limit reclaim pass is so aggressive that it not just
			
 
				+introduces high allocation latencies into the system, but also impacts
			
 
				+system performance due to overreclaim, to the point where the feature
			
 
				+becomes self-defeating.
			
 
				+
			
 
				+The memory.low boundary on the other hand is a top-down allocated
			
 
				+reserve.  A cgroup enjoys reclaim protection when it's within its low,
			
 
				+which makes delegation of subtrees possible.
			
 
				+
			
 
				+The original high boundary, the hard limit, is defined as a strict
			
 
				+limit that can not budge, even if the OOM killer has to be called.
			
 
				+But this generally goes against the goal of making the most out of the
			
 
				+available memory.  The memory consumption of workloads varies during
			
 
				+runtime, and that requires users to overcommit.  But doing that with a
			
 
				+strict upper limit requires either a fairly accurate prediction of the
			
 
				+working set size or adding slack to the limit.  Since working set size
			
 
				+estimation is hard and error prone, and getting it wrong results in
			
 
				+OOM kills, most users tend to err on the side of a looser limit and
			
 
				+end up wasting precious resources.
			
 
				+
			
 
				+The memory.high boundary on the other hand can be set much more
			
 
				+conservatively.  When hit, it throttles allocations by forcing them
			
 
				+into direct reclaim to work off the excess, but it never invokes the
			
 
				+OOM killer.  As a result, a high boundary that is chosen too
			
 
				+aggressively will not terminate the processes, but instead it will
			
 
				+lead to gradual performance degradation.  The user can monitor this
			
 
				+and make corrections until the minimal memory footprint that still
			
 
				+gives acceptable performance is found.
			
 
				+
			
 
				+In extreme cases, with many concurrent allocations and a complete
			
 
				+breakdown of reclaim progress within the group, the high boundary can
			
 
				+be exceeded.  But even then it's mostly better to satisfy the
			
 
				+allocation from the slack available in other groups or the rest of the
			
 
				+system than killing the group.  Otherwise, memory.max is there to
			
 
				+limit this type of spillover and ultimately contain buggy or even
			
 
				+malicious applications.
			
 
				+
			
 
				+Setting the original memory.limit_in_bytes below the current usage was
			
 
				+subject to a race condition, where concurrent charges could cause the
			
 
				+limit setting to fail. memory.max on the other hand will first set the
			
 
				+limit to prevent new charges, and then reclaim and OOM kill until the
			
 
				+new limit is met - or the task writing to memory.max is killed.
			
 
				+
			
 
				+The combined memory+swap accounting and limiting is replaced by real
			
 
				+control over swap space.
			
 
				+
			
 
				+The main argument for a combined memory+swap facility in the original
			
 
				+cgroup design was that global or parental pressure would always be
			
 
				+able to swap all anonymous memory of a child group, regardless of the
			
 
				+child's own (possibly untrusted) configuration.  However, untrusted
			
 
				+groups can sabotage swapping by other means - such as referencing its
			
 
				+anonymous memory in a tight loop - and an admin can not assume full
			
 
				+swappability when overcommitting untrusted jobs.
			
 
				+
			
 
				+For trusted jobs, on the other hand, a combined counter is not an
			
 
				+intuitive userspace interface, and it flies in the face of the idea
			
 
				+that cgroup controllers should account and limit specific physical
			
 
				+resources.  Swap space is a resource like all others in the system,
			
 
				+and that's why unified hierarchy allows distributing it separately.
			
--- a/Documentation/admin-guide/index.rst
+++ b/Documentation/admin-guide/index.rst
@@ -48,6 +48,7 @@ configure specific aspects of kernel behavior to your liking.
 
				    :maxdepth: 1
			
 
				 
			
 
				    initrd
			
 
				+   cgroup-v2
			
 
				    serial-console
			
 
				    braille-console
			
 
				    parport
			
@@ -60,9 +61,11 @@ configure specific aspects of kernel behavior to your liking.
 
				    mono
			
 
				    java
			
 
				    ras
			
 
				+   bcache
			
 
				    pm/index
			
 
				    thunderbolt
			
 
				    LSM/index
			
 
				+   mm/index
			
 
				 
			
 
				 .. only::  subproject and html
			
 
				 
			
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -106,11 +106,11 @@
 
				 			use by PCI
			
 
				 			Format: <irq>,<irq>...
			
 
				 
			
 
				-	acpi_mask_gpe=  [HW,ACPI]
			
 
				+	acpi_mask_gpe=	[HW,ACPI]
			
 
				 			Due to the existence of _Lxx/_Exx, some GPEs triggered
			
 
				 			by unsupported hardware/firmware features can result in
			
 
				-                        GPE floodings that cannot be automatically disabled by
			
 
				-                        the GPE dispatcher.
			
 
				+			GPE floodings that cannot be automatically disabled by
			
 
				+			the GPE dispatcher.
			
 
				 			This facility can be used to prevent such uncontrolled
			
 
				 			GPE floodings.
			
 
				 			Format: <int>
			
@@ -256,7 +256,7 @@
 
				 				(may crash computer or cause data corruption)
			
 
				 
			
 
				 	ALSA		[HW,ALSA]
			
 
				-			See Documentation/sound/alsa/alsa-parameters.txt
			
 
				+			See Documentation/sound/alsa-configuration.rst
			
 
				 
			
 
				 	alignment=	[KNL,ARM]
			
 
				 			Allow the default userspace alignment fault handler
			
@@ -472,10 +472,10 @@
 
				 			for platform specific values (SB1, Loongson3 and
			
 
				 			others).
			
 
				 
			
 
				-	ccw_timeout_log [S390]
			
 
				+	ccw_timeout_log	[S390]
			
 
				 			See Documentation/s390/CommonIO for details.
			
 
				 
			
 
				-	cgroup_disable= [KNL] Disable a particular controller
			
 
				+	cgroup_disable=	[KNL] Disable a particular controller
			
 
				 			Format: {name of the controller(s) to disable}
			
 
				 			The effects of cgroup_disable=foo are:
			
 
				 			- foo isn't auto-mounted if you mount all cgroups in
			
@@ -518,7 +518,7 @@
 
				 			those clocks in any way. This parameter is useful for
			
 
				 			debug and development, but should not be needed on a
			
 
				 			platform with proper driver support.  For more
			
 
				-			information, see Documentation/clk.txt.
			
 
				+			information, see Documentation/driver-api/clk.rst.
			
 
				 
			
 
				 	clock=		[BUGS=X86-32, HW] gettimeofday clocksource override.
			
 
				 			[Deprecated]
			
@@ -587,11 +587,6 @@
 
				 			Sets the size of memory pool for coherent, atomic dma
			
 
				 			allocations, by default set to 256K.
			
 
				 
			
 
				-	code_bytes	[X86] How many bytes of object code to print
			
 
				-			in an oops report.
			
 
				-			Range: 0 - 8192
			
 
				-			Default: 64
			
 
				-
			
 
				 	com20020=	[HW,NET] ARCnet - COM20020 chipset
			
 
				 			Format:
			
 
				 			<io>[,<irq>[,<nodeID>[,<backplane>[,<ckp>[,<timeout>]]]]]
			
@@ -641,8 +636,8 @@
 
				 		hvc<n>	Use the hypervisor console device <n>. This is for
			
 
				 			both Xen and PowerPC hypervisors.
			
 
				 
			
 
				-                If the device connected to the port is not a TTY but a braille
			
 
				-                device, prepend "brl," before the device type, for instance
			
 
				+		If the device connected to the port is not a TTY but a braille
			
 
				+		device, prepend "brl," before the device type, for instance
			
 
				 			console=brl,ttyS0
			
 
				 		For now, only VisioBraille is supported.
			
 
				 
			
@@ -662,7 +657,7 @@
 
				 
			
 
				 	consoleblank=	[KNL] The console blank (screen saver) timeout in
			
 
				 			seconds. A value of 0 disables the blank timer.
			
 
				-                       Defaults to 0.
			
 
				+			Defaults to 0.
			
 
				 
			
 
				 	coredump_filter=
			
 
				 			[KNL] Change the default value for
			
@@ -730,7 +725,7 @@
 
				 			or memory reserved is below 4G.
			
 
				 
			
 
				 	cryptomgr.notests
			
 
				-                        [KNL] Disable crypto self-tests
			
 
				+			[KNL] Disable crypto self-tests
			
 
				 
			
 
				 	cs89x0_dma=	[HW,NET]
			
 
				 			Format: <dma>
			
@@ -746,7 +741,7 @@
 
				 			Format: <port#>,<type>
			
 
				 			See also Documentation/input/devices/joystick-parport.rst
			
 
				 
			
 
				-	ddebug_query=   [KNL,DYNAMIC_DEBUG] Enable debug messages at early boot
			
 
				+	ddebug_query=	[KNL,DYNAMIC_DEBUG] Enable debug messages at early boot
			
 
				 			time. See
			
 
				 			Documentation/admin-guide/dynamic-debug-howto.rst for
			
 
				 			details.  Deprecated, see dyndbg.
			
@@ -833,7 +828,7 @@
 
				 			causing system reset or hang due to sending
			
 
				 			INIT from AP to BSP.
			
 
				 
			
 
				-	disable_ddw     [PPC/PSERIES]
			
 
				+	disable_ddw	[PPC/PSERIES]
			
 
				 			Disable Dynamic DMA Window support. Use this if
			
 
				 			to workaround buggy firmware.
			
 
				 
			
@@ -1025,6 +1020,12 @@
 
				 			address. The serial port must already be setup
			
 
				 			and configured. Options are not yet supported.
			
 
				 
			
 
				+		qcom_geni,<addr>
			
 
				+			Start an early, polled-mode console on a Qualcomm
			
 
				+			Generic Interface (GENI) based serial port at the
			
 
				+			specified address. The serial port must already be
			
 
				+			set up and configured. Options are not yet supported.
			
 
				+
			
 
				 	earlyprintk=	[X86,SH,ARM,M68k,S390]
			
 
				 			earlyprintk=vga
			
 
				 			earlyprintk=efi
			
@@ -1188,7 +1189,7 @@
 
				 			parameter will force ia64_sal_cache_flush to call
			
 
				 			ia64_pal_cache_flush instead of SAL_CACHE_FLUSH.
			
 
				 
			
 
				-	forcepae [X86-32]
			
 
				+	forcepae	[X86-32]
			
 
				 			Forcefully enable Physical Address Extension (PAE).
			
 
				 			Many Pentium M systems disable PAE but may have a
			
 
				 			functionally usable PAE implementation.
			
@@ -1247,7 +1248,7 @@
 
				 
			
 
				 	gamma=		[HW,DRM]
			
 
				 
			
 
				-	gart_fix_e820=  [X86_64] disable the fix e820 for K8 GART
			
 
				+	gart_fix_e820=	[X86_64] disable the fix e820 for K8 GART
			
 
				 			Format: off | on
			
 
				 			default: on
			
 
				 
			
@@ -1341,23 +1342,32 @@
 
				 			x86-64 are 2M (when the CPU supports "pse") and 1G
			
 
				 			(when the CPU supports the "pdpe1gb" cpuinfo flag).
			
 
				 
			
 
				-	hvc_iucv=	[S390] Number of z/VM IUCV hypervisor console (HVC)
			
 
				-			       terminal devices. Valid values: 0..8
			
 
				-	hvc_iucv_allow=	[S390] Comma-separated list of z/VM user IDs.
			
 
				-			       If specified, z/VM IUCV HVC accepts connections
			
 
				-			       from listed z/VM user IDs only.
			
 
				+	hung_task_panic=
			
 
				+			[KNL] Should the hung task detector generate panics.
			
 
				+			Format: <integer>
			
 
				 
			
 
				+			A nonzero value instructs the kernel to panic when a
			
 
				+			hung task is detected. The default value is controlled
			
 
				+			by the CONFIG_BOOTPARAM_HUNG_TASK_PANIC build-time
			
 
				+			option. The value selected by this boot parameter can
			
 
				+			be changed later by the kernel.hung_task_panic sysctl.
			
 
				+
			
 
				+	hvc_iucv=	[S390]	Number of z/VM IUCV hypervisor console (HVC)
			
 
				+				terminal devices. Valid values: 0..8
			
 
				+	hvc_iucv_allow=	[S390]	Comma-separated list of z/VM user IDs.
			
 
				+				If specified, z/VM IUCV HVC accepts connections
			
 
				+				from listed z/VM user IDs only.
			
 
				 	keep_bootcon	[KNL]
			
 
				 			Do not unregister boot console at start. This is only
			
 
				 			useful for debugging when something happens in the window
			
 
				 			between unregistering the boot console and initializing
			
 
				 			the real console.
			
 
				 
			
 
				-	i2c_bus=	[HW] Override the default board specific I2C bus speed
			
 
				-			     or register an additional I2C bus that is not
			
 
				-			     registered from board initialization code.
			
 
				-			     Format:
			
 
				-			     <bus_id>,<clkrate>
			
 
				+	i2c_bus=	[HW]	Override the default board specific I2C bus speed
			
 
				+				or register an additional I2C bus that is not
			
 
				+				registered from board initialization code.
			
 
				+				Format:
			
 
				+				<bus_id>,<clkrate>
			
 
				 
			
 
				 	i8042.debug	[HW] Toggle i8042 debug mode
			
 
				 	i8042.unmask_kbd_data
			
@@ -1386,7 +1396,7 @@
 
				 			Default: only on s2r transitions on x86; most other
			
 
				 			architectures force reset to be always executed
			
 
				 	i8042.unlock	[HW] Unlock (ignore) the keylock
			
 
				-	i8042.kbdreset  [HW] Reset device connected to KBD port
			
 
				+	i8042.kbdreset	[HW] Reset device connected to KBD port
			
 
				 
			
 
				 	i810=		[HW,DRM]
			
 
				 
			
@@ -1548,13 +1558,13 @@
 
				 			programs exec'd, files mmap'd for exec, and all files
			
 
				 			opened for read by uid=0.
			
 
				 
			
 
				-	ima_template=   [IMA]
			
 
				+	ima_template=	[IMA]
			
 
				 			Select one of defined IMA measurements template formats.
			
 
				 			Formats: { "ima" | "ima-ng" | "ima-sig" }
			
 
				 			Default: "ima-ng"
			
 
				 
			
 
				 	ima_template_fmt=
			
 
				-	                [IMA] Define a custom template format.
			
 
				+			[IMA] Define a custom template format.
			
 
				 			Format: { "field1|...|fieldN" }
			
 
				 
			
 
				 	ima.ahash_minsize= [IMA] Minimum file size for asynchronous hash usage
			
@@ -1597,7 +1607,7 @@
 
				 	inport.irq=	[HW] Inport (ATI XL and Microsoft) busmouse driver
			
 
				 			Format: <irq>
			
 
				 
			
 
				-	int_pln_enable  [x86] Enable power limit notification interrupt
			
 
				+	int_pln_enable	[x86] Enable power limit notification interrupt
			
 
				 
			
 
				 	integrity_audit=[IMA]
			
 
				 			Format: { "0" | "1" }
			
@@ -1650,39 +1660,39 @@
 
				 			0	disables intel_idle and fall back on acpi_idle.
			
 
				 			1 to 9	specify maximum depth of C-state.
			
 
				 
			
 
				-	intel_pstate=  [X86]
			
 
				-		       disable
			
 
				-		         Do not enable intel_pstate as the default
			
 
				-		         scaling driver for the supported processors
			
 
				-		       passive
			
 
				-			 Use intel_pstate as a scaling driver, but configure it
			
 
				-			 to work with generic cpufreq governors (instead of
			
 
				-			 enabling its internal governor).  This mode cannot be
			
 
				-			 used along with the hardware-managed P-states (HWP)
			
 
				-			 feature.
			
 
				-		       force
			
 
				-			 Enable intel_pstate on systems that prohibit it by default
			
 
				-			 in favor of acpi-cpufreq. Forcing the intel_pstate driver
			
 
				-			 instead of acpi-cpufreq may disable platform features, such
			
 
				-			 as thermal controls and power capping, that rely on ACPI
			
 
				-			 P-States information being indicated to OSPM and therefore
			
 
				-			 should be used with caution. This option does not work with
			
 
				-			 processors that aren't supported by the intel_pstate driver
			
 
				-			 or on platforms that use pcc-cpufreq instead of acpi-cpufreq.
			
 
				-		       no_hwp
			
 
				-		         Do not enable hardware P state control (HWP)
			
 
				-			 if available.
			
 
				-		hwp_only
			
 
				-			Only load intel_pstate on systems which support
			
 
				-			hardware P state control (HWP) if available.
			
 
				-		support_acpi_ppc
			
 
				-			Enforce ACPI _PPC performance limits. If the Fixed ACPI
			
 
				-			Description Table, specifies preferred power management
			
 
				-			profile as "Enterprise Server" or "Performance Server",
			
 
				-			then this feature is turned on by default.
			
 
				-		per_cpu_perf_limits
			
 
				-			Allow per-logical-CPU P-State performance control limits using
			
 
				-			cpufreq sysfs interface
			
 
				+	intel_pstate=	[X86]
			
 
				+			disable
			
 
				+			  Do not enable intel_pstate as the default
			
 
				+			  scaling driver for the supported processors
			
 
				+			passive
			
 
				+			  Use intel_pstate as a scaling driver, but configure it
			
 
				+			  to work with generic cpufreq governors (instead of
			
 
				+			  enabling its internal governor).  This mode cannot be
			
 
				+			  used along with the hardware-managed P-states (HWP)
			
 
				+			  feature.
			
 
				+			force
			
 
				+			  Enable intel_pstate on systems that prohibit it by default
			
 
				+			  in favor of acpi-cpufreq. Forcing the intel_pstate driver
			
 
				+			  instead of acpi-cpufreq may disable platform features, such
			
 
				+			  as thermal controls and power capping, that rely on ACPI
			
 
				+			  P-States information being indicated to OSPM and therefore
			
 
				+			  should be used with caution. This option does not work with
			
 
				+			  processors that aren't supported by the intel_pstate driver
			
 
				+			  or on platforms that use pcc-cpufreq instead of acpi-cpufreq.
			
 
				+			no_hwp
			
 
				+			  Do not enable hardware P state control (HWP)
			
 
				+			  if available.
			
 
				+			hwp_only
			
 
				+			  Only load intel_pstate on systems which support
			
 
				+			  hardware P state control (HWP) if available.
			
 
				+			support_acpi_ppc
			
 
				+			  Enforce ACPI _PPC performance limits. If the Fixed ACPI
			
 
				+			  Description Table specifies preferred power management
			
 
				+			  profile as "Enterprise Server" or "Performance Server",
			
 
				+			  then this feature is turned on by default.
			
 
				+			per_cpu_perf_limits
			
 
				+			  Allow per-logical-CPU P-State performance control limits using
			
 
				+			  cpufreq sysfs interface
			
 
				 
			
 
				 	intremap=	[X86-64, Intel-IOMMU]
			
 
				 			on	enable Interrupt Remapping (default)
			
@@ -1705,7 +1715,6 @@
 
				 		nopanic
			
 
				 		merge
			
 
				 		nomerge
			
 
				-		forcesac
			
 
				 		soft
			
 
				 		pt		[x86, IA-64]
			
 
				 		nobypass	[PPC/POWERNV]
			
@@ -2027,7 +2036,7 @@
 
				 			* [no]ncqtrim: Turn off queued DSM TRIM.
			
 
				 
			
 
				 			* nohrst, nosrst, norst: suppress hard, soft
			
 
				-                          and both resets.
			
 
				+			  and both resets.
			
 
				 
			
 
				 			* rstonce: only attempt one reset during
			
 
				 			  hot-unplug link recovery
			
@@ -2215,7 +2224,7 @@
 
				 			[KNL,SH] Allow user to override the default size for
			
 
				 			per-device physically contiguous DMA buffers.
			
 
				 
			
 
				-        memhp_default_state=online/offline
			
 
				+	memhp_default_state=online/offline
			
 
				 			[KNL] Set the initial state for the memory hotplug
			
 
				 			onlining policy. If not specified, the default value is
			
 
				 			set according to the
			
@@ -2600,6 +2609,9 @@
 
				 			emulation library even if a 387 maths coprocessor
			
 
				 			is present.
			
 
				 
			
 
				+	no5lvl		[X86-64] Disable 5-level paging mode. Forces
			
 
				+			kernel to use 4-level paging instead.
			
 
				+
			
 
				 	no_console_suspend
			
 
				 			[HW] Never suspend the console
			
 
				 			Disable suspending of consoles during suspend and
			
@@ -2765,7 +2777,7 @@
 
				 			[X86,PV_OPS] Disable paravirtualized VMware scheduler
			
 
				 			clock and use the default one.
			
 
				 
			
 
				-	no-steal-acc    [X86,KVM] Disable paravirtualized steal time accounting.
			
 
				+	no-steal-acc	[X86,KVM] Disable paravirtualized steal time accounting.
			
 
				 			steal time is computed, but won't influence scheduler
			
 
				 			behaviour
			
 
				 
			
@@ -2826,7 +2838,7 @@
 
				 	notsc		[BUGS=X86-32] Disable Time Stamp Counter
			
 
				 
			
 
				 	nowatchdog	[KNL] Disable both lockup detectors, i.e.
			
 
				-                        soft-lockup and NMI watchdog (hard-lockup).
			
 
				+			soft-lockup and NMI watchdog (hard-lockup).
			
 
				 
			
 
				 	nowb		[ARM]
			
 
				 
			
@@ -2846,7 +2858,7 @@
 
				 			If the dependencies are under your control, you can
			
 
				 			turn on cpu0_hotplug.
			
 
				 
			
 
				-	nps_mtm_hs_ctr= [KNL,ARC]
			
 
				+	nps_mtm_hs_ctr=	[KNL,ARC]
			
 
				 			This parameter sets the maximum duration, in
			
 
				 			cycles, each HW thread of the CTOP can run
			
 
				 			without interruptions, before HW switches it.
			
@@ -2914,9 +2926,6 @@
 
				 			This will also cause panics on machine check exceptions.
			
 
				 			Useful together with panic=30 to trigger a reboot.
			
 
				 
			
 
				-	OSS		[HW,OSS]
			
 
				-			See Documentation/sound/oss/oss-parameters.txt
			
 
				-
			
 
				 	page_owner=	[KNL] Boot-time page_owner enabling option.
			
 
				 			Storage of the information about who allocated
			
 
				 			each page is disabled in default. With this switch,
			
@@ -2987,7 +2996,7 @@
 
				 
			
 
				 	pci=option[,option...]	[PCI] various PCI subsystem options:
			
 
				 		earlydump	[X86] dump PCI config space before the kernel
			
 
				-			        changes anything
			
 
				+				changes anything
			
 
				 		off		[X86] don't probe for the PCI bus
			
 
				 		bios		[X86-32] force use of PCI BIOS, don't access
			
 
				 				the hardware directly. Use this if your machine
			
@@ -3075,7 +3084,7 @@
 
				 				is enabled by default.  If you need to use this,
			
 
				 				please report a bug.
			
 
				 		nocrs		[X86] Ignore PCI host bridge windows from ACPI.
			
 
				-			        If you need to use this, please report a bug.
			
 
				+				If you need to use this, please report a bug.
			
 
				 		routeirq	Do IRQ routing for all PCI devices.
			
 
				 				This is normally done in pci_enable_device(),
			
 
				 				so this option is a temporary workaround
			
@@ -3150,6 +3159,8 @@
 
				 				on: Turn realloc on
			
 
				 		realloc		same as realloc=on
			
 
				 		noari		do not use PCIe ARI.
			
 
				+		noats		[PCIE, Intel-IOMMU, AMD-IOMMU]
			
 
				+				do not use PCIe ATS (and IOMMU device IOTLB).
			
 
				 		pcie_scan_all	Scan all possible PCIe devices.  Otherwise we
			
 
				 				only look for one device below a PCIe downstream
			
 
				 				port.
			
@@ -3918,7 +3929,7 @@
 
				 			cache (risks via metadata attacks are mostly
			
 
				 			unchanged). Debug options disable merging on their
			
 
				 			own.
			
 
				-			For more information see Documentation/vm/slub.txt.
			
 
				+			For more information see Documentation/vm/slub.rst.
			
 
				 
			
 
				 	slab_max_order=	[MM, SLAB]
			
 
				 			Determines the maximum allowed order for slabs.
			
@@ -3932,7 +3943,7 @@
 
				 			slub_debug can create guard zones around objects and
			
 
				 			may poison objects when not in use. Also tracks the
			
 
				 			last alloc / free. For more information see
			
 
				-			Documentation/vm/slub.txt.
			
 
				+			Documentation/vm/slub.rst.
			
 
				 
			
 
				 	slub_memcg_sysfs=	[MM, SLUB]
			
 
				 			Determines whether to enable sysfs directories for
			
@@ -3946,7 +3957,7 @@
 
				 			Determines the maximum allowed order for slabs.
			
 
				 			A high setting may cause OOMs due to memory
			
 
				 			fragmentation. For more information see
			
 
				-			Documentation/vm/slub.txt.
			
 
				+			Documentation/vm/slub.rst.
			
 
				 
			
 
				 	slub_min_objects=	[MM, SLUB]
			
 
				 			The minimum number of objects per slab. SLUB will
			
@@ -3955,12 +3966,12 @@
 
				 			the number of objects indicated. The higher the number
			
 
				 			of objects the smaller the overhead of tracking slabs
			
 
				 			and the less frequently locks need to be acquired.
			
 
				-			For more information see Documentation/vm/slub.txt.
			
 
				+			For more information see Documentation/vm/slub.rst.
			
 
				 
			
 
				 	slub_min_order=	[MM, SLUB]
			
 
				 			Determines the minimum page order for slabs. Must be
			
 
				 			lower than slub_max_order.
			
 
				-			For more information see Documentation/vm/slub.txt.
			
 
				+			For more information see Documentation/vm/slub.rst.
			
 
				 
			
 
				 	slub_nomerge	[MM, SLUB]
			
 
				 			Same with slab_nomerge. This is supported for legacy.
			
@@ -4092,6 +4103,23 @@
 
				 			expediting.  Set to zero to disable automatic
			
 
				 			expediting.
			
 
				 
			
 
				+	ssbd=		[ARM64,HW]
			
 
				+			Speculative Store Bypass Disable control
			
 
				+
			
 
				+			On CPUs that are vulnerable to the Speculative
			
 
				+			Store Bypass vulnerability and offer a
			
 
				+			firmware based mitigation, this parameter
			
 
				+			indicates how the mitigation should be used:
			
 
				+
			
 
				+			force-on:  Unconditionally enable mitigation for
			
 
				+				   both kernel and userspace
			
 
				+			force-off: Unconditionally disable mitigation for
			
 
				+				   both kernel and userspace
			
 
				+			kernel:    Always enable mitigation in the
			
 
				+				   kernel, and offer a prctl interface
			
 
				+				   to allow userspace to register its
			
 
				+				   interest in being mitigated too.
			
 
				+
			
 
				 	stack_guard_gap=	[MM]
			
 
				 			override the default stack gap protection. The value
			
 
				 			is in page units and it defines how many pages prior
			
@@ -4304,7 +4332,7 @@
 
				 			[FTRACE] Set and start specified trace events in order
			
 
				 			to facilitate early boot debugging. The event-list is a
			
 
				 			comma separated list of trace events to enable. See
			
 
				-			also Documentation/trace/events.txt
			
 
				+			also Documentation/trace/events.rst
			
 
				 
			
 
				 	trace_options=[option-list]
			
 
				 			[FTRACE] Enable or disable tracer options at boot.
			
@@ -4319,7 +4347,7 @@
 
				 
			
 
				 			      trace_options=stacktrace
			
 
				 
			
 
				-			See also Documentation/trace/ftrace.txt "trace options"
			
 
				+			See also Documentation/trace/ftrace.rst "trace options"
			
 
				 			section.
			
 
				 
			
 
				 	tp_printk[FTRACE]
			
@@ -4358,7 +4386,8 @@
 
				 			Format: [always|madvise|never]
			
 
				 			Can be used to control the default behavior of the system
			
 
				 			with respect to transparent hugepages.
			
 
				-			See Documentation/vm/transhuge.txt for more details.
			
 
				+			See Documentation/admin-guide/mm/transhuge.rst
			
 
				+			for more details.
			
 
				 
			
 
				 	tsc=		Disable clocksource stability checks for TSC.
			
 
				 			Format: <string>
			
@@ -4436,7 +4465,7 @@
 
				 
			
 
				 	usbcore.initial_descriptor_timeout=
			
 
				 			[USB] Specifies timeout for the initial 64-byte
			
 
				-                        USB_REQ_GET_DESCRIPTOR request in milliseconds
			
 
				+			USB_REQ_GET_DESCRIPTOR request in milliseconds
			
 
				 			(default 5000 = 5.0 seconds).
			
 
				 
			
 
				 	usbcore.nousb	[USB] Disable the USB subsystem
			
--- a/Documentation/admin-guide/mm/concepts.rst
+++ b/Documentation/admin-guide/mm/concepts.rst
@@ -0,0 +1,222 @@
 
				+.. _mm_concepts:
			
 
				+
			
 
				+=================
			
 
				+Concepts overview
			
 
				+=================
			
 
				+
			
 
				+The memory management in Linux is a complex system that evolved over the
			
 
				+years and included more and more functionality to support a variety of
			
 
				+systems from MMU-less microcontrollers to supercomputers. The memory
			
 
				+management for systems without MMU is called ``nommu`` and it
			
 
				+definitely deserves a dedicated document, which hopefully will be
			
 
				+eventually written. Yet, although some of the concepts are the same,
			
 
				+here we assume that MMU is available and CPU can translate a virtual
			
 
				+address to a physical address.
			
 
				+
			
 
				+.. contents:: :local:
			
 
				+
			
 
				+Virtual Memory Primer
			
 
				+=====================
			
 
				+
			
 
				+The physical memory in a computer system is a limited resource and
			
 
				+even for systems that support memory hotplug there is a hard limit on
			
 
				+the amount of memory that can be installed. The physical memory is not
			
 
				+necessarily contiguous; it might be accessible as a set of distinct
			
 
				+address ranges. Besides, different CPU architectures, and even
			
 
				+different implementations of the same architecture have different views of
			
 
				+how these address ranges are defined.
			
 
				+
			
 
				+All this makes dealing directly with physical memory quite complex and
			
 
				+to avoid this complexity a concept of virtual memory was developed.
			
 
				+
			
 
				+The virtual memory abstracts the details of physical memory from the
			
 
				+application software, allows to keep only needed information in the
			
 
				+physical memory (demand paging) and provides a mechanism for the
			
 
				+protection and controlled sharing of data between processes.
			
 
				+
			
 
				+With virtual memory, each and every memory access uses a virtual
			
 
				+address. When the CPU decodes an instruction that reads (or
			
 
				+writes) from (or to) the system memory, it translates the `virtual`
			
 
				+address encoded in that instruction to a `physical` address that the
			
 
				+memory controller can understand.
			
 
				+
			
 
				+The physical system memory is divided into page frames, or pages. The
			
 
				+size of each page is architecture specific. Some architectures allow
			
 
				+selection of the page size from several supported values; this
			
 
				+selection is performed at the kernel build time by setting an
			
 
				+appropriate kernel configuration option.
			
 
				+
			
 
				+Each physical memory page can be mapped as one or more virtual
			
 
				+pages. These mappings are described by page tables that allow
			
 
				+translation from virtual address used by programs to real address in
			
 
				+the physical memory. The page tables are organized hierarchically.
			
 
				+
			
 
				+The tables at the lowest level of the hierarchy contain physical
			
 
				+addresses of actual pages used by the software. The tables at higher
			
 
				+levels contain physical addresses of the pages belonging to the lower
			
 
				+levels. The pointer to the top level page table resides in a
			
 
				+register. When the CPU performs the address translation, it uses this
			
 
				+register to access the top level page table. The high bits of the
			
 
				+virtual address are used to index an entry in the top level page
			
 
				+table. That entry is then used to access the next level in the
			
 
				+hierarchy with the next bits of the virtual address as the index to
			
 
				+that level page table. The lowest bits in the virtual address define
			
 
				+the offset inside the actual page.
			
 
				+
			
 
				+Huge Pages
			
 
				+==========
			
 
				+
			
 
				+The address translation requires several memory accesses and memory
			
 
				+accesses are slow relative to CPU speed. To avoid spending precious
			
 
				+processor cycles on the address translation, CPUs maintain a cache of
			
 
				+such translations called Translation Lookaside Buffer (or
			
 
				+TLB). Usually the TLB is a pretty scarce resource and applications with
			
 
				+a large memory working set will experience a performance hit because of
			
 
				+TLB misses.
			
 
				+
			
 
				+Many modern CPU architectures allow mapping of the memory pages
			
 
				+directly by the higher levels in the page table. For instance, on x86,
			
 
				+it is possible to map 2M and even 1G pages using entries in the second
			
 
				+and the third level page tables. In Linux such pages are called
			
 
				+`huge`. Usage of huge pages significantly reduces pressure on TLB,
			
 
				+improves TLB hit-rate and thus improves overall system performance.
			
 
				+
			
 
				+There are two mechanisms in Linux that enable mapping of the physical
			
 
				+memory with the huge pages. The first one is `HugeTLB filesystem`, or
			
 
				+hugetlbfs. It is a pseudo filesystem that uses RAM as its backing
			
 
				+store. For the files created in this filesystem the data resides in
			
 
				+the memory and is mapped using huge pages. The hugetlbfs is described at
			
 
				+:ref:`Documentation/admin-guide/mm/hugetlbpage.rst <hugetlbpage>`.
			
 
				+
			
 
				+Another, more recent, mechanism that enables use of the huge pages is
			
 
				+called `Transparent HugePages`, or THP. Unlike the hugetlbfs that
			
 
				+requires users and/or system administrators to configure what parts of
			
 
				+the system memory should and can be mapped by the huge pages, THP
			
 
				+manages such mappings transparently to the user and hence the
			
 
				+name. See
			
 
				+:ref:`Documentation/admin-guide/mm/transhuge.rst <admin_guide_transhuge>`
			
 
				+for more details about THP.
			
 
				+
			
 
				+Zones
			
 
				+=====
			
 
				+
			
 
				+Often hardware poses restrictions on how different physical memory
			
 
				+ranges can be accessed. In some cases, devices cannot perform DMA to
			
 
				+all the addressable memory. In other cases, the size of the physical
			
 
				+memory exceeds the maximal addressable size of virtual memory and
			
 
				+special actions are required to access portions of the memory. Linux
			
 
				+groups memory pages into `zones` according to their possible
			
 
				+usage. For example, ZONE_DMA will contain memory that can be used by
			
 
				+devices for DMA, ZONE_HIGHMEM will contain memory that is not
			
 
				+permanently mapped into kernel's address space and ZONE_NORMAL will
			
 
				+contain normally addressed pages.
			
 
				+
			
 
				+The actual layout of the memory zones is hardware dependent as not all
			
 
				+architectures define all zones, and requirements for DMA are different
			
 
				+for different platforms.
			
 
				+
			
 
				+Nodes
			
 
				+=====
			
 
				+
			
 
				+Many multi-processor machines are NUMA - Non-Uniform Memory Access -
			
 
				+systems. In such systems the memory is arranged into banks that have
			
 
				+different access latency depending on the "distance" from the
			
 
				+processor. Each bank is referred to as a `node` and for each node Linux
			
 
				+constructs an independent memory management subsystem. A node has its
			
 
				+own set of zones, lists of free and used pages and various statistics
			
 
				+counters. You can find more details about NUMA in
			
 
				+:ref:`Documentation/vm/numa.rst <numa>` and in
			
 
				+:ref:`Documentation/admin-guide/mm/numa_memory_policy.rst <numa_memory_policy>`.
			
 
				+
			
 
				+Page cache
			
 
				+==========
			
 
				+
			
 
				+The physical memory is volatile and the common case for getting data
			
 
				+into the memory is to read it from files. Whenever a file is read, the
			
 
				+data is put into the `page cache` to avoid expensive disk access on
			
 
				+the subsequent reads. Similarly, when one writes to a file, the data
			
 
				+is placed in the page cache and eventually gets into the backing
			
 
				+storage device. The written pages are marked as `dirty` and when Linux
			
 
				+decides to reuse them for other purposes, it makes sure to synchronize
			
 
				+the file contents on the device with the updated data.
			
 
				+
			
 
				+Anonymous Memory
			
 
				+================
			
 
				+
			
 
				+The `anonymous memory` or `anonymous mappings` represent memory that
			
 
				+is not backed by a filesystem. Such mappings are implicitly created
			
 
				+for program's stack and heap or by explicit calls to mmap(2) system
			
 
				+call. Usually, the anonymous mappings only define virtual memory areas
			
 
				+that the program is allowed to access. The read accesses will result
			
 
				+in creation of a page table entry that references a special physical
			
 
				+page filled with zeroes. When the program performs a write, a regular
			
 
				+physical page will be allocated to hold the written data. The page
			
 
				+will be marked dirty and if the kernel decides to repurpose it,
			
 
				+the dirty page will be swapped out.
			
 
				+
			
 
				+Reclaim
			
 
				+=======
			
 
				+
			
 
				+Throughout the system lifetime, a physical page can be used for storing
			
 
				+different types of data. It can be kernel internal data structures,
			
 
				+DMA'able buffers for device drivers use, data read from a filesystem,
			
 
				+memory allocated by user space processes etc.
			
 
				+
			
 
				+Depending on the page usage it is treated differently by the Linux
			
 
				+memory management. The pages that can be freed at any time, either
			
 
				+because they cache the data available elsewhere, for instance, on a
			
 
				+hard disk, or because they can be swapped out, again, to the hard
			
 
				+disk, are called `reclaimable`. The most notable categories of the
			
 
				+reclaimable pages are page cache and anonymous memory.
			
 
				+
			
 
				+In most cases, the pages holding internal kernel data and used as DMA
			
 
				+buffers cannot be repurposed, and they remain pinned until freed by
			
 
				+their user. Such pages are called `unreclaimable`. However, in certain
			
 
				+circumstances, even pages occupied with kernel data structures can be
			
 
				+reclaimed. For instance, in-memory caches of filesystem metadata can
			
 
				+be re-read from the storage device and therefore it is possible to
			
 
				+discard them from the main memory when system is under memory
			
 
				+pressure.
			
 
				+
			
 
				+The process of freeing the reclaimable physical memory pages and
			
 
				+repurposing them is called (surprise!) `reclaim`. Linux can reclaim
			
 
				+pages either asynchronously or synchronously, depending on the state
			
 
				+of the system. When system is not loaded, most of the memory is free
			
 
				+and allocation request will be satisfied immediately from the free
			
 
				+pages supply. As the load increases, the amount of the free pages goes
			
 
				+down and when it reaches a certain threshold (low watermark), an
			
 
				+allocation request will awaken the ``kswapd`` daemon. It will
			
 
				+asynchronously scan memory pages and either just free them if the data
			
 
				+they contain is available elsewhere, or evict to the backing storage
			
 
				+device (remember those dirty pages?). As memory usage increases even
			
 
				+more and reaches another threshold - min watermark - an allocation
			
 
				+will trigger the `direct reclaim`. In this case allocation is stalled
			
 
				+until enough memory pages are reclaimed to satisfy the request.
			
 
				+
			
 
				+Compaction
			
 
				+==========
			
 
				+
			
 
				+As the system runs, tasks allocate and free the memory and it becomes
			
 
				+fragmented. Although with virtual memory it is possible to present
			
 
				+scattered physical pages as virtually contiguous range, sometimes it is
			
 
				+necessary to allocate large physically contiguous memory areas. Such
			
 
				+need may arise, for instance, when a device driver requires large
			
 
				+buffer for DMA, or when THP allocates a huge page. Memory `compaction`
			
 
				+addresses the fragmentation issue. This mechanism moves occupied pages
			
 
				+from the lower part of a memory zone to free pages in the upper part
			
 
				+of the zone. When a compaction scan is finished free pages are grouped
			
 
				+together at the beginning of the zone and allocations of large
			
 
				+physically contiguous areas become possible.
			
 
				+
			
 
				+Like reclaim, the compaction may happen asynchronously in ``kcompactd``
			
 
				+daemon or synchronously as a result of memory allocation request.
			
 
				+
			
 
				+OOM killer
			
 
				+==========
			
 
				+
			
 
				+It may happen, that on a loaded machine memory will be exhausted. When
			
 
				+the kernel detects that the system runs out of memory (OOM) it invokes
			
 
				+`OOM killer`. Its mission is simple: all it has to do is to select a
			
 
				+task to sacrifice for the sake of the overall system health. The
			
 
				+selected task is killed in a hope that after it exits enough memory
			
 
				+will be freed to continue normal operation.
			
--- a/Documentation/admin-guide/mm/hugetlbpage.rst
+++ b/Documentation/admin-guide/mm/hugetlbpage.rst
@@ -0,0 +1,382 @@
 
				+.. _hugetlbpage:
			
 
				+
			
 
				+=============
			
 
				+HugeTLB Pages
			
 
				+=============
			
 
				+
			
 
				+Overview
			
 
				+========
			
 
				+
			
 
				+The intent of this file is to give a brief summary of hugetlbpage support in
			
 
				+the Linux kernel.  This support is built on top of multiple page size support
			
 
				+that is provided by most modern architectures.  For example, x86 CPUs normally
			
 
				+support 4K and 2M (1G if architecturally supported) page sizes, ia64
			
 
				+architecture supports multiple page sizes 4K, 8K, 64K, 256K, 1M, 4M, 16M,
			
 
				+256M and ppc64 supports 4K and 16M.  A TLB is a cache of virtual-to-physical
			
 
				+translations.  Typically this is a very scarce resource on processor.
			
 
				+Operating systems try to make best use of limited number of TLB resources.
			
 
				+This optimization is more critical now as bigger and bigger physical memories
			
 
				+(several GBs) are more readily available.
			
 
				+
			
 
				+Users can use the huge page support in Linux kernel by either using the mmap
			
 
				+system call or standard SYSV shared memory system calls (shmget, shmat).
			
 
				+
			
 
				+First the Linux kernel needs to be built with the CONFIG_HUGETLBFS
			
 
				+(present under "File systems") and CONFIG_HUGETLB_PAGE (selected
			
 
				+automatically when CONFIG_HUGETLBFS is selected) configuration
			
 
				+options.
			
 
				+
			
 
				+The ``/proc/meminfo`` file provides information about the total number of
			
 
				+persistent hugetlb pages in the kernel's huge page pool.  It also displays
			
 
				+default huge page size and information about the number of free, reserved
			
 
				+and surplus huge pages in the pool of huge pages of default size.
			
 
				+The huge page size is needed for generating the proper alignment and
			
 
				+size of the arguments to system calls that map huge page regions.
			
 
				+
			
 
				+The output of ``cat /proc/meminfo`` will include lines like::
			
 
				+
			
 
				+	HugePages_Total: uuu
			
 
				+	HugePages_Free:  vvv
			
 
				+	HugePages_Rsvd:  www
			
 
				+	HugePages_Surp:  xxx
			
 
				+	Hugepagesize:    yyy kB
			
 
				+	Hugetlb:         zzz kB
			
 
				+
			
 
				+where:
			
 
				+
			
 
				+HugePages_Total
			
 
				+	is the size of the pool of huge pages.
			
 
				+HugePages_Free
			
 
				+	is the number of huge pages in the pool that are not yet
			
 
				+        allocated.
			
 
				+HugePages_Rsvd
			
 
				+	is short for "reserved," and is the number of huge pages for
			
 
				+        which a commitment to allocate from the pool has been made,
			
 
				+        but no allocation has yet been made.  Reserved huge pages
			
 
				+        guarantee that an application will be able to allocate a
			
 
				+        huge page from the pool of huge pages at fault time.
			
 
				+HugePages_Surp
			
 
				+	is short for "surplus," and is the number of huge pages in
			
 
				+        the pool above the value in ``/proc/sys/vm/nr_hugepages``. The
			
 
				+        maximum number of surplus huge pages is controlled by
			
 
				+        ``/proc/sys/vm/nr_overcommit_hugepages``.
			
 
				+Hugepagesize
			
 
				+	is the default hugepage size (in kB).
			
 
				+Hugetlb
			
 
				+        is the total amount of memory (in kB), consumed by huge
			
 
				+        pages of all sizes.
			
 
				+        If huge pages of different sizes are in use, this number
			
 
				+        will exceed HugePages_Total \* Hugepagesize. To get more
			
 
				+        detailed information, please, refer to
			
 
				+        ``/sys/kernel/mm/hugepages`` (described below).
			
 
				+
			
 
				+
			
 
				+``/proc/filesystems`` should also show a filesystem of type "hugetlbfs"
			
 
				+configured in the kernel.
			
 
				+
			
 
				+``/proc/sys/vm/nr_hugepages`` indicates the current number of "persistent" huge
			
 
				+pages in the kernel's huge page pool.  "Persistent" huge pages will be
			
 
				+returned to the huge page pool when freed by a task.  A user with root
			
 
				+privileges can dynamically allocate more or free some persistent huge pages
			
 
				+by increasing or decreasing the value of ``nr_hugepages``.
			
 
				+
			
 
				+Pages that are used as huge pages are reserved inside the kernel and cannot
			
 
				+be used for other purposes.  Huge pages cannot be swapped out under
			
 
				+memory pressure.
			
 
				+
			
 
				+Once a number of huge pages have been pre-allocated to the kernel huge page
			
 
				+pool, a user with appropriate privilege can use either the mmap system call
			
 
				+or shared memory system calls to use the huge pages.  See the discussion of
			
 
				+:ref:`Using Huge Pages <using_huge_pages>`, below.
			
 
				+
			
 
				+The administrator can allocate persistent huge pages on the kernel boot
			
 
				+command line by specifying the "hugepages=N" parameter, where 'N' = the
			
 
				+number of huge pages requested.  This is the most reliable method of
			
 
				+allocating huge pages as memory has not yet become fragmented.
			
 
				+
			
 
				+Some platforms support multiple huge page sizes.  To allocate huge pages
			
 
				+of a specific size, one must precede the huge pages boot command parameters
			
 
				+with a huge page size selection parameter "hugepagesz=<size>".  <size> must
			
 
				+be specified in bytes with optional scale suffix [kKmMgG].  The default huge
			
 
				+page size may be selected with the "default_hugepagesz=<size>" boot parameter.
			
 
				+
			
 
				+When multiple huge page sizes are supported, ``/proc/sys/vm/nr_hugepages``
			
 
				+indicates the current number of pre-allocated huge pages of the default size.
			
 
				+Thus, one can use the following command to dynamically allocate/deallocate
			
 
				+default sized persistent huge pages::
			
 
				+
			
 
				+	echo 20 > /proc/sys/vm/nr_hugepages
			
 
				+
			
 
				+This command will try to adjust the number of default sized huge pages in the
			
 
				+huge page pool to 20, allocating or freeing huge pages, as required.
			
 
				+
			
 
				+On a NUMA platform, the kernel will attempt to distribute the huge page pool
			
 
				+over all the set of allowed nodes specified by the NUMA memory policy of the
			
 
				+task that modifies ``nr_hugepages``. The default for the allowed nodes--when the
			
 
				+task has default memory policy--is all on-line nodes with memory.  Allowed
			
 
				+nodes with insufficient available, contiguous memory for a huge page will be
			
 
				+silently skipped when allocating persistent huge pages.  See the
			
 
				+:ref:`discussion below <mem_policy_and_hp_alloc>`
			
 
				+of the interaction of task memory policy, cpusets and per node attributes
			
 
				+with the allocation and freeing of persistent huge pages.
			
 
				+
			
 
				+The success or failure of huge page allocation depends on the amount of
			
 
				+physically contiguous memory that is present in system at the time of the
			
 
				+allocation attempt.  If the kernel is unable to allocate huge pages from
			
 
				+some nodes in a NUMA system, it will attempt to make up the difference by
			
 
				+allocating extra pages on other nodes with sufficient available contiguous
			
 
				+memory, if any.
			
 
				+
			
 
				+System administrators may want to put this command in one of the local rc
			
 
				+init files.  This will enable the kernel to allocate huge pages early in
			
 
				+the boot process when the possibility of getting physical contiguous pages
			
 
				+is still very high.  Administrators can verify the number of huge pages
			
 
				+actually allocated by checking the sysctl or meminfo.  To check the per node
			
 
				+distribution of huge pages in a NUMA system, use::
			
 
				+
			
 
				+	cat /sys/devices/system/node/node*/meminfo | fgrep Huge
			
 
				+
			
 
				+``/proc/sys/vm/nr_overcommit_hugepages`` specifies how large the pool of
			
 
				+huge pages can grow, if more huge pages than ``/proc/sys/vm/nr_hugepages`` are
			
 
				+requested by applications.  Writing any non-zero value into this file
			
 
				+indicates that the hugetlb subsystem is allowed to try to obtain that
			
 
				+number of "surplus" huge pages from the kernel's normal page pool, when the
			
 
				+persistent huge page pool is exhausted. As these surplus huge pages become
			
 
				+unused, they are freed back to the kernel's normal page pool.
			
 
				+
			
 
				+When increasing the huge page pool size via ``nr_hugepages``, any existing
			
 
				+surplus pages will first be promoted to persistent huge pages.  Then, additional
			
 
				+huge pages will be allocated, if necessary and if possible, to fulfill
			
 
				+the new persistent huge page pool size.
			
 
				+
			
 
				+The administrator may shrink the pool of persistent huge pages for
			
 
				+the default huge page size by setting the ``nr_hugepages`` sysctl to a
			
 
				+smaller value.  The kernel will attempt to balance the freeing of huge pages
			
 
				+across all nodes in the memory policy of the task modifying ``nr_hugepages``.
			
 
				+Any free huge pages on the selected nodes will be freed back to the kernel's
			
 
				+normal page pool.
			
 
				+
			
 
				+Caveat: Shrinking the persistent huge page pool via ``nr_hugepages`` such that
			
 
				+it becomes less than the number of huge pages in use will convert the balance
			
 
				+of the in-use huge pages to surplus huge pages.  This will occur even if
			
 
				+the number of surplus pages would exceed the overcommit value.  As long as
			
 
				+this condition holds--that is, until ``nr_hugepages+nr_overcommit_hugepages`` is
			
 
				+increased sufficiently, or the surplus huge pages go out of use and are freed--
			
 
				+no more surplus huge pages will be allowed to be allocated.
			
 
				+
			
 
				+With support for multiple huge page pools at run-time available, much of
			
 
				+the huge page userspace interface in ``/proc/sys/vm`` has been duplicated in
			
 
				+sysfs.
			
 
				+The ``/proc`` interfaces discussed above have been retained for backwards
			
 
				+compatibility. The root huge page control directory in sysfs is::
			
 
				+
			
 
				+	/sys/kernel/mm/hugepages
			
 
				+
			
 
				+For each huge page size supported by the running kernel, a subdirectory
			
 
				+will exist, of the form::
			
 
				+
			
 
				+	hugepages-${size}kB
			
 
				+
			
 
				+Inside each of these directories, the same set of files will exist::
			
 
				+
			
 
				+	nr_hugepages
			
 
				+	nr_hugepages_mempolicy
			
 
				+	nr_overcommit_hugepages
			
 
				+	free_hugepages
			
 
				+	resv_hugepages
			
 
				+	surplus_hugepages
			
 
				+
			
 
				+which function as described above for the default huge page-sized case.
			
 
				+
			
 
				+.. _mem_policy_and_hp_alloc:
			
 
				+
			
 
				+Interaction of Task Memory Policy with Huge Page Allocation/Freeing
			
 
				+===================================================================
			
 
				+
			
 
				+Whether huge pages are allocated and freed via the ``/proc`` interface or
			
 
				+the ``/sysfs`` interface using the ``nr_hugepages_mempolicy`` attribute, the
			
 
				+NUMA nodes from which huge pages are allocated or freed are controlled by the
			
 
				+NUMA memory policy of the task that modifies the ``nr_hugepages_mempolicy``
			
 
				+sysctl or attribute.  When the ``nr_hugepages`` attribute is used, mempolicy
			
 
				+is ignored.
			
 
				+
			
 
				+The recommended method to allocate or free huge pages to/from the kernel
			
 
				+huge page pool, using the ``nr_hugepages`` example above, is::
			
 
				+
			
 
				+    numactl --interleave <node-list> echo 20 \
			
 
				+				>/proc/sys/vm/nr_hugepages_mempolicy
			
 
				+
			
 
				+or, more succinctly::
			
 
				+
			
 
				+    numactl -m <node-list> echo 20 >/proc/sys/vm/nr_hugepages_mempolicy
			
 
				+
			
 
				+This will allocate or free ``abs(20 - nr_hugepages)`` to or from the nodes
			
 
				+specified in <node-list>, depending on whether number of persistent huge pages
			
 
				+is initially less than or greater than 20, respectively.  No huge pages will be
			
 
				+allocated nor freed on any node not included in the specified <node-list>.
			
 
				+
			
 
				+When adjusting the persistent hugepage count via ``nr_hugepages_mempolicy``, any
			
 
				+memory policy mode--bind, preferred, local or interleave--may be used.  The
			
 
				+resulting effect on persistent huge page allocation is as follows:
			
 
				+
			
 
				+#. Regardless of mempolicy mode [see
			
 
				+   :ref:`Documentation/admin-guide/mm/numa_memory_policy.rst <numa_memory_policy>`],
			
 
				+   persistent huge pages will be distributed across the node or nodes
			
 
				+   specified in the mempolicy as if "interleave" had been specified.
			
 
				+   However, if a node in the policy does not contain sufficient contiguous
			
 
				+   memory for a huge page, the allocation will not "fallback" to the nearest
			
 
				+   neighbor node with sufficient contiguous memory.  To do this would cause
			
 
				+   undesirable imbalance in the distribution of the huge page pool, or
			
 
				+   possibly, allocation of persistent huge pages on nodes not allowed by
			
 
				+   the task's memory policy.
			
 
				+
			
 
				+#. One or more nodes may be specified with the bind or interleave policy.
			
 
				+   If more than one node is specified with the preferred policy, only the
			
 
				+   lowest numeric id will be used.  Local policy will select the node where
			
 
				+   the task is running at the time the nodes_allowed mask is constructed.
			
 
				+   For local policy to be deterministic, the task must be bound to a cpu or
			
 
				+   cpus in a single node.  Otherwise, the task could be migrated to some
			
 
				+   other node at any time after launch and the resulting node will be
			
 
				+   indeterminate.  Thus, local policy is not very useful for this purpose.
			
 
				+   Any of the other mempolicy modes may be used to specify a single node.
			
 
				+
			
 
				+#. The nodes allowed mask will be derived from any non-default task mempolicy,
			
 
				+   whether this policy was set explicitly by the task itself or one of its
			
 
				+   ancestors, such as numactl.  This means that if the task is invoked from a
			
 
				+   shell with non-default policy, that policy will be used.  One can specify a
			
 
				+   node list of "all" with numactl --interleave or --membind [-m] to achieve
			
 
				+   interleaving over all nodes in the system or cpuset.
			
 
				+
			
 
				+#. Any task mempolicy specified--e.g., using numactl--will be constrained by
			
 
				+   the resource limits of any cpuset in which the task runs.  Thus, there will
			
 
				+   be no way for a task with non-default policy running in a cpuset with a
			
 
				+   subset of the system nodes to allocate huge pages outside the cpuset
			
 
				+   without first moving to a cpuset that contains all of the desired nodes.
			
 
				+
			
 
				+#. Boot-time huge page allocation attempts to distribute the requested number
			
 
				+   of huge pages over all on-line nodes with memory.
			
 
				+
			
 
				+Per Node Hugepages Attributes
			
 
				+=============================
			
 
				+
			
 
				+A subset of the contents of the root huge page control directory in sysfs,
			
 
				+described above, will be replicated under the system device of each
			
 
				+NUMA node with memory in::
			
 
				+
			
 
				+	/sys/devices/system/node/node[0-9]*/hugepages/
			
 
				+
			
 
				+Under this directory, the subdirectory for each supported huge page size
			
 
				+contains the following attribute files::
			
 
				+
			
 
				+	nr_hugepages
			
 
				+	free_hugepages
			
 
				+	surplus_hugepages
			
 
				+
			
 
				+The ``free_hugepages`` and ``surplus_hugepages`` attribute files are read-only.  They return the number
			
 
				+of free and surplus [overcommitted] huge pages, respectively, on the parent
			
 
				+node.
			
 
				+
			
 
				+The ``nr_hugepages`` attribute returns the total number of huge pages on the
			
 
				+specified node.  When this attribute is written, the number of persistent huge
			
 
				+pages on the parent node will be adjusted to the specified value, if sufficient
			
 
				+resources exist, regardless of the task's mempolicy or cpuset constraints.
			
 
				+
			
 
				+Note that the number of overcommit and reserve pages remain global quantities,
			
 
				+as we don't know until fault time, when the faulting task's mempolicy is
			
 
				+applied, from which node the huge page allocation will be attempted.
			
 
				+
			
 
				+.. _using_huge_pages:
			
 
				+
			
 
				+Using Huge Pages
			
 
				+================
			
 
				+
			
 
				+If the user applications are going to request huge pages using mmap system
			
 
				+call, then it is required that system administrator mount a file system of
			
 
				+type hugetlbfs::
			
 
				+
			
 
				+  mount -t hugetlbfs \
			
 
				+	-o uid=<value>,gid=<value>,mode=<value>,pagesize=<value>,size=<value>,\
			
 
				+	min_size=<value>,nr_inodes=<value> none /mnt/huge
			
 
				+
			
 
				+This command mounts a (pseudo) filesystem of type hugetlbfs on the directory
			
 
				+``/mnt/huge``.  Any file created on ``/mnt/huge`` uses huge pages.
			
 
				+
			
 
				+The ``uid`` and ``gid`` options set the owner and group of the root of the
			
 
				+file system.  By default the ``uid`` and ``gid`` of the current process
			
 
				+are taken.
			
 
				+
			
 
				+The ``mode`` option sets the mode of root of file system to value & 01777.
			
 
				+This value is given in octal. By default the value 0755 is picked.
			
 
				+
			
 
				+If the platform supports multiple huge page sizes, the ``pagesize`` option can
			
 
				+be used to specify the huge page size and associated pool. ``pagesize``
			
 
				+is specified in bytes. If ``pagesize`` is not specified the platform's
			
 
				+default huge page size and associated pool will be used.
			
 
				+
			
 
				+The ``size`` option sets the maximum value of memory (huge pages) allowed
			
 
				+for that filesystem (``/mnt/huge``). The ``size`` option can be specified
			
 
				+in bytes, or as a percentage of the specified huge page pool (``nr_hugepages``).
			
 
				+The size is rounded down to HPAGE_SIZE boundary.
			
 
				+
			
 
				+The ``min_size`` option sets the minimum value of memory (huge pages) allowed
			
 
				+for the filesystem. ``min_size`` can be specified in the same way as ``size``,
			
 
				+either bytes or a percentage of the huge page pool.
			
 
				+At mount time, the number of huge pages specified by ``min_size`` are reserved
			
 
				+for use by the filesystem.
			
 
				+If there are not enough free huge pages available, the mount will fail.
			
 
				+As huge pages are allocated to the filesystem and freed, the reserve count
			
 
				+is adjusted so that the sum of allocated and reserved huge pages is always
			
 
				+at least ``min_size``.
			
 
				+
			
 
				+The option ``nr_inodes`` sets the maximum number of inodes that ``/mnt/huge``
			
 
				+can use.
			
 
				+
			
 
				+If the ``size``, ``min_size`` or ``nr_inodes`` option is not provided on
			
 
				+command line then no limits are set.
			
 
				+
			
 
				+For ``pagesize``, ``size``, ``min_size`` and ``nr_inodes`` options, you can
			
 
				+use [G|g]/[M|m]/[K|k] to represent giga/mega/kilo.
			
 
				+For example, size=2K has the same meaning as size=2048.
			
 
				+
			
 
				+While read system calls are supported on files that reside on hugetlb
			
 
				+file systems, write system calls are not.
			
 
				+
			
 
				+Regular chown, chgrp, and chmod commands (with right permissions) could be
			
 
				+used to change the file attributes on hugetlbfs.
			
 
				+
			
 
				+Also, it is important to note that no such mount command is required if
			
 
				+applications are going to use only shmat/shmget system calls or mmap with
			
 
				+MAP_HUGETLB.  For an example of how to use mmap with MAP_HUGETLB see
			
 
				+:ref:`map_hugetlb <map_hugetlb>` below.
			
 
				+
			
 
				+Users who wish to use hugetlb memory via shared memory segment should be
			
 
				+members of a supplementary group and system admin needs to configure that gid
			
 
				+into ``/proc/sys/vm/hugetlb_shm_group``.  It is possible for same or different
			
 
				+applications to use any combination of mmaps and shm* calls, though the mount of
			
 
				+filesystem will be required for using mmap calls without MAP_HUGETLB.
			
 
				+
			
 
				+Syscalls that operate on memory backed by hugetlb pages only have their lengths
			
 
				+aligned to the native page size of the processor; they will normally fail with
			
 
				+errno set to EINVAL or exclude hugetlb pages that extend beyond the length if
			
 
				+not hugepage aligned.  For example, munmap(2) will fail if memory is backed by
			
 
				+a hugetlb page and the length is smaller than the hugepage size.
			
 
				+
			
 
				+
			
 
				+Examples
			
 
				+========
			
 
				+
			
 
				+.. _map_hugetlb:
			
 
				+
			
 
				+``map_hugetlb``
			
 
				+	see tools/testing/selftests/vm/map_hugetlb.c
			
 
				+
			
 
				+``hugepage-shm``
			
 
				+	see tools/testing/selftests/vm/hugepage-shm.c
			
 
				+
			
 
				+``hugepage-mmap``
			
 
				+	see tools/testing/selftests/vm/hugepage-mmap.c
			
 
				+
			
 
				+The `libhugetlbfs`_  library provides a wide range of userspace tools
			
 
				+to help with huge page usability, environment setup, and control.
			
 
				+
			
 
				+.. _libhugetlbfs: https://github.com/libhugetlbfs/libhugetlbfs
			
--- a/Documentation/admin-guide/mm/idle_page_tracking.rst
+++ b/Documentation/admin-guide/mm/idle_page_tracking.rst
@@ -0,0 +1,116 @@
 
				+.. _idle_page_tracking:
			
 
				+
			
 
				+==================
			
 
				+Idle Page Tracking
			
 
				+==================
			
 
				+
			
 
				+Motivation
			
 
				+==========
			
 
				+
			
 
				+The idle page tracking feature allows one to track which memory pages are being
			
 
				+accessed by a workload and which are idle. This information can be useful for
			
 
				+estimating the workload's working set size, which, in turn, can be taken into
			
 
				+account when configuring the workload parameters, setting memory cgroup limits,
			
 
				+or deciding where to place the workload within a compute cluster.
			
 
				+
			
 
				+It is enabled by CONFIG_IDLE_PAGE_TRACKING=y.
			
 
				+
			
 
				+.. _user_api:
			
 
				+
			
 
				+User API
			
 
				+========
			
 
				+
			
 
				+The idle page tracking API is located at ``/sys/kernel/mm/page_idle``.
			
 
				+Currently, it consists of a single read-write file,
			
 
				+``/sys/kernel/mm/page_idle/bitmap``.
			
 
				+
			
 
				+The file implements a bitmap where each bit corresponds to a memory page. The
			
 
				+bitmap is represented by an array of 8-byte integers, and the page at PFN #i is
			
 
				+mapped to bit #i%64 of array element #i/64, byte order is native. When a bit is
			
 
				+set, the corresponding page is idle.
			
 
				+
			
 
				+A page is considered idle if it has not been accessed since it was marked idle
			
 
				+(for more details on what "accessed" actually means see the :ref:`Implementation
			
 
				+Details <impl_details>` section).
			
 
				+To mark a page idle one has to set the bit corresponding to
			
 
				+the page by writing to the file. A value written to the file is OR-ed with the
			
 
				+current bitmap value.
			
 
				+
			
 
				+Only accesses to user memory pages are tracked. These are pages mapped to a
			
 
				+process address space, page cache and buffer pages, swap cache pages. For other
			
 
				+page types (e.g. SLAB pages) an attempt to mark a page idle is silently ignored,
			
 
				+and hence such pages are never reported idle.
			
 
				+
			
 
				+For huge pages the idle flag is set only on the head page, so one has to read
			
 
				+``/proc/kpageflags`` in order to correctly count idle huge pages.
			
 
				+
			
 
				+Reading from or writing to ``/sys/kernel/mm/page_idle/bitmap`` will return
			
 
				+-EINVAL if you are not starting the read/write on an 8-byte boundary, or
			
 
				+if the size of the read/write is not a multiple of 8 bytes. Writing to
			
 
				+this file beyond max PFN will return -ENXIO.
			
 
				+
			
 
				+That said, in order to estimate the amount of pages that are not used by a
			
 
				+workload one should:
			
 
				+
			
 
				+ 1. Mark all the workload's pages as idle by setting corresponding bits in
			
 
				+    ``/sys/kernel/mm/page_idle/bitmap``. The pages can be found by reading
			
 
				+    ``/proc/pid/pagemap`` if the workload is represented by a process, or by
			
 
				+    filtering out alien pages using ``/proc/kpagecgroup`` in case the workload
			
 
				+    is placed in a memory cgroup.
			
 
				+
			
 
				+ 2. Wait until the workload accesses its working set.
			
 
				+
			
 
				+ 3. Read ``/sys/kernel/mm/page_idle/bitmap`` and count the number of bits set.
			
 
				+    If one wants to ignore certain types of pages, e.g. mlocked pages since they
			
 
				+    are not reclaimable, he or she can filter them out using
			
 
				+    ``/proc/kpageflags``.
			
 
				+
			
 
				+See :ref:`Documentation/admin-guide/mm/pagemap.rst <pagemap>` for more
			
 
				+information about ``/proc/pid/pagemap``, ``/proc/kpageflags``, and
			
 
				+``/proc/kpagecgroup``.
			
 
				+
			
 
				+.. _impl_details:
			
 
				+
			
 
				+Implementation Details
			
 
				+======================
			
 
				+
			
 
				+The kernel internally keeps track of accesses to user memory pages in order to
			
 
				+reclaim unreferenced pages first on memory shortage conditions. A page is
			
 
				+considered referenced if it has been recently accessed via a process address
			
 
				+space, in which case one or more PTEs it is mapped to will have the Accessed bit
			
 
				+set, or marked accessed explicitly by the kernel (see mark_page_accessed()). The
			
 
				+latter happens when:
			
 
				+
			
 
				+ - a userspace process reads or writes a page using a system call (e.g. read(2)
			
 
				+   or write(2))
			
 
				+
			
 
				+ - a page that is used for storing filesystem buffers is read or written,
			
 
				+   because a process needs filesystem metadata stored in it (e.g. lists a
			
 
				+   directory tree)
			
 
				+
			
 
				+ - a page is accessed by a device driver using get_user_pages()
			
 
				+
			
 
				+When a dirty page is written to swap or disk as a result of memory reclaim or
			
 
				+exceeding the dirty memory limit, it is not marked referenced.
			
 
				+
			
 
				+The idle memory tracking feature adds a new page flag, the Idle flag. This flag
			
 
				+is set manually, by writing to ``/sys/kernel/mm/page_idle/bitmap`` (see the
			
 
				+:ref:`User API <user_api>`
			
 
				+section), and cleared automatically whenever a page is referenced as defined
			
 
				+above.
			
 
				+
			
 
				+When a page is marked idle, the Accessed bit must be cleared in all PTEs it is
			
 
				+mapped to, otherwise we will not be able to detect accesses to the page coming
			
 
				+from a process address space. To avoid interference with the reclaimer, which,
			
 
				+as noted above, uses the Accessed bit to promote actively referenced pages, one
			
 
				+more page flag is introduced, the Young flag. When the PTE Accessed bit is
			
 
				+cleared as a result of setting or updating a page's Idle flag, the Young flag
			
 
				+is set on the page. The reclaimer treats the Young flag as an extra PTE
			
 
				+Accessed bit and therefore will consider such a page as referenced.
			
 
				+
			
 
				+Since the idle memory tracking feature is based on the memory reclaimer logic,
			
 
				+it only works with pages that are on an LRU list, other pages are silently
			
 
				+ignored. That means it will ignore a user memory page if it is isolated, but
			
 
				+since there are usually not many of them, it should not affect the overall
			
 
				+result noticeably. In order not to stall scanning of the idle page bitmap,
			
 
				+locked pages may be skipped too.
			
--- a/Documentation/admin-guide/mm/index.rst
+++ b/Documentation/admin-guide/mm/index.rst
@@ -0,0 +1,36 @@
 
				+=================
			
 
				+Memory Management
			
 
				+=================
			
 
				+
			
 
				+Linux memory management subsystem is responsible, as the name implies,
			
 
				+for managing the memory in the system. This includes implementation of
			
 
				+virtual memory and demand paging, memory allocation both for kernel
			
 
				+internal structures and user space programs, mapping of files into
			
 
				+processes' address space and many other cool things.
			
 
				+
			
 
				+Linux memory management is a complex system with many configurable
			
 
				+settings. Most of these settings are available via ``/proc``
			
 
				+filesystem and can be queried and adjusted using ``sysctl``. These APIs
			
 
				+are described in Documentation/sysctl/vm.txt and in `man 5 proc`_.
			
 
				+
			
 
				+.. _man 5 proc: http://man7.org/linux/man-pages/man5/proc.5.html
			
 
				+
			
 
				+Linux memory management has its own jargon and if you are not yet
			
 
				+familiar with it, consider reading
			
 
				+:ref:`Documentation/admin-guide/mm/concepts.rst <mm_concepts>`.
			
 
				+
			
 
				+Here we document in detail how to interact with various mechanisms in
			
 
				+the Linux memory management.
			
 
				+
			
 
				+.. toctree::
			
 
				+   :maxdepth: 1
			
 
				+
			
 
				+   concepts
			
 
				+   hugetlbpage
			
 
				+   idle_page_tracking
			
 
				+   ksm
			
 
				+   numa_memory_policy
			
 
				+   pagemap
			
 
				+   soft-dirty
			
 
				+   transhuge
			
 
				+   userfaultfd
			
--- a/Documentation/admin-guide/mm/ksm.rst
+++ b/Documentation/admin-guide/mm/ksm.rst
@@ -0,0 +1,189 @@
 
				+.. _admin_guide_ksm:
			
 
				+
			
 
				+=======================
			
 
				+Kernel Samepage Merging
			
 
				+=======================
			
 
				+
			
 
				+Overview
			
 
				+========
			
 
				+
			
 
				+KSM is a memory-saving de-duplication feature, enabled by CONFIG_KSM=y,
			
 
				+added to the Linux kernel in 2.6.32.  See ``mm/ksm.c`` for its implementation,
			
 
				+and http://lwn.net/Articles/306704/ and http://lwn.net/Articles/330589/
			
 
				+
			
 
				+KSM was originally developed for use with KVM (where it was known as
			
 
				+Kernel Shared Memory), to fit more virtual machines into physical memory,
			
 
				+by sharing the data common between them.  But it can be useful to any
			
 
				+application which generates many instances of the same data.
			
 
				+
			
 
				+The KSM daemon ksmd periodically scans those areas of user memory
			
 
				+which have been registered with it, looking for pages of identical
			
 
				+content which can be replaced by a single write-protected page (which
			
 
				+is automatically copied if a process later wants to update its
			
 
				+content). The amount of pages that KSM daemon scans in a single pass
			
 
				+and the time between the passes are configured using :ref:`sysfs
			
 
				+interface <ksm_sysfs>`.
			
 
				+
			
 
				+KSM only merges anonymous (private) pages, never pagecache (file) pages.
			
 
				+KSM's merged pages were originally locked into kernel memory, but can now
			
 
				+be swapped out just like other user pages (but sharing is broken when they
			
 
				+are swapped back in: ksmd must rediscover their identity and merge again).
			
 
				+
			
 
				+Controlling KSM with madvise
			
 
				+============================
			
 
				+
			
 
				+KSM only operates on those areas of address space which an application
			
 
				+has advised to be likely candidates for merging, by using the madvise(2)
			
 
				+system call::
			
 
				+
			
 
				+	int madvise(addr, length, MADV_MERGEABLE)
			
 
				+
			
 
				+The app may call
			
 
				+
			
 
				+::
			
 
				+
			
 
				+	int madvise(addr, length, MADV_UNMERGEABLE)
			
 
				+
			
 
				+to cancel that advice and restore unshared pages: whereupon KSM
			
 
				+unmerges whatever it merged in that range.  Note: this unmerging call
			
 
				+may suddenly require more memory than is available - possibly failing
			
 
				+with EAGAIN, but more probably arousing the Out-Of-Memory killer.
			
 
				+
			
 
				+If KSM is not configured into the running kernel, madvise MADV_MERGEABLE
			
 
				+and MADV_UNMERGEABLE simply fail with EINVAL.  If the running kernel was
			
 
				+built with CONFIG_KSM=y, those calls will normally succeed: even if the
			
 
				+KSM daemon is not currently running, MADV_MERGEABLE still registers
			
 
				+the range for whenever the KSM daemon is started; even if the range
			
 
				+cannot contain any pages which KSM could actually merge; even if
			
 
				+MADV_UNMERGEABLE is applied to a range which was never MADV_MERGEABLE.
			
 
				+
			
 
				+If a region of memory must be split into at least one new MADV_MERGEABLE
			
 
				+or MADV_UNMERGEABLE region, the madvise may return ENOMEM if the process
			
 
				+will exceed ``vm.max_map_count`` (see Documentation/sysctl/vm.txt).
			
 
				+
			
 
				+Like other madvise calls, they are intended for use on mapped areas of
			
 
				+the user address space: they will report ENOMEM if the specified range
			
 
				+includes unmapped gaps (though working on the intervening mapped areas),
			
 
				+and might fail with EAGAIN if not enough memory for internal structures.
			
 
				+
			
 
				+Applications should be considerate in their use of MADV_MERGEABLE,
			
 
				+restricting its use to areas likely to benefit.  KSM's scans may use a lot
			
 
				+of processing power: some installations will disable KSM for that reason.
			
 
				+
			
 
				+.. _ksm_sysfs:
			
 
				+
			
 
				+KSM daemon sysfs interface
			
 
				+==========================
			
 
				+
			
 
				+The KSM daemon is controlled by sysfs files in ``/sys/kernel/mm/ksm/``,
			
 
				+readable by all but writable only by root:
			
 
				+
			
 
				+pages_to_scan
			
 
				+        how many pages to scan before ksmd goes to sleep
			
 
				+        e.g. ``echo 100 > /sys/kernel/mm/ksm/pages_to_scan``.
			
 
				+
			
 
				+        Default: 100 (chosen for demonstration purposes)
			
 
				+
			
 
				+sleep_millisecs
			
 
				+        how many milliseconds ksmd should sleep before next scan
			
 
				+        e.g. ``echo 20 > /sys/kernel/mm/ksm/sleep_millisecs``
			
 
				+
			
 
				+        Default: 20 (chosen for demonstration purposes)
			
 
				+
			
 
				+merge_across_nodes
			
 
				+        specifies if pages from different NUMA nodes can be merged.
			
 
				+        When set to 0, ksm merges only pages which physically reside
			
 
				+        in the memory area of same NUMA node. That brings lower
			
 
				+        latency to access of shared pages. Systems with more nodes, at
			
 
				+        significant NUMA distances, are likely to benefit from the
			
 
				+        lower latency of setting 0. Smaller systems, which need to
			
 
				+        minimize memory usage, are likely to benefit from the greater
			
 
				+        sharing of setting 1 (default). You may wish to compare how
			
 
				+        your system performs under each setting, before deciding on
			
 
				+        which to use. ``merge_across_nodes`` setting can be changed only
			
 
				+        when there are no ksm shared pages in the system: set ``run`` to 2 to
			
 
				+        unmerge pages first, then to 1 after changing
			
 
				+        ``merge_across_nodes``, to remerge according to the new setting.
			
 
				+
			
 
				+        Default: 1 (merging across nodes as in earlier releases)
			
 
				+
			
 
				+run
			
 
				+        * set to 0 to stop ksmd from running but keep merged pages,
			
 
				+        * set to 1 to run ksmd e.g. ``echo 1 > /sys/kernel/mm/ksm/run``,
			
 
				+        * set to 2 to stop ksmd and unmerge all pages currently merged, but
			
 
				+	  leave mergeable areas registered for next run.
			
 
				+
			
 
				+        Default: 0 (must be changed to 1 to activate KSM, except if
			
 
				+        CONFIG_SYSFS is disabled)
			
 
				+
			
 
				+use_zero_pages
			
 
				+        specifies whether empty pages (i.e. allocated pages that only
			
 
				+        contain zeroes) should be treated specially.  When set to 1,
			
 
				+        empty pages are merged with the kernel zero page(s) instead of
			
 
				+        with each other as it would happen normally. This can improve
			
 
				+        the performance on architectures with coloured zero pages,
			
 
				+        depending on the workload. Care should be taken when enabling
			
 
				+        this setting, as it can potentially degrade the performance of
			
 
				+        KSM for some workloads, for example if the checksums of pages
			
 
				+        candidate for merging match the checksum of an empty
			
 
				+        page. This setting can be changed at any time, it is only
			
 
				+        effective for pages merged after the change.
			
 
				+
			
 
				+        Default: 0 (normal KSM behaviour as in earlier releases)
			
 
				+
			
 
				+max_page_sharing
			
 
				+        Maximum sharing allowed for each KSM page. This enforces a
			
 
				+        deduplication limit to avoid high latency for virtual memory
			
 
				+        operations that involve traversal of the virtual mappings that
			
 
				+        share the KSM page. The minimum value is 2 as a newly created
			
 
				+        KSM page will have at least two sharers. The higher this value
			
 
				+        the faster KSM will merge the memory and the higher the
			
 
				+        deduplication factor will be, but the slower the worst case
			
 
				+        virtual mappings traversal could be for any given KSM
			
 
				+        page. Slowing down this traversal means there will be higher
			
 
				+        latency for certain virtual memory operations happening during
			
 
				+        swapping, compaction, NUMA balancing and page migration, in
			
 
				+        turn decreasing responsiveness for the caller of those virtual
			
 
				+        memory operations. The scheduler latency of other tasks not
			
 
				+        involved with the VM operations doing the virtual mappings
			
 
				+        traversal is not affected by this parameter as these
			
 
				+        traversals are always schedule friendly themselves.
			
 
				+
			
 
				+stable_node_chains_prune_millisecs
			
 
				+        specifies how frequently KSM checks the metadata of the pages
			
 
				+        that hit the deduplication limit for stale information.
			
 
				+        Smaller millisecs values will free up the KSM metadata with
			
 
				+        lower latency, but they will make ksmd use more CPU during the
			
 
				+        scan. It's a noop if not a single KSM page hit the
			
 
				+        ``max_page_sharing`` yet.
			
 
				+
			
 
				+The effectiveness of KSM and MADV_MERGEABLE is shown in ``/sys/kernel/mm/ksm/``:
			
 
				+
			
 
				+pages_shared
			
 
				+        how many shared pages are being used
			
 
				+pages_sharing
			
 
				+        how many more sites are sharing them i.e. how much saved
			
 
				+pages_unshared
			
 
				+        how many pages unique but repeatedly checked for merging
			
 
				+pages_volatile
			
 
				+        how many pages changing too fast to be placed in a tree
			
 
				+full_scans
			
 
				+        how many times all mergeable areas have been scanned
			
 
				+stable_node_chains
			
 
				+        the number of KSM pages that hit the ``max_page_sharing`` limit
			
 
				+stable_node_dups
			
 
				+        number of duplicated KSM pages
			
 
				+
			
 
				+A high ratio of ``pages_sharing`` to ``pages_shared`` indicates good
			
 
				+sharing, but a high ratio of ``pages_unshared`` to ``pages_sharing``
			
 
				+indicates wasted effort.  ``pages_volatile`` embraces several
			
 
				+different kinds of activity, but a high proportion there would also
			
 
				+indicate poor use of madvise MADV_MERGEABLE.
			
 
				+
			
 
				+The maximum possible ``pages_sharing/pages_shared`` ratio is limited by the
			
 
				+``max_page_sharing`` tunable. To increase the ratio ``max_page_sharing`` must
			
 
				+be increased accordingly.
			
 
				+
			
 
				+--
			
 
				+Izik Eidus,
			
 
				+Hugh Dickins, 17 Nov 2009
			
--- a/Documentation/admin-guide/mm/numa_memory_policy.rst
+++ b/Documentation/admin-guide/mm/numa_memory_policy.rst
@@ -0,0 +1,495 @@
 
				+.. _numa_memory_policy:
			
 
				+
			
 
				+==================
			
 
				+NUMA Memory Policy
			
 
				+==================
			
 
				+
			
 
				+What is NUMA Memory Policy?
			
 
				+============================
			
 
				+
			
 
				+In the Linux kernel, "memory policy" determines from which node the kernel will
			
 
				+allocate memory in a NUMA system or in an emulated NUMA system.  Linux has
			
 
				+supported platforms with Non-Uniform Memory Access architectures since 2.4.?.
			
 
				+The current memory policy support was added to Linux 2.6 around May 2004.  This
			
 
				+document attempts to describe the concepts and APIs of the 2.6 memory policy
			
 
				+support.
			
 
				+
			
 
				+Memory policies should not be confused with cpusets
			
 
				+(``Documentation/cgroup-v1/cpusets.txt``)
			
 
				+which is an administrative mechanism for restricting the nodes from which
			
 
				+memory may be allocated by a set of processes. Memory policies are a
			
 
				+programming interface that a NUMA-aware application can take advantage of.  When
			
 
				+both cpusets and policies are applied to a task, the restrictions of the cpuset
			
 
				+take priority.  See :ref:`Memory Policies and cpusets <mem_pol_and_cpusets>`
			
 
				+below for more details.
			
 
				+
			
 
				+Memory Policy Concepts
			
 
				+======================
			
 
				+
			
 
				+Scope of Memory Policies
			
 
				+------------------------
			
 
				+
			
 
				+The Linux kernel supports _scopes_ of memory policy, described here from
			
 
				+most general to most specific:
			
 
				+
			
 
				+System Default Policy
			
 
				+	this policy is "hard coded" into the kernel.  It is the policy
			
 
				+	that governs all page allocations that aren't controlled by
			
 
				+	one of the more specific policy scopes discussed below.  When
			
 
				+	the system is "up and running", the system default policy will
			
 
				+	use "local allocation" described below.  However, during boot
			
 
				+	up, the system default policy will be set to interleave
			
 
				+	allocations across all nodes with "sufficient" memory, so as
			
 
				+	not to overload the initial boot node with boot-time
			
 
				+	allocations.
			
 
				+
			
 
				+Task/Process Policy
			
 
				+	this is an optional, per-task policy.  When defined for a
			
 
				+	specific task, this policy controls all page allocations made
			
 
				+	by or on behalf of the task that aren't controlled by a more
			
 
				+	specific scope. If a task does not define a task policy, then
			
 
				+	all page allocations that would have been controlled by the
			
 
				+	task policy "fall back" to the System Default Policy.
			
 
				+
			
 
				+	The task policy applies to the entire address space of a task. Thus,
			
 
				+	it is inheritable, and indeed is inherited, across both fork()
			
 
				+	[clone() w/o the CLONE_VM flag] and exec*().  This allows a parent task
			
 
				+	to establish the task policy for a child task exec()'d from an
			
 
				+	executable image that has no awareness of memory policy.  See the
			
 
				+	:ref:`Memory Policy APIs <memory_policy_apis>` section,
			
 
				+	below, for an overview of the system call
			
 
				+	that a task may use to set/change its task/process policy.
			
 
				+
			
 
				+	In a multi-threaded task, task policies apply only to the thread
			
 
				+	[Linux kernel task] that installs the policy and any threads
			
 
				+	subsequently created by that thread.  Any sibling threads existing
			
 
				+	at the time a new task policy is installed retain their current
			
 
				+	policy.
			
 
				+
			
 
				+	A task policy applies only to pages allocated after the policy is
			
 
				+	installed.  Any pages already faulted in by the task when the task
			
 
				+	changes its task policy remain where they were allocated based on
			
 
				+	the policy at the time they were allocated.
			
 
				+
			
 
				+.. _vma_policy:
			
 
				+
			
 
				+VMA Policy
			
 
				+	A "VMA" or "Virtual Memory Area" refers to a range of a task's
			
 
				+	virtual address space.  A task may define a specific policy for a range
			
 
				+	of its virtual address space.   See the
			
 
				+	:ref:`Memory Policy APIs <memory_policy_apis>` section,
			
 
				+	below, for an overview of the mbind() system call used to set a VMA
			
 
				+	policy.
			
 
				+
			
 
				+	A VMA policy will govern the allocation of pages that back
			
 
				+	this region of the address space.  Any regions of the task's
			
 
				+	address space that don't have an explicit VMA policy will fall
			
 
				+	back to the task policy, which may itself fall back to the
			
 
				+	System Default Policy.
			
 
				+
			
 
				+	VMA policies have a few complicating details:
			
 
				+
			
 
				+	* VMA policy applies ONLY to anonymous pages.  These include
			
 
				+	  pages allocated for anonymous segments, such as the task
			
 
				+	  stack and heap, and any regions of the address space
			
 
				+	  mmap()ed with the MAP_ANONYMOUS flag.  If a VMA policy is
			
 
				+	  applied to a file mapping, it will be ignored if the mapping
			
 
				+	  used the MAP_SHARED flag.  If the file mapping used the
			
 
				+	  MAP_PRIVATE flag, the VMA policy will only be applied when
			
 
				+	  an anonymous page is allocated on an attempt to write to the
			
 
				+	  mapping-- i.e., at Copy-On-Write.
			
 
				+
			
 
				+	* VMA policies are shared between all tasks that share a
			
 
				+	  virtual address space--a.k.a. threads--independent of when
			
 
				+	  the policy is installed; and they are inherited across
			
 
				+	  fork().  However, because VMA policies refer to a specific
			
 
				+	  region of a task's address space, and because the address
			
 
				+	  space is discarded and recreated on exec*(), VMA policies
			
 
				+	  are NOT inheritable across exec().  Thus, only NUMA-aware
			
 
				+	  applications may use VMA policies.
			
 
				+
			
 
				+	* A task may install a new VMA policy on a sub-range of a
			
 
				+	  previously mmap()ed region.  When this happens, Linux splits
			
 
				+	  the existing virtual memory area into 2 or 3 VMAs, each with
			
 
				+	  its own policy.
			
 
				+
			
 
				+	* By default, VMA policy applies only to pages allocated after
			
 
				+	  the policy is installed.  Any pages already faulted into the
			
 
				+	  VMA range remain where they were allocated based on the
			
 
				+	  policy at the time they were allocated.  However, since
			
 
				+	  2.6.16, Linux supports page migration via the mbind() system
			
 
				+	  call, so that page contents can be moved to match a newly
			
 
				+	  installed policy.
			
 
				+
			
 
				+Shared Policy
			
 
				+	Conceptually, shared policies apply to "memory objects" mapped
			
 
				+	shared into one or more tasks' distinct address spaces.  An
			
 
				+	application installs shared policies the same way as VMA
			
 
				+	policies--using the mbind() system call specifying a range of
			
 
				+	virtual addresses that map the shared object.  However, unlike
			
 
				+	VMA policies, which can be considered to be an attribute of a
			
 
				+	range of a task's address space, shared policies apply
			
 
				+	directly to the shared object.  Thus, all tasks that attach to
			
 
				+	the object share the policy, and all pages allocated for the
			
 
				+	shared object, by any task, will obey the shared policy.
			
 
				+
			
 
				+	As of 2.6.22, only shared memory segments, created by shmget() or
			
 
				+	mmap(MAP_ANONYMOUS|MAP_SHARED), support shared policy.  When shared
			
 
				+	policy support was added to Linux, the associated data structures were
			
 
				+	added to hugetlbfs shmem segments.  At the time, hugetlbfs did not
			
 
				+	support allocation at fault time--a.k.a. lazy allocation--so hugetlbfs
			
 
				+	shmem segments were never "hooked up" to the shared policy support.
			
 
				+	Although hugetlbfs segments now support lazy allocation, their support
			
 
				+	for shared policy has not been completed.
			
 
				+
			
 
				+	As mentioned above in :ref:`VMA policies <vma_policy>` section,
			
 
				+	allocations of page cache pages for regular files mmap()ed
			
 
				+	with MAP_SHARED ignore any VMA policy installed on the virtual
			
 
				+	address range backed by the shared file mapping.  Rather,
			
 
				+	shared page cache pages, including pages backing private
			
 
				+	mappings that have not yet been written by the task, follow
			
 
				+	task policy, if any, else System Default Policy.
			
 
				+
			
 
				+	The shared policy infrastructure supports different policies on subset
			
 
				+	ranges of the shared object.  However, Linux still splits the VMA of
			
 
				+	the task that installs the policy for each range of distinct policy.
			
 
				+	Thus, different tasks that attach to a shared memory segment can have
			
 
				+	different VMA configurations mapping that one shared object.  This
			
 
				+	can be seen by examining the /proc/<pid>/numa_maps of tasks sharing
			
 
				+	a shared memory region, when one task has installed shared policy on
			
 
				+	one or more ranges of the region.
			
 
				+
			
 
				+Components of Memory Policies
			
 
				+-----------------------------
			
 
				+
			
 
				+A NUMA memory policy consists of a "mode", optional mode flags, and
			
 
				+an optional set of nodes.  The mode determines the behavior of the
			
 
				+policy, the optional mode flags determine the behavior of the mode,
			
 
				+and the optional set of nodes can be viewed as the arguments to the
			
 
				+policy behavior.
			
 
				+
			
 
				+Internally, memory policies are implemented by a reference counted
			
 
				+structure, struct mempolicy.  Details of this structure will be
			
 
				+discussed in context, below, as required to explain the behavior.
			
 
				+
			
 
				+NUMA memory policy supports the following 4 behavioral modes:
			
 
				+
			
 
				+Default Mode--MPOL_DEFAULT
			
 
				+	This mode is only used in the memory policy APIs.  Internally,
			
 
				+	MPOL_DEFAULT is converted to the NULL memory policy in all
			
 
				+	policy scopes.  Any existing non-default policy will simply be
			
 
				+	removed when MPOL_DEFAULT is specified.  As a result,
			
 
				+	MPOL_DEFAULT means "fall back to the next most specific policy
			
 
				+	scope."
			
 
				+
			
 
				+	For example, a NULL or default task policy will fall back to the
			
 
				+	system default policy.  A NULL or default vma policy will fall
			
 
				+	back to the task policy.
			
 
				+
			
 
				+	When specified in one of the memory policy APIs, the Default mode
			
 
				+	does not use the optional set of nodes.
			
 
				+
			
 
				+	It is an error for the set of nodes specified for this policy to
			
 
				+	be non-empty.
			
 
				+
			
 
				+MPOL_BIND
			
 
				+	This mode specifies that memory must come from the set of
			
 
				+	nodes specified by the policy.  Memory will be allocated from
			
 
				+	the node in the set with sufficient free memory that is
			
 
				+	closest to the node where the allocation takes place.
			
 
				+
			
 
				+MPOL_PREFERRED
			
 
				+	This mode specifies that the allocation should be attempted
			
 
				+	from the single node specified in the policy.  If that
			
 
				+	allocation fails, the kernel will search other nodes, in order
			
 
				+	of increasing distance from the preferred node based on
			
 
				+	information provided by the platform firmware.
			
 
				+
			
 
				+	Internally, the Preferred policy uses a single node--the
			
 
				+	preferred_node member of struct mempolicy.  When the internal
			
 
				+	mode flag MPOL_F_LOCAL is set, the preferred_node is ignored
			
 
				+	and the policy is interpreted as local allocation.  "Local"
			
 
				+	allocation policy can be viewed as a Preferred policy that
			
 
				+	starts at the node containing the cpu where the allocation
			
 
				+	takes place.
			
 
				+
			
 
				+	It is possible for the user to specify that local allocation
			
 
				+	is always preferred by passing an empty nodemask with this
			
 
				+	mode.  If an empty nodemask is passed, the policy cannot use
			
 
				+	the MPOL_F_STATIC_NODES or MPOL_F_RELATIVE_NODES flags
			
 
				+	described below.
			
 
				+
			
 
				+MPOL_INTERLEAVE
			
 
				+	This mode specifies that page allocations be interleaved, on a
			
 
				+	page granularity, across the nodes specified in the policy.
			
 
				+	This mode also behaves slightly differently, based on the
			
 
				+	context where it is used:
			
 
				+
			
 
				+	For allocation of anonymous pages and shared memory pages,
			
 
				+	Interleave mode indexes the set of nodes specified by the
			
 
				+	policy using the page offset of the faulting address into the
			
 
				+	segment [VMA] containing the address modulo the number of
			
 
				+	nodes specified by the policy.  It then attempts to allocate a
			
 
				+	page, starting at the selected node, as if the node had been
			
 
				+	specified by a Preferred policy or had been selected by a
			
 
				+	local allocation.  That is, allocation will follow the per
			
 
				+	node zonelist.
			
 
				+
			
 
				+	For allocation of page cache pages, Interleave mode indexes
			
 
				+	the set of nodes specified by the policy using a node counter
			
 
				+	maintained per task.  This counter wraps around to the lowest
			
 
				+	specified node after it reaches the highest specified node.
			
 
				+	This will tend to spread the pages out over the nodes
			
 
				+	specified by the policy based on the order in which they are
			
 
				+	allocated, rather than based on any page offset into an
			
 
				+	address range or file.  During system boot up, the temporary
			
 
				+	interleaved system default policy works in this mode.
			
 
				+
			
 
				+NUMA memory policy supports the following optional mode flags:
			
 
				+
			
 
				+MPOL_F_STATIC_NODES
			
 
				+	This flag specifies that the nodemask passed by
			
 
				+	the user should not be remapped if the task or VMA's set of allowed
			
 
				+	nodes changes after the memory policy has been defined.
			
 
				+
			
 
				+	Without this flag, any time a mempolicy is rebound because of a
			
 
				+	change in the set of allowed nodes, the node (Preferred) or
			
 
				+	nodemask (Bind, Interleave) is remapped to the new set of
			
 
				+	allowed nodes.  This may result in nodes being used that were
			
 
				+	previously undesired.
			
 
				+
			
 
				+	With this flag, if the user-specified nodes overlap with the
			
 
				+	nodes allowed by the task's cpuset, then the memory policy is
			
 
				+	applied to their intersection.  If the two sets of nodes do not
			
 
				+	overlap, the Default policy is used.
			
 
				+
			
 
				+	For example, consider a task that is attached to a cpuset with
			
 
				+	mems 1-3 that sets an Interleave policy over the same set.  If
			
 
				+	the cpuset's mems change to 3-5, the Interleave will now occur
			
 
				+	over nodes 3, 4, and 5.  With this flag, however, since only node
			
 
				+	3 is allowed from the user's nodemask, the "interleave" only
			
 
				+	occurs over that node.  If no nodes from the user's nodemask are
			
 
				+	now allowed, the Default behavior is used.
			
 
				+
			
 
				+	MPOL_F_STATIC_NODES cannot be combined with the
			
 
				+	MPOL_F_RELATIVE_NODES flag.  It also cannot be used for
			
 
				+	MPOL_PREFERRED policies that were created with an empty nodemask
			
 
				+	(local allocation).
			
 
				+
			
 
				+MPOL_F_RELATIVE_NODES
			
 
				+	This flag specifies that the nodemask passed
			
 
				+	by the user will be mapped relative to the task or VMA's
			
 
				+	set of allowed nodes.  The kernel stores the user-passed nodemask,
			
 
				+	and if the allowed nodes change, then that original nodemask will
			
 
				+	be remapped relative to the new set of allowed nodes.
			
 
				+
			
 
				+	Without this flag (and without MPOL_F_STATIC_NODES), anytime a
			
 
				+	mempolicy is rebound because of a change in the set of allowed
			
 
				+	nodes, the node (Preferred) or nodemask (Bind, Interleave) is
			
 
				+	remapped to the new set of allowed nodes.  That remap may not
			
 
				+	preserve the relative nature of the user's passed nodemask to its
			
 
				+	set of allowed nodes upon successive rebinds: a nodemask of
			
 
				+	1,3,5 may be remapped to 7-9 and then to 1-3 if the set of
			
 
				+	allowed nodes is restored to its original state.
			
 
				+
			
 
				+	With this flag, the remap is done so that the node numbers from
			
 
				+	the user's passed nodemask are relative to the set of allowed
			
 
				+	nodes.  In other words, if nodes 0, 2, and 4 are set in the user's
			
 
				+	nodemask, the policy will be effected over the first (and in the
			
 
				+	Bind or Interleave case, the third and fifth) nodes in the set of
			
 
				+	allowed nodes.  The nodemask passed by the user represents nodes
			
 
				+	relative to task or VMA's set of allowed nodes.
			
 
				+
			
 
				+	If the user's nodemask includes nodes that are outside the range
			
 
				+	of the new set of allowed nodes (for example, node 5 is set in
			
 
				+	the user's nodemask when the set of allowed nodes is only 0-3),
			
 
				+	then the remap wraps around to the beginning of the nodemask and,
			
 
				+	if not already set, sets the node in the mempolicy nodemask.
			
 
				+
			
 
				+	For example, consider a task that is attached to a cpuset with
			
 
				+	mems 2-5 that sets an Interleave policy over the same set with
			
 
				+	MPOL_F_RELATIVE_NODES.  If the cpuset's mems change to 3-7, the
			
 
				+	interleave now occurs over nodes 3,5-7.  If the cpuset's mems
			
 
				+	then change to 0,2-3,5, then the interleave occurs over nodes
			
 
				+	0,2-3,5.
			
 
				+
			
 
				+	Thanks to the consistent remapping, applications preparing
			
 
				+	nodemasks to specify memory policies using this flag should
			
 
				+	disregard their current, actual cpuset imposed memory placement
			
 
				+	and prepare the nodemask as if they were always located on
			
 
				+	memory nodes 0 to N-1, where N is the number of memory nodes the
			
 
				+	policy is intended to manage.  Let the kernel then remap to the
			
 
				+	set of memory nodes allowed by the task's cpuset, as that may
			
 
				+	change over time.
			
 
				+
			
 
				+	MPOL_F_RELATIVE_NODES cannot be combined with the
			
 
				+	MPOL_F_STATIC_NODES flag.  It also cannot be used for
			
 
				+	MPOL_PREFERRED policies that were created with an empty nodemask
			
 
				+	(local allocation).
			
 
				+
			
 
				+Memory Policy Reference Counting
			
 
				+================================
			
 
				+
			
 
				+To resolve use/free races, struct mempolicy contains an atomic reference
			
 
				+count field.  Internal interfaces, mpol_get()/mpol_put() increment and
			
 
				+decrement this reference count, respectively.  mpol_put() will only free
			
 
				+the structure back to the mempolicy kmem cache when the reference count
			
 
				+goes to zero.
			
 
				+
			
 
				+When a new memory policy is allocated, its reference count is initialized
			
 
				+to '1', representing the reference held by the task that is installing the
			
 
				+new policy.  When a pointer to a memory policy structure is stored in another
			
 
				+structure, another reference is added, as the task's reference will be dropped
			
 
				+on completion of the policy installation.
			
 
				+
			
 
				+During run-time "usage" of the policy, we attempt to minimize atomic operations
			
 
				+on the reference count, as this can lead to cache lines bouncing between cpus
			
 
				+and NUMA nodes.  "Usage" here means one of the following:
			
 
				+
			
 
				+1) querying of the policy, either by the task itself [using the get_mempolicy()
			
 
				+   API discussed below] or by another task using the /proc/<pid>/numa_maps
			
 
				+   interface.
			
 
				+
			
 
				+2) examination of the policy to determine the policy mode and associated node
			
 
				+   or node lists, if any, for page allocation.  This is considered a "hot
			
 
				+   path".  Note that for MPOL_BIND, the "usage" extends across the entire
			
 
				+   allocation process, which may sleep during page reclamation, because the
			
 
				+   BIND policy nodemask is used, by reference, to filter ineligible nodes.
			
 
				+
			
 
				+We can avoid taking an extra reference during the usages listed above as
			
 
				+follows:
			
 
				+
			
 
				+1) we never need to get/free the system default policy as this is never
			
 
				+   changed nor freed, once the system is up and running.
			
 
				+
			
 
				+2) for querying the policy, we do not need to take an extra reference on the
			
 
				+   target task's task policy nor vma policies because we always acquire the
			
 
				+   task's mm's mmap_sem for read during the query.  The set_mempolicy() and
			
 
				+   mbind() APIs [see below] always acquire the mmap_sem for write when
			
 
				+   installing or replacing task or vma policies.  Thus, there is no possibility
			
 
				+   of a task or thread freeing a policy while another task or thread is
			
 
				+   querying it.
			
 
				+
			
 
				+3) Page allocation usage of task or vma policy occurs in the fault path where
			
 
				+   we hold the mmap_sem for read.  Again, because replacing the task or vma
			
 
				+   policy requires that the mmap_sem be held for write, the policy can't be
			
 
				+   freed out from under us while we're using it for page allocation.
			
 
				+
			
 
				+4) Shared policies require special consideration.  One task can replace a
			
 
				+   shared memory policy while another task, with a distinct mmap_sem, is
			
 
				+   querying or allocating a page based on the policy.  To resolve this
			
 
				+   potential race, the shared policy infrastructure adds an extra reference
			
 
				+   to the shared policy during lookup while holding a spin lock on the shared
			
 
				+   policy management structure.  This requires that we drop this extra
			
 
				+   reference when we're finished "using" the policy.  We must drop the
			
 
				+   extra reference on shared policies in the same query/allocation paths
			
 
				+   used for non-shared policies.  For this reason, shared policies are marked
			
 
				+   as such, and the extra reference is dropped "conditionally"--i.e., only
			
 
				+   for shared policies.
			
 
				+
			
 
				+   Because of this extra reference counting, and because we must lookup
			
 
				+   shared policies in a tree structure under spinlock, shared policies are
			
 
				+   more expensive to use in the page allocation path.  This is especially
			
 
				+   true for shared policies on shared memory regions shared by tasks running
			
 
				+   on different NUMA nodes.  This extra overhead can be avoided by always
			
 
				+   falling back to task or system default policy for shared memory regions,
			
 
				+   or by prefaulting the entire shared memory region into memory and locking
			
 
				+   it down.  However, this might not be appropriate for all applications.
			
 
				+
			
 
				+.. _memory_policy_apis:
			
 
				+
			
 
				+Memory Policy APIs
			
 
				+==================
			
 
				+
			
 
				+Linux supports 3 system calls for controlling memory policy.  These APIs
			
 
				+always affect only the calling task, the calling task's address space, or
			
 
				+some shared object mapped into the calling task's address space.
			
 
				+
			
 
				+.. note::
			
 
				+   the headers that define these APIs and the parameter data types for
			
 
				+   user space applications reside in a package that is not part of the
			
 
				+   Linux kernel.  The kernel system call interfaces, with the 'sys\_'
			
 
				+   prefix, are defined in <linux/syscalls.h>; the mode and flag
			
 
				+   definitions are defined in <linux/mempolicy.h>.
			
 
				+
			
 
				+Set [Task] Memory Policy::
			
 
				+
			
 
				+	long set_mempolicy(int mode, const unsigned long *nmask,
			
 
				+					unsigned long maxnode);
			
 
				+
			
 
				+Sets the calling task's "task/process memory policy" to mode
			
 
				+specified by the 'mode' argument and the set of nodes defined by
			
 
				+'nmask'.  'nmask' points to a bit mask of node ids containing at least
			
 
				+'maxnode' ids.  Optional mode flags may be passed by combining the
			
 
				+'mode' argument with the flag (for example: MPOL_INTERLEAVE |
			
 
				+MPOL_F_STATIC_NODES).
			
 
				+
			
 
				+See the set_mempolicy(2) man page for more details.
			
 
				+
			
 
				+
			
 
				+Get [Task] Memory Policy or Related Information::
			
 
				+
			
 
				+	long get_mempolicy(int *mode,
			
 
				+			   const unsigned long *nmask, unsigned long maxnode,
			
 
				+			   void *addr, int flags);
			
 
				+
			
 
				+Queries the "task/process memory policy" of the calling task, or the
			
 
				+policy or location of a specified virtual address, depending on the
			
 
				+'flags' argument.
			
 
				+
			
 
				+See the get_mempolicy(2) man page for more details.
			
 
				+
			
 
				+
			
 
				+Install VMA/Shared Policy for a Range of Task's Address Space::
			
 
				+
			
 
				+	long mbind(void *start, unsigned long len, int mode,
			
 
				+		   const unsigned long *nmask, unsigned long maxnode,
			
 
				+		   unsigned flags);
			
 
				+
			
 
				+mbind() installs the policy specified by (mode, nmask, maxnode) as a
			
 
				+VMA policy for the range of the calling task's address space specified
			
 
				+by the 'start' and 'len' arguments.  Additional actions may be
			
 
				+requested via the 'flags' argument.
			
 
				+
			
 
				+See the mbind(2) man page for more details.
			
 
				+
			
 
				+Memory Policy Command Line Interface
			
 
				+====================================
			
 
				+
			
 
				+Although not strictly part of the Linux implementation of memory policy,
			
 
				+a command line tool, numactl(8), exists that allows one to:
			
 
				+
			
 
				++ set the task policy for a specified program via set_mempolicy(2), fork(2) and
			
 
				+  exec(2)
			
 
				+
			
 
				++ set the shared policy for a shared memory segment via mbind(2)
			
 
				+
			
 
				+The numactl(8) tool is packaged with the run-time version of the library
			
 
				+containing the memory policy system call wrappers.  Some distributions
			
 
				+package the headers and compile-time libraries in a separate development
			
 
				+package.
			
 
				+
			
 
				+.. _mem_pol_and_cpusets:
			
 
				+
			
 
				+Memory Policies and cpusets
			
 
				+===========================
			
 
				+
			
 
				+Memory policies work within cpusets as described above.  For memory policies
			
 
				+that require a node or set of nodes, the nodes are restricted to the set of
			
 
				+nodes whose memories are allowed by the cpuset constraints.  If the nodemask
			
 
				+specified for the policy contains nodes that are not allowed by the cpuset and
			
 
				+MPOL_F_RELATIVE_NODES is not used, the intersection of the set of nodes
			
 
				+specified for the policy and the set of nodes with memory is used.  If the
			
 
				+result is the empty set, the policy is considered invalid and cannot be
			
 
				+installed.  If MPOL_F_RELATIVE_NODES is used, the policy's nodes are mapped
			
 
				+onto and folded into the task's set of allowed nodes as previously described.
			
 
				+
			
 
				+The interaction of memory policies and cpusets can be problematic when tasks
			
 
				+in two cpusets share access to a memory region, such as shared memory segments
			
 
				+created by shmget() or mmap() with the MAP_ANONYMOUS and MAP_SHARED flags, and
			
 
				+any of the tasks install shared policy on the region, only nodes whose
			
 
				+memories are allowed in both cpusets may be used in the policies.  Obtaining
			
 
				+this information requires "stepping outside" the memory policy APIs to use the
			
 
				+cpuset information and requires that one know in what cpusets other tasks might
			
 
				+be attaching to the shared region.  Furthermore, if the cpusets' allowed
			
 
				+memory sets are disjoint, "local" allocation is the only valid policy.
			
--- a/Documentation/admin-guide/mm/pagemap.rst
+++ b/Documentation/admin-guide/mm/pagemap.rst
@@ -0,0 +1,201 @@
 
				+.. _pagemap:
			
 
				+
			
 
				+=============================
			
 
				+Examining Process Page Tables
			
 
				+=============================
			
 
				+
			
 
				+pagemap is a new (as of 2.6.25) set of interfaces in the kernel that allow
			
 
				+userspace programs to examine the page tables and related information by
			
 
				+reading files in ``/proc``.
			
 
				+
			
 
				+There are four components to pagemap:
			
 
				+
			
 
				+ * ``/proc/pid/pagemap``.  This file lets a userspace process find out which
			
 
				+   physical frame each virtual page is mapped to.  It contains one 64-bit
			
 
				+   value for each virtual page, containing the following data (from
			
 
				+   ``fs/proc/task_mmu.c``, above pagemap_read):
			
 
				+
			
 
				+    * Bits 0-54  page frame number (PFN) if present
			
 
				+    * Bits 0-4   swap type if swapped
			
 
				+    * Bits 5-54  swap offset if swapped
			
 
				+    * Bit  55    pte is soft-dirty (see
			
 
				+      :ref:`Documentation/admin-guide/mm/soft-dirty.rst <soft_dirty>`)
			
 
				+    * Bit  56    page exclusively mapped (since 4.2)
			
 
				+    * Bits 57-60 zero
			
 
				+    * Bit  61    page is file-page or shared-anon (since 3.5)
			
 
				+    * Bit  62    page swapped
			
 
				+    * Bit  63    page present
			
 
				+
			
 
				+   Since Linux 4.0 only users with the CAP_SYS_ADMIN capability can get PFNs.
			
 
				+   In 4.0 and 4.1 opens by unprivileged fail with -EPERM.  Starting from
			
 
				+   4.2 the PFN field is zeroed if the user does not have CAP_SYS_ADMIN.
			
 
				+   Reason: information about PFNs helps in exploiting Rowhammer vulnerability.
			
 
				+
			
 
				+   If the page is not present but in swap, then the PFN contains an
			
 
				+   encoding of the swap file number and the page's offset into the
			
 
				+   swap. Unmapped pages return a null PFN. This allows determining
			
 
				+   precisely which pages are mapped (or in swap) and comparing mapped
			
 
				+   pages between processes.
			
 
				+
			
 
				+   Efficient users of this interface will use ``/proc/pid/maps`` to
			
 
				+   determine which areas of memory are actually mapped and llseek to
			
 
				+   skip over unmapped regions.
			
 
				+
			
 
				+ * ``/proc/kpagecount``.  This file contains a 64-bit count of the number of
			
 
				+   times each page is mapped, indexed by PFN.
			
 
				+
			
 
				+ * ``/proc/kpageflags``.  This file contains a 64-bit set of flags for each
			
 
				+   page, indexed by PFN.
			
 
				+
			
 
				+   The flags are (from ``fs/proc/page.c``, above kpageflags_read):
			
 
				+
			
 
				+    0. LOCKED
			
 
				+    1. ERROR
			
 
				+    2. REFERENCED
			
 
				+    3. UPTODATE
			
 
				+    4. DIRTY
			
 
				+    5. LRU
			
 
				+    6. ACTIVE
			
 
				+    7. SLAB
			
 
				+    8. WRITEBACK
			
 
				+    9. RECLAIM
			
 
				+    10. BUDDY
			
 
				+    11. MMAP
			
 
				+    12. ANON
			
 
				+    13. SWAPCACHE
			
 
				+    14. SWAPBACKED
			
 
				+    15. COMPOUND_HEAD
			
 
				+    16. COMPOUND_TAIL
			
 
				+    17. HUGE
			
 
				+    18. UNEVICTABLE
			
 
				+    19. HWPOISON
			
 
				+    20. NOPAGE
			
 
				+    21. KSM
			
 
				+    22. THP
			
 
				+    23. BALLOON
			
 
				+    24. ZERO_PAGE
			
 
				+    25. IDLE
			
 
				+
			
 
				+ * ``/proc/kpagecgroup``.  This file contains a 64-bit inode number of the
			
 
				+   memory cgroup each page is charged to, indexed by PFN. Only available when
			
 
				+   CONFIG_MEMCG is set.
			
 
				+
			
 
				+Short descriptions to the page flags
			
 
				+====================================
			
 
				+
			
 
				+0 - LOCKED
			
 
				+   page is being locked for exclusive access, e.g. by undergoing read/write IO
			
 
				+7 - SLAB
			
 
				+   page is managed by the SLAB/SLOB/SLUB/SLQB kernel memory allocator
			
 
				+   When compound page is used, SLUB/SLQB will only set this flag on the head
			
 
				+   page; SLOB will not flag it at all.
			
 
				+10 - BUDDY
			
 
				+    a free memory block managed by the buddy system allocator
			
 
				+    The buddy system organizes free memory in blocks of various orders.
			
 
				+    An order N block has 2^N physically contiguous pages, with the BUDDY flag
			
 
				+    set for and _only_ for the first page.
			
 
				+15 - COMPOUND_HEAD
			
 
				+    A compound page with order N consists of 2^N physically contiguous pages.
			
 
				+    A compound page with order 2 takes the form of "HTTT", where H denotes its
			
 
				+    head page and T denotes its tail page(s).  The major consumers of compound
			
 
				+    pages are hugeTLB pages
			
 
				+    (:ref:`Documentation/admin-guide/mm/hugetlbpage.rst <hugetlbpage>`),
			
 
				+    the SLUB etc.  memory allocators and various device drivers.
			
 
				+    However in this interface, only huge/giga pages are made visible
			
 
				+    to end users.
			
 
				+16 - COMPOUND_TAIL
			
 
				+    A compound page tail (see description above).
			
 
				+17 - HUGE
			
 
				+    this is an integral part of a HugeTLB page
			
 
				+19 - HWPOISON
			
 
				+    hardware detected memory corruption on this page: don't touch the data!
			
 
				+20 - NOPAGE
			
 
				+    no page frame exists at the requested address
			
 
				+21 - KSM
			
 
				+    identical memory pages dynamically shared between one or more processes
			
 
				+22 - THP
			
 
				+    contiguous pages which construct transparent hugepages
			
 
				+23 - BALLOON
			
 
				+    balloon compaction page
			
 
				+24 - ZERO_PAGE
			
 
				+    zero page for pfn_zero or huge_zero page
			
 
				+25 - IDLE
			
 
				+    page has not been accessed since it was marked idle (see
			
 
				+    :ref:`Documentation/admin-guide/mm/idle_page_tracking.rst <idle_page_tracking>`).
			
 
				+    Note that this flag may be stale in case the page was accessed via
			
 
				+    a PTE. To make sure the flag is up-to-date one has to read
			
 
				+    ``/sys/kernel/mm/page_idle/bitmap`` first.
			
 
				+
			
 
				+IO related page flags
			
 
				+---------------------
			
 
				+
			
 
				+1 - ERROR
			
 
				+   IO error occurred
			
 
				+3 - UPTODATE
			
 
				+   page has up-to-date data
			
 
				+   i.e. for file backed page: (in-memory data revision >= on-disk one)
			
 
				+4 - DIRTY
			
 
				+   page has been written to, hence contains new data
			
 
				+   i.e. for file backed page: (in-memory data revision >  on-disk one)
			
 
				+8 - WRITEBACK
			
 
				+   page is being synced to disk
			
 
				+
			
 
				+LRU related page flags
			
 
				+----------------------
			
 
				+
			
 
				+5 - LRU
			
 
				+   page is in one of the LRU lists
			
 
				+6 - ACTIVE
			
 
				+   page is in the active LRU list
			
 
				+18 - UNEVICTABLE
			
 
				+   page is in the unevictable (non-)LRU list. It is somehow pinned and
			
 
				+   not a candidate for LRU page reclaims, e.g. ramfs pages,
			
 
				+   shmctl(SHM_LOCK) and mlock() memory segments
			
 
				+2 - REFERENCED
			
 
				+   page has been referenced since last LRU list enqueue/requeue
			
 
				+9 - RECLAIM
			
 
				+   page will be reclaimed soon after its pageout IO completed
			
 
				+11 - MMAP
			
 
				+   a memory mapped page
			
 
				+12 - ANON
			
 
				+   a memory mapped page that is not part of a file
			
 
				+13 - SWAPCACHE
			
 
				+   page is mapped to swap space, i.e. has an associated swap entry
			
 
				+14 - SWAPBACKED
			
 
				+   page is backed by swap/RAM
			
 
				+
			
 
				+The page-types tool in the tools/vm directory can be used to query the
			
 
				+above flags.
			
 
				+
			
 
				+Using pagemap to do something useful
			
 
				+====================================
			
 
				+
			
 
				+The general procedure for using pagemap to find out about a process' memory
			
 
				+usage goes like this:
			
 
				+
			
 
				+ 1. Read ``/proc/pid/maps`` to determine which parts of the memory space are
			
 
				+    mapped to what.
			
 
				+ 2. Select the maps you are interested in -- all of them, or a particular
			
 
				+    library, or the stack or the heap, etc.
			
 
				+ 3. Open ``/proc/pid/pagemap`` and seek to the pages you would like to examine.
			
 
				+ 4. Read a u64 for each page from pagemap.
			
 
				+ 5. Open ``/proc/kpagecount`` and/or ``/proc/kpageflags``.  For each PFN you
			
 
				+    just read, seek to that entry in the file, and read the data you want.
			
 
				+
			
 
				+For example, to find the "unique set size" (USS), which is the amount of
			
 
				+memory that a process is using that is not shared with any other process,
			
 
				+you can go through every map in the process, find the PFNs, look those up
			
 
				+in kpagecount, and tally up the number of pages that are only referenced
			
 
				+once.
			
 
				+
			
 
				+Other notes
			
 
				+===========
			
 
				+
			
 
				+Reading from any of the files will return -EINVAL if you are not starting
			
 
				+the read on an 8-byte boundary (e.g., if you sought an odd number of bytes
			
 
				+into the file), or if the size of the read is not a multiple of 8 bytes.
			
 
				+
			
 
				+Before Linux 3.11 pagemap bits 55-60 were used for "page-shift" (which is
			
 
				+always 12 on most architectures). Since Linux 3.11 their meaning changes
			
 
				+after first clear of soft-dirty bits. Since Linux 4.2 they are used for
			
 
				+flags unconditionally.
			
--- a/Documentation/admin-guide/mm/soft-dirty.rst
+++ b/Documentation/admin-guide/mm/soft-dirty.rst
@@ -0,0 +1,47 @@
 
				+.. _soft_dirty:
			
 
				+
			
 
				+===============
			
 
				+Soft-Dirty PTEs
			
 
				+===============
			
 
				+
			
 
				+The soft-dirty is a bit on a PTE which helps to track which pages a task
			
 
				+writes to. In order to do this tracking one should
			
 
				+
			
 
				+  1. Clear soft-dirty bits from the task's PTEs.
			
 
				+
			
 
				+     This is done by writing "4" into the ``/proc/PID/clear_refs`` file of the
			
 
				+     task in question.
			
 
				+
			
 
				+  2. Wait some time.
			
 
				+
			
 
				+  3. Read soft-dirty bits from the PTEs.
			
 
				+
			
 
				+     This is done by reading from the ``/proc/PID/pagemap``. The bit 55 of the
			
 
				+     64-bit qword is the soft-dirty one. If set, the respective PTE was
			
 
				+     written to since step 1.
			
 
				+
			
 
				+
			
 
				+Internally, to do this tracking, the writable bit is cleared from PTEs
			
 
				+when the soft-dirty bit is cleared. So, after this, when the task tries to
			
 
				+modify a page at some virtual address the #PF occurs and the kernel sets
			
 
				+the soft-dirty bit on the respective PTE.
			
 
				+
			
 
				+Note, that although all the task's address space is marked as r/o after the
			
 
				+soft-dirty bits clear, the #PF-s that occur after that are processed fast.
			
 
				+This is so, since the pages are still mapped to physical memory, and thus all
			
 
				+the kernel does is find this fact out and put both writable and soft-dirty
			
 
				+bits on the PTE.
			
 
				+
			
 
				+While in most cases tracking memory changes by #PF-s is more than enough
			
 
				+there is still a scenario when we can lose soft dirty bits -- a task
			
 
				+unmaps a previously mapped memory region and then maps a new one at exactly
			
 
				+the same place. When unmap is called, the kernel internally clears PTE values
			
 
				+including soft dirty bits. To notify user space application about such
			
 
				+memory region renewal the kernel always marks new memory regions (and
			
 
				+expanded regions) as soft dirty.
			
 
				+
			
 
				+This feature is actively used by the checkpoint-restore project. You
			
 
				+can find more details about it on http://criu.org
			
 
				+
			
 
				+
			
 
				+-- Pavel Emelyanov, Apr 9, 2013
			
--- a/Documentation/admin-guide/mm/transhuge.rst
+++ b/Documentation/admin-guide/mm/transhuge.rst
@@ -0,0 +1,418 @@
 
				+.. _admin_guide_transhuge:
			
 
				+
			
 
				+============================
			
 
				+Transparent Hugepage Support
			
 
				+============================
			
 
				+
			
 
				+Objective
			
 
				+=========
			
 
				+
			
 
				+Performance critical computing applications dealing with large memory
			
 
				+working sets are already running on top of libhugetlbfs and in turn
			
 
				+hugetlbfs. Transparent HugePage Support (THP) is an alternative means of
			
 
				+using huge pages for the backing of virtual memory with huge pages
			
 
				+that supports the automatic promotion and demotion of page sizes and
			
 
				+without the shortcomings of hugetlbfs.
			
 
				+
			
 
				+Currently THP only works for anonymous memory mappings and tmpfs/shmem.
			
 
				+But in the future it can expand to other filesystems.
			
 
				+
			
 
				+.. note::
			
 
				+   in the examples below we presume that the basic page size is 4K and
			
 
				+   the huge page size is 2M, although the actual numbers may vary
			
 
				+   depending on the CPU architecture.
			
 
				+
			
 
				+The reason applications are running faster is because of two
			
 
				+factors. The first factor is almost completely irrelevant and it's not
			
 
				+of significant interest because it'll also have the downside of
			
 
				+requiring larger clear-page copy-page in page faults which is a
			
 
				+potentially negative effect. The first factor consists in taking a
			
 
				+single page fault for each 2M virtual region touched by userland (so
			
 
				+reducing the enter/exit kernel frequency by a 512 times factor). This
			
 
				+only matters the first time the memory is accessed for the lifetime of
			
 
				+a memory mapping. The second long lasting and much more important
			
 
				+factor will affect all subsequent accesses to the memory for the whole
			
 
				+runtime of the application. The second factor consists of two
			
 
				+components:
			
 
				+
			
 
				+1) the TLB miss will run faster (especially with virtualization using
			
 
				+   nested pagetables but almost always also on bare metal without
			
 
				+   virtualization)
			
 
				+
			
 
				+2) a single TLB entry will be mapping a much larger amount of virtual
			
 
				+   memory in turn reducing the number of TLB misses. With
			
 
				+   virtualization and nested pagetables the TLB can be mapped of
			
 
				+   larger size only if both KVM and the Linux guest are using
			
 
				+   hugepages but a significant speedup already happens if only one of
			
 
				+   the two is using hugepages just because of the fact the TLB miss is
			
 
				+   going to run faster.
			
 
				+
			
 
				+THP can be enabled system wide or restricted to certain tasks or even
			
 
				+memory ranges inside task's address space. Unless THP is completely
			
 
				+disabled, there is ``khugepaged`` daemon that scans memory and
			
 
				+collapses sequences of basic pages into huge pages.
			
 
				+
			
 
				+The THP behaviour is controlled via :ref:`sysfs <thp_sysfs>`
			
 
				+interface and using madvise(2) and prctl(2) system calls.
			
 
				+
			
 
				+Transparent Hugepage Support maximizes the usefulness of free memory
			
 
				+if compared to the reservation approach of hugetlbfs by allowing all
			
 
				+unused memory to be used as cache or other movable (or even unmovable
			
 
				+entities). It doesn't require reservation to prevent hugepage
			
 
				+allocation failures to be noticeable from userland. It allows paging
			
 
				+and all other advanced VM features to be available on the
			
 
				+hugepages. It requires no modifications for applications to take
			
 
				+advantage of it.
			
 
				+
			
 
				+Applications however can be further optimized to take advantage of
			
 
				+this feature, like for example they've been optimized before to avoid
			
 
				+a flood of mmap system calls for every malloc(4k). Optimizing userland
			
 
				+is by far not mandatory and khugepaged already can take care of long
			
 
				+lived page allocations even for hugepage unaware applications that
			
 
				+deal with large amounts of memory.
			
 
				+
			
 
				+In certain cases when hugepages are enabled system wide, application
			
 
				+may end up allocating more memory resources. An application may mmap a
			
 
				+large region but only touch 1 byte of it, in that case a 2M page might
			
 
				+be allocated instead of a 4k page for no good. This is why it's
			
 
				+possible to disable hugepages system-wide and to only have them inside
			
 
				+MADV_HUGEPAGE madvise regions.
			
 
				+
			
 
				+Embedded systems should enable hugepages only inside madvise regions
			
 
				+to eliminate any risk of wasting any precious byte of memory and to
			
 
				+only run faster.
			
 
				+
			
 
				+Applications that get a lot of benefit from hugepages and that don't
			
 
				+risk to lose memory by using hugepages, should use
			
 
				+madvise(MADV_HUGEPAGE) on their critical mmapped regions.
			
 
				+
			
 
				+.. _thp_sysfs:
			
 
				+
			
 
				+sysfs
			
 
				+=====
			
 
				+
			
 
				+Global THP controls
			
 
				+-------------------
			
 
				+
			
 
				+Transparent Hugepage Support for anonymous memory can be entirely disabled
			
 
				+(mostly for debugging purposes) or only enabled inside MADV_HUGEPAGE
			
 
				+regions (to avoid the risk of consuming more memory resources) or enabled
			
 
				+system wide. This can be achieved with one of::
			
 
				+
			
 
				+	echo always >/sys/kernel/mm/transparent_hugepage/enabled
			
 
				+	echo madvise >/sys/kernel/mm/transparent_hugepage/enabled
			
 
				+	echo never >/sys/kernel/mm/transparent_hugepage/enabled
			
 
				+
			
 
				+It's also possible to limit defrag efforts in the VM to generate
			
 
				+anonymous hugepages in case they're not immediately free to madvise
			
 
				+regions or to never try to defrag memory and simply fallback to regular
			
 
				+pages unless hugepages are immediately available. Clearly if we spend CPU
			
 
				+time to defrag memory, we would expect to gain even more by the fact we
			
 
				+use hugepages later instead of regular pages. This isn't always
			
 
				+guaranteed, but it may be more likely in case the allocation is for a
			
 
				+MADV_HUGEPAGE region.
			
 
				+
			
 
				+::
			
 
				+
			
 
				+	echo always >/sys/kernel/mm/transparent_hugepage/defrag
			
 
				+	echo defer >/sys/kernel/mm/transparent_hugepage/defrag
			
 
				+	echo defer+madvise >/sys/kernel/mm/transparent_hugepage/defrag
			
 
				+	echo madvise >/sys/kernel/mm/transparent_hugepage/defrag
			
 
				+	echo never >/sys/kernel/mm/transparent_hugepage/defrag
			
 
				+
			
 
				+always
			
 
				+	means that an application requesting THP will stall on
			
 
				+	allocation failure and directly reclaim pages and compact
			
 
				+	memory in an effort to allocate a THP immediately. This may be
			
 
				+	desirable for virtual machines that benefit heavily from THP
			
 
				+	use and are willing to delay the VM start to utilise them.
			
 
				+
			
 
				+defer
			
 
				+	means that an application will wake kswapd in the background
			
 
				+	to reclaim pages and wake kcompactd to compact memory so that
			
 
				+	THP is available in the near future. It's the responsibility
			
 
				+	of khugepaged to then install the THP pages later.
			
 
				+
			
 
				+defer+madvise
			
 
				+	will enter direct reclaim and compaction like ``always``, but
			
 
				+	only for regions that have used madvise(MADV_HUGEPAGE); all
			
 
				+	other regions will wake kswapd in the background to reclaim
			
 
				+	pages and wake kcompactd to compact memory so that THP is
			
 
				+	available in the near future.
			
 
				+
			
 
				+madvise
			
 
				+	will enter direct reclaim like ``always`` but only for regions
			
 
				+	that have used madvise(MADV_HUGEPAGE). This is the default
			
 
				+	behaviour.
			
 
				+
			
 
				+never
			
 
				+	should be self-explanatory.
			
 
				+
			
 
				+By default kernel tries to use huge zero page on read page fault to
			
 
				+anonymous mapping. It's possible to disable huge zero page by writing 0
			
 
				+or enable it back by writing 1::
			
 
				+
			
 
				+	echo 0 >/sys/kernel/mm/transparent_hugepage/use_zero_page
			
 
				+	echo 1 >/sys/kernel/mm/transparent_hugepage/use_zero_page
			
 
				+
			
 
				+Some userspace (such as a test program, or an optimized memory allocation
			
 
				+library) may want to know the size (in bytes) of a transparent hugepage::
			
 
				+
			
 
				+	cat /sys/kernel/mm/transparent_hugepage/hpage_pmd_size
			
 
				+
			
 
				+khugepaged will be automatically started when
			
 
				+transparent_hugepage/enabled is set to "always" or "madvise", and it'll
			
 
				+be automatically shutdown if it's set to "never".
			
 
				+
			
 
				+Khugepaged controls
			
 
				+-------------------
			
 
				+
			
 
				+khugepaged runs usually at low frequency so while one may not want to
			
 
				+invoke defrag algorithms synchronously during the page faults, it
			
 
				+should be worth invoking defrag at least in khugepaged. However it's
			
 
				+also possible to disable defrag in khugepaged by writing 0 or enable
			
 
				+defrag in khugepaged by writing 1::
			
 
				+
			
 
				+	echo 0 >/sys/kernel/mm/transparent_hugepage/khugepaged/defrag
			
 
				+	echo 1 >/sys/kernel/mm/transparent_hugepage/khugepaged/defrag
			
 
				+
			
 
				+You can also control how many pages khugepaged should scan at each
			
 
				+pass::
			
 
				+
			
 
				+	/sys/kernel/mm/transparent_hugepage/khugepaged/pages_to_scan
			
 
				+
			
 
				+and how many milliseconds to wait in khugepaged between each pass (you
			
 
				+can set this to 0 to run khugepaged at 100% utilization of one core)::
			
 
				+
			
 
				+	/sys/kernel/mm/transparent_hugepage/khugepaged/scan_sleep_millisecs
			
 
				+
			
 
				+and how many milliseconds to wait in khugepaged if there's a hugepage
			
 
				+allocation failure to throttle the next allocation attempt::
			
 
				+
			
 
				+	/sys/kernel/mm/transparent_hugepage/khugepaged/alloc_sleep_millisecs
			
 
				+
			
 
				+The khugepaged progress can be seen in the number of pages collapsed::
			
 
				+
			
 
				+	/sys/kernel/mm/transparent_hugepage/khugepaged/pages_collapsed
			
 
				+
			
 
				+for each pass::
			
 
				+
			
 
				+	/sys/kernel/mm/transparent_hugepage/khugepaged/full_scans
			
 
				+
			
 
				+``max_ptes_none`` specifies how many extra small pages (that are
			
 
				+not already mapped) can be allocated when collapsing a group
			
 
				+of small pages into one large page::
			
 
				+
			
 
				+	/sys/kernel/mm/transparent_hugepage/khugepaged/max_ptes_none
			
 
				+
			
 
				+A higher value leads to programs using additional memory.
			
 
				+A lower value leads to a smaller thp performance gain. Value of
			
 
				+max_ptes_none can waste cpu time very little, you can
			
 
				+ignore it.
			
 
				+
			
 
				+``max_ptes_swap`` specifies how many pages can be brought in from
			
 
				+swap when collapsing a group of pages into a transparent huge page::
			
 
				+
			
 
				+	/sys/kernel/mm/transparent_hugepage/khugepaged/max_ptes_swap
			
 
				+
			
 
				+A higher value can cause excessive swap IO and waste
			
 
				+memory. A lower value can prevent THPs from being
			
 
				+collapsed, resulting in fewer pages being collapsed into
			
 
				+THPs, and lower memory access performance.
			
 
				+
			
 
				+Boot parameter
			
 
				+==============
			
 
				+
			
 
				+You can change the sysfs boot time defaults of Transparent Hugepage
			
 
				+Support by passing the parameter ``transparent_hugepage=always`` or
			
 
				+``transparent_hugepage=madvise`` or ``transparent_hugepage=never``
			
 
				+to the kernel command line.
			
 
				+
			
 
				+Hugepages in tmpfs/shmem
			
 
				+========================
			
 
				+
			
 
				+You can control hugepage allocation policy in tmpfs with mount option
			
 
				+``huge=``. It can have the following values:
			
 
				+
			
 
				+always
			
 
				+    Attempt to allocate huge pages every time we need a new page;
			
 
				+
			
 
				+never
			
 
				+    Do not allocate huge pages;
			
 
				+
			
 
				+within_size
			
 
				+    Only allocate huge page if it will be fully within i_size.
			
 
				+    Also respect fadvise()/madvise() hints;
			
 
				+
			
 
				+advise
			
 
				+    Only allocate huge pages if requested with fadvise()/madvise();
			
 
				+
			
 
				+The default policy is ``never``.
			
 
				+
			
 
				+``mount -o remount,huge= /mountpoint`` works fine after mount: remounting
			
 
				+``huge=never`` will not attempt to break up huge pages at all, just stop more
			
 
				+from being allocated.
			
 
				+
			
 
				+There's also a sysfs knob to control hugepage allocation policy for internal
			
 
				+shmem mount: /sys/kernel/mm/transparent_hugepage/shmem_enabled. The mount
			
 
				+is used for SysV SHM, memfds, shared anonymous mmaps (of /dev/zero or
			
 
				+MAP_ANONYMOUS), GPU drivers' DRM objects, Ashmem.
			
 
				+
			
 
				+In addition to policies listed above, shmem_enabled allows two further
			
 
				+values:
			
 
				+
			
 
				+deny
			
 
				+    For use in emergencies, to force the huge option off from
			
 
				+    all mounts;
			
 
				+force
			
 
				+    Force the huge option on for all - very useful for testing;
			
 
				+
			
 
				+Need of application restart
			
 
				+===========================
			
 
				+
			
 
				+The transparent_hugepage/enabled values and tmpfs mount option only affect
			
 
				+future behavior. So to make them effective you need to restart any
			
 
				+application that could have been using hugepages. This also applies to the
			
 
				+regions registered in khugepaged.
			
 
				+
			
 
				+Monitoring usage
			
 
				+================
			
 
				+
			
 
				+The number of anonymous transparent huge pages currently used by the
			
 
				+system is available by reading the AnonHugePages field in ``/proc/meminfo``.
			
 
				+To identify what applications are using anonymous transparent huge pages,
			
 
				+it is necessary to read ``/proc/PID/smaps`` and count the AnonHugePages fields
			
 
				+for each mapping.
			
 
				+
			
 
				+The number of file transparent huge pages mapped to userspace is available
			
 
				+by reading ShmemPmdMapped and ShmemHugePages fields in ``/proc/meminfo``.
			
 
				+To identify what applications are mapping file transparent huge pages, it
			
 
				+is necessary to read ``/proc/PID/smaps`` and count the FileHugeMapped fields
			
 
				+for each mapping.
			
 
				+
			
 
				+Note that reading the smaps file is expensive and reading it
			
 
				+frequently will incur overhead.
			
 
				+
			
 
				+There are a number of counters in ``/proc/vmstat`` that may be used to
			
 
				+monitor how successfully the system is providing huge pages for use.
			
 
				+
			
 
				+thp_fault_alloc
			
 
				+	is incremented every time a huge page is successfully
			
 
				+	allocated to handle a page fault. This applies to both the
			
 
				+	first time a page is faulted and for COW faults.
			
 
				+
			
 
				+thp_collapse_alloc
			
 
				+	is incremented by khugepaged when it has found
			
 
				+	a range of pages to collapse into one huge page and has
			
 
				+	successfully allocated a new huge page to store the data.
			
 
				+
			
 
				+thp_fault_fallback
			
 
				+	is incremented if a page fault fails to allocate
			
 
				+	a huge page and instead falls back to using small pages.
			
 
				+
			
 
				+thp_collapse_alloc_failed
			
 
				+	is incremented if khugepaged found a range
			
 
				+	of pages that should be collapsed into one huge page but failed
			
 
				+	the allocation.
			
 
				+
			
 
				+thp_file_alloc
			
 
				+	is incremented every time a file huge page is successfully
			
 
				+	allocated.
			
 
				+
			
 
				+thp_file_mapped
			
 
				+	is incremented every time a file huge page is mapped into
			
 
				+	user address space.
			
 
				+
			
 
				+thp_split_page
			
 
				+	is incremented every time a huge page is split into base
			
 
				+	pages. This can happen for a variety of reasons but a common
			
 
				+	reason is that a huge page is old and is being reclaimed.
			
 
				+	This action implies splitting all PMDs the page is mapped with.
			
 
				+
			
 
				+thp_split_page_failed
			
 
				+	is incremented if kernel fails to split huge
			
 
				+	page. This can happen if the page was pinned by somebody.
			
 
				+
			
 
				+thp_deferred_split_page
			
 
				+	is incremented when a huge page is put onto split
			
 
				+	queue. This happens when a huge page is partially unmapped and
			
 
				+	splitting it would free up some memory. Pages on split queue are
			
 
				+	going to be split under memory pressure.
			
 
				+
			
 
				+thp_split_pmd
			
 
				+	is incremented every time a PMD is split into a table of PTEs.
			
 
				+	This can happen, for instance, when application calls mprotect() or
			
 
				+	munmap() on part of huge page. It doesn't split huge page, only
			
 
				+	page table entry.
			
 
				+
			
 
				+thp_zero_page_alloc
			
 
				+	is incremented every time a huge zero page is
			
 
				+	successfully allocated. It includes allocations which were
			
 
				+	dropped due to a race with another allocation. Note, it doesn't count
			
 
				+	every map of the huge zero page, only its allocation.
			
 
				+
			
 
				+thp_zero_page_alloc_failed
			
 
				+	is incremented if kernel fails to allocate
			
 
				+	huge zero page and falls back to using small pages.
			
 
				+
			
 
				+thp_swpout
			
 
				+	is incremented every time a huge page is swapped out in one
			
 
				+	piece without splitting.
			
 
				+
			
 
				+thp_swpout_fallback
			
 
				+	is incremented if a huge page has to be split before swapout.
			
 
				+	Usually because it failed to allocate some continuous swap space
			
 
				+	for the huge page.
			
 
				+
			
 
				+As the system ages, allocating huge pages may be expensive as the
			
 
				+system uses memory compaction to copy data around memory to free a
			
 
				+huge page for use. There are some counters in ``/proc/vmstat`` to help
			
 
				+monitor this overhead.
			
 
				+
			
 
				+compact_stall
			
 
				+	is incremented every time a process stalls to run
			
 
				+	memory compaction so that a huge page is free for use.
			
 
				+
			
 
				+compact_success
			
 
				+	is incremented if the system compacted memory and
			
 
				+	freed a huge page for use.
			
 
				+
			
 
				+compact_fail
			
 
				+	is incremented if the system tries to compact memory
			
 
				+	but failed.
			
 
				+
			
 
				+compact_pages_moved
			
 
				+	is incremented each time a page is moved. If
			
 
				+	this value is increasing rapidly, it implies that the system
			
 
				+	is copying a lot of data to satisfy the huge page allocation.
			
 
				+	It is possible that the cost of copying exceeds any savings
			
 
				+	from reduced TLB misses.
			
 
				+
			
 
				+compact_pagemigrate_failed
			
 
				+	is incremented when the underlying mechanism
			
 
				+	for moving a page failed.
			
 
				+
			
 
				+compact_blocks_moved
			
 
				+	is incremented each time memory compaction examines
			
 
				+	a huge page aligned range of pages.
			
 
				+
			
 
				+It is possible to establish how long the stalls were using the function
			
 
				+tracer to record how long was spent in __alloc_pages_nodemask and
			
 
				+using the mm_page_alloc tracepoint to identify which allocations were
			
 
				+for huge pages.
			
 
				+
			
 
				+Optimizing the applications
			
 
				+===========================
			
 
				+
			
 
				+To be guaranteed that the kernel will map a 2M page immediately in any
			
 
				+memory region, the mmap region has to be hugepage naturally
			
 
				+aligned. posix_memalign() can provide that guarantee.
			
 
				+
			
 
				+Hugetlbfs
			
 
				+=========
			
 
				+
			
 
				+You can use hugetlbfs on a kernel that has transparent hugepage
			
 
				+support enabled just fine as always. No difference can be noted in
			
 
				+hugetlbfs other than there will be less overall fragmentation. All
			
 
				+usual features belonging to hugetlbfs are preserved and
			
 
				+unaffected. libhugetlbfs will also work fine as usual.
			
--- a/Documentation/admin-guide/mm/userfaultfd.rst
+++ b/Documentation/admin-guide/mm/userfaultfd.rst
@@ -0,0 +1,241 @@
 
				+.. _userfaultfd:
			
 
				+
			
 
				+===========
			
 
				+Userfaultfd
			
 
				+===========
			
 
				+
			
 
				+Objective
			
 
				+=========
			
 
				+
			
 
				+Userfaults allow the implementation of on-demand paging from userland
			
 
				+and more generally they allow userland to take control of various
			
 
				+memory page faults, something otherwise only the kernel code could do.
			
 
				+
			
 
				+For example userfaults allow a proper and more optimal implementation
			
 
				+of the PROT_NONE+SIGSEGV trick.
			
 
				+
			
 
				+Design
			
 
				+======
			
 
				+
			
 
				+Userfaults are delivered and resolved through the userfaultfd syscall.
			
 
				+
			
 
				+The userfaultfd (aside from registering and unregistering virtual
			
 
				+memory ranges) provides two primary functionalities:
			
 
				+
			
 
				+1) read/POLLIN protocol to notify a userland thread of the faults
			
 
				+   happening
			
 
				+
			
 
				+2) various UFFDIO_* ioctls that can manage the virtual memory regions
			
 
				+   registered in the userfaultfd that allows userland to efficiently
			
 
				+   resolve the userfaults it receives via 1) or to manage the virtual
			
 
				+   memory in the background
			
 
				+
			
 
				+The real advantage of userfaults if compared to regular virtual memory
			
 
				+management of mremap/mprotect is that the userfaults in all their
			
 
				+operations never involve heavyweight structures like vmas (in fact the
			
 
				+userfaultfd runtime load never takes the mmap_sem for writing).
			
 
				+
			
 
				+Vmas are not suitable for page- (or hugepage) granular fault tracking
			
 
				+when dealing with virtual address spaces that could span
			
 
				+Terabytes. Too many vmas would be needed for that.
			
 
				+
			
 
				+The userfaultfd once opened by invoking the syscall, can also be
			
 
				+passed using unix domain sockets to a manager process, so the same
			
 
				+manager process could handle the userfaults of a multitude of
			
 
				+different processes without them being aware about what is going on
			
 
				+(well of course unless they later try to use the userfaultfd
			
 
				+themselves on the same region the manager is already tracking, which
			
 
				+is a corner case that would currently return -EBUSY).
			
 
				+
			
 
				+API
			
 
				+===
			
 
				+
			
 
				+When first opened the userfaultfd must be enabled invoking the
			
 
				+UFFDIO_API ioctl specifying a uffdio_api.api value set to UFFD_API (or
			
 
				+a later API version) which will specify the read/POLLIN protocol
			
 
				+userland intends to speak on the UFFD and the uffdio_api.features
			
 
				+userland requires. The UFFDIO_API ioctl if successful (i.e. if the
			
 
				+requested uffdio_api.api is spoken also by the running kernel and the
			
 
				+requested features are going to be enabled) will return into
			
 
				+uffdio_api.features and uffdio_api.ioctls two 64bit bitmasks of
			
 
				+respectively all the available features of the read(2) protocol and
			
 
				+the generic ioctl available.
			
 
				+
			
 
				+The uffdio_api.features bitmask returned by the UFFDIO_API ioctl
			
 
				+defines what memory types are supported by the userfaultfd and what
			
 
				+events, except page fault notifications, may be generated.
			
 
				+
			
 
				+If the kernel supports registering userfaultfd ranges on hugetlbfs
			
 
				+virtual memory areas, UFFD_FEATURE_MISSING_HUGETLBFS will be set in
			
 
				+uffdio_api.features. Similarly, UFFD_FEATURE_MISSING_SHMEM will be
			
 
				+set if the kernel supports registering userfaultfd ranges on shared
			
 
				+memory (covering all shmem APIs, i.e. tmpfs, IPCSHM, /dev/zero
			
 
				+MAP_SHARED, memfd_create, etc).
			
 
				+
			
 
				+The userland application that wants to use userfaultfd with hugetlbfs
			
 
				+or shared memory needs to set the corresponding flag in
			
 
				+uffdio_api.features to enable those features.
			
 
				+
			
 
				+If the userland desires to receive notifications for events other than
			
 
				+page faults, it has to verify that uffdio_api.features has appropriate
			
 
				+UFFD_FEATURE_EVENT_* bits set. These events are described in more
			
 
				+detail below in the "Non-cooperative userfaultfd" section.
			
 
				+
			
 
				+Once the userfaultfd has been enabled the UFFDIO_REGISTER ioctl should
			
 
				+be invoked (if present in the returned uffdio_api.ioctls bitmask) to
			
 
				+register a memory range in the userfaultfd by setting the
			
 
				+uffdio_register structure accordingly. The uffdio_register.mode
			
 
				+bitmask will specify to the kernel which kind of faults to track for
			
 
				+the range (UFFDIO_REGISTER_MODE_MISSING would track missing
			
 
				+pages). The UFFDIO_REGISTER ioctl will return the
			
 
				+uffdio_register.ioctls bitmask of ioctls that are suitable to resolve
			
 
				+userfaults on the range registered. Not all ioctls will necessarily be
			
 
				+supported for all memory types depending on the underlying virtual
			
 
				+memory backend (anonymous memory vs tmpfs vs real filebacked
			
 
				+mappings).
			
 
				+
			
 
				+Userland can use the uffdio_register.ioctls to manage the virtual
			
 
				+address space in the background (to add or potentially also remove
			
 
				+memory from the userfaultfd registered range). This means a userfault
			
 
				+could be triggering just before userland maps in the background the
			
 
				+user-faulted page.
			
 
				+
			
 
				+The primary ioctl to resolve userfaults is UFFDIO_COPY. That
			
 
				+atomically copies a page into the userfault registered range and wakes
			
 
				+up the blocked userfaults (unless uffdio_copy.mode &
			
 
				+UFFDIO_COPY_MODE_DONTWAKE is set). Other ioctls work similarly to
			
 
				+UFFDIO_COPY. They're atomic as in guaranteeing that nothing can see a
			
 
				+half copied page since it'll keep userfaulting until the copy has
			
 
				+finished.
			
 
				+
			
 
				+QEMU/KVM
			
 
				+========
			
 
				+
			
 
				+QEMU/KVM is using the userfaultfd syscall to implement postcopy live
			
 
				+migration. Postcopy live migration is one form of memory
			
 
				+externalization consisting of a virtual machine running with part or
			
 
				+all of its memory residing on a different node in the cloud. The
			
 
				+userfaultfd abstraction is generic enough that not a single line of
			
 
				+KVM kernel code had to be modified in order to add postcopy live
			
 
				+migration to QEMU.
			
 
				+
			
 
				+Guest async page faults, FOLL_NOWAIT and all other GUP features work
			
 
				+just fine in combination with userfaults. Userfaults trigger async
			
 
				+page faults in the guest scheduler so those guest processes that
			
 
				+aren't waiting for userfaults (i.e. network bound) can keep running in
			
 
				+the guest vcpus.
			
 
				+
			
 
				+It is generally beneficial to run one pass of precopy live migration
			
 
				+just before starting postcopy live migration, in order to avoid
			
 
				+generating userfaults for readonly guest regions.
			
 
				+
			
 
				+The implementation of postcopy live migration currently uses one
			
 
				+single bidirectional socket but in the future two different sockets
			
 
				+will be used (to reduce the latency of the userfaults to the minimum
			
 
				+possible without having to decrease /proc/sys/net/ipv4/tcp_wmem).
			
 
				+
			
 
				+The QEMU in the source node writes all pages that it knows are missing
			
 
				+in the destination node, into the socket, and the migration thread of
			
 
				+the QEMU running in the destination node runs UFFDIO_COPY|ZEROPAGE
			
 
				+ioctls on the userfaultfd in order to map the received pages into the
			
 
				+guest (UFFDIO_ZEROPAGE is used if the source page was a zero page).
			
 
				+
			
 
				+A different postcopy thread in the destination node listens with
			
 
				+poll() to the userfaultfd in parallel. When a POLLIN event is
			
 
				+generated after a userfault triggers, the postcopy thread read() from
			
 
				+the userfaultfd and receives the fault address (or -EAGAIN in case the
			
 
				+userfault was already resolved and woken by a UFFDIO_COPY|ZEROPAGE run
			
 
				+by the parallel QEMU migration thread).
			
 
				+
			
 
				+After the QEMU postcopy thread (running in the destination node) gets
			
 
				+the userfault address it writes the information about the missing page
			
 
				+into the socket. The QEMU source node receives the information and
			
 
				+roughly "seeks" to that page address and continues sending all
			
 
				+remaining missing pages from that new page offset. Soon after that
			
 
				+(just the time to flush the tcp_wmem queue through the network) the
			
 
				+migration thread in the QEMU running in the destination node will
			
 
				+receive the page that triggered the userfault and it'll map it as
			
 
				+usual with the UFFDIO_COPY|ZEROPAGE (without actually knowing if it
			
 
				+was spontaneously sent by the source or if it was an urgent page
			
 
				+requested through a userfault).
			
 
				+
			
 
				+By the time the userfaults start, the QEMU in the destination node
			
 
				+doesn't need to keep any per-page state bitmap relative to the live
			
 
				+migration around and a single per-page bitmap has to be maintained in
			
 
				+the QEMU running in the source node to know which pages are still
			
 
				+missing in the destination node. The bitmap in the source node is
			
 
				+checked to find which missing pages to send in round robin and we seek
			
 
				+over it when receiving incoming userfaults. After sending each page of
			
 
				+course the bitmap is updated accordingly. It's also useful to avoid
			
 
				+sending the same page twice (in case the userfault is read by the
			
 
				+postcopy thread just before UFFDIO_COPY|ZEROPAGE runs in the migration
			
 
				+thread).
			
 
				+
			
 
				+Non-cooperative userfaultfd
			
 
				+===========================
			
 
				+
			
 
				+When the userfaultfd is monitored by an external manager, the manager
			
 
				+must be able to track changes in the process virtual memory
			
 
				+layout. Userfaultfd can notify the manager about such changes using
			
 
				+the same read(2) protocol as for the page fault notifications. The
			
 
				+manager has to explicitly enable these events by setting appropriate
			
 
				+bits in uffdio_api.features passed to UFFDIO_API ioctl:
			
 
				+
			
 
				+UFFD_FEATURE_EVENT_FORK
			
 
				+	enable userfaultfd hooks for fork(). When this feature is
			
 
				+	enabled, the userfaultfd context of the parent process is
			
 
				+	duplicated into the newly created process. The manager
			
 
				+	receives UFFD_EVENT_FORK with file descriptor of the new
			
 
				+	userfaultfd context in the uffd_msg.fork.
			
 
				+
			
 
				+UFFD_FEATURE_EVENT_REMAP
			
 
				+	enable notifications about mremap() calls. When the
			
 
				+	non-cooperative process moves a virtual memory area to a
			
 
				+	different location, the manager will receive
			
 
				+	UFFD_EVENT_REMAP. The uffd_msg.remap will contain the old and
			
 
				+	new addresses of the area and its original length.
			
 
				+
			
 
				+UFFD_FEATURE_EVENT_REMOVE
			
 
				+	enable notifications about madvise(MADV_REMOVE) and
			
 
				+	madvise(MADV_DONTNEED) calls. The event UFFD_EVENT_REMOVE will
			
 
				+	be generated upon these calls to madvise. The uffd_msg.remove
			
 
				+	will contain start and end addresses of the removed area.
			
 
				+
			
 
				+UFFD_FEATURE_EVENT_UNMAP
			
 
				+	enable notifications about memory unmapping. The manager will
			
 
				+	get UFFD_EVENT_UNMAP with uffd_msg.remove containing start and
			
 
				+	end addresses of the unmapped area.
			
 
				+
			
 
				+Although the UFFD_FEATURE_EVENT_REMOVE and UFFD_FEATURE_EVENT_UNMAP
			
 
				+are pretty similar, they quite differ in the action expected from the
			
 
				+userfaultfd manager. In the former case, the virtual memory is
			
 
				+removed, but the area is not, the area remains monitored by the
			
 
				+userfaultfd, and if a page fault occurs in that area it will be
			
 
				+delivered to the manager. The proper resolution for such page fault is
			
 
				+to zeromap the faulting address. However, in the latter case, when an
			
 
				+area is unmapped, either explicitly (with munmap() system call), or
			
 
				+implicitly (e.g. during mremap()), the area is removed and in turn the
			
 
				+userfaultfd context for such area disappears too and the manager will
			
 
				+not get further userland page faults from the removed area. Still, the
			
 
				+notification is required in order to prevent manager from using
			
 
				+UFFDIO_COPY on the unmapped area.
			
 
				+
			
 
				+Unlike userland page faults which have to be synchronous and require
			
 
				+explicit or implicit wakeup, all the events are delivered
			
 
				+asynchronously and the non-cooperative process resumes execution as
			
 
				+soon as manager executes read(). The userfaultfd manager should
			
 
				+carefully synchronize calls to UFFDIO_COPY with the events
			
 
				+processing. To aid the synchronization, the UFFDIO_COPY ioctl will
			
 
				+return -ENOSPC when the monitored process exits at the time of
			
 
				+UFFDIO_COPY, and -ENOENT, when the non-cooperative process has changed
			
 
				+its virtual memory layout simultaneously with outstanding UFFDIO_COPY
			
 
				+operation.
			
 
				+
			
 
				+The current asynchronous model of the event delivery is optimal for
			
 
				+single threaded non-cooperative userfaultfd manager implementations. A
			
 
				+synchronous event delivery model can be added later as a new
			
 
				+userfaultfd feature to facilitate multithreading enhancements of the
			
 
				+non-cooperative manager, for example to allow UFFDIO_COPY ioctls to
			
 
				+run in parallel to the event reception. Single threaded
			
 
				+implementations should continue to use the current async event
			
 
				+delivery model instead.
			
--- a/Documentation/admin-guide/ramoops.rst
+++ b/Documentation/admin-guide/ramoops.rst
@@ -61,7 +61,7 @@ Setting the ramoops parameters can be done in several different manners:
 
				 	mem=128M ramoops.mem_address=0x8000000 ramoops.ecc=1
			
 
				 
			
 
				  B. Use Device Tree bindings, as described in
			
 
				- ``Documentation/device-tree/bindings/reserved-memory/admin-guide/ramoops.rst``.
			
 
				+ ``Documentation/devicetree/bindings/reserved-memory/ramoops.txt``.
			
 
				  For example::
			
 
				 
			
 
				 	reserved-memory {
			
--- a/Documentation/arm/Marvell/README
+++ b/Documentation/arm/Marvell/README
@@ -302,19 +302,15 @@ Berlin family (Multimedia Solutions)
 
				 	88DE3010, Armada 1000 (no Linux support)
			
 
				 		Core:		Marvell PJ1 (ARMv5TE), Dual-core
			
 
				 		Product Brief:	http://www.marvell.com.cn/digital-entertainment/assets/armada_1000_pb.pdf
			
 
				-	88DE3005, Armada 1500-mini
			
 
				 	88DE3005, Armada 1500 Mini
			
 
				 		Design name:	BG2CD
			
 
				 		Core:		ARM Cortex-A9, PL310 L2CC
			
 
				-		Homepage:	http://www.marvell.com/multimedia-solutions/armada-1500-mini/
			
 
				-        88DE3006, Armada 1500 Mini Plus
			
 
				-                Design name:    BG2CDP
			
 
				-                Core:           Dual Core ARM Cortex-A7
			
 
				-                Homepage:       http://www.marvell.com/multimedia-solutions/armada-1500-mini-plus/
			
 
				+	88DE3006, Armada 1500 Mini Plus
			
 
				+		Design name:	BG2CDP
			
 
				+		Core:		Dual Core ARM Cortex-A7
			
 
				 	88DE3100, Armada 1500
			
 
				 		Design name:	BG2
			
 
				 		Core:		Marvell PJ4B-MP (ARMv7), Tauros3 L2CC
			
 
				-		Product Brief:	http://www.marvell.com/digital-entertainment/armada-1500/assets/Marvell-ARMADA-1500-Product-Brief.pdf
			
 
				 	88DE3114, Armada 1500 Pro
			
 
				 		Design name:	BG2Q
			
 
				 		Core:		Quad Core ARM Cortex-A9, PL310 L2CC
			
@@ -324,13 +320,16 @@ Berlin family (Multimedia Solutions)
 
				 	88DE3218, ARMADA 1500 Ultra
			
 
				 		Core:		ARM Cortex-A53
			
 
				 
			
 
				-  Homepage: http://www.marvell.com/multimedia-solutions/
			
 
				+  Homepage: https://www.synaptics.com/products/multimedia-solutions
			
 
				   Directory: arch/arm/mach-berlin
			
 
				 
			
 
				   Comments:
			
 
				+
			
 
				    * This line of SoCs is based on Marvell Sheeva or ARM Cortex CPUs
			
 
				      with Synopsys DesignWare (IRQ, GPIO, Timers, ...) and PXA IP (SDHCI, USB, ETH, ...).
			
 
				 
			
 
				+   * The Berlin family was acquired by Synaptics from Marvell in 2017.
			
 
				+
			
 
				 CPU Cores
			
 
				 ---------
			
 
				 
			
--- a/Documentation/arm/OMAP/README
+++ b/Documentation/arm/OMAP/README
@@ -5,3 +5,7 @@ KERNEL		NEW DEPENDENCIES
 
				 v4.3+		Update is needed for custom .config files to make sure
			
 
				 		CONFIG_REGULATOR_PBIAS is enabled for MMC1 to work
			
 
				 		properly.
			
 
				+
			
 
				+v4.18+		Update is needed for custom .config files to make sure
			
 
				+		CONFIG_MMC_SDHCI_OMAP is enabled for all MMC instances
			
 
				+		to work in DRA7 and K2G based boards.
			
--- a/Documentation/misc-devices/lcd-panel-cgram.txt
+++ b/Documentation/misc-devices/lcd-panel-cgram.txt
--- a/Documentation/block/biodoc.txt
+++ b/Documentation/block/biodoc.txt
@@ -752,18 +752,6 @@ completion of the request to the block layer. This means ending tag
 
				 operations before calling end_that_request_last()! For an example of a user
			
 
				 of these helpers, see the IDE tagged command queueing support.
			
 
				 
			
 
				-Certain hardware conditions may dictate a need to invalidate the block tag
			
 
				-queue. For instance, on IDE any tagged request error needs to clear both
			
 
				-the hardware and software block queue and enable the driver to sanely restart
			
 
				-all the outstanding requests. There's a third helper to do that:
			
 
				-
			
 
				-	blk_queue_invalidate_tags(struct request_queue *q)
			
 
				-
			
 
				-	Clear the internal block tag queue and re-add all the pending requests
			
 
				-	to the request queue. The driver will receive them again on the
			
 
				-	next request_fn run, just like it did the first time it encountered
			
 
				-	them.
			
 
				-
			
 
				 3.2.5.2 Tag info
			
 
				 
			
 
				 Some block functions exist to query current tag status or to go from a
			
@@ -805,8 +793,7 @@ Internally, block manages tags in the blk_queue_tag structure:
 
				 Most of the above is simple and straight forward, however busy_list may need
			
 
				 a bit of explaining. Normally we don't care too much about request ordering,
			
 
				 but in the event of any barrier requests in the tag queue we need to ensure
			
 
				-that requests are restarted in the order they were queue. This may happen
			
 
				-if the driver needs to use blk_queue_invalidate_tags().
			
 
				+that requests are restarted in the order they were queued.
			
 
				 
			
 
				 3.3 I/O Submission
			
 
				 
			
--- a/Documentation/block/cmdline-partition.txt
+++ b/Documentation/block/cmdline-partition.txt
@@ -1,7 +1,9 @@
 
				 Embedded device command line partition parsing
			
 
				 =====================================================================
			
 
				 
			
 
				-Support for reading the block device partition table from the command line.
			
 
				+The "blkdevparts" command line option adds support for reading the
			
 
				+block device partition table from the kernel command line.
			
 
				+
			
 
				 It is typically used for fixed block (eMMC) embedded devices.
			
 
				 It has no MBR, so saves storage space. Bootloader can be easily accessed
			
 
				 by absolute address of data on the block device.
			
@@ -14,22 +16,27 @@ blkdevparts=<blkdev-def>[;<blkdev-def>]
 
				     <partdef> := <size>[@<offset>](part-name)
			
 
				 
			
 
				 <blkdev-id>
			
 
				-    block device disk name, embedded device used fixed block device,
			
 
				-    it's disk name also fixed. such as: mmcblk0, mmcblk1, mmcblk0boot0.
			
 
				+    block device disk name. Embedded device uses fixed block device.
			
 
				+    Its disk name is also fixed, such as: mmcblk0, mmcblk1, mmcblk0boot0.
			
 
				 
			
 
				 <size>
			
 
				     partition size, in bytes, such as: 512, 1m, 1G.
			
 
				+    size may contain an optional suffix of (upper or lower case):
			
 
				+      K, M, G, T, P, E.
			
 
				+    "-" is used to denote all remaining space.
			
 
				 
			
 
				 <offset>
			
 
				     partition start address, in bytes.
			
 
				+    offset may contain an optional suffix of (upper or lower case):
			
 
				+      K, M, G, T, P, E.
			
 
				 
			
 
				 (part-name)
			
 
				-    partition name, kernel send uevent with "PARTNAME". application can create
			
 
				-    a link to block device partition with the name "PARTNAME".
			
 
				-    user space application can access partition by partition name.
			
 
				+    partition name. Kernel sends uevent with "PARTNAME". Application can
			
 
				+    create a link to block device partition with the name "PARTNAME".
			
 
				+    User space application can access partition by partition name.
			
 
				 
			
 
				 Example:
			
 
				-    eMMC disk name is "mmcblk0" and "mmcblk0boot0"
			
 
				+    eMMC disk names are "mmcblk0" and "mmcblk0boot0".
			
 
				 
			
 
				   bootargs:
			
 
				     'blkdevparts=mmcblk0:1G(data0),1G(data1),-;mmcblk0boot0:1m(boot),-(kernel)'
			
--- a/Documentation/block/null_blk.txt
+++ b/Documentation/block/null_blk.txt
@@ -71,13 +71,16 @@ use_per_node_hctx=[0/1]: Default: 0
 
				   1: The multi-queue block layer is instantiated with a hardware dispatch
			
 
				      queue for each CPU node in the system.
			
 
				 
			
 
				-use_lightnvm=[0/1]: Default: 0
			
 
				-  Register device with LightNVM. Requires blk-mq and CONFIG_NVM to be enabled.
			
 
				-
			
 
				 no_sched=[0/1]: Default: 0
			
 
				   0: nullb* use default blk-mq io scheduler.
			
 
				   1: nullb* doesn't use io scheduler.
			
 
				 
			
 
				+blocking=[0/1]: Default: 0
			
 
				+  0: Register as a non-blocking blk-mq driver device.
			
 
				+  1: Register as a blocking blk-mq driver device, null_blk will set
			
 
				+     the BLK_MQ_F_BLOCKING flag, indicating that it sometimes/always
			
 
				+     needs to block in its ->queue_rq() function.
			
 
				+
			
 
				 shared_tags=[0/1]: Default: 0
			
 
				   0: Tag set is not shared.
			
 
				   1: Tag set shared between devices for blk-mq. Only makes sense with
			
--- a/Documentation/blockdev/zram.txt
+++ b/Documentation/blockdev/zram.txt
@@ -218,6 +218,7 @@ line of text and contains the following stats separated by whitespace:
 
				  same_pages       the number of same element filled pages written to this disk.
			
 
				                   No memory is allocated for such pages.
			
 
				  pages_compacted  the number of pages freed during compaction
			
 
				+ huge_pages	  the number of incompressible pages
			
 
				 
			
 
				 9) Deactivate:
			
 
				 	swapoff /dev/zram0
			
@@ -242,5 +243,29 @@ to backing storage rather than keeping it in memory.
 
				 User should set up backing device via /sys/block/zramX/backing_dev
			
 
				 before disksize setting.
			
 
				 
			
 
				+= memory tracking
			
 
				+
			
 
				+With CONFIG_ZRAM_MEMORY_TRACKING, user can know information of the
			
 
				+zram block. It could be useful to catch cold or incompressible
			
 
				+pages of the process with pagemap.
			
 
				+If you enable the feature, you could see block state via
			
 
				+"/sys/kernel/debug/zram/zram0/block_state". The output is as follows,
			
 
				+
			
 
				+	  300    75.033841 .wh
			
 
				+	  301    63.806904 s..
			
 
				+	  302    63.806919 ..h
			
 
				+
			
 
				+First column is zram's block index.
			
 
				+Second column is access time since the system was booted.
			
 
				+Third column is state of the block.
			
 
				+(s: same page
			
 
				+w: written page to backing store
			
 
				+h: huge page)
			
 
				+
			
 
				+First line of above example says 300th block is accessed at 75.033841sec
			
 
				+and the block's state is huge so it is written back to the backing
			
 
				+storage. It's a debugging feature, so no one should rely on it to work
			
 
				+properly.
			
 
				+
			
 
				 Nitin Gupta
			
 
				 ngupta@vflare.org
			
--- a/Documentation/bpf/README.rst
+++ b/Documentation/bpf/README.rst
@@ -0,0 +1,36 @@
 
				+=================
			
 
				+BPF documentation
			
 
				+=================
			
 
				+
			
 
				+This directory contains documentation for the BPF (Berkeley Packet
			
 
				+Filter) facility, with a focus on the extended BPF version (eBPF).
			
 
				+
			
 
				+This kernel side documentation is still work in progress.  The main
			
 
				+textual documentation is (for historical reasons) described in
			
 
				+`Documentation/networking/filter.txt`_, which describes both classical
			
 
				+and extended BPF instruction-set.
			
 
				+The Cilium project also maintains a `BPF and XDP Reference Guide`_
			
 
				+that goes into great technical depth about the BPF Architecture.
			
 
				+
			
 
				+The primary info for the bpf syscall is available in the `man-pages`_
			
 
				+for `bpf(2)`_.
			
 
				+
			
 
				+
			
 
				+
			
 
				+Frequently asked questions (FAQ)
			
 
				+================================
			
 
				+
			
 
				+Two sets of Questions and Answers (Q&A) are maintained.
			
 
				+
			
 
				+* QA for common questions about BPF see: bpf_design_QA_
			
 
				+
			
 
				+* QA for developers interacting with BPF subsystem: bpf_devel_QA_
			
 
				+
			
 
				+
			
 
				+.. Links:
			
 
				+.. _bpf_design_QA: bpf_design_QA.rst
			
 
				+.. _bpf_devel_QA:  bpf_devel_QA.rst
			
 
				+.. _Documentation/networking/filter.txt: ../networking/filter.txt
			
 
				+.. _man-pages: https://www.kernel.org/doc/man-pages/
			
 
				+.. _bpf(2): http://man7.org/linux/man-pages/man2/bpf.2.html
			
 
				+.. _BPF and XDP Reference Guide: http://cilium.readthedocs.io/en/latest/bpf/
			
--- a/Documentation/bpf/bpf_design_QA.rst
+++ b/Documentation/bpf/bpf_design_QA.rst
@@ -0,0 +1,221 @@
 
				+==============
			
 
				+BPF Design Q&A
			
 
				+==============
			
 
				+
			
 
				+BPF extensibility and applicability to networking, tracing, security
			
 
				+in the linux kernel and several user space implementations of BPF
			
 
				+virtual machine led to a number of misunderstandings on what BPF actually is.
			
 
				+This short QA is an attempt to address that and outline a direction
			
 
				+of where BPF is heading long term.
			
 
				+
			
 
				+.. contents::
			
 
				+    :local:
			
 
				+    :depth: 3
			
 
				+
			
 
				+Questions and Answers
			
 
				+=====================
			
 
				+
			
 
				+Q: Is BPF a generic instruction set similar to x64 and arm64?
			
 
				+-------------------------------------------------------------
			
 
				+A: NO.
			
 
				+
			
 
				+Q: Is BPF a generic virtual machine ?
			
 
				+-------------------------------------
			
 
				+A: NO.
			
 
				+
			
 
				+BPF is generic instruction set *with* C calling convention.
			
 
				+-----------------------------------------------------------
			
 
				+
			
 
				+Q: Why C calling convention was chosen?
			
 
				+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
			
 
				+
			
 
				+A: Because BPF programs are designed to run in the linux kernel
			
 
				+which is written in C, hence BPF defines instruction set compatible
			
 
				+with two most used architectures x64 and arm64 (and takes into
			
 
				+consideration important quirks of other architectures) and
			
 
				+defines calling convention that is compatible with C calling
			
 
				+convention of the linux kernel on those architectures.
			
 
				+
			
 
				+Q: can multiple return values be supported in the future?
			
 
				+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
			
 
				+A: NO. BPF allows only register R0 to be used as return value.
			
 
				+
			
 
				+Q: can more than 5 function arguments be supported in the future?
			
 
				+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
			
 
				+A: NO. BPF calling convention only allows registers R1-R5 to be used
			
 
				+as arguments. BPF is not a standalone instruction set.
			
 
				+(unlike x64 ISA that allows msft, cdecl and other conventions)
			
 
				+
			
 
				+Q: can BPF programs access instruction pointer or return address?
			
 
				+-----------------------------------------------------------------
			
 
				+A: NO.
			
 
				+
			
 
				+Q: can BPF programs access stack pointer ?
			
 
				+------------------------------------------
			
 
				+A: NO.
			
 
				+
			
 
				+Only frame pointer (register R10) is accessible.
			
 
				+From compiler point of view it's necessary to have stack pointer.
			
 
				+For example LLVM defines register R11 as stack pointer in its
			
 
				+BPF backend, but it makes sure that generated code never uses it.
			
 
				+
			
 
				+Q: Does C-calling convention diminish possible use cases?
			
 
				+-----------------------------------------------------------
			
 
				+A: YES.
			
 
				+
			
 
				+BPF design forces addition of major functionality in the form
			
 
				+of kernel helper functions and kernel objects like BPF maps with
			
 
				+seamless interoperability between them. It lets kernel call into
			
 
				+BPF programs and programs call kernel helpers with zero overhead.
			
 
				+As all of them were native C code. That is particularly the case
			
 
				+for JITed BPF programs that are indistinguishable from
			
 
				+native kernel C code.
			
 
				+
			
 
				+Q: Does it mean that 'innovative' extensions to BPF code are disallowed?
			
 
				+------------------------------------------------------------------------
			
 
				+A: Soft yes.
			
 
				+
			
 
				+At least for now until BPF core has support for
			
 
				+bpf-to-bpf calls, indirect calls, loops, global variables,
			
 
				+jump tables, read only sections and all other normal constructs
			
 
				+that C code can produce.
			
 
				+
			
 
				+Q: Can loops be supported in a safe way?
			
 
				+----------------------------------------
			
 
				+A: It's not clear yet.
			
 
				+
			
 
				+BPF developers are trying to find a way to
			
 
				+support bounded loops where the verifier can guarantee that
			
 
				+the program terminates in less than 4096 instructions.
			
 
				+
			
 
				+Instruction level questions
			
 
				+---------------------------
			
 
				+
			
 
				+Q: LD_ABS and LD_IND instructions vs C code
			
 
				+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
			
 
				+
			
 
				+Q: How come LD_ABS and LD_IND instructions are present in BPF whereas
			
 
				+C code cannot express them and has to use builtin intrinsics?
			
 
				+
			
 
				+A: This is an artifact of compatibility with classic BPF. Modern
			
 
				+networking code in BPF performs better without them.
			
 
				+See 'direct packet access'.
			
 
				+
			
 
				+Q: BPF instructions mapping not one-to-one to native CPU
			
 
				+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
			
 
				+Q: It seems not all BPF instructions are one-to-one to native CPU.
			
 
				+For example why BPF_JNE and other compare and jumps are not cpu-like?
			
 
				+
			
 
				+A: This was necessary to avoid introducing flags into ISA which are
			
 
				+impossible to make generic and efficient across CPU architectures.
			
 
				+
			
 
				+Q: why BPF_DIV instruction doesn't map to x64 div?
			
 
				+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
			
 
				+A: Because if we picked one-to-one relationship to x64 it would have made
			
 
				+it more complicated to support on arm64 and other archs. Also it
			
 
				+needs div-by-zero runtime check.
			
 
				+
			
 
				+Q: why there is no BPF_SDIV for signed divide operation?
			
 
				+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
			
 
				+A: Because it would be rarely used. llvm errors in such case and
			
 
				+prints a suggestion to use unsigned divide instead.
			
 
				+
			
 
				+Q: Why BPF has implicit prologue and epilogue?
			
 
				+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
			
 
				+A: Because architectures like sparc have register windows and in general
			
 
				+there are enough subtle differences between architectures, so naive
			
 
				+store return address into stack won't work. Another reason is BPF has
			
 
				+to be safe from division by zero (and legacy exception path
			
 
				+of LD_ABS insn). Those instructions need to invoke epilogue and
			
 
				+return implicitly.
			
 
				+
			
 
				+Q: Why BPF_JLT and BPF_JLE instructions were not introduced in the beginning?
			
 
				+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
			
 
				+A: Because classic BPF didn't have them and BPF authors felt that compiler
			
 
				+workaround would be acceptable. Turned out that programs lose performance
			
 
				+due to lack of these compare instructions and they were added.
			
 
				+These two instructions are a perfect example of what kind of new BPF
			
 
				+instructions are acceptable and can be added in the future.
			
 
				+These two already had equivalent instructions in native CPUs.
			
 
				+New instructions that don't have one-to-one mapping to HW instructions
			
 
				+will not be accepted.
			
 
				+
			
 
				+Q: BPF 32-bit subregister requirements
			
 
				+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
			
 
				+Q: BPF 32-bit subregisters have a requirement to zero upper 32-bits of BPF
			
 
				+registers which makes BPF inefficient virtual machine for 32-bit
			
 
				+CPU architectures and 32-bit HW accelerators. Can true 32-bit registers
			
 
				+be added to BPF in the future?
			
 
				+
			
 
				+A: NO. The first thing to improve performance on 32-bit archs is to teach
			
 
				+LLVM to generate code that uses 32-bit subregisters. Then second step
			
 
				+is to teach verifier to mark operations where zero-ing upper bits
			
 
				+is unnecessary. Then JITs can take advantage of those markings and
			
 
				+drastically reduce size of generated code and improve performance.
			
 
				+
			
 
				+Q: Does BPF have a stable ABI?
			
 
				+------------------------------
			
 
				+A: YES. BPF instructions, arguments to BPF programs, set of helper
			
 
				+functions and their arguments, recognized return codes are all part
			
 
				+of ABI. However when tracing programs are using bpf_probe_read() helper
			
 
				+to walk kernel internal datastructures and compile with kernel
			
 
				+internal headers these accesses can and will break with newer
			
 
				+kernels. The union bpf_attr -> kern_version is checked at load time
			
 
				+to prevent accidentally loading kprobe-based bpf programs written
			
 
				+for a different kernel. Networking programs don't do kern_version check.
			
 
				+
			
 
				+Q: How much stack space a BPF program uses?
			
 
				+-------------------------------------------
			
 
				+A: Currently all program types are limited to 512 bytes of stack
			
 
				+space, but the verifier computes the actual amount of stack used
			
 
				+and both interpreter and most JITed code consume necessary amount.
			
 
				+
			
 
				+Q: Can BPF be offloaded to HW?
			
 
				+------------------------------
			
 
				+A: YES. BPF HW offload is supported by NFP driver.
			
 
				+
			
 
				+Q: Does classic BPF interpreter still exist?
			
 
				+--------------------------------------------
			
 
				+A: NO. Classic BPF programs are converted into extended BPF instructions.
			
 
				+
			
 
				+Q: Can BPF call arbitrary kernel functions?
			
 
				+-------------------------------------------
			
 
				+A: NO. BPF programs can only call a set of helper functions which
			
 
				+is defined for every program type.
			
 
				+
			
 
				+Q: Can BPF overwrite arbitrary kernel memory?
			
 
				+---------------------------------------------
			
 
				+A: NO.
			
 
				+
			
 
				+Tracing bpf programs can *read* arbitrary memory with bpf_probe_read()
			
 
				+and bpf_probe_read_str() helpers. Networking programs cannot read
			
 
				+arbitrary memory, since they don't have access to these helpers.
			
 
				+Programs can never read or write arbitrary memory directly.
			
 
				+
			
 
				+Q: Can BPF overwrite arbitrary user memory?
			
 
				+-------------------------------------------
			
 
				+A: Sort-of.
			
 
				+
			
 
				+Tracing BPF programs can overwrite the user memory
			
 
				+of the current task with bpf_probe_write_user(). Every time such
			
 
				+program is loaded the kernel will print warning message, so
			
 
				+this helper is only useful for experiments and prototypes.
			
 
				+Tracing BPF programs are root only.
			
 
				+
			
 
				+Q: bpf_trace_printk() helper warning
			
 
				+------------------------------------
			
 
				+Q: When bpf_trace_printk() helper is used the kernel prints nasty
			
 
				+warning message. Why is that?
			
 
				+
			
 
				+A: This is done to nudge program authors into better interfaces when
			
 
				+programs need to pass data to user space. Like bpf_perf_event_output()
			
 
				+can be used to efficiently stream data via perf ring buffer.
			
 
				+BPF maps can be used for asynchronous data sharing between kernel
			
 
				+and user space. bpf_trace_printk() should only be used for debugging.
			
 
				+
			
 
				+Q: New functionality via kernel modules?
			
 
				+----------------------------------------
			
 
				+Q: Can BPF functionality such as new program or map types, new
			
 
				+helpers, etc be added out of kernel module code?
			
 
				+
			
 
				+A: NO.
			
--- a/Documentation/bpf/bpf_design_QA.txt
+++ b/Documentation/bpf/bpf_design_QA.txt
@@ -1,156 +0,0 @@
 
				-BPF extensibility and applicability to networking, tracing, security
			
 
				-in the linux kernel and several user space implementations of BPF
			
 
				-virtual machine led to a number of misunderstanding on what BPF actually is.
			
 
				-This short QA is an attempt to address that and outline a direction
			
 
				-of where BPF is heading long term.
			
 
				-
			
 
				-Q: Is BPF a generic instruction set similar to x64 and arm64?
			
 
				-A: NO.
			
 
				-
			
 
				-Q: Is BPF a generic virtual machine ?
			
 
				-A: NO.
			
 
				-
			
 
				-BPF is generic instruction set _with_ C calling convention.
			
 
				-
			
 
				-Q: Why C calling convention was chosen?
			
 
				-A: Because BPF programs are designed to run in the linux kernel
			
 
				-   which is written in C, hence BPF defines instruction set compatible
			
 
				-   with two most used architectures x64 and arm64 (and takes into
			
 
				-   consideration important quirks of other architectures) and
			
 
				-   defines calling convention that is compatible with C calling
			
 
				-   convention of the linux kernel on those architectures.
			
 
				-
			
 
				-Q: can multiple return values be supported in the future?
			
 
				-A: NO. BPF allows only register R0 to be used as return value.
			
 
				-
			
 
				-Q: can more than 5 function arguments be supported in the future?
			
 
				-A: NO. BPF calling convention only allows registers R1-R5 to be used
			
 
				-   as arguments. BPF is not a standalone instruction set.
			
 
				-   (unlike x64 ISA that allows msft, cdecl and other conventions)
			
 
				-
			
 
				-Q: can BPF programs access instruction pointer or return address?
			
 
				-A: NO.
			
 
				-
			
 
				-Q: can BPF programs access stack pointer ?
			
 
				-A: NO. Only frame pointer (register R10) is accessible.
			
 
				-   From compiler point of view it's necessary to have stack pointer.
			
 
				-   For example LLVM defines register R11 as stack pointer in its
			
 
				-   BPF backend, but it makes sure that generated code never uses it.
			
 
				-
			
 
				-Q: Does C-calling convention diminishes possible use cases?
			
 
				-A: YES. BPF design forces addition of major functionality in the form
			
 
				-   of kernel helper functions and kernel objects like BPF maps with
			
 
				-   seamless interoperability between them. It lets kernel call into
			
 
				-   BPF programs and programs call kernel helpers with zero overhead.
			
 
				-   As all of them were native C code. That is particularly the case
			
 
				-   for JITed BPF programs that are indistinguishable from
			
 
				-   native kernel C code.
			
 
				-
			
 
				-Q: Does it mean that 'innovative' extensions to BPF code are disallowed?
			
 
				-A: Soft yes. At least for now until BPF core has support for
			
 
				-   bpf-to-bpf calls, indirect calls, loops, global variables,
			
 
				-   jump tables, read only sections and all other normal constructs
			
 
				-   that C code can produce.
			
 
				-
			
 
				-Q: Can loops be supported in a safe way?
			
 
				-A: It's not clear yet. BPF developers are trying to find a way to
			
 
				-   support bounded loops where the verifier can guarantee that
			
 
				-   the program terminates in less than 4096 instructions.
			
 
				-
			
 
				-Q: How come LD_ABS and LD_IND instruction are present in BPF whereas
			
 
				-   C code cannot express them and has to use builtin intrinsics?
			
 
				-A: This is artifact of compatibility with classic BPF. Modern
			
 
				-   networking code in BPF performs better without them.
			
 
				-   See 'direct packet access'.
			
 
				-
			
 
				-Q: It seems not all BPF instructions are one-to-one to native CPU.
			
 
				-   For example why BPF_JNE and other compare and jumps are not cpu-like?
			
 
				-A: This was necessary to avoid introducing flags into ISA which are
			
 
				-   impossible to make generic and efficient across CPU architectures.
			
 
				-
			
 
				-Q: why BPF_DIV instruction doesn't map to x64 div?
			
 
				-A: Because if we picked one-to-one relationship to x64 it would have made
			
 
				-   it more complicated to support on arm64 and other archs. Also it
			
 
				-   needs div-by-zero runtime check.
			
 
				-
			
 
				-Q: why there is no BPF_SDIV for signed divide operation?
			
 
				-A: Because it would be rarely used. llvm errors in such case and
			
 
				-   prints a suggestion to use unsigned divide instead
			
 
				-
			
 
				-Q: Why BPF has implicit prologue and epilogue?
			
 
				-A: Because architectures like sparc have register windows and in general
			
 
				-   there are enough subtle differences between architectures, so naive
			
 
				-   store return address into stack won't work. Another reason is BPF has
			
 
				-   to be safe from division by zero (and legacy exception path
			
 
				-   of LD_ABS insn). Those instructions need to invoke epilogue and
			
 
				-   return implicitly.
			
 
				-
			
 
				-Q: Why BPF_JLT and BPF_JLE instructions were not introduced in the beginning?
			
 
				-A: Because classic BPF didn't have them and BPF authors felt that compiler
			
 
				-   workaround would be acceptable. Turned out that programs lose performance
			
 
				-   due to lack of these compare instructions and they were added.
			
 
				-   These two instructions is a perfect example what kind of new BPF
			
 
				-   instructions are acceptable and can be added in the future.
			
 
				-   These two already had equivalent instructions in native CPUs.
			
 
				-   New instructions that don't have one-to-one mapping to HW instructions
			
 
				-   will not be accepted.
			
 
				-
			
 
				-Q: BPF 32-bit subregisters have a requirement to zero upper 32-bits of BPF
			
 
				-   registers which makes BPF inefficient virtual machine for 32-bit
			
 
				-   CPU architectures and 32-bit HW accelerators. Can true 32-bit registers
			
 
				-   be added to BPF in the future?
			
 
				-A: NO. The first thing to improve performance on 32-bit archs is to teach
			
 
				-   LLVM to generate code that uses 32-bit subregisters. Then second step
			
 
				-   is to teach verifier to mark operations where zero-ing upper bits
			
 
				-   is unnecessary. Then JITs can take advantage of those markings and
			
 
				-   drastically reduce size of generated code and improve performance.
			
 
				-
			
 
				-Q: Does BPF have a stable ABI?
			
 
				-A: YES. BPF instructions, arguments to BPF programs, set of helper
			
 
				-   functions and their arguments, recognized return codes are all part
			
 
				-   of ABI. However when tracing programs are using bpf_probe_read() helper
			
 
				-   to walk kernel internal datastructures and compile with kernel
			
 
				-   internal headers these accesses can and will break with newer
			
 
				-   kernels. The union bpf_attr -> kern_version is checked at load time
			
 
				-   to prevent accidentally loading kprobe-based bpf programs written
			
 
				-   for a different kernel. Networking programs don't do kern_version check.
			
 
				-
			
 
				-Q: How much stack space a BPF program uses?
			
 
				-A: Currently all program types are limited to 512 bytes of stack
			
 
				-   space, but the verifier computes the actual amount of stack used
			
 
				-   and both interpreter and most JITed code consume necessary amount.
			
 
				-
			
 
				-Q: Can BPF be offloaded to HW?
			
 
				-A: YES. BPF HW offload is supported by NFP driver.
			
 
				-
			
 
				-Q: Does classic BPF interpreter still exist?
			
 
				-A: NO. Classic BPF programs are converted into extend BPF instructions.
			
 
				-
			
 
				-Q: Can BPF call arbitrary kernel functions?
			
 
				-A: NO. BPF programs can only call a set of helper functions which
			
 
				-   is defined for every program type.
			
 
				-
			
 
				-Q: Can BPF overwrite arbitrary kernel memory?
			
 
				-A: NO. Tracing bpf programs can _read_ arbitrary memory with bpf_probe_read()
			
 
				-   and bpf_probe_read_str() helpers. Networking programs cannot read
			
 
				-   arbitrary memory, since they don't have access to these helpers.
			
 
				-   Programs can never read or write arbitrary memory directly.
			
 
				-
			
 
				-Q: Can BPF overwrite arbitrary user memory?
			
 
				-A: Sort-of. Tracing BPF programs can overwrite the user memory
			
 
				-   of the current task with bpf_probe_write_user(). Every time such
			
 
				-   program is loaded the kernel will print warning message, so
			
 
				-   this helper is only useful for experiments and prototypes.
			
 
				-   Tracing BPF programs are root only.
			
 
				-
			
 
				-Q: When bpf_trace_printk() helper is used the kernel prints nasty
			
 
				-   warning message. Why is that?
			
 
				-A: This is done to nudge program authors into better interfaces when
			
 
				-   programs need to pass data to user space. Like bpf_perf_event_output()
			
 
				-   can be used to efficiently stream data via perf ring buffer.
			
 
				-   BPF maps can be used for asynchronous data sharing between kernel
			
 
				-   and user space. bpf_trace_printk() should only be used for debugging.
			
 
				-
			
 
				-Q: Can BPF functionality such as new program or map types, new
			
 
				-   helpers, etc be added out of kernel module code?
			
 
				-A: NO.
			
--- a/Documentation/bpf/bpf_devel_QA.rst
+++ b/Documentation/bpf/bpf_devel_QA.rst
@@ -0,0 +1,640 @@
 
				+=================================
			
 
				+HOWTO interact with BPF subsystem
			
 
				+=================================
			
 
				+
			
 
				+This document provides information for the BPF subsystem about various
			
 
				+workflows related to reporting bugs, submitting patches, and queueing
			
 
				+patches for stable kernels.
			
 
				+
			
 
				+For general information about submitting patches, please refer to
			
 
				+`Documentation/process/`_. This document only describes additional specifics
			
 
				+related to BPF.
			
 
				+
			
 
				+.. contents::
			
 
				+    :local:
			
 
				+    :depth: 2
			
 
				+
			
 
				+Reporting bugs
			
 
				+==============
			
 
				+
			
 
				+Q: How do I report bugs for BPF kernel code?
			
 
				+--------------------------------------------
			
 
				+A: Since all BPF kernel development as well as bpftool and iproute2 BPF
			
 
				+loader development happens through the netdev kernel mailing list,
			
 
				+please report any found issues around BPF to the following mailing
			
 
				+list:
			
 
				+
			
 
				+ netdev@vger.kernel.org
			
 
				+
			
 
				+This may also include issues related to XDP, BPF tracing, etc.
			
 
				+
			
 
				+Given netdev has a high volume of traffic, please also add the BPF
			
 
				+maintainers to Cc (from kernel MAINTAINERS_ file):
			
 
				+
			
 
				+* Alexei Starovoitov <ast@kernel.org>
			
 
				+* Daniel Borkmann <daniel@iogearbox.net>
			
 
				+
			
 
				+In case a buggy commit has already been identified, make sure to keep
			
 
				+the actual commit authors in Cc as well for the report. They can
			
 
				+typically be identified through the kernel's git tree.
			
 
				+
			
 
				+**Please do NOT report BPF issues to bugzilla.kernel.org since it
			
 
				+is a guarantee that the reported issue will be overlooked.**
			
 
				+
			
 
				+Submitting patches
			
 
				+==================
			
 
				+
			
 
				+Q: To which mailing list do I need to submit my BPF patches?
			
 
				+------------------------------------------------------------
			
 
				+A: Please submit your BPF patches to the netdev kernel mailing list:
			
 
				+
			
 
				+ netdev@vger.kernel.org
			
 
				+
			
 
				+Historically, BPF came out of networking and has always been maintained
			
 
				+by the kernel networking community. Although these days BPF touches
			
 
				+many other subsystems as well, the patches are still routed mainly
			
 
				+through the networking community.
			
 
				+
			
 
				+In case your patch has changes in various different subsystems (e.g.
			
 
				+tracing, security, etc), make sure to Cc the related kernel mailing
			
 
				+lists and maintainers from there as well, so they are able to review
			
 
				+the changes and provide their Acked-by's to the patches.
			
 
				+
			
 
				+Q: Where can I find patches currently under discussion for BPF subsystem?
			
 
				+-------------------------------------------------------------------------
			
 
				+A: All patches that are Cc'ed to netdev are queued for review under netdev
			
 
				+patchwork project:
			
 
				+
			
 
				+  http://patchwork.ozlabs.org/project/netdev/list/
			
 
				+
			
 
				+Those patches which target BPF are assigned to a 'bpf' delegate for
			
 
				+further processing from BPF maintainers. The current queue with
			
 
				+patches under review can be found at:
			
 
				+
			
 
				+  https://patchwork.ozlabs.org/project/netdev/list/?delegate=77147
			
 
				+
			
 
				+Once the patches have been reviewed by the BPF community as a whole
			
 
				+and approved by the BPF maintainers, their status in patchwork will be
			
 
				+changed to 'Accepted' and the submitter will be notified by mail. This
			
 
				+means that the patches look good from a BPF perspective and have been
			
 
				+applied to one of the two BPF kernel trees.
			
 
				+
			
 
				+In case feedback from the community requires a respin of the patches,
			
 
				+their status in patchwork will be set to 'Changes Requested', and purged
			
 
				+from the current review queue. Likewise for cases where patches would
			
 
				+get rejected or are not applicable to the BPF trees (but assigned to
			
 
				+the 'bpf' delegate).
			
 
				+
			
 
				+Q: How do the changes make their way into Linux?
			
 
				+------------------------------------------------
			
 
				+A: There are two BPF kernel trees (git repositories). Once patches have
			
 
				+been accepted by the BPF maintainers, they will be applied to one
			
 
				+of the two BPF trees:
			
 
				+
			
 
				+ * https://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf.git/
			
 
				+ * https://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf-next.git/
			
 
				+
			
 
				+The bpf tree itself is for fixes only, whereas bpf-next is for features,
			
 
				+cleanups or other kinds of improvements ("next-like" content). This is
			
 
				+analogous to net and net-next trees for networking. Both bpf and
			
 
				+bpf-next will only have a master branch in order to simplify against
			
 
				+which branch patches should get rebased to.
			
 
				+
			
 
				+Accumulated BPF patches in the bpf tree will regularly get pulled
			
 
				+into the net kernel tree. Likewise, accumulated BPF patches accepted
			
 
				+into the bpf-next tree will make their way into net-next tree. net and
			
 
				+net-next are both run by David S. Miller. From there, they will go
			
 
				+into the kernel mainline tree run by Linus Torvalds. To read up on the
			
 
				+process of net and net-next being merged into the mainline tree, see
			
 
				+the `netdev FAQ`_ under:
			
 
				+
			
 
				+ `Documentation/networking/netdev-FAQ.txt`_
			
 
				+
			
 
				+Occasionally, to prevent merge conflicts, we might send pull requests
			
 
				+to other trees (e.g. tracing) with a small subset of the patches, but
			
 
				+net and net-next are always the main trees targeted for integration.
			
 
				+
			
 
				+The pull requests will contain a high-level summary of the accumulated
			
 
				+patches and can be searched on netdev kernel mailing list through the
			
 
				+following subject lines (``yyyy-mm-dd`` is the date of the pull
			
 
				+request)::
			
 
				+
			
 
				+  pull-request: bpf yyyy-mm-dd
			
 
				+  pull-request: bpf-next yyyy-mm-dd
			
 
				+
			
 
				+Q: How do I indicate which tree (bpf vs. bpf-next) my patch should be applied to?
			
 
				+---------------------------------------------------------------------------------
			
 
				+
			
 
				+A: The process is the very same as described in the `netdev FAQ`_, so
			
 
				+please read up on it. The subject line must indicate whether the
			
 
				+patch is a fix or rather "next-like" content in order to let the
			
 
				+maintainers know whether it is targeted at bpf or bpf-next.
			
 
				+
			
 
				+For fixes eventually landing in bpf -> net tree, the subject must
			
 
				+look like::
			
 
				+
			
 
				+  git format-patch --subject-prefix='PATCH bpf' start..finish
			
 
				+
			
 
				+For features/improvements/etc that should eventually land in
			
 
				+bpf-next -> net-next, the subject must look like::
			
 
				+
			
 
				+  git format-patch --subject-prefix='PATCH bpf-next' start..finish
			
 
				+
			
 
				+If unsure whether the patch or patch series should go into bpf
			
 
				+or net directly, or bpf-next or net-next directly, it is not a
			
 
				+problem either if the subject line says net or net-next as target.
			
 
				+It is eventually up to the maintainers to do the delegation of
			
 
				+the patches.
			
 
				+
			
 
				+If it is clear that patches should go into bpf or bpf-next tree,
			
 
				+please make sure to rebase the patches against those trees in
			
 
				+order to reduce potential conflicts.
			
 
				+
			
 
				+In case the patch or patch series has to be reworked and sent out
			
 
				+again in a second or later revision, it is also required to add a
			
 
				+version number (``v2``, ``v3``, ...) into the subject prefix::
			
 
				+
			
 
				+  git format-patch --subject-prefix='PATCH net-next v2' start..finish
			
 
				+
			
 
				+When changes have been requested to the patch series, always send the
			
 
				+whole patch series again with the feedback incorporated (never send
			
 
				+individual diffs on top of the old series).
			
 
				+
			
 
				+Q: What does it mean when a patch gets applied to bpf or bpf-next tree?
			
 
				+-----------------------------------------------------------------------
			
 
				+A: It means that the patch looks good for mainline inclusion from
			
 
				+a BPF point of view.
			
 
				+
			
 
				+Be aware that this is not a final verdict that the patch will
			
 
				+automatically get accepted into net or net-next trees eventually:
			
 
				+
			
 
				+On the netdev kernel mailing list reviews can come in at any point
			
 
				+in time. If discussions around a patch conclude that they cannot
			
 
				+get included as-is, we will either apply a follow-up fix or drop
			
 
				+them from the trees entirely. Therefore, we also reserve the right to rebase
			
 
				+the trees when deemed necessary. After all, the purpose of the tree
			
 
				+is to:
			
 
				+
			
 
				+i) accumulate and stage BPF patches for integration into trees
			
 
				+   like net and net-next, and
			
 
				+
			
 
				+ii) run extensive BPF test suite and
			
 
				+    workloads on the patches before they make their way any further.
			
 
				+
			
 
				+Once the BPF pull request was accepted by David S. Miller, then
			
 
				+the patches end up in net or net-next tree, respectively, and
			
 
				+make their way from there further into mainline. Again, see the
			
 
				+`netdev FAQ`_ for additional information e.g. on how often they are
			
 
				+merged to mainline.
			
 
				+
			
 
				+Q: How long do I need to wait for feedback on my BPF patches?
			
 
				+-------------------------------------------------------------
			
 
				+A: We try to keep the latency low. The usual time to feedback will
			
 
				+be around 2 or 3 business days. It may vary depending on the
			
 
				+complexity of changes and current patch load.
			
 
				+
			
 
				+Q: How often do you send pull requests to major kernel trees like net or net-next?
			
 
				+----------------------------------------------------------------------------------
			
 
				+
			
 
				+A: Pull requests will be sent out rather often in order to not
			
 
				+accumulate too many patches in bpf or bpf-next.
			
 
				+
			
 
				+As a rule of thumb, expect pull requests for each tree regularly
			
 
				+at the end of the week. In some cases pull requests could additionally
			
 
				+come also in the middle of the week depending on the current patch
			
 
				+load or urgency.
			
 
				+
			
 
				+Q: Are patches applied to bpf-next when the merge window is open?
			
 
				+-----------------------------------------------------------------
			
 
				+A: For the time when the merge window is open, bpf-next will not be
			
 
				+processed. This is roughly analogous to net-next patch processing,
			
 
				+so feel free to read up on the `netdev FAQ`_ about further details.
			
 
				+
			
 
				+During those two weeks of merge window, we might ask you to resend
			
 
				+your patch series once bpf-next is open again. Once Linus released
			
 
				+a ``v*-rc1`` after the merge window, we continue processing of bpf-next.
			
 
				+
			
 
				+For non-subscribers to kernel mailing lists, there is also a status
			
 
				+page run by David S. Miller on net-next that provides guidance:
			
 
				+
			
 
				+  http://vger.kernel.org/~davem/net-next.html
			
 
				+
			
 
				+Q: Verifier changes and test cases
			
 
				+----------------------------------
			
 
				+Q: I made a BPF verifier change, do I need to add test cases for
			
 
				+BPF kernel selftests_?
			
 
				+
			
 
				+A: If the patch has changes to the behavior of the verifier, then yes,
			
 
				+it is absolutely necessary to add test cases to the BPF kernel
			
 
				+selftests_ suite. If they are not present and we think they are
			
 
				+needed, then we might ask for them before accepting any changes.
			
 
				+
			
 
				+In particular, test_verifier.c is tracking a high number of BPF test
			
 
				+cases, including a lot of corner cases that LLVM BPF back end may
			
 
				+generate out of the restricted C code. Thus, adding test cases is
			
 
				+absolutely crucial to make sure future changes do not accidentally
			
 
				+affect prior use-cases. Thus, treat those test cases as: verifier
			
 
				+behavior that is not tracked in test_verifier.c could potentially
			
 
				+be subject to change.
			
 
				+
			
 
				+Q: samples/bpf preference vs selftests?
			
 
				+---------------------------------------
			
 
				+Q: When should I add code to `samples/bpf/`_ and when to BPF kernel
			
 
				+selftests_ ?
			
 
				+
			
 
				+A: In general, we prefer additions to BPF kernel selftests_ rather than
			
 
				+`samples/bpf/`_. The rationale is very simple: kernel selftests are
			
 
				+regularly run by various bots to test for kernel regressions.
			
 
				+
			
 
				+The more test cases we add to BPF selftests, the better the coverage
			
 
				+and the less likely it is that those could accidentally break. It is
			
 
				+not that BPF kernel selftests cannot demo how a specific feature can
			
 
				+be used.
			
 
				+
			
 
				+That said, `samples/bpf/`_ may be a good place for people to get started,
			
 
				+so it might be advisable that simple demos of features could go into
			
 
				+`samples/bpf/`_, but advanced functional and corner-case testing rather
			
 
				+into kernel selftests.
			
 
				+
			
 
				+If your sample looks like a test case, then go for BPF kernel selftests
			
 
				+instead!
			
 
				+
			
 
				+Q: When should I add code to the bpftool?
			
 
				+-----------------------------------------
			
 
				+A: The main purpose of bpftool (under tools/bpf/bpftool/) is to provide
			
 
				+a central user space tool for debugging and introspection of BPF programs
			
 
				+and maps that are active in the kernel. If UAPI changes related to BPF
			
 
				+enable dumping of additional information about programs or maps, then
			
 
				+bpftool should be extended as well to support dumping them.
			
 
				+
			
 
				+Q: When should I add code to iproute2's BPF loader?
			
 
				+---------------------------------------------------
			
 
				+A: For UAPI changes related to the XDP or tc layer (e.g. ``cls_bpf``),
			
 
				+the convention is that those control-path related changes are added to
			
 
				+iproute2's BPF loader as well from user space side. This is not only
			
 
				+useful to have UAPI changes properly designed to be usable, but also
			
 
				+to make those changes available to a wider user base of major
			
 
				+downstream distributions.
			
 
				+
			
 
				+Q: Do you accept patches as well for iproute2's BPF loader?
			
 
				+-----------------------------------------------------------
			
 
				+A: Patches for the iproute2's BPF loader have to be sent to:
			
 
				+
			
 
				+  netdev@vger.kernel.org
			
 
				+
			
 
				+While those patches are not processed by the BPF kernel maintainers,
			
 
				+please keep them in Cc as well, so they can be reviewed.
			
 
				+
			
 
				+The official git repository for iproute2 is run by Stephen Hemminger
			
 
				+and can be found at:
			
 
				+
			
 
				+  https://git.kernel.org/pub/scm/linux/kernel/git/shemminger/iproute2.git/
			
 
				+
			
 
				+The patches need to have a subject prefix of '``[PATCH iproute2
			
 
				+master]``' or '``[PATCH iproute2 net-next]``'. '``master``' or
			
 
				+'``net-next``' describes the target branch where the patch should be
			
 
				+applied to. Meaning, if kernel changes went into the net-next kernel
			
 
				+tree, then the related iproute2 changes need to go into the iproute2
			
 
				+net-next branch, otherwise they can be targeted at master branch. The
			
 
				+iproute2 net-next branch will get merged into the master branch after
			
 
				+the current iproute2 version from master has been released.
			
 
				+
			
 
				+Like BPF, the patches end up in patchwork under the netdev project and
			
 
				+are delegated to 'shemminger' for further processing:
			
 
				+
			
 
				+  http://patchwork.ozlabs.org/project/netdev/list/?delegate=389
			
 
				+
			
 
				+Q: What is the minimum requirement before I submit my BPF patches?
			
 
				+------------------------------------------------------------------
			
 
				+A: When submitting patches, always take the time and properly test your
			
 
				+patches *prior* to submission. Never rush them! If maintainers find
			
 
				+that your patches have not been properly tested, it is a good way to
			
 
				+get them grumpy. Testing patch submissions is a hard requirement!
			
 
				+
			
 
				+Note, fixes that go to bpf tree *must* have a ``Fixes:`` tag included.
			
 
				+The same applies to fixes that target bpf-next, where the affected
			
 
				+commit is in net-next (or in some cases bpf-next). The ``Fixes:`` tag is
			
 
				+crucial in order to identify follow-up commits and tremendously helps
			
 
				+for people having to do backporting, so it is a must have!
			
 
				+
			
 
				+We also don't accept patches with an empty commit message. Take your
			
 
				+time and properly write up a high quality commit message, it is
			
 
				+essential!
			
 
				+
			
 
				+Think about it this way: other developers looking at your code a month
			
 
				+from now need to understand *why* a certain change has been done that
			
 
				+way, and whether there have been flaws in the analysis or assumptions
			
 
				+that the original author made. Thus providing a proper rationale and
			
 
				+describing the use-case for the changes is a must.
			
 
				+
			
 
				+Patch submissions with >1 patch must have a cover letter which includes
			
 
				+a high level description of the series. This high level summary will
			
 
				+then be placed into the merge commit by the BPF maintainers such that
			
 
				+it is also accessible from the git log for future reference.
			
 
				+
			
 
				+Q: Features changing BPF JIT and/or LLVM
			
 
				+----------------------------------------
			
 
				+Q: What do I need to consider when adding a new instruction or feature
			
 
				+that would require BPF JIT and/or LLVM integration as well?
			
 
				+
			
 
				+A: We try hard to keep all BPF JITs up to date such that the same user
			
 
				+experience can be guaranteed when running BPF programs on different
			
 
				+architectures without having the program punt to the less efficient
			
 
				+interpreter in case the in-kernel BPF JIT is enabled.
			
 
				+
			
 
				+If you are unable to implement or test the required JIT changes for
			
 
				+certain architectures, please work together with the related BPF JIT
			
 
				+developers in order to get the feature implemented in a timely manner.
			
 
				+Please refer to the git log (``arch/*/net/``) to locate the necessary
			
 
				+people for helping out.
			
 
				+
			
 
				+Also always make sure to add BPF test cases (e.g. test_bpf.c and
			
 
				+test_verifier.c) for new instructions, so that they can receive
			
 
				+broad test coverage and help run-time testing the various BPF JITs.
			
 
				+
			
 
				+In case of new BPF instructions, once the changes have been accepted
			
 
				+into the Linux kernel, please implement support into LLVM's BPF back
			
 
				+end. See LLVM_ section below for further information.
			
 
				+
			
 
				+Stable submission
			
 
				+=================
			
 
				+
			
 
				+Q: I need a specific BPF commit in stable kernels. What should I do?
			
 
				+--------------------------------------------------------------------
			
 
				+A: In case you need a specific fix in stable kernels, first check whether
			
 
				+the commit has already been applied in the related ``linux-*.y`` branches:
			
 
				+
			
 
				+  https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux-stable.git/
			
 
				+
			
 
				+If not the case, then drop an email to the BPF maintainers with the
			
 
				+netdev kernel mailing list in Cc and ask for the fix to be queued up:
			
 
				+
			
 
				+  netdev@vger.kernel.org
			
 
				+
			
 
				+The process in general is the same as on netdev itself, see also the
			
 
				+`netdev FAQ`_ document.
			
 
				+
			
 
				+Q: Do you also backport to kernels not currently maintained as stable?
			
 
				+----------------------------------------------------------------------
			
 
				+A: No. If you need a specific BPF commit in kernels that are currently not
			
 
				+maintained by the stable maintainers, then you are on your own.
			
 
				+
			
 
				+The current stable and longterm stable kernels are all listed here:
			
 
				+
			
 
				+  https://www.kernel.org/
			
 
				+
			
 
				+Q: The BPF patch I am about to submit needs to go to stable as well
			
 
				+-------------------------------------------------------------------
			
 
				+What should I do?
			
 
				+
			
 
				+A: The same rules apply as with netdev patch submissions in general, see
			
 
				+`netdev FAQ`_ under:
			
 
				+
			
 
				+  `Documentation/networking/netdev-FAQ.txt`_
			
 
				+
			
 
				+Never add "``Cc: stable@vger.kernel.org``" to the patch description, but
			
 
				+ask the BPF maintainers to queue the patches instead. This can be done
			
 
				+with a note, for example, under the ``---`` part of the patch which does
			
 
				+not go into the git log. Alternatively, this can be done as a simple
			
 
				+request by mail instead.
			
 
				+
			
 
				+Q: Queue stable patches
			
 
				+-----------------------
			
 
				+Q: Where do I find currently queued BPF patches that will be submitted
			
 
				+to stable?
			
 
				+
			
 
				+A: Once patches that fix critical bugs got applied into the bpf tree, they
			
 
				+are queued up for stable submission under:
			
 
				+
			
 
				+  http://patchwork.ozlabs.org/bundle/bpf/stable/?state=*
			
 
				+
			
 
				+They will be on hold there at minimum until the related commit made its
			
 
				+way into the mainline kernel tree.
			
 
				+
			
 
				+After having been under broader exposure, the queued patches will be
			
 
				+submitted by the BPF maintainers to the stable maintainers.
			
 
				+
			
 
				+Testing patches
			
 
				+===============
			
 
				+
			
 
				+Q: How to run BPF selftests
			
 
				+---------------------------
			
 
				+A: After you have booted into the newly compiled kernel, navigate to
			
 
				+the BPF selftests_ suite in order to test BPF functionality (current
			
 
				+working directory points to the root of the cloned git tree)::
			
 
				+
			
 
				+  $ cd tools/testing/selftests/bpf/
			
 
				+  $ make
			
 
				+
			
 
				+To run the verifier tests::
			
 
				+
			
 
				+  $ sudo ./test_verifier
			
 
				+
			
 
				+The verifier tests print out all the current checks being
			
 
				+performed. The summary at the end of running all tests will dump
			
 
				+information of test successes and failures::
			
 
				+
			
 
				+  Summary: 418 PASSED, 0 FAILED
			
 
				+
			
 
				+In order to run through all BPF selftests, the following command is
			
 
				+needed::
			
 
				+
			
 
				+  $ sudo make run_tests
			
 
				+
			
 
				+See the kernel's selftest `Documentation/dev-tools/kselftest.rst`_
			
 
				+document for further documentation.
			
 
				+
			
 
				+Q: Which BPF kernel selftests version should I run my kernel against?
			
 
				+---------------------------------------------------------------------
			
 
				+A: If you run a kernel ``xyz``, then always run the BPF kernel selftests
			
 
				+from that kernel ``xyz`` as well. Do not expect that the BPF selftest
			
 
				+from the latest mainline tree will pass all the time.
			
 
				+
			
 
				+In particular, test_bpf.c and test_verifier.c have a large number of
			
 
				+test cases and are constantly updated with new BPF test sequences, or
			
 
				+existing ones are adapted to verifier changes e.g. due to verifier
			
 
				+becoming smarter and being able to better track certain things.
			
 
				+
			
 
				+LLVM
			
 
				+====
			
 
				+
			
 
				+Q: Where do I find LLVM with BPF support?
			
 
				+-----------------------------------------
			
 
				+A: The BPF back end for LLVM is upstream in LLVM since version 3.7.1.
			
 
				+
			
 
				+All major distributions these days ship LLVM with BPF back end enabled,
			
 
				+so for the majority of use-cases it is not required to compile LLVM by
			
 
				+hand anymore, just install the distribution provided package.
			
 
				+
			
 
				+LLVM's static compiler lists the supported targets through
			
 
				+``llc --version``, make sure BPF targets are listed. Example::
			
 
				+
			
 
				+     $ llc --version
			
 
				+     LLVM (http://llvm.org/):
			
 
				+       LLVM version 6.0.0svn
			
 
				+       Optimized build.
			
 
				+       Default target: x86_64-unknown-linux-gnu
			
 
				+       Host CPU: skylake
			
 
				+
			
 
				+       Registered Targets:
			
 
				+         bpf    - BPF (host endian)
			
 
				+         bpfeb  - BPF (big endian)
			
 
				+         bpfel  - BPF (little endian)
			
 
				+         x86    - 32-bit X86: Pentium-Pro and above
			
 
				+         x86-64 - 64-bit X86: EM64T and AMD64
			
 
				+
			
 
				+For developers in order to utilize the latest features added to LLVM's
			
 
				+BPF back end, it is advisable to run the latest LLVM releases. Support
			
 
				+for new BPF kernel features such as additions to the BPF instruction
			
 
				+set are often developed together.
			
 
				+
			
 
				+All LLVM releases can be found at: http://releases.llvm.org/
			
 
				+
			
 
				+Q: Got it, so how do I build LLVM manually anyway?
			
 
				+--------------------------------------------------
			
 
				+A: You need cmake and gcc-c++ as build requisites for LLVM. Once you have
			
 
				+that set up, proceed with building the latest LLVM and clang version
			
 
				+from the git repositories::
			
 
				+
			
 
				+     $ git clone http://llvm.org/git/llvm.git
			
 
				+     $ cd llvm/tools
			
 
				+     $ git clone --depth 1 http://llvm.org/git/clang.git
			
 
				+     $ cd ..; mkdir build; cd build
			
 
				+     $ cmake .. -DLLVM_TARGETS_TO_BUILD="BPF;X86" \
			
 
				+                -DBUILD_SHARED_LIBS=OFF           \
			
 
				+                -DCMAKE_BUILD_TYPE=Release        \
			
 
				+                -DLLVM_BUILD_RUNTIME=OFF
			
 
				+     $ make -j $(getconf _NPROCESSORS_ONLN)
			
 
				+
			
 
				+The built binaries can then be found in the build/bin/ directory, where
			
 
				+you can point the PATH variable to.
			
 
				+
			
 
				+Q: Reporting LLVM BPF issues
			
 
				+----------------------------
			
 
				+Q: Should I notify BPF kernel maintainers about issues in LLVM's BPF code
			
 
				+generation back end or about LLVM generated code that the verifier
			
 
				+refuses to accept?
			
 
				+
			
 
				+A: Yes, please do!
			
 
				+
			
 
				+LLVM's BPF back end is a key piece of the whole BPF
			
 
				+infrastructure and it ties deeply into verification of programs from the
			
 
				+kernel side. Therefore, any issues on either side need to be investigated
			
 
				+and fixed whenever necessary.
			
 
				+
			
 
				+Therefore, please make sure to bring them up at netdev kernel mailing
			
 
				+list and Cc BPF maintainers for LLVM and kernel bits:
			
 
				+
			
 
				+* Yonghong Song <yhs@fb.com>
			
 
				+* Alexei Starovoitov <ast@kernel.org>
			
 
				+* Daniel Borkmann <daniel@iogearbox.net>
			
 
				+
			
 
				+LLVM also has an issue tracker where BPF related bugs can be found:
			
 
				+
			
 
				+  https://bugs.llvm.org/buglist.cgi?quicksearch=bpf
			
 
				+
			
 
				+However, it is better to reach out through mailing lists with
			
 
				+maintainers in Cc.
			
 
				+
			
 
				+Q: New BPF instruction for kernel and LLVM
			
 
				+------------------------------------------
			
 
				+Q: I have added a new BPF instruction to the kernel, how can I integrate
			
 
				+it into LLVM?
			
 
				+
			
 
				+A: LLVM has a ``-mcpu`` selector for the BPF back end in order to allow
			
 
				+the selection of BPF instruction set extensions. By default the
			
 
				+``generic`` processor target is used, which is the base instruction set
			
 
				+(v1) of BPF.
			
 
				+
			
 
				+LLVM has an option to select ``-mcpu=probe`` where it will probe the host
			
 
				+kernel for supported BPF instruction set extensions and selects the
			
 
				+optimal set automatically.
			
 
				+
			
 
				+For cross-compilation, a specific version can be selected manually as well::
			
 
				+
			
 
				+     $ llc -march bpf -mcpu=help
			
 
				+     Available CPUs for this target:
			
 
				+
			
 
				+       generic - Select the generic processor.
			
 
				+       probe   - Select the probe processor.
			
 
				+       v1      - Select the v1 processor.
			
 
				+       v2      - Select the v2 processor.
			
 
				+     [...]
			
 
				+
			
 
				+Newly added BPF instructions to the Linux kernel need to follow the same
			
 
				+scheme, bump the instruction set version and implement probing for the
			
 
				+extensions such that ``-mcpu=probe`` users can benefit from the
			
 
				+optimization transparently when upgrading their kernels.
			
 
				+
			
 
				+If you are unable to implement support for the newly added BPF instruction
			
 
				+please reach out to BPF developers for help.
			
 
				+
			
 
				+By the way, the BPF kernel selftests run with ``-mcpu=probe`` for better
			
 
				+test coverage.
			
 
				+
			
 
				+Q: clang flag for target bpf?
			
 
				+-----------------------------
			
 
				+Q: In some cases clang flag ``-target bpf`` is used but in other cases the
			
 
				+default clang target, which matches the underlying architecture, is used.
			
 
				+What is the difference and when should I use which?
			
 
				+
			
 
				+A: Although LLVM IR generation and optimization try to stay architecture
			
 
				+independent, ``-target <arch>`` still has some impact on generated code:
			
 
				+
			
 
				+- BPF program may recursively include header file(s) with file scope
			
 
				+  inline assembly codes. The default target can handle this well,
			
 
				+  while ``bpf`` target may fail if bpf backend assembler does not
			
 
				+  understand these assembly codes, which is true in most cases.
			
 
				+
			
 
				+- When compiled without ``-g``, additional elf sections, e.g.,
			
 
				+  .eh_frame and .rela.eh_frame, may be present in the object file
			
 
				+  with default target, but not with ``bpf`` target.
			
 
				+
			
 
				+- The default target may turn a C switch statement into a switch table
			
 
				+  lookup and jump operation. Since the switch table is placed
			
 
				+  in the global readonly section, the bpf program will fail to load.
			
 
				+  The bpf target does not support switch table optimization.
			
 
				+  The clang option ``-fno-jump-tables`` can be used to disable
			
 
				+  switch table generation.
			
 
				+
			
 
				+- For clang ``-target bpf``, it is guaranteed that pointer or long /
			
 
				+  unsigned long types will always have a width of 64 bit, no matter
			
 
				+  whether underlying clang binary or default target (or kernel) is
			
 
				+  32 bit. However, when native clang target is used, then it will
			
 
				+  compile these types based on the underlying architecture's conventions,
			
 
				+  meaning in case of 32 bit architecture, pointer or long / unsigned
			
 
				+  long types e.g. in BPF context structure will have width of 32 bit
			
 
				+  while the BPF LLVM back end still operates in 64 bit. The native
			
 
				+  target is mostly needed in tracing for the case of walking ``pt_regs``
			
 
				+  or other kernel structures where CPU's register width matters.
			
 
				+  Otherwise, ``clang -target bpf`` is generally recommended.
			
 
				+
			
 
				+You should use default target when:
			
 
				+
			
 
				+- Your program includes a header file, e.g., ptrace.h, which eventually
			
 
				+  pulls in some header files containing file scope host assembly codes.
			
 
				+
			
 
				+- You can add ``-fno-jump-tables`` to work around the switch table issue.
			
 
				+
			
 
				+Otherwise, you can use ``bpf`` target. Additionally, you *must* use bpf target
			
 
				+when:
			
 
				+
			
 
				+- Your program uses data structures with pointer or long / unsigned long
			
 
				+  types that interface with BPF helpers or context data structures. Access
			
 
				+  into these structures is verified by the BPF verifier and may result
			
 
				+  in verification failures if the native architecture is not aligned with
			
 
				+  the BPF architecture, e.g. 64-bit. An example of this is
			
 
				+  BPF_PROG_TYPE_SK_MSG, which requires ``-target bpf``.
			
 
				+
			
 
				+
			
 
				+.. Links
			
 
				+.. _Documentation/process/: https://www.kernel.org/doc/html/latest/process/
			
 
				+.. _MAINTAINERS: ../../MAINTAINERS
			
 
				+.. _Documentation/networking/netdev-FAQ.txt: ../networking/netdev-FAQ.txt
			
 
				+.. _netdev FAQ: ../networking/netdev-FAQ.txt
			
 
				+.. _samples/bpf/: ../../samples/bpf/
			
 
				+.. _selftests: ../../tools/testing/selftests/bpf/
			
 
				+.. _Documentation/dev-tools/kselftest.rst:
			
 
				+   https://www.kernel.org/doc/html/latest/dev-tools/kselftest.html
			
 
				+
			
 
				+Happy BPF hacking!
			
--- a/Documentation/bpf/bpf_devel_QA.txt
+++ b/Documentation/bpf/bpf_devel_QA.txt
@@ -1,570 +0,0 @@
 
				-This document provides information for the BPF subsystem about various
			
 
				-workflows related to reporting bugs, submitting patches, and queueing
			
 
				-patches for stable kernels.
			
 
				-
			
 
				-For general information about submitting patches, please refer to
			
 
				-Documentation/process/. This document only describes additional specifics
			
 
				-related to BPF.
			
 
				-
			
 
				-Reporting bugs:
			
 
				----------------
			
 
				-
			
 
				-Q: How do I report bugs for BPF kernel code?
			
 
				-
			
 
				-A: Since all BPF kernel development as well as bpftool and iproute2 BPF
			
 
				-   loader development happens through the netdev kernel mailing list,
			
 
				-   please report any found issues around BPF to the following mailing
			
 
				-   list:
			
 
				-
			
 
				-     netdev@vger.kernel.org
			
 
				-
			
 
				-   This may also include issues related to XDP, BPF tracing, etc.
			
 
				-
			
 
				-   Given netdev has a high volume of traffic, please also add the BPF
			
 
				-   maintainers to Cc (from kernel MAINTAINERS file):
			
 
				-
			
 
				-     Alexei Starovoitov <ast@kernel.org>
			
 
				-     Daniel Borkmann <daniel@iogearbox.net>
			
 
				-
			
 
				-   In case a buggy commit has already been identified, make sure to keep
			
 
				-   the actual commit authors in Cc as well for the report. They can
			
 
				-   typically be identified through the kernel's git tree.
			
 
				-
			
 
				-   Please do *not* report BPF issues to bugzilla.kernel.org since it
			
 
				-   is a guarantee that the reported issue will be overlooked.
			
 
				-
			
 
				-Submitting patches:
			
 
				--------------------
			
 
				-
			
 
				-Q: To which mailing list do I need to submit my BPF patches?
			
 
				-
			
 
				-A: Please submit your BPF patches to the netdev kernel mailing list:
			
 
				-
			
 
				-     netdev@vger.kernel.org
			
 
				-
			
 
				-   Historically, BPF came out of networking and has always been maintained
			
 
				-   by the kernel networking community. Although these days BPF touches
			
 
				-   many other subsystems as well, the patches are still routed mainly
			
 
				-   through the networking community.
			
 
				-
			
 
				-   In case your patch has changes in various different subsystems (e.g.
			
 
				-   tracing, security, etc), make sure to Cc the related kernel mailing
			
 
				-   lists and maintainers from there as well, so they are able to review
			
 
				-   the changes and provide their Acked-by's to the patches.
			
 
				-
			
 
				-Q: Where can I find patches currently under discussion for BPF subsystem?
			
 
				-
			
 
				-A: All patches that are Cc'ed to netdev are queued for review under netdev
			
 
				-   patchwork project:
			
 
				-
			
 
				-     http://patchwork.ozlabs.org/project/netdev/list/
			
 
				-
			
 
				-   Those patches which target BPF, are assigned to a 'bpf' delegate for
			
 
				-   further processing from BPF maintainers. The current queue with
			
 
				-   patches under review can be found at:
			
 
				-
			
 
				-     https://patchwork.ozlabs.org/project/netdev/list/?delegate=77147
			
 
				-
			
 
				-   Once the patches have been reviewed by the BPF community as a whole
			
 
				-   and approved by the BPF maintainers, their status in patchwork will be
			
 
				-   changed to 'Accepted' and the submitter will be notified by mail. This
			
 
				-   means that the patches look good from a BPF perspective and have been
			
 
				-   applied to one of the two BPF kernel trees.
			
 
				-
			
 
				-   In case feedback from the community requires a respin of the patches,
			
 
				-   their status in patchwork will be set to 'Changes Requested', and purged
			
 
				-   from the current review queue. Likewise for cases where patches would
			
 
				-   get rejected or are not applicable to the BPF trees (but assigned to
			
 
				-   the 'bpf' delegate).
			
 
				-
			
 
				-Q: How do the changes make their way into Linux?
			
 
				-
			
 
				-A: There are two BPF kernel trees (git repositories). Once patches have
			
 
				-   been accepted by the BPF maintainers, they will be applied to one
			
 
				-   of the two BPF trees:
			
 
				-
			
 
				-     https://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf.git/
			
 
				-     https://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf-next.git/
			
 
				-
			
 
				-   The bpf tree itself is for fixes only, whereas bpf-next for features,
			
 
				-   cleanups or other kind of improvements ("next-like" content). This is
			
 
				-   analogous to net and net-next trees for networking. Both bpf and
			
 
				-   bpf-next will only have a master branch in order to simplify against
			
 
				-   which branch patches should get rebased to.
			
 
				-
			
 
				-   Accumulated BPF patches in the bpf tree will regularly get pulled
			
 
				-   into the net kernel tree. Likewise, accumulated BPF patches accepted
			
 
				-   into the bpf-next tree will make their way into net-next tree. net and
			
 
				-   net-next are both run by David S. Miller. From there, they will go
			
 
				-   into the kernel mainline tree run by Linus Torvalds. To read up on the
			
 
				-   process of net and net-next being merged into the mainline tree, see
			
 
				-   the netdev FAQ under:
			
 
				-
			
 
				-     Documentation/networking/netdev-FAQ.txt
			
 
				-
			
 
				-   Occasionally, to prevent merge conflicts, we might send pull requests
			
 
				-   to other trees (e.g. tracing) with a small subset of the patches, but
			
 
				-   net and net-next are always the main trees targeted for integration.
			
 
				-
			
 
				-   The pull requests will contain a high-level summary of the accumulated
			
 
				-   patches and can be searched on netdev kernel mailing list through the
			
 
				-   following subject lines (yyyy-mm-dd is the date of the pull request):
			
 
				-
			
 
				-     pull-request: bpf yyyy-mm-dd
			
 
				-     pull-request: bpf-next yyyy-mm-dd
			
 
				-
			
 
				-Q: How do I indicate which tree (bpf vs. bpf-next) my patch should be
			
 
				-   applied to?
			
 
				-
			
 
				-A: The process is the very same as described in the netdev FAQ, so
			
 
				-   please read up on it. The subject line must indicate whether the
			
 
				-   patch is a fix or rather "next-like" content in order to let the
			
 
				-   maintainers know whether it is targeted at bpf or bpf-next.
			
 
				-
			
 
				-   For fixes eventually landing in bpf -> net tree, the subject must
			
 
				-   look like:
			
 
				-
			
 
				-     git format-patch --subject-prefix='PATCH bpf' start..finish
			
 
				-
			
 
				-   For features/improvements/etc that should eventually land in
			
 
				-   bpf-next -> net-next, the subject must look like:
			
 
				-
			
 
				-     git format-patch --subject-prefix='PATCH bpf-next' start..finish
			
 
				-
			
 
				-   If unsure whether the patch or patch series should go into bpf
			
 
				-   or net directly, or bpf-next or net-next directly, it is not a
			
 
				-   problem either if the subject line says net or net-next as target.
			
 
				-   It is eventually up to the maintainers to do the delegation of
			
 
				-   the patches.
			
 
				-
			
 
				-   If it is clear that patches should go into bpf or bpf-next tree,
			
 
				-   please make sure to rebase the patches against those trees in
			
 
				-   order to reduce potential conflicts.
			
 
				-
			
 
				-   In case the patch or patch series has to be reworked and sent out
			
 
				-   again in a second or later revision, it is also required to add a
			
 
				-   version number (v2, v3, ...) into the subject prefix:
			
 
				-
			
 
				-     git format-patch --subject-prefix='PATCH net-next v2' start..finish
			
 
				-
			
 
				-   When changes have been requested to the patch series, always send the
			
 
				-   whole patch series again with the feedback incorporated (never send
			
 
				-   individual diffs on top of the old series).
			
 
				-
			
 
				-Q: What does it mean when a patch gets applied to bpf or bpf-next tree?
			
 
				-
			
 
				-A: It means that the patch looks good for mainline inclusion from
			
 
				-   a BPF point of view.
			
 
				-
			
 
				-   Be aware that this is not a final verdict that the patch will
			
 
				-   automatically get accepted into net or net-next trees eventually:
			
 
				-
			
 
				-   On the netdev kernel mailing list reviews can come in at any point
			
 
				-   in time. If discussions around a patch conclude that they cannot
			
 
				-   get included as-is, we will either apply a follow-up fix or drop
			
 
				-   them from the trees entirely. Therefore, we also reserve the right to rebase
			
 
				-   the trees when deemed necessary. After all, the purpose of the tree
			
 
				-   is to i) accumulate and stage BPF patches for integration into trees
			
 
				-   like net and net-next, and ii) run extensive BPF test suite and
			
 
				-   workloads on the patches before they make their way any further.
			
 
				-
			
 
				-   Once the BPF pull request was accepted by David S. Miller, then
			
 
				-   the patches end up in net or net-next tree, respectively, and
			
 
				-   make their way from there further into mainline. Again, see the
			
 
				-   netdev FAQ for additional information e.g. on how often they are
			
 
				-   merged to mainline.
			
 
				-
			
 
				-Q: How long do I need to wait for feedback on my BPF patches?
			
 
				-
			
 
				-A: We try to keep the latency low. The usual time to feedback will
			
 
				-   be around 2 or 3 business days. It may vary depending on the
			
 
				-   complexity of changes and current patch load.
			
 
				-
			
 
				-Q: How often do you send pull requests to major kernel trees like
			
 
				-   net or net-next?
			
 
				-
			
 
				-A: Pull requests will be sent out rather often in order to not
			
 
				-   accumulate too many patches in bpf or bpf-next.
			
 
				-
			
 
				-   As a rule of thumb, expect pull requests for each tree regularly
			
 
				-   at the end of the week. In some cases pull requests could additionally
			
 
				-   come also in the middle of the week depending on the current patch
			
 
				-   load or urgency.
			
 
				-
			
 
				-Q: Are patches applied to bpf-next when the merge window is open?
			
 
				-
			
 
				-A: For the time when the merge window is open, bpf-next will not be
			
 
				-   processed. This is roughly analogous to net-next patch processing,
			
 
				-   so feel free to read up on the netdev FAQ about further details.
			
 
				-
			
 
				-   During those two weeks of merge window, we might ask you to resend
			
 
				-   your patch series once bpf-next is open again. Once Linus released
			
 
				-   a v*-rc1 after the merge window, we continue processing of bpf-next.
			
 
				-
			
 
				-   For non-subscribers to kernel mailing lists, there is also a status
			
 
				-   page run by David S. Miller on net-next that provides guidance:
			
 
				-
			
 
				-     http://vger.kernel.org/~davem/net-next.html
			
 
				-
			
 
				-Q: I made a BPF verifier change, do I need to add test cases for
			
 
				-   BPF kernel selftests?
			
 
				-
			
 
				-A: If the patch has changes to the behavior of the verifier, then yes,
			
 
				-   it is absolutely necessary to add test cases to the BPF kernel
			
 
				-   selftests suite. If they are not present and we think they are
			
 
				-   needed, then we might ask for them before accepting any changes.
			
 
				-
			
 
				-   In particular, test_verifier.c is tracking a high number of BPF test
			
 
				-   cases, including a lot of corner cases that LLVM BPF back end may
			
 
				-   generate out of the restricted C code. Thus, adding test cases is
			
 
				-   absolutely crucial to make sure future changes do not accidentally
			
 
				-   affect prior use-cases. Thus, treat those test cases as: verifier
			
 
				-   behavior that is not tracked in test_verifier.c could potentially
			
 
				-   be subject to change.
			
 
				-
			
 
				-Q: When should I add code to samples/bpf/ and when to BPF kernel
			
 
				-   selftests?
			
 
				-
			
 
				-A: In general, we prefer additions to BPF kernel selftests rather than
			
 
				-   samples/bpf/. The rationale is very simple: kernel selftests are
			
 
				-   regularly run by various bots to test for kernel regressions.
			
 
				-
			
 
				-   The more test cases we add to BPF selftests, the better the coverage
			
 
				-   and the less likely it is that those could accidentally break. It is
			
 
				-   not that BPF kernel selftests cannot demo how a specific feature can
			
 
				-   be used.
			
 
				-
			
 
				-   That said, samples/bpf/ may be a good place for people to get started,
			
 
				-   so it might be advisable that simple demos of features could go into
			
 
				-   samples/bpf/, but advanced functional and corner-case testing rather
			
 
				-   into kernel selftests.
			
 
				-
			
 
				-   If your sample looks like a test case, then go for BPF kernel selftests
			
 
				-   instead!
			
 
				-
			
 
				-Q: When should I add code to the bpftool?
			
 
				-
			
 
				-A: The main purpose of bpftool (under tools/bpf/bpftool/) is to provide
			
 
				-   a central user space tool for debugging and introspection of BPF programs
			
 
				-   and maps that are active in the kernel. If UAPI changes related to BPF
			
 
				-   enable dumping of additional information about programs or maps, then
			
 
				-   bpftool should be extended as well to support dumping them.
			
 
				-
			
 
				-Q: When should I add code to iproute2's BPF loader?
			
 
				-
			
 
				-A: For UAPI changes related to the XDP or tc layer (e.g. cls_bpf), the
			
 
				-   convention is that those control-path related changes are added to
			
 
				-   iproute2's BPF loader as well from user space side. This is not only
			
 
				-   useful to have UAPI changes properly designed to be usable, but also
			
 
				-   to make those changes available to a wider user base of major
			
 
				-   downstream distributions.
			
 
				-
			
 
				-Q: Do you accept patches as well for iproute2's BPF loader?
			
 
				-
			
 
				-A: Patches for the iproute2's BPF loader have to be sent to:
			
 
				-
			
 
				-     netdev@vger.kernel.org
			
 
				-
			
 
				-   While those patches are not processed by the BPF kernel maintainers,
			
 
				-   please keep them in Cc as well, so they can be reviewed.
			
 
				-
			
 
				-   The official git repository for iproute2 is run by Stephen Hemminger
			
 
				-   and can be found at:
			
 
				-
			
 
				-     https://git.kernel.org/pub/scm/linux/kernel/git/shemminger/iproute2.git/
			
 
				-
			
 
				-   The patches need to have a subject prefix of '[PATCH iproute2 master]'
			
 
				-   or '[PATCH iproute2 net-next]'. 'master' or 'net-next' describes the
			
 
				-   target branch where the patch should be applied to. Meaning, if kernel
			
 
				-   changes went into the net-next kernel tree, then the related iproute2
			
 
				-   changes need to go into the iproute2 net-next branch, otherwise they
			
 
				-   can be targeted at master branch. The iproute2 net-next branch will get
			
 
				-   merged into the master branch after the current iproute2 version from
			
 
				-   master has been released.
			
 
				-
			
 
				-   Like BPF, the patches end up in patchwork under the netdev project and
			
 
				-   are delegated to 'shemminger' for further processing:
			
 
				-
			
 
				-     http://patchwork.ozlabs.org/project/netdev/list/?delegate=389
			
 
				-
			
 
				-Q: What is the minimum requirement before I submit my BPF patches?
			
 
				-
			
 
				-A: When submitting patches, always take the time and properly test your
			
 
				-   patches *prior* to submission. Never rush them! If maintainers find
			
 
				-   that your patches have not been properly tested, it is a good way to
			
 
				-   get them grumpy. Testing patch submissions is a hard requirement!
			
 
				-
			
 
				-   Note, fixes that go to bpf tree *must* have a Fixes: tag included. The
			
 
				-   same applies to fixes that target bpf-next, where the affected commit
			
 
				-   is in net-next (or in some cases bpf-next). The Fixes: tag is crucial
			
 
				-   in order to identify follow-up commits and tremendously helps for people
			
 
				-   having to do backporting, so it is a must have!
			
 
				-
			
 
				-   We also don't accept patches with an empty commit message. Take your
			
 
				-   time and properly write up a high quality commit message, it is
			
 
				-   essential!
			
 
				-
			
 
				-   Think about it this way: other developers looking at your code a month
			
 
				-   from now need to understand *why* a certain change has been done that
			
 
				-   way, and whether there have been flaws in the analysis or assumptions
			
 
				-   that the original author did. Thus providing a proper rationale and
			
 
				-   describing the use-case for the changes is a must.
			
 
				-
			
 
				-   Patch submissions with >1 patch must have a cover letter which includes
			
 
				-   a high level description of the series. This high level summary will
			
 
				-   then be placed into the merge commit by the BPF maintainers such that
			
 
				-   it is also accessible from the git log for future reference.
			
 
				-
			
 
				-Q: What do I need to consider when adding a new instruction or feature
			
 
				-   that would require BPF JIT and/or LLVM integration as well?
			
 
				-
			
 
				-A: We try hard to keep all BPF JITs up to date such that the same user
			
 
				-   experience can be guaranteed when running BPF programs on different
			
 
				-   architectures without having the program punt to the less efficient
			
 
				-   interpreter in case the in-kernel BPF JIT is enabled.
			
 
				-
			
 
				-   If you are unable to implement or test the required JIT changes for
			
 
				-   certain architectures, please work together with the related BPF JIT
			
 
				-   developers in order to get the feature implemented in a timely manner.
			
 
				-   Please refer to the git log (arch/*/net/) to locate the necessary
			
 
				-   people for helping out.
			
 
				-
			
 
				-   Also always make sure to add BPF test cases (e.g. test_bpf.c and
			
 
				-   test_verifier.c) for new instructions, so that they can receive
			
 
				-   broad test coverage and help run-time testing the various BPF JITs.
			
 
				-
			
 
				-   In case of new BPF instructions, once the changes have been accepted
			
 
				-   into the Linux kernel, please implement support into LLVM's BPF back
			
 
				-   end. See LLVM section below for further information.
			
 
				-
			
 
				-Stable submission:
			
 
				-------------------
			
 
				-
			
 
				-Q: I need a specific BPF commit in stable kernels. What should I do?
			
 
				-
			
 
				-A: In case you need a specific fix in stable kernels, first check whether
			
 
				-   the commit has already been applied in the related linux-*.y branches:
			
 
				-
			
 
				-     https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux-stable.git/
			
 
				-
			
 
				-   If not the case, then drop an email to the BPF maintainers with the
			
 
				-   netdev kernel mailing list in Cc and ask for the fix to be queued up:
			
 
				-
			
 
				-     netdev@vger.kernel.org
			
 
				-
			
 
				-   The process in general is the same as on netdev itself, see also the
			
 
				-   netdev FAQ document.
			
 
				-
			
 
				-Q: Do you also backport to kernels not currently maintained as stable?
			
 
				-
			
 
				-A: No. If you need a specific BPF commit in kernels that are currently not
			
 
				-   maintained by the stable maintainers, then you are on your own.
			
 
				-
			
 
				-   The current stable and longterm stable kernels are all listed here:
			
 
				-
			
 
				-     https://www.kernel.org/
			
 
				-
			
 
				-Q: The BPF patch I am about to submit needs to go to stable as well. What
			
 
				-   should I do?
			
 
				-
			
 
				-A: The same rules apply as with netdev patch submissions in general, see
			
 
				-   netdev FAQ under:
			
 
				-
			
 
				-     Documentation/networking/netdev-FAQ.txt
			
 
				-
			
 
				-   Never add "Cc: stable@vger.kernel.org" to the patch description, but
			
 
				-   ask the BPF maintainers to queue the patches instead. This can be done
			
 
				-   with a note, for example, under the "---" part of the patch which does
			
 
				-   not go into the git log. Alternatively, this can be done as a simple
			
 
				-   request by mail instead.
			
 
				-
			
 
				-Q: Where do I find currently queued BPF patches that will be submitted
			
 
				-   to stable?
			
 
				-
			
 
				-A: Once patches that fix critical bugs got applied into the bpf tree, they
			
 
				-   are queued up for stable submission under:
			
 
				-
			
 
				-     http://patchwork.ozlabs.org/bundle/bpf/stable/?state=*
			
 
				-
			
 
				-   They will be on hold there at minimum until the related commit made its
			
 
				-   way into the mainline kernel tree.
			
 
				-
			
 
				-   After having been under broader exposure, the queued patches will be
			
 
				-   submitted by the BPF maintainers to the stable maintainers.
			
 
				-
			
 
				-Testing patches:
			
 
				-----------------
			
 
				-
			
 
				-Q: Which BPF kernel selftests version should I run my kernel against?
			
 
				-
			
 
				-A: If you run a kernel xyz, then always run the BPF kernel selftests from
			
 
				-   that kernel xyz as well. Do not expect that the BPF selftest from the
			
 
				-   latest mainline tree will pass all the time.
			
 
				-
			
 
				-   In particular, test_bpf.c and test_verifier.c have a large number of
			
 
				-   test cases and are constantly updated with new BPF test sequences, or
			
 
				-   existing ones are adapted to verifier changes e.g. due to verifier
			
 
				-   becoming smarter and being able to better track certain things.
			
 
				-
			
 
				-LLVM:
			
 
				------
			
 
				-
			
 
				-Q: Where do I find LLVM with BPF support?
			
 
				-
			
 
				-A: The BPF back end for LLVM is upstream in LLVM since version 3.7.1.
			
 
				-
			
 
				-   All major distributions these days ship LLVM with BPF back end enabled,
			
 
				-   so for the majority of use-cases it is not required to compile LLVM by
			
 
				-   hand anymore, just install the distribution provided package.
			
 
				-
			
 
				-   LLVM's static compiler lists the supported targets through 'llc --version',
			
 
				-   make sure BPF targets are listed. Example:
			
 
				-
			
 
				-     $ llc --version
			
 
				-     LLVM (http://llvm.org/):
			
 
				-       LLVM version 6.0.0svn
			
 
				-       Optimized build.
			
 
				-       Default target: x86_64-unknown-linux-gnu
			
 
				-       Host CPU: skylake
			
 
				-
			
 
				-       Registered Targets:
			
 
				-         bpf    - BPF (host endian)
			
 
				-         bpfeb  - BPF (big endian)
			
 
				-         bpfel  - BPF (little endian)
			
 
				-         x86    - 32-bit X86: Pentium-Pro and above
			
 
				-         x86-64 - 64-bit X86: EM64T and AMD64
			
 
				-
			
 
				-   For developers in order to utilize the latest features added to LLVM's
			
 
				-   BPF back end, it is advisable to run the latest LLVM releases. Support
			
 
				-   for new BPF kernel features such as additions to the BPF instruction
			
 
				-   set are often developed together.
			
 
				-
			
 
				-   All LLVM releases can be found at: http://releases.llvm.org/
			
 
				-
			
 
				-Q: Got it, so how do I build LLVM manually anyway?
			
 
				-
			
 
				-A: You need cmake and gcc-c++ as build requisites for LLVM. Once you have
			
 
				-   that set up, proceed with building the latest LLVM and clang version
			
 
				-   from the git repositories:
			
 
				-
			
 
				-     $ git clone http://llvm.org/git/llvm.git
			
 
				-     $ cd llvm/tools
			
 
				-     $ git clone --depth 1 http://llvm.org/git/clang.git
			
 
				-     $ cd ..; mkdir build; cd build
			
 
				-     $ cmake .. -DLLVM_TARGETS_TO_BUILD="BPF;X86" \
			
 
				-                -DBUILD_SHARED_LIBS=OFF           \
			
 
				-                -DCMAKE_BUILD_TYPE=Release        \
			
 
				-                -DLLVM_BUILD_RUNTIME=OFF
			
 
				-     $ make -j $(getconf _NPROCESSORS_ONLN)
			
 
				-
			
 
				-   The built binaries can then be found in the build/bin/ directory, where
			
 
				-   you can point the PATH variable to.
			
 
				-
			
 
				-Q: Should I notify BPF kernel maintainers about issues in LLVM's BPF code
			
 
				-   generation back end or about LLVM generated code that the verifier
			
 
				-   refuses to accept?
			
 
				-
			
 
				-A: Yes, please do! LLVM's BPF back end is a key piece of the whole BPF
			
 
				-   infrastructure and it ties deeply into verification of programs from the
			
 
				-   kernel side. Therefore, any issues on either side need to be investigated
			
 
				-   and fixed whenever necessary.
			
 
				-
			
 
				-   Therefore, please make sure to bring them up at netdev kernel mailing
			
 
				-   list and Cc BPF maintainers for LLVM and kernel bits:
			
 
				-
			
 
				-     Yonghong Song <yhs@fb.com>
			
 
				-     Alexei Starovoitov <ast@kernel.org>
			
 
				-     Daniel Borkmann <daniel@iogearbox.net>
			
 
				-
			
 
				-   LLVM also has an issue tracker where BPF related bugs can be found:
			
 
				-
			
 
				-     https://bugs.llvm.org/buglist.cgi?quicksearch=bpf
			
 
				-
			
 
				-   However, it is better to reach out through mailing lists with having
			
 
				-   maintainers in Cc.
			
 
				-
			
 
				-Q: I have added a new BPF instruction to the kernel, how can I integrate
			
 
				-   it into LLVM?
			
 
				-
			
 
				-A: LLVM has a -mcpu selector for the BPF back end in order to allow the
			
 
				-   selection of BPF instruction set extensions. By default the 'generic'
			
 
				-   processor target is used, which is the base instruction set (v1) of BPF.
			
 
				-
			
 
				-   LLVM has an option to select -mcpu=probe where it will probe the host
			
 
				-   kernel for supported BPF instruction set extensions and selects the
			
 
				-   optimal set automatically.
			
 
				-
			
 
				-   For cross-compilation, a specific version can be selected manually as well.
			
 
				-
			
 
				-     $ llc -march bpf -mcpu=help
			
 
				-     Available CPUs for this target:
			
 
				-
			
 
				-       generic - Select the generic processor.
			
 
				-       probe   - Select the probe processor.
			
 
				-       v1      - Select the v1 processor.
			
 
				-       v2      - Select the v2 processor.
			
 
				-     [...]
			
 
				-
			
 
				-   Newly added BPF instructions to the Linux kernel need to follow the same
			
 
				-   scheme, bump the instruction set version and implement probing for the
			
 
				-   extensions such that -mcpu=probe users can benefit from the optimization
			
 
				-   transparently when upgrading their kernels.
			
 
				-
			
 
				-   If you are unable to implement support for the newly added BPF instruction
			
 
				-   please reach out to BPF developers for help.
			
 
				-
			
 
				-   By the way, the BPF kernel selftests run with -mcpu=probe for better
			
 
				-   test coverage.
			
 
				-
			
 
				-Q: In some cases clang flag "-target bpf" is used but in other cases the
			
 
				-   default clang target, which matches the underlying architecture, is used.
			
 
				-   What is the difference and when should I use which?
			
 
				-
			
 
				-A: Although LLVM IR generation and optimization try to stay architecture
			
 
				-   independent, "-target <arch>" still has some impact on generated code:
			
 
				-
			
 
				-     - BPF program may recursively include header file(s) with file scope
			
 
				-       inline assembly codes. The default target can handle this well,
			
 
				-       while bpf target may fail if bpf backend assembler does not
			
 
				-       understand these assembly codes, which is true in most cases.
			
 
				-
			
 
				-     - When compiled without -g, additional elf sections, e.g.,
			
 
				-       .eh_frame and .rela.eh_frame, may be present in the object file
			
 
				-       with default target, but not with bpf target.
			
 
				-
			
 
				-     - The default target may turn a C switch statement into a switch table
			
 
				-       lookup and jump operation. Since the switch table is placed
			
 
				-       in the global readonly section, the bpf program will fail to load.
			
 
				-       The bpf target does not support switch table optimization.
			
 
				-       The clang option "-fno-jump-tables" can be used to disable
			
 
				-       switch table generation.
			
 
				-
			
 
				-     - For clang -target bpf, it is guaranteed that pointer or long /
			
 
				-       unsigned long types will always have a width of 64 bit, no matter
			
 
				-       whether underlying clang binary or default target (or kernel) is
			
 
				-       32 bit. However, when native clang target is used, then it will
			
 
				-       compile these types based on the underlying architecture's conventions,
			
 
				-       meaning in case of 32 bit architecture, pointer or long / unsigned
			
 
				-       long types e.g. in BPF context structure will have width of 32 bit
			
 
				-       while the BPF LLVM back end still operates in 64 bit. The native
			
 
				-       target is mostly needed in tracing for the case of walking pt_regs
			
 
				-       or other kernel structures where CPU's register width matters.
			
 
				-       Otherwise, clang -target bpf is generally recommended.
			
 
				-
			
 
				-   You should use default target when:
			
 
				-
			
 
				-     - Your program includes a header file, e.g., ptrace.h, which eventually
			
 
				-       pulls in some header files containing file scope host assembly codes.
			
 
				-     - You can add "-fno-jump-tables" to work around the switch table issue.
			
 
				-
			
 
				-   Otherwise, you can use bpf target. Additionally, you _must_ use bpf target
			
 
				-   when:
			
 
				-
			
 
				-     - Your program uses data structures with pointer or long / unsigned long
			
 
				-       types that interface with BPF helpers or context data structures. Access
			
 
				-       into these structures is verified by the BPF verifier and may result
			
 
				-       in verification failures if the native architecture is not aligned with
			
 
				-       the BPF architecture, e.g. 64-bit. An example of this is
			
 
				-       BPF_PROG_TYPE_SK_MSG, which requires '-target bpf'.
			
 
				-
			
 
				-Happy BPF hacking!
			
--- a/Documentation/cgroup-v2.txt
+++ b/Documentation/cgroup-v2.txt
@@ -1,1998 +0,0 @@
 
				-================
			
 
				-Control Group v2
			
 
				-================
			
 
				-
			
 
				-:Date: October, 2015
			
 
				-:Author: Tejun Heo <tj@kernel.org>
			
 
				-
			
 
				-This is the authoritative documentation on the design, interface and
			
 
				-conventions of cgroup v2.  It describes all userland-visible aspects
			
 
				-of cgroup including core and specific controller behaviors.  All
			
 
				-future changes must be reflected in this document.  Documentation for
			
 
				-v1 is available under Documentation/cgroup-v1/.
			
 
				-
			
 
				-.. CONTENTS
			
 
				-
			
 
				-   1. Introduction
			
 
				-     1-1. Terminology
			
 
				-     1-2. What is cgroup?
			
 
				-   2. Basic Operations
			
 
				-     2-1. Mounting
			
 
				-     2-2. Organizing Processes and Threads
			
 
				-       2-2-1. Processes
			
 
				-       2-2-2. Threads
			
 
				-     2-3. [Un]populated Notification
			
 
				-     2-4. Controlling Controllers
			
 
				-       2-4-1. Enabling and Disabling
			
 
				-       2-4-2. Top-down Constraint
			
 
				-       2-4-3. No Internal Process Constraint
			
 
				-     2-5. Delegation
			
 
				-       2-5-1. Model of Delegation
			
 
				-       2-5-2. Delegation Containment
			
 
				-     2-6. Guidelines
			
 
				-       2-6-1. Organize Once and Control
			
 
				-       2-6-2. Avoid Name Collisions
			
 
				-   3. Resource Distribution Models
			
 
				-     3-1. Weights
			
 
				-     3-2. Limits
			
 
				-     3-3. Protections
			
 
				-     3-4. Allocations
			
 
				-   4. Interface Files
			
 
				-     4-1. Format
			
 
				-     4-2. Conventions
			
 
				-     4-3. Core Interface Files
			
 
				-   5. Controllers
			
 
				-     5-1. CPU
			
 
				-       5-1-1. CPU Interface Files
			
 
				-     5-2. Memory
			
 
				-       5-2-1. Memory Interface Files
			
 
				-       5-2-2. Usage Guidelines
			
 
				-       5-2-3. Memory Ownership
			
 
				-     5-3. IO
			
 
				-       5-3-1. IO Interface Files
			
 
				-       5-3-2. Writeback
			
 
				-     5-4. PID
			
 
				-       5-4-1. PID Interface Files
			
 
				-     5-5. Device
			
 
				-     5-6. RDMA
			
 
				-       5-6-1. RDMA Interface Files
			
 
				-     5-7. Misc
			
 
				-       5-7-1. perf_event
			
 
				-     5-N. Non-normative information
			
 
				-       5-N-1. CPU controller root cgroup process behaviour
			
 
				-       5-N-2. IO controller root cgroup process behaviour
			
 
				-   6. Namespace
			
 
				-     6-1. Basics
			
 
				-     6-2. The Root and Views
			
 
				-     6-3. Migration and setns(2)
			
 
				-     6-4. Interaction with Other Namespaces
			
 
				-   P. Information on Kernel Programming
			
 
				-     P-1. Filesystem Support for Writeback
			
 
				-   D. Deprecated v1 Core Features
			
 
				-   R. Issues with v1 and Rationales for v2
			
 
				-     R-1. Multiple Hierarchies
			
 
				-     R-2. Thread Granularity
			
 
				-     R-3. Competition Between Inner Nodes and Threads
			
 
				-     R-4. Other Interface Issues
			
 
				-     R-5. Controller Issues and Remedies
			
 
				-       R-5-1. Memory
			
 
				-
			
 
				-
			
 
				-Introduction
			
 
				-============
			
 
				-
			
 
				-Terminology
			
 
				------------
			
 
				-
			
 
				-"cgroup" stands for "control group" and is never capitalized.  The
			
 
				-singular form is used to designate the whole feature and also as a
			
 
				-qualifier as in "cgroup controllers".  When explicitly referring to
			
 
				-multiple individual control groups, the plural form "cgroups" is used.
			
 
				-
			
 
				-
			
 
				-What is cgroup?
			
 
				----------------
			
 
				-
			
 
				-cgroup is a mechanism to organize processes hierarchically and
			
 
				-distribute system resources along the hierarchy in a controlled and
			
 
				-configurable manner.
			
 
				-
			
 
				-cgroup is largely composed of two parts - the core and controllers.
			
 
				-cgroup core is primarily responsible for hierarchically organizing
			
 
				-processes.  A cgroup controller is usually responsible for
			
 
				-distributing a specific type of system resource along the hierarchy
			
 
				-although there are utility controllers which serve purposes other than
			
 
				-resource distribution.
			
 
				-
			
 
				-cgroups form a tree structure and every process in the system belongs
			
 
				-to one and only one cgroup.  All threads of a process belong to the
			
 
				-same cgroup.  On creation, all processes are put in the cgroup that
			
 
				-the parent process belongs to at the time.  A process can be migrated
			
 
				-to another cgroup.  Migration of a process doesn't affect already
			
 
				-existing descendant processes.
			
 
				-
			
 
				-Following certain structural constraints, controllers may be enabled or
			
 
				-disabled selectively on a cgroup.  All controller behaviors are
			
 
				-hierarchical - if a controller is enabled on a cgroup, it affects all
			
 
				-processes which belong to the cgroups comprising the inclusive
			
 
				-sub-hierarchy of the cgroup.  When a controller is enabled on a nested
			
 
				-cgroup, it always restricts the resource distribution further.  The
			
 
				-restrictions set closer to the root in the hierarchy can not be
			
 
				-overridden from further away.
			
 
				-
			
 
				-
			
 
				-Basic Operations
			
 
				-================
			
 
				-
			
 
				-Mounting
			
 
				---------
			
 
				-
			
 
				-Unlike v1, cgroup v2 has only a single hierarchy.  The cgroup v2
			
 
				-hierarchy can be mounted with the following mount command::
			
 
				-
			
 
				-  # mount -t cgroup2 none $MOUNT_POINT
			
 
				-
			
 
				-cgroup2 filesystem has the magic number 0x63677270 ("cgrp").  All
			
 
				-controllers which support v2 and are not bound to a v1 hierarchy are
			
 
				-automatically bound to the v2 hierarchy and show up at the root.
			
 
				-Controllers which are not in active use in the v2 hierarchy can be
			
 
				-bound to other hierarchies.  This allows mixing v2 hierarchy with the
			
 
				-legacy v1 multiple hierarchies in a fully backward compatible way.
			
 
				-
			
 
				-A controller can be moved across hierarchies only after the controller
			
 
				-is no longer referenced in its current hierarchy.  Because per-cgroup
			
 
				-controller states are destroyed asynchronously and controllers may
			
 
				-have lingering references, a controller may not show up immediately on
			
 
				-the v2 hierarchy after the final umount of the previous hierarchy.
			
 
				-Similarly, a controller should be fully disabled to be moved out of
			
 
				-the unified hierarchy and it may take some time for the disabled
			
 
				-controller to become available for other hierarchies; furthermore, due
			
 
				-to inter-controller dependencies, other controllers may need to be
			
 
				-disabled too.
			
 
				-
			
 
				-While useful for development and manual configurations, moving
			
 
				-controllers dynamically between the v2 and other hierarchies is
			
 
				-strongly discouraged for production use.  It is recommended to decide
			
 
				-the hierarchies and controller associations before starting using the
			
 
				-controllers after system boot.
			
 
				-
			
 
				-During transition to v2, system management software might still
			
 
				-automount the v1 cgroup filesystem and so hijack all controllers
			
 
				-during boot, before manual intervention is possible. To make testing
			
 
				-and experimenting easier, the kernel parameter cgroup_no_v1= allows
			
 
				-disabling controllers in v1 and making them always available in v2.
			
 
				-
			
 
				-cgroup v2 currently supports the following mount options.
			
 
				-
			
 
				-  nsdelegate
			
 
				-
			
 
				-	Consider cgroup namespaces as delegation boundaries.  This
			
 
				-	option is system wide and can only be set on mount or modified
			
 
				-	through remount from the init namespace.  The mount option is
			
 
				-	ignored on non-init namespace mounts.  Please refer to the
			
 
				-	Delegation section for details.
			
 
				-
			
 
				-
			
 
				-Organizing Processes and Threads
			
 
				---------------------------------
			
 
				-
			
 
				-Processes
			
 
				-~~~~~~~~~
			
 
				-
			
 
				-Initially, only the root cgroup exists to which all processes belong.
			
 
				-A child cgroup can be created by creating a sub-directory::
			
 
				-
			
 
				-  # mkdir $CGROUP_NAME
			
 
				-
			
 
				-A given cgroup may have multiple child cgroups forming a tree
			
 
				-structure.  Each cgroup has a read-writable interface file
			
 
				-"cgroup.procs".  When read, it lists the PIDs of all processes which
			
 
				-belong to the cgroup one-per-line.  The PIDs are not ordered and the
			
 
				-same PID may show up more than once if the process got moved to
			
 
				-another cgroup and then back or the PID got recycled while reading.
			
 
				-
			
 
				-A process can be migrated into a cgroup by writing its PID to the
			
 
				-target cgroup's "cgroup.procs" file.  Only one process can be migrated
			
 
				-on a single write(2) call.  If a process is composed of multiple
			
 
				-threads, writing the PID of any thread migrates all threads of the
			
 
				-process.
			
 
				-
			
 
				-When a process forks a child process, the new process is born into the
			
 
				-cgroup that the forking process belongs to at the time of the
			
 
				-operation.  After exit, a process stays associated with the cgroup
			
 
				-that it belonged to at the time of exit until it's reaped; however, a
			
 
				-zombie process does not appear in "cgroup.procs" and thus can't be
			
 
				-moved to another cgroup.
			
 
				-
			
 
				-A cgroup which doesn't have any children or live processes can be
			
 
				-destroyed by removing the directory.  Note that a cgroup which doesn't
			
 
				-have any children and is associated only with zombie processes is
			
 
				-considered empty and can be removed::
			
 
				-
			
 
				-  # rmdir $CGROUP_NAME
			
 
				-
			
 
				-"/proc/$PID/cgroup" lists a process's cgroup membership.  If legacy
			
 
				-cgroup is in use in the system, this file may contain multiple lines,
			
 
				-one for each hierarchy.  The entry for cgroup v2 is always in the
			
 
				-format "0::$PATH"::
			
 
				-
			
 
				-  # cat /proc/842/cgroup
			
 
				-  ...
			
 
				-  0::/test-cgroup/test-cgroup-nested
			
 
				-
			
 
				-If the process becomes a zombie and the cgroup it was associated with
			
 
				-is removed subsequently, " (deleted)" is appended to the path::
			
 
				-
			
 
				-  # cat /proc/842/cgroup
			
 
				-  ...
			
 
				-  0::/test-cgroup/test-cgroup-nested (deleted)
			
 
				-
			
 
				-
			
 
				-Threads
			
 
				-~~~~~~~
			
 
				-
			
 
				-cgroup v2 supports thread granularity for a subset of controllers to
			
 
				-support use cases requiring hierarchical resource distribution across
			
 
				-the threads of a group of processes.  By default, all threads of a
			
 
				-process belong to the same cgroup, which also serves as the resource
			
 
				-domain to host resource consumptions which are not specific to a
			
 
				-process or thread.  The thread mode allows threads to be spread across
			
 
				-a subtree while still maintaining the common resource domain for them.
			
 
				-
			
 
				-Controllers which support thread mode are called threaded controllers.
			
 
				-The ones which don't are called domain controllers.
			
 
				-
			
 
				-Marking a cgroup threaded makes it join the resource domain of its
			
 
				-parent as a threaded cgroup.  The parent may be another threaded
			
 
				-cgroup whose resource domain is further up in the hierarchy.  The root
			
 
				-of a threaded subtree, that is, the nearest ancestor which is not
			
 
				-threaded, is called threaded domain or thread root interchangeably and
			
 
				-serves as the resource domain for the entire subtree.
			
 
				-
			
 
				-Inside a threaded subtree, threads of a process can be put in
			
 
				-different cgroups and are not subject to the no internal process
			
 
				-constraint - threaded controllers can be enabled on non-leaf cgroups
			
 
				-whether they have threads in them or not.
			
 
				-
			
 
				-As the threaded domain cgroup hosts all the domain resource
			
 
				-consumptions of the subtree, it is considered to have internal
			
 
				-resource consumptions whether there are processes in it or not and
			
 
				-can't have populated child cgroups which aren't threaded.  Because the
			
 
				-root cgroup is not subject to no internal process constraint, it can
			
 
				-serve both as a threaded domain and a parent to domain cgroups.
			
 
				-
			
 
				-The current operation mode or type of the cgroup is shown in the
			
 
				-"cgroup.type" file which indicates whether the cgroup is a normal
			
 
				-domain, a domain which is serving as the domain of a threaded subtree,
			
 
				-or a threaded cgroup.
			
 
				-
			
 
				-On creation, a cgroup is always a domain cgroup and can be made
			
 
				-threaded by writing "threaded" to the "cgroup.type" file.  The
			
 
				-operation is single direction::
			
 
				-
			
 
				-  # echo threaded > cgroup.type
			
 
				-
			
 
				-Once threaded, the cgroup can't be made a domain again.  To enable the
			
 
				-thread mode, the following conditions must be met.
			
 
				-
			
 
				-- As the cgroup will join the parent's resource domain, the parent
			
 
				-  must either be a valid (threaded) domain or a threaded cgroup.
			
 
				-
			
 
				-- When the parent is an unthreaded domain, it must not have any domain
			
 
				-  controllers enabled or populated domain children.  The root is
			
 
				-  exempt from this requirement.
			
 
				-
			
 
				-Topology-wise, a cgroup can be in an invalid state.  Please consider
			
 
				-the following topology::
			
 
				-
			
 
				-  A (threaded domain) - B (threaded) - C (domain, just created)
			
 
				-
			
 
				-C is created as a domain but isn't connected to a parent which can
			
 
				-host child domains.  C can't be used until it is turned into a
			
 
				-threaded cgroup.  "cgroup.type" file will report "domain (invalid)" in
			
 
				-these cases.  Operations which fail due to invalid topology use
			
 
				-EOPNOTSUPP as the errno.
			
 
				-
			
 
				-A domain cgroup is turned into a threaded domain when one of its child
			
 
				-cgroups becomes threaded or threaded controllers are enabled in the
			
 
				-"cgroup.subtree_control" file while there are processes in the cgroup.
			
 
				-A threaded domain reverts to a normal domain when the conditions
			
 
				-clear.
			
 
				-
			
 
				-When read, "cgroup.threads" contains the list of the thread IDs of all
			
 
				-threads in the cgroup.  Except that the operations are per-thread
			
 
				-instead of per-process, "cgroup.threads" has the same format and
			
 
				-behaves the same way as "cgroup.procs".  While "cgroup.threads" can be
			
 
				-written to in any cgroup, as it can only move threads inside the same
			
 
				-threaded domain, its operations are confined inside each threaded
			
 
				-subtree.
			
 
				-
			
 
				-The threaded domain cgroup serves as the resource domain for the whole
			
 
				-subtree, and, while the threads can be scattered across the subtree,
			
 
				-all the processes are considered to be in the threaded domain cgroup.
			
 
				-"cgroup.procs" in a threaded domain cgroup contains the PIDs of all
			
 
				-processes in the subtree and is not readable in the subtree proper.
			
 
				-However, "cgroup.procs" can be written to from anywhere in the subtree
			
 
				-to migrate all threads of the matching process to the cgroup.
			
 
				-
			
 
				-Only threaded controllers can be enabled in a threaded subtree.  When
			
 
				-a threaded controller is enabled inside a threaded subtree, it only
			
 
				-accounts for and controls resource consumptions associated with the
			
 
				-threads in the cgroup and its descendants.  All consumptions which
			
 
				-aren't tied to a specific thread belong to the threaded domain cgroup.
			
 
				-
			
 
				-Because a threaded subtree is exempt from no internal process
			
 
				-constraint, a threaded controller must be able to handle competition
			
 
				-between threads in a non-leaf cgroup and its child cgroups.  Each
			
 
				-threaded controller defines how such competitions are handled.
			
 
				-
			
 
				-
			
 
				-[Un]populated Notification
			
 
				---------------------------
			
 
				-
			
 
				-Each non-root cgroup has a "cgroup.events" file which contains
			
 
				-"populated" field indicating whether the cgroup's sub-hierarchy has
			
 
				-live processes in it.  Its value is 0 if there is no live process in
			
 
				-the cgroup and its descendants; otherwise, 1.  poll and [id]notify
			
 
				-events are triggered when the value changes.  This can be used, for
			
 
				-example, to start a clean-up operation after all processes of a given
			
 
				-sub-hierarchy have exited.  The populated state updates and
			
 
				-notifications are recursive.  Consider the following sub-hierarchy
			
 
				-where the numbers in the parentheses represent the numbers of processes
			
 
				-in each cgroup::
			
 
				-
			
 
				-  A(4) - B(0) - C(1)
			
 
				-              \ D(0)
			
 
				-
			
 
				-A, B and C's "populated" fields would be 1 while D's 0.  After the one
			
 
				-process in C exits, B and C's "populated" fields would flip to "0" and
			
 
				-file modified events will be generated on the "cgroup.events" files of
			
 
				-both cgroups.
			
 
				-
			
 
				-
			
 
				-Controlling Controllers
			
 
				------------------------
			
 
				-
			
 
				-Enabling and Disabling
			
 
				-~~~~~~~~~~~~~~~~~~~~~~
			
 
				-
			
 
				-Each cgroup has a "cgroup.controllers" file which lists all
			
 
				-controllers available for the cgroup to enable::
			
 
				-
			
 
				-  # cat cgroup.controllers
			
 
				-  cpu io memory
			
 
				-
			
 
				-No controller is enabled by default.  Controllers can be enabled and
			
 
				-disabled by writing to the "cgroup.subtree_control" file::
			
 
				-
			
 
				-  # echo "+cpu +memory -io" > cgroup.subtree_control
			
 
				-
			
 
				-Only controllers which are listed in "cgroup.controllers" can be
			
 
				-enabled.  When multiple operations are specified as above, either they
			
 
				-all succeed or all fail.  If multiple operations on the same controller
			
 
				-are specified, the last one is effective.
			
 
				-
			
 
				-Enabling a controller in a cgroup indicates that the distribution of
			
 
				-the target resource across its immediate children will be controlled.
			
 
				-Consider the following sub-hierarchy.  The enabled controllers are
			
 
				-listed in parentheses::
			
 
				-
			
 
				-  A(cpu,memory) - B(memory) - C()
			
 
				-                            \ D()
			
 
				-
			
 
				-As A has "cpu" and "memory" enabled, A will control the distribution
			
 
				-of CPU cycles and memory to its children, in this case, B.  As B has
			
 
				-"memory" enabled but not "cpu", C and D will compete freely on CPU
			
 
				-cycles but their division of memory available to B will be controlled.
			
 
				-
			
 
				-As a controller regulates the distribution of the target resource to
			
 
				-the cgroup's children, enabling it creates the controller's interface
			
 
				-files in the child cgroups.  In the above example, enabling "cpu" on B
			
 
				-would create the "cpu." prefixed controller interface files in C and
			
 
				-D.  Likewise, disabling "memory" from B would remove the "memory."
			
 
				-prefixed controller interface files from C and D.  This means that the
			
 
				-controller interface files - anything which doesn't start with
			
 
				-"cgroup." - are owned by the parent rather than the cgroup itself.
			
 
				-
			
 
				-
			
 
				-Top-down Constraint
			
 
				-~~~~~~~~~~~~~~~~~~~
			
 
				-
			
 
				-Resources are distributed top-down and a cgroup can further distribute
			
 
				-a resource only if the resource has been distributed to it from the
			
 
				-parent.  This means that all non-root "cgroup.subtree_control" files
			
 
				-can only contain controllers which are enabled in the parent's
			
 
				-"cgroup.subtree_control" file.  A controller can be enabled only if
			
 
				-the parent has the controller enabled and a controller can't be
			
 
				-disabled if one or more children have it enabled.
			
 
				-
			
 
				-
			
 
				-No Internal Process Constraint
			
 
				-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
			
 
				-
			
 
				-Non-root cgroups can distribute domain resources to their children
			
 
				-only when they don't have any processes of their own.  In other words,
			
 
				-only domain cgroups which don't contain any processes can have domain
			
 
				-controllers enabled in their "cgroup.subtree_control" files.
			
 
				-
			
 
				-This guarantees that, when a domain controller is looking at the part
			
 
				-of the hierarchy which has it enabled, processes are always only on
			
 
				-the leaves.  This rules out situations where child cgroups compete
			
 
				-against internal processes of the parent.
			
 
				-
			
 
				-The root cgroup is exempt from this restriction.  Root contains
			
 
				-processes and anonymous resource consumption which can't be associated
			
 
				-with any other cgroups and requires special treatment from most
			
 
				-controllers.  How resource consumption in the root cgroup is governed
			
 
				-is up to each controller (for more information on this topic please
			
 
				-refer to the Non-normative information section in the Controllers
			
 
				-chapter).
			
 
				-
			
 
				-Note that the restriction doesn't get in the way if there is no
			
 
				-enabled controller in the cgroup's "cgroup.subtree_control".  This is
			
 
				-important as otherwise it wouldn't be possible to create children of a
			
 
				-populated cgroup.  To control resource distribution of a cgroup, the
			
 
				-cgroup must create children and transfer all its processes to the
			
 
				-children before enabling controllers in its "cgroup.subtree_control"
			
 
				-file.
			
 
				-
			
 
				-
			
 
				-Delegation
			
 
				-----------
			
 
				-
			
 
				-Model of Delegation
			
 
				-~~~~~~~~~~~~~~~~~~~
			
 
				-
			
 
				-A cgroup can be delegated in two ways.  First, to a less privileged
			
 
				-user by granting write access of the directory and its "cgroup.procs",
			
 
				-"cgroup.threads" and "cgroup.subtree_control" files to the user.
			
 
				-Second, if the "nsdelegate" mount option is set, automatically to a
			
 
				-cgroup namespace on namespace creation.
			
 
				-
			
 
				-Because the resource control interface files in a given directory
			
 
				-control the distribution of the parent's resources, the delegatee
			
 
				-shouldn't be allowed to write to them.  For the first method, this is
			
 
				-achieved by not granting access to these files.  For the second, the
			
 
				-kernel rejects writes to all files other than "cgroup.procs" and
			
 
				-"cgroup.subtree_control" on a namespace root from inside the
			
 
				-namespace.
			
 
				-
			
 
				-The end results are equivalent for both delegation types.  Once
			
 
				-delegated, the user can build sub-hierarchy under the directory,
			
 
				-organize processes inside it as it sees fit and further distribute the
			
 
				-resources it received from the parent.  The limits and other settings
			
 
				-of all resource controllers are hierarchical and regardless of what
			
 
				-happens in the delegated sub-hierarchy, nothing can escape the
			
 
				-resource restrictions imposed by the parent.
			
 
				-
			
 
				-Currently, cgroup doesn't impose any restrictions on the number of
			
 
				-cgroups in or nesting depth of a delegated sub-hierarchy; however,
			
 
				-this may be limited explicitly in the future.
			
 
				-
			
 
				-
			
 
				-Delegation Containment
			
 
				-~~~~~~~~~~~~~~~~~~~~~~
			
 
				-
			
 
				-A delegated sub-hierarchy is contained in the sense that processes
			
 
				-can't be moved into or out of the sub-hierarchy by the delegatee.
			
 
				-
			
 
				-For delegations to a less privileged user, this is achieved by
			
 
				-requiring the following conditions for a process with a non-root euid
			
 
				-to migrate a target process into a cgroup by writing its PID to the
			
 
				-"cgroup.procs" file.
			
 
				-
			
 
				-- The writer must have write access to the "cgroup.procs" file.
			
 
				-
			
 
				-- The writer must have write access to the "cgroup.procs" file of the
			
 
				-  common ancestor of the source and destination cgroups.
			
 
				-
			
 
				-The above two constraints ensure that while a delegatee may migrate
			
 
				-processes around freely in the delegated sub-hierarchy it can't pull
			
 
				-in from or push out to outside the sub-hierarchy.
			
 
				-
			
 
				-For an example, let's assume cgroups C0 and C1 have been delegated to
			
 
				-user U0 who created C00, C01 under C0 and C10 under C1 as follows and
			
 
				-all processes under C0 and C1 belong to U0::
			
 
				-
			
 
				-  ~~~~~~~~~~~~~ - C0 - C00
			
 
				-  ~ cgroup    ~      \ C01
			
 
				-  ~ hierarchy ~
			
 
				-  ~~~~~~~~~~~~~ - C1 - C10
			
 
				-
			
 
				-Let's also say U0 wants to write the PID of a process which is
			
 
				-currently in C10 into "C00/cgroup.procs".  U0 has write access to the
			
 
				-file; however, the common ancestor of the source cgroup C10 and the
			
 
				-destination cgroup C00 is above the points of delegation and U0 would
			
 
				-not have write access to its "cgroup.procs" files and thus the write
			
 
				-will be denied with -EACCES.
			
 
				-
			
 
				-For delegations to namespaces, containment is achieved by requiring
			
 
				-that both the source and destination cgroups are reachable from the
			
 
				-namespace of the process which is attempting the migration.  If either
			
 
				-is not reachable, the migration is rejected with -ENOENT.
			
 
				-
			
 
				-
			
 
				-Guidelines
			
 
				-----------
			
 
				-
			
 
				-Organize Once and Control
			
 
				-~~~~~~~~~~~~~~~~~~~~~~~~~
			
 
				-
			
 
				-Migrating a process across cgroups is a relatively expensive operation
			
 
				-and stateful resources such as memory are not moved together with the
			
 
				-process.  This is an explicit design decision as there often exist
			
 
				-inherent trade-offs between migration and various hot paths in terms
			
 
				-of synchronization cost.
			
 
				-
			
 
				-As such, migrating processes across cgroups frequently as a means to
			
 
				-apply different resource restrictions is discouraged.  A workload
			
 
				-should be assigned to a cgroup according to the system's logical and
			
 
				-resource structure once on start-up.  Dynamic adjustments to resource
			
 
				-distribution can be made by changing controller configuration through
			
 
				-the interface files.
			
 
				-
			
 
				-
			
 
				-Avoid Name Collisions
			
 
				-~~~~~~~~~~~~~~~~~~~~~
			
 
				-
			
 
				-Interface files for a cgroup and its children cgroups occupy the same
			
 
				-directory and it is possible to create children cgroups which collide
			
 
				-with interface files.
			
 
				-
			
 
				-All cgroup core interface files are prefixed with "cgroup." and each
			
 
				-controller's interface files are prefixed with the controller name and
			
 
				-a dot.  A controller's name is composed of lower case alphabets and
			
 
				-'_'s but never begins with an '_' so it can be used as the prefix
			
 
				-character for collision avoidance.  Also, interface file names won't
			
 
				-start or end with terms which are often used in categorizing workloads
			
 
				-such as job, service, slice, unit or workload.
			
 
				-
			
 
				-cgroup doesn't do anything to prevent name collisions and it's the
			
 
				-user's responsibility to avoid them.
			
 
				-
			
 
				-
			
 
				-Resource Distribution Models
			
 
				-============================
			
 
				-
			
 
				-cgroup controllers implement several resource distribution schemes
			
 
				-depending on the resource type and expected use cases.  This section
			
 
				-describes major schemes in use along with their expected behaviors.
			
 
				-
			
 
				-
			
 
				-Weights
			
 
				--------
			
 
				-
			
 
				-A parent's resource is distributed by adding up the weights of all
			
 
				-active children and giving each the fraction matching the ratio of its
			
 
				-weight against the sum.  As only children which can make use of the
			
 
				-resource at the moment participate in the distribution, this is
			
 
				-work-conserving.  Due to the dynamic nature, this model is usually
			
 
				-used for stateless resources.
			
 
				-
			
 
				-All weights are in the range [1, 10000] with the default at 100.  This
			
 
				-allows symmetric multiplicative biases in both directions at fine
			
 
				-enough granularity while staying in the intuitive range.
			
 
				-
			
 
				-As long as the weight is in range, all configuration combinations are
			
 
				-valid and there is no reason to reject configuration changes or
			
 
				-process migrations.
			
 
				-
			
 
				-"cpu.weight" proportionally distributes CPU cycles to active children
			
 
				-and is an example of this type.
			
 
				-
			
 
				-
			
 
				-Limits
			
 
				-------
			
 
				-
			
 
				-A child can only consume up to the configured amount of the resource.
			
 
				-Limits can be over-committed - the sum of the limits of children can
			
 
				-exceed the amount of resource available to the parent.
			
 
				-
			
 
				-Limits are in the range [0, max] and default to "max", which is noop.
			
 
				-
			
 
				-As limits can be over-committed, all configuration combinations are
			
 
				-valid and there is no reason to reject configuration changes or
			
 
				-process migrations.
			
 
				-
			
 
				-"io.max" limits the maximum BPS and/or IOPS that a cgroup can consume
			
 
				-on an IO device and is an example of this type.
			
 
				-
			
 
				-
			
 
				-Protections
			
 
				------------
			
 
				-
			
 
				-A cgroup is protected to be allocated up to the configured amount of
			
 
				-the resource if the usages of all its ancestors are under their
			
 
				-protected levels.  Protections can be hard guarantees or best effort
			
 
				-soft boundaries.  Protections can also be over-committed in which case
			
 
				-only up to the amount available to the parent is protected among
			
 
				-children.
			
 
				-
			
 
				-Protections are in the range [0, max] and default to 0, which is
			
 
				-noop.
			
 
				-
			
 
				-As protections can be over-committed, all configuration combinations
			
 
				-are valid and there is no reason to reject configuration changes or
			
 
				-process migrations.
			
 
				-
			
 
				-"memory.low" implements best-effort memory protection and is an
			
 
				-example of this type.
			
 
				-
			
 
				-
			
 
				-Allocations
			
 
				------------
			
 
				-
			
 
				-A cgroup is exclusively allocated a certain amount of a finite
			
 
				-resource.  Allocations can't be over-committed - the sum of the
			
 
				-allocations of children can not exceed the amount of resource
			
 
				-available to the parent.
			
 
				-
			
 
				-Allocations are in the range [0, max] and default to 0, which is no
			
 
				-resource.
			
 
				-
			
 
				-As allocations can't be over-committed, some configuration
			
 
				-combinations are invalid and should be rejected.  Also, if the
			
 
				-resource is mandatory for execution of processes, process migrations
			
 
				-may be rejected.
			
 
				-
			
 
				-"cpu.rt.max" hard-allocates realtime slices and is an example of this
			
 
				-type.
			
 
				-
			
 
				-
			
 
				-Interface Files
			
 
				-===============
			
 
				-
			
 
				-Format
			
 
				-------
			
 
				-
			
 
				-All interface files should be in one of the following formats whenever
			
 
				-possible::
			
 
				-
			
 
				-  New-line separated values
			
 
				-  (when only one value can be written at once)
			
 
				-
			
 
				-	VAL0\n
			
 
				-	VAL1\n
			
 
				-	...
			
 
				-
			
 
				-  Space separated values
			
 
				-  (when read-only or multiple values can be written at once)
			
 
				-
			
 
				-	VAL0 VAL1 ...\n
			
 
				-
			
 
				-  Flat keyed
			
 
				-
			
 
				-	KEY0 VAL0\n
			
 
				-	KEY1 VAL1\n
			
 
				-	...
			
 
				-
			
 
				-  Nested keyed
			
 
				-
			
 
				-	KEY0 SUB_KEY0=VAL00 SUB_KEY1=VAL01...
			
 
				-	KEY1 SUB_KEY0=VAL10 SUB_KEY1=VAL11...
			
 
				-	...
			
 
				-
			
 
				-For a writable file, the format for writing should generally match
			
 
				-reading; however, controllers may allow omitting later fields or
			
 
				-implement restricted shortcuts for most common use cases.
			
 
				-
			
 
				-For both flat and nested keyed files, only the values for a single key
			
 
				-can be written at a time.  For nested keyed files, the sub key pairs
			
 
				-may be specified in any order and not all pairs have to be specified.
			
 
				-
			
 
				-
			
 
				-Conventions
			
 
				------------
			
 
				-
			
 
				-- Settings for a single feature should be contained in a single file.
			
 
				-
			
 
				-- The root cgroup should be exempt from resource control and thus
			
 
				-  shouldn't have resource control interface files.  Also,
			
 
				-  informational files on the root cgroup which end up showing global
			
 
				-  information available elsewhere shouldn't exist.
			
 
				-
			
 
				-- If a controller implements weight based resource distribution, its
			
 
				-  interface file should be named "weight" and have the range [1,
			
 
				-  10000] with 100 as the default.  The values are chosen to allow
			
 
				-  enough and symmetric bias in both directions while keeping it
			
 
				-  intuitive (the default is 100%).
			
 
				-
			
 
				-- If a controller implements an absolute resource guarantee and/or
			
 
				-  limit, the interface files should be named "min" and "max"
			
 
				-  respectively.  If a controller implements best effort resource
			
 
				-  guarantee and/or limit, the interface files should be named "low"
			
 
				-  and "high" respectively.
			
 
				-
			
 
				-  In the above four control files, the special token "max" should be
			
 
				-  used to represent upward infinity for both reading and writing.
			
 
				-
			
 
				-- If a setting has a configurable default value and keyed specific
			
 
				-  overrides, the default entry should be keyed with "default" and
			
 
				-  appear as the first entry in the file.
			
 
				-
			
 
				-  The default value can be updated by writing either "default $VAL" or
			
 
				-  "$VAL".
			
 
				-
			
 
				-  When writing to update a specific override, "default" can be used as
			
 
				-  the value to indicate removal of the override.  Override entries
			
 
				-  with "default" as the value must not appear when read.
			
 
				-
			
 
				-  For example, a setting which is keyed by major:minor device numbers
			
 
				-  with integer values may look like the following::
			
 
				-
			
 
				-    # cat cgroup-example-interface-file
			
 
				-    default 150
			
 
				-    8:0 300
			
 
				-
			
 
				-  The default value can be updated by::
			
 
				-
			
 
				-    # echo 125 > cgroup-example-interface-file
			
 
				-
			
 
				-  or::
			
 
				-
			
 
				-    # echo "default 125" > cgroup-example-interface-file
			
 
				-
			
 
				-  An override can be set by::
			
 
				-
			
 
				-    # echo "8:16 170" > cgroup-example-interface-file
			
 
				-
			
 
				-  and cleared by::
			
 
				-
			
 
				-    # echo "8:0 default" > cgroup-example-interface-file
			
 
				-    # cat cgroup-example-interface-file
			
 
				-    default 125
			
 
				-    8:16 170
			
 
				-
			
 
				-- For events which are not very high frequency, an interface file
			
 
				-  "events" should be created which lists event key value pairs.
			
 
				-  Whenever a notifiable event happens, file modified event should be
			
 
				-  generated on the file.
			
 
				-
			
 
				-
			
 
				-Core Interface Files
			
 
				---------------------
			
 
				-
			
 
				-All cgroup core files are prefixed with "cgroup."
			
 
				-
			
 
				-  cgroup.type
			
 
				-
			
 
				-	A read-write single value file which exists on non-root
			
 
				-	cgroups.
			
 
				-
			
 
				-	When read, it indicates the current type of the cgroup, which
			
 
				-	can be one of the following values.
			
 
				-
			
 
				-	- "domain" : A normal valid domain cgroup.
			
 
				-
			
 
				-	- "domain threaded" : A threaded domain cgroup which is
			
 
				-          serving as the root of a threaded subtree.
			
 
				-
			
 
				-	- "domain invalid" : A cgroup which is in an invalid state.
			
 
				-	  It can't be populated or have controllers enabled.  It may
			
 
				-	  be allowed to become a threaded cgroup.
			
 
				-
			
 
				-	- "threaded" : A threaded cgroup which is a member of a
			
 
				-          threaded subtree.
			
 
				-
			
 
				-	A cgroup can be turned into a threaded cgroup by writing
			
 
				-	"threaded" to this file.
			
 
				-
			
 
				-  cgroup.procs
			
 
				-	A read-write new-line separated values file which exists on
			
 
				-	all cgroups.
			
 
				-
			
 
				-	When read, it lists the PIDs of all processes which belong to
			
 
				-	the cgroup one-per-line.  The PIDs are not ordered and the
			
 
				-	same PID may show up more than once if the process got moved
			
 
				-	to another cgroup and then back or the PID got recycled while
			
 
				-	reading.
			
 
				-
			
 
				-	A PID can be written to migrate the process associated with
			
 
				-	the PID to the cgroup.  The writer should match all of the
			
 
				-	following conditions.
			
 
				-
			
 
				-	- It must have write access to the "cgroup.procs" file.
			
 
				-
			
 
				-	- It must have write access to the "cgroup.procs" file of the
			
 
				-	  common ancestor of the source and destination cgroups.
			
 
				-
			
 
				-	When delegating a sub-hierarchy, write access to this file
			
 
				-	should be granted along with the containing directory.
			
 
				-
			
 
				-	In a threaded cgroup, reading this file fails with EOPNOTSUPP
			
 
				-	as all the processes belong to the thread root.  Writing is
			
 
				-	supported and moves every thread of the process to the cgroup.
			
 
				-
			
 
				-  cgroup.threads
			
 
				-	A read-write new-line separated values file which exists on
			
 
				-	all cgroups.
			
 
				-
			
 
				-	When read, it lists the TIDs of all threads which belong to
			
 
				-	the cgroup one-per-line.  The TIDs are not ordered and the
			
 
				-	same TID may show up more than once if the thread got moved to
			
 
				-	another cgroup and then back or the TID got recycled while
			
 
				-	reading.
			
 
				-
			
 
				-	A TID can be written to migrate the thread associated with the
			
 
				-	TID to the cgroup.  The writer should match all of the
			
 
				-	following conditions.
			
 
				-
			
 
				-	- It must have write access to the "cgroup.threads" file.
			
 
				-
			
 
				-	- The cgroup that the thread is currently in must be in the
			
 
				-          same resource domain as the destination cgroup.
			
 
				-
			
 
				-	- It must have write access to the "cgroup.procs" file of the
			
 
				-	  common ancestor of the source and destination cgroups.
			
 
				-
			
 
				-	When delegating a sub-hierarchy, write access to this file
			
 
				-	should be granted along with the containing directory.
			
 
				-
			
 
				-  cgroup.controllers
			
 
				-	A read-only space separated values file which exists on all
			
 
				-	cgroups.
			
 
				-
			
 
				-	It shows space separated list of all controllers available to
			
 
				-	the cgroup.  The controllers are not ordered.
			
 
				-
			
 
				-  cgroup.subtree_control
			
 
				-	A read-write space separated values file which exists on all
			
 
				-	cgroups.  Starts out empty.
			
 
				-
			
 
				-	When read, it shows space separated list of the controllers
			
 
				-	which are enabled to control resource distribution from the
			
 
				-	cgroup to its children.
			
 
				-
			
 
				-	Space separated list of controllers prefixed with '+' or '-'
			
 
				-	can be written to enable or disable controllers.  A controller
			
 
				-	name prefixed with '+' enables the controller and '-'
			
 
				-	disables.  If a controller appears more than once on the list,
			
 
				-	the last one is effective.  When multiple enable and disable
			
 
				-	operations are specified, either all succeed or all fail.
			
 
				-
			
 
				-  cgroup.events
			
 
				-	A read-only flat-keyed file which exists on non-root cgroups.
			
 
				-	The following entries are defined.  Unless specified
			
 
				-	otherwise, a value change in this file generates a file
			
 
				-	modified event.
			
 
				-
			
 
				-	  populated
			
 
				-		1 if the cgroup or its descendants contains any live
			
 
				-		processes; otherwise, 0.
			
 
				-
			
 
				-  cgroup.max.descendants
			
 
				-	A read-write single value file.  The default is "max".
			
 
				-
			
 
				-	Maximum allowed number of descendant cgroups.
			
 
				-	If the actual number of descendants is equal or larger,
			
 
				-	an attempt to create a new cgroup in the hierarchy will fail.
			
 
				-
			
 
				-  cgroup.max.depth
			
 
				-	A read-write single value file.  The default is "max".
			
 
				-
			
 
				-	Maximum allowed descent depth below the current cgroup.
			
 
				-	If the actual descent depth is equal or larger,
			
 
				-	an attempt to create a new child cgroup will fail.
			
 
				-
			
 
				-  cgroup.stat
			
 
				-	A read-only flat-keyed file with the following entries:
			
 
				-
			
 
				-	  nr_descendants
			
 
				-		Total number of visible descendant cgroups.
			
 
				-
			
 
				-	  nr_dying_descendants
			
 
				-		Total number of dying descendant cgroups. A cgroup becomes
			
 
				-		dying after being deleted by a user. The cgroup will remain
			
 
				-		in dying state for some undefined time (which can depend
			
 
				-		on system load) before being completely destroyed.
			
 
				-
			
 
				-		A process can't enter a dying cgroup under any circumstances, and
			
 
				-		a dying cgroup can't revive.
			
 
				-
			
 
				-		A dying cgroup can consume system resources not exceeding
			
 
				-		limits, which were active at the moment of cgroup deletion.
			
 
				-
			
 
				-
			
 
				-Controllers
			
 
				-===========
			
 
				-
			
 
				-CPU
			
 
				----
			
 
				-
			
 
				-The "cpu" controller regulates distribution of CPU cycles.  This
			
 
				-controller implements weight and absolute bandwidth limit models for
			
 
				-normal scheduling policy and absolute bandwidth allocation model for
			
 
				-realtime scheduling policy.
			
 
				-
			
 
				-WARNING: cgroup2 doesn't yet support control of realtime processes and
			
 
				-the cpu controller can only be enabled when all RT processes are in
			
 
				-the root cgroup.  Be aware that system management software may already
			
 
				-have placed RT processes into nonroot cgroups during the system boot
			
 
				-process, and these processes may need to be moved to the root cgroup
			
 
				-before the cpu controller can be enabled.
			
 
				-
			
 
				-
			
 
				-CPU Interface Files
			
 
				-~~~~~~~~~~~~~~~~~~~
			
 
				-
			
 
				-All time durations are in microseconds.
			
 
				-
			
 
				-  cpu.stat
			
 
				-	A read-only flat-keyed file which exists on non-root cgroups.
			
 
				-	This file exists whether the controller is enabled or not.
			
 
				-
			
 
				-	It always reports the following three stats:
			
 
				-
			
 
				-	- usage_usec
			
 
				-	- user_usec
			
 
				-	- system_usec
			
 
				-
			
 
				-	and the following three when the controller is enabled:
			
 
				-
			
 
				-	- nr_periods
			
 
				-	- nr_throttled
			
 
				-	- throttled_usec
			
 
				-
			
 
				-  cpu.weight
			
 
				-	A read-write single value file which exists on non-root
			
 
				-	cgroups.  The default is "100".
			
 
				-
			
 
				-	The weight in the range [1, 10000].
			
 
				-
			
 
				-  cpu.weight.nice
			
 
				-	A read-write single value file which exists on non-root
			
 
				-	cgroups.  The default is "0".
			
 
				-
			
 
				-	The nice value is in the range [-20, 19].
			
 
				-
			
 
				-	This interface file is an alternative interface for
			
 
				-	"cpu.weight" and allows reading and setting weight using the
			
 
				-	same values used by nice(2).  Because the range is smaller and
			
 
				-	granularity is coarser for the nice values, the read value is
			
 
				-	the closest approximation of the current weight.
			
 
				-
			
 
				-  cpu.max
			
 
				-	A read-write two value file which exists on non-root cgroups.
			
 
				-	The default is "max 100000".
			
 
				-
			
 
				-	The maximum bandwidth limit.  It's in the following format::
			
 
				-
			
 
				-	  $MAX $PERIOD
			
 
				-
			
 
				-	which indicates that the group may consume up to $MAX in each
			
 
				-	$PERIOD duration.  "max" for $MAX indicates no limit.  If only
			
 
				-	one number is written, $MAX is updated.
			
 
				-
			
 
				-
			
 
				-Memory
			
 
				-------
			
 
				-
			
 
				-The "memory" controller regulates distribution of memory.  Memory is
			
 
				-stateful and implements both limit and protection models.  Due to the
			
 
				-intertwining between memory usage and reclaim pressure and the
			
 
				-stateful nature of memory, the distribution model is relatively
			
 
				-complex.
			
 
				-
			
 
				-While not completely water-tight, all major memory usages by a given
			
 
				-cgroup are tracked so that the total memory consumption can be
			
 
				-accounted and controlled to a reasonable extent.  Currently, the
			
 
				-following types of memory usages are tracked.
			
 
				-
			
 
				-- Userland memory - page cache and anonymous memory.
			
 
				-
			
 
				-- Kernel data structures such as dentries and inodes.
			
 
				-
			
 
				-- TCP socket buffers.
			
 
				-
			
 
				-The above list may expand in the future for better coverage.
			
 
				-
			
 
				-
			
 
				-Memory Interface Files
			
 
				-~~~~~~~~~~~~~~~~~~~~~~
			
 
				-
			
 
				-All memory amounts are in bytes.  If a value which is not aligned to
			
 
				-PAGE_SIZE is written, the value may be rounded up to the closest
			
 
				-PAGE_SIZE multiple when read back.
			
 
				-
			
 
				-  memory.current
			
 
				-	A read-only single value file which exists on non-root
			
 
				-	cgroups.
			
 
				-
			
 
				-	The total amount of memory currently being used by the cgroup
			
 
				-	and its descendants.
			
 
				-
			
 
				-  memory.low
			
 
				-	A read-write single value file which exists on non-root
			
 
				-	cgroups.  The default is "0".
			
 
				-
			
 
				-	Best-effort memory protection.  If the memory usages of a
			
 
				-	cgroup and all its ancestors are below their low boundaries,
			
 
				-	the cgroup's memory won't be reclaimed unless memory can be
			
 
				-	reclaimed from unprotected cgroups.
			
 
				-
			
 
				-	Putting more memory than generally available under this
			
 
				-	protection is discouraged.
			
 
				-
			
 
				-  memory.high
			
 
				-	A read-write single value file which exists on non-root
			
 
				-	cgroups.  The default is "max".
			
 
				-
			
 
				-	Memory usage throttle limit.  This is the main mechanism to
			
 
				-	control memory usage of a cgroup.  If a cgroup's usage goes
			
 
				-	over the high boundary, the processes of the cgroup are
			
 
				-	throttled and put under heavy reclaim pressure.
			
 
				-
			
 
				-	Going over the high limit never invokes the OOM killer and
			
 
				-	under extreme conditions the limit may be breached.
			
 
				-
			
 
				-  memory.max
			
 
				-	A read-write single value file which exists on non-root
			
 
				-	cgroups.  The default is "max".
			
 
				-
			
 
				-	Memory usage hard limit.  This is the final protection
			
 
				-	mechanism.  If a cgroup's memory usage reaches this limit and
			
 
				-	can't be reduced, the OOM killer is invoked in the cgroup.
			
 
				-	Under certain circumstances, the usage may go over the limit
			
 
				-	temporarily.
			
 
				-
			
 
				-	This is the ultimate protection mechanism.  As long as the
			
 
				-	high limit is used and monitored properly, this limit's
			
 
				-	utility is limited to providing the final safety net.
			
 
				-
			
 
				-  memory.events
			
 
				-	A read-only flat-keyed file which exists on non-root cgroups.
			
 
				-	The following entries are defined.  Unless specified
			
 
				-	otherwise, a value change in this file generates a file
			
 
				-	modified event.
			
 
				-
			
 
				-	  low
			
 
				-		The number of times the cgroup is reclaimed due to
			
 
				-		high memory pressure even though its usage is under
			
 
				-		the low boundary.  This usually indicates that the low
			
 
				-		boundary is over-committed.
			
 
				-
			
 
				-	  high
			
 
				-		The number of times processes of the cgroup are
			
 
				-		throttled and routed to perform direct memory reclaim
			
 
				-		because the high memory boundary was exceeded.  For a
			
 
				-		cgroup whose memory usage is capped by the high limit
			
 
				-		rather than global memory pressure, this event's
			
 
				-		occurrences are expected.
			
 
				-
			
 
				-	  max
			
 
				-		The number of times the cgroup's memory usage was
			
 
				-		about to go over the max boundary.  If direct reclaim
			
 
				-		fails to bring it down, the cgroup goes to OOM state.
			
 
				-
			
 
				-	  oom
			
 
				-		The number of times the cgroup's memory usage
			
 
				-		reached the limit and allocation was about to fail.
			
 
				-
			
 
				-		Depending on context result could be invocation of OOM
			
 
				-		killer and retrying allocation or failing allocation.
			
 
				-
			
 
				-		Failed allocation in its turn could be returned into
			
 
				-		userspace as -ENOMEM or silently ignored in cases like
			
 
				-		disk readahead.  For now OOM in memory cgroup kills
			
 
				-		tasks iff shortage has happened inside page fault.
			
 
				-
			
 
				-	  oom_kill
			
 
				-		The number of processes belonging to this cgroup
			
 
				-		killed by any kind of OOM killer.
			
 
				-
			
 
				-  memory.stat
			
 
				-	A read-only flat-keyed file which exists on non-root cgroups.
			
 
				-
			
 
				-	This breaks down the cgroup's memory footprint into different
			
 
				-	types of memory, type-specific details, and other information
			
 
				-	on the state and past events of the memory management system.
			
 
				-
			
 
				-	All memory amounts are in bytes.
			
 
				-
			
 
				-	The entries are ordered to be human readable, and new entries
			
 
				-	can show up in the middle. Don't rely on items remaining in a
			
 
				-	fixed position; use the keys to look up specific values!
			
 
				-
			
 
				-	  anon
			
 
				-		Amount of memory used in anonymous mappings such as
			
 
				-		brk(), sbrk(), and mmap(MAP_ANONYMOUS)
			
 
				-
			
 
				-	  file
			
 
				-		Amount of memory used to cache filesystem data,
			
 
				-		including tmpfs and shared memory.
			
 
				-
			
 
				-	  kernel_stack
			
 
				-		Amount of memory allocated to kernel stacks.
			
 
				-
			
 
				-	  slab
			
 
				-		Amount of memory used for storing in-kernel data
			
 
				-		structures.
			
 
				-
			
 
				-	  sock
			
 
				-		Amount of memory used in network transmission buffers
			
 
				-
			
 
				-	  shmem
			
 
				-		Amount of cached filesystem data that is swap-backed,
			
 
				-		such as tmpfs, shm segments, shared anonymous mmap()s
			
 
				-
			
 
				-	  file_mapped
			
 
				-		Amount of cached filesystem data mapped with mmap()
			
 
				-
			
 
				-	  file_dirty
			
 
				-		Amount of cached filesystem data that was modified but
			
 
				-		not yet written back to disk
			
 
				-
			
 
				-	  file_writeback
			
 
				-		Amount of cached filesystem data that was modified and
			
 
				-		is currently being written back to disk
			
 
				-
			
 
				-	  inactive_anon, active_anon, inactive_file, active_file, unevictable
			
 
				-		Amount of memory, swap-backed and filesystem-backed,
			
 
				-		on the internal memory management lists used by the
			
 
				-		page reclaim algorithm
			
 
				-
			
 
				-	  slab_reclaimable
			
 
				-		Part of "slab" that might be reclaimed, such as
			
 
				-		dentries and inodes.
			
 
				-
			
 
				-	  slab_unreclaimable
			
 
				-		Part of "slab" that cannot be reclaimed on memory
			
 
				-		pressure.
			
 
				-
			
 
				-	  pgfault
			
 
				-		Total number of page faults incurred
			
 
				-
			
 
				-	  pgmajfault
			
 
				-		Number of major page faults incurred
			
 
				-
			
 
				-	  workingset_refault
			
 
				-
			
 
				-		Number of refaults of previously evicted pages
			
 
				-
			
 
				-	  workingset_activate
			
 
				-
			
 
				-		Number of refaulted pages that were immediately activated
			
 
				-
			
 
				-	  workingset_nodereclaim
			
 
				-
			
 
				-		Number of times a shadow node has been reclaimed
			
 
				-
			
 
				-	  pgrefill
			
 
				-
			
 
				-		Amount of scanned pages (in an active LRU list)
			
 
				-
			
 
				-	  pgscan
			
 
				-
			
 
				-		Amount of scanned pages (in an inactive LRU list)
			
 
				-
			
 
				-	  pgsteal
			
 
				-
			
 
				-		Amount of reclaimed pages
			
 
				-
			
 
				-	  pgactivate
			
 
				-
			
 
				-		Amount of pages moved to the active LRU list
			
 
				-
			
 
				-	  pgdeactivate
			
 
				-
			
 
				-		Amount of pages moved to the inactive LRU list
			
 
				-
			
 
				-	  pglazyfree
			
 
				-
			
 
				-		Amount of pages postponed to be freed under memory pressure
			
 
				-
			
 
				-	  pglazyfreed
			
 
				-
			
 
				-		Amount of reclaimed lazyfree pages
			
 
				-
			
 
				-  memory.swap.current
			
 
				-	A read-only single value file which exists on non-root
			
 
				-	cgroups.
			
 
				-
			
 
				-	The total amount of swap currently being used by the cgroup
			
 
				-	and its descendants.
			
 
				-
			
 
				-  memory.swap.max
			
 
				-	A read-write single value file which exists on non-root
			
 
				-	cgroups.  The default is "max".
			
 
				-
			
 
				-	Swap usage hard limit.  If a cgroup's swap usage reaches this
			
 
				-	limit, anonymous memory of the cgroup will not be swapped out.
			
 
				-
			
 
				-
			
 
				-Usage Guidelines
			
 
				-~~~~~~~~~~~~~~~~
			
 
				-
			
 
				-"memory.high" is the main mechanism to control memory usage.
			
 
				-Over-committing on high limit (sum of high limits > available memory)
			
 
				-and letting global memory pressure distribute memory according to
			
 
				-usage is a viable strategy.
			
 
				-
			
 
				-Because breach of the high limit doesn't trigger the OOM killer but
			
 
				-throttles the offending cgroup, a management agent has ample
			
 
				-opportunities to monitor and take appropriate actions such as granting
			
 
				-more memory or terminating the workload.
			
 
				-
			
 
				-Determining whether a cgroup has enough memory is not trivial as
			
 
				-memory usage doesn't indicate whether the workload can benefit from
			
 
				-more memory.  For example, a workload which writes data received from
			
 
				-network to a file can use all available memory but can also operate as
			
 
				-performant with a small amount of memory.  A measure of memory
			
 
				-pressure - how much the workload is being impacted due to lack of
			
 
				-memory - is necessary to determine whether a workload needs more
			
 
				-memory; unfortunately, memory pressure monitoring mechanism isn't
			
 
				-implemented yet.
			
 
				-
			
 
				-
			
 
				-Memory Ownership
			
 
				-~~~~~~~~~~~~~~~~
			
 
				-
			
 
				-A memory area is charged to the cgroup which instantiated it and stays
			
 
				-charged to the cgroup until the area is released.  Migrating a process
			
 
				-to a different cgroup doesn't move the memory usages that it
			
 
				-instantiated while in the previous cgroup to the new cgroup.
			
 
				-
			
 
				-A memory area may be used by processes belonging to different cgroups.
			
 
				-To which cgroup the area will be charged is nondeterministic; however,
			
 
				-over time, the memory area is likely to end up in a cgroup which has
			
 
				-enough memory allowance to avoid high reclaim pressure.
			
 
				-
			
 
				-If a cgroup sweeps a considerable amount of memory which is expected
			
 
				-to be accessed repeatedly by other cgroups, it may make sense to use
			
 
				-POSIX_FADV_DONTNEED to relinquish the ownership of memory areas
			
 
				-belonging to the affected files to ensure correct memory ownership.
			
 
				-
			
 
				-
			
 
				-IO
			
 
				---
			
 
				-
			
 
				-The "io" controller regulates the distribution of IO resources.  This
			
 
				-controller implements both weight based and absolute bandwidth or IOPS
			
 
				-limit distribution; however, weight based distribution is available
			
 
				-only if cfq-iosched is in use and neither scheme is available for
			
 
				-blk-mq devices.
			
 
				-
			
 
				-
			
 
				-IO Interface Files
			
 
				-~~~~~~~~~~~~~~~~~~
			
 
				-
			
 
				-  io.stat
			
 
				-	A read-only nested-keyed file which exists on non-root
			
 
				-	cgroups.
			
 
				-
			
 
				-	Lines are keyed by $MAJ:$MIN device numbers and not ordered.
			
 
				-	The following nested keys are defined.
			
 
				-
			
 
				-	  ======	===================
			
 
				-	  rbytes	Bytes read
			
 
				-	  wbytes	Bytes written
			
 
				-	  rios		Number of read IOs
			
 
				-	  wios		Number of write IOs
			
 
				-	  ======	===================
			
 
				-
			
 
				-	An example read output follows::
			
 
				-
			
 
				-	  8:16 rbytes=1459200 wbytes=314773504 rios=192 wios=353
			
 
				-	  8:0 rbytes=90430464 wbytes=299008000 rios=8950 wios=1252
			
 
				-
			
 
				-  io.weight
			
 
				-	A read-write flat-keyed file which exists on non-root cgroups.
			
 
				-	The default is "default 100".
			
 
				-
			
 
				-	The first line is the default weight applied to devices
			
 
				-	without specific override.  The rest are overrides keyed by
			
 
				-	$MAJ:$MIN device numbers and not ordered.  The weights are in
			
 
				-	the range [1, 10000] and specify the relative amount of IO time
			
 
				-	the cgroup can use in relation to its siblings.
			
 
				-
			
 
				-	The default weight can be updated by writing either "default
			
 
				-	$WEIGHT" or simply "$WEIGHT".  Overrides can be set by writing
			
 
				-	"$MAJ:$MIN $WEIGHT" and unset by writing "$MAJ:$MIN default".
			
 
				-
			
 
				-	An example read output follows::
			
 
				-
			
 
				-	  default 100
			
 
				-	  8:16 200
			
 
				-	  8:0 50
			
 
				-
			
 
				-  io.max
			
 
				-	A read-write nested-keyed file which exists on non-root
			
 
				-	cgroups.
			
 
				-
			
 
				-	BPS and IOPS based IO limit.  Lines are keyed by $MAJ:$MIN
			
 
				-	device numbers and not ordered.  The following nested keys are
			
 
				-	defined.
			
 
				-
			
 
				-	  =====		==================================
			
 
				-	  rbps		Max read bytes per second
			
 
				-	  wbps		Max write bytes per second
			
 
				-	  riops		Max read IO operations per second
			
 
				-	  wiops		Max write IO operations per second
			
 
				-	  =====		==================================
			
 
				-
			
 
				-	When writing, any number of nested key-value pairs can be
			
 
				-	specified in any order.  "max" can be specified as the value
			
 
				-	to remove a specific limit.  If the same key is specified
			
 
				-	multiple times, the outcome is undefined.
			
 
				-
			
 
				-	BPS and IOPS are measured in each IO direction and IOs are
			
 
				-	delayed if limit is reached.  Temporary bursts are allowed.
			
 
				-
			
 
				-	Setting read limit at 2M BPS and write at 120 IOPS for 8:16::
			
 
				-
			
 
				-	  echo "8:16 rbps=2097152 wiops=120" > io.max
			
 
				-
			
 
				-	Reading returns the following::
			
 
				-
			
 
				-	  8:16 rbps=2097152 wbps=max riops=max wiops=120
			
 
				-
			
 
				-	Write IOPS limit can be removed by writing the following::
			
 
				-
			
 
				-	  echo "8:16 wiops=max" > io.max
			
 
				-
			
 
				-	Reading now returns the following::
			
 
				-
			
 
				-	  8:16 rbps=2097152 wbps=max riops=max wiops=max
			
 
				-
			
 
				-
			
 
				-Writeback
			
 
				-~~~~~~~~~
			
 
				-
			
 
				-Page cache is dirtied through buffered writes and shared mmaps and
			
 
				-written asynchronously to the backing filesystem by the writeback
			
 
				-mechanism.  Writeback sits between the memory and IO domains and
			
 
				-regulates the proportion of dirty memory by balancing dirtying and
			
 
				-write IOs.
			
 
				-
			
 
				-The io controller, in conjunction with the memory controller,
			
 
				-implements control of page cache writeback IOs.  The memory controller
			
 
				-defines the memory domain that dirty memory ratio is calculated and
			
 
				-maintained for and the io controller defines the io domain which
			
 
				-writes out dirty pages for the memory domain.  Both system-wide and
			
 
				-per-cgroup dirty memory states are examined and the more restrictive
			
 
				-of the two is enforced.
			
 
				-
			
 
				-cgroup writeback requires explicit support from the underlying
			
 
				-filesystem.  Currently, cgroup writeback is implemented on ext2, ext4
			
 
				-and btrfs.  On other filesystems, all writeback IOs are attributed to
			
 
				-the root cgroup.
			
 
				-
			
 
				-There are inherent differences in memory and writeback management
			
 
				-which affect how cgroup ownership is tracked.  Memory is tracked per
			
 
				-page while writeback per inode.  For the purpose of writeback, an
			
 
				-inode is assigned to a cgroup and all IO requests to write dirty pages
			
 
				-from the inode are attributed to that cgroup.
			
 
				-
			
 
				-As cgroup ownership for memory is tracked per page, there can be pages
			
 
				-which are associated with different cgroups than the one the inode is
			
 
				-associated with.  These are called foreign pages.  The writeback
			
 
				-constantly keeps track of foreign pages and, if a particular foreign
			
 
				-cgroup becomes the majority over a certain period of time, switches
			
 
				-the ownership of the inode to that cgroup.
			
 
				-
			
 
				-While this model is enough for most use cases where a given inode is
			
 
				-mostly dirtied by a single cgroup even when the main writing cgroup
			
 
				-changes over time, use cases where multiple cgroups write to a single
			
 
				-inode simultaneously are not supported well.  In such circumstances, a
			
 
				-significant portion of IOs are likely to be attributed incorrectly.
			
 
				-As memory controller assigns page ownership on the first use and
			
 
				-doesn't update it until the page is released, even if writeback
			
 
				-strictly follows page ownership, multiple cgroups dirtying overlapping
			
 
				-areas wouldn't work as expected.  It's recommended to avoid such usage
			
 
				-patterns.
			
 
				-
			
 
				-The sysctl knobs which affect writeback behavior are applied to cgroup
			
 
				-writeback as follows.
			
 
				-
			
 
				-  vm.dirty_background_ratio, vm.dirty_ratio
			
 
				-	These ratios apply the same to cgroup writeback with the
			
 
				-	amount of available memory capped by limits imposed by the
			
 
				-	memory controller and system-wide clean memory.
			
 
				-
			
 
				-  vm.dirty_background_bytes, vm.dirty_bytes
			
 
				-	For cgroup writeback, this is calculated into ratio against
			
 
				-	total available memory and applied the same way as
			
 
				-	vm.dirty[_background]_ratio.
			
 
				-
			
 
				-
			
 
				-PID
			
 
				----
			
 
				-
			
 
				-The process number controller is used to allow a cgroup to stop any
			
 
				-new tasks from being fork()'d or clone()'d after a specified limit is
			
 
				-reached.
			
 
				-
			
 
				-The number of tasks in a cgroup can be exhausted in ways which other
			
 
				-controllers cannot prevent, thus warranting its own controller.  For
			
 
				-example, a fork bomb is likely to exhaust the number of tasks before
			
 
				-hitting memory restrictions.
			
 
				-
			
 
				-Note that PIDs used in this controller refer to TIDs, process IDs as
			
 
				-used by the kernel.
			
 
				-
			
 
				-
			
 
				-PID Interface Files
			
 
				-~~~~~~~~~~~~~~~~~~~
			
 
				-
			
 
				-  pids.max
			
 
				-	A read-write single value file which exists on non-root
			
 
				-	cgroups.  The default is "max".
			
 
				-
			
 
				-	Hard limit of number of processes.
			
 
				-
			
 
				-  pids.current
			
 
				-	A read-only single value file which exists on all cgroups.
			
 
				-
			
 
				-	The number of processes currently in the cgroup and its
			
 
				-	descendants.
			
 
				-
			
 
				-Organisational operations are not blocked by cgroup policies, so it is
			
 
				-possible to have pids.current > pids.max.  This can be done by either
			
 
				-setting the limit to be smaller than pids.current, or attaching enough
			
 
				-processes to the cgroup such that pids.current is larger than
			
 
				-pids.max.  However, it is not possible to violate a cgroup PID policy
			
 
				-through fork() or clone(). These will return -EAGAIN if the creation
			
 
				-of a new process would cause a cgroup policy to be violated.
			
 
				-
			
 
				-
			
 
				-Device controller
			
 
				------------------
			
 
				-
			
 
				-Device controller manages access to device files. It includes both
			
 
				-creation of new device files (using mknod), and access to the
			
 
				-existing device files.
			
 
				-
			
 
				-Cgroup v2 device controller has no interface files and is implemented
			
 
				-on top of cgroup BPF. To control access to device files, a user may
			
 
				-create bpf programs of the BPF_CGROUP_DEVICE type and attach them
			
 
				-to cgroups. On an attempt to access a device file, corresponding
			
 
				-BPF programs will be executed, and depending on the return value
			
 
				-the attempt will succeed or fail with -EPERM.
			
 
				-
			
 
				-A BPF_CGROUP_DEVICE program takes a pointer to the bpf_cgroup_dev_ctx
			
 
				-structure, which describes the device access attempt: access type
			
 
				-(mknod/read/write) and device (type, major and minor numbers).
			
 
				-If the program returns 0, the attempt fails with -EPERM, otherwise
			
 
				-it succeeds.
			
 
				-
			
 
				-An example of BPF_CGROUP_DEVICE program may be found in the kernel
			
 
				-source tree in the tools/testing/selftests/bpf/dev_cgroup.c file.
			
 
				-
			
 
				-
			
 
				-RDMA
			
 
				-----
			
 
				-
			
 
				-The "rdma" controller regulates the distribution and accounting of
			
 
				-RDMA resources.
			
 
				-
			
 
				-RDMA Interface Files
			
 
				-~~~~~~~~~~~~~~~~~~~~
			
 
				-
			
 
				-  rdma.max
			
 
				-	A readwrite nested-keyed file that exists for all the cgroups
			
 
				-	except root that describes current configured resource limit
			
 
				-	for a RDMA/IB device.
			
 
				-
			
 
				-	Lines are keyed by device name and are not ordered.
			
 
				-	Each line contains space separated resource name and its configured
			
 
				-	limit that can be distributed.
			
 
				-
			
 
				-	The following nested keys are defined.
			
 
				-
			
 
				-	  ==========	=============================
			
 
				-	  hca_handle	Maximum number of HCA Handles
			
 
				-	  hca_object 	Maximum number of HCA Objects
			
 
				-	  ==========	=============================
			
 
				-
			
 
				-	An example for mlx4 and ocrdma device follows::
			
 
				-
			
 
				-	  mlx4_0 hca_handle=2 hca_object=2000
			
 
				-	  ocrdma1 hca_handle=3 hca_object=max
			
 
				-
			
 
				-  rdma.current
			
 
				-	A read-only file that describes current resource usage.
			
 
				-	It exists for all the cgroups except root.
			
 
				-
			
 
				-	An example for mlx4 and ocrdma device follows::
			
 
				-
			
 
				-	  mlx4_0 hca_handle=1 hca_object=20
			
 
				-	  ocrdma1 hca_handle=1 hca_object=23
			
 
				-
			
 
				-
			
 
				-Misc
			
 
				-----
			
 
				-
			
 
				-perf_event
			
 
				-~~~~~~~~~~
			
 
				-
			
 
				-perf_event controller, if not mounted on a legacy hierarchy, is
			
 
				-automatically enabled on the v2 hierarchy so that perf events can
			
 
				-always be filtered by cgroup v2 path.  The controller can still be
			
 
				-moved to a legacy hierarchy after v2 hierarchy is populated.
			
 
				-
			
 
				-
			
 
				-Non-normative information
			
 
				--------------------------
			
 
				-
			
 
				-This section contains information that isn't considered to be a part of
			
 
				-the stable kernel API and so is subject to change.
			
 
				-
			
 
				-
			
 
				-CPU controller root cgroup process behaviour
			
 
				-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
			
 
				-
			
 
				-When distributing CPU cycles in the root cgroup each thread in this
			
 
				-cgroup is treated as if it was hosted in a separate child cgroup of the
			
 
				-root cgroup. This child cgroup weight is dependent on its thread nice
			
 
				-level.
			
 
				-
			
 
				-For details of this mapping see sched_prio_to_weight array in
			
 
				-kernel/sched/core.c file (values from this array should be scaled
			
 
				-appropriately so the neutral - nice 0 - value is 100 instead of 1024).
			
 
				-
			
 
				-
			
 
				-IO controller root cgroup process behaviour
			
 
				-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
			
 
				-
			
 
				-Root cgroup processes are hosted in an implicit leaf child node.
			
 
				-When distributing IO resources this implicit child node is taken into
			
 
				-account as if it was a normal child cgroup of the root cgroup with a
			
 
				-weight value of 200.
			
 
				-
			
 
				-
			
 
				-Namespace
			
 
				-=========
			
 
				-
			
 
				-Basics
			
 
				-------
			
 
				-
			
 
				-cgroup namespace provides a mechanism to virtualize the view of the
			
 
				-"/proc/$PID/cgroup" file and cgroup mounts.  The CLONE_NEWCGROUP clone
			
 
				-flag can be used with clone(2) and unshare(2) to create a new cgroup
			
 
				-namespace.  The process running inside the cgroup namespace will have
			
 
				-its "/proc/$PID/cgroup" output restricted to cgroupns root.  The
			
 
				-cgroupns root is the cgroup of the process at the time of creation of
			
 
				-the cgroup namespace.
			
 
				-
			
 
				-Without cgroup namespace, the "/proc/$PID/cgroup" file shows the
			
 
				-complete path of the cgroup of a process.  In a container setup where
			
 
				-a set of cgroups and namespaces are intended to isolate processes the
			
 
				-"/proc/$PID/cgroup" file may leak potential system level information
			
 
				-to the isolated processes.  For example::
			
 
				-
			
 
				-  # cat /proc/self/cgroup
			
 
				-  0::/batchjobs/container_id1
			
 
				-
			
 
				-The path '/batchjobs/container_id1' can be considered as system-data
			
 
				-and undesirable to expose to the isolated processes.  cgroup namespace
			
 
				-can be used to restrict visibility of this path.  For example, before
			
 
				-creating a cgroup namespace, one would see::
			
 
				-
			
 
				-  # ls -l /proc/self/ns/cgroup
			
 
				-  lrwxrwxrwx 1 root root 0 2014-07-15 10:37 /proc/self/ns/cgroup -> cgroup:[4026531835]
			
 
				-  # cat /proc/self/cgroup
			
 
				-  0::/batchjobs/container_id1
			
 
				-
			
 
				-After unsharing a new namespace, the view changes::
			
 
				-
			
 
				-  # ls -l /proc/self/ns/cgroup
			
 
				-  lrwxrwxrwx 1 root root 0 2014-07-15 10:35 /proc/self/ns/cgroup -> cgroup:[4026532183]
			
 
				-  # cat /proc/self/cgroup
			
 
				-  0::/
			
 
				-
			
 
				-When some thread from a multi-threaded process unshares its cgroup
			
 
				-namespace, the new cgroupns gets applied to the entire process (all
			
 
				-the threads).  This is natural for the v2 hierarchy; however, for the
			
 
				-legacy hierarchies, this may be unexpected.
			
 
				-
			
 
				-A cgroup namespace is alive as long as there are processes inside or
			
 
				-mounts pinning it.  When the last usage goes away, the cgroup
			
 
				-namespace is destroyed.  The cgroupns root and the actual cgroups
			
 
				-remain.
			
 
				-
			
 
				-
			
 
				-The Root and Views
			
 
				-------------------
			
 
				-
			
 
				-The 'cgroupns root' for a cgroup namespace is the cgroup in which the
			
 
				-process calling unshare(2) is running.  For example, if a process in
			
 
				-/batchjobs/container_id1 cgroup calls unshare, cgroup
			
 
				-/batchjobs/container_id1 becomes the cgroupns root.  For the
			
 
				-init_cgroup_ns, this is the real root ('/') cgroup.
			
 
				-
			
 
				-The cgroupns root cgroup does not change even if the namespace creator
			
 
				-process later moves to a different cgroup::
			
 
				-
			
 
				-  # ~/unshare -c # unshare cgroupns in some cgroup
			
 
				-  # cat /proc/self/cgroup
			
 
				-  0::/
			
 
				-  # mkdir sub_cgrp_1
			
 
				-  # echo 0 > sub_cgrp_1/cgroup.procs
			
 
				-  # cat /proc/self/cgroup
			
 
				-  0::/sub_cgrp_1
			
 
				-
			
 
				-Each process gets its namespace-specific view of "/proc/$PID/cgroup"
			
 
				-
			
 
				-Processes running inside the cgroup namespace will be able to see
			
 
				-cgroup paths (in /proc/self/cgroup) only inside their root cgroup.
			
 
				-From within an unshared cgroupns::
			
 
				-
			
 
				-  # sleep 100000 &
			
 
				-  [1] 7353
			
 
				-  # echo 7353 > sub_cgrp_1/cgroup.procs
			
 
				-  # cat /proc/7353/cgroup
			
 
				-  0::/sub_cgrp_1
			
 
				-
			
 
				-From the initial cgroup namespace, the real cgroup path will be
			
 
				-visible::
			
 
				-
			
 
				-  $ cat /proc/7353/cgroup
			
 
				-  0::/batchjobs/container_id1/sub_cgrp_1
			
 
				-
			
 
				-From a sibling cgroup namespace (that is, a namespace rooted at a
			
 
				-different cgroup), the cgroup path relative to its own cgroup
			
 
				-namespace root will be shown.  For instance, if PID 7353's cgroup
			
 
				-namespace root is at '/batchjobs/container_id2', then it will see::
			
 
				-
			
 
				-  # cat /proc/7353/cgroup
			
 
				-  0::/../container_id2/sub_cgrp_1
			
 
				-
			
 
				-Note that the relative path always starts with '/' to indicate that
			
 
				-it's relative to the cgroup namespace root of the caller.
			
 
				-
			
 
				-
			
 
				-Migration and setns(2)
			
 
				-----------------------
			
 
				-
			
 
				-Processes inside a cgroup namespace can move into and out of the
			
 
				-namespace root if they have proper access to external cgroups.  For
			
 
				-example, from inside a namespace with cgroupns root at
			
 
				-/batchjobs/container_id1, and assuming that the global hierarchy is
			
 
				-still accessible inside cgroupns::
			
 
				-
			
 
				-  # cat /proc/7353/cgroup
			
 
				-  0::/sub_cgrp_1
			
 
				-  # echo 7353 > batchjobs/container_id2/cgroup.procs
			
 
				-  # cat /proc/7353/cgroup
			
 
				-  0::/../container_id2
			
 
				-
			
 
				-Note that this kind of setup is not encouraged.  A task inside cgroup
			
 
				-namespace should only be exposed to its own cgroupns hierarchy.
			
 
				-
			
 
				-setns(2) to another cgroup namespace is allowed when:
			
 
				-
			
 
				-(a) the process has CAP_SYS_ADMIN against its current user namespace
			
 
				-(b) the process has CAP_SYS_ADMIN against the target cgroup
			
 
				-    namespace's userns
			
 
				-
			
 
				-No implicit cgroup changes happen with attaching to another cgroup
			
 
				-namespace.  It is expected that someone moves the attaching
			
 
				-process under the target cgroup namespace root.
			
 
				-
			
 
				-
			
 
				-Interaction with Other Namespaces
			
 
				----------------------------------
			
 
				-
			
 
				-Namespace specific cgroup hierarchy can be mounted by a process
			
 
				-running inside a non-init cgroup namespace::
			
 
				-
			
 
				-  # mount -t cgroup2 none $MOUNT_POINT
			
 
				-
			
 
				-This will mount the unified cgroup hierarchy with cgroupns root as the
			
 
				-filesystem root.  The process needs CAP_SYS_ADMIN against its user and
			
 
				-mount namespaces.
			
 
				-
			
 
				-The virtualization of /proc/self/cgroup file combined with restricting
			
 
				-the view of cgroup hierarchy by namespace-private cgroupfs mount
			
 
				-provides a properly isolated cgroup view inside the container.
			
 
				-
			
 
				-
			
 
				-Information on Kernel Programming
			
 
				-=================================
			
 
				-
			
 
				-This section contains kernel programming information in the areas
			
 
				-where interacting with cgroup is necessary.  cgroup core and
			
 
				-controllers are not covered.
			
 
				-
			
 
				-
			
 
				-Filesystem Support for Writeback
			
 
				---------------------------------
			
 
				-
			
 
				-A filesystem can support cgroup writeback by updating
			
 
				-address_space_operations->writepage[s]() to annotate bio's using the
			
 
				-following two functions.
			
 
				-
			
 
				-  wbc_init_bio(@wbc, @bio)
			
 
				-	Should be called for each bio carrying writeback data and
			
 
				-	associates the bio with the inode's owner cgroup.  Can be
			
 
				-	called anytime between bio allocation and submission.
			
 
				-
			
 
				-  wbc_account_io(@wbc, @page, @bytes)
			
 
				-	Should be called for each data segment being written out.
			
 
				-	While this function doesn't care exactly when it's called
			
 
				-	during the writeback session, it's the easiest and most
			
 
				-	natural to call it as data segments are added to a bio.
			
 
				-
			
 
				-With writeback bio's annotated, cgroup support can be enabled per
			
 
				-super_block by setting SB_I_CGROUPWB in ->s_iflags.  This allows for
			
 
				-selective disabling of cgroup writeback support which is helpful when
			
 
				-certain filesystem features, e.g. journaled data mode, are
			
 
				-incompatible.
			
 
				-
			
 
				-wbc_init_bio() binds the specified bio to its cgroup.  Depending on
			
 
				-the configuration, the bio may be executed at a lower priority and if
			
 
				-the writeback session is holding shared resources, e.g. a journal
			
 
				-entry, may lead to priority inversion.  There is no one easy solution
			
 
				-for the problem.  Filesystems can try to work around specific problem
			
 
				-cases by skipping wbc_init_bio() or using bio_associate_blkcg()
			
 
				-directly.
			
 
				-
			
 
				-
			
 
				-Deprecated v1 Core Features
			
 
				-===========================
			
 
				-
			
 
				-- Multiple hierarchies including named ones are not supported.
			
 
				-
			
 
				-- All v1 mount options are not supported.
			
 
				-
			
 
				-- The "tasks" file is removed and "cgroup.procs" is not sorted.
			
 
				-
			
 
				-- "cgroup.clone_children" is removed.
			
 
				-
			
 
				-- /proc/cgroups is meaningless for v2.  Use "cgroup.controllers" file
			
 
				-  at the root instead.
			
 
				-
			
 
				-
			
 
				-Issues with v1 and Rationales for v2
			
 
				-====================================
			
 
				-
			
 
				-Multiple Hierarchies
			
 
				---------------------
			
 
				-
			
 
				-cgroup v1 allowed an arbitrary number of hierarchies and each
			
 
				-hierarchy could host any number of controllers.  While this seemed to
			
 
				-provide a high level of flexibility, it wasn't useful in practice.
			
 
				-
			
 
				-For example, as there is only one instance of each controller, utility
			
 
				-type controllers such as freezer which can be useful in all
			
 
				-hierarchies could only be used in one.  The issue is exacerbated by
			
 
				-the fact that controllers couldn't be moved to another hierarchy once
			
 
				-hierarchies were populated.  Another issue was that all controllers
			
 
				-bound to a hierarchy were forced to have exactly the same view of the
			
 
				-hierarchy.  It wasn't possible to vary the granularity depending on
			
 
				-the specific controller.
			
 
				-
			
 
				-In practice, these issues heavily limited which controllers could be
			
 
				-put on the same hierarchy and most configurations resorted to putting
			
 
				-each controller on its own hierarchy.  Only closely related ones, such
			
 
				-as the cpu and cpuacct controllers, made sense to be put on the same
			
 
				-hierarchy.  This often meant that userland ended up managing multiple
			
 
				-similar hierarchies repeating the same steps on each hierarchy
			
 
				-whenever a hierarchy management operation was necessary.
			
 
				-
			
 
				-Furthermore, support for multiple hierarchies came at a steep cost.
			
 
				-It greatly complicated cgroup core implementation but more importantly
			
 
				-the support for multiple hierarchies restricted how cgroup could be
			
 
				-used in general and what controllers were able to do.
			
 
				-
			
 
				-There was no limit on how many hierarchies there might be, which meant
			
 
				-that a thread's cgroup membership couldn't be described in finite
			
 
				-length.  The key might contain any number of entries and was unlimited
			
 
				-in length, which made it highly awkward to manipulate and led to
			
 
				-addition of controllers which existed only to identify membership,
			
 
				-which in turn exacerbated the original problem of proliferating number
			
 
				-of hierarchies.
			
 
				-
			
 
				-Also, as a controller couldn't have any expectation regarding the
			
 
				-topologies of hierarchies other controllers might be on, each
			
 
				-controller had to assume that all other controllers were attached to
			
 
				-completely orthogonal hierarchies.  This made it impossible, or at
			
 
				-least very cumbersome, for controllers to cooperate with each other.
			
 
				-
			
 
				-In most use cases, putting controllers on hierarchies which are
			
 
				-completely orthogonal to each other isn't necessary.  What usually is
			
 
				-called for is the ability to have differing levels of granularity
			
 
				-depending on the specific controller.  In other words, hierarchy may
			
 
				-be collapsed from leaf towards root when viewed from specific
			
 
				-controllers.  For example, a given configuration might not care about
			
 
				-how memory is distributed beyond a certain level while still wanting
			
 
				-to control how CPU cycles are distributed.
			
 
				-
			
 
				-
			
 
				-Thread Granularity
			
 
				-------------------
			
 
				-
			
 
				-cgroup v1 allowed threads of a process to belong to different cgroups.
			
 
				-This didn't make sense for some controllers and those controllers
			
 
				-ended up implementing different ways to ignore such situations but
			
 
				-much more importantly it blurred the line between API exposed to
			
 
				-individual applications and system management interface.
			
 
				-
			
 
				-Generally, in-process knowledge is available only to the process
			
 
				-itself; thus, unlike service-level organization of processes,
			
 
				-categorizing threads of a process requires active participation from
			
 
				-the application which owns the target process.
			
 
				-
			
 
				-cgroup v1 had an ambiguously defined delegation model which got abused
			
 
				-in combination with thread granularity.  cgroups were delegated to
			
 
				-individual applications so that they can create and manage their own
			
 
				-sub-hierarchies and control resource distributions along them.  This
			
 
				-effectively raised cgroup to the status of a syscall-like API exposed
			
 
				-to lay programs.
			
 
				-
			
 
				-First of all, cgroup has a fundamentally inadequate interface to be
			
 
				-exposed this way.  For a process to access its own knobs, it has to
			
 
				-extract the path on the target hierarchy from /proc/self/cgroup,
			
 
				-construct the path by appending the name of the knob to the path, open
			
 
				-and then read and/or write to it.  This is not only extremely clunky
			
 
				-and unusual but also inherently racy.  There is no conventional way to
			
 
				-define transaction across the required steps and nothing can guarantee
			
 
				-that the process would actually be operating on its own sub-hierarchy.
			
 
				-
			
 
				-cgroup controllers implemented a number of knobs which would never be
			
 
				-accepted as public APIs because they were just adding control knobs to
			
 
				-system-management pseudo filesystem.  cgroup ended up with interface
			
 
				-knobs which were not properly abstracted or refined and directly
			
 
				-revealed kernel internal details.  These knobs got exposed to
			
 
				-individual applications through the ill-defined delegation mechanism
			
 
				-effectively abusing cgroup as a shortcut to implementing public APIs
			
 
				-without going through the required scrutiny.
			
 
				-
			
 
				-This was painful for both userland and kernel.  Userland ended up with
			
 
				-misbehaving and poorly abstracted interfaces and kernel exposing and
			
 
				-locked into constructs inadvertently.
			
 
				-
			
 
				-
			
 
				-Competition Between Inner Nodes and Threads
			
 
				--------------------------------------------
			
 
				-
			
 
				-cgroup v1 allowed threads to be in any cgroups which created an
			
 
				-interesting problem where threads belonging to a parent cgroup and its
			
 
				-children cgroups competed for resources.  This was nasty as two
			
 
				-different types of entities competed and there was no obvious way to
			
 
				-settle it.  Different controllers did different things.
			
 
				-
			
 
				-The cpu controller considered threads and cgroups as equivalents and
			
 
				-mapped nice levels to cgroup weights.  This worked for some cases but
			
 
				-fell flat when children wanted to be allocated specific ratios of CPU
			
 
				-cycles and the number of internal threads fluctuated - the ratios
			
 
				-constantly changed as the number of competing entities fluctuated.
			
 
				-There also were other issues.  The mapping from nice level to weight
			
 
				-wasn't obvious or universal, and there were various other knobs which
			
 
				-simply weren't available for threads.
			
 
				-
			
 
				-The io controller implicitly created a hidden leaf node for each
			
 
				-cgroup to host the threads.  The hidden leaf had its own copies of all
			
 
				-the knobs with ``leaf_`` prefixed.  While this allowed equivalent
			
 
				-control over internal threads, it was with serious drawbacks.  It
			
 
				-always added an extra layer of nesting which wouldn't be necessary
			
 
				-otherwise, made the interface messy and significantly complicated the
			
 
				-implementation.
			
 
				-
			
 
				-The memory controller didn't have a way to control what happened
			
 
				-between internal tasks and child cgroups and the behavior was not
			
 
				-clearly defined.  There were attempts to add ad-hoc behaviors and
			
 
				-knobs to tailor the behavior to specific workloads which would have
			
 
				-led to problems extremely difficult to resolve in the long term.
			
 
				-
			
 
				-Multiple controllers struggled with internal tasks and came up with
			
 
				-different ways to deal with it; unfortunately, all the approaches were
			
 
				-severely flawed and, furthermore, the widely different behaviors
			
 
				-made cgroup as a whole highly inconsistent.
			
 
				-
			
 
				-This clearly is a problem which needs to be addressed from cgroup core
			
 
				-in a uniform way.
			
 
				-
			
 
				-
			
 
				-Other Interface Issues
			
 
				-----------------------
			
 
				-
			
 
				-cgroup v1 grew without oversight and developed a large number of
			
 
				-idiosyncrasies and inconsistencies.  One issue on the cgroup core side
			
 
				-was how an empty cgroup was notified - a userland helper binary was
			
 
				-forked and executed for each event.  The event delivery wasn't
			
 
				-recursive or delegatable.  The limitations of the mechanism also led
			
 
				-to in-kernel event delivery filtering mechanism further complicating
			
 
				-the interface.
			
 
				-
			
 
				-Controller interfaces were problematic too.  An extreme example is
			
 
				-controllers completely ignoring hierarchical organization and treating
			
 
				-all cgroups as if they were all located directly under the root
			
 
				-cgroup.  Some controllers exposed a large amount of inconsistent
			
 
				-implementation details to userland.
			
 
				-
			
 
				-There also was no consistency across controllers.  When a new cgroup
			
 
				-was created, some controllers defaulted to not imposing extra
			
 
				-restrictions while others disallowed any resource usage until
			
 
				-explicitly configured.  Configuration knobs for the same type of
			
 
				-control used widely differing naming schemes and formats.  Statistics
			
 
				-and information knobs were named arbitrarily and used different
			
 
				-formats and units even in the same controller.
			
 
				-
			
 
				-cgroup v2 establishes common conventions where appropriate and updates
			
 
				-controllers so that they expose minimal and consistent interfaces.
			
 
				-
			
 
				-
			
 
				-Controller Issues and Remedies
			
 
				-------------------------------
			
 
				-
			
 
				-Memory
			
 
				-~~~~~~
			
 
				-
			
 
				-The original lower boundary, the soft limit, is defined as a limit
			
 
				-that is per default unset.  As a result, the set of cgroups that
			
 
				-global reclaim prefers is opt-in, rather than opt-out.  The costs for
			
 
				-optimizing these mostly negative lookups are so high that the
			
 
				-implementation, despite its enormous size, does not even provide the
			
 
				-basic desirable behavior.  First off, the soft limit has no
			
 
				-hierarchical meaning.  All configured groups are organized in a global
			
 
				-rbtree and treated like equal peers, regardless where they are located
			
 
				-in the hierarchy.  This makes subtree delegation impossible.  Second,
			
 
				-the soft limit reclaim pass is so aggressive that it not just
			
 
				-introduces high allocation latencies into the system, but also impacts
			
 
				-system performance due to overreclaim, to the point where the feature
			
 
				-becomes self-defeating.
			
 
				-
			
 
				-The memory.low boundary on the other hand is a top-down allocated
			
 
				-reserve.  A cgroup enjoys reclaim protection when it and all its
			
 
				-ancestors are below their low boundaries, which makes delegation of
			
 
				-subtrees possible.  Secondly, new cgroups have no reserve per default
			
 
				-and in the common case most cgroups are eligible for the preferred
			
 
				-reclaim pass.  This allows the new low boundary to be efficiently
			
 
				-implemented with just a minor addition to the generic reclaim code,
			
 
				-without the need for out-of-band data structures and reclaim passes.
			
 
				-Because the generic reclaim code considers all cgroups except for the
			
 
				-ones running low in the preferred first reclaim pass, overreclaim of
			
 
				-individual groups is eliminated as well, resulting in much better
			
 
				-overall workload performance.
			
 
				-
			
 
				-The original high boundary, the hard limit, is defined as a strict
			
 
				-limit that can not budge, even if the OOM killer has to be called.
			
 
				-But this generally goes against the goal of making the most out of the
			
 
				-available memory.  The memory consumption of workloads varies during
			
 
				-runtime, and that requires users to overcommit.  But doing that with a
			
 
				-strict upper limit requires either a fairly accurate prediction of the
			
 
				-working set size or adding slack to the limit.  Since working set size
			
 
				-estimation is hard and error prone, and getting it wrong results in
			
 
				-OOM kills, most users tend to err on the side of a looser limit and
			
 
				-end up wasting precious resources.
			
 
				-
			
 
				-The memory.high boundary on the other hand can be set much more
			
 
				-conservatively.  When hit, it throttles allocations by forcing them
			
 
				-into direct reclaim to work off the excess, but it never invokes the
			
 
				-OOM killer.  As a result, a high boundary that is chosen too
			
 
				-aggressively will not terminate the processes, but instead it will
			
 
				-lead to gradual performance degradation.  The user can monitor this
			
 
				-and make corrections until the minimal memory footprint that still
			
 
				-gives acceptable performance is found.
			
 
				-
			
 
				-In extreme cases, with many concurrent allocations and a complete
			
 
				-breakdown of reclaim progress within the group, the high boundary can
			
 
				-be exceeded.  But even then it's mostly better to satisfy the
			
 
				-allocation from the slack available in other groups or the rest of the
			
 
				-system than killing the group.  Otherwise, memory.max is there to
			
 
				-limit this type of spillover and ultimately contain buggy or even
			
 
				-malicious applications.
			
 
				-
			
 
				-Setting the original memory.limit_in_bytes below the current usage was
			
 
				-subject to a race condition, where concurrent charges could cause the
			
 
				-limit setting to fail. memory.max on the other hand will first set the
			
 
				-limit to prevent new charges, and then reclaim and OOM kill until the
			
 
				-new limit is met - or the task writing to memory.max is killed.
			
 
				-
			
 
				-The combined memory+swap accounting and limiting is replaced by real
			
 
				-control over swap space.
			
 
				-
			
 
				-The main argument for a combined memory+swap facility in the original
			
 
				-cgroup design was that global or parental pressure would always be
			
 
				-able to swap all anonymous memory of a child group, regardless of the
			
 
				-child's own (possibly untrusted) configuration.  However, untrusted
			
 
				-groups can sabotage swapping by other means - such as referencing its
			
 
				-anonymous memory in a tight loop - and an admin can not assume full
			
 
				-swappability when overcommitting untrusted jobs.
			
 
				-
			
 
				-For trusted jobs, on the other hand, a combined counter is not an
			
 
				-intuitive userspace interface, and it flies in the face of the idea
			
 
				-that cgroup controllers should account and limit specific physical
			
 
				-resources.  Swap space is a resource like all others in the system,
			
 
				-and that's why unified hierarchy allows distributing it separately.
			
--- a/Documentation/clk.txt
+++ b/Documentation/clk.txt
@@ -1,307 +0,0 @@
 
				-========================
			
 
				-The Common Clk Framework
			
 
				-========================
			
 
				-
			
 
				-:Author: Mike Turquette <mturquette@ti.com>
			
 
				-
			
 
				-This document endeavours to explain the common clk framework details,
			
 
				-and how to port a platform over to this framework.  It is not yet a
			
 
				-detailed explanation of the clock api in include/linux/clk.h, but
			
 
				-perhaps someday it will include that information.
			
 
				-
			
 
				-Introduction and interface split
			
 
				-================================
			
 
				-
			
 
				-The common clk framework is an interface to control the clock nodes
			
 
				-available on various devices today.  This may come in the form of clock
			
 
				-gating, rate adjustment, muxing or other operations.  This framework is
			
 
				-enabled with the CONFIG_COMMON_CLK option.
			
 
				-
			
 
				-The interface itself is divided into two halves, each shielded from the
			
 
				-details of its counterpart.  First is the common definition of struct
			
 
				-clk which unifies the framework-level accounting and infrastructure that
			
 
				-has traditionally been duplicated across a variety of platforms.  Second
			
 
				-is a common implementation of the clk.h api, defined in
			
 
				-drivers/clk/clk.c.  Finally there is struct clk_ops, whose operations
			
 
				-are invoked by the clk api implementation.
			
 
				-
			
 
				-The second half of the interface is comprised of the hardware-specific
			
 
				-callbacks registered with struct clk_ops and the corresponding
			
 
				-hardware-specific structures needed to model a particular clock.  For
			
 
				-the remainder of this document any reference to a callback in struct
			
 
				-clk_ops, such as .enable or .set_rate, implies the hardware-specific
			
 
				-implementation of that code.  Likewise, references to struct clk_foo
			
 
				-serve as a convenient shorthand for the implementation of the
			
 
				-hardware-specific bits for the hypothetical "foo" hardware.
			
 
				-
			
 
				-Tying the two halves of this interface together is struct clk_hw, which
			
 
				-is defined in struct clk_foo and pointed to within struct clk_core.  This
			
 
				-allows for easy navigation between the two discrete halves of the common
			
 
				-clock interface.
			
 
				-
			
 
				-Common data structures and api
			
 
				-==============================
			
 
				-
			
 
				-Below is the common struct clk_core definition from
			
 
				-drivers/clk/clk.c, modified for brevity::
			
 
				-
			
 
				-	struct clk_core {
			
 
				-		const char		*name;
			
 
				-		const struct clk_ops	*ops;
			
 
				-		struct clk_hw		*hw;
			
 
				-		struct module		*owner;
			
 
				-		struct clk_core		*parent;
			
 
				-		const char		**parent_names;
			
 
				-		struct clk_core		**parents;
			
 
				-		u8			num_parents;
			
 
				-		u8			new_parent_index;
			
 
				-		...
			
 
				-	};
			
 
				-
			
 
				-The members above make up the core of the clk tree topology.  The clk
			
 
				-api itself defines several driver-facing functions which operate on
			
 
				-struct clk.  That api is documented in include/linux/clk.h.
			
 
				-
			
 
				-Platforms and devices utilizing the common struct clk_core use the struct
			
 
				-clk_ops pointer in struct clk_core to perform the hardware-specific parts of
			
 
				-the operations defined in clk-provider.h::
			
 
				-
			
 
				-	struct clk_ops {
			
 
				-		int		(*prepare)(struct clk_hw *hw);
			
 
				-		void		(*unprepare)(struct clk_hw *hw);
			
 
				-		int		(*is_prepared)(struct clk_hw *hw);
			
 
				-		void		(*unprepare_unused)(struct clk_hw *hw);
			
 
				-		int		(*enable)(struct clk_hw *hw);
			
 
				-		void		(*disable)(struct clk_hw *hw);
			
 
				-		int		(*is_enabled)(struct clk_hw *hw);
			
 
				-		void		(*disable_unused)(struct clk_hw *hw);
			
 
				-		unsigned long	(*recalc_rate)(struct clk_hw *hw,
			
 
				-						unsigned long parent_rate);
			
 
				-		long		(*round_rate)(struct clk_hw *hw,
			
 
				-						unsigned long rate,
			
 
				-						unsigned long *parent_rate);
			
 
				-		int		(*determine_rate)(struct clk_hw *hw,
			
 
				-						  struct clk_rate_request *req);
			
 
				-		int		(*set_parent)(struct clk_hw *hw, u8 index);
			
 
				-		u8		(*get_parent)(struct clk_hw *hw);
			
 
				-		int		(*set_rate)(struct clk_hw *hw,
			
 
				-					    unsigned long rate,
			
 
				-					    unsigned long parent_rate);
			
 
				-		int		(*set_rate_and_parent)(struct clk_hw *hw,
			
 
				-					    unsigned long rate,
			
 
				-					    unsigned long parent_rate,
			
 
				-					    u8 index);
			
 
				-		unsigned long	(*recalc_accuracy)(struct clk_hw *hw,
			
 
				-						unsigned long parent_accuracy);
			
 
				-		int		(*get_phase)(struct clk_hw *hw);
			
 
				-		int		(*set_phase)(struct clk_hw *hw, int degrees);
			
 
				-		void		(*init)(struct clk_hw *hw);
			
 
				-		int		(*debug_init)(struct clk_hw *hw,
			
 
				-					      struct dentry *dentry);
			
 
				-	};
			
 
				-
			
 
				-Hardware clk implementations
			
 
				-============================
			
 
				-
			
 
				-The strength of the common struct clk_core comes from its .ops and .hw pointers
			
 
				-which abstract the details of struct clk from the hardware-specific bits, and
			
 
				-vice versa.  To illustrate consider the simple gateable clk implementation in
			
 
				-drivers/clk/clk-gate.c::
			
 
				-
			
 
				-	struct clk_gate {
			
 
				-		struct clk_hw	hw;
			
 
				-		void __iomem    *reg;
			
 
				-		u8              bit_idx;
			
 
				-		...
			
 
				-	};
			
 
				-
			
 
				-struct clk_gate contains struct clk_hw hw as well as hardware-specific
			
 
				-knowledge about which register and bit controls this clk's gating.
			
 
				-Nothing about clock topology or accounting, such as enable_count or
			
 
				-notifier_count, is needed here.  That is all handled by the common
			
 
				-framework code and struct clk_core.
			
 
				-
			
 
				-Let's walk through enabling this clk from driver code::
			
 
				-
			
 
				-	struct clk *clk;
			
 
				-	clk = clk_get(NULL, "my_gateable_clk");
			
 
				-
			
 
				-	clk_prepare(clk);
			
 
				-	clk_enable(clk);
			
 
				-
			
 
				-The call graph for clk_enable is very simple::
			
 
				-
			
 
				-	clk_enable(clk);
			
 
				-		clk->ops->enable(clk->hw);
			
 
				-		[resolves to...]
			
 
				-			clk_gate_enable(hw);
			
 
				-			[resolves struct clk gate with to_clk_gate(hw)]
			
 
				-				clk_gate_set_bit(gate);
			
 
				-
			
 
				-And the definition of clk_gate_set_bit::
			
 
				-
			
 
				-	static void clk_gate_set_bit(struct clk_gate *gate)
			
 
				-	{
			
 
				-		u32 reg;
			
 
				-
			
 
				-		reg = __raw_readl(gate->reg);
			
 
				-		reg |= BIT(gate->bit_idx);
			
 
				-		writel(reg, gate->reg);
			
 
				-	}
			
 
				-
			
 
				-Note that to_clk_gate is defined as::
			
 
				-
			
 
				-	#define to_clk_gate(_hw) container_of(_hw, struct clk_gate, hw)
			
 
				-
			
 
				-This pattern of abstraction is used for every clock hardware
			
 
				-representation.
			
 
				-
			
 
				-Supporting your own clk hardware
			
 
				-================================
			
 
				-
			
 
				-When implementing support for a new type of clock it is only necessary to
			
 
				-include the following header::
			
 
				-
			
 
				-	#include <linux/clk-provider.h>
			
 
				-
			
 
				-To construct a clk hardware structure for your platform you must define
			
 
				-the following::
			
 
				-
			
 
				-	struct clk_foo {
			
 
				-		struct clk_hw hw;
			
 
				-		... hardware specific data goes here ...
			
 
				-	};
			
 
				-
			
 
				-To take advantage of your data you'll need to support valid operations
			
 
				-for your clk::
			
 
				-
			
 
				-	struct clk_ops clk_foo_ops {
			
 
				-		.enable		= &clk_foo_enable;
			
 
				-		.disable	= &clk_foo_disable;
			
 
				-	};
			
 
				-
			
 
				-Implement the above functions using container_of::
			
 
				-
			
 
				-	#define to_clk_foo(_hw) container_of(_hw, struct clk_foo, hw)
			
 
				-
			
 
				-	int clk_foo_enable(struct clk_hw *hw)
			
 
				-	{
			
 
				-		struct clk_foo *foo;
			
 
				-
			
 
				-		foo = to_clk_foo(hw);
			
 
				-
			
 
				-		... perform magic on foo ...
			
 
				-
			
 
				-		return 0;
			
 
				-	};
			
 
				-
			
 
				-Below is a matrix detailing which clk_ops are mandatory based upon the
			
 
				-hardware capabilities of that clock.  A cell marked as "y" means
			
 
				-mandatory, a cell marked as "n" implies that either including that
			
 
				-callback is invalid or otherwise unnecessary.  Empty cells are either
			
 
				-optional or must be evaluated on a case-by-case basis.
			
 
				-
			
 
				-.. table:: clock hardware characteristics
			
 
				-
			
 
				-   +----------------+------+-------------+---------------+-------------+------+
			
 
				-   |                | gate | change rate | single parent | multiplexer | root |
			
 
				-   +================+======+=============+===============+=============+======+
			
 
				-   |.prepare        |      |             |               |             |      |
			
 
				-   +----------------+------+-------------+---------------+-------------+------+
			
 
				-   |.unprepare      |      |             |               |             |      |
			
 
				-   +----------------+------+-------------+---------------+-------------+------+
			
 
				-   +----------------+------+-------------+---------------+-------------+------+
			
 
				-   |.enable         | y    |             |               |             |      |
			
 
				-   +----------------+------+-------------+---------------+-------------+------+
			
 
				-   |.disable        | y    |             |               |             |      |
			
 
				-   +----------------+------+-------------+---------------+-------------+------+
			
 
				-   |.is_enabled     | y    |             |               |             |      |
			
 
				-   +----------------+------+-------------+---------------+-------------+------+
			
 
				-   +----------------+------+-------------+---------------+-------------+------+
			
 
				-   |.recalc_rate    |      | y           |               |             |      |
			
 
				-   +----------------+------+-------------+---------------+-------------+------+
			
 
				-   |.round_rate     |      | y [1]_      |               |             |      |
			
 
				-   +----------------+------+-------------+---------------+-------------+------+
			
 
				-   |.determine_rate |      | y [1]_      |               |             |      |
			
 
				-   +----------------+------+-------------+---------------+-------------+------+
			
 
				-   |.set_rate       |      | y           |               |             |      |
			
 
				-   +----------------+------+-------------+---------------+-------------+------+
			
 
				-   +----------------+------+-------------+---------------+-------------+------+
			
 
				-   |.set_parent     |      |             | n             | y           | n    |
			
 
				-   +----------------+------+-------------+---------------+-------------+------+
			
 
				-   |.get_parent     |      |             | n             | y           | n    |
			
 
				-   +----------------+------+-------------+---------------+-------------+------+
			
 
				-   +----------------+------+-------------+---------------+-------------+------+
			
 
				-   |.recalc_accuracy|      |             |               |             |      |
			
 
				-   +----------------+------+-------------+---------------+-------------+------+
			
 
				-   +----------------+------+-------------+---------------+-------------+------+
			
 
				-   |.init           |      |             |               |             |      |
			
 
				-   +----------------+------+-------------+---------------+-------------+------+
			
 
				-
			
 
				-.. [1] either one of round_rate or determine_rate is required.
			
 
				-
			
 
				-Finally, register your clock at run-time with a hardware-specific
			
 
				-registration function.  This function simply populates struct clk_foo's
			
 
				-data and then passes the common struct clk parameters to the framework
			
 
				-with a call to::
			
 
				-
			
 
				-	clk_register(...)
			
 
				-
			
 
				-See the basic clock types in ``drivers/clk/clk-*.c`` for examples.
			
 
				-
			
 
				-Disabling clock gating of unused clocks
			
 
				-=======================================
			
 
				-
			
 
				-Sometimes during development it can be useful to be able to bypass the
			
 
				-default disabling of unused clocks. For example, if drivers aren't enabling
			
 
				-clocks properly but rely on them being on from the bootloader, bypassing
			
 
				-the disabling means that the driver will remain functional while the issues
			
 
				-are sorted out.
			
 
				-
			
 
				-To bypass this disabling, include "clk_ignore_unused" in the bootargs to the
			
 
				-kernel.
			
 
				-
			
 
				-Locking
			
 
				-=======
			
 
				-
			
 
				-The common clock framework uses two global locks, the prepare lock and the
			
 
				-enable lock.
			
 
				-
			
 
				-The enable lock is a spinlock and is held across calls to the .enable,
			
 
				-.disable operations. Those operations are thus not allowed to sleep,
			
 
				-and calls to the clk_enable(), clk_disable() API functions are allowed in
			
 
				-atomic context.
			
 
				-
			
 
				-For clk_is_enabled() API, it is also designed to be allowed to be used in
			
 
				-atomic context. However, it doesn't really make any sense to hold the enable
			
 
				-lock in core, unless you want to do something else with the information of
			
 
				-the enable state with that lock held. Otherwise, seeing if a clk is enabled is
			
 
				-a one-shot read of the enabled state, which could just as easily change after
			
 
				-the function returns because the lock is released. Thus the user of this API
			
 
				-needs to handle synchronizing the read of the state with whatever they're
			
 
				-using it for to make sure that the enable state doesn't change during that
			
 
				-time.
			
 
				-
			
 
				-The prepare lock is a mutex and is held across calls to all other operations.
			
 
				-All those operations are allowed to sleep, and calls to the corresponding API
			
 
				-functions are not allowed in atomic context.
			
 
				-
			
 
				-This effectively divides operations in two groups from a locking perspective.
			
 
				-
			
 
				-Drivers don't need to manually protect resources shared between the operations
			
 
				-of one group, regardless of whether those resources are shared by multiple
			
 
				-clocks or not. However, access to resources that are shared between operations
			
 
				-of the two groups needs to be protected by the drivers. An example of such a
			
 
				-resource would be a register that controls both the clock rate and the clock
			
 
				-enable/disable state.
			
 
				-
			
 
				-The clock framework is reentrant, in that a driver is allowed to call clock
			
 
				-framework functions from within its implementation of clock operations. This
			
 
				-can for instance cause a .set_rate operation of one clock being called from
			
 
				-within the .set_rate operation of another clock. This case must be considered
			
 
				-in the driver implementations, but the code flow is usually controlled by the
			
 
				-driver in that case.
			
 
				-
			
 
				-Note that locking must also be considered when code outside of the common
			
 
				-clock framework needs to access resources used by the clock operations. This
			
 
				-is considered out of scope of this document.
			
--- a/Documentation/core-api/atomic_ops.rst
+++ b/Documentation/core-api/atomic_ops.rst
@@ -111,7 +111,6 @@ If the compiler can prove that do_something() does not store to the
 
				 variable a, then the compiler is within its rights transforming this to
			
 
				 the following::
			
 
				 
			
 
				-	tmp = a;
			
 
				 	if (a > 0)
			
 
				 		for (;;)
			
 
				 			do_something();
			
@@ -119,7 +118,7 @@ the following::
 
				 If you don't want the compiler to do this (and you probably don't), then
			
 
				 you should use something like the following::
			
 
				 
			
 
				-	while (READ_ONCE(a) < 0)
			
 
				+	while (READ_ONCE(a) > 0)
			
 
				 		do_something();
			
 
				 
			
 
				 Alternatively, you could place a barrier() call in the loop.
			
@@ -467,10 +466,12 @@ Like the above, except that these routines return a boolean which
 
				 indicates whether the changed bit was set _BEFORE_ the atomic bit
			
 
				 operation.
			
 
				 
			
 
				-WARNING! It is incredibly important that the value be a boolean,
			
 
				-ie. "0" or "1".  Do not try to be fancy and save a few instructions by
			
 
				-declaring the above to return "long" and just returning something like
			
 
				-"old_val & mask" because that will not work.
			
 
				+
			
 
				+.. warning::
			
 
				+        It is incredibly important that the value be a boolean, ie. "0" or "1".
			
 
				+        Do not try to be fancy and save a few instructions by declaring the
			
 
				+        above to return "long" and just returning something like "old_val &
			
 
				+        mask" because that will not work.
			
 
				 
			
 
				 For one thing, this return value gets truncated to int in many code
			
 
				 paths using these interfaces, so on 64-bit if the bit is set in the
			
--- a/Documentation/core-api/cachetlb.rst
+++ b/Documentation/core-api/cachetlb.rst
--- a/Documentation/core-api/circular-buffers.rst
+++ b/Documentation/core-api/circular-buffers.rst
--- a/Documentation/core-api/gfp_mask-from-fs-io.rst
+++ b/Documentation/core-api/gfp_mask-from-fs-io.rst
@@ -0,0 +1,66 @@
 
				+=================================
			
 
				+GFP masks used from FS/IO context
			
 
				+=================================
			
 
				+
			
 
				+:Date: May, 2018
			
 
				+:Author: Michal Hocko <mhocko@kernel.org>
			
 
				+
			
 
				+Introduction
			
 
				+============
			
 
				+
			
 
				+Code paths in the filesystem and IO stacks must be careful when
			
 
				+allocating memory to prevent recursion deadlocks caused by direct
			
 
				+memory reclaim calling back into the FS or IO paths and blocking on
			
 
				+already held resources (e.g. locks - most commonly those used for the
			
 
				+transaction context).
			
 
				+
			
 
				+The traditional way to avoid this deadlock problem is to clear __GFP_FS
			
 
				+respectively __GFP_IO (note the latter implies clearing the first as well) in
			
 
				+the gfp mask when calling an allocator. GFP_NOFS respectively GFP_NOIO can be
			
 
				+used as a shortcut. It turned out though that the above approach has led to
			
 
				+abuses when the restricted gfp mask is used "just in case" without a
			
 
				+deeper consideration which leads to problems because an excessive use
			
 
				+of GFP_NOFS/GFP_NOIO can lead to memory over-reclaim or other memory
			
 
				+reclaim issues.
			
 
				+
			
 
				+New API
			
 
				+========
			
 
				+
			
 
				+Since 4.12 we do have a generic scope API for both NOFS and NOIO context
			
 
				+``memalloc_nofs_save``, ``memalloc_nofs_restore`` respectively ``memalloc_noio_save``,
			
 
				+``memalloc_noio_restore`` which allow to mark a scope to be a critical
			
 
				+section from a filesystem or I/O point of view. Any allocation from that
			
 
				+scope will inherently drop __GFP_FS respectively __GFP_IO from the given
			
 
				+mask so no memory allocation can recurse back in the FS/IO.
			
 
				+
			
 
				+.. kernel-doc:: include/linux/sched/mm.h
			
 
				+   :functions: memalloc_nofs_save memalloc_nofs_restore
			
 
				+.. kernel-doc:: include/linux/sched/mm.h
			
 
				+   :functions: memalloc_noio_save memalloc_noio_restore
			
 
				+
			
 
				+FS/IO code then simply calls the appropriate save function before
			
 
				+any critical section with respect to the reclaim is started - e.g.
			
 
				+lock shared with the reclaim context or when a transaction context
			
 
				+nesting would be possible via reclaim. The restore function should be
			
 
				+called when the critical section ends. All that ideally along with an
			
 
				+explanation of what the reclaim context is, for easier maintenance.
			
 
				+
			
 
				+Please note that the proper pairing of save/restore functions
			
 
				+allows nesting so it is safe to call ``memalloc_noio_save`` or
			
 
				+``memalloc_noio_restore`` respectively from an existing NOIO or NOFS
			
 
				+scope.
			
 
				+
			
 
				+What about __vmalloc(GFP_NOFS)
			
 
				+==============================
			
 
				+
			
 
				+vmalloc doesn't support GFP_NOFS semantic because there are hardcoded
			
 
				+GFP_KERNEL allocations deep inside the allocator which are quite non-trivial
			
 
				+to fix up. That means that calling ``vmalloc`` with GFP_NOFS/GFP_NOIO is
			
 
				+almost always a bug. The good news is that the NOFS/NOIO semantic can be
			
 
				+achieved by the scope API.
			
 
				+
			
 
				+In the ideal world, upper layers should already mark dangerous contexts
			
 
				+and so no special care is required and vmalloc should be called without
			
 
				+any problems. Sometimes if the context is not really clear or there are
			
 
				+layering violations then the recommended way around that is to wrap ``vmalloc``
			
 
				+by the scope API with a comment explaining the problem.
			
--- a/Documentation/core-api/index.rst
+++ b/Documentation/core-api/index.rst
@@ -14,6 +14,7 @@ Core utilities
 
				    kernel-api
			
 
				    assoc_array
			
 
				    atomic_ops
			
 
				+   cachetlb
			
 
				    refcount-vs-atomic
			
 
				    cpu_hotplug
			
 
				    idr
			
@@ -25,6 +26,8 @@ Core utilities
 
				    genalloc
			
 
				    errseq
			
 
				    printk-formats
			
 
				+   circular-buffers
			
 
				+   gfp_mask-from-fs-io
			
 
				 
			
 
				 Interfaces for kernel debugging
			
 
				 ===============================
			
--- a/Documentation/core-api/kernel-api.rst
+++ b/Documentation/core-api/kernel-api.rst
@@ -39,17 +39,17 @@ String Manipulation
 
				 .. kernel-doc:: lib/string.c
			
 
				    :export:
			
 
				 
			
 
				+Basic Kernel Library Functions
			
 
				+==============================
			
 
				+
			
 
				+The Linux kernel provides more basic utility functions.
			
 
				+
			
 
				 Bit Operations
			
 
				 --------------
			
 
				 
			
 
				 .. kernel-doc:: arch/x86/include/asm/bitops.h
			
 
				    :internal:
			
 
				 
			
 
				-Basic Kernel Library Functions
			
 
				-==============================
			
 
				-
			
 
				-The Linux kernel provides more basic utility functions.
			
 
				-
			
 
				 Bitmap Operations
			
 
				 -----------------
			
 
				 
			
@@ -80,6 +80,31 @@ Command-line Parsing
 
				 .. kernel-doc:: lib/cmdline.c
			
 
				    :export:
			
 
				 
			
 
				+Sorting
			
 
				+-------
			
 
				+
			
 
				+.. kernel-doc:: lib/sort.c
			
 
				+   :export:
			
 
				+
			
 
				+.. kernel-doc:: lib/list_sort.c
			
 
				+   :export:
			
 
				+
			
 
				+Text Searching
			
 
				+--------------
			
 
				+
			
 
				+.. kernel-doc:: lib/textsearch.c
			
 
				+   :doc: ts_intro
			
 
				+
			
 
				+.. kernel-doc:: lib/textsearch.c
			
 
				+   :export:
			
 
				+
			
 
				+.. kernel-doc:: include/linux/textsearch.h
			
 
				+   :functions: textsearch_find textsearch_next \
			
 
				+               textsearch_get_pattern textsearch_get_pattern_len
			
 
				+
			
 
				+CRC and Math Functions in Linux
			
 
				+===============================
			
 
				+
			
 
				 CRC Functions
			
 
				 -------------
			
 
				 
			
@@ -103,9 +128,6 @@ CRC Functions
 
				 .. kernel-doc:: lib/crc-itu-t.c
			
 
				    :export:
			
 
				 
			
 
				-Math Functions in Linux
			
 
				-=======================
			
 
				-
			
 
				 Base 2 log and power Functions
			
 
				 ------------------------------
			
 
				 
			
@@ -127,28 +149,6 @@ Division Functions
 
				 .. kernel-doc:: lib/gcd.c
			
 
				    :export:
			
 
				 
			
 
				-Sorting
			
 
				--------
			
 
				-
			
 
				-.. kernel-doc:: lib/sort.c
			
 
				-   :export:
			
 
				-
			
 
				-.. kernel-doc:: lib/list_sort.c
			
 
				-   :export:
			
 
				-
			
 
				-Text Searching
			
 
				---------------
			
 
				-
			
 
				-.. kernel-doc:: lib/textsearch.c
			
 
				-   :doc: ts_intro
			
 
				-
			
 
				-.. kernel-doc:: lib/textsearch.c
			
 
				-   :export:
			
 
				-
			
 
				-.. kernel-doc:: include/linux/textsearch.h
			
 
				-   :functions: textsearch_find textsearch_next \
			
 
				-               textsearch_get_pattern textsearch_get_pattern_len
			
 
				-
			
 
				 UUID/GUID
			
 
				 ---------
			
 
				 
			
@@ -284,7 +284,7 @@ Resources Management
 
				 MTRR Handling
			
 
				 -------------
			
 
				 
			
 
				-.. kernel-doc:: arch/x86/kernel/cpu/mtrr/main.c
			
 
				+.. kernel-doc:: arch/x86/kernel/cpu/mtrr/mtrr.c
			
 
				    :export:
			
 
				 
			
 
				 Security Framework
			
--- a/Documentation/core-api/printk-formats.rst
+++ b/Documentation/core-api/printk-formats.rst
@@ -419,11 +419,10 @@ struct clk
 
				 
			
 
				 	%pC	pll1
			
 
				 	%pCn	pll1
			
 
				-	%pCr	1560000000
			
 
				 
			
 
				 For printing struct clk structures. %pC and %pCn print the name
			
 
				 (Common Clock Framework) or address (legacy clock framework) of the
			
 
				-structure; %pCr prints the current clock rate.
			
 
				+structure.
			
 
				 
			
 
				 Passed by reference.
			
 
				 
			
--- a/Documentation/core-api/refcount-vs-atomic.rst
+++ b/Documentation/core-api/refcount-vs-atomic.rst
@@ -17,7 +17,7 @@ in order to help maintainers validate their code against the change in
 
				 these memory ordering guarantees.
			
 
				 
			
 
				 The terms used through this document try to follow the formal LKMM defined in
			
 
				-github.com/aparri/memory-model/blob/master/Documentation/explanation.txt
			
 
				+tools/memory-model/Documentation/explanation.txt.
			
 
				 
			
 
				 memory-barriers.txt and atomic_t.txt provide more background to the
			
 
				 memory ordering in general and for atomic operations specifically.
			
--- a/Documentation/crypto/crypto_engine.rst
+++ b/Documentation/crypto/crypto_engine.rst
@@ -8,11 +8,13 @@ The crypto engine API (CE), is a crypto queue manager.
 
				 
			
 
				 Requirement
			
 
				 -----------
			
 
				-You have to put at start of your tfm_ctx the struct crypto_engine_ctx
			
 
				-struct your_tfm_ctx {
			
 
				+You have to put at start of your tfm_ctx the struct crypto_engine_ctx::
			
 
				+
			
 
				+  struct your_tfm_ctx {
			
 
				         struct crypto_engine_ctx enginectx;
			
 
				         ...
			
 
				-};
			
 
				+  };
			
 
				+
			
 
				 Why: Since CE manage only crypto_async_request, it cannot know the underlying
			
 
				 request_type and so have access only on the TFM.
			
 
				 So using container_of for accessing __ctx is impossible.
			
--- a/Documentation/crypto/index.rst
+++ b/Documentation/crypto/index.rst
@@ -20,5 +20,6 @@ for cryptographic use cases, as well as programming examples.
 
				    architecture
			
 
				    devel-algos
			
 
				    userspace-if
			
 
				+   crypto_engine
			
 
				    api
			
 
				    api-samples
			
--- a/Documentation/dell_rbu.txt
+++ b/Documentation/dell_rbu.txt
@@ -121,10 +121,7 @@ read back the image downloaded.
 
				 
			
 
				 .. note::
			
 
				 
			
 
				-   This driver requires a patch for firmware_class.c which has the modified
			
 
				-   request_firmware_nowait function.
			
 
				-
			
 
				-   Also after updating the BIOS image a user mode application needs to execute
			
 
				+   After updating the BIOS image a user mode application needs to execute
			
 
				    code which sends the BIOS update request to the BIOS. So on the next reboot
			
 
				    the BIOS knows about the new image downloaded and it updates itself.
			
 
				    Also don't unload the rbu driver if the image has to be updated.
			
--- a/Documentation/dev-tools/kasan.rst
+++ b/Documentation/dev-tools/kasan.rst
@@ -120,7 +120,7 @@ A typical out of bounds access report looks like this::
 
				 
			
 
				 The header of the report discribe what kind of bug happened and what kind of
			
 
				 access caused it. It's followed by the description of the accessed slub object
			
 
				-(see 'SLUB Debug output' section in Documentation/vm/slub.txt for details) and
			
 
				+(see 'SLUB Debug output' section in Documentation/vm/slub.rst for details) and
			
 
				 the description of the accessed memory page.
			
 
				 
			
 
				 In the last section the report shows memory state around the accessed address.
			
--- a/Documentation/dev-tools/kselftest.rst
+++ b/Documentation/dev-tools/kselftest.rst
@@ -151,6 +151,11 @@ Contributing new tests (details)
 
				    TEST_FILES, TEST_GEN_FILES mean it is the file which is used by
			
 
				    test.
			
 
				 
			
 
				+ * First use the headers inside the kernel source and/or git repo, and then the
			
 
				+   system headers.  Headers for the kernel release as opposed to headers
			
 
				+   installed by the distro on the system should be the primary focus to be able
			
 
				+   to find regressions.
			
 
				+
			
 
				 Test Harness
			
 
				 ============
			
 
				 
			
--- a/Documentation/device-mapper/writecache.txt
+++ b/Documentation/device-mapper/writecache.txt
@@ -0,0 +1,68 @@
 
				+The writecache target caches writes on persistent memory or on SSD. It
			
 
				+doesn't cache reads because reads are supposed to be cached in page cache
			
 
				+in normal RAM.
			
 
				+
			
 
				+When the device is constructed, the first sector should be zeroed or the
			
 
				+first sector should contain a valid superblock from a previous invocation.
			
 
				+
			
 
				+Constructor parameters:
			
 
				+1. type of the cache device - "p" or "s"
			
 
				+	p - persistent memory
			
 
				+	s - SSD
			
 
				+2. the underlying device that will be cached
			
 
				+3. the cache device
			
 
				+4. block size (4096 is recommended; the maximum block size is the page
			
 
				+   size)
			
 
				+5. the number of optional parameters (the parameters with an argument
			
 
				+   count as two)
			
 
				+	high_watermark n	(default: 50)
			
 
				+		start writeback when the number of used blocks reaches this
			
 
				+		watermark
			
 
				+	low_watermark x		(default: 45)
			
 
				+		stop writeback when the number of used blocks drops below
			
 
				+		this watermark
			
 
				+	writeback_jobs n	(default: unlimited)
			
 
				+		limit the number of blocks that are in flight during
			
 
				+		writeback. Setting this value reduces writeback
			
 
				+		throughput, but it may improve latency of read requests
			
 
				+	autocommit_blocks n	(default: 64 for pmem, 65536 for ssd)
			
 
				+		when the application writes this amount of blocks without
			
 
				+		issuing the FLUSH request, the blocks are automatically
			
 
				+		committed
			
 
				+	autocommit_time ms	(default: 1000)
			
 
				+		autocommit time in milliseconds. The data is automatically
			
 
				+		committed if this time passes and no FLUSH request is
			
 
				+		received
			
 
				+	fua			(by default on)
			
 
				+		applicable only to persistent memory - use the FUA flag
			
 
				+		when writing data from persistent memory back to the
			
 
				+		underlying device
			
 
				+	nofua
			
 
				+		applicable only to persistent memory - don't use the FUA
			
 
				+		flag when writing back data and send the FLUSH request
			
 
				+		afterwards
			
 
				+		- some underlying devices perform better with fua, some
			
 
				+		  with nofua. The user should test it
			
 
				+
			
 
				+Status:
			
 
				+1. error indicator - 0 if there was no error, otherwise error number
			
 
				+2. the number of blocks
			
 
				+3. the number of free blocks
			
 
				+4. the number of blocks under writeback
			
 
				+
			
 
				+Messages:
			
 
				+	flush
			
 
				+		flush the cache device. The message returns successfully
			
 
				+		if the cache device was flushed without an error
			
 
				+	flush_on_suspend
			
 
				+		flush the cache device on next suspend. Use this message
			
 
				+		when you are going to remove the cache device. The proper
			
 
				+		sequence for removing the cache device is:
			
 
				+		1. send the "flush_on_suspend" message
			
 
				+		2. load an inactive table with a linear target that maps
			
 
				+		   to the underlying device
			
 
				+		3. suspend the device
			
 
				+		4. ask for status and verify that there are no errors
			
 
				+		5. resume the device, so that it will use the linear
			
 
				+		   target
			
 
				+		6. the cache device is now inactive and it can be deleted
			
--- a/Documentation/devicetree/bindings/arm/amlogic.txt
+++ b/Documentation/devicetree/bindings/arm/amlogic.txt
@@ -25,6 +25,10 @@ Boards with the Amlogic Meson8b SoC shall have the following properties:
 
				   Required root node property:
			
 
				     compatible: "amlogic,meson8b";
			
 
				 
			
 
				+Boards with the Amlogic Meson8m2 SoC shall have the following properties:
			
 
				+  Required root node property:
			
 
				+    compatible: "amlogic,meson8m2";
			
 
				+
			
 
				 Boards with the Amlogic Meson GXBaby SoC shall have the following properties:
			
 
				   Required root node property:
			
 
				     compatible: "amlogic,meson-gxbb";
			
@@ -54,6 +58,8 @@ Board compatible values (alphabetically, grouped by SoC):
 
				   - "hardkernel,odroid-c1" (Meson8b)
			
 
				   - "tronfy,mxq" (Meson8b)
			
 
				 
			
 
				+  - "tronsmart,mxiii-plus" (Meson8m2)
			
 
				+
			
 
				   - "amlogic,p200" (Meson gxbb)
			
 
				   - "amlogic,p201" (Meson gxbb)
			
 
				   - "friendlyarm,nanopi-k2" (Meson gxbb)
			
--- a/Documentation/devicetree/bindings/arm/bcm/brcm,bcm2835.txt
+++ b/Documentation/devicetree/bindings/arm/bcm/brcm,bcm2835.txt
@@ -34,6 +34,10 @@ Raspberry Pi 3 Model B
 
				 Required root node properties:
			
 
				 compatible = "raspberrypi,3-model-b", "brcm,bcm2837";
			
 
				 
			
 
				+Raspberry Pi 3 Model B+
			
 
				+Required root node properties:
			
 
				+compatible = "raspberrypi,3-model-b-plus", "brcm,bcm2837";
			
 
				+
			
 
				 Raspberry Pi Compute Module
			
 
				 Required root node properties:
			
 
				 compatible = "raspberrypi,compute-module", "brcm,bcm2835";
			
--- a/Documentation/devicetree/bindings/arm/mediatek/mediatek,g3dsys.txt
+++ b/Documentation/devicetree/bindings/arm/mediatek/mediatek,g3dsys.txt
@@ -0,0 +1,30 @@
 
				+MediaTek g3dsys controller
			
 
				+============================
			
 
				+
			
 
				+The MediaTek g3dsys controller provides various clocks and reset controller to
			
 
				+the GPU.
			
 
				+
			
 
				+Required Properties:
			
 
				+
			
 
				+- compatible: Should be:
			
 
				+	- "mediatek,mt2701-g3dsys", "syscon":
			
 
				+		for MT2701 SoC
			
 
				+	- "mediatek,mt7623-g3dsys", "mediatek,mt2701-g3dsys", "syscon":
			
 
				+		for MT7623 SoC
			
 
				+- #clock-cells: Must be 1
			
 
				+- #reset-cells: Must be 1
			
 
				+
			
 
				+The g3dsys controller uses the common clk binding from
			
 
				+Documentation/devicetree/bindings/clock/clock-bindings.txt
			
 
				+The available clocks are defined in dt-bindings/clock/mt*-clk.h.
			
 
				+
			
 
				+Example:
			
 
				+
			
 
				+g3dsys: clock-controller@13000000 {
			
 
				+	compatible = "mediatek,mt7623-g3dsys",
			
 
				+		     "mediatek,mt2701-g3dsys",
			
 
				+		     "syscon";
			
 
				+	reg = <0 0x13000000 0 0x200>;
			
 
				+	#clock-cells = <1>;
			
 
				+	#reset-cells = <1>;
			
 
				+};
			
--- a/Documentation/devicetree/bindings/arm/samsung/samsung-boards.txt
+++ b/Documentation/devicetree/bindings/arm/samsung/samsung-boards.txt
@@ -21,8 +21,6 @@ Required root node properties:
 
				 	- "samsung,smdk5420"	- for Exynos5420-based Samsung SMDK5420 eval board.
			
 
				 	- "samsung,tm2"		- for Exynos5433-based Samsung TM2 board.
			
 
				 	- "samsung,tm2e"	- for Exynos5433-based Samsung TM2E board.
			
 
				-	- "samsung,sd5v1"	- for Exynos5440-based Samsung board.
			
 
				-	- "samsung,ssdk5440"	- for Exynos5440-based Samsung board.
			
 
				 
			
 
				 * Other companies Exynos SoC based
			
 
				   * FriendlyARM
			
--- a/Documentation/devicetree/bindings/arm/shmobile.txt
+++ b/Documentation/devicetree/bindings/arm/shmobile.txt
@@ -21,6 +21,8 @@ SoCs:
 
				     compatible = "renesas,r8a7744"
			
 
				   - RZ/G1E (R8A77450)
			
 
				     compatible = "renesas,r8a7745"
			
 
				+  - RZ/G1C (R8A77470)
			
 
				+    compatible = "renesas,r8a77470"
			
 
				   - R-Car M1A (R8A77781)
			
 
				     compatible = "renesas,r8a7778"
			
 
				   - R-Car H1 (R8A77790)
			
@@ -45,6 +47,8 @@ SoCs:
 
				     compatible = "renesas,r8a77970"
			
 
				   - R-Car V3H (R8A77980)
			
 
				     compatible = "renesas,r8a77980"
			
 
				+  - R-Car E3 (R8A77990)
			
 
				+    compatible = "renesas,r8a77990"
			
 
				   - R-Car D3 (R8A77995)
			
 
				     compatible = "renesas,r8a77995"
			
 
				 
			
@@ -67,6 +71,8 @@ Boards:
 
				     compatible = "renesas,draak", "renesas,r8a77995"
			
 
				   - Eagle (RTP0RC77970SEB0010S)
			
 
				     compatible = "renesas,eagle", "renesas,r8a77970"
			
 
				+  - Ebisu (RTP0RC77990SEB0010S)
			
 
				+    compatible = "renesas,ebisu", "renesas,r8a77990"
			
 
				   - Genmai (RTK772100BC00000BR)
			
 
				     compatible = "renesas,genmai", "renesas,r7s72100"
			
 
				   - GR-Peach (X28A-M01-E/F)
			
@@ -78,6 +84,8 @@ Boards:
 
				     compatible = "renesas,h3ulcb", "renesas,r8a7795"
			
 
				   - Henninger
			
 
				     compatible = "renesas,henninger", "renesas,r8a7791"
			
 
				+  - iWave Systems RZ/G1C Single Board Computer (iW-RainboW-G23S)
			
 
				+    compatible = "iwave,g23s", "renesas,r8a77470"
			
 
				   - iWave Systems RZ/G1E SODIMM SOM Development Platform (iW-RainboW-G22D)
			
 
				     compatible = "iwave,g22d", "iwave,g22m", "renesas,r8a7745"
			
 
				   - iWave Systems RZ/G1E SODIMM System On Module (iW-RainboW-G22M-SM)
			
@@ -108,7 +116,7 @@ Boards:
 
				     compatible = "renesas,salvator-x", "renesas,r8a7795"
			
 
				   - Salvator-X (RTP0RC7796SIPB0011S)
			
 
				     compatible = "renesas,salvator-x", "renesas,r8a7796"
			
 
				-  - Salvator-X (RTP0RC7796SIPB0011S (M3N))
			
 
				+  - Salvator-X (RTP0RC7796SIPB0011S (M3-N))
			
 
				     compatible = "renesas,salvator-x", "renesas,r8a77965"
			
 
				   - Salvator-XS (Salvator-X 2nd version, RTP0RC7795SIPB0012S)
			
 
				     compatible = "renesas,salvator-xs", "renesas,r8a7795"
			
@@ -124,6 +132,8 @@ Boards:
 
				     compatible = "renesas,sk-rzg1m", "renesas,r8a7743"
			
 
				   - Stout (ADAS Starterkit, Y-R-CAR-ADAS-SKH2-BOARD)
			
 
				     compatible = "renesas,stout", "renesas,r8a7790"
			
 
				+  - V3HSK (Y-ASK-RCAR-V3H-WS10)
			
 
				+    compatible = "renesas,v3hsk", "renesas,r8a77980"
			
 
				   - V3MSK (Y-ASK-RCAR-V3M-WS10)
			
 
				     compatible = "renesas,v3msk", "renesas,r8a77970"
			
 
				   - Wheat (RTP0RC7792ASKB0000JE)
			
--- a/Documentation/devicetree/bindings/arm/stm32/stm32-syscon.txt
+++ b/Documentation/devicetree/bindings/arm/stm32/stm32-syscon.txt
@@ -0,0 +1,14 @@
 
				+STMicroelectronics STM32 Platforms System Controller
			
 
				+
			
 
				+Properties:
			
 
				+   - compatible : should contain two values. First value must be :
			
 
				+                 - "st,stm32mp157-syscfg" - for stm32mp157 based SoCs,
			
 
				+                 second value must be always "syscon".
			
 
				+   - reg : offset and length of the register set.
			
 
				+
			
 
				+ Example:
			
 
				+         syscfg: syscon@50020000 {
			
 
				+                 compatible = "st,stm32mp157-syscfg", "syscon";
			
 
				+                 reg = <0x50020000 0x400>;
			
 
				+         };
			
 
				+
			
--- a/Documentation/devicetree/bindings/arm/stm32/stm32.txt
+++ b/Documentation/devicetree/bindings/arm/stm32/stm32.txt
--- a/Documentation/devicetree/bindings/arm/tegra/nvidia,tegra20-mc.txt
+++ b/Documentation/devicetree/bindings/arm/tegra/nvidia,tegra20-mc.txt
@@ -1,16 +0,0 @@
 
				-NVIDIA Tegra20 MC(Memory Controller)
			
 
				-
			
 
				-Required properties:
			
 
				-- compatible : "nvidia,tegra20-mc"
			
 
				-- reg : Should contain 2 register ranges(address and length); see the
			
 
				-  example below. Note that the MC registers are interleaved with the
			
 
				-  GART registers, and hence must be represented as multiple ranges.
			
 
				-- interrupts : Should contain MC General interrupt.
			
 
				-
			
 
				-Example:
			
 
				-	memory-controller@7000f000 {
			
 
				-		compatible = "nvidia,tegra20-mc";
			
 
				-		reg = <0x7000f000 0x024
			
 
				-		       0x7000f03c 0x3c4>;
			
 
				-		interrupts = <0 77 0x04>;
			
 
				-	};
			
--- a/Documentation/devicetree/bindings/arm/tegra/nvidia,tegra30-mc.txt
+++ b/Documentation/devicetree/bindings/arm/tegra/nvidia,tegra30-mc.txt
@@ -1,18 +0,0 @@
 
				-NVIDIA Tegra30 MC(Memory Controller)
			
 
				-
			
 
				-Required properties:
			
 
				-- compatible : "nvidia,tegra30-mc"
			
 
				-- reg : Should contain 4 register ranges(address and length); see the
			
 
				-  example below. Note that the MC registers are interleaved with the
			
 
				-  SMMU registers, and hence must be represented as multiple ranges.
			
 
				-- interrupts : Should contain MC General interrupt.
			
 
				-
			
 
				-Example:
			
 
				-	memory-controller {
			
 
				-		compatible = "nvidia,tegra30-mc";
			
 
				-		reg = <0x7000f000 0x010
			
 
				-		       0x7000f03c 0x1b4
			
 
				-		       0x7000f200 0x028
			
 
				-		       0x7000f284 0x17c>;
			
 
				-		interrupts = <0 77 0x04>;
			
 
				-	};
			
--- a/Documentation/devicetree/bindings/arm/ux500/boards.txt
+++ b/Documentation/devicetree/bindings/arm/ux500/boards.txt
@@ -26,7 +26,7 @@ interrupt-controller:
 
				 	see binding for interrupt-controller/arm,gic.txt
			
 
				 
			
 
				 timer:
			
 
				-	see binding for arm/twd.txt
			
 
				+	see binding for timer/arm,twd.txt
			
 
				 
			
 
				 clocks:
			
 
				 	see binding for clocks/ux500.txt
			
--- a/Documentation/devicetree/bindings/bus/ti-sysc.txt
+++ b/Documentation/devicetree/bindings/bus/ti-sysc.txt
@@ -79,7 +79,11 @@ Optional properties:
 
				 		mode as for example omap4 L4_CFG_CLKCTRL
			
 
				 
			
 
				 - clock-names	should contain at least "fck", and optionally also "ick"
			
 
				-		depending on the SoC and the interconnect target module
			
 
				+		depending on the SoC and the interconnect target module,
			
 
				+		some interconnect target modules also need additional
			
 
				+		optional clocks that can be specified as listed in TRM
			
 
				+		for the related CLKCTRL register bits 8 to 15 such as
			
 
				+		"dbclk" or "clk32k" depending on their role
			
 
				 
			
 
				 - ti,hwmods	optional TI interconnect module name to use legacy
			
 
				 		hwmod platform data
			
--- a/Documentation/devicetree/bindings/clock/actions,s900-cmu.txt
+++ b/Documentation/devicetree/bindings/clock/actions,s900-cmu.txt
@@ -0,0 +1,47 @@
 
				+* Actions S900 Clock Management Unit (CMU)
			
 
				+
			
 
				+The Actions S900 clock management unit generates and supplies clock to various
			
 
				+controllers within the SoC. The clock binding described here is applicable to
			
 
				+S900 SoC.
			
 
				+
			
 
				+Required Properties:
			
 
				+
			
 
				+- compatible: should be "actions,s900-cmu"
			
 
				+- reg: physical base address of the controller and length of memory mapped
			
 
				+  region.
			
 
				+- clocks: Reference to the parent clocks ("hosc", "losc")
			
 
				+- #clock-cells: should be 1.
			
 
				+
			
 
				+Each clock is assigned an identifier, and client nodes can use this identifier
			
 
				+to specify the clock which they consume.
			
 
				+
			
 
				+All available clocks are defined as preprocessor macros in
			
 
				+dt-bindings/clock/actions,s900-cmu.h header and can be used in device
			
 
				+tree sources.
			
 
				+
			
 
				+External clocks:
			
 
				+
			
 
				+The hosc clock used as input for the plls is generated outside the SoC. It is
			
 
				+expected that it is defined using standard clock bindings as "hosc".
			
 
				+
			
 
				+Actions S900 CMU also requires one more clock:
			
 
				+ - "losc" - internal low frequency oscillator
			
 
				+
			
 
				+Example: Clock Management Unit node:
			
 
				+
			
 
				+        cmu: clock-controller@e0160000 {
			
 
				+                compatible = "actions,s900-cmu";
			
 
				+                reg = <0x0 0xe0160000 0x0 0x1000>;
			
 
				+                clocks = <&hosc>, <&losc>;
			
 
				+                #clock-cells = <1>;
			
 
				+        };
			
 
				+
			
 
				+Example: UART controller node that consumes clock generated by the clock
			
 
				+management unit:
			
 
				+
			
 
				+        uart: serial@e012a000 {
			
 
				+                compatible = "actions,s900-uart", "actions,owl-uart";
			
 
				+                reg = <0x0 0xe012a000 0x0 0x2000>;
			
 
				+                interrupts = <GIC_SPI 34 IRQ_TYPE_LEVEL_HIGH>;
			
 
				+                clocks = <&cmu CLK_UART5>;
			
 
				+        };
			
--- a/Documentation/devicetree/bindings/clock/amlogic,gxbb-aoclkc.txt
+++ b/Documentation/devicetree/bindings/clock/amlogic,gxbb-aoclkc.txt
@@ -9,6 +9,7 @@ Required Properties:
 
				 	- GXBB (S905) : "amlogic,meson-gxbb-aoclkc"
			
 
				 	- GXL (S905X, S905D) : "amlogic,meson-gxl-aoclkc"
			
 
				 	- GXM (S912) : "amlogic,meson-gxm-aoclkc"
			
 
				+	- AXG (A113D, A113X) : "amlogic,meson-axg-aoclkc"
			
 
				 	followed by the common "amlogic,meson-gx-aoclkc"
			
 
				 
			
 
				 - #clock-cells: should be 1.
			
--- a/Documentation/devicetree/bindings/clock/amlogic,gxbb-clkc.txt
+++ b/Documentation/devicetree/bindings/clock/amlogic,gxbb-clkc.txt
@@ -10,9 +10,6 @@ Required Properties:
 
				 		"amlogic,gxl-clkc" for GXL and GXM SoC,
			
 
				 		"amlogic,axg-clkc" for AXG SoC.
			
 
				 
			
 
				-- reg: physical base address of the clock controller and length of memory
			
 
				-       mapped region.
			
 
				-
			
 
				 - #clock-cells: should be 1.
			
 
				 
			
 
				 Each clock is assigned an identifier and client nodes can use this identifier
			
@@ -20,13 +17,22 @@ to specify the clock which they consume. All available clocks are defined as
 
				 preprocessor macros in the dt-bindings/clock/gxbb-clkc.h header and can be
			
 
				 used in device tree sources.
			
 
				 
			
 
				+Parent node should have the following properties :
			
 
				+- compatible: "syscon", "simple-mfd", and "amlogic,meson-gx-hhi-sysctrl" or
			
 
				+              "amlogic,meson-axg-hhi-sysctrl"
			
 
				+- reg: base address and size of the HHI system control register space.
			
 
				+
			
 
				 Example: Clock controller node:
			
 
				 
			
 
				-	clkc: clock-controller@c883c000 {
			
 
				+sysctrl: system-controller@0 {
			
 
				+	compatible = "amlogic,meson-gx-hhi-sysctrl", "syscon", "simple-mfd";
			
 
				+	reg = <0 0 0 0x400>;
			
 
				+
			
 
				+	clkc: clock-controller {
			
 
				 		#clock-cells = <1>;
			
 
				 		compatible = "amlogic,gxbb-clkc";
			
 
				-		reg = <0x0 0xc883c000 0x0 0x3db>;
			
 
				 	};
			
 
				+};
			
 
				 
			
 
				 Example: UART controller node that consumes the clock generated by the clock
			
 
				   controller:
			
--- a/Documentation/devicetree/bindings/clock/brcm,iproc-clocks.txt
+++ b/Documentation/devicetree/bindings/clock/brcm,iproc-clocks.txt
@@ -276,36 +276,38 @@ These clock IDs are defined in:
 
				     clk_ts_500_ref	genpll2		2	BCM_SR_GENPLL2_TS_500_REF_CLK
			
 
				     clk_125_nitro	genpll2		3	BCM_SR_GENPLL2_125_NITRO_CLK
			
 
				     clk_chimp		genpll2		4	BCM_SR_GENPLL2_CHIMP_CLK
			
 
				-    clk_nic_flash	genpll2		5	BCM_SR_GENPLL2_NIC_FLASH
			
 
				+    clk_nic_flash	genpll2		5	BCM_SR_GENPLL2_NIC_FLASH_CLK
			
 
				+    clk_fs		genpll2		6	BCM_SR_GENPLL2_FS_CLK
			
 
				 
			
 
				     genpll3		crystal		0	BCM_SR_GENPLL3
			
 
				     clk_hsls		genpll3		1	BCM_SR_GENPLL3_HSLS_CLK
			
 
				     clk_sdio		genpll3		2	BCM_SR_GENPLL3_SDIO_CLK
			
 
				 
			
 
				     genpll4		crystal		0	BCM_SR_GENPLL4
			
 
				-    ccn			genpll4		1	BCM_SR_GENPLL4_CCN_CLK
			
 
				+    clk_ccn		genpll4		1	BCM_SR_GENPLL4_CCN_CLK
			
 
				     clk_tpiu_pll	genpll4		2	BCM_SR_GENPLL4_TPIU_PLL_CLK
			
 
				-    noc_clk		genpll4		3	BCM_SR_GENPLL4_NOC_CLK
			
 
				+    clk_noc		genpll4		3	BCM_SR_GENPLL4_NOC_CLK
			
 
				     clk_chclk_fs4	genpll4		4	BCM_SR_GENPLL4_CHCLK_FS4_CLK
			
 
				     clk_bridge_fscpu	genpll4		5	BCM_SR_GENPLL4_BRIDGE_FSCPU_CLK
			
 
				 
			
 
				-
			
 
				     genpll5		crystal		0	BCM_SR_GENPLL5
			
 
				-    fs4_hf_clk		genpll5		1	BCM_SR_GENPLL5_FS4_HF_CLK
			
 
				-    crypto_ae_clk	genpll5		2	BCM_SR_GENPLL5_CRYPTO_AE_CLK
			
 
				-    raid_ae_clk		genpll5		3	BCM_SR_GENPLL5_RAID_AE_CLK
			
 
				+    clk_fs4_hf		genpll5		1	BCM_SR_GENPLL5_FS4_HF_CLK
			
 
				+    clk_crypto_ae	genpll5		2	BCM_SR_GENPLL5_CRYPTO_AE_CLK
			
 
				+    clk_raid_ae		genpll5		3	BCM_SR_GENPLL5_RAID_AE_CLK
			
 
				 
			
 
				     genpll6		crystal		0	BCM_SR_GENPLL6
			
 
				-    48_usb		genpll6		1	BCM_SR_GENPLL6_48_USB_CLK
			
 
				+    clk_48_usb		genpll6		1	BCM_SR_GENPLL6_48_USB_CLK
			
 
				 
			
 
				     lcpll0		crystal		0	BCM_SR_LCPLL0
			
 
				     clk_sata_refp 	lcpll0		1	BCM_SR_LCPLL0_SATA_REFP_CLK
			
 
				     clk_sata_refn	lcpll0		2	BCM_SR_LCPLL0_SATA_REFN_CLK
			
 
				-    clk_usb_ref		lcpll0		3	BCM_SR_LCPLL0_USB_REF_CLK
			
 
				-    sata_refpn		lcpll0		3	BCM_SR_LCPLL0_SATA_REFPN_CLK
			
 
				+    clk_sata_350	lcpll0		3	BCM_SR_LCPLL0_SATA_350_CLK
			
 
				+    clk_sata_500	lcpll0		4	BCM_SR_LCPLL0_SATA_500_CLK
			
 
				 
			
 
				     lcpll1		crystal		0	BCM_SR_LCPLL1
			
 
				-    wan 		lcpll1		1	BCM_SR_LCPLL0_WAN_CLK
			
 
				+    clk_wan		lcpll1		1	BCM_SR_LCPLL1_WAN_CLK
			
 
				+    clk_usb_ref		lcpll1		2	BCM_SR_LCPLL1_USB_REF_CLK
			
 
				+    clk_crmu_ts		lcpll1		3	BCM_SR_LCPLL1_CRMU_TS_CLK
			
 
				 
			
 
				     lcpll_pcie		crystal		0	BCM_SR_LCPLL_PCIE
			
 
				-    pcie_phy_ref 	lcpll1		1	BCM_SR_LCPLL_PCIE_PHY_REF_CLK
			
 
				+    clk_pcie_phy_ref	lcpll1		1	BCM_SR_LCPLL_PCIE_PHY_REF_CLK
			
--- a/Documentation/devicetree/bindings/clock/nuvoton,npcm750-clk.txt
+++ b/Documentation/devicetree/bindings/clock/nuvoton,npcm750-clk.txt
@@ -0,0 +1,100 @@
 
				+* Nuvoton NPCM7XX Clock Controller
			
 
				+
			
 
				+Nuvoton Poleg BMC NPCM7XX contains an integrated clock controller, which
			
 
				+generates and supplies clocks to all modules within the BMC.
			
 
				+
			
 
				+External clocks:
			
 
				+
			
 
				+There are six fixed clocks that are generated outside the BMC. All clocks are of
			
 
				+a known fixed value that cannot be changed. clk_refclk, clk_mcbypck and
			
 
				+clk_sysbypck are inputs to the clock controller.
			
 
				+clk_rg1refck, clk_rg2refck and clk_xin are external clocks supplying the
			
 
				+network. They are set on the device tree, but not used by the clock module. The
			
 
				+network devices use them directly.
			
 
				+Example can be found below.
			
 
				+
			
 
				+All available clocks are defined as preprocessor macros in:
			
 
				+dt-bindings/clock/nuvoton,npcm7xx-clock.h
			
 
				+and can be reused as DT sources.
			
 
				+
			
 
				+Required Properties of clock controller:
			
 
				+
			
 
				+	- compatible: "nuvoton,npcm750-clk" : for clock controller of Nuvoton
			
 
				+		  Poleg BMC NPCM750
			
 
				+
			
 
				+	- reg: physical base address of the clock controller and length of
			
 
				+		memory mapped region.
			
 
				+
			
 
				+	- #clock-cells: should be 1.
			
 
				+
			
 
				+Example: Clock controller node:
			
 
				+
			
 
				+	clk: clock-controller@f0801000 {
			
 
				+		compatible = "nuvoton,npcm750-clk";
			
 
				+		#clock-cells = <1>;
			
 
				+		reg = <0xf0801000 0x1000>;
			
 
				+		clock-names = "refclk", "sysbypck", "mcbypck";
			
 
				+		clocks = <&clk_refclk>, <&clk_sysbypck>, <&clk_mcbypck>;
			
 
				+	};
			
 
				+
			
 
				+Example: Required external clocks for network:
			
 
				+
			
 
				+	/* external reference clock */
			
 
				+	clk_refclk: clk-refclk {
			
 
				+		compatible = "fixed-clock";
			
 
				+		#clock-cells = <0>;
			
 
				+		clock-frequency = <25000000>;
			
 
				+		clock-output-names = "refclk";
			
 
				+	};
			
 
				+
			
 
				+	/* external reference clock for cpu. float in normal operation */
			
 
				+	clk_sysbypck: clk-sysbypck {
			
 
				+		compatible = "fixed-clock";
			
 
				+		#clock-cells = <0>;
			
 
				+		clock-frequency = <800000000>;
			
 
				+		clock-output-names = "sysbypck";
			
 
				+	};
			
 
				+
			
 
				+	/* external reference clock for MC. float in normal operation */
			
 
				+	clk_mcbypck: clk-mcbypck {
			
 
				+		compatible = "fixed-clock";
			
 
				+		#clock-cells = <0>;
			
 
				+		clock-frequency = <800000000>;
			
 
				+		clock-output-names = "mcbypck";
			
 
				+	};
			
 
				+
			
 
				+	 /* external clock signal rg1refck, supplied by the phy */
			
 
				+	clk_rg1refck: clk-rg1refck {
			
 
				+		compatible = "fixed-clock";
			
 
				+		#clock-cells = <0>;
			
 
				+		clock-frequency = <125000000>;
			
 
				+		clock-output-names = "clk_rg1refck";
			
 
				+	};
			
 
				+
			
 
				+	 /* external clock signal rg2refck, supplied by the phy */
			
 
				+	clk_rg2refck: clk-rg2refck {
			
 
				+		compatible = "fixed-clock";
			
 
				+		#clock-cells = <0>;
			
 
				+		clock-frequency = <125000000>;
			
 
				+		clock-output-names = "clk_rg2refck";
			
 
				+	};
			
 
				+
			
 
				+	clk_xin: clk-xin {
			
 
				+		compatible = "fixed-clock";
			
 
				+		#clock-cells = <0>;
			
 
				+		clock-frequency = <50000000>;
			
 
				+		clock-output-names = "clk_xin";
			
 
				+	};
			
 
				+
			
 
				+
			
 
				+Example: GMAC controller node that consumes two clocks: a generated clk by the
			
 
				+clock controller and a fixed clock from DT (clk_rg1refck).
			
 
				+
			
 
				+	ethernet0: ethernet@f0802000 {
			
 
				+		compatible = "snps,dwmac";
			
 
				+		reg = <0xf0802000 0x2000>;
			
 
				+		interrupts = <0 14 4>;
			
 
				+		interrupt-names = "macirq";
			
 
				+		clocks	= <&clk_rg1refck>, <&clk NPCM7XX_CLK_AHB>;
			
 
				+		clock-names = "stmmaceth", "clk_gmac";
			
 
				+	};
			
--- a/Documentation/devicetree/bindings/clock/qcom,gcc.txt
+++ b/Documentation/devicetree/bindings/clock/qcom,gcc.txt
@@ -17,7 +17,9 @@ Required properties :
 
				 			"qcom,gcc-msm8974pro-ac"
			
 
				 			"qcom,gcc-msm8994"
			
 
				 			"qcom,gcc-msm8996"
			
 
				+			"qcom,gcc-msm8998"
			
 
				 			"qcom,gcc-mdm9615"
			
 
				+			"qcom,gcc-sdm845"
			
 
				 
			
 
				 - reg : shall contain base register location and length
			
 
				 - #clock-cells : shall contain 1
			
--- a/Documentation/devicetree/bindings/clock/qcom,rpmh-clk.txt
+++ b/Documentation/devicetree/bindings/clock/qcom,rpmh-clk.txt
@@ -0,0 +1,22 @@
 
				+Qualcomm Technologies, Inc. RPMh Clocks
			
 
				+-------------------------------------------------------
			
 
				+
			
 
				+Resource Power Manager Hardened (RPMh) manages shared resources on
			
 
				+some Qualcomm Technologies Inc. SoCs. It accepts clock requests from
			
 
				+other hardware subsystems via RSC to control clocks.
			
 
				+
			
 
				+Required properties :
			
 
				+- compatible : shall contain "qcom,sdm845-rpmh-clk"
			
 
				+
			
 
				+- #clock-cells : must contain 1
			
 
				+
			
 
				+Example :
			
 
				+
			
 
				+#include <dt-bindings/clock/qcom,rpmh.h>
			
 
				+
			
 
				+	&apps_rsc {
			
 
				+		rpmhcc: clock-controller {
			
 
				+			compatible = "qcom,sdm845-rpmh-clk";
			
 
				+			#clock-cells = <1>;
			
 
				+		};
			
 
				+	};
			
--- a/Documentation/devicetree/bindings/clock/qcom,videocc.txt
+++ b/Documentation/devicetree/bindings/clock/qcom,videocc.txt
@@ -0,0 +1,19 @@
 
				+Qualcomm Video Clock & Reset Controller Binding
			
 
				+-----------------------------------------------
			
 
				+
			
 
				+Required properties :
			
 
				+- compatible : shall contain "qcom,sdm845-videocc"
			
 
				+- reg : shall contain base register location and length
			
 
				+- #clock-cells : from common clock binding, shall contain 1.
			
 
				+- #power-domain-cells : from generic power domain binding, shall contain 1.
			
 
				+
			
 
				+Optional properties :
			
 
				+- #reset-cells : from common reset binding, shall contain 1.
			
 
				+
			
 
				+Example:
			
 
				+	videocc: clock-controller@ab00000 {
			
 
				+		compatible = "qcom,sdm845-videocc";
			
 
				+		reg = <0xab00000 0x10000>;
			
 
				+		#clock-cells = <1>;
			
 
				+		#power-domain-cells = <1>;
			
 
				+	};
			
--- a/Documentation/devicetree/bindings/clock/renesas,cpg-mssr.txt
+++ b/Documentation/devicetree/bindings/clock/renesas,cpg-mssr.txt
@@ -15,6 +15,7 @@ Required Properties:
 
				   - compatible: Must be one of:
			
 
				       - "renesas,r8a7743-cpg-mssr" for the r8a7743 SoC (RZ/G1M)
			
 
				       - "renesas,r8a7745-cpg-mssr" for the r8a7745 SoC (RZ/G1E)
			
 
				+      - "renesas,r8a77470-cpg-mssr" for the r8a77470 SoC (RZ/G1C)
			
 
				       - "renesas,r8a7790-cpg-mssr" for the r8a7790 SoC (R-Car H2)
			
 
				       - "renesas,r8a7791-cpg-mssr" for the r8a7791 SoC (R-Car M2-W)
			
 
				       - "renesas,r8a7792-cpg-mssr" for the r8a7792 SoC (R-Car V2H)
			
@@ -25,6 +26,7 @@ Required Properties:
 
				       - "renesas,r8a77965-cpg-mssr" for the r8a77965 SoC (R-Car M3-N)
			
 
				       - "renesas,r8a77970-cpg-mssr" for the r8a77970 SoC (R-Car V3M)
			
 
				       - "renesas,r8a77980-cpg-mssr" for the r8a77980 SoC (R-Car V3H)
			
 
				+      - "renesas,r8a77990-cpg-mssr" for the r8a77990 SoC (R-Car E3)
			
 
				       - "renesas,r8a77995-cpg-mssr" for the r8a77995 SoC (R-Car D3)
			
 
				 
			
 
				   - reg: Base address and length of the memory resource used by the CPG/MSSR
			
@@ -33,10 +35,12 @@ Required Properties:
 
				   - clocks: References to external parent clocks, one entry for each entry in
			
 
				     clock-names
			
 
				   - clock-names: List of external parent clock names. Valid names are:
			
 
				-      - "extal" (r8a7743, r8a7745, r8a7790, r8a7791, r8a7792, r8a7793, r8a7794,
			
 
				-		 r8a7795, r8a7796, r8a77965, r8a77970, r8a77980, r8a77995)
			
 
				+      - "extal" (r8a7743, r8a7745, r8a77470, r8a7790, r8a7791, r8a7792,
			
 
				+		 r8a7793, r8a7794, r8a7795, r8a7796, r8a77965, r8a77970,
			
 
				+		 r8a77980, r8a77990, r8a77995)
			
 
				       - "extalr" (r8a7795, r8a7796, r8a77965, r8a77970, r8a77980)
			
 
				-      - "usb_extal" (r8a7743, r8a7745, r8a7790, r8a7791, r8a7793, r8a7794)
			
 
				+      - "usb_extal" (r8a7743, r8a7745, r8a77470, r8a7790, r8a7791, r8a7793,
			
 
				+		     r8a7794)
			
 
				 
			
 
				   - #clock-cells: Must be 2
			
 
				       - For CPG core clocks, the two clock specifier cells must be "CPG_CORE"
			
--- a/Documentation/devicetree/bindings/clock/rockchip.txt
+++ b/Documentation/devicetree/bindings/clock/rockchip.txt
@@ -1,77 +0,0 @@
 
				-Device Tree Clock bindings for arch-rockchip
			
 
				-
			
 
				-This binding uses the common clock binding[1].
			
 
				-
			
 
				-[1] Documentation/devicetree/bindings/clock/clock-bindings.txt
			
 
				-
			
 
				-== Gate clocks ==
			
 
				-
			
 
				-These bindings are deprecated!
			
 
				-Please use the soc specific CRU bindings instead.
			
 
				-
			
 
				-The gate registers form a continuos block which makes the dt node
			
 
				-structure a matter of taste, as either all gates can be put into
			
 
				-one gate clock spanning all registers or they can be divided into
			
 
				-the 10 individual gates containing 16 clocks each.
			
 
				-The code supports both approaches.
			
 
				-
			
 
				-Required properties:
			
 
				-- compatible : "rockchip,rk2928-gate-clk"
			
 
				-- reg : shall be the control register address(es) for the clock.
			
 
				-- #clock-cells : from common clock binding; shall be set to 1
			
 
				-- clock-output-names : the corresponding gate names that the clock controls
			
 
				-- clocks : should contain the parent clock for each individual gate,
			
 
				-  therefore the number of clocks elements should match the number of
			
 
				-  clock-output-names
			
 
				-
			
 
				-Example using multiple gate clocks:
			
 
				-
			
 
				-		clk_gates0: gate-clk@200000d0 {
			
 
				-			compatible = "rockchip,rk2928-gate-clk";
			
 
				-			reg = <0x200000d0 0x4>;
			
 
				-			clocks = <&dummy>, <&dummy>,
			
 
				-				 <&dummy>, <&dummy>,
			
 
				-				 <&dummy>, <&dummy>,
			
 
				-				 <&dummy>, <&dummy>,
			
 
				-				 <&dummy>, <&dummy>,
			
 
				-				 <&dummy>, <&dummy>,
			
 
				-				 <&dummy>, <&dummy>,
			
 
				-				 <&dummy>, <&dummy>;
			
 
				-
			
 
				-			clock-output-names =
			
 
				-				"gate_core_periph", "gate_cpu_gpll",
			
 
				-				"gate_ddrphy", "gate_aclk_cpu",
			
 
				-				"gate_hclk_cpu", "gate_pclk_cpu",
			
 
				-				"gate_atclk_cpu", "gate_i2s0",
			
 
				-				"gate_i2s0_frac", "gate_i2s1",
			
 
				-				"gate_i2s1_frac", "gate_i2s2",
			
 
				-				"gate_i2s2_frac", "gate_spdif",
			
 
				-				"gate_spdif_frac", "gate_testclk";
			
 
				-
			
 
				-			#clock-cells = <1>;
			
 
				-		};
			
 
				-
			
 
				-		clk_gates1: gate-clk@200000d4 {
			
 
				-			compatible = "rockchip,rk2928-gate-clk";
			
 
				-			reg = <0x200000d4 0x4>;
			
 
				-			clocks = <&xin24m>, <&xin24m>,
			
 
				-				 <&xin24m>, <&dummy>,
			
 
				-				 <&dummy>, <&xin24m>,
			
 
				-				 <&xin24m>, <&dummy>,
			
 
				-				 <&xin24m>, <&dummy>,
			
 
				-				 <&xin24m>, <&dummy>,
			
 
				-				 <&xin24m>, <&dummy>,
			
 
				-				 <&xin24m>, <&dummy>;
			
 
				-
			
 
				-			clock-output-names =
			
 
				-				"gate_timer0", "gate_timer1",
			
 
				-				"gate_timer2", "gate_jtag",
			
 
				-				"gate_aclk_lcdc1_src", "gate_otgphy0",
			
 
				-				"gate_otgphy1", "gate_ddr_gpll",
			
 
				-				"gate_uart0", "gate_frac_uart0",
			
 
				-				"gate_uart1", "gate_frac_uart1",
			
 
				-				"gate_uart2", "gate_frac_uart2",
			
 
				-				"gate_uart3", "gate_frac_uart3";
			
 
				-
			
 
				-			#clock-cells = <1>;
			
 
				-		};
			
--- a/Documentation/devicetree/bindings/clock/st/st,clkgen.txt
+++ b/Documentation/devicetree/bindings/clock/st/st,clkgen.txt
@@ -31,10 +31,10 @@ This binding uses the common clock binding[1].
 
				 Each subnode should use the binding described in [2]..[7]
			
 
				 
			
 
				 [1] Documentation/devicetree/bindings/clock/clock-bindings.txt
			
 
				-[3] Documentation/devicetree/bindings/clock/st,clkgen-mux.txt
			
 
				-[4] Documentation/devicetree/bindings/clock/st,clkgen-pll.txt
			
 
				-[7] Documentation/devicetree/bindings/clock/st,quadfs.txt
			
 
				-[8] Documentation/devicetree/bindings/clock/st,flexgen.txt
			
 
				+[3] Documentation/devicetree/bindings/clock/st/st,clkgen-mux.txt
			
 
				+[4] Documentation/devicetree/bindings/clock/st/st,clkgen-pll.txt
			
 
				+[7] Documentation/devicetree/bindings/clock/st/st,quadfs.txt
			
 
				+[8] Documentation/devicetree/bindings/clock/st/st,flexgen.txt
			
 
				 
			
 
				 
			
 
				 Required properties:
			
--- a/Documentation/devicetree/bindings/clock/sunxi-ccu.txt
+++ b/Documentation/devicetree/bindings/clock/sunxi-ccu.txt
@@ -21,6 +21,7 @@ Required properties :
 
				 		- "allwinner,sun50i-a64-r-ccu"
			
 
				 		- "allwinner,sun50i-h5-ccu"
			
 
				 		- "allwinner,sun50i-h6-ccu"
			
 
				+		- "allwinner,sun50i-h6-r-ccu"
			
 
				 		- "nextthing,gr8-ccu"
			
 
				 
			
 
				 - reg: Must contain the registers base address and length
			
@@ -35,7 +36,7 @@ Required properties :
 
				 For the main CCU on H6, one more clock is needed:
			
 
				 - "iosc": the SoC's internal frequency oscillator
			
 
				 
			
 
				-For the PRCM CCUs on A83T/H3/A64, two more clocks are needed:
			
 
				+For the PRCM CCUs on A83T/H3/A64/H6, two more clocks are needed:
			
 
				 - "pll-periph": the SoC's peripheral PLL from the main CCU
			
 
				 - "iosc": the SoC's internal frequency oscillator
			
 
				 
			
--- a/Documentation/devicetree/bindings/clock/ti/gate.txt
+++ b/Documentation/devicetree/bindings/clock/ti/gate.txt
@@ -10,7 +10,7 @@ will be controlled instead and the corresponding hw-ops for
 
				 that is used.
			
 
				 
			
 
				 [1] Documentation/devicetree/bindings/clock/clock-bindings.txt
			
 
				-[2] Documentation/devicetree/bindings/clock/gate-clock.txt
			
 
				+[2] Documentation/devicetree/bindings/clock/gpio-gate-clock.txt
			
 
				 [3] Documentation/devicetree/bindings/clock/ti/clockdomain.txt
			
 
				 
			
 
				 Required properties:
			
--- a/Documentation/devicetree/bindings/clock/ti/interface.txt
+++ b/Documentation/devicetree/bindings/clock/ti/interface.txt
@@ -9,7 +9,7 @@ companion clock finding (match corresponding functional gate
 
				 clock) and hardware autoidle enable / disable.
			
 
				 
			
 
				 [1] Documentation/devicetree/bindings/clock/clock-bindings.txt
			
 
				-[2] Documentation/devicetree/bindings/clock/gate-clock.txt
			
 
				+[2] Documentation/devicetree/bindings/clock/gpio-gate-clock.txt
			
 
				 
			
 
				 Required properties:
			
 
				 - compatible : shall be one of:
			
--- a/Documentation/devicetree/bindings/cpufreq/cpufreq-mediatek.txt
+++ b/Documentation/devicetree/bindings/cpufreq/cpufreq-mediatek.txt
@@ -8,7 +8,7 @@ Required properties:
 
				 	"intermediate"	- A parent of "cpu" clock which is used as "intermediate" clock
			
 
				 			  source (usually MAINPLL) when the original CPU PLL is under
			
 
				 			  transition and not stable yet.
			
 
				-	Please refer to Documentation/devicetree/bindings/clk/clock-bindings.txt for
			
 
				+	Please refer to Documentation/devicetree/bindings/clock/clock-bindings.txt for
			
 
				 	generic clock consumer properties.
			
 
				 - operating-points-v2: Please refer to Documentation/devicetree/bindings/opp/opp.txt
			
 
				 	for detail.
			
--- a/Documentation/devicetree/bindings/devfreq/rk3399_dmc.txt
+++ b/Documentation/devicetree/bindings/devfreq/rk3399_dmc.txt
@@ -12,7 +12,7 @@ Required properties:
 
				 - clocks:		 Phandles for clock specified in "clock-names" property
			
 
				 - clock-names :		 The name of clock used by the DFI, must be
			
 
				 			 "pclk_ddr_mon";
			
 
				-- operating-points-v2:	 Refer to Documentation/devicetree/bindings/power/opp.txt
			
 
				+- operating-points-v2:	 Refer to Documentation/devicetree/bindings/opp/opp.txt
			
 
				 			 for details.
			
 
				 - center-supply:	 DMC supply node.
			
 
				 - status:		 Marks the node enabled/disabled.