8 năm trước cách đây · f57091767a
--- a/Documentation/admin-guide/kernel-parameters.rst
+++ b/Documentation/admin-guide/kernel-parameters.rst
@@ -138,6 +138,7 @@ parameter is applicable::
 
				 	PPT	Parallel port support is enabled.
			
 
				 	PS2	Appropriate PS/2 support is enabled.
			
 
				 	RAM	RAM disk support is enabled.
			
 
				+	RDT	Intel Resource Director Technology.
			
 
				 	S390	S390 architecture is enabled.
			
 
				 	SCSI	Appropriate SCSI support is enabled.
			
 
				 			A lot of drivers have their options described inside
			
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -3612,6 +3612,12 @@
 
				 			Run specified binary instead of /init from the ramdisk,
			
 
				 			used for early userspace startup. See initrd.
			
 
				 
			
 
				+	rdt=		[HW,X86,RDT]
			
 
				+			Turn on/off individual RDT features. List is:
			
 
				+			cmt, mbmtotal, mbmlocal, l3cat, l3cdp, l2cat, mba.
			
 
				+			E.g. to turn on cmt and turn off mba use:
			
 
				+				rdt=cmt,!mba
			
 
				+
			
 
				 	reboot=		[KNL]
			
 
				 			Format (x86 or x86_64):
			
 
				 				[w[arm] | c[old] | h[ard] | s[oft] | g[pio]] \
			
--- a/Documentation/x86/intel_rdt_ui.txt
+++ b/Documentation/x86/intel_rdt_ui.txt
@@ -6,8 +6,8 @@ Fenghua Yu <fenghua.yu@intel.com>
 
				 Tony Luck <tony.luck@intel.com>
			
 
				 Vikas Shivappa <vikas.shivappa@intel.com>
			
 
				 
			
 
				-This feature is enabled by the CONFIG_INTEL_RDT_A Kconfig and the
			
 
				-X86 /proc/cpuinfo flag bits "rdt", "cat_l3" and "cdp_l3".
			
 
				+This feature is enabled by the CONFIG_INTEL_RDT Kconfig and the
			
 
				+X86 /proc/cpuinfo flag bits "rdt", "cqm", "cat_l3" and "cdp_l3".
			
 
				 
			
 
				 To use the feature mount the file system:
			
 
				 
			
@@ -17,6 +17,13 @@ mount options are:
 
				 
			
 
				 "cdp": Enable code/data prioritization in L3 cache allocations.
			
 
				 
			
 
				+RDT features are orthogonal. A particular system may support only
			
 
				+monitoring, only control, or both monitoring and control.
			
 
				+
			
 
				+The mount succeeds if either allocation or monitoring is present, but
			
 
				+only those files and directories supported by the system will be created.
			
 
				+For more details on the behavior of the interface during monitoring
			
 
				+and allocation, see the "Resource alloc and monitor groups" section.
			
 
				 
			
 
				 Info directory
			
 
				 --------------
			
@@ -24,7 +31,12 @@ Info directory
 
				 The 'info' directory contains information about the enabled
			
 
				 resources. Each resource has its own subdirectory. The subdirectory
			
 
				 names reflect the resource names.
			
 
				-Cache resource(L3/L2)  subdirectory contains the following files:
			
 
				+
			
 
				+Each subdirectory contains the following files with respect to
			
 
				+allocation:
			
 
				+
			
 
				+Cache resource(L3/L2)  subdirectory contains the following files
			
 
				+related to allocation:
			
 
				 
			
 
				 "num_closids":  	The number of CLOSIDs which are valid for this
			
 
				 			resource. The kernel uses the smallest number of
			
@@ -36,7 +48,15 @@ Cache resource(L3/L2)  subdirectory contains the following files:
 
				 "min_cbm_bits": 	The minimum number of consecutive bits which
			
 
				 			must be set when writing a mask.
			
 
				 
			
 
				-Memory bandwitdh(MB) subdirectory contains the following files:
			
 
				+"shareable_bits":	Bitmask of shareable resource with other executing
			
 
				+			entities (e.g. I/O). User can use this when
			
 
				+			setting up exclusive cache partitions. Note that
			
 
				+			some platforms support devices that have their
			
 
				+			own settings for cache use which can over-ride
			
 
				+			these bits.
			
 
				+
			
 
				+Memory bandwidth(MB) subdirectory contains the following files
			
 
				+with respect to allocation:
			
 
				 
			
 
				 "min_bandwidth":	The minimum memory bandwidth percentage which
			
 
				 			user can request.
			
@@ -52,48 +72,152 @@ Memory bandwitdh(MB) subdirectory contains the following files:
 
				 			non-linear. This field is purely informational
			
 
				 			only.
			
 
				 
			
 
				-Resource groups
			
 
				----------------
			
 
				+If RDT monitoring is available there will be an "L3_MON" directory
			
 
				+with the following files:
			
 
				+
			
 
				+"num_rmids":		The number of RMIDs available. This is the
			
 
				+			upper bound for how many "CTRL_MON" + "MON"
			
 
				+			groups can be created.
			
 
				+
			
 
				+"mon_features":	Lists the monitoring events if
			
 
				+			monitoring is enabled for the resource.
			
 
				+
			
 
				+"max_threshold_occupancy":
			
 
				+			Read/write file provides the largest value (in
			
 
				+			bytes) at which a previously used LLC_occupancy
			
 
				+			counter can be considered for re-use.
			
 
				+
			
 
				+
			
 
				+Resource alloc and monitor groups
			
 
				+---------------------------------
			
 
				+
			
 
				 Resource groups are represented as directories in the resctrl file
			
 
				-system. The default group is the root directory. Other groups may be
			
 
				-created as desired by the system administrator using the "mkdir(1)"
			
 
				-command, and removed using "rmdir(1)".
			
 
				+system.  The default group is the root directory which, immediately
			
 
				+after mounting, owns all the tasks and cpus in the system and can make
			
 
				+full use of all resources.
			
 
				+
			
 
				+On a system with RDT control features additional directories can be
			
 
				+created in the root directory that specify different amounts of each
			
 
				+resource (see "schemata" below). The root and these additional top level
			
 
				+directories are referred to as "CTRL_MON" groups below.
			
 
				+
			
 
				+On a system with RDT monitoring the root directory and other top level
			
 
				+directories contain a directory named "mon_groups" in which additional
			
 
				+directories can be created to monitor subsets of tasks in the CTRL_MON
			
 
				+group that is their ancestor. These are called "MON" groups in the rest
			
 
				+of this document.
			
 
				+
			
 
				+Removing a directory will move all tasks and cpus owned by the group it
			
 
				+represents to the parent. Removing one of the created CTRL_MON groups
			
 
				+will automatically remove all MON groups below it.
			
 
				+
			
 
				+All groups contain the following files:
			
 
				+
			
 
				+"tasks":
			
 
				+	Reading this file shows the list of all tasks that belong to
			
 
				+	this group. Writing a task id to the file will add a task to the
			
 
				+	group. If the group is a CTRL_MON group the task is removed from
			
 
				+	whichever previous CTRL_MON group owned the task and also from
			
 
				+	any MON group that owned the task. If the group is a MON group,
			
 
				+	then the task must already belong to the CTRL_MON parent of this
			
 
				+	group. The task is removed from any previous MON group.
			
 
				+
			
 
				+
			
 
				+"cpus":
			
 
				+	Reading this file shows a bitmask of the logical CPUs owned by
			
 
				+	this group. Writing a mask to this file will add and remove
			
 
				+	CPUs to/from this group. As with the tasks file a hierarchy is
			
 
				+	maintained where MON groups may only include CPUs owned by the
			
 
				+	parent CTRL_MON group.
			
 
				+
			
 
				 
			
 
				-There are three files associated with each group:
			
 
				+"cpus_list":
			
 
				+	Just like "cpus", only using ranges of CPUs instead of bitmasks.
			
 
				 
			
 
				-"tasks": A list of tasks that belongs to this group. Tasks can be
			
 
				-	added to a group by writing the task ID to the "tasks" file
			
 
				-	(which will automatically remove them from the previous
			
 
				-	group to which they belonged). New tasks created by fork(2)
			
 
				-	and clone(2) are added to the same group as their parent.
			
 
				-	If a pid is not in any sub partition, it is in root partition
			
 
				-	(i.e. default partition).
			
 
				 
			
 
				-"cpus": A bitmask of logical CPUs assigned to this group. Writing
			
 
				-	a new mask can add/remove CPUs from this group. Added CPUs
			
 
				-	are removed from their previous group. Removed ones are
			
 
				-	given to the default (root) group. You cannot remove CPUs
			
 
				-	from the default group.
			
 
				+When control is enabled all CTRL_MON groups will also contain:
			
 
				 
			
 
				-"cpus_list": One or more CPU ranges of logical CPUs assigned to this
			
 
				-	     group. Same rules apply like for the "cpus" file.
			
 
				+"schemata":
			
 
				+	A list of all the resources available to this group.
			
 
				+	Each resource has its own line and format - see below for details.
			
 
				 
			
 
				-"schemata": A list of all the resources available to this group.
			
 
				-	Each resource has its own line and format - see below for
			
 
				-	details.
			
 
				+When monitoring is enabled all MON groups will also contain:
			
 
				 
			
 
				-When a task is running the following rules define which resources
			
 
				-are available to it:
			
 
				+"mon_data":
			
 
				+	This contains a set of files organized by L3 domain and by
			
 
				+	RDT event. E.g. on a system with two L3 domains there will
			
 
				+	be subdirectories "mon_L3_00" and "mon_L3_01".	Each of these
			
 
				+	directories has one file per event (e.g. "llc_occupancy",
			
 
				+	"mbm_total_bytes", and "mbm_local_bytes"). In a MON group these
			
 
				+	files provide a read out of the current value of the event for
			
 
				+	all tasks in the group. In CTRL_MON groups these files provide
			
 
				+	the sum for all tasks in the CTRL_MON group and all tasks in
			
 
				+	MON groups. Please see example section for more details on usage.
			
 
				+
			
 
				+Resource allocation rules
			
 
				+-------------------------
			
 
				+When a task is running the following rules define which resources are
			
 
				+available to it:
			
 
				 
			
 
				 1) If the task is a member of a non-default group, then the schemata
			
 
				-for that group is used.
			
 
				+   for that group is used.
			
 
				 
			
 
				 2) Else if the task belongs to the default group, but is running on a
			
 
				-CPU that is assigned to some specific group, then the schemata for
			
 
				-the CPU's group is used.
			
 
				+   CPU that is assigned to some specific group, then the schemata for the
			
 
				+   CPU's group is used.
			
 
				 
			
 
				 3) Otherwise the schemata for the default group is used.
			
 
				 
			
 
				+Resource monitoring rules
			
 
				+-------------------------
			
 
				+1) If a task is a member of a MON group, or non-default CTRL_MON group
			
 
				+   then RDT events for the task will be reported in that group.
			
 
				+
			
 
				+2) If a task is a member of the default CTRL_MON group, but is running
			
 
				+   on a CPU that is assigned to some specific group, then the RDT events
			
 
				+   for the task will be reported in that group.
			
 
				+
			
 
				+3) Otherwise RDT events for the task will be reported in the root level
			
 
				+   "mon_data" group.
			
 
				+
			
 
				+
			
 
				+Notes on cache occupancy monitoring and control
			
 
				+-----------------------------------------------
			
 
				+When moving a task from one group to another you should remember that
			
 
				+this only affects *new* cache allocations by the task. E.g. you may have
			
 
				+a task in a monitor group showing 3 MB of cache occupancy. If you move
			
 
				+to a new group and immediately check the occupancy of the old and new
			
 
				+groups you will likely see that the old group is still showing 3 MB and
			
 
				+the new group zero. When the task accesses locations still in cache from
			
 
				+before the move, the h/w does not update any counters. On a busy system
			
 
				+you will likely see the occupancy in the old group go down as cache lines
			
 
				+are evicted and re-used while the occupancy in the new group rises as
			
 
				+the task accesses memory and loads into the cache are counted based on
			
 
				+membership in the new group.
			
 
				+
			
 
				+The same applies to cache allocation control. Moving a task to a group
			
 
				+with a smaller cache partition will not evict any cache lines. The
			
 
				+process may continue to use them from the old partition.
			
 
				+
			
 
				+Hardware uses CLOSid(Class of service ID) and an RMID(Resource monitoring ID)
			
 
				+to identify a control group and a monitoring group respectively. Each of
			
 
				+the resource groups are mapped to these IDs based on the kind of group. The
			
 
				+number of CLOSid and RMID are limited by the hardware and hence the creation of
			
 
				+a "CTRL_MON" directory may fail if we run out of either CLOSID or RMID
			
 
				+and creation of "MON" group may fail if we run out of RMIDs.
			
 
				+
			
 
				+max_threshold_occupancy - generic concepts
			
 
				+------------------------------------------
			
 
				+
			
 
				+Note that an RMID once freed may not be immediately available for use as
			
 
				+the RMID is still tagged to the cache lines of the previous user of RMID.
			
 
				+Hence such RMIDs are placed on limbo list and checked back if the cache
			
 
				+occupancy has gone down. If there is a time when system has a lot of
			
 
				+limbo RMIDs but which are not ready to be used, user may see an -EBUSY
			
 
				+during mkdir.
			
 
				+
			
 
				+max_threshold_occupancy is a user configurable value to determine the
			
 
				+occupancy at which an RMID can be freed.
			
 
				 
			
 
				 Schemata files - general concepts
			
 
				 ---------------------------------
			
@@ -143,22 +267,22 @@ SKUs. Using a high bandwidth and a low bandwidth setting on two threads
 
				 sharing a core will result in both threads being throttled to use the
			
 
				 low bandwidth.
			
 
				 
			
 
				-L3 details (code and data prioritization disabled)
			
 
				---------------------------------------------------
			
 
				+L3 schemata file details (code and data prioritization disabled)
			
 
				+----------------------------------------------------------------
			
 
				 With CDP disabled the L3 schemata format is:
			
 
				 
			
 
				 	L3:<cache_id0>=<cbm>;<cache_id1>=<cbm>;...
			
 
				 
			
 
				-L3 details (CDP enabled via mount option to resctrl)
			
 
				-----------------------------------------------------
			
 
				+L3 schemata file details (CDP enabled via mount option to resctrl)
			
 
				+------------------------------------------------------------------
			
 
				 When CDP is enabled L3 control is split into two separate resources
			
 
				 so you can specify independent masks for code and data like this:
			
 
				 
			
 
				 	L3data:<cache_id0>=<cbm>;<cache_id1>=<cbm>;...
			
 
				 	L3code:<cache_id0>=<cbm>;<cache_id1>=<cbm>;...
			
 
				 
			
 
				-L2 details
			
 
				-----------
			
 
				+L2 schemata file details
			
 
				+------------------------
			
 
				 L2 cache does not support code and data prioritization, so the
			
 
				 schemata format is always:
			
 
				 
			
@@ -185,6 +309,8 @@ L3CODE:0=fffff;1=fffff;2=fffff;3=fffff
 
				 L3DATA:0=fffff;1=fffff;2=3c0;3=fffff
			
 
				 L3CODE:0=fffff;1=fffff;2=fffff;3=fffff
			
 
				 
			
 
				+Examples for RDT allocation usage:
			
 
				+
			
 
				 Example 1
			
 
				 ---------
			
 
				 On a two socket machine (one L3 cache per socket) with just four bits
			
@@ -410,3 +536,124 @@ void main(void)
 
				 	/* code to read and write directory contents */
			
 
				 	resctrl_release_lock(fd);
			
 
				 }
			
 
				+
			
 
				+Examples for RDT Monitoring along with allocation usage:
			
 
				+
			
 
				+Reading monitored data
			
 
				+----------------------
			
 
				+Reading an event file (for ex: mon_data/mon_L3_00/llc_occupancy) would
			
 
				+show the current snapshot of LLC occupancy of the corresponding MON
			
 
				+group or CTRL_MON group.
			
 
				+
			
 
				+
			
 
				+Example 1 (Monitor CTRL_MON group and subset of tasks in CTRL_MON group)
			
 
				+---------
			
 
				+On a two socket machine (one L3 cache per socket) with just four bits
			
 
				+for cache bit masks
			
 
				+
			
 
				+# mount -t resctrl resctrl /sys/fs/resctrl
			
 
				+# cd /sys/fs/resctrl
			
 
				+# mkdir p0 p1
			
 
				+# echo "L3:0=3;1=c" > /sys/fs/resctrl/p0/schemata
			
 
				+# echo "L3:0=3;1=3" > /sys/fs/resctrl/p1/schemata
			
 
				+# echo 5678 > p1/tasks
			
 
				+# echo 5679 > p1/tasks
			
 
				+
			
 
				+The default resource group is unmodified, so we have access to all parts
			
 
				+of all caches (its schemata file reads "L3:0=f;1=f").
			
 
				+
			
 
				+Tasks that are under the control of group "p0" may only allocate from the
			
 
				+"lower" 50% on cache ID 0, and the "upper" 50% of cache ID 1.
			
 
				+Tasks in group "p1" use the "lower" 50% of cache on both sockets.
			
 
				+
			
 
				+Create monitor groups and assign a subset of tasks to each monitor group.
			
 
				+
			
 
				+# cd /sys/fs/resctrl/p1/mon_groups
			
 
				+# mkdir m11 m12
			
 
				+# echo 5678 > m11/tasks
			
 
				+# echo 5679 > m12/tasks
			
 
				+
			
 
				+fetch data (data shown in bytes)
			
 
				+
			
 
				+# cat m11/mon_data/mon_L3_00/llc_occupancy
			
 
				+16234000
			
 
				+# cat m11/mon_data/mon_L3_01/llc_occupancy
			
 
				+14789000
			
 
				+# cat m12/mon_data/mon_L3_00/llc_occupancy
			
 
				+16789000
			
 
				+
			
 
				+The parent CTRL_MON group shows the aggregated data.
			
 
				+
			
 
				+# cat /sys/fs/resctrl/p1/mon_data/mon_L3_00/llc_occupancy
			
 
				+31234000
			
 
				+
			
 
				+Example 2 (Monitor a task from its creation)
			
 
				+---------
			
 
				+On a two socket machine (one L3 cache per socket)
			
 
				+
			
 
				+# mount -t resctrl resctrl /sys/fs/resctrl
			
 
				+# cd /sys/fs/resctrl
			
 
				+# mkdir p0 p1
			
 
				+
			
 
				+An RMID is allocated to the group once it's created and hence the <cmd>
			
 
				+below is monitored from its creation.
			
 
				+
			
 
				+# echo $$ > /sys/fs/resctrl/p1/tasks
			
 
				+# <cmd>
			
 
				+
			
 
				+Fetch the data
			
 
				+
			
 
				+# cat /sys/fs/resctrl/p1/mon_data/mon_L3_00/llc_occupancy
			
 
				+31789000
			
 
				+
			
 
				+Example 3 (Monitor without CAT support or before creating CAT groups)
			
 
				+---------
			
 
				+
			
 
				+Assume a system like HSW has only CQM and no CAT support. In this case
			
 
				+the resctrl will still mount but cannot create CTRL_MON directories.
			
 
				+But user can create different MON groups within the root group thereby
			
 
				+able to monitor all tasks including kernel threads.
			
 
				+
			
 
				+This can also be used to profile jobs' cache size footprint before being
			
 
				+able to allocate them to different allocation groups.
			
 
				+
			
 
				+# mount -t resctrl resctrl /sys/fs/resctrl
			
 
				+# cd /sys/fs/resctrl
			
 
				+# mkdir mon_groups/m01
			
 
				+# mkdir mon_groups/m02
			
 
				+
			
 
				+# echo 3478 > /sys/fs/resctrl/mon_groups/m01/tasks
			
 
				+# echo 2467 > /sys/fs/resctrl/mon_groups/m02/tasks
			
 
				+
			
 
				+Monitor the groups separately and also get per domain data. From the
			
 
				+below it's apparent that the tasks are mostly doing work on
			
 
				+domain(socket) 0.
			
 
				+
			
 
				+# cat /sys/fs/resctrl/mon_groups/m01/mon_data/mon_L3_00/llc_occupancy
			
 
				+31234000
			
 
				+# cat /sys/fs/resctrl/mon_groups/m01/mon_data/mon_L3_01/llc_occupancy
			
 
				+34555
			
 
				+# cat /sys/fs/resctrl/mon_groups/m02/mon_data/mon_L3_00/llc_occupancy
			
 
				+31234000
			
 
				+# cat /sys/fs/resctrl/mon_groups/m02/mon_data/mon_L3_01/llc_occupancy
			
 
				+32789
			
 
				+
			
 
				+
			
 
				+Example 4 (Monitor real time tasks)
			
 
				+-----------------------------------
			
 
				+
			
 
				+A single socket system which has real time tasks running on cores 4-7
			
 
				+and non real time tasks on other cpus. We want to monitor the cache
			
 
				+occupancy of the real time threads on these cores.
			
 
				+
			
 
				+# mount -t resctrl resctrl /sys/fs/resctrl
			
 
				+# cd /sys/fs/resctrl
			
 
				+# mkdir p1
			
 
				+
			
 
				+Move the cpus 4-7 over to p1
			
 
				+# echo f0 > p1/cpus
			
 
				+
			
 
				+View the llc occupancy snapshot
			
 
				+
			
 
				+# cat /sys/fs/resctrl/p1/mon_data/mon_L3_00/llc_occupancy
			
 
				+11234000
			
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -11121,7 +11121,7 @@ M:	Fenghua Yu <fenghua.yu@intel.com>
 
				 L:	linux-kernel@vger.kernel.org
			
 
				 S:	Supported
			
 
				 F:	arch/x86/kernel/cpu/intel_rdt*
			
 
				-F:	arch/x86/include/asm/intel_rdt*
			
 
				+F:	arch/x86/include/asm/intel_rdt_sched.h
			
 
				 F:	Documentation/x86/intel_rdt*
			
 
				 
			
 
				 READ-COPY UPDATE (RCU)
			
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -429,16 +429,16 @@ config GOLDFISH
 
				        def_bool y
			
 
				        depends on X86_GOLDFISH
			
 
				 
			
 
				-config INTEL_RDT_A
			
 
				-	bool "Intel Resource Director Technology Allocation support"
			
 
				+config INTEL_RDT
			
 
				+	bool "Intel Resource Director Technology support"
			
 
				 	default n
			
 
				 	depends on X86 && CPU_SUP_INTEL
			
 
				 	select KERNFS
			
 
				 	help
			
 
				-	  Select to enable resource allocation which is a sub-feature of
			
 
				-	  Intel Resource Director Technology(RDT). More information about
			
 
				-	  RDT can be found in the Intel x86 Architecture Software
			
 
				-	  Developer Manual.
			
 
				+	  Select to enable resource allocation and monitoring which are
			
 
				+	  sub-features of Intel Resource Director Technology(RDT). More
			
 
				+	  information about RDT can be found in the Intel x86
			
 
				+	  Architecture Software Developer Manual.
			
 
				 
			
 
				 	  Say N if unsure.
			
 
				 
			
--- a/arch/x86/events/intel/Makefile
+++ b/arch/x86/events/intel/Makefile
@@ -1,4 +1,4 @@
 
				-obj-$(CONFIG_CPU_SUP_INTEL)		+= core.o bts.o cqm.o
			
 
				+obj-$(CONFIG_CPU_SUP_INTEL)		+= core.o bts.o
			
 
				 obj-$(CONFIG_CPU_SUP_INTEL)		+= ds.o knc.o
			
 
				 obj-$(CONFIG_CPU_SUP_INTEL)		+= lbr.o p4.o p6.o pt.o
			
 
				 obj-$(CONFIG_PERF_EVENTS_INTEL_RAPL)	+= intel-rapl-perf.o
			
--- a/arch/x86/events/intel/cqm.c
+++ b/arch/x86/events/intel/cqm.c
@@ -1,1766 +0,0 @@
 
				-/*
			
 
				- * Intel Cache Quality-of-Service Monitoring (CQM) support.
			
 
				- *
			
 
				- * Based very, very heavily on work by Peter Zijlstra.
			
 
				- */
			
 
				-
			
 
				-#include <linux/perf_event.h>
			
 
				-#include <linux/slab.h>
			
 
				-#include <asm/cpu_device_id.h>
			
 
				-#include <asm/intel_rdt_common.h>
			
 
				-#include "../perf_event.h"
			
 
				-
			
 
				-#define MSR_IA32_QM_CTR		0x0c8e
			
 
				-#define MSR_IA32_QM_EVTSEL	0x0c8d
			
 
				-
			
 
				-#define MBM_CNTR_WIDTH		24
			
 
				-/*
			
 
				- * Guaranteed time in ms as per SDM where MBM counters will not overflow.
			
 
				- */
			
 
				-#define MBM_CTR_OVERFLOW_TIME	1000
			
 
				-
			
 
				-static u32 cqm_max_rmid = -1;
			
 
				-static unsigned int cqm_l3_scale; /* supposedly cacheline size */
			
 
				-static bool cqm_enabled, mbm_enabled;
			
 
				-unsigned int mbm_socket_max;
			
 
				-
			
 
				-/*
			
 
				- * The cached intel_pqr_state is strictly per CPU and can never be
			
 
				- * updated from a remote CPU. Both functions which modify the state
			
 
				- * (intel_cqm_event_start and intel_cqm_event_stop) are called with
			
 
				- * interrupts disabled, which is sufficient for the protection.
			
 
				- */
			
 
				-DEFINE_PER_CPU(struct intel_pqr_state, pqr_state);
			
 
				-static struct hrtimer *mbm_timers;
			
 
				-/**
			
 
				- * struct sample - mbm event's (local or total) data
			
 
				- * @total_bytes    #bytes since we began monitoring
			
 
				- * @prev_msr       previous value of MSR
			
 
				- */
			
 
				-struct sample {
			
 
				-	u64	total_bytes;
			
 
				-	u64	prev_msr;
			
 
				-};
			
 
				-
			
 
				-/*
			
 
				- * samples profiled for total memory bandwidth type events
			
 
				- */
			
 
				-static struct sample *mbm_total;
			
 
				-/*
			
 
				- * samples profiled for local memory bandwidth type events
			
 
				- */
			
 
				-static struct sample *mbm_local;
			
 
				-
			
 
				-#define pkg_id	topology_physical_package_id(smp_processor_id())
			
 
				-/*
			
 
				- * rmid_2_index returns the index for the rmid in mbm_local/mbm_total array.
			
 
				- * mbm_total[] and mbm_local[] are linearly indexed by socket# * max number of
			
 
				- * rmids per socket, an example is given below
			
 
				- * RMID1 of Socket0:  vrmid =  1
			
 
				- * RMID1 of Socket1:  vrmid =  1 * (cqm_max_rmid + 1) + 1
			
 
				- * RMID1 of Socket2:  vrmid =  2 * (cqm_max_rmid + 1) + 1
			
 
				- */
			
 
				-#define rmid_2_index(rmid)  ((pkg_id * (cqm_max_rmid + 1)) + rmid)
			
 
				-/*
			
 
				- * Protects cache_cgroups and cqm_rmid_free_lru and cqm_rmid_limbo_lru.
			
 
				- * Also protects event->hw.cqm_rmid
			
 
				- *
			
 
				- * Hold either for stability, both for modification of ->hw.cqm_rmid.
			
 
				- */
			
 
				-static DEFINE_MUTEX(cache_mutex);
			
 
				-static DEFINE_RAW_SPINLOCK(cache_lock);
			
 
				-
			
 
				-/*
			
 
				- * Groups of events that have the same target(s), one RMID per group.
			
 
				- */
			
 
				-static LIST_HEAD(cache_groups);
			
 
				-
			
 
				-/*
			
 
				- * Mask of CPUs for reading CQM values. We only need one per-socket.
			
 
				- */
			
 
				-static cpumask_t cqm_cpumask;
			
 
				-
			
 
				-#define RMID_VAL_ERROR		(1ULL << 63)
			
 
				-#define RMID_VAL_UNAVAIL	(1ULL << 62)
			
 
				-
			
 
				-/*
			
 
				- * Event IDs are used to program IA32_QM_EVTSEL before reading event
			
 
				- * counter from IA32_QM_CTR
			
 
				- */
			
 
				-#define QOS_L3_OCCUP_EVENT_ID	0x01
			
 
				-#define QOS_MBM_TOTAL_EVENT_ID	0x02
			
 
				-#define QOS_MBM_LOCAL_EVENT_ID	0x03
			
 
				-
			
 
				-/*
			
 
				- * This is central to the rotation algorithm in __intel_cqm_rmid_rotate().
			
 
				- *
			
 
				- * This rmid is always free and is guaranteed to have an associated
			
 
				- * near-zero occupancy value, i.e. no cachelines are tagged with this
			
 
				- * RMID, once __intel_cqm_rmid_rotate() returns.
			
 
				- */
			
 
				-static u32 intel_cqm_rotation_rmid;
			
 
				-
			
 
				-#define INVALID_RMID		(-1)
			
 
				-
			
 
				-/*
			
 
				- * Is @rmid valid for programming the hardware?
			
 
				- *
			
 
				- * rmid 0 is reserved by the hardware for all non-monitored tasks, which
			
 
				- * means that we should never come across an rmid with that value.
			
 
				- * Likewise, an rmid value of -1 is used to indicate "no rmid currently
			
 
				- * assigned" and is used as part of the rotation code.
			
 
				- */
			
 
				-static inline bool __rmid_valid(u32 rmid)
			
 
				-{
			
 
				-	if (!rmid || rmid == INVALID_RMID)
			
 
				-		return false;
			
 
				-
			
 
				-	return true;
			
 
				-}
			
 
				-
			
 
				-static u64 __rmid_read(u32 rmid)
			
 
				-{
			
 
				-	u64 val;
			
 
				-
			
 
				-	/*
			
 
				-	 * Ignore the SDM, this thing is _NOTHING_ like a regular perfcnt,
			
 
				-	 * it just says that to increase confusion.
			
 
				-	 */
			
 
				-	wrmsr(MSR_IA32_QM_EVTSEL, QOS_L3_OCCUP_EVENT_ID, rmid);
			
 
				-	rdmsrl(MSR_IA32_QM_CTR, val);
			
 
				-
			
 
				-	/*
			
 
				-	 * Aside from the ERROR and UNAVAIL bits, assume this thing returns
			
 
				-	 * the number of cachelines tagged with @rmid.
			
 
				-	 */
			
 
				-	return val;
			
 
				-}
			
 
				-
			
 
				-enum rmid_recycle_state {
			
 
				-	RMID_YOUNG = 0,
			
 
				-	RMID_AVAILABLE,
			
 
				-	RMID_DIRTY,
			
 
				-};
			
 
				-
			
 
				-struct cqm_rmid_entry {
			
 
				-	u32 rmid;
			
 
				-	enum rmid_recycle_state state;
			
 
				-	struct list_head list;
			
 
				-	unsigned long queue_time;
			
 
				-};
			
 
				-
			
 
				-/*
			
 
				- * cqm_rmid_free_lru - A least recently used list of RMIDs.
			
 
				- *
			
 
				- * Oldest entry at the head, newest (most recently used) entry at the
			
 
				- * tail. This list is never traversed, it's only used to keep track of
			
 
				- * the lru order. That is, we only pick entries of the head or insert
			
 
				- * them on the tail.
			
 
				- *
			
 
				- * All entries on the list are 'free', and their RMIDs are not currently
			
 
				- * in use. To mark an RMID as in use, remove its entry from the lru
			
 
				- * list.
			
 
				- *
			
 
				- *
			
 
				- * cqm_rmid_limbo_lru - list of currently unused but (potentially) dirty RMIDs.
			
 
				- *
			
 
				- * This list is contains RMIDs that no one is currently using but that
			
 
				- * may have a non-zero occupancy value associated with them. The
			
 
				- * rotation worker moves RMIDs from the limbo list to the free list once
			
 
				- * the occupancy value drops below __intel_cqm_threshold.
			
 
				- *
			
 
				- * Both lists are protected by cache_mutex.
			
 
				- */
			
 
				-static LIST_HEAD(cqm_rmid_free_lru);
			
 
				-static LIST_HEAD(cqm_rmid_limbo_lru);
			
 
				-
			
 
				-/*
			
 
				- * We use a simple array of pointers so that we can lookup a struct
			
 
				- * cqm_rmid_entry in O(1). This alleviates the callers of __get_rmid()
			
 
				- * and __put_rmid() from having to worry about dealing with struct
			
 
				- * cqm_rmid_entry - they just deal with rmids, i.e. integers.
			
 
				- *
			
 
				- * Once this array is initialized it is read-only. No locks are required
			
 
				- * to access it.
			
 
				- *
			
 
				- * All entries for all RMIDs can be looked up in the this array at all
			
 
				- * times.
			
 
				- */
			
 
				-static struct cqm_rmid_entry **cqm_rmid_ptrs;
			
 
				-
			
 
				-static inline struct cqm_rmid_entry *__rmid_entry(u32 rmid)
			
 
				-{
			
 
				-	struct cqm_rmid_entry *entry;
			
 
				-
			
 
				-	entry = cqm_rmid_ptrs[rmid];
			
 
				-	WARN_ON(entry->rmid != rmid);
			
 
				-
			
 
				-	return entry;
			
 
				-}
			
 
				-
			
 
				-/*
			
 
				- * Returns < 0 on fail.
			
 
				- *
			
 
				- * We expect to be called with cache_mutex held.
			
 
				- */
			
 
				-static u32 __get_rmid(void)
			
 
				-{
			
 
				-	struct cqm_rmid_entry *entry;
			
 
				-
			
 
				-	lockdep_assert_held(&cache_mutex);
			
 
				-
			
 
				-	if (list_empty(&cqm_rmid_free_lru))
			
 
				-		return INVALID_RMID;
			
 
				-
			
 
				-	entry = list_first_entry(&cqm_rmid_free_lru, struct cqm_rmid_entry, list);
			
 
				-	list_del(&entry->list);
			
 
				-
			
 
				-	return entry->rmid;
			
 
				-}
			
 
				-
			
 
				-static void __put_rmid(u32 rmid)
			
 
				-{
			
 
				-	struct cqm_rmid_entry *entry;
			
 
				-
			
 
				-	lockdep_assert_held(&cache_mutex);
			
 
				-
			
 
				-	WARN_ON(!__rmid_valid(rmid));
			
 
				-	entry = __rmid_entry(rmid);
			
 
				-
			
 
				-	entry->queue_time = jiffies;
			
 
				-	entry->state = RMID_YOUNG;
			
 
				-
			
 
				-	list_add_tail(&entry->list, &cqm_rmid_limbo_lru);
			
 
				-}
			
 
				-
			
 
				-static void cqm_cleanup(void)
			
 
				-{
			
 
				-	int i;
			
 
				-
			
 
				-	if (!cqm_rmid_ptrs)
			
 
				-		return;
			
 
				-
			
 
				-	for (i = 0; i < cqm_max_rmid; i++)
			
 
				-		kfree(cqm_rmid_ptrs[i]);
			
 
				-
			
 
				-	kfree(cqm_rmid_ptrs);
			
 
				-	cqm_rmid_ptrs = NULL;
			
 
				-	cqm_enabled = false;
			
 
				-}
			
 
				-
			
 
				-static int intel_cqm_setup_rmid_cache(void)
			
 
				-{
			
 
				-	struct cqm_rmid_entry *entry;
			
 
				-	unsigned int nr_rmids;
			
 
				-	int r = 0;
			
 
				-
			
 
				-	nr_rmids = cqm_max_rmid + 1;
			
 
				-	cqm_rmid_ptrs = kzalloc(sizeof(struct cqm_rmid_entry *) *
			
 
				-				nr_rmids, GFP_KERNEL);
			
 
				-	if (!cqm_rmid_ptrs)
			
 
				-		return -ENOMEM;
			
 
				-
			
 
				-	for (; r <= cqm_max_rmid; r++) {
			
 
				-		struct cqm_rmid_entry *entry;
			
 
				-
			
 
				-		entry = kmalloc(sizeof(*entry), GFP_KERNEL);
			
 
				-		if (!entry)
			
 
				-			goto fail;
			
 
				-
			
 
				-		INIT_LIST_HEAD(&entry->list);
			
 
				-		entry->rmid = r;
			
 
				-		cqm_rmid_ptrs[r] = entry;
			
 
				-
			
 
				-		list_add_tail(&entry->list, &cqm_rmid_free_lru);
			
 
				-	}
			
 
				-
			
 
				-	/*
			
 
				-	 * RMID 0 is special and is always allocated. It's used for all
			
 
				-	 * tasks that are not monitored.
			
 
				-	 */
			
 
				-	entry = __rmid_entry(0);
			
 
				-	list_del(&entry->list);
			
 
				-
			
 
				-	mutex_lock(&cache_mutex);
			
 
				-	intel_cqm_rotation_rmid = __get_rmid();
			
 
				-	mutex_unlock(&cache_mutex);
			
 
				-
			
 
				-	return 0;
			
 
				-
			
 
				-fail:
			
 
				-	cqm_cleanup();
			
 
				-	return -ENOMEM;
			
 
				-}
			
 
				-
			
 
				-/*
			
 
				- * Determine if @a and @b measure the same set of tasks.
			
 
				- *
			
 
				- * If @a and @b measure the same set of tasks then we want to share a
			
 
				- * single RMID.
			
 
				- */
			
 
				-static bool __match_event(struct perf_event *a, struct perf_event *b)
			
 
				-{
			
 
				-	/* Per-cpu and task events don't mix */
			
 
				-	if ((a->attach_state & PERF_ATTACH_TASK) !=
			
 
				-	    (b->attach_state & PERF_ATTACH_TASK))
			
 
				-		return false;
			
 
				-
			
 
				-#ifdef CONFIG_CGROUP_PERF
			
 
				-	if (a->cgrp != b->cgrp)
			
 
				-		return false;
			
 
				-#endif
			
 
				-
			
 
				-	/* If not task event, we're machine wide */
			
 
				-	if (!(b->attach_state & PERF_ATTACH_TASK))
			
 
				-		return true;
			
 
				-
			
 
				-	/*
			
 
				-	 * Events that target same task are placed into the same cache group.
			
 
				-	 * Mark it as a multi event group, so that we update ->count
			
 
				-	 * for every event rather than just the group leader later.
			
 
				-	 */
			
 
				-	if (a->hw.target == b->hw.target) {
			
 
				-		b->hw.is_group_event = true;
			
 
				-		return true;
			
 
				-	}
			
 
				-
			
 
				-	/*
			
 
				-	 * Are we an inherited event?
			
 
				-	 */
			
 
				-	if (b->parent == a)
			
 
				-		return true;
			
 
				-
			
 
				-	return false;
			
 
				-}
			
 
				-
			
 
				-#ifdef CONFIG_CGROUP_PERF
			
 
				-static inline struct perf_cgroup *event_to_cgroup(struct perf_event *event)
			
 
				-{
			
 
				-	if (event->attach_state & PERF_ATTACH_TASK)
			
 
				-		return perf_cgroup_from_task(event->hw.target, event->ctx);
			
 
				-
			
 
				-	return event->cgrp;
			
 
				-}
			
 
				-#endif
			
 
				-
			
 
				-/*
			
 
				- * Determine if @a's tasks intersect with @b's tasks
			
 
				- *
			
 
				- * There are combinations of events that we explicitly prohibit,
			
 
				- *
			
 
				- *		   PROHIBITS
			
 
				- *     system-wide    -> 	cgroup and task
			
 
				- *     cgroup 	      ->	system-wide
			
 
				- *     		      ->	task in cgroup
			
 
				- *     task 	      -> 	system-wide
			
 
				- *     		      ->	task in cgroup
			
 
				- *
			
 
				- * Call this function before allocating an RMID.
			
 
				- */
			
 
				-static bool __conflict_event(struct perf_event *a, struct perf_event *b)
			
 
				-{
			
 
				-#ifdef CONFIG_CGROUP_PERF
			
 
				-	/*
			
 
				-	 * We can have any number of cgroups but only one system-wide
			
 
				-	 * event at a time.
			
 
				-	 */
			
 
				-	if (a->cgrp && b->cgrp) {
			
 
				-		struct perf_cgroup *ac = a->cgrp;
			
 
				-		struct perf_cgroup *bc = b->cgrp;
			
 
				-
			
 
				-		/*
			
 
				-		 * This condition should have been caught in
			
 
				-		 * __match_event() and we should be sharing an RMID.
			
 
				-		 */
			
 
				-		WARN_ON_ONCE(ac == bc);
			
 
				-
			
 
				-		if (cgroup_is_descendant(ac->css.cgroup, bc->css.cgroup) ||
			
 
				-		    cgroup_is_descendant(bc->css.cgroup, ac->css.cgroup))
			
 
				-			return true;
			
 
				-
			
 
				-		return false;
			
 
				-	}
			
 
				-
			
 
				-	if (a->cgrp || b->cgrp) {
			
 
				-		struct perf_cgroup *ac, *bc;
			
 
				-
			
 
				-		/*
			
 
				-		 * cgroup and system-wide events are mutually exclusive
			
 
				-		 */
			
 
				-		if ((a->cgrp && !(b->attach_state & PERF_ATTACH_TASK)) ||
			
 
				-		    (b->cgrp && !(a->attach_state & PERF_ATTACH_TASK)))
			
 
				-			return true;
			
 
				-
			
 
				-		/*
			
 
				-		 * Ensure neither event is part of the other's cgroup
			
 
				-		 */
			
 
				-		ac = event_to_cgroup(a);
			
 
				-		bc = event_to_cgroup(b);
			
 
				-		if (ac == bc)
			
 
				-			return true;
			
 
				-
			
 
				-		/*
			
 
				-		 * Must have cgroup and non-intersecting task events.
			
 
				-		 */
			
 
				-		if (!ac || !bc)
			
 
				-			return false;
			
 
				-
			
 
				-		/*
			
 
				-		 * We have cgroup and task events, and the task belongs
			
 
				-		 * to a cgroup. Check for for overlap.
			
 
				-		 */
			
 
				-		if (cgroup_is_descendant(ac->css.cgroup, bc->css.cgroup) ||
			
 
				-		    cgroup_is_descendant(bc->css.cgroup, ac->css.cgroup))
			
 
				-			return true;
			
 
				-
			
 
				-		return false;
			
 
				-	}
			
 
				-#endif
			
 
				-	/*
			
 
				-	 * If one of them is not a task, same story as above with cgroups.
			
 
				-	 */
			
 
				-	if (!(a->attach_state & PERF_ATTACH_TASK) ||
			
 
				-	    !(b->attach_state & PERF_ATTACH_TASK))
			
 
				-		return true;
			
 
				-
			
 
				-	/*
			
 
				-	 * Must be non-overlapping.
			
 
				-	 */
			
 
				-	return false;
			
 
				-}
			
 
				-
			
 
				-struct rmid_read {
			
 
				-	u32 rmid;
			
 
				-	u32 evt_type;
			
 
				-	atomic64_t value;
			
 
				-};
			
 
				-
			
 
				-static void __intel_cqm_event_count(void *info);
			
 
				-static void init_mbm_sample(u32 rmid, u32 evt_type);
			
 
				-static void __intel_mbm_event_count(void *info);
			
 
				-
			
 
				-static bool is_cqm_event(int e)
			
 
				-{
			
 
				-	return (e == QOS_L3_OCCUP_EVENT_ID);
			
 
				-}
			
 
				-
			
 
				-static bool is_mbm_event(int e)
			
 
				-{
			
 
				-	return (e >= QOS_MBM_TOTAL_EVENT_ID && e <= QOS_MBM_LOCAL_EVENT_ID);
			
 
				-}
			
 
				-
			
 
				-static void cqm_mask_call(struct rmid_read *rr)
			
 
				-{
			
 
				-	if (is_mbm_event(rr->evt_type))
			
 
				-		on_each_cpu_mask(&cqm_cpumask, __intel_mbm_event_count, rr, 1);
			
 
				-	else
			
 
				-		on_each_cpu_mask(&cqm_cpumask, __intel_cqm_event_count, rr, 1);
			
 
				-}
			
 
				-
			
 
				-/*
			
 
				- * Exchange the RMID of a group of events.
			
 
				- */
			
 
				-static u32 intel_cqm_xchg_rmid(struct perf_event *group, u32 rmid)
			
 
				-{
			
 
				-	struct perf_event *event;
			
 
				-	struct list_head *head = &group->hw.cqm_group_entry;
			
 
				-	u32 old_rmid = group->hw.cqm_rmid;
			
 
				-
			
 
				-	lockdep_assert_held(&cache_mutex);
			
 
				-
			
 
				-	/*
			
 
				-	 * If our RMID is being deallocated, perform a read now.
			
 
				-	 */
			
 
				-	if (__rmid_valid(old_rmid) && !__rmid_valid(rmid)) {
			
 
				-		struct rmid_read rr = {
			
 
				-			.rmid = old_rmid,
			
 
				-			.evt_type = group->attr.config,
			
 
				-			.value = ATOMIC64_INIT(0),
			
 
				-		};
			
 
				-
			
 
				-		cqm_mask_call(&rr);
			
 
				-		local64_set(&group->count, atomic64_read(&rr.value));
			
 
				-	}
			
 
				-
			
 
				-	raw_spin_lock_irq(&cache_lock);
			
 
				-
			
 
				-	group->hw.cqm_rmid = rmid;
			
 
				-	list_for_each_entry(event, head, hw.cqm_group_entry)
			
 
				-		event->hw.cqm_rmid = rmid;
			
 
				-
			
 
				-	raw_spin_unlock_irq(&cache_lock);
			
 
				-
			
 
				-	/*
			
 
				-	 * If the allocation is for mbm, init the mbm stats.
			
 
				-	 * Need to check if each event in the group is mbm event
			
 
				-	 * because there could be multiple type of events in the same group.
			
 
				-	 */
			
 
				-	if (__rmid_valid(rmid)) {
			
 
				-		event = group;
			
 
				-		if (is_mbm_event(event->attr.config))
			
 
				-			init_mbm_sample(rmid, event->attr.config);
			
 
				-
			
 
				-		list_for_each_entry(event, head, hw.cqm_group_entry) {
			
 
				-			if (is_mbm_event(event->attr.config))
			
 
				-				init_mbm_sample(rmid, event->attr.config);
			
 
				-		}
			
 
				-	}
			
 
				-
			
 
				-	return old_rmid;
			
 
				-}
			
 
				-
			
 
				-/*
			
 
				- * If we fail to assign a new RMID for intel_cqm_rotation_rmid because
			
 
				- * cachelines are still tagged with RMIDs in limbo, we progressively
			
 
				- * increment the threshold until we find an RMID in limbo with <=
			
 
				- * __intel_cqm_threshold lines tagged. This is designed to mitigate the
			
 
				- * problem where cachelines tagged with an RMID are not steadily being
			
 
				- * evicted.
			
 
				- *
			
 
				- * On successful rotations we decrease the threshold back towards zero.
			
 
				- *
			
 
				- * __intel_cqm_max_threshold provides an upper bound on the threshold,
			
 
				- * and is measured in bytes because it's exposed to userland.
			
 
				- */
			
 
				-static unsigned int __intel_cqm_threshold;
			
 
				-static unsigned int __intel_cqm_max_threshold;
			
 
				-
			
 
				-/*
			
 
				- * Test whether an RMID has a zero occupancy value on this cpu.
			
 
				- */
			
 
				-static void intel_cqm_stable(void *arg)
			
 
				-{
			
 
				-	struct cqm_rmid_entry *entry;
			
 
				-
			
 
				-	list_for_each_entry(entry, &cqm_rmid_limbo_lru, list) {
			
 
				-		if (entry->state != RMID_AVAILABLE)
			
 
				-			break;
			
 
				-
			
 
				-		if (__rmid_read(entry->rmid) > __intel_cqm_threshold)
			
 
				-			entry->state = RMID_DIRTY;
			
 
				-	}
			
 
				-}
			
 
				-
			
 
				-/*
			
 
				- * If we have group events waiting for an RMID that don't conflict with
			
 
				- * events already running, assign @rmid.
			
 
				- */
			
 
				-static bool intel_cqm_sched_in_event(u32 rmid)
			
 
				-{
			
 
				-	struct perf_event *leader, *event;
			
 
				-
			
 
				-	lockdep_assert_held(&cache_mutex);
			
 
				-
			
 
				-	leader = list_first_entry(&cache_groups, struct perf_event,
			
 
				-				  hw.cqm_groups_entry);
			
 
				-	event = leader;
			
 
				-
			
 
				-	list_for_each_entry_continue(event, &cache_groups,
			
 
				-				     hw.cqm_groups_entry) {
			
 
				-		if (__rmid_valid(event->hw.cqm_rmid))
			
 
				-			continue;
			
 
				-
			
 
				-		if (__conflict_event(event, leader))
			
 
				-			continue;
			
 
				-
			
 
				-		intel_cqm_xchg_rmid(event, rmid);
			
 
				-		return true;
			
 
				-	}
			
 
				-
			
 
				-	return false;
			
 
				-}
			
 
				-
			
 
				-/*
			
 
				- * Initially use this constant for both the limbo queue time and the
			
 
				- * rotation timer interval, pmu::hrtimer_interval_ms.
			
 
				- *
			
 
				- * They don't need to be the same, but the two are related since if you
			
 
				- * rotate faster than you recycle RMIDs, you may run out of available
			
 
				- * RMIDs.
			
 
				- */
			
 
				-#define RMID_DEFAULT_QUEUE_TIME 250	/* ms */
			
 
				-
			
 
				-static unsigned int __rmid_queue_time_ms = RMID_DEFAULT_QUEUE_TIME;
			
 
				-
			
 
				-/*
			
 
				- * intel_cqm_rmid_stabilize - move RMIDs from limbo to free list
			
 
				- * @nr_available: number of freeable RMIDs on the limbo list
			
 
				- *
			
 
				- * Quiescent state; wait for all 'freed' RMIDs to become unused, i.e. no
			
 
				- * cachelines are tagged with those RMIDs. After this we can reuse them
			
 
				- * and know that the current set of active RMIDs is stable.
			
 
				- *
			
 
				- * Return %true or %false depending on whether stabilization needs to be
			
 
				- * reattempted.
			
 
				- *
			
 
				- * If we return %true then @nr_available is updated to indicate the
			
 
				- * number of RMIDs on the limbo list that have been queued for the
			
 
				- * minimum queue time (RMID_AVAILABLE), but whose data occupancy values
			
 
				- * are above __intel_cqm_threshold.
			
 
				- */
			
 
				-static bool intel_cqm_rmid_stabilize(unsigned int *available)
			
 
				-{
			
 
				-	struct cqm_rmid_entry *entry, *tmp;
			
 
				-
			
 
				-	lockdep_assert_held(&cache_mutex);
			
 
				-
			
 
				-	*available = 0;
			
 
				-	list_for_each_entry(entry, &cqm_rmid_limbo_lru, list) {
			
 
				-		unsigned long min_queue_time;
			
 
				-		unsigned long now = jiffies;
			
 
				-
			
 
				-		/*
			
 
				-		 * We hold RMIDs placed into limbo for a minimum queue
			
 
				-		 * time. Before the minimum queue time has elapsed we do
			
 
				-		 * not recycle RMIDs.
			
 
				-		 *
			
 
				-		 * The reasoning is that until a sufficient time has
			
 
				-		 * passed since we stopped using an RMID, any RMID
			
 
				-		 * placed onto the limbo list will likely still have
			
 
				-		 * data tagged in the cache, which means we'll probably
			
 
				-		 * fail to recycle it anyway.
			
 
				-		 *
			
 
				-		 * We can save ourselves an expensive IPI by skipping
			
 
				-		 * any RMIDs that have not been queued for the minimum
			
 
				-		 * time.
			
 
				-		 */
			
 
				-		min_queue_time = entry->queue_time +
			
 
				-			msecs_to_jiffies(__rmid_queue_time_ms);
			
 
				-
			
 
				-		if (time_after(min_queue_time, now))
			
 
				-			break;
			
 
				-
			
 
				-		entry->state = RMID_AVAILABLE;
			
 
				-		(*available)++;
			
 
				-	}
			
 
				-
			
 
				-	/*
			
 
				-	 * Fast return if none of the RMIDs on the limbo list have been
			
 
				-	 * sitting on the queue for the minimum queue time.
			
 
				-	 */
			
 
				-	if (!*available)
			
 
				-		return false;
			
 
				-
			
 
				-	/*
			
 
				-	 * Test whether an RMID is free for each package.
			
 
				-	 */
			
 
				-	on_each_cpu_mask(&cqm_cpumask, intel_cqm_stable, NULL, true);
			
 
				-
			
 
				-	list_for_each_entry_safe(entry, tmp, &cqm_rmid_limbo_lru, list) {
			
 
				-		/*
			
 
				-		 * Exhausted all RMIDs that have waited min queue time.
			
 
				-		 */
			
 
				-		if (entry->state == RMID_YOUNG)
			
 
				-			break;
			
 
				-
			
 
				-		if (entry->state == RMID_DIRTY)
			
 
				-			continue;
			
 
				-
			
 
				-		list_del(&entry->list);	/* remove from limbo */
			
 
				-
			
 
				-		/*
			
 
				-		 * The rotation RMID gets priority if it's
			
 
				-		 * currently invalid. In which case, skip adding
			
 
				-		 * the RMID to the the free lru.
			
 
				-		 */
			
 
				-		if (!__rmid_valid(intel_cqm_rotation_rmid)) {
			
 
				-			intel_cqm_rotation_rmid = entry->rmid;
			
 
				-			continue;
			
 
				-		}
			
 
				-
			
 
				-		/*
			
 
				-		 * If we have groups waiting for RMIDs, hand
			
 
				-		 * them one now provided they don't conflict.
			
 
				-		 */
			
 
				-		if (intel_cqm_sched_in_event(entry->rmid))
			
 
				-			continue;
			
 
				-
			
 
				-		/*
			
 
				-		 * Otherwise place it onto the free list.
			
 
				-		 */
			
 
				-		list_add_tail(&entry->list, &cqm_rmid_free_lru);
			
 
				-	}
			
 
				-
			
 
				-
			
 
				-	return __rmid_valid(intel_cqm_rotation_rmid);
			
 
				-}
			
 
				-
			
 
				-/*
			
 
				- * Pick a victim group and move it to the tail of the group list.
			
 
				- * @next: The first group without an RMID
			
 
				- */
			
 
				-static void __intel_cqm_pick_and_rotate(struct perf_event *next)
			
 
				-{
			
 
				-	struct perf_event *rotor;
			
 
				-	u32 rmid;
			
 
				-
			
 
				-	lockdep_assert_held(&cache_mutex);
			
 
				-
			
 
				-	rotor = list_first_entry(&cache_groups, struct perf_event,
			
 
				-				 hw.cqm_groups_entry);
			
 
				-
			
 
				-	/*
			
 
				-	 * The group at the front of the list should always have a valid
			
 
				-	 * RMID. If it doesn't then no groups have RMIDs assigned and we
			
 
				-	 * don't need to rotate the list.
			
 
				-	 */
			
 
				-	if (next == rotor)
			
 
				-		return;
			
 
				-
			
 
				-	rmid = intel_cqm_xchg_rmid(rotor, INVALID_RMID);
			
 
				-	__put_rmid(rmid);
			
 
				-
			
 
				-	list_rotate_left(&cache_groups);
			
 
				-}
			
 
				-
			
 
				-/*
			
 
				- * Deallocate the RMIDs from any events that conflict with @event, and
			
 
				- * place them on the back of the group list.
			
 
				- */
			
 
				-static void intel_cqm_sched_out_conflicting_events(struct perf_event *event)
			
 
				-{
			
 
				-	struct perf_event *group, *g;
			
 
				-	u32 rmid;
			
 
				-
			
 
				-	lockdep_assert_held(&cache_mutex);
			
 
				-
			
 
				-	list_for_each_entry_safe(group, g, &cache_groups, hw.cqm_groups_entry) {
			
 
				-		if (group == event)
			
 
				-			continue;
			
 
				-
			
 
				-		rmid = group->hw.cqm_rmid;
			
 
				-
			
 
				-		/*
			
 
				-		 * Skip events that don't have a valid RMID.
			
 
				-		 */
			
 
				-		if (!__rmid_valid(rmid))
			
 
				-			continue;
			
 
				-
			
 
				-		/*
			
 
				-		 * No conflict? No problem! Leave the event alone.
			
 
				-		 */
			
 
				-		if (!__conflict_event(group, event))
			
 
				-			continue;
			
 
				-
			
 
				-		intel_cqm_xchg_rmid(group, INVALID_RMID);
			
 
				-		__put_rmid(rmid);
			
 
				-	}
			
 
				-}
			
 
				-
			
 
				-/*
			
 
				- * Attempt to rotate the groups and assign new RMIDs.
			
 
				- *
			
 
				- * We rotate for two reasons,
			
 
				- *   1. To handle the scheduling of conflicting events
			
 
				- *   2. To recycle RMIDs
			
 
				- *
			
 
				- * Rotating RMIDs is complicated because the hardware doesn't give us
			
 
				- * any clues.
			
 
				- *
			
 
				- * There's problems with the hardware interface; when you change the
			
 
				- * task:RMID map cachelines retain their 'old' tags, giving a skewed
			
 
				- * picture. In order to work around this, we must always keep one free
			
 
				- * RMID - intel_cqm_rotation_rmid.
			
 
				- *
			
 
				- * Rotation works by taking away an RMID from a group (the old RMID),
			
 
				- * and assigning the free RMID to another group (the new RMID). We must
			
 
				- * then wait for the old RMID to not be used (no cachelines tagged).
			
 
				- * This ensure that all cachelines are tagged with 'active' RMIDs. At
			
 
				- * this point we can start reading values for the new RMID and treat the
			
 
				- * old RMID as the free RMID for the next rotation.
			
 
				- *
			
 
				- * Return %true or %false depending on whether we did any rotating.
			
 
				- */
			
 
				-static bool __intel_cqm_rmid_rotate(void)
			
 
				-{
			
 
				-	struct perf_event *group, *start = NULL;
			
 
				-	unsigned int threshold_limit;
			
 
				-	unsigned int nr_needed = 0;
			
 
				-	unsigned int nr_available;
			
 
				-	bool rotated = false;
			
 
				-
			
 
				-	mutex_lock(&cache_mutex);
			
 
				-
			
 
				-again:
			
 
				-	/*
			
 
				-	 * Fast path through this function if there are no groups and no
			
 
				-	 * RMIDs that need cleaning.
			
 
				-	 */
			
 
				-	if (list_empty(&cache_groups) && list_empty(&cqm_rmid_limbo_lru))
			
 
				-		goto out;
			
 
				-
			
 
				-	list_for_each_entry(group, &cache_groups, hw.cqm_groups_entry) {
			
 
				-		if (!__rmid_valid(group->hw.cqm_rmid)) {
			
 
				-			if (!start)
			
 
				-				start = group;
			
 
				-			nr_needed++;
			
 
				-		}
			
 
				-	}
			
 
				-
			
 
				-	/*
			
 
				-	 * We have some event groups, but they all have RMIDs assigned
			
 
				-	 * and no RMIDs need cleaning.
			
 
				-	 */
			
 
				-	if (!nr_needed && list_empty(&cqm_rmid_limbo_lru))
			
 
				-		goto out;
			
 
				-
			
 
				-	if (!nr_needed)
			
 
				-		goto stabilize;
			
 
				-
			
 
				-	/*
			
 
				-	 * We have more event groups without RMIDs than available RMIDs,
			
 
				-	 * or we have event groups that conflict with the ones currently
			
 
				-	 * scheduled.
			
 
				-	 *
			
 
				-	 * We force deallocate the rmid of the group at the head of
			
 
				-	 * cache_groups. The first event group without an RMID then gets
			
 
				-	 * assigned intel_cqm_rotation_rmid. This ensures we always make
			
 
				-	 * forward progress.
			
 
				-	 *
			
 
				-	 * Rotate the cache_groups list so the previous head is now the
			
 
				-	 * tail.
			
 
				-	 */
			
 
				-	__intel_cqm_pick_and_rotate(start);
			
 
				-
			
 
				-	/*
			
 
				-	 * If the rotation is going to succeed, reduce the threshold so
			
 
				-	 * that we don't needlessly reuse dirty RMIDs.
			
 
				-	 */
			
 
				-	if (__rmid_valid(intel_cqm_rotation_rmid)) {
			
 
				-		intel_cqm_xchg_rmid(start, intel_cqm_rotation_rmid);
			
 
				-		intel_cqm_rotation_rmid = __get_rmid();
			
 
				-
			
 
				-		intel_cqm_sched_out_conflicting_events(start);
			
 
				-
			
 
				-		if (__intel_cqm_threshold)
			
 
				-			__intel_cqm_threshold--;
			
 
				-	}
			
 
				-
			
 
				-	rotated = true;
			
 
				-
			
 
				-stabilize:
			
 
				-	/*
			
 
				-	 * We now need to stablize the RMID we freed above (if any) to
			
 
				-	 * ensure that the next time we rotate we have an RMID with zero
			
 
				-	 * occupancy value.
			
 
				-	 *
			
 
				-	 * Alternatively, if we didn't need to perform any rotation,
			
 
				-	 * we'll have a bunch of RMIDs in limbo that need stabilizing.
			
 
				-	 */
			
 
				-	threshold_limit = __intel_cqm_max_threshold / cqm_l3_scale;
			
 
				-
			
 
				-	while (intel_cqm_rmid_stabilize(&nr_available) &&
			
 
				-	       __intel_cqm_threshold < threshold_limit) {
			
 
				-		unsigned int steal_limit;
			
 
				-
			
 
				-		/*
			
 
				-		 * Don't spin if nobody is actively waiting for an RMID,
			
 
				-		 * the rotation worker will be kicked as soon as an
			
 
				-		 * event needs an RMID anyway.
			
 
				-		 */
			
 
				-		if (!nr_needed)
			
 
				-			break;
			
 
				-
			
 
				-		/* Allow max 25% of RMIDs to be in limbo. */
			
 
				-		steal_limit = (cqm_max_rmid + 1) / 4;
			
 
				-
			
 
				-		/*
			
 
				-		 * We failed to stabilize any RMIDs so our rotation
			
 
				-		 * logic is now stuck. In order to make forward progress
			
 
				-		 * we have a few options:
			
 
				-		 *
			
 
				-		 *   1. rotate ("steal") another RMID
			
 
				-		 *   2. increase the threshold
			
 
				-		 *   3. do nothing
			
 
				-		 *
			
 
				-		 * We do both of 1. and 2. until we hit the steal limit.
			
 
				-		 *
			
 
				-		 * The steal limit prevents all RMIDs ending up on the
			
 
				-		 * limbo list. This can happen if every RMID has a
			
 
				-		 * non-zero occupancy above threshold_limit, and the
			
 
				-		 * occupancy values aren't dropping fast enough.
			
 
				-		 *
			
 
				-		 * Note that there is prioritisation at work here - we'd
			
 
				-		 * rather increase the number of RMIDs on the limbo list
			
 
				-		 * than increase the threshold, because increasing the
			
 
				-		 * threshold skews the event data (because we reuse
			
 
				-		 * dirty RMIDs) - threshold bumps are a last resort.
			
 
				-		 */
			
 
				-		if (nr_available < steal_limit)
			
 
				-			goto again;
			
 
				-
			
 
				-		__intel_cqm_threshold++;
			
 
				-	}
			
 
				-
			
 
				-out:
			
 
				-	mutex_unlock(&cache_mutex);
			
 
				-	return rotated;
			
 
				-}
			
 
				-
			
 
				-static void intel_cqm_rmid_rotate(struct work_struct *work);
			
 
				-
			
 
				-static DECLARE_DELAYED_WORK(intel_cqm_rmid_work, intel_cqm_rmid_rotate);
			
 
				-
			
 
				-static struct pmu intel_cqm_pmu;
			
 
				-
			
 
				-static void intel_cqm_rmid_rotate(struct work_struct *work)
			
 
				-{
			
 
				-	unsigned long delay;
			
 
				-
			
 
				-	__intel_cqm_rmid_rotate();
			
 
				-
			
 
				-	delay = msecs_to_jiffies(intel_cqm_pmu.hrtimer_interval_ms);
			
 
				-	schedule_delayed_work(&intel_cqm_rmid_work, delay);
			
 
				-}
			
 
				-
			
 
				-static u64 update_sample(unsigned int rmid, u32 evt_type, int first)
			
 
				-{
			
 
				-	struct sample *mbm_current;
			
 
				-	u32 vrmid = rmid_2_index(rmid);
			
 
				-	u64 val, bytes, shift;
			
 
				-	u32 eventid;
			
 
				-
			
 
				-	if (evt_type == QOS_MBM_LOCAL_EVENT_ID) {
			
 
				-		mbm_current = &mbm_local[vrmid];
			
 
				-		eventid     = QOS_MBM_LOCAL_EVENT_ID;
			
 
				-	} else {
			
 
				-		mbm_current = &mbm_total[vrmid];
			
 
				-		eventid     = QOS_MBM_TOTAL_EVENT_ID;
			
 
				-	}
			
 
				-
			
 
				-	wrmsr(MSR_IA32_QM_EVTSEL, eventid, rmid);
			
 
				-	rdmsrl(MSR_IA32_QM_CTR, val);
			
 
				-	if (val & (RMID_VAL_ERROR | RMID_VAL_UNAVAIL))
			
 
				-		return mbm_current->total_bytes;
			
 
				-
			
 
				-	if (first) {
			
 
				-		mbm_current->prev_msr = val;
			
 
				-		mbm_current->total_bytes = 0;
			
 
				-		return mbm_current->total_bytes;
			
 
				-	}
			
 
				-
			
 
				-	/*
			
 
				-	 * The h/w guarantees that counters will not overflow
			
 
				-	 * so long as we poll them at least once per second.
			
 
				-	 */
			
 
				-	shift = 64 - MBM_CNTR_WIDTH;
			
 
				-	bytes = (val << shift) - (mbm_current->prev_msr << shift);
			
 
				-	bytes >>= shift;
			
 
				-
			
 
				-	bytes *= cqm_l3_scale;
			
 
				-
			
 
				-	mbm_current->total_bytes += bytes;
			
 
				-	mbm_current->prev_msr = val;
			
 
				-
			
 
				-	return mbm_current->total_bytes;
			
 
				-}
			
 
				-
			
 
				-static u64 rmid_read_mbm(unsigned int rmid, u32 evt_type)
			
 
				-{
			
 
				-	return update_sample(rmid, evt_type, 0);
			
 
				-}
			
 
				-
			
 
				-static void __intel_mbm_event_init(void *info)
			
 
				-{
			
 
				-	struct rmid_read *rr = info;
			
 
				-
			
 
				-	update_sample(rr->rmid, rr->evt_type, 1);
			
 
				-}
			
 
				-
			
 
				-static void init_mbm_sample(u32 rmid, u32 evt_type)
			
 
				-{
			
 
				-	struct rmid_read rr = {
			
 
				-		.rmid = rmid,
			
 
				-		.evt_type = evt_type,
			
 
				-		.value = ATOMIC64_INIT(0),
			
 
				-	};
			
 
				-
			
 
				-	/* on each socket, init sample */
			
 
				-	on_each_cpu_mask(&cqm_cpumask, __intel_mbm_event_init, &rr, 1);
			
 
				-}
			
 
				-
			
 
				-/*
			
 
				- * Find a group and setup RMID.
			
 
				- *
			
 
				- * If we're part of a group, we use the group's RMID.
			
 
				- */
			
 
				-static void intel_cqm_setup_event(struct perf_event *event,
			
 
				-				  struct perf_event **group)
			
 
				-{
			
 
				-	struct perf_event *iter;
			
 
				-	bool conflict = false;
			
 
				-	u32 rmid;
			
 
				-
			
 
				-	event->hw.is_group_event = false;
			
 
				-	list_for_each_entry(iter, &cache_groups, hw.cqm_groups_entry) {
			
 
				-		rmid = iter->hw.cqm_rmid;
			
 
				-
			
 
				-		if (__match_event(iter, event)) {
			
 
				-			/* All tasks in a group share an RMID */
			
 
				-			event->hw.cqm_rmid = rmid;
			
 
				-			*group = iter;
			
 
				-			if (is_mbm_event(event->attr.config) && __rmid_valid(rmid))
			
 
				-				init_mbm_sample(rmid, event->attr.config);
			
 
				-			return;
			
 
				-		}
			
 
				-
			
 
				-		/*
			
 
				-		 * We only care about conflicts for events that are
			
 
				-		 * actually scheduled in (and hence have a valid RMID).
			
 
				-		 */
			
 
				-		if (__conflict_event(iter, event) && __rmid_valid(rmid))
			
 
				-			conflict = true;
			
 
				-	}
			
 
				-
			
 
				-	if (conflict)
			
 
				-		rmid = INVALID_RMID;
			
 
				-	else
			
 
				-		rmid = __get_rmid();
			
 
				-
			
 
				-	if (is_mbm_event(event->attr.config) && __rmid_valid(rmid))
			
 
				-		init_mbm_sample(rmid, event->attr.config);
			
 
				-
			
 
				-	event->hw.cqm_rmid = rmid;
			
 
				-}
			
 
				-
			
 
				-static void intel_cqm_event_read(struct perf_event *event)
			
 
				-{
			
 
				-	unsigned long flags;
			
 
				-	u32 rmid;
			
 
				-	u64 val;
			
 
				-
			
 
				-	/*
			
 
				-	 * Task events are handled by intel_cqm_event_count().
			
 
				-	 */
			
 
				-	if (event->cpu == -1)
			
 
				-		return;
			
 
				-
			
 
				-	raw_spin_lock_irqsave(&cache_lock, flags);
			
 
				-	rmid = event->hw.cqm_rmid;
			
 
				-
			
 
				-	if (!__rmid_valid(rmid))
			
 
				-		goto out;
			
 
				-
			
 
				-	if (is_mbm_event(event->attr.config))
			
 
				-		val = rmid_read_mbm(rmid, event->attr.config);
			
 
				-	else
			
 
				-		val = __rmid_read(rmid);
			
 
				-
			
 
				-	/*
			
 
				-	 * Ignore this reading on error states and do not update the value.
			
 
				-	 */
			
 
				-	if (val & (RMID_VAL_ERROR | RMID_VAL_UNAVAIL))
			
 
				-		goto out;
			
 
				-
			
 
				-	local64_set(&event->count, val);
			
 
				-out:
			
 
				-	raw_spin_unlock_irqrestore(&cache_lock, flags);
			
 
				-}
			
 
				-
			
 
				-static void __intel_cqm_event_count(void *info)
			
 
				-{
			
 
				-	struct rmid_read *rr = info;
			
 
				-	u64 val;
			
 
				-
			
 
				-	val = __rmid_read(rr->rmid);
			
 
				-
			
 
				-	if (val & (RMID_VAL_ERROR | RMID_VAL_UNAVAIL))
			
 
				-		return;
			
 
				-
			
 
				-	atomic64_add(val, &rr->value);
			
 
				-}
			
 
				-
			
 
				-static inline bool cqm_group_leader(struct perf_event *event)
			
 
				-{
			
 
				-	return !list_empty(&event->hw.cqm_groups_entry);
			
 
				-}
			
 
				-
			
 
				-static void __intel_mbm_event_count(void *info)
			
 
				-{
			
 
				-	struct rmid_read *rr = info;
			
 
				-	u64 val;
			
 
				-
			
 
				-	val = rmid_read_mbm(rr->rmid, rr->evt_type);
			
 
				-	if (val & (RMID_VAL_ERROR | RMID_VAL_UNAVAIL))
			
 
				-		return;
			
 
				-	atomic64_add(val, &rr->value);
			
 
				-}
			
 
				-
			
 
				-static enum hrtimer_restart mbm_hrtimer_handle(struct hrtimer *hrtimer)
			
 
				-{
			
 
				-	struct perf_event *iter, *iter1;
			
 
				-	int ret = HRTIMER_RESTART;
			
 
				-	struct list_head *head;
			
 
				-	unsigned long flags;
			
 
				-	u32 grp_rmid;
			
 
				-
			
 
				-	/*
			
 
				-	 * Need to cache_lock as the timer Event Select MSR reads
			
 
				-	 * can race with the mbm/cqm count() and mbm_init() reads.
			
 
				-	 */
			
 
				-	raw_spin_lock_irqsave(&cache_lock, flags);
			
 
				-
			
 
				-	if (list_empty(&cache_groups)) {
			
 
				-		ret = HRTIMER_NORESTART;
			
 
				-		goto out;
			
 
				-	}
			
 
				-
			
 
				-	list_for_each_entry(iter, &cache_groups, hw.cqm_groups_entry) {
			
 
				-		grp_rmid = iter->hw.cqm_rmid;
			
 
				-		if (!__rmid_valid(grp_rmid))
			
 
				-			continue;
			
 
				-		if (is_mbm_event(iter->attr.config))
			
 
				-			update_sample(grp_rmid, iter->attr.config, 0);
			
 
				-
			
 
				-		head = &iter->hw.cqm_group_entry;
			
 
				-		if (list_empty(head))
			
 
				-			continue;
			
 
				-		list_for_each_entry(iter1, head, hw.cqm_group_entry) {
			
 
				-			if (!iter1->hw.is_group_event)
			
 
				-				break;
			
 
				-			if (is_mbm_event(iter1->attr.config))
			
 
				-				update_sample(iter1->hw.cqm_rmid,
			
 
				-					      iter1->attr.config, 0);
			
 
				-		}
			
 
				-	}
			
 
				-
			
 
				-	hrtimer_forward_now(hrtimer, ms_to_ktime(MBM_CTR_OVERFLOW_TIME));
			
 
				-out:
			
 
				-	raw_spin_unlock_irqrestore(&cache_lock, flags);
			
 
				-
			
 
				-	return ret;
			
 
				-}
			
 
				-
			
 
				-static void __mbm_start_timer(void *info)
			
 
				-{
			
 
				-	hrtimer_start(&mbm_timers[pkg_id], ms_to_ktime(MBM_CTR_OVERFLOW_TIME),
			
 
				-			     HRTIMER_MODE_REL_PINNED);
			
 
				-}
			
 
				-
			
 
				-static void __mbm_stop_timer(void *info)
			
 
				-{
			
 
				-	hrtimer_cancel(&mbm_timers[pkg_id]);
			
 
				-}
			
 
				-
			
 
				-static void mbm_start_timers(void)
			
 
				-{
			
 
				-	on_each_cpu_mask(&cqm_cpumask, __mbm_start_timer, NULL, 1);
			
 
				-}
			
 
				-
			
 
				-static void mbm_stop_timers(void)
			
 
				-{
			
 
				-	on_each_cpu_mask(&cqm_cpumask, __mbm_stop_timer, NULL, 1);
			
 
				-}
			
 
				-
			
 
				-static void mbm_hrtimer_init(void)
			
 
				-{
			
 
				-	struct hrtimer *hr;
			
 
				-	int i;
			
 
				-
			
 
				-	for (i = 0; i < mbm_socket_max; i++) {
			
 
				-		hr = &mbm_timers[i];
			
 
				-		hrtimer_init(hr, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
			
 
				-		hr->function = mbm_hrtimer_handle;
			
 
				-	}
			
 
				-}
			
 
				-
			
 
				-static u64 intel_cqm_event_count(struct perf_event *event)
			
 
				-{
			
 
				-	unsigned long flags;
			
 
				-	struct rmid_read rr = {
			
 
				-		.evt_type = event->attr.config,
			
 
				-		.value = ATOMIC64_INIT(0),
			
 
				-	};
			
 
				-
			
 
				-	/*
			
 
				-	 * We only need to worry about task events. System-wide events
			
 
				-	 * are handled like usual, i.e. entirely with
			
 
				-	 * intel_cqm_event_read().
			
 
				-	 */
			
 
				-	if (event->cpu != -1)
			
 
				-		return __perf_event_count(event);
			
 
				-
			
 
				-	/*
			
 
				-	 * Only the group leader gets to report values except in case of
			
 
				-	 * multiple events in the same group, we still need to read the
			
 
				-	 * other events.This stops us
			
 
				-	 * reporting duplicate values to userspace, and gives us a clear
			
 
				-	 * rule for which task gets to report the values.
			
 
				-	 *
			
 
				-	 * Note that it is impossible to attribute these values to
			
 
				-	 * specific packages - we forfeit that ability when we create
			
 
				-	 * task events.
			
 
				-	 */
			
 
				-	if (!cqm_group_leader(event) && !event->hw.is_group_event)
			
 
				-		return 0;
			
 
				-
			
 
				-	/*
			
 
				-	 * Getting up-to-date values requires an SMP IPI which is not
			
 
				-	 * possible if we're being called in interrupt context. Return
			
 
				-	 * the cached values instead.
			
 
				-	 */
			
 
				-	if (unlikely(in_interrupt()))
			
 
				-		goto out;
			
 
				-
			
 
				-	/*
			
 
				-	 * Notice that we don't perform the reading of an RMID
			
 
				-	 * atomically, because we can't hold a spin lock across the
			
 
				-	 * IPIs.
			
 
				-	 *
			
 
				-	 * Speculatively perform the read, since @event might be
			
 
				-	 * assigned a different (possibly invalid) RMID while we're
			
 
				-	 * busying performing the IPI calls. It's therefore necessary to
			
 
				-	 * check @event's RMID afterwards, and if it has changed,
			
 
				-	 * discard the result of the read.
			
 
				-	 */
			
 
				-	rr.rmid = ACCESS_ONCE(event->hw.cqm_rmid);
			
 
				-
			
 
				-	if (!__rmid_valid(rr.rmid))
			
 
				-		goto out;
			
 
				-
			
 
				-	cqm_mask_call(&rr);
			
 
				-
			
 
				-	raw_spin_lock_irqsave(&cache_lock, flags);
			
 
				-	if (event->hw.cqm_rmid == rr.rmid)
			
 
				-		local64_set(&event->count, atomic64_read(&rr.value));
			
 
				-	raw_spin_unlock_irqrestore(&cache_lock, flags);
			
 
				-out:
			
 
				-	return __perf_event_count(event);
			
 
				-}
			
 
				-
			
 
				-static void intel_cqm_event_start(struct perf_event *event, int mode)
			
 
				-{
			
 
				-	struct intel_pqr_state *state = this_cpu_ptr(&pqr_state);
			
 
				-	u32 rmid = event->hw.cqm_rmid;
			
 
				-
			
 
				-	if (!(event->hw.cqm_state & PERF_HES_STOPPED))
			
 
				-		return;
			
 
				-
			
 
				-	event->hw.cqm_state &= ~PERF_HES_STOPPED;
			
 
				-
			
 
				-	if (state->rmid_usecnt++) {
			
 
				-		if (!WARN_ON_ONCE(state->rmid != rmid))
			
 
				-			return;
			
 
				-	} else {
			
 
				-		WARN_ON_ONCE(state->rmid);
			
 
				-	}
			
 
				-
			
 
				-	state->rmid = rmid;
			
 
				-	wrmsr(MSR_IA32_PQR_ASSOC, rmid, state->closid);
			
 
				-}
			
 
				-
			
 
				-static void intel_cqm_event_stop(struct perf_event *event, int mode)
			
 
				-{
			
 
				-	struct intel_pqr_state *state = this_cpu_ptr(&pqr_state);
			
 
				-
			
 
				-	if (event->hw.cqm_state & PERF_HES_STOPPED)
			
 
				-		return;
			
 
				-
			
 
				-	event->hw.cqm_state |= PERF_HES_STOPPED;
			
 
				-
			
 
				-	intel_cqm_event_read(event);
			
 
				-
			
 
				-	if (!--state->rmid_usecnt) {
			
 
				-		state->rmid = 0;
			
 
				-		wrmsr(MSR_IA32_PQR_ASSOC, 0, state->closid);
			
 
				-	} else {
			
 
				-		WARN_ON_ONCE(!state->rmid);
			
 
				-	}
			
 
				-}
			
 
				-
			
 
				-static int intel_cqm_event_add(struct perf_event *event, int mode)
			
 
				-{
			
 
				-	unsigned long flags;
			
 
				-	u32 rmid;
			
 
				-
			
 
				-	raw_spin_lock_irqsave(&cache_lock, flags);
			
 
				-
			
 
				-	event->hw.cqm_state = PERF_HES_STOPPED;
			
 
				-	rmid = event->hw.cqm_rmid;
			
 
				-
			
 
				-	if (__rmid_valid(rmid) && (mode & PERF_EF_START))
			
 
				-		intel_cqm_event_start(event, mode);
			
 
				-
			
 
				-	raw_spin_unlock_irqrestore(&cache_lock, flags);
			
 
				-
			
 
				-	return 0;
			
 
				-}
			
 
				-
			
 
				-static void intel_cqm_event_destroy(struct perf_event *event)
			
 
				-{
			
 
				-	struct perf_event *group_other = NULL;
			
 
				-	unsigned long flags;
			
 
				-
			
 
				-	mutex_lock(&cache_mutex);
			
 
				-	/*
			
 
				-	* Hold the cache_lock as mbm timer handlers could be
			
 
				-	* scanning the list of events.
			
 
				-	*/
			
 
				-	raw_spin_lock_irqsave(&cache_lock, flags);
			
 
				-
			
 
				-	/*
			
 
				-	 * If there's another event in this group...
			
 
				-	 */
			
 
				-	if (!list_empty(&event->hw.cqm_group_entry)) {
			
 
				-		group_other = list_first_entry(&event->hw.cqm_group_entry,
			
 
				-					       struct perf_event,
			
 
				-					       hw.cqm_group_entry);
			
 
				-		list_del(&event->hw.cqm_group_entry);
			
 
				-	}
			
 
				-
			
 
				-	/*
			
 
				-	 * And we're the group leader..
			
 
				-	 */
			
 
				-	if (cqm_group_leader(event)) {
			
 
				-		/*
			
 
				-		 * If there was a group_other, make that leader, otherwise
			
 
				-		 * destroy the group and return the RMID.
			
 
				-		 */
			
 
				-		if (group_other) {
			
 
				-			list_replace(&event->hw.cqm_groups_entry,
			
 
				-				     &group_other->hw.cqm_groups_entry);
			
 
				-		} else {
			
 
				-			u32 rmid = event->hw.cqm_rmid;
			
 
				-
			
 
				-			if (__rmid_valid(rmid))
			
 
				-				__put_rmid(rmid);
			
 
				-			list_del(&event->hw.cqm_groups_entry);
			
 
				-		}
			
 
				-	}
			
 
				-
			
 
				-	raw_spin_unlock_irqrestore(&cache_lock, flags);
			
 
				-
			
 
				-	/*
			
 
				-	 * Stop the mbm overflow timers when the last event is destroyed.
			
 
				-	*/
			
 
				-	if (mbm_enabled && list_empty(&cache_groups))
			
 
				-		mbm_stop_timers();
			
 
				-
			
 
				-	mutex_unlock(&cache_mutex);
			
 
				-}
			
 
				-
			
 
				-static int intel_cqm_event_init(struct perf_event *event)
			
 
				-{
			
 
				-	struct perf_event *group = NULL;
			
 
				-	bool rotate = false;
			
 
				-	unsigned long flags;
			
 
				-
			
 
				-	if (event->attr.type != intel_cqm_pmu.type)
			
 
				-		return -ENOENT;
			
 
				-
			
 
				-	if ((event->attr.config < QOS_L3_OCCUP_EVENT_ID) ||
			
 
				-	     (event->attr.config > QOS_MBM_LOCAL_EVENT_ID))
			
 
				-		return -EINVAL;
			
 
				-
			
 
				-	if ((is_cqm_event(event->attr.config) && !cqm_enabled) ||
			
 
				-	    (is_mbm_event(event->attr.config) && !mbm_enabled))
			
 
				-		return -EINVAL;
			
 
				-
			
 
				-	/* unsupported modes and filters */
			
 
				-	if (event->attr.exclude_user   ||
			
 
				-	    event->attr.exclude_kernel ||
			
 
				-	    event->attr.exclude_hv     ||
			
 
				-	    event->attr.exclude_idle   ||
			
 
				-	    event->attr.exclude_host   ||
			
 
				-	    event->attr.exclude_guest  ||
			
 
				-	    event->attr.sample_period) /* no sampling */
			
 
				-		return -EINVAL;
			
 
				-
			
 
				-	INIT_LIST_HEAD(&event->hw.cqm_group_entry);
			
 
				-	INIT_LIST_HEAD(&event->hw.cqm_groups_entry);
			
 
				-
			
 
				-	event->destroy = intel_cqm_event_destroy;
			
 
				-
			
 
				-	mutex_lock(&cache_mutex);
			
 
				-
			
 
				-	/*
			
 
				-	 * Start the mbm overflow timers when the first event is created.
			
 
				-	*/
			
 
				-	if (mbm_enabled && list_empty(&cache_groups))
			
 
				-		mbm_start_timers();
			
 
				-
			
 
				-	/* Will also set rmid */
			
 
				-	intel_cqm_setup_event(event, &group);
			
 
				-
			
 
				-	/*
			
 
				-	* Hold the cache_lock as mbm timer handlers be
			
 
				-	* scanning the list of events.
			
 
				-	*/
			
 
				-	raw_spin_lock_irqsave(&cache_lock, flags);
			
 
				-
			
 
				-	if (group) {
			
 
				-		list_add_tail(&event->hw.cqm_group_entry,
			
 
				-			      &group->hw.cqm_group_entry);
			
 
				-	} else {
			
 
				-		list_add_tail(&event->hw.cqm_groups_entry,
			
 
				-			      &cache_groups);
			
 
				-
			
 
				-		/*
			
 
				-		 * All RMIDs are either in use or have recently been
			
 
				-		 * used. Kick the rotation worker to clean/free some.
			
 
				-		 *
			
 
				-		 * We only do this for the group leader, rather than for
			
 
				-		 * every event in a group to save on needless work.
			
 
				-		 */
			
 
				-		if (!__rmid_valid(event->hw.cqm_rmid))
			
 
				-			rotate = true;
			
 
				-	}
			
 
				-
			
 
				-	raw_spin_unlock_irqrestore(&cache_lock, flags);
			
 
				-	mutex_unlock(&cache_mutex);
			
 
				-
			
 
				-	if (rotate)
			
 
				-		schedule_delayed_work(&intel_cqm_rmid_work, 0);
			
 
				-
			
 
				-	return 0;
			
 
				-}
			
 
				-
			
 
				-EVENT_ATTR_STR(llc_occupancy, intel_cqm_llc, "event=0x01");
			
 
				-EVENT_ATTR_STR(llc_occupancy.per-pkg, intel_cqm_llc_pkg, "1");
			
 
				-EVENT_ATTR_STR(llc_occupancy.unit, intel_cqm_llc_unit, "Bytes");
			
 
				-EVENT_ATTR_STR(llc_occupancy.scale, intel_cqm_llc_scale, NULL);
			
 
				-EVENT_ATTR_STR(llc_occupancy.snapshot, intel_cqm_llc_snapshot, "1");
			
 
				-
			
 
				-EVENT_ATTR_STR(total_bytes, intel_cqm_total_bytes, "event=0x02");
			
 
				-EVENT_ATTR_STR(total_bytes.per-pkg, intel_cqm_total_bytes_pkg, "1");
			
 
				-EVENT_ATTR_STR(total_bytes.unit, intel_cqm_total_bytes_unit, "MB");
			
 
				-EVENT_ATTR_STR(total_bytes.scale, intel_cqm_total_bytes_scale, "1e-6");
			
 
				-
			
 
				-EVENT_ATTR_STR(local_bytes, intel_cqm_local_bytes, "event=0x03");
			
 
				-EVENT_ATTR_STR(local_bytes.per-pkg, intel_cqm_local_bytes_pkg, "1");
			
 
				-EVENT_ATTR_STR(local_bytes.unit, intel_cqm_local_bytes_unit, "MB");
			
 
				-EVENT_ATTR_STR(local_bytes.scale, intel_cqm_local_bytes_scale, "1e-6");
			
 
				-
			
 
				-static struct attribute *intel_cqm_events_attr[] = {
			
 
				-	EVENT_PTR(intel_cqm_llc),
			
 
				-	EVENT_PTR(intel_cqm_llc_pkg),
			
 
				-	EVENT_PTR(intel_cqm_llc_unit),
			
 
				-	EVENT_PTR(intel_cqm_llc_scale),
			
 
				-	EVENT_PTR(intel_cqm_llc_snapshot),
			
 
				-	NULL,
			
 
				-};
			
 
				-
			
 
				-static struct attribute *intel_mbm_events_attr[] = {
			
 
				-	EVENT_PTR(intel_cqm_total_bytes),
			
 
				-	EVENT_PTR(intel_cqm_local_bytes),
			
 
				-	EVENT_PTR(intel_cqm_total_bytes_pkg),
			
 
				-	EVENT_PTR(intel_cqm_local_bytes_pkg),
			
 
				-	EVENT_PTR(intel_cqm_total_bytes_unit),
			
 
				-	EVENT_PTR(intel_cqm_local_bytes_unit),
			
 
				-	EVENT_PTR(intel_cqm_total_bytes_scale),
			
 
				-	EVENT_PTR(intel_cqm_local_bytes_scale),
			
 
				-	NULL,
			
 
				-};
			
 
				-
			
 
				-static struct attribute *intel_cmt_mbm_events_attr[] = {
			
 
				-	EVENT_PTR(intel_cqm_llc),
			
 
				-	EVENT_PTR(intel_cqm_total_bytes),
			
 
				-	EVENT_PTR(intel_cqm_local_bytes),
			
 
				-	EVENT_PTR(intel_cqm_llc_pkg),
			
 
				-	EVENT_PTR(intel_cqm_total_bytes_pkg),
			
 
				-	EVENT_PTR(intel_cqm_local_bytes_pkg),
			
 
				-	EVENT_PTR(intel_cqm_llc_unit),
			
 
				-	EVENT_PTR(intel_cqm_total_bytes_unit),
			
 
				-	EVENT_PTR(intel_cqm_local_bytes_unit),
			
 
				-	EVENT_PTR(intel_cqm_llc_scale),
			
 
				-	EVENT_PTR(intel_cqm_total_bytes_scale),
			
 
				-	EVENT_PTR(intel_cqm_local_bytes_scale),
			
 
				-	EVENT_PTR(intel_cqm_llc_snapshot),
			
 
				-	NULL,
			
 
				-};
			
 
				-
			
 
				-static struct attribute_group intel_cqm_events_group = {
			
 
				-	.name = "events",
			
 
				-	.attrs = NULL,
			
 
				-};
			
 
				-
			
 
				-PMU_FORMAT_ATTR(event, "config:0-7");
			
 
				-static struct attribute *intel_cqm_formats_attr[] = {
			
 
				-	&format_attr_event.attr,
			
 
				-	NULL,
			
 
				-};
			
 
				-
			
 
				-static struct attribute_group intel_cqm_format_group = {
			
 
				-	.name = "format",
			
 
				-	.attrs = intel_cqm_formats_attr,
			
 
				-};
			
 
				-
			
 
				-static ssize_t
			
 
				-max_recycle_threshold_show(struct device *dev, struct device_attribute *attr,
			
 
				-			   char *page)
			
 
				-{
			
 
				-	ssize_t rv;
			
 
				-
			
 
				-	mutex_lock(&cache_mutex);
			
 
				-	rv = snprintf(page, PAGE_SIZE-1, "%u\n", __intel_cqm_max_threshold);
			
 
				-	mutex_unlock(&cache_mutex);
			
 
				-
			
 
				-	return rv;
			
 
				-}
			
 
				-
			
 
				-static ssize_t
			
 
				-max_recycle_threshold_store(struct device *dev,
			
 
				-			    struct device_attribute *attr,
			
 
				-			    const char *buf, size_t count)
			
 
				-{
			
 
				-	unsigned int bytes, cachelines;
			
 
				-	int ret;
			
 
				-
			
 
				-	ret = kstrtouint(buf, 0, &bytes);
			
 
				-	if (ret)
			
 
				-		return ret;
			
 
				-
			
 
				-	mutex_lock(&cache_mutex);
			
 
				-
			
 
				-	__intel_cqm_max_threshold = bytes;
			
 
				-	cachelines = bytes / cqm_l3_scale;
			
 
				-
			
 
				-	/*
			
 
				-	 * The new maximum takes effect immediately.
			
 
				-	 */
			
 
				-	if (__intel_cqm_threshold > cachelines)
			
 
				-		__intel_cqm_threshold = cachelines;
			
 
				-
			
 
				-	mutex_unlock(&cache_mutex);
			
 
				-
			
 
				-	return count;
			
 
				-}
			
 
				-
			
 
				-static DEVICE_ATTR_RW(max_recycle_threshold);
			
 
				-
			
 
				-static struct attribute *intel_cqm_attrs[] = {
			
 
				-	&dev_attr_max_recycle_threshold.attr,
			
 
				-	NULL,
			
 
				-};
			
 
				-
			
 
				-static const struct attribute_group intel_cqm_group = {
			
 
				-	.attrs = intel_cqm_attrs,
			
 
				-};
			
 
				-
			
 
				-static const struct attribute_group *intel_cqm_attr_groups[] = {
			
 
				-	&intel_cqm_events_group,
			
 
				-	&intel_cqm_format_group,
			
 
				-	&intel_cqm_group,
			
 
				-	NULL,
			
 
				-};
			
 
				-
			
 
				-static struct pmu intel_cqm_pmu = {
			
 
				-	.hrtimer_interval_ms = RMID_DEFAULT_QUEUE_TIME,
			
 
				-	.attr_groups	     = intel_cqm_attr_groups,
			
 
				-	.task_ctx_nr	     = perf_sw_context,
			
 
				-	.event_init	     = intel_cqm_event_init,
			
 
				-	.add		     = intel_cqm_event_add,
			
 
				-	.del		     = intel_cqm_event_stop,
			
 
				-	.start		     = intel_cqm_event_start,
			
 
				-	.stop		     = intel_cqm_event_stop,
			
 
				-	.read		     = intel_cqm_event_read,
			
 
				-	.count		     = intel_cqm_event_count,
			
 
				-};
			
 
				-
			
 
				-static inline void cqm_pick_event_reader(int cpu)
			
 
				-{
			
 
				-	int reader;
			
 
				-
			
 
				-	/* First online cpu in package becomes the reader */
			
 
				-	reader = cpumask_any_and(&cqm_cpumask, topology_core_cpumask(cpu));
			
 
				-	if (reader >= nr_cpu_ids)
			
 
				-		cpumask_set_cpu(cpu, &cqm_cpumask);
			
 
				-}
			
 
				-
			
 
				-static int intel_cqm_cpu_starting(unsigned int cpu)
			
 
				-{
			
 
				-	struct intel_pqr_state *state = &per_cpu(pqr_state, cpu);
			
 
				-	struct cpuinfo_x86 *c = &cpu_data(cpu);
			
 
				-
			
 
				-	state->rmid = 0;
			
 
				-	state->closid = 0;
			
 
				-	state->rmid_usecnt = 0;
			
 
				-
			
 
				-	WARN_ON(c->x86_cache_max_rmid != cqm_max_rmid);
			
 
				-	WARN_ON(c->x86_cache_occ_scale != cqm_l3_scale);
			
 
				-
			
 
				-	cqm_pick_event_reader(cpu);
			
 
				-	return 0;
			
 
				-}
			
 
				-
			
 
				-static int intel_cqm_cpu_exit(unsigned int cpu)
			
 
				-{
			
 
				-	int target;
			
 
				-
			
 
				-	/* Is @cpu the current cqm reader for this package ? */
			
 
				-	if (!cpumask_test_and_clear_cpu(cpu, &cqm_cpumask))
			
 
				-		return 0;
			
 
				-
			
 
				-	/* Find another online reader in this package */
			
 
				-	target = cpumask_any_but(topology_core_cpumask(cpu), cpu);
			
 
				-
			
 
				-	if (target < nr_cpu_ids)
			
 
				-		cpumask_set_cpu(target, &cqm_cpumask);
			
 
				-
			
 
				-	return 0;
			
 
				-}
			
 
				-
			
 
				-static const struct x86_cpu_id intel_cqm_match[] = {
			
 
				-	{ .vendor = X86_VENDOR_INTEL, .feature = X86_FEATURE_CQM_OCCUP_LLC },
			
 
				-	{}
			
 
				-};
			
 
				-
			
 
				-static void mbm_cleanup(void)
			
 
				-{
			
 
				-	if (!mbm_enabled)
			
 
				-		return;
			
 
				-
			
 
				-	kfree(mbm_local);
			
 
				-	kfree(mbm_total);
			
 
				-	mbm_enabled = false;
			
 
				-}
			
 
				-
			
 
				-static const struct x86_cpu_id intel_mbm_local_match[] = {
			
 
				-	{ .vendor = X86_VENDOR_INTEL, .feature = X86_FEATURE_CQM_MBM_LOCAL },
			
 
				-	{}
			
 
				-};
			
 
				-
			
 
				-static const struct x86_cpu_id intel_mbm_total_match[] = {
			
 
				-	{ .vendor = X86_VENDOR_INTEL, .feature = X86_FEATURE_CQM_MBM_TOTAL },
			
 
				-	{}
			
 
				-};
			
 
				-
			
 
				-static int intel_mbm_init(void)
			
 
				-{
			
 
				-	int ret = 0, array_size, maxid = cqm_max_rmid + 1;
			
 
				-
			
 
				-	mbm_socket_max = topology_max_packages();
			
 
				-	array_size = sizeof(struct sample) * maxid * mbm_socket_max;
			
 
				-	mbm_local = kmalloc(array_size, GFP_KERNEL);
			
 
				-	if (!mbm_local)
			
 
				-		return -ENOMEM;
			
 
				-
			
 
				-	mbm_total = kmalloc(array_size, GFP_KERNEL);
			
 
				-	if (!mbm_total) {
			
 
				-		ret = -ENOMEM;
			
 
				-		goto out;
			
 
				-	}
			
 
				-
			
 
				-	array_size = sizeof(struct hrtimer) * mbm_socket_max;
			
 
				-	mbm_timers = kmalloc(array_size, GFP_KERNEL);
			
 
				-	if (!mbm_timers) {
			
 
				-		ret = -ENOMEM;
			
 
				-		goto out;
			
 
				-	}
			
 
				-	mbm_hrtimer_init();
			
 
				-
			
 
				-out:
			
 
				-	if (ret)
			
 
				-		mbm_cleanup();
			
 
				-
			
 
				-	return ret;
			
 
				-}
			
 
				-
			
 
				-static int __init intel_cqm_init(void)
			
 
				-{
			
 
				-	char *str = NULL, scale[20];
			
 
				-	int cpu, ret;
			
 
				-
			
 
				-	if (x86_match_cpu(intel_cqm_match))
			
 
				-		cqm_enabled = true;
			
 
				-
			
 
				-	if (x86_match_cpu(intel_mbm_local_match) &&
			
 
				-	     x86_match_cpu(intel_mbm_total_match))
			
 
				-		mbm_enabled = true;
			
 
				-
			
 
				-	if (!cqm_enabled && !mbm_enabled)
			
 
				-		return -ENODEV;
			
 
				-
			
 
				-	cqm_l3_scale = boot_cpu_data.x86_cache_occ_scale;
			
 
				-
			
 
				-	/*
			
 
				-	 * It's possible that not all resources support the same number
			
 
				-	 * of RMIDs. Instead of making scheduling much more complicated
			
 
				-	 * (where we have to match a task's RMID to a cpu that supports
			
 
				-	 * that many RMIDs) just find the minimum RMIDs supported across
			
 
				-	 * all cpus.
			
 
				-	 *
			
 
				-	 * Also, check that the scales match on all cpus.
			
 
				-	 */
			
 
				-	cpus_read_lock();
			
 
				-	for_each_online_cpu(cpu) {
			
 
				-		struct cpuinfo_x86 *c = &cpu_data(cpu);
			
 
				-
			
 
				-		if (c->x86_cache_max_rmid < cqm_max_rmid)
			
 
				-			cqm_max_rmid = c->x86_cache_max_rmid;
			
 
				-
			
 
				-		if (c->x86_cache_occ_scale != cqm_l3_scale) {
			
 
				-			pr_err("Multiple LLC scale values, disabling\n");
			
 
				-			ret = -EINVAL;
			
 
				-			goto out;
			
 
				-		}
			
 
				-	}
			
 
				-
			
 
				-	/*
			
 
				-	 * A reasonable upper limit on the max threshold is the number
			
 
				-	 * of lines tagged per RMID if all RMIDs have the same number of
			
 
				-	 * lines tagged in the LLC.
			
 
				-	 *
			
 
				-	 * For a 35MB LLC and 56 RMIDs, this is ~1.8% of the LLC.
			
 
				-	 */
			
 
				-	__intel_cqm_max_threshold =
			
 
				-		boot_cpu_data.x86_cache_size * 1024 / (cqm_max_rmid + 1);
			
 
				-
			
 
				-	snprintf(scale, sizeof(scale), "%u", cqm_l3_scale);
			
 
				-	str = kstrdup(scale, GFP_KERNEL);
			
 
				-	if (!str) {
			
 
				-		ret = -ENOMEM;
			
 
				-		goto out;
			
 
				-	}
			
 
				-
			
 
				-	event_attr_intel_cqm_llc_scale.event_str = str;
			
 
				-
			
 
				-	ret = intel_cqm_setup_rmid_cache();
			
 
				-	if (ret)
			
 
				-		goto out;
			
 
				-
			
 
				-	if (mbm_enabled)
			
 
				-		ret = intel_mbm_init();
			
 
				-	if (ret && !cqm_enabled)
			
 
				-		goto out;
			
 
				-
			
 
				-	if (cqm_enabled && mbm_enabled)
			
 
				-		intel_cqm_events_group.attrs = intel_cmt_mbm_events_attr;
			
 
				-	else if (!cqm_enabled && mbm_enabled)
			
 
				-		intel_cqm_events_group.attrs = intel_mbm_events_attr;
			
 
				-	else if (cqm_enabled && !mbm_enabled)
			
 
				-		intel_cqm_events_group.attrs = intel_cqm_events_attr;
			
 
				-
			
 
				-	ret = perf_pmu_register(&intel_cqm_pmu, "intel_cqm", -1);
			
 
				-	if (ret) {
			
 
				-		pr_err("Intel CQM perf registration failed: %d\n", ret);
			
 
				-		goto out;
			
 
				-	}
			
 
				-
			
 
				-	if (cqm_enabled)
			
 
				-		pr_info("Intel CQM monitoring enabled\n");
			
 
				-	if (mbm_enabled)
			
 
				-		pr_info("Intel MBM enabled\n");
			
 
				-
			
 
				-	/*
			
 
				-	 * Setup the hot cpu notifier once we are sure cqm
			
 
				-	 * is enabled to avoid notifier leak.
			
 
				-	 */
			
 
				-	cpuhp_setup_state_cpuslocked(CPUHP_AP_PERF_X86_CQM_STARTING,
			
 
				-				     "perf/x86/cqm:starting",
			
 
				-				     intel_cqm_cpu_starting, NULL);
			
 
				-	cpuhp_setup_state_cpuslocked(CPUHP_AP_PERF_X86_CQM_ONLINE,
			
 
				-				     "perf/x86/cqm:online",
			
 
				-				     NULL, intel_cqm_cpu_exit);
			
 
				-out:
			
 
				-	cpus_read_unlock();
			
 
				-
			
 
				-	if (ret) {
			
 
				-		kfree(str);
			
 
				-		cqm_cleanup();
			
 
				-		mbm_cleanup();
			
 
				-	}
			
 
				-
			
 
				-	return ret;
			
 
				-}
			
 
				-device_initcall(intel_cqm_init);
			
--- a/arch/x86/include/asm/intel_rdt.h
+++ b/arch/x86/include/asm/intel_rdt.h
@@ -1,286 +0,0 @@
 
				-#ifndef _ASM_X86_INTEL_RDT_H
			
 
				-#define _ASM_X86_INTEL_RDT_H
			
 
				-
			
 
				-#ifdef CONFIG_INTEL_RDT_A
			
 
				-
			
 
				-#include <linux/sched.h>
			
 
				-#include <linux/kernfs.h>
			
 
				-#include <linux/jump_label.h>
			
 
				-
			
 
				-#include <asm/intel_rdt_common.h>
			
 
				-
			
 
				-#define IA32_L3_QOS_CFG		0xc81
			
 
				-#define IA32_L3_CBM_BASE	0xc90
			
 
				-#define IA32_L2_CBM_BASE	0xd10
			
 
				-#define IA32_MBA_THRTL_BASE	0xd50
			
 
				-
			
 
				-#define L3_QOS_CDP_ENABLE	0x01ULL
			
 
				-
			
 
				-/**
			
 
				- * struct rdtgroup - store rdtgroup's data in resctrl file system.
			
 
				- * @kn:				kernfs node
			
 
				- * @rdtgroup_list:		linked list for all rdtgroups
			
 
				- * @closid:			closid for this rdtgroup
			
 
				- * @cpu_mask:			CPUs assigned to this rdtgroup
			
 
				- * @flags:			status bits
			
 
				- * @waitcount:			how many cpus expect to find this
			
 
				- *				group when they acquire rdtgroup_mutex
			
 
				- */
			
 
				-struct rdtgroup {
			
 
				-	struct kernfs_node	*kn;
			
 
				-	struct list_head	rdtgroup_list;
			
 
				-	int			closid;
			
 
				-	struct cpumask		cpu_mask;
			
 
				-	int			flags;
			
 
				-	atomic_t		waitcount;
			
 
				-};
			
 
				-
			
 
				-/* rdtgroup.flags */
			
 
				-#define	RDT_DELETED		1
			
 
				-
			
 
				-/* rftype.flags */
			
 
				-#define RFTYPE_FLAGS_CPUS_LIST	1
			
 
				-
			
 
				-/* List of all resource groups */
			
 
				-extern struct list_head rdt_all_groups;
			
 
				-
			
 
				-extern int max_name_width, max_data_width;
			
 
				-
			
 
				-int __init rdtgroup_init(void);
			
 
				-
			
 
				-/**
			
 
				- * struct rftype - describe each file in the resctrl file system
			
 
				- * @name:	File name
			
 
				- * @mode:	Access mode
			
 
				- * @kf_ops:	File operations
			
 
				- * @flags:	File specific RFTYPE_FLAGS_* flags
			
 
				- * @seq_show:	Show content of the file
			
 
				- * @write:	Write to the file
			
 
				- */
			
 
				-struct rftype {
			
 
				-	char			*name;
			
 
				-	umode_t			mode;
			
 
				-	struct kernfs_ops	*kf_ops;
			
 
				-	unsigned long		flags;
			
 
				-
			
 
				-	int (*seq_show)(struct kernfs_open_file *of,
			
 
				-			struct seq_file *sf, void *v);
			
 
				-	/*
			
 
				-	 * write() is the generic write callback which maps directly to
			
 
				-	 * kernfs write operation and overrides all other operations.
			
 
				-	 * Maximum write size is determined by ->max_write_len.
			
 
				-	 */
			
 
				-	ssize_t (*write)(struct kernfs_open_file *of,
			
 
				-			 char *buf, size_t nbytes, loff_t off);
			
 
				-};
			
 
				-
			
 
				-/**
			
 
				- * struct rdt_domain - group of cpus sharing an RDT resource
			
 
				- * @list:	all instances of this resource
			
 
				- * @id:		unique id for this instance
			
 
				- * @cpu_mask:	which cpus share this resource
			
 
				- * @ctrl_val:	array of cache or mem ctrl values (indexed by CLOSID)
			
 
				- * @new_ctrl:	new ctrl value to be loaded
			
 
				- * @have_new_ctrl: did user provide new_ctrl for this domain
			
 
				- */
			
 
				-struct rdt_domain {
			
 
				-	struct list_head	list;
			
 
				-	int			id;
			
 
				-	struct cpumask		cpu_mask;
			
 
				-	u32			*ctrl_val;
			
 
				-	u32			new_ctrl;
			
 
				-	bool			have_new_ctrl;
			
 
				-};
			
 
				-
			
 
				-/**
			
 
				- * struct msr_param - set a range of MSRs from a domain
			
 
				- * @res:       The resource to use
			
 
				- * @low:       Beginning index from base MSR
			
 
				- * @high:      End index
			
 
				- */
			
 
				-struct msr_param {
			
 
				-	struct rdt_resource	*res;
			
 
				-	int			low;
			
 
				-	int			high;
			
 
				-};
			
 
				-
			
 
				-/**
			
 
				- * struct rdt_cache - Cache allocation related data
			
 
				- * @cbm_len:		Length of the cache bit mask
			
 
				- * @min_cbm_bits:	Minimum number of consecutive bits to be set
			
 
				- * @cbm_idx_mult:	Multiplier of CBM index
			
 
				- * @cbm_idx_offset:	Offset of CBM index. CBM index is computed by:
			
 
				- *			closid * cbm_idx_multi + cbm_idx_offset
			
 
				- *			in a cache bit mask
			
 
				- */
			
 
				-struct rdt_cache {
			
 
				-	unsigned int	cbm_len;
			
 
				-	unsigned int	min_cbm_bits;
			
 
				-	unsigned int	cbm_idx_mult;
			
 
				-	unsigned int	cbm_idx_offset;
			
 
				-};
			
 
				-
			
 
				-/**
			
 
				- * struct rdt_membw - Memory bandwidth allocation related data
			
 
				- * @max_delay:		Max throttle delay. Delay is the hardware
			
 
				- *			representation for memory bandwidth.
			
 
				- * @min_bw:		Minimum memory bandwidth percentage user can request
			
 
				- * @bw_gran:		Granularity at which the memory bandwidth is allocated
			
 
				- * @delay_linear:	True if memory B/W delay is in linear scale
			
 
				- * @mb_map:		Mapping of memory B/W percentage to memory B/W delay
			
 
				- */
			
 
				-struct rdt_membw {
			
 
				-	u32		max_delay;
			
 
				-	u32		min_bw;
			
 
				-	u32		bw_gran;
			
 
				-	u32		delay_linear;
			
 
				-	u32		*mb_map;
			
 
				-};
			
 
				-
			
 
				-/**
			
 
				- * struct rdt_resource - attributes of an RDT resource
			
 
				- * @enabled:		Is this feature enabled on this machine
			
 
				- * @capable:		Is this feature available on this machine
			
 
				- * @name:		Name to use in "schemata" file
			
 
				- * @num_closid:		Number of CLOSIDs available
			
 
				- * @cache_level:	Which cache level defines scope of this resource
			
 
				- * @default_ctrl:	Specifies default cache cbm or memory B/W percent.
			
 
				- * @msr_base:		Base MSR address for CBMs
			
 
				- * @msr_update:		Function pointer to update QOS MSRs
			
 
				- * @data_width:		Character width of data when displaying
			
 
				- * @domains:		All domains for this resource
			
 
				- * @cache:		Cache allocation related data
			
 
				- * @info_files:		resctrl info files for the resource
			
 
				- * @nr_info_files:	Number of info files
			
 
				- * @format_str:		Per resource format string to show domain value
			
 
				- * @parse_ctrlval:	Per resource function pointer to parse control values
			
 
				- */
			
 
				-struct rdt_resource {
			
 
				-	bool			enabled;
			
 
				-	bool			capable;
			
 
				-	char			*name;
			
 
				-	int			num_closid;
			
 
				-	int			cache_level;
			
 
				-	u32			default_ctrl;
			
 
				-	unsigned int		msr_base;
			
 
				-	void (*msr_update)	(struct rdt_domain *d, struct msr_param *m,
			
 
				-				 struct rdt_resource *r);
			
 
				-	int			data_width;
			
 
				-	struct list_head	domains;
			
 
				-	struct rdt_cache	cache;
			
 
				-	struct rdt_membw	membw;
			
 
				-	struct rftype		*info_files;
			
 
				-	int			nr_info_files;
			
 
				-	const char		*format_str;
			
 
				-	int (*parse_ctrlval)	(char *buf, struct rdt_resource *r,
			
 
				-				 struct rdt_domain *d);
			
 
				-};
			
 
				-
			
 
				-void rdt_get_cache_infofile(struct rdt_resource *r);
			
 
				-void rdt_get_mba_infofile(struct rdt_resource *r);
			
 
				-int parse_cbm(char *buf, struct rdt_resource *r, struct rdt_domain *d);
			
 
				-int parse_bw(char *buf, struct rdt_resource *r,  struct rdt_domain *d);
			
 
				-
			
 
				-extern struct mutex rdtgroup_mutex;
			
 
				-
			
 
				-extern struct rdt_resource rdt_resources_all[];
			
 
				-extern struct rdtgroup rdtgroup_default;
			
 
				-DECLARE_STATIC_KEY_FALSE(rdt_enable_key);
			
 
				-
			
 
				-int __init rdtgroup_init(void);
			
 
				-
			
 
				-enum {
			
 
				-	RDT_RESOURCE_L3,
			
 
				-	RDT_RESOURCE_L3DATA,
			
 
				-	RDT_RESOURCE_L3CODE,
			
 
				-	RDT_RESOURCE_L2,
			
 
				-	RDT_RESOURCE_MBA,
			
 
				-
			
 
				-	/* Must be the last */
			
 
				-	RDT_NUM_RESOURCES,
			
 
				-};
			
 
				-
			
 
				-#define for_each_capable_rdt_resource(r)				      \
			
 
				-	for (r = rdt_resources_all; r < rdt_resources_all + RDT_NUM_RESOURCES;\
			
 
				-	     r++)							      \
			
 
				-		if (r->capable)
			
 
				-
			
 
				-#define for_each_enabled_rdt_resource(r)				      \
			
 
				-	for (r = rdt_resources_all; r < rdt_resources_all + RDT_NUM_RESOURCES;\
			
 
				-	     r++)							      \
			
 
				-		if (r->enabled)
			
 
				-
			
 
				-/* CPUID.(EAX=10H, ECX=ResID=1).EAX */
			
 
				-union cpuid_0x10_1_eax {
			
 
				-	struct {
			
 
				-		unsigned int cbm_len:5;
			
 
				-	} split;
			
 
				-	unsigned int full;
			
 
				-};
			
 
				-
			
 
				-/* CPUID.(EAX=10H, ECX=ResID=3).EAX */
			
 
				-union cpuid_0x10_3_eax {
			
 
				-	struct {
			
 
				-		unsigned int max_delay:12;
			
 
				-	} split;
			
 
				-	unsigned int full;
			
 
				-};
			
 
				-
			
 
				-/* CPUID.(EAX=10H, ECX=ResID).EDX */
			
 
				-union cpuid_0x10_x_edx {
			
 
				-	struct {
			
 
				-		unsigned int cos_max:16;
			
 
				-	} split;
			
 
				-	unsigned int full;
			
 
				-};
			
 
				-
			
 
				-DECLARE_PER_CPU_READ_MOSTLY(int, cpu_closid);
			
 
				-
			
 
				-void rdt_ctrl_update(void *arg);
			
 
				-struct rdtgroup *rdtgroup_kn_lock_live(struct kernfs_node *kn);
			
 
				-void rdtgroup_kn_unlock(struct kernfs_node *kn);
			
 
				-ssize_t rdtgroup_schemata_write(struct kernfs_open_file *of,
			
 
				-				char *buf, size_t nbytes, loff_t off);
			
 
				-int rdtgroup_schemata_show(struct kernfs_open_file *of,
			
 
				-			   struct seq_file *s, void *v);
			
 
				-
			
 
				-/*
			
 
				- * intel_rdt_sched_in() - Writes the task's CLOSid to IA32_PQR_MSR
			
 
				- *
			
 
				- * Following considerations are made so that this has minimal impact
			
 
				- * on scheduler hot path:
			
 
				- * - This will stay as no-op unless we are running on an Intel SKU
			
 
				- *   which supports resource control and we enable by mounting the
			
 
				- *   resctrl file system.
			
 
				- * - Caches the per cpu CLOSid values and does the MSR write only
			
 
				- *   when a task with a different CLOSid is scheduled in.
			
 
				- *
			
 
				- * Must be called with preemption disabled.
			
 
				- */
			
 
				-static inline void intel_rdt_sched_in(void)
			
 
				-{
			
 
				-	if (static_branch_likely(&rdt_enable_key)) {
			
 
				-		struct intel_pqr_state *state = this_cpu_ptr(&pqr_state);
			
 
				-		int closid;
			
 
				-
			
 
				-		/*
			
 
				-		 * If this task has a closid assigned, use it.
			
 
				-		 * Else use the closid assigned to this cpu.
			
 
				-		 */
			
 
				-		closid = current->closid;
			
 
				-		if (closid == 0)
			
 
				-			closid = this_cpu_read(cpu_closid);
			
 
				-
			
 
				-		if (closid != state->closid) {
			
 
				-			state->closid = closid;
			
 
				-			wrmsr(MSR_IA32_PQR_ASSOC, state->rmid, closid);
			
 
				-		}
			
 
				-	}
			
 
				-}
			
 
				-
			
 
				-#else
			
 
				-
			
 
				-static inline void intel_rdt_sched_in(void) {}
			
 
				-
			
 
				-#endif /* CONFIG_INTEL_RDT_A */
			
 
				-#endif /* _ASM_X86_INTEL_RDT_H */
			
--- a/arch/x86/include/asm/intel_rdt_common.h
+++ b/arch/x86/include/asm/intel_rdt_common.h
@@ -1,27 +0,0 @@
 
				-#ifndef _ASM_X86_INTEL_RDT_COMMON_H
			
 
				-#define _ASM_X86_INTEL_RDT_COMMON_H
			
 
				-
			
 
				-#define MSR_IA32_PQR_ASSOC	0x0c8f
			
 
				-
			
 
				-/**
			
 
				- * struct intel_pqr_state - State cache for the PQR MSR
			
 
				- * @rmid:		The cached Resource Monitoring ID
			
 
				- * @closid:		The cached Class Of Service ID
			
 
				- * @rmid_usecnt:	The usage counter for rmid
			
 
				- *
			
 
				- * The upper 32 bits of MSR_IA32_PQR_ASSOC contain closid and the
			
 
				- * lower 10 bits rmid. The update to MSR_IA32_PQR_ASSOC always
			
 
				- * contains both parts, so we need to cache them.
			
 
				- *
			
 
				- * The cache also helps to avoid pointless updates if the value does
			
 
				- * not change.
			
 
				- */
			
 
				-struct intel_pqr_state {
			
 
				-	u32			rmid;
			
 
				-	u32			closid;
			
 
				-	int			rmid_usecnt;
			
 
				-};
			
 
				-
			
 
				-DECLARE_PER_CPU(struct intel_pqr_state, pqr_state);
			
 
				-
			
 
				-#endif /* _ASM_X86_INTEL_RDT_COMMON_H */
			
--- a/arch/x86/include/asm/intel_rdt_sched.h
+++ b/arch/x86/include/asm/intel_rdt_sched.h
@@ -0,0 +1,92 @@
 
				+#ifndef _ASM_X86_INTEL_RDT_SCHED_H
			
 
				+#define _ASM_X86_INTEL_RDT_SCHED_H
			
 
				+
			
 
				+#ifdef CONFIG_INTEL_RDT
			
 
				+
			
 
				+#include <linux/sched.h>
			
 
				+#include <linux/jump_label.h>
			
 
				+
			
 
				+#define IA32_PQR_ASSOC	0x0c8f
			
 
				+
			
 
				+/**
			
 
				+ * struct intel_pqr_state - State cache for the PQR MSR
			
 
				+ * @cur_rmid:		The Resource Monitoring ID last written to the MSR
			
 
				+ * @cur_closid:	The Class Of Service ID last written to the MSR
			
 
				+ * @default_rmid:	The user assigned Resource Monitoring ID for this CPU
			
 
				+ * @default_closid:	The user assigned Class Of Service ID for this CPU
			
 
				+ *
			
 
				+ * The upper 32 bits of IA32_PQR_ASSOC contain the closid and the
			
 
				+ * lower 10 bits the rmid. An update to IA32_PQR_ASSOC always writes
			
 
				+ * both parts, so both are cached here. The default values are the
			
 
				+ * per cpu CLOSID and RMID used for tasks that have none of their own.
			
 
				+ *
			
 
				+ * The cache also avoids pointless MSR writes when neither value
			
 
				+ * changes.
			
 
				+ */
			
 
				+struct intel_pqr_state {
			
 
				+	u32			cur_rmid;
			
 
				+	u32			cur_closid;
			
 
				+	u32			default_rmid;
			
 
				+	u32			default_closid;
			
 
				+};
			
 
				+
			
 
				+DECLARE_PER_CPU(struct intel_pqr_state, pqr_state);
			
 
				+
			
 
				+DECLARE_STATIC_KEY_FALSE(rdt_enable_key);
			
 
				+DECLARE_STATIC_KEY_FALSE(rdt_alloc_enable_key);
			
 
				+DECLARE_STATIC_KEY_FALSE(rdt_mon_enable_key);
			
 
				+
			
 
				+/*
			
 
				+ * __intel_rdt_sched_in() - Writes the task's CLOSid/RMID to IA32_PQR_MSR
			
 
				+ *
			
 
				+ * Following considerations are made so that this has minimal impact
			
 
				+ * on scheduler hot path:
			
 
				+ * - This will stay as no-op unless we are running on an Intel SKU
			
 
				+ *   which supports resource control or monitoring and we enable by
			
 
				+ *   mounting the resctrl file system.
			
 
				+ * - Caches the per cpu CLOSid/RMID values and does the MSR write only
			
 
				+ *   when a task with a different CLOSid/RMID is scheduled in.
			
 
				+ * - We allocate RMIDs/CLOSids globally in order to keep this as
			
 
				+ *   simple as possible.
			
 
				+ * Must be called with preemption disabled; inline as it lives in a header.
			
 
				+ */
			
 
				+static inline void __intel_rdt_sched_in(void)
			
 
				+{
			
 
				+	struct intel_pqr_state *state = this_cpu_ptr(&pqr_state);
			
 
				+	u32 closid = state->default_closid;
			
 
				+	u32 rmid = state->default_rmid;
			
 
				+
			
 
				+	/*
			
 
				+	 * If this task has a closid/rmid assigned, use it.
			
 
				+	 * Else use the closid/rmid assigned to this cpu.
			
 
				+	 */
			
 
				+	if (static_branch_likely(&rdt_alloc_enable_key)) {
			
 
				+		if (current->closid)
			
 
				+			closid = current->closid;
			
 
				+	}
			
 
				+
			
 
				+	if (static_branch_likely(&rdt_mon_enable_key)) {
			
 
				+		if (current->rmid)
			
 
				+			rmid = current->rmid;
			
 
				+	}
			
 
				+
			
 
				+	if (closid != state->cur_closid || rmid != state->cur_rmid) {
			
 
				+		state->cur_closid = closid;
			
 
				+		state->cur_rmid = rmid;
			
 
				+		wrmsr(IA32_PQR_ASSOC, rmid, closid);
			
 
				+	}
			
 
				+}
			
 
				+
			
 
				+/*
			
 
				+ * intel_rdt_sched_in() - Update IA32_PQR_ASSOC when RDT is in use
			
 
				+ *
			
 
				+ * Calls __intel_rdt_sched_in() only while the rdt_enable_key static
			
 
				+ * key is enabled; otherwise this does nothing.
			
 
				+ */
			
 
				+static inline void intel_rdt_sched_in(void)
			
 
				+{
			
 
				+	if (static_branch_likely(&rdt_enable_key))
			
 
				+		__intel_rdt_sched_in();
			
 
				+}
			
 
				+
			
 
				+#else
			
 
				+
			
 
				+static inline void intel_rdt_sched_in(void) {}
			
 
				+
			
 
				+#endif /* CONFIG_INTEL_RDT */
			
 
				+
			
 
				+#endif /* _ASM_X86_INTEL_RDT_SCHED_H */
			
--- a/arch/x86/kernel/cpu/Makefile
+++ b/arch/x86/kernel/cpu/Makefile
@@ -33,7 +33,7 @@ obj-$(CONFIG_CPU_SUP_CENTAUR)		+= centaur.o
 
				 obj-$(CONFIG_CPU_SUP_TRANSMETA_32)	+= transmeta.o
			
 
				 obj-$(CONFIG_CPU_SUP_UMC_32)		+= umc.o
			
 
				 
			
 
				-obj-$(CONFIG_INTEL_RDT_A)	+= intel_rdt.o intel_rdt_rdtgroup.o intel_rdt_schemata.o
			
 
				+obj-$(CONFIG_INTEL_RDT)	+= intel_rdt.o intel_rdt_rdtgroup.o intel_rdt_monitor.o intel_rdt_ctrlmondata.o
			
 
				 
			
 
				 obj-$(CONFIG_X86_MCE)			+= mcheck/
			
 
				 obj-$(CONFIG_MTRR)			+= mtrr/
			
--- a/arch/x86/kernel/cpu/intel_rdt.c
+++ b/arch/x86/kernel/cpu/intel_rdt.c
@@ -30,7 +30,8 @@
 
				 #include <linux/cpuhotplug.h>
			
 
				 
			
 
				 #include <asm/intel-family.h>
			
 
				-#include <asm/intel_rdt.h>
			
 
				+#include <asm/intel_rdt_sched.h>
			
 
				+#include "intel_rdt.h"
			
 
				 
			
 
				 #define MAX_MBA_BW	100u
			
 
				 #define MBA_IS_LINEAR	0x4
			
@@ -38,7 +39,13 @@
 
				 /* Mutex to protect rdtgroup access. */
			
 
				 DEFINE_MUTEX(rdtgroup_mutex);
			
 
				 
			
 
				-DEFINE_PER_CPU_READ_MOSTLY(int, cpu_closid);
			
 
				+/*
			
 
				+ * The cached intel_pqr_state is strictly per CPU and can never be
			
 
				+ * updated from a remote CPU. Functions which modify the state
			
 
				+ * are called with interrupts disabled and no preemption, which
			
 
				+ * is sufficient for the protection.
			
 
				+ */
			
 
				+DEFINE_PER_CPU(struct intel_pqr_state, pqr_state);
			
 
				 
			
 
				 /*
			
 
				  * Used to store the max resource name width and max resource data width
			
@@ -46,6 +53,12 @@ DEFINE_PER_CPU_READ_MOSTLY(int, cpu_closid);
 
				  */
			
 
				 int max_name_width, max_data_width;
			
 
				 
			
 
				+/*
			
 
				+ * Global boolean for rdt_alloc which is true if any
			
 
				+ * resource allocation is enabled.
			
 
				+ */
			
 
				+bool rdt_alloc_capable;
			
 
				+
			
 
				 static void
			
 
				 mba_wrmsr(struct rdt_domain *d, struct msr_param *m, struct rdt_resource *r);
			
 
				 static void
			
@@ -54,7 +67,9 @@ cat_wrmsr(struct rdt_domain *d, struct msr_param *m, struct rdt_resource *r);
 
				 #define domain_init(id) LIST_HEAD_INIT(rdt_resources_all[id].domains)
			
 
				 
			
 
				 struct rdt_resource rdt_resources_all[] = {
			
 
				+	[RDT_RESOURCE_L3] =
			
 
				 	{
			
 
				+		.rid			= RDT_RESOURCE_L3,
			
 
				 		.name			= "L3",
			
 
				 		.domains		= domain_init(RDT_RESOURCE_L3),
			
 
				 		.msr_base		= IA32_L3_CBM_BASE,
			
@@ -67,8 +82,11 @@ struct rdt_resource rdt_resources_all[] = {
 
				 		},
			
 
				 		.parse_ctrlval		= parse_cbm,
			
 
				 		.format_str		= "%d=%0*x",
			
 
				+		.fflags			= RFTYPE_RES_CACHE,
			
 
				 	},
			
 
				+	[RDT_RESOURCE_L3DATA] =
			
 
				 	{
			
 
				+		.rid			= RDT_RESOURCE_L3DATA,
			
 
				 		.name			= "L3DATA",
			
 
				 		.domains		= domain_init(RDT_RESOURCE_L3DATA),
			
 
				 		.msr_base		= IA32_L3_CBM_BASE,
			
@@ -81,8 +99,11 @@ struct rdt_resource rdt_resources_all[] = {
 
				 		},
			
 
				 		.parse_ctrlval		= parse_cbm,
			
 
				 		.format_str		= "%d=%0*x",
			
 
				+		.fflags			= RFTYPE_RES_CACHE,
			
 
				 	},
			
 
				+	[RDT_RESOURCE_L3CODE] =
			
 
				 	{
			
 
				+		.rid			= RDT_RESOURCE_L3CODE,
			
 
				 		.name			= "L3CODE",
			
 
				 		.domains		= domain_init(RDT_RESOURCE_L3CODE),
			
 
				 		.msr_base		= IA32_L3_CBM_BASE,
			
@@ -95,8 +116,11 @@ struct rdt_resource rdt_resources_all[] = {
 
				 		},
			
 
				 		.parse_ctrlval		= parse_cbm,
			
 
				 		.format_str		= "%d=%0*x",
			
 
				+		.fflags			= RFTYPE_RES_CACHE,
			
 
				 	},
			
 
				+	[RDT_RESOURCE_L2] =
			
 
				 	{
			
 
				+		.rid			= RDT_RESOURCE_L2,
			
 
				 		.name			= "L2",
			
 
				 		.domains		= domain_init(RDT_RESOURCE_L2),
			
 
				 		.msr_base		= IA32_L2_CBM_BASE,
			
@@ -109,8 +133,11 @@ struct rdt_resource rdt_resources_all[] = {
 
				 		},
			
 
				 		.parse_ctrlval		= parse_cbm,
			
 
				 		.format_str		= "%d=%0*x",
			
 
				+		.fflags			= RFTYPE_RES_CACHE,
			
 
				 	},
			
 
				+	[RDT_RESOURCE_MBA] =
			
 
				 	{
			
 
				+		.rid			= RDT_RESOURCE_MBA,
			
 
				 		.name			= "MB",
			
 
				 		.domains		= domain_init(RDT_RESOURCE_MBA),
			
 
				 		.msr_base		= IA32_MBA_THRTL_BASE,
			
@@ -118,6 +145,7 @@ struct rdt_resource rdt_resources_all[] = {
 
				 		.cache_level		= 3,
			
 
				 		.parse_ctrlval		= parse_bw,
			
 
				 		.format_str		= "%d=%*d",
			
 
				+		.fflags			= RFTYPE_RES_MB,
			
 
				 	},
			
 
				 };
			
 
				 
			
@@ -144,33 +172,28 @@ static unsigned int cbm_idx(struct rdt_resource *r, unsigned int closid)
 
				  * is always 20 on hsw server parts. The minimum cache bitmask length
			
 
				  * allowed for HSW server is always 2 bits. Hardcode all of them.
			
 
				  */
			
 
				-static inline bool cache_alloc_hsw_probe(void)
			
 
				+static inline void cache_alloc_hsw_probe(void)
			
 
				 {
			
 
				-	if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL &&
			
 
				-	    boot_cpu_data.x86 == 6 &&
			
 
				-	    boot_cpu_data.x86_model == INTEL_FAM6_HASWELL_X) {
			
 
				-		struct rdt_resource *r  = &rdt_resources_all[RDT_RESOURCE_L3];
			
 
				-		u32 l, h, max_cbm = BIT_MASK(20) - 1;
			
 
				-
			
 
				-		if (wrmsr_safe(IA32_L3_CBM_BASE, max_cbm, 0))
			
 
				-			return false;
			
 
				-		rdmsr(IA32_L3_CBM_BASE, l, h);
			
 
				+	struct rdt_resource *r  = &rdt_resources_all[RDT_RESOURCE_L3];
			
 
				+	u32 l, h, max_cbm = BIT_MASK(20) - 1;
			
 
				 
			
 
				-		/* If all the bits were set in MSR, return success */
			
 
				-		if (l != max_cbm)
			
 
				-			return false;
			
 
				+	if (wrmsr_safe(IA32_L3_CBM_BASE, max_cbm, 0))
			
 
				+		return;
			
 
				+	rdmsr(IA32_L3_CBM_BASE, l, h);
			
 
				 
			
 
				-		r->num_closid = 4;
			
 
				-		r->default_ctrl = max_cbm;
			
 
				-		r->cache.cbm_len = 20;
			
 
				-		r->cache.min_cbm_bits = 2;
			
 
				-		r->capable = true;
			
 
				-		r->enabled = true;
			
 
				+	/* If all the bits were set in MSR, return success */
			
 
				+	if (l != max_cbm)
			
 
				+		return;
			
 
				 
			
 
				-		return true;
			
 
				-	}
			
 
				+	r->num_closid = 4;
			
 
				+	r->default_ctrl = max_cbm;
			
 
				+	r->cache.cbm_len = 20;
			
 
				+	r->cache.shareable_bits = 0xc0000;
			
 
				+	r->cache.min_cbm_bits = 2;
			
 
				+	r->alloc_capable = true;
			
 
				+	r->alloc_enabled = true;
			
 
				 
			
 
				-	return false;
			
 
				+	rdt_alloc_capable = true;
			
 
				 }
			
 
				 
			
 
				 /*
			
@@ -213,15 +236,14 @@ static bool rdt_get_mem_config(struct rdt_resource *r)
 
				 			return false;
			
 
				 	}
			
 
				 	r->data_width = 3;
			
 
				-	rdt_get_mba_infofile(r);
			
 
				 
			
 
				-	r->capable = true;
			
 
				-	r->enabled = true;
			
 
				+	r->alloc_capable = true;
			
 
				+	r->alloc_enabled = true;
			
 
				 
			
 
				 	return true;
			
 
				 }
			
 
				 
			
 
				-static void rdt_get_cache_config(int idx, struct rdt_resource *r)
			
 
				+static void rdt_get_cache_alloc_cfg(int idx, struct rdt_resource *r)
			
 
				 {
			
 
				 	union cpuid_0x10_1_eax eax;
			
 
				 	union cpuid_0x10_x_edx edx;
			
@@ -231,10 +253,10 @@ static void rdt_get_cache_config(int idx, struct rdt_resource *r)
 
				 	r->num_closid = edx.split.cos_max + 1;
			
 
				 	r->cache.cbm_len = eax.split.cbm_len + 1;
			
 
				 	r->default_ctrl = BIT_MASK(eax.split.cbm_len + 1) - 1;
			
 
				+	r->cache.shareable_bits = ebx & r->default_ctrl;
			
 
				 	r->data_width = (r->cache.cbm_len + 3) / 4;
			
 
				-	rdt_get_cache_infofile(r);
			
 
				-	r->capable = true;
			
 
				-	r->enabled = true;
			
 
				+	r->alloc_capable = true;
			
 
				+	r->alloc_enabled = true;
			
 
				 }
			
 
				 
			
 
				 static void rdt_get_cdp_l3_config(int type)
			
@@ -246,12 +268,12 @@ static void rdt_get_cdp_l3_config(int type)
 
				 	r->cache.cbm_len = r_l3->cache.cbm_len;
			
 
				 	r->default_ctrl = r_l3->default_ctrl;
			
 
				 	r->data_width = (r->cache.cbm_len + 3) / 4;
			
 
				-	r->capable = true;
			
 
				+	r->alloc_capable = true;
			
 
				 	/*
			
 
				 	 * By default, CDP is disabled. CDP can be enabled by mount parameter
			
 
				 	 * "cdp" during resctrl file system mount time.
			
 
				 	 */
			
 
				-	r->enabled = false;
			
 
				+	r->alloc_enabled = false;
			
 
				 }
			
 
				 
			
 
				 static int get_cache_id(int cpu, int level)
			
@@ -300,6 +322,19 @@ cat_wrmsr(struct rdt_domain *d, struct msr_param *m, struct rdt_resource *r)
 
				 		wrmsrl(r->msr_base + cbm_idx(r, i), d->ctrl_val[i]);
			
 
				 }
			
 
				 
			
 
				+/*
			
 
				+ * get_domain_from_cpu - Look up the domain of resource @r containing @cpu
			
 
				+ *
			
 
				+ * Return: the first domain on @r's domain list whose cpu_mask has @cpu
			
 
				+ * set, or NULL when no domain matches.
			
 
				+ */
			
 
				+struct rdt_domain *get_domain_from_cpu(int cpu, struct rdt_resource *r)
			
 
				+{
			
 
				+	struct rdt_domain *dom;
			
 
				+
			
 
				+	list_for_each_entry(dom, &r->domains, list) {
			
 
				+		/* Skip domains that do not contain this CPU */
			
 
				+		if (!cpumask_test_cpu(cpu, &dom->cpu_mask))
			
 
				+			continue;
			
 
				+
			
 
				+		return dom;
			
 
				+	}
			
 
				+
			
 
				+	return NULL;
			
 
				+}
			
 
				+
			
 
				 void rdt_ctrl_update(void *arg)
			
 
				 {
			
 
				 	struct msr_param *m = arg;
			
@@ -307,12 +342,10 @@ void rdt_ctrl_update(void *arg)
 
				 	int cpu = smp_processor_id();
			
 
				 	struct rdt_domain *d;
			
 
				 
			
 
				-	list_for_each_entry(d, &r->domains, list) {
			
 
				-		/* Find the domain that contains this CPU */
			
 
				-		if (cpumask_test_cpu(cpu, &d->cpu_mask)) {
			
 
				-			r->msr_update(d, m, r);
			
 
				-			return;
			
 
				-		}
			
 
				+	d = get_domain_from_cpu(cpu, r);
			
 
				+	if (d) {
			
 
				+		r->msr_update(d, m, r);
			
 
				+		return;
			
 
				 	}
			
 
				 	pr_warn_once("cpu %d not found in any domain for resource %s\n",
			
 
				 		     cpu, r->name);
			
@@ -326,8 +359,8 @@ void rdt_ctrl_update(void *arg)
 
				  * caller, return the first domain whose id is bigger than the input id.
			
 
				  * The domain list is sorted by id in ascending order.
			
 
				  */
			
 
				-static struct rdt_domain *rdt_find_domain(struct rdt_resource *r, int id,
			
 
				-					  struct list_head **pos)
			
 
				+struct rdt_domain *rdt_find_domain(struct rdt_resource *r, int id,
			
 
				+				   struct list_head **pos)
			
 
				 {
			
 
				 	struct rdt_domain *d;
			
 
				 	struct list_head *l;
			
@@ -377,6 +410,44 @@ static int domain_setup_ctrlval(struct rdt_resource *r, struct rdt_domain *d)
 
				 	return 0;
			
 
				 }
			
 
				 
			
 
				+/*
			
 
				+ * domain_setup_mon_state - Allocate the monitoring state of a domain
			
 
				+ *
			
 
				+ * Allocates the busy-RMID bitmap and the MBM total/local arrays of @d,
			
 
				+ * each only when the matching monitoring feature is enabled, sized by
			
 
				+ * @r->num_rmid. Also initializes the limbo and overflow delayed work
			
 
				+ * and starts the overflow handler when MBM is enabled.
			
 
				+ *
			
 
				+ * Return: 0 on success, -ENOMEM if an allocation fails. On failure the
			
 
				+ * arrays allocated so far are freed again.
			
 
				+ */
			
 
				+static int domain_setup_mon_state(struct rdt_resource *r, struct rdt_domain *d)
			
 
				+{
			
 
				+	if (is_llc_occupancy_enabled()) {
			
 
				+		d->rmid_busy_llc = kcalloc(BITS_TO_LONGS(r->num_rmid),
			
 
				+					   sizeof(unsigned long),
			
 
				+					   GFP_KERNEL);
			
 
				+		if (!d->rmid_busy_llc)
			
 
				+			return -ENOMEM;
			
 
				+		INIT_DELAYED_WORK(&d->cqm_limbo, cqm_handle_limbo);
			
 
				+	}
			
 
				+
			
 
				+	if (is_mbm_total_enabled()) {
			
 
				+		d->mbm_total = kcalloc(r->num_rmid, sizeof(*d->mbm_total),
			
 
				+				       GFP_KERNEL);
			
 
				+		if (!d->mbm_total)
			
 
				+			goto free_busy_llc;
			
 
				+	}
			
 
				+
			
 
				+	if (is_mbm_local_enabled()) {
			
 
				+		d->mbm_local = kcalloc(r->num_rmid, sizeof(*d->mbm_local),
			
 
				+				       GFP_KERNEL);
			
 
				+		if (!d->mbm_local)
			
 
				+			goto free_mbm_total;
			
 
				+	}
			
 
				+
			
 
				+	if (is_mbm_enabled()) {
			
 
				+		INIT_DELAYED_WORK(&d->mbm_over, mbm_handle_overflow);
			
 
				+		mbm_setup_overflow_handler(d, MBM_OVERFLOW_INTERVAL);
			
 
				+	}
			
 
				+
			
 
				+	return 0;
			
 
				+
			
 
				+free_mbm_total:
			
 
				+	kfree(d->mbm_total);
			
 
				+free_busy_llc:
			
 
				+	kfree(d->rmid_busy_llc);
			
 
				+	return -ENOMEM;
			
 
				+}
			
 
				+
			
 
				 /*
			
 
				  * domain_add_cpu - Add a cpu to a resource's domain list.
			
 
				  *
			
@@ -412,14 +483,26 @@ static void domain_add_cpu(int cpu, struct rdt_resource *r)
 
				 		return;
			
 
				 
			
 
				 	d->id = id;
			
 
				+	cpumask_set_cpu(cpu, &d->cpu_mask);
			
 
				 
			
 
				-	if (domain_setup_ctrlval(r, d)) {
			
 
				+	if (r->alloc_capable && domain_setup_ctrlval(r, d)) {
			
 
				+		kfree(d);
			
 
				+		return;
			
 
				+	}
			
 
				+
			
 
				+	if (r->mon_capable && domain_setup_mon_state(r, d)) {
			
 
				 		kfree(d);
			
 
				 		return;
			
 
				 	}
			
 
				 
			
 
				-	cpumask_set_cpu(cpu, &d->cpu_mask);
			
 
				 	list_add_tail(&d->list, add_pos);
			
 
				+
			
 
				+	/*
			
 
				+	 * If resctrl is mounted, add
			
 
				+	 * per domain monitor data directories.
			
 
				+	 */
			
 
				+	if (static_branch_unlikely(&rdt_mon_enable_key))
			
 
				+		mkdir_mondata_subdir_allrdtgrp(r, d);
			
 
				 }
			
 
				 
			
 
				 static void domain_remove_cpu(int cpu, struct rdt_resource *r)
			
@@ -435,19 +518,58 @@ static void domain_remove_cpu(int cpu, struct rdt_resource *r)
 
				 
			
 
				 	cpumask_clear_cpu(cpu, &d->cpu_mask);
			
 
				 	if (cpumask_empty(&d->cpu_mask)) {
			
 
				+		/*
			
 
				+		 * If resctrl is mounted, remove all the
			
 
				+		 * per domain monitor data directories.
			
 
				+		 */
			
 
				+		if (static_branch_unlikely(&rdt_mon_enable_key))
			
 
				+			rmdir_mondata_subdir_allrdtgrp(r, d->id);
			
 
				 		kfree(d->ctrl_val);
			
 
				+		kfree(d->rmid_busy_llc);
			
 
				+		kfree(d->mbm_total);
			
 
				+		kfree(d->mbm_local);
			
 
				 		list_del(&d->list);
			
 
				+		if (is_mbm_enabled())
			
 
				+			cancel_delayed_work(&d->mbm_over);
			
 
				+		if (is_llc_occupancy_enabled() &&  has_busy_rmid(r, d)) {
			
 
				+			/*
			
 
				+			 * When a package is going down, forcefully
			
 
				+			 * decrement rmid->ebusy. There is no way to know
			
 
				+			 * that the L3 was flushed and hence may lead to
			
 
				+			 * incorrect counts in rare scenarios, but leaving
			
 
				+			 * the RMID as busy creates RMID leaks if the
			
 
				+			 * package never comes back.
			
 
				+			 */
			
 
				+			__check_limbo(d, true);
			
 
				+			cancel_delayed_work(&d->cqm_limbo);
			
 
				+		}
			
 
				+
			
 
				 		kfree(d);
			
 
				+		return;
			
 
				+	}
			
 
				+
			
 
				+	if (r == &rdt_resources_all[RDT_RESOURCE_L3]) {
			
 
				+		if (is_mbm_enabled() && cpu == d->mbm_work_cpu) {
			
 
				+			cancel_delayed_work(&d->mbm_over);
			
 
				+			mbm_setup_overflow_handler(d, 0);
			
 
				+		}
			
 
				+		if (is_llc_occupancy_enabled() && cpu == d->cqm_work_cpu &&
			
 
				+		    has_busy_rmid(r, d)) {
			
 
				+			cancel_delayed_work(&d->cqm_limbo);
			
 
				+			cqm_setup_limbo_handler(d, 0);
			
 
				+		}
			
 
				 	}
			
 
				 }
			
 
				 
			
 
				-static void clear_closid(int cpu)
			
 
				+static void clear_closid_rmid(int cpu)
			
 
				 {
			
 
				 	struct intel_pqr_state *state = this_cpu_ptr(&pqr_state);
			
 
				 
			
 
				-	per_cpu(cpu_closid, cpu) = 0;
			
 
				-	state->closid = 0;
			
 
				-	wrmsr(MSR_IA32_PQR_ASSOC, state->rmid, 0);
			
 
				+	state->default_closid = 0;
			
 
				+	state->default_rmid = 0;
			
 
				+	state->cur_closid = 0;
			
 
				+	state->cur_rmid = 0;
			
 
				+	wrmsr(IA32_PQR_ASSOC, 0, 0);
			
 
				 }
			
 
				 
			
 
				 static int intel_rdt_online_cpu(unsigned int cpu)
			
@@ -459,12 +581,23 @@ static int intel_rdt_online_cpu(unsigned int cpu)
 
				 		domain_add_cpu(cpu, r);
			
 
				 	/* The cpu is set in default rdtgroup after online. */
			
 
				 	cpumask_set_cpu(cpu, &rdtgroup_default.cpu_mask);
			
 
				-	clear_closid(cpu);
			
 
				+	clear_closid_rmid(cpu);
			
 
				 	mutex_unlock(&rdtgroup_mutex);
			
 
				 
			
 
				 	return 0;
			
 
				 }
			
 
				 
			
 
				+/*
			
 
				+ * clear_childcpus - Drop an offlined CPU from a group's child groups
			
 
				+ *
			
 
				+ * Walks the groups on @r's mon.crdtgrp_list and clears @cpu from the
			
 
				+ * first child cpu_mask that has it set; later children are not visited.
			
 
				+ */
			
 
				+static void clear_childcpus(struct rdtgroup *r, unsigned int cpu)
			
 
				+{
			
 
				+	struct rdtgroup *child;
			
 
				+
			
 
				+	list_for_each_entry(child, &r->mon.crdtgrp_list, mon.crdtgrp_list)
			
 
				+		if (cpumask_test_and_clear_cpu(cpu, &child->cpu_mask))
			
 
				+			return;
			
 
				+}
			
 
				+
			
 
				 static int intel_rdt_offline_cpu(unsigned int cpu)
			
 
				 {
			
 
				 	struct rdtgroup *rdtgrp;
			
@@ -474,10 +607,12 @@ static int intel_rdt_offline_cpu(unsigned int cpu)
 
				 	for_each_capable_rdt_resource(r)
			
 
				 		domain_remove_cpu(cpu, r);
			
 
				 	list_for_each_entry(rdtgrp, &rdt_all_groups, rdtgroup_list) {
			
 
				-		if (cpumask_test_and_clear_cpu(cpu, &rdtgrp->cpu_mask))
			
 
				+		if (cpumask_test_and_clear_cpu(cpu, &rdtgrp->cpu_mask)) {
			
 
				+			clear_childcpus(rdtgrp, cpu);
			
 
				 			break;
			
 
				+		}
			
 
				 	}
			
 
				-	clear_closid(cpu);
			
 
				+	clear_closid_rmid(cpu);
			
 
				 	mutex_unlock(&rdtgroup_mutex);
			
 
				 
			
 
				 	return 0;
			
@@ -492,7 +627,7 @@ static __init void rdt_init_padding(void)
 
				 	struct rdt_resource *r;
			
 
				 	int cl;
			
 
				 
			
 
				-	for_each_capable_rdt_resource(r) {
			
 
				+	for_each_alloc_capable_rdt_resource(r) {
			
 
				 		cl = strlen(r->name);
			
 
				 		if (cl > max_name_width)
			
 
				 			max_name_width = cl;
			
@@ -502,38 +637,153 @@ static __init void rdt_init_padding(void)
 
				 	}
			
 
				 }
			
 
				 
			
 
				-static __init bool get_rdt_resources(void)
			
 
				+/* Indices into rdt_options[], one per feature the "rdt" option can name. */
			
 
				+enum {
			
 
				+	RDT_FLAG_CMT,
			
 
				+	RDT_FLAG_MBM_TOTAL,
			
 
				+	RDT_FLAG_MBM_LOCAL,
			
 
				+	RDT_FLAG_L3_CAT,
			
 
				+	RDT_FLAG_L3_CDP,
			
 
				+	RDT_FLAG_L2_CAT,
			
 
				+	RDT_FLAG_MBA,
			
 
				+};
			
 
				+
			
 
				+/* Designated initializer for the rdt_options[] slot @idx. */
			
 
				+#define RDT_OPT(idx, n, f)	\
			
 
				+[idx] = {			\
			
 
				+	.name = n,		\
			
 
				+	.flag = f		\
			
 
				+}
			
 
				+
			
 
				+/*
			
 
				+ * struct rdt_options - One feature selectable with the "rdt" option
			
 
				+ * @name:	option name matched by set_rdt_options()
			
 
				+ * @flag:	CPU feature flag matched by rdt_cpu_has()
			
 
				+ * @force_off:	set when the name was given with a '!' prefix
			
 
				+ * @force_on:	set when the name was given without a prefix
			
 
				+ */
			
 
				+struct rdt_options {
			
 
				+	char	*name;
			
 
				+	int	flag;
			
 
				+	bool	force_off, force_on;
			
 
				+};
			
 
				+
			
 
				+/* Option table; force_off/force_on start out false for every entry. */
			
 
				+static struct rdt_options rdt_options[]  __initdata = {
			
 
				+	RDT_OPT(RDT_FLAG_CMT,	    "cmt",	X86_FEATURE_CQM_OCCUP_LLC),
			
 
				+	RDT_OPT(RDT_FLAG_MBM_TOTAL, "mbmtotal", X86_FEATURE_CQM_MBM_TOTAL),
			
 
				+	RDT_OPT(RDT_FLAG_MBM_LOCAL, "mbmlocal", X86_FEATURE_CQM_MBM_LOCAL),
			
 
				+	RDT_OPT(RDT_FLAG_L3_CAT,    "l3cat",	X86_FEATURE_CAT_L3),
			
 
				+	RDT_OPT(RDT_FLAG_L3_CDP,    "l3cdp",	X86_FEATURE_CDP_L3),
			
 
				+	RDT_OPT(RDT_FLAG_L2_CAT,    "l2cat",	X86_FEATURE_CAT_L2),
			
 
				+	RDT_OPT(RDT_FLAG_MBA,	    "mba",	X86_FEATURE_MBA),
			
 
				+};
			
 
				+#define NUM_RDT_OPTIONS ARRAY_SIZE(rdt_options)
			
 
				+
			
 
				+/*
			
 
				+ * set_rdt_options - Parse a comma separated list of RDT option names
			
 
				+ * @str:	the list; a leading '=' is skipped
			
 
				+ *
			
 
				+ * A name prefixed with '!' marks the matching rdt_options[] entry as
			
 
				+ * force_off, a bare name marks it as force_on. Names that match no
			
 
				+ * entry are ignored. @str is modified in place by strsep().
			
 
				+ *
			
 
				+ * Return: always 1.
			
 
				+ */
			
 
				+static int __init set_rdt_options(char *str)
			
 
				+{
			
 
				+	char *name;
			
 
				+	size_t i;
			
 
				+
			
 
				+	if (*str == '=')
			
 
				+		str++;
			
 
				+
			
 
				+	while ((name = strsep(&str, ",")) != NULL) {
			
 
				+		bool negate = (*name == '!');
			
 
				+
			
 
				+		if (negate)
			
 
				+			name++;
			
 
				+
			
 
				+		for (i = 0; i < NUM_RDT_OPTIONS; i++) {
			
 
				+			if (strcmp(name, rdt_options[i].name) != 0)
			
 
				+				continue;
			
 
				+
			
 
				+			if (negate)
			
 
				+				rdt_options[i].force_off = true;
			
 
				+			else
			
 
				+				rdt_options[i].force_on = true;
			
 
				+			break;
			
 
				+		}
			
 
				+	}
			
 
				+
			
 
				+	return 1;
			
 
				+}
			
 
				+__setup("rdt", set_rdt_options);
			
 
				+
			
 
				+/*
			
 
				+ * rdt_cpu_has - Test a CPU feature, honouring the rdt option overrides
			
 
				+ * @flag:	CPU feature flag to test
			
 
				+ *
			
 
				+ * Return: false when the boot CPU lacks @flag. Otherwise true, unless
			
 
				+ * the matching rdt_options[] entry is force_off without force_on.
			
 
				+ * force_on cannot enable a feature the boot CPU does not report.
			
 
				+ */
			
 
				+static bool __init rdt_cpu_has(int flag)
			
 
				+{
			
 
				+	size_t i;
			
 
				+
			
 
				+	if (!boot_cpu_has(flag))
			
 
				+		return false;
			
 
				+
			
 
				+	for (i = 0; i < NUM_RDT_OPTIONS; i++) {
			
 
				+		const struct rdt_options *opt = &rdt_options[i];
			
 
				+
			
 
				+		if (opt->flag != flag)
			
 
				+			continue;
			
 
				+
			
 
				+		/* force_on wins when both overrides are set */
			
 
				+		if (opt->force_on)
			
 
				+			return true;
			
 
				+
			
 
				+		return !opt->force_off;
			
 
				+	}
			
 
				+
			
 
				+	return true;
			
 
				+}
			
 
				+
			
 
				+static __init bool get_rdt_alloc_resources(void)
			
 
				 {
			
 
				 	bool ret = false;
			
 
				 
			
 
				-	if (cache_alloc_hsw_probe())
			
 
				+	if (rdt_alloc_capable)
			
 
				 		return true;
			
 
				 
			
 
				 	if (!boot_cpu_has(X86_FEATURE_RDT_A))
			
 
				 		return false;
			
 
				 
			
 
				-	if (boot_cpu_has(X86_FEATURE_CAT_L3)) {
			
 
				-		rdt_get_cache_config(1, &rdt_resources_all[RDT_RESOURCE_L3]);
			
 
				-		if (boot_cpu_has(X86_FEATURE_CDP_L3)) {
			
 
				+	if (rdt_cpu_has(X86_FEATURE_CAT_L3)) {
			
 
				+		rdt_get_cache_alloc_cfg(1, &rdt_resources_all[RDT_RESOURCE_L3]);
			
 
				+		if (rdt_cpu_has(X86_FEATURE_CDP_L3)) {
			
 
				 			rdt_get_cdp_l3_config(RDT_RESOURCE_L3DATA);
			
 
				 			rdt_get_cdp_l3_config(RDT_RESOURCE_L3CODE);
			
 
				 		}
			
 
				 		ret = true;
			
 
				 	}
			
 
				-	if (boot_cpu_has(X86_FEATURE_CAT_L2)) {
			
 
				+	if (rdt_cpu_has(X86_FEATURE_CAT_L2)) {
			
 
				 		/* CPUID 0x10.2 fields are same format at 0x10.1 */
			
 
				-		rdt_get_cache_config(2, &rdt_resources_all[RDT_RESOURCE_L2]);
			
 
				+		rdt_get_cache_alloc_cfg(2, &rdt_resources_all[RDT_RESOURCE_L2]);
			
 
				 		ret = true;
			
 
				 	}
			
 
				 
			
 
				-	if (boot_cpu_has(X86_FEATURE_MBA)) {
			
 
				+	if (rdt_cpu_has(X86_FEATURE_MBA)) {
			
 
				 		if (rdt_get_mem_config(&rdt_resources_all[RDT_RESOURCE_MBA]))
			
 
				 			ret = true;
			
 
				 	}
			
 
				-
			
 
				 	return ret;
			
 
				 }
			
 
				 
			
 
				+/*
			
 
				+ * get_rdt_mon_resources - Detect the available monitoring events
			
 
				+ *
			
 
				+ * Sets one bit in rdt_mon_features per monitoring event that
			
 
				+ * rdt_cpu_has() reports, then configures L3 monitoring.
			
 
				+ *
			
 
				+ * Return: true when at least one event was found and
			
 
				+ * rdt_get_mon_l3_config() returned 0, false otherwise.
			
 
				+ */
			
 
				+static __init bool get_rdt_mon_resources(void)
			
 
				+{
			
 
				+	struct rdt_resource *l3 = &rdt_resources_all[RDT_RESOURCE_L3];
			
 
				+
			
 
				+	if (rdt_cpu_has(X86_FEATURE_CQM_OCCUP_LLC))
			
 
				+		rdt_mon_features |= 1 << QOS_L3_OCCUP_EVENT_ID;
			
 
				+
			
 
				+	if (rdt_cpu_has(X86_FEATURE_CQM_MBM_TOTAL))
			
 
				+		rdt_mon_features |= 1 << QOS_L3_MBM_TOTAL_EVENT_ID;
			
 
				+
			
 
				+	if (rdt_cpu_has(X86_FEATURE_CQM_MBM_LOCAL))
			
 
				+		rdt_mon_features |= 1 << QOS_L3_MBM_LOCAL_EVENT_ID;
			
 
				+
			
 
				+	if (rdt_mon_features == 0)
			
 
				+		return false;
			
 
				+
			
 
				+	return rdt_get_mon_l3_config(l3) == 0;
			
 
				+}
			
 
				+
			
 
				+/*
			
 
				+ * rdt_quirks - Apply model specific RDT workarounds
			
 
				+ *
			
 
				+ * Haswell server: probe for cache allocation unless "l3cat" was forced
			
 
				+ * off. Skylake server up to stepping 4: force off cmt, mbmtotal,
			
 
				+ * mbmlocal and l3cat.
			
 
				+ */
			
 
				+static __init void rdt_quirks(void)
			
 
				+{
			
 
				+	/* The model numbers below are only meaningful on Intel family 6. */
			
 
				+	if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL ||
			
 
				+	    boot_cpu_data.x86 != 6)
			
 
				+		return;
			
 
				+
			
 
				+	switch (boot_cpu_data.x86_model) {
			
 
				+	case INTEL_FAM6_HASWELL_X:
			
 
				+		if (!rdt_options[RDT_FLAG_L3_CAT].force_off)
			
 
				+			cache_alloc_hsw_probe();
			
 
				+		break;
			
 
				+	case INTEL_FAM6_SKYLAKE_X:
			
 
				+		if (boot_cpu_data.x86_mask <= 4) {
			
 
				+			/*
			
 
				+			 * set_rdt_options() tokenizes with strsep(), which
			
 
				+			 * writes to its input, so hand it a writable copy
			
 
				+			 * instead of a string literal.
			
 
				+			 */
			
 
				+			char skx_off[] = "!cmt,!mbmtotal,!mbmlocal,!l3cat";
			
 
				+
			
 
				+			set_rdt_options(skx_off);
			
 
				+		}
			
 
				+		break;
			
 
				+	default:
			
 
				+		break;
			
 
				+	}
			
 
				+}
			
 
				+
			
 
				+/*
			
 
				+ * get_rdt_resources - Probe for RDT allocation and monitoring support
			
 
				+ *
			
 
				+ * Return: true when allocation or monitoring (or both) is available.
			
 
				+ */
			
 
				+static __init bool get_rdt_resources(void)
			
 
				+{
			
 
				+	/* Quirks go first: they can adjust the options used by the probes. */
			
 
				+	rdt_quirks();
			
 
				+
			
 
				+	rdt_alloc_capable = get_rdt_alloc_resources();
			
 
				+	rdt_mon_capable = get_rdt_mon_resources();
			
 
				+
			
 
				+	return rdt_alloc_capable || rdt_mon_capable;
			
 
				+}
			
 
				+
			
 
				 static int __init intel_rdt_late_init(void)
			
 
				 {
			
 
				 	struct rdt_resource *r;
			
@@ -556,9 +806,12 @@ static int __init intel_rdt_late_init(void)
 
				 		return ret;
			
 
				 	}
			
 
				 
			
 
				-	for_each_capable_rdt_resource(r)
			
 
				+	for_each_alloc_capable_rdt_resource(r)
			
 
				 		pr_info("Intel RDT %s allocation detected\n", r->name);
			
 
				 
			
 
				+	for_each_mon_capable_rdt_resource(r)
			
 
				+		pr_info("Intel RDT %s monitoring detected\n", r->name);
			
 
				+
			
 
				 	return 0;
			
 
				 }
			
 
				 
			
--- a/arch/x86/kernel/cpu/intel_rdt.h
+++ b/arch/x86/kernel/cpu/intel_rdt.h
@@ -0,0 +1,440 @@
 
				+#ifndef _ASM_X86_INTEL_RDT_H
			
 
				+#define _ASM_X86_INTEL_RDT_H
			
 
				+
			
 
				+#include <linux/sched.h>
			
 
				+#include <linux/kernfs.h>
			
 
				+#include <linux/jump_label.h>
			
 
				+
			
 
				+#define IA32_L3_QOS_CFG		0xc81
			
 
				+#define IA32_L3_CBM_BASE	0xc90
			
 
				+#define IA32_L2_CBM_BASE	0xd10
			
 
				+#define IA32_MBA_THRTL_BASE	0xd50
			
 
				+
			
 
				+#define L3_QOS_CDP_ENABLE	0x01ULL
			
 
				+
			
 
				+/*
			
 
				+ * Event IDs are used to program IA32_QM_EVTSEL before reading event
			
 
				+ * counter from IA32_QM_CTR
			
 
				+ */
			
 
				+#define QOS_L3_OCCUP_EVENT_ID		0x01
			
 
				+#define QOS_L3_MBM_TOTAL_EVENT_ID	0x02
			
 
				+#define QOS_L3_MBM_LOCAL_EVENT_ID	0x03
			
 
				+
			
 
				+#define CQM_LIMBOCHECK_INTERVAL	1000
			
 
				+
			
 
				+#define MBM_CNTR_WIDTH			24
			
 
				+#define MBM_OVERFLOW_INTERVAL		1000
			
 
				+
			
 
				+#define RMID_VAL_ERROR			BIT_ULL(63)
			
 
				+#define RMID_VAL_UNAVAIL		BIT_ULL(62)
			
 
				+
			
 
				+DECLARE_STATIC_KEY_FALSE(rdt_enable_key);
			
 
				+
			
 
				+/**
			
 
				+ * struct mon_evt - Entry in the event list of a resource
			
 
				+ * @evtid:		event id
			
 
				+ * @name:		name of the event
				+ * @list:		entry in the resource's event list
			
 
				+ */
			
 
				+struct mon_evt {
			
 
				+	u32			evtid;
			
 
				+	char			*name;
			
 
				+	struct list_head	list;
			
 
				+};
			
 
				+
			
 
				+/**
			
 
				+ * union mon_data_bits - Monitoring details for each event file
			
 
				+ * @rid:               Resource id associated with the event file.
			
 
				+ * @evtid:             Event id associated with the event file
			
 
				+ * @domid:             The domain to which the event file belongs
			
 
				+ */
			
 
				+union mon_data_bits {
			
 
				+	void *priv;
			
 
				+	struct {
			
 
				+		unsigned int rid	: 10;
			
 
				+		unsigned int evtid	: 8;
			
 
				+		unsigned int domid	: 14;
			
 
				+	} u;
			
 
				+};
			
 
				+
			
 
				+struct rmid_read {
			
 
				+	struct rdtgroup		*rgrp;
			
 
				+	struct rdt_domain	*d;
			
 
				+	int			evtid;
			
 
				+	bool			first;
			
 
				+	u64			val;
			
 
				+};
			
 
				+
			
 
				+extern unsigned int intel_cqm_threshold;
			
 
				+extern bool rdt_alloc_capable;
			
 
				+extern bool rdt_mon_capable;
			
 
				+extern unsigned int rdt_mon_features;
			
 
				+
			
 
				+enum rdt_group_type {
			
 
				+	RDTCTRL_GROUP = 0,
			
 
				+	RDTMON_GROUP,
			
 
				+	RDT_NUM_GROUP,
			
 
				+};
			
 
				+
			
 
				+/**
			
 
				+ * struct mongroup - store mon group's data in resctrl fs.
			
 
				+ * @mon_data_kn:		kernfs node for the mon_data directory
			
 
				+ * @parent:			parent rdtgrp
			
 
				+ * @crdtgrp_list:		child rdtgroup node list
			
 
				+ * @rmid:			rmid for this rdtgroup
			
 
				+ */
			
 
				+struct mongroup {
			
 
				+	struct kernfs_node	*mon_data_kn;
			
 
				+	struct rdtgroup		*parent;
			
 
				+	struct list_head	crdtgrp_list;
			
 
				+	u32			rmid;
			
 
				+};
			
 
				+
			
 
				+/**
			
 
				+ * struct rdtgroup - store rdtgroup's data in resctrl file system.
			
 
				+ * @kn:				kernfs node
			
 
				+ * @rdtgroup_list:		linked list for all rdtgroups
			
 
				+ * @closid:			closid for this rdtgroup
			
 
				+ * @cpu_mask:			CPUs assigned to this rdtgroup
			
 
				+ * @flags:			status bits
			
 
				+ * @waitcount:			how many cpus expect to find this
			
 
				+ *				group when they acquire rdtgroup_mutex
			
 
				+ * @type:			indicates type of this rdtgroup - either
			
 
				+ *				monitor only or ctrl_mon group
			
 
				+ * @mon:			mongroup related data
			
 
				+ */
			
 
				+struct rdtgroup {
			
 
				+	struct kernfs_node	*kn;
			
 
				+	struct list_head	rdtgroup_list;
			
 
				+	u32			closid;
			
 
				+	struct cpumask		cpu_mask;
			
 
				+	int			flags;
			
 
				+	atomic_t		waitcount;
			
 
				+	enum rdt_group_type	type;
			
 
				+	struct mongroup		mon;
			
 
				+};
			
 
				+
			
 
				+/* rdtgroup.flags */
			
 
				+#define	RDT_DELETED		1
			
 
				+
			
 
				+/* rftype.flags */
			
 
				+#define RFTYPE_FLAGS_CPUS_LIST	1
			
 
				+
			
 
				+/*
			
 
				+ * Define the file type flags for base and info directories.
			
 
				+ */
			
 
				+#define RFTYPE_INFO			BIT(0)
			
 
				+#define RFTYPE_BASE			BIT(1)
			
 
				+#define RF_CTRLSHIFT			4
			
 
				+#define RF_MONSHIFT			5
			
 
				+#define RFTYPE_CTRL			BIT(RF_CTRLSHIFT)
			
 
				+#define RFTYPE_MON			BIT(RF_MONSHIFT)
			
 
				+#define RFTYPE_RES_CACHE		BIT(8)
			
 
				+#define RFTYPE_RES_MB			BIT(9)
			
 
				+#define RF_CTRL_INFO			(RFTYPE_INFO | RFTYPE_CTRL)
			
 
				+#define RF_MON_INFO			(RFTYPE_INFO | RFTYPE_MON)
			
 
				+#define RF_CTRL_BASE			(RFTYPE_BASE | RFTYPE_CTRL)
			
 
				+
			
 
				+/* List of all resource groups */
			
 
				+extern struct list_head rdt_all_groups;
			
 
				+
			
 
				+extern int max_name_width, max_data_width;
			
 
				+
			
 
				+int __init rdtgroup_init(void);
			
 
				+
			
 
				+/**
			
 
				+ * struct rftype - describe each file in the resctrl file system
			
 
				+ * @name:	File name
			
 
				+ * @mode:	Access mode
			
 
				+ * @kf_ops:	File operations
			
 
				+ * @flags:	File specific RFTYPE_FLAGS_* flags
			
 
				+ * @fflags:	File specific RF_* or RFTYPE_* flags
			
 
				+ * @seq_show:	Show content of the file
			
 
				+ * @write:	Write to the file
			
 
				+ */
			
 
				+struct rftype {
			
 
				+	char			*name;
			
 
				+	umode_t			mode;
			
 
				+	struct kernfs_ops	*kf_ops;
			
 
				+	unsigned long		flags;
			
 
				+	unsigned long		fflags;
			
 
				+
			
 
				+	int (*seq_show)(struct kernfs_open_file *of,
			
 
				+			struct seq_file *sf, void *v);
			
 
				+	/*
			
 
				+	 * write() is the generic write callback which maps directly to
			
 
				+	 * kernfs write operation and overrides all other operations.
			
 
				+	 * Maximum write size is determined by ->max_write_len.
			
 
				+	 */
			
 
				+	ssize_t (*write)(struct kernfs_open_file *of,
			
 
				+			 char *buf, size_t nbytes, loff_t off);
			
 
				+};
			
 
				+
			
 
				+/**
			
 
				+ * struct mbm_state - status for each MBM counter in each domain
			
 
				+ * @chunks:	Total data moved (multiply by rdt_resource.mon_scale to get bytes)
			
 
				+ * @prev_msr:	Value of IA32_QM_CTR for this RMID last time we read it
			
 
				+ */
			
 
				+struct mbm_state {
			
 
				+	u64	chunks;
			
 
				+	u64	prev_msr;
			
 
				+};
			
 
				+
			
 
				+/**
			
 
				+ * struct rdt_domain - group of cpus sharing an RDT resource
			
 
				+ * @list:	all instances of this resource
			
 
				+ * @id:		unique id for this instance
			
 
				+ * @cpu_mask:	which cpus share this resource
			
 
				+ * @rmid_busy_llc:
			
 
				+ *		bitmap of which limbo RMIDs are above threshold
			
 
				+ * @mbm_total:	saved state for MBM total bandwidth
			
 
				+ * @mbm_local:	saved state for MBM local bandwidth
			
 
				+ * @mbm_over:	worker to periodically read MBM h/w counters
			
 
				+ * @cqm_limbo:	worker to periodically read CQM h/w counters
			
 
				+ * @mbm_work_cpu:
			
 
				+ *		worker cpu for MBM h/w counters
			
 
				+ * @cqm_work_cpu:
			
 
				+ *		worker cpu for CQM h/w counters
			
 
				+ * @ctrl_val:	array of cache or mem ctrl values (indexed by CLOSID)
			
 
				+ * @new_ctrl:	new ctrl value to be loaded
			
 
				+ * @have_new_ctrl: did user provide new_ctrl for this domain
			
 
				+ */
			
 
				+struct rdt_domain {
			
 
				+	struct list_head	list;
			
 
				+	int			id;
			
 
				+	struct cpumask		cpu_mask;
			
 
				+	unsigned long		*rmid_busy_llc;
			
 
				+	struct mbm_state	*mbm_total;
			
 
				+	struct mbm_state	*mbm_local;
			
 
				+	struct delayed_work	mbm_over;
			
 
				+	struct delayed_work	cqm_limbo;
			
 
				+	int			mbm_work_cpu;
			
 
				+	int			cqm_work_cpu;
			
 
				+	u32			*ctrl_val;
			
 
				+	u32			new_ctrl;
			
 
				+	bool			have_new_ctrl;
			
 
				+};
			
 
				+
			
 
				+/**
			
 
				+ * struct msr_param - set a range of MSRs from a domain
			
 
				+ * @res:       The resource to use
			
 
				+ * @low:       Beginning index from base MSR
			
 
				+ * @high:      End index
			
 
				+ */
			
 
				+struct msr_param {
			
 
				+	struct rdt_resource	*res;
			
 
				+	int			low;
			
 
				+	int			high;
			
 
				+};
			
 
				+
			
 
				+/**
			
 
				+ * struct rdt_cache - Cache allocation related data
			
 
				+ * @cbm_len:		Length of the cache bit mask
			
 
				+ * @min_cbm_bits:	Minimum number of consecutive bits to be set
			
 
				+ * @cbm_idx_mult:	Multiplier of CBM index
			
 
				+ * @cbm_idx_offset:	Offset of CBM index. CBM index is computed by:
			
 
				+ *			closid * cbm_idx_mult + cbm_idx_offset
			
 
				+ *			in a cache bit mask
			
 
				+ * @shareable_bits:	Bitmask of shareable resource with other
			
 
				+ *			executing entities
			
 
				+ */
			
 
				+struct rdt_cache {
			
 
				+	unsigned int	cbm_len;
			
 
				+	unsigned int	min_cbm_bits;
			
 
				+	unsigned int	cbm_idx_mult;
			
 
				+	unsigned int	cbm_idx_offset;
			
 
				+	unsigned int	shareable_bits;
			
 
				+};
			
 
				+
			
 
				+/**
			
 
				+ * struct rdt_membw - Memory bandwidth allocation related data
			
 
				+ * @max_delay:		Max throttle delay. Delay is the hardware
			
 
				+ *			representation for memory bandwidth.
			
 
				+ * @min_bw:		Minimum memory bandwidth percentage user can request
			
 
				+ * @bw_gran:		Granularity at which the memory bandwidth is allocated
			
 
				+ * @delay_linear:	True if memory B/W delay is in linear scale
			
 
				+ * @mb_map:		Mapping of memory B/W percentage to memory B/W delay
			
 
				+ */
			
 
				+struct rdt_membw {
			
 
				+	u32		max_delay;
			
 
				+	u32		min_bw;
			
 
				+	u32		bw_gran;
			
 
				+	u32		delay_linear;
			
 
				+	u32		*mb_map;
			
 
				+};
			
 
				+
			
 
				+/* True when the L3 occupancy event bit is set in rdt_mon_features. */
				+static inline bool is_llc_occupancy_enabled(void)
				+{
				+	const unsigned int occup_bit = 1 << QOS_L3_OCCUP_EVENT_ID;
				+
				+	return (rdt_mon_features & occup_bit) != 0;
				+}
			
 
				+
			
 
				+/* True when the MBM total event bit is set in rdt_mon_features. */
				+static inline bool is_mbm_total_enabled(void)
				+{
				+	const unsigned int total_bit = 1 << QOS_L3_MBM_TOTAL_EVENT_ID;
				+
				+	return (rdt_mon_features & total_bit) != 0;
				+}
			
 
				+
			
 
				+/* True when the MBM local event bit is set in rdt_mon_features. */
				+static inline bool is_mbm_local_enabled(void)
				+{
				+	const unsigned int local_bit = 1 << QOS_L3_MBM_LOCAL_EVENT_ID;
				+
				+	return (rdt_mon_features & local_bit) != 0;
				+}
			
 
				+
			
 
				+/* True when either MBM event (total or local) is enabled. */
				+static inline bool is_mbm_enabled(void)
				+{
				+	if (is_mbm_total_enabled())
				+		return true;
				+
				+	return is_mbm_local_enabled();
				+}
			
 
				+
			
 
				+/*
				+ * True when event id @e lies in the inclusive range
				+ * [QOS_L3_MBM_TOTAL_EVENT_ID, QOS_L3_MBM_LOCAL_EVENT_ID].
				+ */
				+static inline bool is_mbm_event(int e)
				+{
				+	if (e < QOS_L3_MBM_TOTAL_EVENT_ID)
				+		return false;
				+
				+	return e <= QOS_L3_MBM_LOCAL_EVENT_ID;
				+}
			
 
				+
			
 
				+/**
			
 
				+ * struct rdt_resource - attributes of an RDT resource
			
 
				+ * @rid:		The index of the resource
			
 
				+ * @alloc_enabled:	Is allocation enabled on this machine
			
 
				+ * @mon_enabled:		Is monitoring enabled for this feature
			
 
				+ * @alloc_capable:	Is allocation available on this machine
			
 
				+ * @mon_capable:		Is monitor feature available on this machine
			
 
				+ * @name:		Name to use in "schemata" file
			
 
				+ * @num_closid:		Number of CLOSIDs available
			
 
				+ * @cache_level:	Which cache level defines scope of this resource
			
 
				+ * @default_ctrl:	Specifies default cache cbm or memory B/W percent.
			
 
				+ * @msr_base:		Base MSR address for CBMs
			
 
				+ * @msr_update:		Function pointer to update QOS MSRs
			
 
				+ * @data_width:		Character width of data when displaying
			
 
				+ * @domains:		All domains for this resource
			
 
				+ * @cache:		Cache allocation related data
				+ * @membw:		Memory bandwidth allocation related data
			
 
				+ * @format_str:		Per resource format string to show domain value
			
 
				+ * @parse_ctrlval:	Per resource function pointer to parse control values
			
 
				+ * @evt_list:			List of monitoring events
			
 
				+ * @num_rmid:			Number of RMIDs available
			
 
				+ * @mon_scale:			cqm counter * mon_scale = occupancy in bytes
			
 
				+ * @fflags:			flags to choose base and info files
			
 
				+ */
			
 
				+struct rdt_resource {
			
 
				+	int			rid;
			
 
				+	bool			alloc_enabled;
			
 
				+	bool			mon_enabled;
			
 
				+	bool			alloc_capable;
			
 
				+	bool			mon_capable;
			
 
				+	char			*name;
			
 
				+	int			num_closid;
			
 
				+	int			cache_level;
			
 
				+	u32			default_ctrl;
			
 
				+	unsigned int		msr_base;
			
 
				+	void (*msr_update)	(struct rdt_domain *d, struct msr_param *m,
			
 
				+				 struct rdt_resource *r);
			
 
				+	int			data_width;
			
 
				+	struct list_head	domains;
			
 
				+	struct rdt_cache	cache;
			
 
				+	struct rdt_membw	membw;
			
 
				+	const char		*format_str;
			
 
				+	int (*parse_ctrlval)	(char *buf, struct rdt_resource *r,
			
 
				+				 struct rdt_domain *d);
			
 
				+	struct list_head	evt_list;
			
 
				+	int			num_rmid;
			
 
				+	unsigned int		mon_scale;
			
 
				+	unsigned long		fflags;
			
 
				+};
			
 
				+
			
 
				+int parse_cbm(char *buf, struct rdt_resource *r, struct rdt_domain *d);
			
 
				+int parse_bw(char *buf, struct rdt_resource *r,  struct rdt_domain *d);
			
 
				+
			
 
				+extern struct mutex rdtgroup_mutex;
			
 
				+
			
 
				+extern struct rdt_resource rdt_resources_all[];
			
 
				+extern struct rdtgroup rdtgroup_default;
			
 
				+DECLARE_STATIC_KEY_FALSE(rdt_alloc_enable_key);
			
 
				+
			
 
				+int __init rdtgroup_init(void);
			
 
				+
			
 
				+enum {
			
 
				+	RDT_RESOURCE_L3,
			
 
				+	RDT_RESOURCE_L3DATA,
			
 
				+	RDT_RESOURCE_L3CODE,
			
 
				+	RDT_RESOURCE_L2,
			
 
				+	RDT_RESOURCE_MBA,
			
 
				+
			
 
				+	/* Must be the last */
			
 
				+	RDT_NUM_RESOURCES,
			
 
				+};
			
 
				+
			
 
				+#define for_each_capable_rdt_resource(r)				      \
			
 
				+	for (r = rdt_resources_all; r < rdt_resources_all + RDT_NUM_RESOURCES;\
			
 
				+	     r++)							      \
			
 
				+		if (r->alloc_capable || r->mon_capable)
			
 
				+
			
 
				+#define for_each_alloc_capable_rdt_resource(r)				      \
			
 
				+	for (r = rdt_resources_all; r < rdt_resources_all + RDT_NUM_RESOURCES;\
			
 
				+	     r++)							      \
			
 
				+		if (r->alloc_capable)
			
 
				+
			
 
				+#define for_each_mon_capable_rdt_resource(r)				      \
			
 
				+	for (r = rdt_resources_all; r < rdt_resources_all + RDT_NUM_RESOURCES;\
			
 
				+	     r++)							      \
			
 
				+		if (r->mon_capable)
			
 
				+
			
 
				+#define for_each_alloc_enabled_rdt_resource(r)				      \
			
 
				+	for (r = rdt_resources_all; r < rdt_resources_all + RDT_NUM_RESOURCES;\
			
 
				+	     r++)							      \
			
 
				+		if (r->alloc_enabled)
			
 
				+
			
 
				+#define for_each_mon_enabled_rdt_resource(r)				      \
			
 
				+	for (r = rdt_resources_all; r < rdt_resources_all + RDT_NUM_RESOURCES;\
			
 
				+	     r++)							      \
			
 
				+		if (r->mon_enabled)
			
 
				+
			
 
				+/* CPUID.(EAX=10H, ECX=ResID=1).EAX */
			
 
				+union cpuid_0x10_1_eax {
			
 
				+	struct {
			
 
				+		unsigned int cbm_len:5;
			
 
				+	} split;
			
 
				+	unsigned int full;
			
 
				+};
			
 
				+
			
 
				+/* CPUID.(EAX=10H, ECX=ResID=3).EAX */
			
 
				+union cpuid_0x10_3_eax {
			
 
				+	struct {
			
 
				+		unsigned int max_delay:12;
			
 
				+	} split;
			
 
				+	unsigned int full;
			
 
				+};
			
 
				+
			
 
				+/* CPUID.(EAX=10H, ECX=ResID).EDX */
			
 
				+union cpuid_0x10_x_edx {
			
 
				+	struct {
			
 
				+		unsigned int cos_max:16;
			
 
				+	} split;
			
 
				+	unsigned int full;
			
 
				+};
			
 
				+
			
 
				+void rdt_ctrl_update(void *arg);
			
 
				+struct rdtgroup *rdtgroup_kn_lock_live(struct kernfs_node *kn);
			
 
				+void rdtgroup_kn_unlock(struct kernfs_node *kn);
			
 
				+struct rdt_domain *rdt_find_domain(struct rdt_resource *r, int id,
			
 
				+				   struct list_head **pos);
			
 
				+ssize_t rdtgroup_schemata_write(struct kernfs_open_file *of,
			
 
				+				char *buf, size_t nbytes, loff_t off);
			
 
				+int rdtgroup_schemata_show(struct kernfs_open_file *of,
			
 
				+			   struct seq_file *s, void *v);
			
 
				+struct rdt_domain *get_domain_from_cpu(int cpu, struct rdt_resource *r);
			
 
				+int alloc_rmid(void);
			
 
				+void free_rmid(u32 rmid);
			
 
				+int rdt_get_mon_l3_config(struct rdt_resource *r);
			
 
				+void mon_event_count(void *info);
			
 
				+int rdtgroup_mondata_show(struct seq_file *m, void *arg);
			
 
				+void rmdir_mondata_subdir_allrdtgrp(struct rdt_resource *r,
			
 
				+				    unsigned int dom_id);
			
 
				+void mkdir_mondata_subdir_allrdtgrp(struct rdt_resource *r,
			
 
				+				    struct rdt_domain *d);
			
 
				+void mon_event_read(struct rmid_read *rr, struct rdt_domain *d,
			
 
				+		    struct rdtgroup *rdtgrp, int evtid, int first);
			
 
				+void mbm_setup_overflow_handler(struct rdt_domain *dom,
			
 
				+				unsigned long delay_ms);
			
 
				+void mbm_handle_overflow(struct work_struct *work);
			
 
				+void cqm_setup_limbo_handler(struct rdt_domain *dom, unsigned long delay_ms);
			
 
				+void cqm_handle_limbo(struct work_struct *work);
			
 
				+bool has_busy_rmid(struct rdt_resource *r, struct rdt_domain *d);
			
 
				+void __check_limbo(struct rdt_domain *d, bool force_free);
			
 
				+
			
 
				+#endif /* _ASM_X86_INTEL_RDT_H */
			
--- a/arch/x86/kernel/cpu/intel_rdt_ctrlmondata.c
+++ b/arch/x86/kernel/cpu/intel_rdt_ctrlmondata.c
@@ -26,7 +26,7 @@
 
				 #include <linux/kernfs.h>
			
 
				 #include <linux/seq_file.h>
			
 
				 #include <linux/slab.h>
			
 
				-#include <asm/intel_rdt.h>
			
 
				+#include "intel_rdt.h"
			
 
				 
			
 
				 /*
			
 
				  * Check whether MBA bandwidth percentage value is correct. The value is
			
@@ -192,7 +192,7 @@ static int rdtgroup_parse_resource(char *resname, char *tok, int closid)
 
				 {
			
 
				 	struct rdt_resource *r;
			
 
				 
			
 
				-	for_each_enabled_rdt_resource(r) {
			
 
				+	for_each_alloc_enabled_rdt_resource(r) {
			
 
				 		if (!strcmp(resname, r->name) && closid < r->num_closid)
			
 
				 			return parse_line(tok, r);
			
 
				 	}
			
@@ -221,7 +221,7 @@ ssize_t rdtgroup_schemata_write(struct kernfs_open_file *of,
 
				 
			
 
				 	closid = rdtgrp->closid;
			
 
				 
			
 
				-	for_each_enabled_rdt_resource(r) {
			
 
				+	for_each_alloc_enabled_rdt_resource(r) {
			
 
				 		list_for_each_entry(dom, &r->domains, list)
			
 
				 			dom->have_new_ctrl = false;
			
 
				 	}
			
@@ -237,7 +237,7 @@ ssize_t rdtgroup_schemata_write(struct kernfs_open_file *of,
 
				 			goto out;
			
 
				 	}
			
 
				 
			
 
				-	for_each_enabled_rdt_resource(r) {
			
 
				+	for_each_alloc_enabled_rdt_resource(r) {
			
 
				 		ret = update_domains(r, closid);
			
 
				 		if (ret)
			
 
				 			goto out;
			
@@ -269,12 +269,13 @@ int rdtgroup_schemata_show(struct kernfs_open_file *of,
 
				 {
			
 
				 	struct rdtgroup *rdtgrp;
			
 
				 	struct rdt_resource *r;
			
 
				-	int closid, ret = 0;
			
 
				+	int ret = 0;
			
 
				+	u32 closid;
			
 
				 
			
 
				 	rdtgrp = rdtgroup_kn_lock_live(of->kn);
			
 
				 	if (rdtgrp) {
			
 
				 		closid = rdtgrp->closid;
			
 
				-		for_each_enabled_rdt_resource(r) {
			
 
				+		for_each_alloc_enabled_rdt_resource(r) {
			
 
				 			if (closid < r->num_closid)
			
 
				 				show_doms(s, r, closid);
			
 
				 		}
			
@@ -284,3 +285,57 @@ int rdtgroup_schemata_show(struct kernfs_open_file *of,
 
				 	rdtgroup_kn_unlock(of->kn);
			
 
				 	return ret;
			
 
				 }
			
 
				+
			
 
				+/*
				+ * Fill in @rr for reading event @evtid of group @rdtgrp on domain @d
				+ * (rr->val starts at 0), then invoke mon_event_count() through
				+ * smp_call_function_any() on a CPU taken from the domain's cpu_mask.
				+ * The result is left in rr->val for the caller.
				+ */
				+void mon_event_read(struct rmid_read *rr, struct rdt_domain *d,
				+		    struct rdtgroup *rdtgrp, int evtid, int first)
				+{
				+	/* Request description consumed by mon_event_count(). */
				+	rr->d = d;
				+	rr->rgrp = rdtgrp;
				+	rr->evtid = evtid;
				+	rr->first = first;
				+	rr->val = 0;
				+
				+	smp_call_function_any(&d->cpu_mask, mon_event_count, rr, 1);
				+}
			
 
				+
			
 
				+/*
				+ * seq_file show callback for a monitoring data file.
				+ *
				+ * The resource id, domain id and event id of the file are unpacked from
				+ * the kernfs node's priv pointer via union mon_data_bits. The counter is
				+ * read with mon_event_read() and printed as "Error", "Unavailable", or
				+ * the value multiplied by the resource's mon_scale.
				+ *
				+ * Returns 0 on success, or -ENOENT when the group could not be locked
				+ * live or the domain cannot be found.
				+ */
				+int rdtgroup_mondata_show(struct seq_file *m, void *arg)
				+{
				+	struct kernfs_open_file *of = m->private;
				+	u32 resid, evtid, domid;
				+	struct rdtgroup *rdtgrp;
				+	struct rdt_resource *r;
				+	union mon_data_bits md;
				+	struct rdt_domain *d;
				+	struct rmid_read rr;
				+	int ret = 0;
				+
				+	/*
				+	 * rdtgroup_kn_lock_live() can return NULL (rdtgroup_schemata_show()
				+	 * guards against it). mon_event_count() dereferences the group, so
				+	 * bail out here. rdtgroup_kn_unlock() is still called on the way
				+	 * out, as in the other callers.
				+	 */
				+	rdtgrp = rdtgroup_kn_lock_live(of->kn);
				+	if (!rdtgrp) {
				+		ret = -ENOENT;
				+		goto out;
				+	}
				+
				+	md.priv = of->kn->priv;
				+	resid = md.u.rid;
				+	domid = md.u.domid;
				+	evtid = md.u.evtid;
				+
				+	r = &rdt_resources_all[resid];
				+	d = rdt_find_domain(r, domid, NULL);
				+	if (!d) {
				+		ret = -ENOENT;
				+		goto out;
				+	}
				+
				+	mon_event_read(&rr, d, rdtgrp, evtid, false);
				+
				+	if (rr.val & RMID_VAL_ERROR)
				+		seq_puts(m, "Error\n");
				+	else if (rr.val & RMID_VAL_UNAVAIL)
				+		seq_puts(m, "Unavailable\n");
				+	else
				+		seq_printf(m, "%llu\n", rr.val * r->mon_scale);
				+
				+out:
				+	rdtgroup_kn_unlock(of->kn);
				+	return ret;
				+}
			
--- a/arch/x86/kernel/cpu/intel_rdt_monitor.c
+++ b/arch/x86/kernel/cpu/intel_rdt_monitor.c
@@ -0,0 +1,499 @@
 
				+/*
			
 
				+ * Resource Director Technology(RDT)
			
 
				+ * - Monitoring code
			
 
				+ *
			
 
				+ * Copyright (C) 2017 Intel Corporation
			
 
				+ *
			
 
				+ * Author:
			
 
				+ *    Vikas Shivappa <vikas.shivappa@intel.com>
			
 
				+ *
			
 
				+ * This replaces the cqm.c based on perf but we reuse a lot of
			
 
				+ * code and datastructures originally from Peter Zijlstra and Matt Fleming.
			
 
				+ *
			
 
				+ * This program is free software; you can redistribute it and/or modify it
			
 
				+ * under the terms and conditions of the GNU General Public License,
			
 
				+ * version 2, as published by the Free Software Foundation.
			
 
				+ *
			
 
				+ * This program is distributed in the hope it will be useful, but WITHOUT
			
 
				+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
			
 
				+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
			
 
				+ * more details.
			
 
				+ *
			
 
				+ * More information about RDT can be found in the Intel (R) x86 Architecture
			
 
				+ * Software Developer Manual June 2016, volume 3, section 17.17.
			
 
				+ */
			
 
				+
			
 
				+#include <linux/module.h>
			
 
				+#include <linux/slab.h>
			
 
				+#include <asm/cpu_device_id.h>
			
 
				+#include "intel_rdt.h"
			
 
				+
			
 
				+#define MSR_IA32_QM_CTR		0x0c8e
			
 
				+#define MSR_IA32_QM_EVTSEL		0x0c8d
			
 
				+
			
 
				+struct rmid_entry {
			
 
				+	u32				rmid;
			
 
				+	int				busy;
			
 
				+	struct list_head		list;
			
 
				+};
			
 
				+
			
 
				+/**
			
 
				+ * @rmid_free_lru    A least recently used list of free RMIDs
			
 
				+ *     These RMIDs are guaranteed to have an occupancy less than the
			
 
				+ *     threshold occupancy
			
 
				+ */
			
 
				+static LIST_HEAD(rmid_free_lru);
			
 
				+
			
 
				+/**
			
 
				+ * @rmid_limbo_count     count of currently unused but (potentially)
			
 
				+ *     dirty RMIDs.
			
 
				+ *     This counts RMIDs that no one is currently using but that
			
 
				+ *     may have a occupancy value > intel_cqm_threshold. User can change
			
 
				+ *     the threshold occupancy value.
			
 
				+ */
			
 
				+unsigned int rmid_limbo_count;
			
 
				+
			
 
				+/**
			
 
				+ * @rmid_entry - The entry in the limbo and free lists.
			
 
				+ */
			
 
				+static struct rmid_entry	*rmid_ptrs;
			
 
				+
			
 
				+/*
			
 
				+ * Global boolean for rdt_monitor which is true if any
			
 
				+ * resource monitoring is enabled.
			
 
				+ */
			
 
				+bool rdt_mon_capable;
			
 
				+
			
 
				+/*
			
 
				+ * Global to indicate which monitoring events are enabled.
			
 
				+ */
			
 
				+unsigned int rdt_mon_features;
			
 
				+
			
 
				+/*
			
 
				+ * This is the threshold cache occupancy at which we will consider an
			
 
				+ * RMID available for re-allocation.
			
 
				+ */
			
 
				+unsigned int intel_cqm_threshold;
			
 
				+
			
 
				+/*
				+ * Return the rmid_ptrs[] slot for @rmid. Warns (without failing) if
				+ * the slot's stored rmid does not match the index.
				+ */
				+static inline struct rmid_entry *__rmid_entry(u32 rmid)
				+{
				+	struct rmid_entry *slot = &rmid_ptrs[rmid];
				+
				+	WARN_ON(slot->rmid != rmid);
				+
				+	return slot;
				+}
			
 
				+
			
 
				+/*
				+ * Select (@rmid, @eventid) in IA32_QM_EVTSEL and return the raw
				+ * contents of IA32_QM_CTR, error bits included.
				+ */
				+static u64 __rmid_read(u32 rmid, u32 eventid)
				+{
				+	u64 counter;
				+
				+	/*
				+	 * Per the SDM: with a valid event code in IA32_QM_EVTSEL.EvtID
				+	 * (bits 7:0) and a valid RMID in IA32_QM_EVTSEL.RMID (bits 41:32),
				+	 * IA32_QM_CTR.data (bits 61:0) holds the monitored data.
				+	 * IA32_QM_CTR.Error (bit 63) and IA32_QM_CTR.Unavailable (bit 62)
				+	 * flag a failed read.
				+	 */
				+	wrmsr(MSR_IA32_QM_EVTSEL, eventid, rmid);
				+	rdmsrl(MSR_IA32_QM_CTR, counter);
				+
				+	return counter;
				+}
			
 
				+
			
 
				+/*
				+ * True when the L3 occupancy counter read for @entry's RMID is at or
				+ * above intel_cqm_threshold.
				+ */
				+static bool rmid_dirty(struct rmid_entry *entry)
				+{
				+	u64 occupancy;
				+
				+	occupancy = __rmid_read(entry->rmid, QOS_L3_OCCUP_EVENT_ID);
				+	if (occupancy >= intel_cqm_threshold)
				+		return true;
				+
				+	return false;
				+}
			
 
				+
			
 
				+/*
				+ * Walk the RMIDs flagged busy in @d->rmid_busy_llc. For each one whose
				+ * occupancy is no longer dirty (or for all of them when @force_free is
				+ * set) clear the domain's busy bit and drop the entry's busy count.
				+ * When that count reaches zero the RMID leaves limbo and is appended
				+ * to the free list.
				+ */
				+void __check_limbo(struct rdt_domain *d, bool force_free)
				+{
				+	struct rdt_resource *r = &rdt_resources_all[RDT_RESOURCE_L3];
				+	struct rmid_entry *entry;
				+	u32 nrmid;
				+
				+	/* RMID 0 is skipped: the bitmap scan starts at bit 1. */
				+	for (nrmid = find_next_bit(d->rmid_busy_llc, r->num_rmid, 1);
				+	     nrmid < r->num_rmid;
				+	     nrmid = find_next_bit(d->rmid_busy_llc, r->num_rmid, nrmid + 1)) {
				+		entry = __rmid_entry(nrmid);
				+
				+		/* Still above threshold and not forced: leave it in limbo. */
				+		if (!force_free && rmid_dirty(entry))
				+			continue;
				+
				+		clear_bit(entry->rmid, d->rmid_busy_llc);
				+		if (--entry->busy == 0) {
				+			rmid_limbo_count--;
				+			list_add_tail(&entry->list, &rmid_free_lru);
				+		}
				+	}
				+}
			
 
				+
			
 
				+/* True when at least one bit below r->num_rmid is set in d->rmid_busy_llc. */
				+bool has_busy_rmid(struct rdt_resource *r, struct rdt_domain *d)
				+{
				+	unsigned long first_busy;
				+
				+	first_busy = find_first_bit(d->rmid_busy_llc, r->num_rmid);
				+
				+	return first_busy != r->num_rmid;
				+}
			
 
				+
			
 
				+/*
				+ * Take the first RMID off the free list and return it.
				+ *
				+ * RMID allocation is currently global; per-package usage is tracked
				+ * only to optimise limbo list management.
				+ *
				+ * Must be called with rdtgroup_mutex held. When the free list is empty
				+ * this returns -EBUSY if some RMIDs are still in limbo
				+ * (rmid_limbo_count != 0) and -ENOSPC otherwise.
				+ */
				+int alloc_rmid(void)
				+{
				+	struct rmid_entry *entry;
				+
				+	lockdep_assert_held(&rdtgroup_mutex);
				+
				+	if (list_empty(&rmid_free_lru)) {
				+		if (rmid_limbo_count)
				+			return -EBUSY;
				+		return -ENOSPC;
				+	}
				+
				+	entry = list_first_entry(&rmid_free_lru, struct rmid_entry, list);
				+	list_del(&entry->list);
				+
				+	return entry->rmid;
				+}
			
 
				+
			
 
				+/*
				+ * Decide, per L3 domain, whether @entry's RMID has to wait in limbo.
				+ *
				+ * For the domain containing the current CPU the occupancy counter is
				+ * read, and the domain is skipped when the value is at or below
				+ * intel_cqm_threshold. Every other domain is marked busy without
				+ * reading its counter. If no domain ends up busy the RMID goes
				+ * straight to the free list; otherwise rmid_limbo_count is bumped.
				+ */
				+static void add_rmid_to_limbo(struct rmid_entry *entry)
				+{
				+	struct rdt_resource *r = &rdt_resources_all[RDT_RESOURCE_L3];
				+	struct rdt_domain *d;
				+	int cpu;
				+
				+	entry->busy = 0;
				+	cpu = get_cpu();
				+	list_for_each_entry(d, &r->domains, list) {
				+		if (cpumask_test_cpu(cpu, &d->cpu_mask) &&
				+		    __rmid_read(entry->rmid, QOS_L3_OCCUP_EVENT_ID) <=
				+		    intel_cqm_threshold)
				+			continue;
				+
				+		/*
				+		 * The first limbo RMID in a domain starts that
				+		 * domain's limbo worker.
				+		 */
				+		if (!has_busy_rmid(r, d))
				+			cqm_setup_limbo_handler(d, CQM_LIMBOCHECK_INTERVAL);
				+		set_bit(entry->rmid, d->rmid_busy_llc);
				+		entry->busy++;
				+	}
				+	put_cpu();
				+
				+	if (!entry->busy) {
				+		list_add_tail(&entry->list, &rmid_free_lru);
				+		return;
				+	}
				+
				+	rmid_limbo_count++;
				+}
			
 
				+
			
 
				+/*
				+ * Release @rmid. RMID 0 is ignored. With LLC occupancy monitoring
				+ * enabled the RMID goes through add_rmid_to_limbo(); otherwise it is
				+ * appended directly to the free list. For a non-zero @rmid the caller
				+ * must hold rdtgroup_mutex.
				+ */
				+void free_rmid(u32 rmid)
				+{
				+	struct rmid_entry *entry;
				+
				+	if (rmid == 0)
				+		return;
				+
				+	lockdep_assert_held(&rdtgroup_mutex);
				+
				+	entry = __rmid_entry(rmid);
				+
				+	if (!is_llc_occupancy_enabled()) {
				+		list_add_tail(&entry->list, &rmid_free_lru);
				+		return;
				+	}
				+
				+	add_rmid_to_limbo(entry);
				+}
			
 
				+
			
 
				+/*
				+ * Read event rr->evtid for @rmid and fold the result into @rr.
				+ *
				+ * A read with the error or unavailable bit set stores the raw value in
				+ * rr->val and returns -EINVAL. Occupancy reads are added to rr->val
				+ * directly. MBM reads update the per-RMID mbm_state of domain rr->d:
				+ * with rr->first set the state is (re)initialised, otherwise the
				+ * MBM_CNTR_WIDTH-bit delta since the previous read is accumulated into
				+ * the state's chunks, and the running total is added to rr->val.
				+ *
				+ * Returns 0 on success, -EINVAL on a failed read or unknown event id.
				+ */
				+static int __mon_event_count(u32 rmid, struct rmid_read *rr)
				+{
				+	const u64 shift = 64 - MBM_CNTR_WIDTH;
				+	struct mbm_state *m;
				+	u64 tval, delta;
				+
				+	tval = __rmid_read(rmid, rr->evtid);
				+	if (tval & (RMID_VAL_ERROR | RMID_VAL_UNAVAIL)) {
				+		rr->val = tval;
				+		return -EINVAL;
				+	}
				+
				+	if (rr->evtid == QOS_L3_OCCUP_EVENT_ID) {
				+		rr->val += tval;
				+		return 0;
				+	} else if (rr->evtid == QOS_L3_MBM_TOTAL_EVENT_ID) {
				+		m = &rr->d->mbm_total[rmid];
				+	} else if (rr->evtid == QOS_L3_MBM_LOCAL_EVENT_ID) {
				+		m = &rr->d->mbm_local[rmid];
				+	} else {
				+		/*
				+		 * Not expected to be reached: an invalid event id
				+		 * would already have failed the __rmid_read above.
				+		 */
				+		return -EINVAL;
				+	}
				+
				+	if (rr->first) {
				+		m->chunks = 0;
				+		m->prev_msr = tval;
				+		return 0;
				+	}
				+
				+	/*
				+	 * Shifting both samples up by (64 - counter width) before the
				+	 * subtraction makes the difference wrap at the counter width.
				+	 */
				+	delta = ((tval << shift) - (m->prev_msr << shift)) >> shift;
				+	m->prev_msr = tval;
				+	m->chunks += delta;
				+
				+	rr->val += m->chunks;
				+	return 0;
				+}
			
 
				+
			
 
				+/*
				+ * IPI callback that reads the CQM/MBM counters on a domain.
				+ * @info is the struct rmid_read describing the request. The group's own
				+ * RMID is counted first. For a control group the RMIDs of its child
				+ * monitor groups are counted as well. Counting stops at the first
				+ * failing read.
				+ */
				+void mon_event_count(void *info)
				+{
				+	struct rmid_read *rr = info;
				+	struct rdtgroup *rdtgrp = rr->rgrp;
				+	struct rdtgroup *child;
				+
				+	if (__mon_event_count(rdtgrp->mon.rmid, rr) != 0)
				+		return;
				+
				+	/* Only control groups aggregate their child monitor groups. */
				+	if (rdtgrp->type != RDTCTRL_GROUP)
				+		return;
				+
				+	list_for_each_entry(child, &rdtgrp->mon.crdtgrp_list,
				+			    mon.crdtgrp_list) {
				+		if (__mon_event_count(child->mon.rmid, rr) != 0)
				+			break;
				+	}
				+}
			
 
				+
			
 
				+/*
				+ * Refresh the saved MBM state of @rmid on domain @d for every enabled
				+ * MBM event (total and/or local). The value accumulated in the local
				+ * rmid_read is discarded; only the side effect on the domain's
				+ * mbm_state matters.
				+ */
				+static void mbm_update(struct rdt_domain *d, int rmid)
				+{
				+	/*
				+	 * __mon_event_count() accumulates into rr.val with "+=", so val
				+	 * must start from a defined value rather than stack garbage.
				+	 */
				+	struct rmid_read rr = {
				+		.d	= d,
				+		.first	= false,
				+		.val	= 0,
				+	};
				+
				+	/*
				+	 * This is protected from concurrent reads from user
				+	 * as both the user and we hold the global mutex.
				+	 */
				+	if (is_mbm_total_enabled()) {
				+		rr.evtid = QOS_L3_MBM_TOTAL_EVENT_ID;
				+		__mon_event_count(rmid, &rr);
				+	}
				+	if (is_mbm_local_enabled()) {
				+		rr.evtid = QOS_L3_MBM_LOCAL_EVENT_ID;
				+		__mon_event_count(rmid, &rr);
				+	}
				+}
			
 
				+
			
 
				+/*
			
 
				+ * Handler to scan the limbo list and move the RMIDs
			
 
				+ * whose occupancy < threshold_occupancy to the free list.
			
 
				+ */
			
 
				+void cqm_handle_limbo(struct work_struct *work)
			
 
				+{
			
 
				+	unsigned long delay = msecs_to_jiffies(CQM_LIMBOCHECK_INTERVAL);
			
 
				+	int cpu = smp_processor_id();
			
 
				+	struct rdt_resource *r;
			
 
				+	struct rdt_domain *d;
			
 
				+
			
 
				+	mutex_lock(&rdtgroup_mutex);
			
 
				+
			
 
				+	r = &rdt_resources_all[RDT_RESOURCE_L3];
			
 
				+	d = get_domain_from_cpu(cpu, r);
			
 
				+
			
 
				+	if (!d) {
			
 
				+		pr_warn_once("Failure to get domain for limbo worker\n");
			
 
				+		goto out_unlock;
			
 
				+	}
			
 
				+
			
 
				+	__check_limbo(d, false);
			
 
				+
			
 
				+	if (has_busy_rmid(r, d))
			
 
				+		schedule_delayed_work_on(cpu, &d->cqm_limbo, delay);
			
 
				+
			
 
				+out_unlock:
			
 
				+	mutex_unlock(&rdtgroup_mutex);
			
 
				+}
			
 
				+
			
 
				+void cqm_setup_limbo_handler(struct rdt_domain *dom, unsigned long delay_ms)
			
 
				+{
			
 
				+	unsigned long delay = msecs_to_jiffies(delay_ms);
			
 
				+	struct rdt_resource *r;
			
 
				+	int cpu;
			
 
				+
			
 
				+	r = &rdt_resources_all[RDT_RESOURCE_L3];
			
 
				+
			
 
				+	cpu = cpumask_any(&dom->cpu_mask);
			
 
				+	dom->cqm_work_cpu = cpu;
			
 
				+
			
 
				+	schedule_delayed_work_on(cpu, &dom->cqm_limbo, delay);
			
 
				+}
			
 
				+
			
 
				+void mbm_handle_overflow(struct work_struct *work)
			
 
				+{
			
 
				+	unsigned long delay = msecs_to_jiffies(MBM_OVERFLOW_INTERVAL);
			
 
				+	struct rdtgroup *prgrp, *crgrp;
			
 
				+	int cpu = smp_processor_id();
			
 
				+	struct list_head *head;
			
 
				+	struct rdt_domain *d;
			
 
				+
			
 
				+	mutex_lock(&rdtgroup_mutex);
			
 
				+
			
 
				+	if (!static_branch_likely(&rdt_enable_key))
			
 
				+		goto out_unlock;
			
 
				+
			
 
				+	d = get_domain_from_cpu(cpu, &rdt_resources_all[RDT_RESOURCE_L3]);
			
 
				+	if (!d)
			
 
				+		goto out_unlock;
			
 
				+
			
 
				+	list_for_each_entry(prgrp, &rdt_all_groups, rdtgroup_list) {
			
 
				+		mbm_update(d, prgrp->mon.rmid);
			
 
				+
			
 
				+		head = &prgrp->mon.crdtgrp_list;
			
 
				+		list_for_each_entry(crgrp, head, mon.crdtgrp_list)
			
 
				+			mbm_update(d, crgrp->mon.rmid);
			
 
				+	}
			
 
				+
			
 
				+	schedule_delayed_work_on(cpu, &d->mbm_over, delay);
			
 
				+
			
 
				+out_unlock:
			
 
				+	mutex_unlock(&rdtgroup_mutex);
			
 
				+}
			
 
				+
			
 
				+void mbm_setup_overflow_handler(struct rdt_domain *dom, unsigned long delay_ms)
			
 
				+{
			
 
				+	unsigned long delay = msecs_to_jiffies(delay_ms);
			
 
				+	int cpu;
			
 
				+
			
 
				+	if (!static_branch_likely(&rdt_enable_key))
			
 
				+		return;
			
 
				+	cpu = cpumask_any(&dom->cpu_mask);
			
 
				+	dom->mbm_work_cpu = cpu;
			
 
				+	schedule_delayed_work_on(cpu, &dom->mbm_over, delay);
			
 
				+}
			
 
				+
			
 
				+static int dom_data_init(struct rdt_resource *r)
			
 
				+{
			
 
				+	struct rmid_entry *entry = NULL;
			
 
				+	int i, nr_rmids;
			
 
				+
			
 
				+	nr_rmids = r->num_rmid;
			
 
				+	rmid_ptrs = kcalloc(nr_rmids, sizeof(struct rmid_entry), GFP_KERNEL);
			
 
				+	if (!rmid_ptrs)
			
 
				+		return -ENOMEM;
			
 
				+
			
 
				+	for (i = 0; i < nr_rmids; i++) {
			
 
				+		entry = &rmid_ptrs[i];
			
 
				+		INIT_LIST_HEAD(&entry->list);
			
 
				+
			
 
				+		entry->rmid = i;
			
 
				+		list_add_tail(&entry->list, &rmid_free_lru);
			
 
				+	}
			
 
				+
			
 
				+	/*
			
 
				+	 * RMID 0 is special and is always allocated. It's used for all
			
 
				+	 * tasks that are not monitored.
			
 
				+	 */
			
 
				+	entry = __rmid_entry(0);
			
 
				+	list_del(&entry->list);
			
 
				+
			
 
				+	return 0;
			
 
				+}
			
 
				+
			
 
				+static struct mon_evt llc_occupancy_event = {
			
 
				+	.name		= "llc_occupancy",
			
 
				+	.evtid		= QOS_L3_OCCUP_EVENT_ID,
			
 
				+};
			
 
				+
			
 
				+static struct mon_evt mbm_total_event = {
			
 
				+	.name		= "mbm_total_bytes",
			
 
				+	.evtid		= QOS_L3_MBM_TOTAL_EVENT_ID,
			
 
				+};
			
 
				+
			
 
				+static struct mon_evt mbm_local_event = {
			
 
				+	.name		= "mbm_local_bytes",
			
 
				+	.evtid		= QOS_L3_MBM_LOCAL_EVENT_ID,
			
 
				+};
			
 
				+
			
 
				+/*
			
 
				+ * Initialize the event list for the resource.
			
 
				+ *
			
 
				+ * Note that MBM events are also part of RDT_RESOURCE_L3 resource
			
 
				+ * because as per the SDM the total and local memory bandwidth
			
 
				+ * are enumerated as part of L3 monitoring.
			
 
				+ */
			
 
				+static void l3_mon_evt_init(struct rdt_resource *r)
			
 
				+{
			
 
				+	INIT_LIST_HEAD(&r->evt_list);
			
 
				+
			
 
				+	if (is_llc_occupancy_enabled())
			
 
				+		list_add_tail(&llc_occupancy_event.list, &r->evt_list);
			
 
				+	if (is_mbm_total_enabled())
			
 
				+		list_add_tail(&mbm_total_event.list, &r->evt_list);
			
 
				+	if (is_mbm_local_enabled())
			
 
				+		list_add_tail(&mbm_local_event.list, &r->evt_list);
			
 
				+}
			
 
				+
			
 
				+int rdt_get_mon_l3_config(struct rdt_resource *r)
			
 
				+{
			
 
				+	int ret;
			
 
				+
			
 
				+	r->mon_scale = boot_cpu_data.x86_cache_occ_scale;
			
 
				+	r->num_rmid = boot_cpu_data.x86_cache_max_rmid + 1;
			
 
				+
			
 
				+	/*
			
 
				+	 * A reasonable upper limit on the max threshold is the number
			
 
				+	 * of lines tagged per RMID if all RMIDs have the same number of
			
 
				+	 * lines tagged in the LLC.
			
 
				+	 *
			
 
				+	 * For a 35MB LLC and 56 RMIDs, this is ~1.8% of the LLC.
			
 
				+	 */
			
 
				+	intel_cqm_threshold = boot_cpu_data.x86_cache_size * 1024 / r->num_rmid;
			
 
				+
			
 
				+	/* h/w works in units of "boot_cpu_data.x86_cache_occ_scale" */
			
 
				+	intel_cqm_threshold /= r->mon_scale;
			
 
				+
			
 
				+	ret = dom_data_init(r);
			
 
				+	if (ret)
			
 
				+		return ret;
			
 
				+
			
 
				+	l3_mon_evt_init(r);
			
 
				+
			
 
				+	r->mon_capable = true;
			
 
				+	r->mon_enabled = true;
			
 
				+
			
 
				+	return 0;
			
 
				+}
			
--- a/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c
+++ b/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c
@@ -32,17 +32,25 @@
 
				 
			
 
				 #include <uapi/linux/magic.h>
			
 
				 
			
 
				-#include <asm/intel_rdt.h>
			
 
				-#include <asm/intel_rdt_common.h>
			
 
				+#include <asm/intel_rdt_sched.h>
			
 
				+#include "intel_rdt.h"
			
 
				 
			
 
				 DEFINE_STATIC_KEY_FALSE(rdt_enable_key);
			
 
				-struct kernfs_root *rdt_root;
			
 
				+DEFINE_STATIC_KEY_FALSE(rdt_mon_enable_key);
			
 
				+DEFINE_STATIC_KEY_FALSE(rdt_alloc_enable_key);
			
 
				+static struct kernfs_root *rdt_root;
			
 
				 struct rdtgroup rdtgroup_default;
			
 
				 LIST_HEAD(rdt_all_groups);
			
 
				 
			
 
				 /* Kernel fs node for "info" directory under root */
			
 
				 static struct kernfs_node *kn_info;
			
 
				 
			
 
				+/* Kernel fs node for "mon_groups" directory under root */
			
 
				+static struct kernfs_node *kn_mongrp;
			
 
				+
			
 
				+/* Kernel fs node for "mon_data" directory under root */
			
 
				+static struct kernfs_node *kn_mondata;
			
 
				+
			
 
				 /*
			
 
				  * Trivial allocator for CLOSIDs. Since h/w only supports a small number,
			
 
				  * we can keep a bitmap of free CLOSIDs in a single integer.
			
@@ -66,7 +74,7 @@ static void closid_init(void)
 
				 	int rdt_min_closid = 32;
			
 
				 
			
 
				 	/* Compute rdt_min_closid across all resources */
			
 
				-	for_each_enabled_rdt_resource(r)
			
 
				+	for_each_alloc_enabled_rdt_resource(r)
			
 
				 		rdt_min_closid = min(rdt_min_closid, r->num_closid);
			
 
				 
			
 
				 	closid_free_map = BIT_MASK(rdt_min_closid) - 1;
			
@@ -75,9 +83,9 @@ static void closid_init(void)
 
				 	closid_free_map &= ~1;
			
 
				 }
			
 
				 
			
 
				-int closid_alloc(void)
			
 
				+static int closid_alloc(void)
			
 
				 {
			
 
				-	int closid = ffs(closid_free_map);
			
 
				+	u32 closid = ffs(closid_free_map);
			
 
				 
			
 
				 	if (closid == 0)
			
 
				 		return -ENOSPC;
			
@@ -125,28 +133,6 @@ static int rdtgroup_add_file(struct kernfs_node *parent_kn, struct rftype *rft)
 
				 	return 0;
			
 
				 }
			
 
				 
			
 
				-static int rdtgroup_add_files(struct kernfs_node *kn, struct rftype *rfts,
			
 
				-			      int len)
			
 
				-{
			
 
				-	struct rftype *rft;
			
 
				-	int ret;
			
 
				-
			
 
				-	lockdep_assert_held(&rdtgroup_mutex);
			
 
				-
			
 
				-	for (rft = rfts; rft < rfts + len; rft++) {
			
 
				-		ret = rdtgroup_add_file(kn, rft);
			
 
				-		if (ret)
			
 
				-			goto error;
			
 
				-	}
			
 
				-
			
 
				-	return 0;
			
 
				-error:
			
 
				-	pr_warn("Failed to add %s, err=%d\n", rft->name, ret);
			
 
				-	while (--rft >= rfts)
			
 
				-		kernfs_remove_by_name(kn, rft->name);
			
 
				-	return ret;
			
 
				-}
			
 
				-
			
 
				 static int rdtgroup_seqfile_show(struct seq_file *m, void *arg)
			
 
				 {
			
 
				 	struct kernfs_open_file *of = m->private;
			
@@ -174,6 +160,11 @@ static struct kernfs_ops rdtgroup_kf_single_ops = {
 
				 	.seq_show		= rdtgroup_seqfile_show,
			
 
				 };
			
 
				 
			
 
				+static struct kernfs_ops kf_mondata_ops = {
			
 
				+	.atomic_write_len	= PAGE_SIZE,
			
 
				+	.seq_show		= rdtgroup_mondata_show,
			
 
				+};
			
 
				+
			
 
				 static bool is_cpu_list(struct kernfs_open_file *of)
			
 
				 {
			
 
				 	struct rftype *rft = of->kn->priv;
			
@@ -203,13 +194,18 @@ static int rdtgroup_cpus_show(struct kernfs_open_file *of,
 
				 /*
			
 
				  * This is safe against intel_rdt_sched_in() called from __switch_to()
			
 
				  * because __switch_to() is executed with interrupts disabled. A local call
			
 
				- * from rdt_update_closid() is proteced against __switch_to() because
			
 
				+ * from update_closid_rmid() is protected against __switch_to() because
			
 
				  * preemption is disabled.
			
 
				  */
			
 
				-static void rdt_update_cpu_closid(void *closid)
			
 
				+static void update_cpu_closid_rmid(void *info)
			
 
				 {
			
 
				-	if (closid)
			
 
				-		this_cpu_write(cpu_closid, *(int *)closid);
			
 
				+	struct rdtgroup *r = info;
			
 
				+
			
 
				+	if (r) {
			
 
				+		this_cpu_write(pqr_state.default_closid, r->closid);
			
 
				+		this_cpu_write(pqr_state.default_rmid, r->mon.rmid);
			
 
				+	}
			
 
				+
			
 
				 	/*
			
 
				 	 * We cannot unconditionally write the MSR because the current
			
 
				 	 * executing task might have its own closid selected. Just reuse
			
@@ -221,28 +217,128 @@ static void rdt_update_cpu_closid(void *closid)
 
				 /*
			
 
				  * Update the PGR_ASSOC MSR on all cpus in @cpu_mask,
			
 
				  *
			
 
				- * Per task closids must have been set up before calling this function.
			
 
				- *
			
 
				- * The per cpu closids are updated with the smp function call, when @closid
			
 
				- * is not NULL. If @closid is NULL then all affected percpu closids must
			
 
				- * have been set up before calling this function.
			
 
				+ * Per task closids/rmids must have been set up before calling this function.
			
 
				  */
			
 
				 static void
			
 
				-rdt_update_closid(const struct cpumask *cpu_mask, int *closid)
			
 
				+update_closid_rmid(const struct cpumask *cpu_mask, struct rdtgroup *r)
			
 
				 {
			
 
				 	int cpu = get_cpu();
			
 
				 
			
 
				 	if (cpumask_test_cpu(cpu, cpu_mask))
			
 
				-		rdt_update_cpu_closid(closid);
			
 
				-	smp_call_function_many(cpu_mask, rdt_update_cpu_closid, closid, 1);
			
 
				+		update_cpu_closid_rmid(r);
			
 
				+	smp_call_function_many(cpu_mask, update_cpu_closid_rmid, r, 1);
			
 
				 	put_cpu();
			
 
				 }
			
 
				 
			
 
				+static int cpus_mon_write(struct rdtgroup *rdtgrp, cpumask_var_t newmask,
			
 
				+			  cpumask_var_t tmpmask)
			
 
				+{
			
 
				+	struct rdtgroup *prgrp = rdtgrp->mon.parent, *crgrp;
			
 
				+	struct list_head *head;
			
 
				+
			
 
				+	/* Check whether cpus belong to parent ctrl group */
			
 
				+	cpumask_andnot(tmpmask, newmask, &prgrp->cpu_mask);
			
 
				+	if (cpumask_weight(tmpmask))
			
 
				+		return -EINVAL;
			
 
				+
			
 
				+	/* Check whether cpus are dropped from this group */
			
 
				+	cpumask_andnot(tmpmask, &rdtgrp->cpu_mask, newmask);
			
 
				+	if (cpumask_weight(tmpmask)) {
			
 
				+		/* Give any dropped cpus to parent rdtgroup */
			
 
				+		cpumask_or(&prgrp->cpu_mask, &prgrp->cpu_mask, tmpmask);
			
 
				+		update_closid_rmid(tmpmask, prgrp);
			
 
				+	}
			
 
				+
			
 
				+	/*
			
 
				+	 * If we added cpus, remove them from previous group that owned them
			
 
				+	 * and update per-cpu rmid
			
 
				+	 */
			
 
				+	cpumask_andnot(tmpmask, newmask, &rdtgrp->cpu_mask);
			
 
				+	if (cpumask_weight(tmpmask)) {
			
 
				+		head = &prgrp->mon.crdtgrp_list;
			
 
				+		list_for_each_entry(crgrp, head, mon.crdtgrp_list) {
			
 
				+			if (crgrp == rdtgrp)
			
 
				+				continue;
			
 
				+			cpumask_andnot(&crgrp->cpu_mask, &crgrp->cpu_mask,
			
 
				+				       tmpmask);
			
 
				+		}
			
 
				+		update_closid_rmid(tmpmask, rdtgrp);
			
 
				+	}
			
 
				+
			
 
				+	/* Done pushing/pulling - update this group with new mask */
			
 
				+	cpumask_copy(&rdtgrp->cpu_mask, newmask);
			
 
				+
			
 
				+	return 0;
			
 
				+}
			
 
				+
			
 
				+static void cpumask_rdtgrp_clear(struct rdtgroup *r, struct cpumask *m)
			
 
				+{
			
 
				+	struct rdtgroup *crgrp;
			
 
				+
			
 
				+	cpumask_andnot(&r->cpu_mask, &r->cpu_mask, m);
			
 
				+	/* Update the child mon group masks as well */
			
 
				+	list_for_each_entry(crgrp, &r->mon.crdtgrp_list, mon.crdtgrp_list)
			
 
				+		cpumask_and(&crgrp->cpu_mask, &r->cpu_mask, &crgrp->cpu_mask);
			
 
				+}
			
 
				+
			
 
				+static int cpus_ctrl_write(struct rdtgroup *rdtgrp, cpumask_var_t newmask,
			
 
				+			   cpumask_var_t tmpmask, cpumask_var_t tmpmask1)
			
 
				+{
			
 
				+	struct rdtgroup *r, *crgrp;
			
 
				+	struct list_head *head;
			
 
				+
			
 
				+	/* Check whether cpus are dropped from this group */
			
 
				+	cpumask_andnot(tmpmask, &rdtgrp->cpu_mask, newmask);
			
 
				+	if (cpumask_weight(tmpmask)) {
			
 
				+		/* Can't drop from default group */
			
 
				+		if (rdtgrp == &rdtgroup_default)
			
 
				+			return -EINVAL;
			
 
				+
			
 
				+		/* Give any dropped cpus to rdtgroup_default */
			
 
				+		cpumask_or(&rdtgroup_default.cpu_mask,
			
 
				+			   &rdtgroup_default.cpu_mask, tmpmask);
			
 
				+		update_closid_rmid(tmpmask, &rdtgroup_default);
			
 
				+	}
			
 
				+
			
 
				+	/*
			
 
				+	 * If we added cpus, remove them from previous group and
			
 
				+	 * the prev group's child groups that owned them
			
 
				+	 * and update per-cpu closid/rmid.
			
 
				+	 */
			
 
				+	cpumask_andnot(tmpmask, newmask, &rdtgrp->cpu_mask);
			
 
				+	if (cpumask_weight(tmpmask)) {
			
 
				+		list_for_each_entry(r, &rdt_all_groups, rdtgroup_list) {
			
 
				+			if (r == rdtgrp)
			
 
				+				continue;
			
 
				+			cpumask_and(tmpmask1, &r->cpu_mask, tmpmask);
			
 
				+			if (cpumask_weight(tmpmask1))
			
 
				+				cpumask_rdtgrp_clear(r, tmpmask1);
			
 
				+		}
			
 
				+		update_closid_rmid(tmpmask, rdtgrp);
			
 
				+	}
			
 
				+
			
 
				+	/* Done pushing/pulling - update this group with new mask */
			
 
				+	cpumask_copy(&rdtgrp->cpu_mask, newmask);
			
 
				+
			
 
				+	/*
			
 
				+	 * Clear child mon group masks since there is a new parent mask
			
 
				+	 * now and update the rmid for the cpus the child lost.
			
 
				+	 */
			
 
				+	head = &rdtgrp->mon.crdtgrp_list;
			
 
				+	list_for_each_entry(crgrp, head, mon.crdtgrp_list) {
			
 
				+		cpumask_and(tmpmask, &rdtgrp->cpu_mask, &crgrp->cpu_mask);
			
 
				+		update_closid_rmid(tmpmask, rdtgrp);
			
 
				+		cpumask_clear(&crgrp->cpu_mask);
			
 
				+	}
			
 
				+
			
 
				+	return 0;
			
 
				+}
			
 
				+
			
 
				 static ssize_t rdtgroup_cpus_write(struct kernfs_open_file *of,
			
 
				 				   char *buf, size_t nbytes, loff_t off)
			
 
				 {
			
 
				-	cpumask_var_t tmpmask, newmask;
			
 
				-	struct rdtgroup *rdtgrp, *r;
			
 
				+	cpumask_var_t tmpmask, newmask, tmpmask1;
			
 
				+	struct rdtgroup *rdtgrp;
			
 
				 	int ret;
			
 
				 
			
 
				 	if (!buf)
			
@@ -254,6 +350,11 @@ static ssize_t rdtgroup_cpus_write(struct kernfs_open_file *of,
 
				 		free_cpumask_var(tmpmask);
			
 
				 		return -ENOMEM;
			
 
				 	}
			
 
				+	if (!zalloc_cpumask_var(&tmpmask1, GFP_KERNEL)) {
			
 
				+		free_cpumask_var(tmpmask);
			
 
				+		free_cpumask_var(newmask);
			
 
				+		return -ENOMEM;
			
 
				+	}
			
 
				 
			
 
				 	rdtgrp = rdtgroup_kn_lock_live(of->kn);
			
 
				 	if (!rdtgrp) {
			
@@ -276,41 +377,18 @@ static ssize_t rdtgroup_cpus_write(struct kernfs_open_file *of,
 
				 		goto unlock;
			
 
				 	}
			
 
				 
			
 
				-	/* Check whether cpus are dropped from this group */
			
 
				-	cpumask_andnot(tmpmask, &rdtgrp->cpu_mask, newmask);
			
 
				-	if (cpumask_weight(tmpmask)) {
			
 
				-		/* Can't drop from default group */
			
 
				-		if (rdtgrp == &rdtgroup_default) {
			
 
				-			ret = -EINVAL;
			
 
				-			goto unlock;
			
 
				-		}
			
 
				-		/* Give any dropped cpus to rdtgroup_default */
			
 
				-		cpumask_or(&rdtgroup_default.cpu_mask,
			
 
				-			   &rdtgroup_default.cpu_mask, tmpmask);
			
 
				-		rdt_update_closid(tmpmask, &rdtgroup_default.closid);
			
 
				-	}
			
 
				-
			
 
				-	/*
			
 
				-	 * If we added cpus, remove them from previous group that owned them
			
 
				-	 * and update per-cpu closid
			
 
				-	 */
			
 
				-	cpumask_andnot(tmpmask, newmask, &rdtgrp->cpu_mask);
			
 
				-	if (cpumask_weight(tmpmask)) {
			
 
				-		list_for_each_entry(r, &rdt_all_groups, rdtgroup_list) {
			
 
				-			if (r == rdtgrp)
			
 
				-				continue;
			
 
				-			cpumask_andnot(&r->cpu_mask, &r->cpu_mask, tmpmask);
			
 
				-		}
			
 
				-		rdt_update_closid(tmpmask, &rdtgrp->closid);
			
 
				-	}
			
 
				-
			
 
				-	/* Done pushing/pulling - update this group with new mask */
			
 
				-	cpumask_copy(&rdtgrp->cpu_mask, newmask);
			
 
				+	if (rdtgrp->type == RDTCTRL_GROUP)
			
 
				+		ret = cpus_ctrl_write(rdtgrp, newmask, tmpmask, tmpmask1);
			
 
				+	else if (rdtgrp->type == RDTMON_GROUP)
			
 
				+		ret = cpus_mon_write(rdtgrp, newmask, tmpmask);
			
 
				+	else
			
 
				+		ret = -EINVAL;
			
 
				 
			
 
				 unlock:
			
 
				 	rdtgroup_kn_unlock(of->kn);
			
 
				 	free_cpumask_var(tmpmask);
			
 
				 	free_cpumask_var(newmask);
			
 
				+	free_cpumask_var(tmpmask1);
			
 
				 
			
 
				 	return ret ?: nbytes;
			
 
				 }
			
@@ -336,6 +414,7 @@ static void move_myself(struct callback_head *head)
 
				 	if (atomic_dec_and_test(&rdtgrp->waitcount) &&
			
 
				 	    (rdtgrp->flags & RDT_DELETED)) {
			
 
				 		current->closid = 0;
			
 
				+		current->rmid = 0;
			
 
				 		kfree(rdtgrp);
			
 
				 	}
			
 
				 
			
@@ -374,7 +453,20 @@ static int __rdtgroup_move_task(struct task_struct *tsk,
 
				 		atomic_dec(&rdtgrp->waitcount);
			
 
				 		kfree(callback);
			
 
				 	} else {
			
 
				-		tsk->closid = rdtgrp->closid;
			
 
				+		/*
			
 
				+		 * For ctrl_mon groups move both closid and rmid.
			
 
				+		 * For monitor groups, can move the tasks only from
			
 
				+		 * their parent CTRL group.
			
 
				+		 */
			
 
				+		if (rdtgrp->type == RDTCTRL_GROUP) {
			
 
				+			tsk->closid = rdtgrp->closid;
			
 
				+			tsk->rmid = rdtgrp->mon.rmid;
			
 
				+		} else if (rdtgrp->type == RDTMON_GROUP) {
			
 
				+			if (rdtgrp->mon.parent->closid == tsk->closid)
			
 
				+				tsk->rmid = rdtgrp->mon.rmid;
			
 
				+			else
			
 
				+				ret = -EINVAL;
			
 
				+		}
			
 
				 	}
			
 
				 	return ret;
			
 
				 }
			
@@ -454,7 +546,8 @@ static void show_rdt_tasks(struct rdtgroup *r, struct seq_file *s)
 
				 
			
 
				 	rcu_read_lock();
			
 
				 	for_each_process_thread(p, t) {
			
 
				-		if (t->closid == r->closid)
			
 
				+		if ((r->type == RDTCTRL_GROUP && t->closid == r->closid) ||
			
 
				+		    (r->type == RDTMON_GROUP && t->rmid == r->mon.rmid))
			
 
				 			seq_printf(s, "%d\n", t->pid);
			
 
				 	}
			
 
				 	rcu_read_unlock();
			
@@ -476,39 +569,6 @@ static int rdtgroup_tasks_show(struct kernfs_open_file *of,
 
				 	return ret;
			
 
				 }
			
 
				 
			
 
				-/* Files in each rdtgroup */
			
 
				-static struct rftype rdtgroup_base_files[] = {
			
 
				-	{
			
 
				-		.name		= "cpus",
			
 
				-		.mode		= 0644,
			
 
				-		.kf_ops		= &rdtgroup_kf_single_ops,
			
 
				-		.write		= rdtgroup_cpus_write,
			
 
				-		.seq_show	= rdtgroup_cpus_show,
			
 
				-	},
			
 
				-	{
			
 
				-		.name		= "cpus_list",
			
 
				-		.mode		= 0644,
			
 
				-		.kf_ops		= &rdtgroup_kf_single_ops,
			
 
				-		.write		= rdtgroup_cpus_write,
			
 
				-		.seq_show	= rdtgroup_cpus_show,
			
 
				-		.flags		= RFTYPE_FLAGS_CPUS_LIST,
			
 
				-	},
			
 
				-	{
			
 
				-		.name		= "tasks",
			
 
				-		.mode		= 0644,
			
 
				-		.kf_ops		= &rdtgroup_kf_single_ops,
			
 
				-		.write		= rdtgroup_tasks_write,
			
 
				-		.seq_show	= rdtgroup_tasks_show,
			
 
				-	},
			
 
				-	{
			
 
				-		.name		= "schemata",
			
 
				-		.mode		= 0644,
			
 
				-		.kf_ops		= &rdtgroup_kf_single_ops,
			
 
				-		.write		= rdtgroup_schemata_write,
			
 
				-		.seq_show	= rdtgroup_schemata_show,
			
 
				-	},
			
 
				-};
			
 
				-
			
 
				 static int rdt_num_closids_show(struct kernfs_open_file *of,
			
 
				 				struct seq_file *seq, void *v)
			
 
				 {
			
@@ -536,6 +596,15 @@ static int rdt_min_cbm_bits_show(struct kernfs_open_file *of,
 
				 	return 0;
			
 
				 }
			
 
				 
			
 
				+static int rdt_shareable_bits_show(struct kernfs_open_file *of,
			
 
				+				   struct seq_file *seq, void *v)
			
 
				+{
			
 
				+	struct rdt_resource *r = of->kn->parent->priv;
			
 
				+
			
 
				+	seq_printf(seq, "%x\n", r->cache.shareable_bits);
			
 
				+	return 0;
			
 
				+}
			
 
				+
			
 
				 static int rdt_min_bw_show(struct kernfs_open_file *of,
			
 
				 			     struct seq_file *seq, void *v)
			
 
				 {
			
@@ -545,6 +614,28 @@ static int rdt_min_bw_show(struct kernfs_open_file *of,
 
				 	return 0;
			
 
				 }
			
 
				 
			
 
				+static int rdt_num_rmids_show(struct kernfs_open_file *of,
			
 
				+			      struct seq_file *seq, void *v)
			
 
				+{
			
 
				+	struct rdt_resource *r = of->kn->parent->priv;
			
 
				+
			
 
				+	seq_printf(seq, "%d\n", r->num_rmid);
			
 
				+
			
 
				+	return 0;
			
 
				+}
			
 
				+
			
 
				+static int rdt_mon_features_show(struct kernfs_open_file *of,
			
 
				+				 struct seq_file *seq, void *v)
			
 
				+{
			
 
				+	struct rdt_resource *r = of->kn->parent->priv;
			
 
				+	struct mon_evt *mevt;
			
 
				+
			
 
				+	list_for_each_entry(mevt, &r->evt_list, list)
			
 
				+		seq_printf(seq, "%s\n", mevt->name);
			
 
				+
			
 
				+	return 0;
			
 
				+}
			
 
				+
			
 
				 static int rdt_bw_gran_show(struct kernfs_open_file *of,
			
 
				 			     struct seq_file *seq, void *v)
			
 
				 {
			
@@ -563,100 +654,220 @@ static int rdt_delay_linear_show(struct kernfs_open_file *of,
 
				 	return 0;
			
 
				 }
			
 
				 
			
 
				+static int max_threshold_occ_show(struct kernfs_open_file *of,
			
 
				+				  struct seq_file *seq, void *v)
			
 
				+{
			
 
				+	struct rdt_resource *r = of->kn->parent->priv;
			
 
				+
			
 
				+	seq_printf(seq, "%u\n", intel_cqm_threshold * r->mon_scale);
			
 
				+
			
 
				+	return 0;
			
 
				+}
			
 
				+
			
 
				+static ssize_t max_threshold_occ_write(struct kernfs_open_file *of,
			
 
				+				       char *buf, size_t nbytes, loff_t off)
			
 
				+{
			
 
				+	struct rdt_resource *r = of->kn->parent->priv;
			
 
				+	unsigned int bytes;
			
 
				+	int ret;
			
 
				+
			
 
				+	ret = kstrtouint(buf, 0, &bytes);
			
 
				+	if (ret)
			
 
				+		return ret;
			
 
				+
			
 
				+	if (bytes > (boot_cpu_data.x86_cache_size * 1024))
			
 
				+		return -EINVAL;
			
 
				+
			
 
				+	intel_cqm_threshold = bytes / r->mon_scale;
			
 
				+
			
 
				+	return nbytes;
			
 
				+}
			
 
				+
			
 
				 /* rdtgroup information files for one cache resource. */
			
 
				-static struct rftype res_cache_info_files[] = {
			
 
				+static struct rftype res_common_files[] = {
			
 
				 	{
			
 
				 		.name		= "num_closids",
			
 
				 		.mode		= 0444,
			
 
				 		.kf_ops		= &rdtgroup_kf_single_ops,
			
 
				 		.seq_show	= rdt_num_closids_show,
			
 
				+		.fflags		= RF_CTRL_INFO,
			
 
				+	},
			
 
				+	{
			
 
				+		.name		= "mon_features",
			
 
				+		.mode		= 0444,
			
 
				+		.kf_ops		= &rdtgroup_kf_single_ops,
			
 
				+		.seq_show	= rdt_mon_features_show,
			
 
				+		.fflags		= RF_MON_INFO,
			
 
				+	},
			
 
				+	{
			
 
				+		.name		= "num_rmids",
			
 
				+		.mode		= 0444,
			
 
				+		.kf_ops		= &rdtgroup_kf_single_ops,
			
 
				+		.seq_show	= rdt_num_rmids_show,
			
 
				+		.fflags		= RF_MON_INFO,
			
 
				 	},
			
 
				 	{
			
 
				 		.name		= "cbm_mask",
			
 
				 		.mode		= 0444,
			
 
				 		.kf_ops		= &rdtgroup_kf_single_ops,
			
 
				 		.seq_show	= rdt_default_ctrl_show,
			
 
				+		.fflags		= RF_CTRL_INFO | RFTYPE_RES_CACHE,
			
 
				 	},
			
 
				 	{
			
 
				 		.name		= "min_cbm_bits",
			
 
				 		.mode		= 0444,
			
 
				 		.kf_ops		= &rdtgroup_kf_single_ops,
			
 
				 		.seq_show	= rdt_min_cbm_bits_show,
			
 
				+		.fflags		= RF_CTRL_INFO | RFTYPE_RES_CACHE,
			
 
				 	},
			
 
				-};
			
 
				-
			
 
				-/* rdtgroup information files for memory bandwidth. */
			
 
				-static struct rftype res_mba_info_files[] = {
			
 
				 	{
			
 
				-		.name		= "num_closids",
			
 
				+		.name		= "shareable_bits",
			
 
				 		.mode		= 0444,
			
 
				 		.kf_ops		= &rdtgroup_kf_single_ops,
			
 
				-		.seq_show	= rdt_num_closids_show,
			
 
				+		.seq_show	= rdt_shareable_bits_show,
			
 
				+		.fflags		= RF_CTRL_INFO | RFTYPE_RES_CACHE,
			
 
				 	},
			
 
				 	{
			
 
				 		.name		= "min_bandwidth",
			
 
				 		.mode		= 0444,
			
 
				 		.kf_ops		= &rdtgroup_kf_single_ops,
			
 
				 		.seq_show	= rdt_min_bw_show,
			
 
				+		.fflags		= RF_CTRL_INFO | RFTYPE_RES_MB,
			
 
				 	},
			
 
				 	{
			
 
				 		.name		= "bandwidth_gran",
			
 
				 		.mode		= 0444,
			
 
				 		.kf_ops		= &rdtgroup_kf_single_ops,
			
 
				 		.seq_show	= rdt_bw_gran_show,
			
 
				+		.fflags		= RF_CTRL_INFO | RFTYPE_RES_MB,
			
 
				 	},
			
 
				 	{
			
 
				 		.name		= "delay_linear",
			
 
				 		.mode		= 0444,
			
 
				 		.kf_ops		= &rdtgroup_kf_single_ops,
			
 
				 		.seq_show	= rdt_delay_linear_show,
			
 
				+		.fflags		= RF_CTRL_INFO | RFTYPE_RES_MB,
			
 
				+	},
			
 
				+	{
			
 
				+		.name		= "max_threshold_occupancy",
			
 
				+		.mode		= 0644,
			
 
				+		.kf_ops		= &rdtgroup_kf_single_ops,
			
 
				+		.write		= max_threshold_occ_write,
			
 
				+		.seq_show	= max_threshold_occ_show,
			
 
				+		.fflags		= RF_MON_INFO | RFTYPE_RES_CACHE,
			
 
				+	},
			
 
				+	{
			
 
				+		.name		= "cpus",
			
 
				+		.mode		= 0644,
			
 
				+		.kf_ops		= &rdtgroup_kf_single_ops,
			
 
				+		.write		= rdtgroup_cpus_write,
			
 
				+		.seq_show	= rdtgroup_cpus_show,
			
 
				+		.fflags		= RFTYPE_BASE,
			
 
				+	},
			
 
				+	{
			
 
				+		.name		= "cpus_list",
			
 
				+		.mode		= 0644,
			
 
				+		.kf_ops		= &rdtgroup_kf_single_ops,
			
 
				+		.write		= rdtgroup_cpus_write,
			
 
				+		.seq_show	= rdtgroup_cpus_show,
			
 
				+		.flags		= RFTYPE_FLAGS_CPUS_LIST,
			
 
				+		.fflags		= RFTYPE_BASE,
			
 
				+	},
			
 
				+	{
			
 
				+		.name		= "tasks",
			
 
				+		.mode		= 0644,
			
 
				+		.kf_ops		= &rdtgroup_kf_single_ops,
			
 
				+		.write		= rdtgroup_tasks_write,
			
 
				+		.seq_show	= rdtgroup_tasks_show,
			
 
				+		.fflags		= RFTYPE_BASE,
			
 
				+	},
			
 
				+	{
			
 
				+		.name		= "schemata",
			
 
				+		.mode		= 0644,
			
 
				+		.kf_ops		= &rdtgroup_kf_single_ops,
			
 
				+		.write		= rdtgroup_schemata_write,
			
 
				+		.seq_show	= rdtgroup_schemata_show,
			
 
				+		.fflags		= RF_CTRL_BASE,
			
 
				 	},
			
 
				 };
			
 
				 
			
 
				-void rdt_get_mba_infofile(struct rdt_resource *r)
			
 
				+static int rdtgroup_add_files(struct kernfs_node *kn, unsigned long fflags)
			
 
				 {
			
 
				-	r->info_files = res_mba_info_files;
			
 
				-	r->nr_info_files = ARRAY_SIZE(res_mba_info_files);
			
 
				-}
			
 
				+	struct rftype *rfts, *rft;
			
 
				+	int ret, len;
			
 
				 
			
 
				-void rdt_get_cache_infofile(struct rdt_resource *r)
			
 
				-{
			
 
				-	r->info_files = res_cache_info_files;
			
 
				-	r->nr_info_files = ARRAY_SIZE(res_cache_info_files);
			
 
				+	rfts = res_common_files;
			
 
				+	len = ARRAY_SIZE(res_common_files);
			
 
				+
			
 
				+	lockdep_assert_held(&rdtgroup_mutex);
			
 
				+
			
 
				+	for (rft = rfts; rft < rfts + len; rft++) {
			
 
				+		if ((fflags & rft->fflags) == rft->fflags) {
			
 
				+			ret = rdtgroup_add_file(kn, rft);
			
 
				+			if (ret)
			
 
				+				goto error;
			
 
				+		}
			
 
				+	}
			
 
				+
			
 
				+	return 0;
			
 
				+error:
			
 
				+	pr_warn("Failed to add %s, err=%d\n", rft->name, ret);
			
 
				+	while (--rft >= rfts) {
			
 
				+		if ((fflags & rft->fflags) == rft->fflags)
			
 
				+			kernfs_remove_by_name(kn, rft->name);
			
 
				+	}
			
 
				+	return ret;
			
 
				 }
			
 
				 
			
 
				-static int rdtgroup_create_info_dir(struct kernfs_node *parent_kn)
			
 
				+static int rdtgroup_mkdir_info_resdir(struct rdt_resource *r, char *name,
			
 
				+				      unsigned long fflags)
			
 
				 {
			
 
				 	struct kernfs_node *kn_subdir;
			
 
				-	struct rftype *res_info_files;
			
 
				-	struct rdt_resource *r;
			
 
				-	int ret, len;
			
 
				+	int ret;
			
 
				 
			
 
				-	/* create the directory */
			
 
				-	kn_info = kernfs_create_dir(parent_kn, "info", parent_kn->mode, NULL);
			
 
				-	if (IS_ERR(kn_info))
			
 
				-		return PTR_ERR(kn_info);
			
 
				+	kn_subdir = kernfs_create_dir(kn_info, name,
			
 
				+				      kn_info->mode, r);
			
 
				+	if (IS_ERR(kn_subdir))
			
 
				+		return PTR_ERR(kn_subdir);
			
 
				+
			
 
				+	kernfs_get(kn_subdir);
			
 
				+	ret = rdtgroup_kn_set_ugid(kn_subdir);
			
 
				+	if (ret)
			
 
				+		return ret;
			
 
				+
			
 
				+	ret = rdtgroup_add_files(kn_subdir, fflags);
			
 
				+	if (!ret)
			
 
				+		kernfs_activate(kn_subdir);
			
 
				+
			
 
				+	return ret;
			
 
				+}
			
 
				+
			
 
				+static int rdtgroup_create_info_dir(struct kernfs_node *parent_kn)
			
 
				+{
			
 
				+	struct rdt_resource *r;
			
 
				+	unsigned long fflags;
			
 
				+	char name[32];
			
 
				+	int ret;
			
 
				+
			
 
				+	/* create the directory */
			
 
				+	kn_info = kernfs_create_dir(parent_kn, "info", parent_kn->mode, NULL);
			
 
				+	if (IS_ERR(kn_info))
			
 
				+		return PTR_ERR(kn_info);
			
 
				 	kernfs_get(kn_info);
			
 
				 
			
 
				-	for_each_enabled_rdt_resource(r) {
			
 
				-		kn_subdir = kernfs_create_dir(kn_info, r->name,
			
 
				-					      kn_info->mode, r);
			
 
				-		if (IS_ERR(kn_subdir)) {
			
 
				-			ret = PTR_ERR(kn_subdir);
			
 
				-			goto out_destroy;
			
 
				-		}
			
 
				-		kernfs_get(kn_subdir);
			
 
				-		ret = rdtgroup_kn_set_ugid(kn_subdir);
			
 
				+	for_each_alloc_enabled_rdt_resource(r) {
			
 
				+		fflags =  r->fflags | RF_CTRL_INFO;
			
 
				+		ret = rdtgroup_mkdir_info_resdir(r, r->name, fflags);
			
 
				 		if (ret)
			
 
				 			goto out_destroy;
			
 
				+	}
			
 
				 
			
 
				-		res_info_files = r->info_files;
			
 
				-		len = r->nr_info_files;
			
 
				-
			
 
				-		ret = rdtgroup_add_files(kn_subdir, res_info_files, len);
			
 
				+	for_each_mon_enabled_rdt_resource(r) {
			
 
				+		fflags =  r->fflags | RF_MON_INFO;
			
 
				+		sprintf(name, "%s_MON", r->name);
			
 
				+		ret = rdtgroup_mkdir_info_resdir(r, name, fflags);
			
 
				 		if (ret)
			
 
				 			goto out_destroy;
			
 
				-		kernfs_activate(kn_subdir);
			
 
				 	}
			
 
				 
			
 
				 	/*
			
@@ -678,6 +889,39 @@ out_destroy:
 
				 	return ret;
			
 
				 }
			
 
				 
			
 
				+static int
			
 
				+mongroup_create_dir(struct kernfs_node *parent_kn, struct rdtgroup *prgrp,
			
 
				+		    char *name, struct kernfs_node **dest_kn)
			
 
				+{
			
 
				+	struct kernfs_node *kn;
			
 
				+	int ret;
			
 
				+
			
 
				+	/* create the directory */
			
 
				+	kn = kernfs_create_dir(parent_kn, name, parent_kn->mode, prgrp);
			
 
				+	if (IS_ERR(kn))
			
 
				+		return PTR_ERR(kn);
			
 
				+
			
 
				+	if (dest_kn)
			
 
				+		*dest_kn = kn;
			
 
				+
			
 
				+	/*
			
 
				+	 * This extra ref will be put in kernfs_remove() and guarantees
			
 
				+	 * that kn is always accessible.
			
 
				+	 */
			
 
				+	kernfs_get(kn);
			
 
				+
			
 
				+	ret = rdtgroup_kn_set_ugid(kn);
			
 
				+	if (ret)
			
 
				+		goto out_destroy;
			
 
				+
			
 
				+	kernfs_activate(kn);
			
 
				+
			
 
				+	return 0;
			
 
				+
			
 
				+out_destroy:
			
 
				+	kernfs_remove(kn);
			
 
				+	return ret;
			
 
				+}
			
 
				 static void l3_qos_cfg_update(void *arg)
			
 
				 {
			
 
				 	bool *enable = arg;
			
@@ -718,14 +962,15 @@ static int cdp_enable(void)
 
				 	struct rdt_resource *r_l3 = &rdt_resources_all[RDT_RESOURCE_L3];
			
 
				 	int ret;
			
 
				 
			
 
				-	if (!r_l3->capable || !r_l3data->capable || !r_l3code->capable)
			
 
				+	if (!r_l3->alloc_capable || !r_l3data->alloc_capable ||
			
 
				+	    !r_l3code->alloc_capable)
			
 
				 		return -EINVAL;
			
 
				 
			
 
				 	ret = set_l3_qos_cfg(r_l3, true);
			
 
				 	if (!ret) {
			
 
				-		r_l3->enabled = false;
			
 
				-		r_l3data->enabled = true;
			
 
				-		r_l3code->enabled = true;
			
 
				+		r_l3->alloc_enabled = false;
			
 
				+		r_l3data->alloc_enabled = true;
			
 
				+		r_l3code->alloc_enabled = true;
			
 
				 	}
			
 
				 	return ret;
			
 
				 }
			
@@ -734,11 +979,11 @@ static void cdp_disable(void)
 
				 {
			
 
				 	struct rdt_resource *r = &rdt_resources_all[RDT_RESOURCE_L3];
			
 
				 
			
 
				-	r->enabled = r->capable;
			
 
				+	r->alloc_enabled = r->alloc_capable;
			
 
				 
			
 
				-	if (rdt_resources_all[RDT_RESOURCE_L3DATA].enabled) {
			
 
				-		rdt_resources_all[RDT_RESOURCE_L3DATA].enabled = false;
			
 
				-		rdt_resources_all[RDT_RESOURCE_L3CODE].enabled = false;
			
 
				+	if (rdt_resources_all[RDT_RESOURCE_L3DATA].alloc_enabled) {
			
 
				+		rdt_resources_all[RDT_RESOURCE_L3DATA].alloc_enabled = false;
			
 
				+		rdt_resources_all[RDT_RESOURCE_L3CODE].alloc_enabled = false;
			
 
				 		set_l3_qos_cfg(r, false);
			
 
				 	}
			
 
				 }
			
@@ -823,10 +1068,16 @@ void rdtgroup_kn_unlock(struct kernfs_node *kn)
 
				 	}
			
 
				 }
			
 
				 
			
 
				+static int mkdir_mondata_all(struct kernfs_node *parent_kn,
			
 
				+			     struct rdtgroup *prgrp,
			
 
				+			     struct kernfs_node **mon_data_kn);
			
 
				+
			
 
				 static struct dentry *rdt_mount(struct file_system_type *fs_type,
			
 
				 				int flags, const char *unused_dev_name,
			
 
				 				void *data)
			
 
				 {
			
 
				+	struct rdt_domain *dom;
			
 
				+	struct rdt_resource *r;
			
 
				 	struct dentry *dentry;
			
 
				 	int ret;
			
 
				 
			
@@ -853,15 +1104,54 @@ static struct dentry *rdt_mount(struct file_system_type *fs_type,
 
				 		goto out_cdp;
			
 
				 	}
			
 
				 
			
 
				+	if (rdt_mon_capable) {
			
 
				+		ret = mongroup_create_dir(rdtgroup_default.kn,
			
 
				+					  NULL, "mon_groups",
			
 
				+					  &kn_mongrp);
			
 
				+		if (ret) {
			
 
				+			dentry = ERR_PTR(ret);
			
 
				+			goto out_info;
			
 
				+		}
			
 
				+		kernfs_get(kn_mongrp);
			
 
				+
			
 
				+		ret = mkdir_mondata_all(rdtgroup_default.kn,
			
 
				+					&rdtgroup_default, &kn_mondata);
			
 
				+		if (ret) {
			
 
				+			dentry = ERR_PTR(ret);
			
 
				+			goto out_mongrp;
			
 
				+		}
			
 
				+		kernfs_get(kn_mondata);
			
 
				+		rdtgroup_default.mon.mon_data_kn = kn_mondata;
			
 
				+	}
			
 
				+
			
 
				 	dentry = kernfs_mount(fs_type, flags, rdt_root,
			
 
				 			      RDTGROUP_SUPER_MAGIC, NULL);
			
 
				 	if (IS_ERR(dentry))
			
 
				-		goto out_destroy;
			
 
				+		goto out_mondata;
			
 
				+
			
 
				+	if (rdt_alloc_capable)
			
 
				+		static_branch_enable(&rdt_alloc_enable_key);
			
 
				+	if (rdt_mon_capable)
			
 
				+		static_branch_enable(&rdt_mon_enable_key);
			
 
				+
			
 
				+	if (rdt_alloc_capable || rdt_mon_capable)
			
 
				+		static_branch_enable(&rdt_enable_key);
			
 
				+
			
 
				+	if (is_mbm_enabled()) {
			
 
				+		r = &rdt_resources_all[RDT_RESOURCE_L3];
			
 
				+		list_for_each_entry(dom, &r->domains, list)
			
 
				+			mbm_setup_overflow_handler(dom, MBM_OVERFLOW_INTERVAL);
			
 
				+	}
			
 
				 
			
 
				-	static_branch_enable(&rdt_enable_key);
			
 
				 	goto out;
			
 
				 
			
 
				-out_destroy:
			
 
				+out_mondata:
			
 
				+	if (rdt_mon_capable)
			
 
				+		kernfs_remove(kn_mondata);
			
 
				+out_mongrp:
			
 
				+	if (rdt_mon_capable)
			
 
				+		kernfs_remove(kn_mongrp);
			
 
				+out_info:
			
 
				 	kernfs_remove(kn_info);
			
 
				 out_cdp:
			
 
				 	cdp_disable();
			
@@ -909,6 +1199,18 @@ static int reset_all_ctrls(struct rdt_resource *r)
 
				 	return 0;
			
 
				 }
			
 
				 
			
 
				+static bool is_closid_match(struct task_struct *t, struct rdtgroup *r)
			
 
				+{
			
 
				+	return (rdt_alloc_capable &&
			
 
				+		(r->type == RDTCTRL_GROUP) && (t->closid == r->closid));
			
 
				+}
			
 
				+
			
 
				+static bool is_rmid_match(struct task_struct *t, struct rdtgroup *r)
			
 
				+{
			
 
				+	return (rdt_mon_capable &&
			
 
				+		(r->type == RDTMON_GROUP) && (t->rmid == r->mon.rmid));
			
 
				+}
			
 
				+
			
 
				 /*
			
 
				  * Move tasks from one to the other group. If @from is NULL, then all tasks
			
 
				  * in the systems are moved unconditionally (used for teardown).
			
@@ -924,8 +1226,11 @@ static void rdt_move_group_tasks(struct rdtgroup *from, struct rdtgroup *to,
 
				 
			
 
				 	read_lock(&tasklist_lock);
			
 
				 	for_each_process_thread(p, t) {
			
 
				-		if (!from || t->closid == from->closid) {
			
 
				+		if (!from || is_closid_match(t, from) ||
			
 
				+		    is_rmid_match(t, from)) {
			
 
				 			t->closid = to->closid;
			
 
				+			t->rmid = to->mon.rmid;
			
 
				+
			
 
				 #ifdef CONFIG_SMP
			
 
				 			/*
			
 
				 			 * This is safe on x86 w/o barriers as the ordering
			
@@ -944,6 +1249,19 @@ static void rdt_move_group_tasks(struct rdtgroup *from, struct rdtgroup *to,
 
				 	read_unlock(&tasklist_lock);
			
 
				 }
			
 
				 
			
 
				+static void free_all_child_rdtgrp(struct rdtgroup *rdtgrp)
			
 
				+{
			
 
				+	struct rdtgroup *sentry, *stmp;
			
 
				+	struct list_head *head;
			
 
				+
			
 
				+	head = &rdtgrp->mon.crdtgrp_list;
			
 
				+	list_for_each_entry_safe(sentry, stmp, head, mon.crdtgrp_list) {
			
 
				+		free_rmid(sentry->mon.rmid);
			
 
				+		list_del(&sentry->mon.crdtgrp_list);
			
 
				+		kfree(sentry);
			
 
				+	}
			
 
				+}
			
 
				+
			
 
				 /*
			
 
				  * Forcibly remove all of subdirectories under root.
			
 
				  */
			
@@ -955,6 +1273,9 @@ static void rmdir_all_sub(void)
 
				 	rdt_move_group_tasks(NULL, &rdtgroup_default, NULL);
			
 
				 
			
 
				 	list_for_each_entry_safe(rdtgrp, tmp, &rdt_all_groups, rdtgroup_list) {
			
 
				+		/* Free any child rmids */
			
 
				+		free_all_child_rdtgrp(rdtgrp);
			
 
				+
			
 
				 		/* Remove each rdtgroup other than root */
			
 
				 		if (rdtgrp == &rdtgroup_default)
			
 
				 			continue;
			
@@ -967,16 +1288,20 @@ static void rmdir_all_sub(void)
 
				 		cpumask_or(&rdtgroup_default.cpu_mask,
			
 
				 			   &rdtgroup_default.cpu_mask, &rdtgrp->cpu_mask);
			
 
				 
			
 
				+		free_rmid(rdtgrp->mon.rmid);
			
 
				+
			
 
				 		kernfs_remove(rdtgrp->kn);
			
 
				 		list_del(&rdtgrp->rdtgroup_list);
			
 
				 		kfree(rdtgrp);
			
 
				 	}
			
 
				 	/* Notify online CPUs to update per cpu storage and PQR_ASSOC MSR */
			
 
				 	get_online_cpus();
			
 
				-	rdt_update_closid(cpu_online_mask, &rdtgroup_default.closid);
			
 
				+	update_closid_rmid(cpu_online_mask, &rdtgroup_default);
			
 
				 	put_online_cpus();
			
 
				 
			
 
				 	kernfs_remove(kn_info);
			
 
				+	kernfs_remove(kn_mongrp);
			
 
				+	kernfs_remove(kn_mondata);
			
 
				 }
			
 
				 
			
 
				 static void rdt_kill_sb(struct super_block *sb)
			
@@ -986,10 +1311,12 @@ static void rdt_kill_sb(struct super_block *sb)
 
				 	mutex_lock(&rdtgroup_mutex);
			
 
				 
			
 
				 	/*Put everything back to default values. */
			
 
				-	for_each_enabled_rdt_resource(r)
			
 
				+	for_each_alloc_enabled_rdt_resource(r)
			
 
				 		reset_all_ctrls(r);
			
 
				 	cdp_disable();
			
 
				 	rmdir_all_sub();
			
 
				+	static_branch_disable(&rdt_alloc_enable_key);
			
 
				+	static_branch_disable(&rdt_mon_enable_key);
			
 
				 	static_branch_disable(&rdt_enable_key);
			
 
				 	kernfs_kill_sb(sb);
			
 
				 	mutex_unlock(&rdtgroup_mutex);
			
@@ -1001,46 +1328,223 @@ static struct file_system_type rdt_fs_type = {
 
				 	.kill_sb = rdt_kill_sb,
			
 
				 };
			
 
				 
			
 
				-static int rdtgroup_mkdir(struct kernfs_node *parent_kn, const char *name,
			
 
				-			  umode_t mode)
			
 
				+static int mon_addfile(struct kernfs_node *parent_kn, const char *name,
			
 
				+		       void *priv)
			
 
				 {
			
 
				-	struct rdtgroup *parent, *rdtgrp;
			
 
				 	struct kernfs_node *kn;
			
 
				-	int ret, closid;
			
 
				+	int ret = 0;
			
 
				 
			
 
				-	/* Only allow mkdir in the root directory */
			
 
				-	if (parent_kn != rdtgroup_default.kn)
			
 
				-		return -EPERM;
			
 
				+	kn = __kernfs_create_file(parent_kn, name, 0444, 0,
			
 
				+				  &kf_mondata_ops, priv, NULL, NULL);
			
 
				+	if (IS_ERR(kn))
			
 
				+		return PTR_ERR(kn);
			
 
				 
			
 
				-	/* Do not accept '\n' to avoid unparsable situation. */
			
 
				-	if (strchr(name, '\n'))
			
 
				-		return -EINVAL;
			
 
				+	ret = rdtgroup_kn_set_ugid(kn);
			
 
				+	if (ret) {
			
 
				+		kernfs_remove(kn);
			
 
				+		return ret;
			
 
				+	}
			
 
				 
			
 
				-	parent = rdtgroup_kn_lock_live(parent_kn);
			
 
				-	if (!parent) {
			
 
				-		ret = -ENODEV;
			
 
				-		goto out_unlock;
			
 
				+	return ret;
			
 
				+}
			
 
				+
			
 
				+/*
			
 
				+ * Remove all subdirectories of mon_data of ctrl_mon groups
			
 
				+ * and monitor groups with given domain id.
			
 
				+ */
			
 
				+void rmdir_mondata_subdir_allrdtgrp(struct rdt_resource *r, unsigned int dom_id)
			
 
				+{
			
 
				+	struct rdtgroup *prgrp, *crgrp;
			
 
				+	char name[32];
			
 
				+
			
 
				+	if (!r->mon_enabled)
			
 
				+		return;
			
 
				+
			
 
				+	list_for_each_entry(prgrp, &rdt_all_groups, rdtgroup_list) {
			
 
				+		sprintf(name, "mon_%s_%02d", r->name, dom_id);
			
 
				+		kernfs_remove_by_name(prgrp->mon.mon_data_kn, name);
			
 
				+
			
 
				+		list_for_each_entry(crgrp, &prgrp->mon.crdtgrp_list, mon.crdtgrp_list)
			
 
				+			kernfs_remove_by_name(crgrp->mon.mon_data_kn, name);
			
 
				 	}
			
 
				+}
			
 
				 
			
 
				-	ret = closid_alloc();
			
 
				-	if (ret < 0)
			
 
				+static int mkdir_mondata_subdir(struct kernfs_node *parent_kn,
			
 
				+				struct rdt_domain *d,
			
 
				+				struct rdt_resource *r, struct rdtgroup *prgrp)
			
 
				+{
			
 
				+	union mon_data_bits priv;
			
 
				+	struct kernfs_node *kn;
			
 
				+	struct mon_evt *mevt;
			
 
				+	struct rmid_read rr;
			
 
				+	char name[32];
			
 
				+	int ret;
			
 
				+
			
 
				+	sprintf(name, "mon_%s_%02d", r->name, d->id);
			
 
				+	/* create the directory */
			
 
				+	kn = kernfs_create_dir(parent_kn, name, parent_kn->mode, prgrp);
			
 
				+	if (IS_ERR(kn))
			
 
				+		return PTR_ERR(kn);
			
 
				+
			
 
				+	/*
			
 
				+	 * This extra ref will be put in kernfs_remove() and guarantees
			
 
				+	 * that kn is always accessible.
			
 
				+	 */
			
 
				+	kernfs_get(kn);
			
 
				+	ret = rdtgroup_kn_set_ugid(kn);
			
 
				+	if (ret)
			
 
				+		goto out_destroy;
			
 
				+
			
 
				+	if (WARN_ON(list_empty(&r->evt_list))) {
			
 
				+		ret = -EPERM;
			
 
				+		goto out_destroy;
			
 
				+	}
			
 
				+
			
 
				+	priv.u.rid = r->rid;
			
 
				+	priv.u.domid = d->id;
			
 
				+	list_for_each_entry(mevt, &r->evt_list, list) {
			
 
				+		priv.u.evtid = mevt->evtid;
			
 
				+		ret = mon_addfile(kn, mevt->name, priv.priv);
			
 
				+		if (ret)
			
 
				+			goto out_destroy;
			
 
				+
			
 
				+		if (is_mbm_event(mevt->evtid))
			
 
				+			mon_event_read(&rr, d, prgrp, mevt->evtid, true);
			
 
				+	}
			
 
				+	kernfs_activate(kn);
			
 
				+	return 0;
			
 
				+
			
 
				+out_destroy:
			
 
				+	kernfs_remove(kn);
			
 
				+	return ret;
			
 
				+}
			
 
				+
			
 
				+/*
			
 
				+ * Add all subdirectories of mon_data for "ctrl_mon" groups
			
 
				+ * and "monitor" groups with given domain id.
			
 
				+ */
			
 
				+void mkdir_mondata_subdir_allrdtgrp(struct rdt_resource *r,
			
 
				+				    struct rdt_domain *d)
			
 
				+{
			
 
				+	struct kernfs_node *parent_kn;
			
 
				+	struct rdtgroup *prgrp, *crgrp;
			
 
				+	struct list_head *head;
			
 
				+
			
 
				+	if (!r->mon_enabled)
			
 
				+		return;
			
 
				+
			
 
				+	list_for_each_entry(prgrp, &rdt_all_groups, rdtgroup_list) {
			
 
				+		parent_kn = prgrp->mon.mon_data_kn;
			
 
				+		mkdir_mondata_subdir(parent_kn, d, r, prgrp);
			
 
				+
			
 
				+		head = &prgrp->mon.crdtgrp_list;
			
 
				+		list_for_each_entry(crgrp, head, mon.crdtgrp_list) {
			
 
				+			parent_kn = crgrp->mon.mon_data_kn;
			
 
				+			mkdir_mondata_subdir(parent_kn, d, r, crgrp);
			
 
				+		}
			
 
				+	}
			
 
				+}
			
 
				+
			
 
				+static int mkdir_mondata_subdir_alldom(struct kernfs_node *parent_kn,
			
 
				+				       struct rdt_resource *r,
			
 
				+				       struct rdtgroup *prgrp)
			
 
				+{
			
 
				+	struct rdt_domain *dom;
			
 
				+	int ret;
			
 
				+
			
 
				+	list_for_each_entry(dom, &r->domains, list) {
			
 
				+		ret = mkdir_mondata_subdir(parent_kn, dom, r, prgrp);
			
 
				+		if (ret)
			
 
				+			return ret;
			
 
				+	}
			
 
				+
			
 
				+	return 0;
			
 
				+}
			
 
				+
			
 
				+/*
			
 
				+ * This creates a directory mon_data which contains the monitored data.
			
 
				+ *
			
 
				+ * mon_data has one directory for each domain which are named
			
 
				+ * in the format mon_<resource_name>_<domain_id>. For example, a mon_data
			
 
				+ * with L3 domain looks as below:
			
 
				+ * ./mon_data:
			
 
				+ * mon_L3_00
			
 
				+ * mon_L3_01
			
 
				+ * mon_L3_02
			
 
				+ * ...
			
 
				+ *
			
 
				+ * Each domain directory has one file per event:
			
 
				+ * ./mon_L3_00/:
			
 
				+ * llc_occupancy
			
 
				+ *
			
 
				+ */
			
 
				+static int mkdir_mondata_all(struct kernfs_node *parent_kn,
			
 
				+			     struct rdtgroup *prgrp,
			
 
				+			     struct kernfs_node **dest_kn)
			
 
				+{
			
 
				+	struct rdt_resource *r;
			
 
				+	struct kernfs_node *kn;
			
 
				+	int ret;
			
 
				+
			
 
				+	/*
			
 
				+	 * Create the mon_data directory first.
			
 
				+	 */
			
 
				+	ret = mongroup_create_dir(parent_kn, NULL, "mon_data", &kn);
			
 
				+	if (ret)
			
 
				+		return ret;
			
 
				+
			
 
				+	if (dest_kn)
			
 
				+		*dest_kn = kn;
			
 
				+
			
 
				+	/*
			
 
				+	 * Create the subdirectories for each domain. Note that all events
			
 
				+	 * in a domain like L3 are grouped into a resource whose domain is L3
			
 
				+	 */
			
 
				+	for_each_mon_enabled_rdt_resource(r) {
			
 
				+		ret = mkdir_mondata_subdir_alldom(kn, r, prgrp);
			
 
				+		if (ret)
			
 
				+			goto out_destroy;
			
 
				+	}
			
 
				+
			
 
				+	return 0;
			
 
				+
			
 
				+out_destroy:
			
 
				+	kernfs_remove(kn);
			
 
				+	return ret;
			
 
				+}
			
 
				+
			
 
				+static int mkdir_rdt_prepare(struct kernfs_node *parent_kn,
			
 
				+			     struct kernfs_node *prgrp_kn,
			
 
				+			     const char *name, umode_t mode,
			
 
				+			     enum rdt_group_type rtype, struct rdtgroup **r)
			
 
				+{
			
 
				+	struct rdtgroup *prdtgrp, *rdtgrp;
			
 
				+	struct kernfs_node *kn;
			
 
				+	uint files = 0;
			
 
				+	int ret;
			
 
				+
			
 
				+	prdtgrp = rdtgroup_kn_lock_live(prgrp_kn);
			
 
				+	if (!prdtgrp) {
			
 
				+		ret = -ENODEV;
			
 
				 		goto out_unlock;
			
 
				-	closid = ret;
			
 
				+	}
			
 
				 
			
 
				 	/* allocate the rdtgroup. */
			
 
				 	rdtgrp = kzalloc(sizeof(*rdtgrp), GFP_KERNEL);
			
 
				 	if (!rdtgrp) {
			
 
				 		ret = -ENOSPC;
			
 
				-		goto out_closid_free;
			
 
				+		goto out_unlock;
			
 
				 	}
			
 
				-	rdtgrp->closid = closid;
			
 
				-	list_add(&rdtgrp->rdtgroup_list, &rdt_all_groups);
			
 
				+	*r = rdtgrp;
			
 
				+	rdtgrp->mon.parent = prdtgrp;
			
 
				+	rdtgrp->type = rtype;
			
 
				+	INIT_LIST_HEAD(&rdtgrp->mon.crdtgrp_list);
			
 
				 
			
 
				 	/* kernfs creates the directory for rdtgrp */
			
 
				-	kn = kernfs_create_dir(parent->kn, name, mode, rdtgrp);
			
 
				+	kn = kernfs_create_dir(parent_kn, name, mode, rdtgrp);
			
 
				 	if (IS_ERR(kn)) {
			
 
				 		ret = PTR_ERR(kn);
			
 
				-		goto out_cancel_ref;
			
 
				+		goto out_free_rgrp;
			
 
				 	}
			
 
				 	rdtgrp->kn = kn;
			
 
				 
			
@@ -1056,43 +1560,211 @@ static int rdtgroup_mkdir(struct kernfs_node *parent_kn, const char *name,
 
				 	if (ret)
			
 
				 		goto out_destroy;
			
 
				 
			
 
				-	ret = rdtgroup_add_files(kn, rdtgroup_base_files,
			
 
				-				 ARRAY_SIZE(rdtgroup_base_files));
			
 
				+	files = RFTYPE_BASE | RFTYPE_CTRL;
			
 
				+	files = RFTYPE_BASE | BIT(RF_CTRLSHIFT + rtype);
			
 
				+	ret = rdtgroup_add_files(kn, files);
			
 
				 	if (ret)
			
 
				 		goto out_destroy;
			
 
				 
			
 
				+	if (rdt_mon_capable) {
			
 
				+		ret = alloc_rmid();
			
 
				+		if (ret < 0)
			
 
				+			goto out_destroy;
			
 
				+		rdtgrp->mon.rmid = ret;
			
 
				+
			
 
				+		ret = mkdir_mondata_all(kn, rdtgrp, &rdtgrp->mon.mon_data_kn);
			
 
				+		if (ret)
			
 
				+			goto out_idfree;
			
 
				+	}
			
 
				 	kernfs_activate(kn);
			
 
				 
			
 
				-	ret = 0;
			
 
				-	goto out_unlock;
			
 
				+	/*
			
 
				+	 * The caller unlocks the prgrp_kn upon success.
			
 
				+	 */
			
 
				+	return 0;
			
 
				 
			
 
				+out_idfree:
			
 
				+	free_rmid(rdtgrp->mon.rmid);
			
 
				 out_destroy:
			
 
				 	kernfs_remove(rdtgrp->kn);
			
 
				-out_cancel_ref:
			
 
				-	list_del(&rdtgrp->rdtgroup_list);
			
 
				+out_free_rgrp:
			
 
				 	kfree(rdtgrp);
			
 
				-out_closid_free:
			
 
				-	closid_free(closid);
			
 
				 out_unlock:
			
 
				-	rdtgroup_kn_unlock(parent_kn);
			
 
				+	rdtgroup_kn_unlock(prgrp_kn);
			
 
				 	return ret;
			
 
				 }
			
 
				 
			
 
				-static int rdtgroup_rmdir(struct kernfs_node *kn)
			
 
				+static void mkdir_rdt_prepare_clean(struct rdtgroup *rgrp)
			
 
				+{
			
 
				+	kernfs_remove(rgrp->kn);
			
 
				+	free_rmid(rgrp->mon.rmid);
			
 
				+	kfree(rgrp);
			
 
				+}
			
 
				+
			
 
				+/*
			
 
				+ * Create a monitor group under "mon_groups" directory of a control
			
 
				+ * and monitor group (ctrl_mon). This is a resource group
			
 
				+ * to monitor a subset of tasks and cpus in its parent ctrl_mon group.
			
 
				+ */
			
 
				+static int rdtgroup_mkdir_mon(struct kernfs_node *parent_kn,
			
 
				+			      struct kernfs_node *prgrp_kn,
			
 
				+			      const char *name,
			
 
				+			      umode_t mode)
			
 
				+{
			
 
				+	struct rdtgroup *rdtgrp, *prgrp;
			
 
				+	int ret;
			
 
				+
			
 
				+	ret = mkdir_rdt_prepare(parent_kn, prgrp_kn, name, mode, RDTMON_GROUP,
			
 
				+				&rdtgrp);
			
 
				+	if (ret)
			
 
				+		return ret;
			
 
				+
			
 
				+	prgrp = rdtgrp->mon.parent;
			
 
				+	rdtgrp->closid = prgrp->closid;
			
 
				+
			
 
				+	/*
			
 
				+	 * Add the rdtgrp to the list of rdtgrps the parent
			
 
				+	 * ctrl_mon group has to track.
			
 
				+	 */
			
 
				+	list_add_tail(&rdtgrp->mon.crdtgrp_list, &prgrp->mon.crdtgrp_list);
			
 
				+
			
 
				+	rdtgroup_kn_unlock(prgrp_kn);
			
 
				+	return ret;
			
 
				+}
			
 
				+
			
 
				+/*
			
 
				+ * These are rdtgroups created under the root directory. Can be used
			
 
				+ * to allocate and monitor resources.
			
 
				+ */
			
 
				+static int rdtgroup_mkdir_ctrl_mon(struct kernfs_node *parent_kn,
			
 
				+				   struct kernfs_node *prgrp_kn,
			
 
				+				   const char *name, umode_t mode)
			
 
				 {
			
 
				-	int ret, cpu, closid = rdtgroup_default.closid;
			
 
				 	struct rdtgroup *rdtgrp;
			
 
				-	cpumask_var_t tmpmask;
			
 
				+	struct kernfs_node *kn;
			
 
				+	u32 closid;
			
 
				+	int ret;
			
 
				 
			
 
				-	if (!zalloc_cpumask_var(&tmpmask, GFP_KERNEL))
			
 
				-		return -ENOMEM;
			
 
				+	ret = mkdir_rdt_prepare(parent_kn, prgrp_kn, name, mode, RDTCTRL_GROUP,
			
 
				+				&rdtgrp);
			
 
				+	if (ret)
			
 
				+		return ret;
			
 
				 
			
 
				-	rdtgrp = rdtgroup_kn_lock_live(kn);
			
 
				-	if (!rdtgrp) {
			
 
				-		ret = -EPERM;
			
 
				-		goto out;
			
 
				+	kn = rdtgrp->kn;
			
 
				+	ret = closid_alloc();
			
 
				+	if (ret < 0)
			
 
				+		goto out_common_fail;
			
 
				+	closid = ret;
			
 
				+
			
 
				+	rdtgrp->closid = closid;
			
 
				+	list_add(&rdtgrp->rdtgroup_list, &rdt_all_groups);
			
 
				+
			
 
				+	if (rdt_mon_capable) {
			
 
				+		/*
			
 
				+		 * Create an empty mon_groups directory to hold the subset
			
 
				+		 * of tasks and cpus to monitor.
			
 
				+		 */
			
 
				+		ret = mongroup_create_dir(kn, NULL, "mon_groups", NULL);
			
 
				+		if (ret)
			
 
				+			goto out_id_free;
			
 
				 	}
			
 
				 
			
 
				+	goto out_unlock;
			
 
				+
			
 
				+out_id_free:
			
 
				+	closid_free(closid);
			
 
				+	list_del(&rdtgrp->rdtgroup_list);
			
 
				+out_common_fail:
			
 
				+	mkdir_rdt_prepare_clean(rdtgrp);
			
 
				+out_unlock:
			
 
				+	rdtgroup_kn_unlock(prgrp_kn);
			
 
				+	return ret;
			
 
				+}
			
 
				+
			
 
				+/*
			
 
				+ * We allow creating mon groups only within a directory called "mon_groups"
			
 
				+ * which is present in every ctrl_mon group. Check if this is a valid
			
 
				+ * "mon_groups" directory.
			
 
				+ *
			
 
				+ * 1. The directory should be named "mon_groups".
			
 
				+ * 2. The mon group itself should "not" be named "mon_groups".
			
 
				+ *   This makes sure "mon_groups" directory always has a ctrl_mon group
			
 
				+ *   as parent.
			
 
				+ */
			
 
				+static bool is_mon_groups(struct kernfs_node *kn, const char *name)
			
 
				+{
			
 
				+	return (!strcmp(kn->name, "mon_groups") &&
			
 
				+		strcmp(name, "mon_groups"));
			
 
				+}
			
 
				+
			
 
				+static int rdtgroup_mkdir(struct kernfs_node *parent_kn, const char *name,
			
 
				+			  umode_t mode)
			
 
				+{
			
 
				+	/* Do not accept '\n' to avoid unparsable situation. */
			
 
				+	if (strchr(name, '\n'))
			
 
				+		return -EINVAL;
			
 
				+
			
 
				+	/*
			
 
				+	 * If the parent directory is the root directory and RDT
			
 
				+	 * allocation is supported, add a control and monitoring
			
 
				+	 * subdirectory
			
 
				+	 */
			
 
				+	if (rdt_alloc_capable && parent_kn == rdtgroup_default.kn)
			
 
				+		return rdtgroup_mkdir_ctrl_mon(parent_kn, parent_kn, name, mode);
			
 
				+
			
 
				+	/*
			
 
				+	 * If RDT monitoring is supported and the parent directory is a valid
			
 
				+	 * "mon_groups" directory, add a monitoring subdirectory.
			
 
				+	 */
			
 
				+	if (rdt_mon_capable && is_mon_groups(parent_kn, name))
			
 
				+		return rdtgroup_mkdir_mon(parent_kn, parent_kn->parent, name, mode);
			
 
				+
			
 
				+	return -EPERM;
			
 
				+}
			
 
				+
			
 
				+static int rdtgroup_rmdir_mon(struct kernfs_node *kn, struct rdtgroup *rdtgrp,
			
 
				+			      cpumask_var_t tmpmask)
			
 
				+{
			
 
				+	struct rdtgroup *prdtgrp = rdtgrp->mon.parent;
			
 
				+	int cpu;
			
 
				+
			
 
				+	/* Give any tasks back to the parent group */
			
 
				+	rdt_move_group_tasks(rdtgrp, prdtgrp, tmpmask);
			
 
				+
			
 
				+	/* Update per cpu rmid of the moved CPUs first */
			
 
				+	for_each_cpu(cpu, &rdtgrp->cpu_mask)
			
 
				+		per_cpu(pqr_state.default_rmid, cpu) = prdtgrp->mon.rmid;
			
 
				+	/*
			
 
				+	 * Update the MSR on moved CPUs and CPUs which have moved
			
 
				+	 * task running on them.
			
 
				+	 */
			
 
				+	cpumask_or(tmpmask, tmpmask, &rdtgrp->cpu_mask);
			
 
				+	update_closid_rmid(tmpmask, NULL);
			
 
				+
			
 
				+	rdtgrp->flags = RDT_DELETED;
			
 
				+	free_rmid(rdtgrp->mon.rmid);
			
 
				+
			
 
				+	/*
			
 
				+	 * Remove the rdtgrp from the parent ctrl_mon group's list
			
 
				+	 */
			
 
				+	WARN_ON(list_empty(&prdtgrp->mon.crdtgrp_list));
			
 
				+	list_del(&rdtgrp->mon.crdtgrp_list);
			
 
				+
			
 
				+	/*
			
 
				+	 * one extra hold on this, will drop when we kfree(rdtgrp)
			
 
				+	 * in rdtgroup_kn_unlock()
			
 
				+	 */
			
 
				+	kernfs_get(kn);
			
 
				+	kernfs_remove(rdtgrp->kn);
			
 
				+
			
 
				+	return 0;
			
 
				+}
			
 
				+
			
 
				+static int rdtgroup_rmdir_ctrl(struct kernfs_node *kn, struct rdtgroup *rdtgrp,
			
 
				+			       cpumask_var_t tmpmask)
			
 
				+{
			
 
				+	int cpu;
			
 
				+
			
 
				 	/* Give any tasks back to the default group */
			
 
				 	rdt_move_group_tasks(rdtgrp, &rdtgroup_default, tmpmask);
			
 
				 
			
@@ -1100,18 +1772,28 @@ static int rdtgroup_rmdir(struct kernfs_node *kn)
 
				 	cpumask_or(&rdtgroup_default.cpu_mask,
			
 
				 		   &rdtgroup_default.cpu_mask, &rdtgrp->cpu_mask);
			
 
				 
			
 
				-	/* Update per cpu closid of the moved CPUs first */
			
 
				-	for_each_cpu(cpu, &rdtgrp->cpu_mask)
			
 
				-		per_cpu(cpu_closid, cpu) = closid;
			
 
				+	/* Update per cpu closid and rmid of the moved CPUs first */
			
 
				+	for_each_cpu(cpu, &rdtgrp->cpu_mask) {
			
 
				+		per_cpu(pqr_state.default_closid, cpu) = rdtgroup_default.closid;
			
 
				+		per_cpu(pqr_state.default_rmid, cpu) = rdtgroup_default.mon.rmid;
			
 
				+	}
			
 
				+
			
 
				 	/*
			
 
				 	 * Update the MSR on moved CPUs and CPUs which have moved
			
 
				 	 * task running on them.
			
 
				 	 */
			
 
				 	cpumask_or(tmpmask, tmpmask, &rdtgrp->cpu_mask);
			
 
				-	rdt_update_closid(tmpmask, NULL);
			
 
				+	update_closid_rmid(tmpmask, NULL);
			
 
				 
			
 
				 	rdtgrp->flags = RDT_DELETED;
			
 
				 	closid_free(rdtgrp->closid);
			
 
				+	free_rmid(rdtgrp->mon.rmid);
			
 
				+
			
 
				+	/*
			
 
				+	 * Free all the child monitor group rmids.
			
 
				+	 */
			
 
				+	free_all_child_rdtgrp(rdtgrp);
			
 
				+
			
 
				 	list_del(&rdtgrp->rdtgroup_list);
			
 
				 
			
 
				 	/*
			
@@ -1120,7 +1802,41 @@ static int rdtgroup_rmdir(struct kernfs_node *kn)
 
				 	 */
			
 
				 	kernfs_get(kn);
			
 
				 	kernfs_remove(rdtgrp->kn);
			
 
				-	ret = 0;
			
 
				+
			
 
				+	return 0;
			
 
				+}
			
 
				+
			
 
				+static int rdtgroup_rmdir(struct kernfs_node *kn)
			
 
				+{
			
 
				+	struct kernfs_node *parent_kn = kn->parent;
			
 
				+	struct rdtgroup *rdtgrp;
			
 
				+	cpumask_var_t tmpmask;
			
 
				+	int ret = 0;
			
 
				+
			
 
				+	if (!zalloc_cpumask_var(&tmpmask, GFP_KERNEL))
			
 
				+		return -ENOMEM;
			
 
				+
			
 
				+	rdtgrp = rdtgroup_kn_lock_live(kn);
			
 
				+	if (!rdtgrp) {
			
 
				+		ret = -EPERM;
			
 
				+		goto out;
			
 
				+	}
			
 
				+
			
 
				+	/*
			
 
				+	 * If the rdtgroup is a ctrl_mon group and parent directory
			
 
				+	 * is the root directory, remove the ctrl_mon group.
			
 
				+	 *
			
 
				+	 * If the rdtgroup is a mon group and parent directory
			
 
				+	 * is a valid "mon_groups" directory, remove the mon group.
			
 
				+	 */
			
 
				+	if (rdtgrp->type == RDTCTRL_GROUP && parent_kn == rdtgroup_default.kn)
			
 
				+		ret = rdtgroup_rmdir_ctrl(kn, rdtgrp, tmpmask);
			
 
				+	else if (rdtgrp->type == RDTMON_GROUP &&
			
 
				+		 is_mon_groups(parent_kn, kn->name))
			
 
				+		ret = rdtgroup_rmdir_mon(kn, rdtgrp, tmpmask);
			
 
				+	else
			
 
				+		ret = -EPERM;
			
 
				+
			
 
				 out:
			
 
				 	rdtgroup_kn_unlock(kn);
			
 
				 	free_cpumask_var(tmpmask);
			
@@ -1129,7 +1845,7 @@ out:
 
				 
			
 
				 static int rdtgroup_show_options(struct seq_file *seq, struct kernfs_root *kf)
			
 
				 {
			
 
				-	if (rdt_resources_all[RDT_RESOURCE_L3DATA].enabled)
			
 
				+	if (rdt_resources_all[RDT_RESOURCE_L3DATA].alloc_enabled)
			
 
				 		seq_puts(seq, ",cdp");
			
 
				 	return 0;
			
 
				 }
			
@@ -1153,10 +1869,13 @@ static int __init rdtgroup_setup_root(void)
 
				 	mutex_lock(&rdtgroup_mutex);
			
 
				 
			
 
				 	rdtgroup_default.closid = 0;
			
 
				+	rdtgroup_default.mon.rmid = 0;
			
 
				+	rdtgroup_default.type = RDTCTRL_GROUP;
			
 
				+	INIT_LIST_HEAD(&rdtgroup_default.mon.crdtgrp_list);
			
 
				+
			
 
				 	list_add(&rdtgroup_default.rdtgroup_list, &rdt_all_groups);
			
 
				 
			
 
				-	ret = rdtgroup_add_files(rdt_root->kn, rdtgroup_base_files,
			
 
				-				 ARRAY_SIZE(rdtgroup_base_files));
			
 
				+	ret = rdtgroup_add_files(rdt_root->kn, RF_CTRL_BASE);
			
 
				 	if (ret) {
			
 
				 		kernfs_destroy_root(rdt_root);
			
 
				 		goto out;
			
--- a/arch/x86/kernel/process_32.c
+++ b/arch/x86/kernel/process_32.c
@@ -56,7 +56,7 @@
 
				 #include <asm/debugreg.h>
			
 
				 #include <asm/switch_to.h>
			
 
				 #include <asm/vm86.h>
			
 
				-#include <asm/intel_rdt.h>
			
 
				+#include <asm/intel_rdt_sched.h>
			
 
				 #include <asm/proto.h>
			
 
				 
			
 
				 void __show_regs(struct pt_regs *regs, int all)
			
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -52,7 +52,7 @@
 
				 #include <asm/switch_to.h>
			
 
				 #include <asm/xen/hypervisor.h>
			
 
				 #include <asm/vdso.h>
			
 
				-#include <asm/intel_rdt.h>
			
 
				+#include <asm/intel_rdt_sched.h>
			
 
				 #include <asm/unistd.h>
			
 
				 #ifdef CONFIG_IA32_EMULATION
			
 
				 /* Not included via unistd.h */
			
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -139,14 +139,6 @@ struct hw_perf_event {
 
				 			/* for tp_event->class */
			
 
				 			struct list_head	tp_list;
			
 
				 		};
			
 
				-		struct { /* intel_cqm */
			
 
				-			int			cqm_state;
			
 
				-			u32			cqm_rmid;
			
 
				-			int			is_group_event;
			
 
				-			struct list_head	cqm_events_entry;
			
 
				-			struct list_head	cqm_groups_entry;
			
 
				-			struct list_head	cqm_group_entry;
			
 
				-		};
			
 
				 		struct { /* amd_power */
			
 
				 			u64	pwr_acc;
			
 
				 			u64	ptsc;
			
@@ -413,11 +405,6 @@ struct pmu {
 
				 	size_t				task_ctx_size;
			
 
				 
			
 
				 
			
 
				-	/*
			
 
				-	 * Return the count value for a counter.
			
 
				-	 */
			
 
				-	u64 (*count)			(struct perf_event *event); /*optional*/
			
 
				-
			
 
				 	/*
			
 
				 	 * Set up pmu-private data structures for an AUX area
			
 
				 	 */
			
@@ -1112,11 +1099,6 @@ static inline void perf_event_task_sched_out(struct task_struct *prev,
 
				 		__perf_event_task_sched_out(prev, next);
			
 
				 }
			
 
				 
			
 
				-static inline u64 __perf_event_count(struct perf_event *event)
			
 
				-{
			
 
				-	return local64_read(&event->count) + atomic64_read(&event->child_count);
			
 
				-}
			
 
				-
			
 
				 extern void perf_event_mmap(struct vm_area_struct *vma);
			
 
				 extern struct perf_guest_info_callbacks *perf_guest_cbs;
			
 
				 extern int perf_register_guest_info_callbacks(struct perf_guest_info_callbacks *callbacks);
			
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -909,8 +909,9 @@ struct task_struct {
 
				 	/* cg_list protected by css_set_lock and tsk->alloc_lock: */
			
 
				 	struct list_head		cg_list;
			
 
				 #endif
			
 
				-#ifdef CONFIG_INTEL_RDT_A
			
 
				-	int				closid;
			
 
				+#ifdef CONFIG_INTEL_RDT
			
 
				+	u32				closid;
			
 
				+	u32				rmid;
			
 
				 #endif
			
 
				 #ifdef CONFIG_FUTEX
			
 
				 	struct robust_list_head __user	*robust_list;
			
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -3673,10 +3673,7 @@ unlock:
 
				 
			
 
				 static inline u64 perf_event_count(struct perf_event *event)
			
 
				 {
			
 
				-	if (event->pmu->count)
			
 
				-		return event->pmu->count(event);
			
 
				-
			
 
				-	return __perf_event_count(event);
			
 
				+	return local64_read(&event->count) + atomic64_read(&event->child_count);
			
 
				 }
			
 
				 
			
 
				 /*
			
@@ -3707,15 +3704,6 @@ int perf_event_read_local(struct perf_event *event, u64 *value)
 
				 		goto out;
			
 
				 	}
			
 
				 
			
 
				-	/*
			
 
				-	 * It must not have a pmu::count method, those are not
			
 
				-	 * NMI safe.
			
 
				-	 */
			
 
				-	if (event->pmu->count) {
			
 
				-		ret = -EOPNOTSUPP;
			
 
				-		goto out;
			
 
				-	}
			
 
				-
			
 
				 	/* If this is a per-task event, it must be for current */
			
 
				 	if ((event->attach_state & PERF_ATTACH_TASK) &&
			
 
				 	    event->hw.target != current) {