7 ani în urmă · 30de24c7dd
--- a/Documentation/x86/intel_rdt_ui.txt
+++ b/Documentation/x86/intel_rdt_ui.txt
@@ -29,7 +29,11 @@ mount options are:
 
				 L2 and L3 CDP are controlled seperately.
			
 
				 
			
 
				 RDT features are orthogonal. A particular system may support only
			
 
				-monitoring, only control, or both monitoring and control.
			
 
				+monitoring, only control, or both monitoring and control.  Cache
			
 
				+pseudo-locking is a unique way of using cache control to "pin" or
			
 
				+"lock" data in the cache. Details can be found in
			
 
				+"Cache Pseudo-Locking".
			
 
				+
			
 
				 
			
 
				 The mount succeeds if either of allocation or monitoring is present, but
			
 
				 only those files and directories supported by the system will be created.
			
@@ -65,6 +69,29 @@ related to allocation:
 
				 			some platforms support devices that have their
			
 
				 			own settings for cache use which can over-ride
			
 
				 			these bits.
			
 
				+"bit_usage":		Annotated capacity bitmasks showing how all
			
 
				+			instances of the resource are used. The legend is:
			
 
				+			"0" - Corresponding region is unused. When the system's
			
 
				+			      resources have been allocated and a "0" is found
			
 
				+			      in "bit_usage" it is a sign that resources are
			
 
				+			      wasted.
			
 
				+			"H" - Corresponding region is used by hardware only
			
 
				+			      but available for software use. If a resource
			
 
				+			      has bits set in "shareable_bits" but not all
			
 
				+			      of these bits appear in the resource groups'
			
 
				+			      schematas then the bits appearing in
			
 
				+			      "shareable_bits" but no resource group will
			
 
				+			      be marked as "H".
			
 
				+			"X" - Corresponding region is available for sharing and
			
 
				+			      used by hardware and software. These are the
			
 
				+			      bits that appear in "shareable_bits" as
			
 
				+			      well as a resource group's allocation.
			
 
				+			"S" - Corresponding region is used by software
			
 
				+			      and available for sharing.
			
 
				+			"E" - Corresponding region is used exclusively by
			
 
				+			      one resource group. No sharing allowed.
			
 
				+			"P" - Corresponding region is pseudo-locked. No
			
 
				+			      sharing allowed.
			
 
				 
			
 
				 Memory bandwitdh(MB) subdirectory contains the following files
			
 
				 with respect to allocation:
			
@@ -151,6 +178,9 @@ All groups contain the following files:
 
				 	CPUs to/from this group. As with the tasks file a hierarchy is
			
 
				 	maintained where MON groups may only include CPUs owned by the
			
 
				 	parent CTRL_MON group.
			
 
				+	When the resource group is in pseudo-locked mode this file will
			
 
				+	only be readable, reflecting the CPUs associated with the
			
 
				+	pseudo-locked region.
			
 
				 
			
 
				 
			
 
				 "cpus_list":
			
@@ -163,6 +193,21 @@ When control is enabled all CTRL_MON groups will also contain:
 
				 	A list of all the resources available to this group.
			
 
				 	Each resource has its own line and format - see below for details.
			
 
				 
			
 
				+"size":
			
 
				+	Mirrors the display of the "schemata" file to display the size in
			
 
				+	bytes of each allocation instead of the bits representing the
			
 
				+	allocation.
			
 
				+
			
 
				+"mode":
			
 
				+	The "mode" of the resource group dictates the sharing of its
			
 
				+	allocations. A "shareable" resource group allows sharing of its
			
 
				+	allocations while an "exclusive" resource group does not. A
			
 
				+	cache pseudo-locked region is created by first writing
			
 
				+	"pseudo-locksetup" to the "mode" file before writing the cache
			
 
				+	pseudo-locked region's schemata to the resource group's "schemata"
			
 
				+	file. On successful pseudo-locked region creation the mode will
			
 
				+	automatically change to "pseudo-locked".
			
 
				+
			
 
				 When monitoring is enabled all MON groups will also contain:
			
 
				 
			
 
				 "mon_data":
			
@@ -379,6 +424,170 @@ L3CODE:0=fffff;1=fffff;2=fffff;3=fffff
 
				 L3DATA:0=fffff;1=fffff;2=3c0;3=fffff
			
 
				 L3CODE:0=fffff;1=fffff;2=fffff;3=fffff
			
 
				 
			
 
				+Cache Pseudo-Locking
			
 
				+--------------------
			
 
				+CAT enables a user to specify the amount of cache space that an
			
 
				+application can fill. Cache pseudo-locking builds on the fact that a
			
 
				+CPU can still read and write data pre-allocated outside its current
			
 
				+allocated area on a cache hit. With cache pseudo-locking, data can be
			
 
				+preloaded into a reserved portion of cache that no application can
			
 
				+fill, and from that point on will only serve cache hits. The cache
			
 
				+pseudo-locked memory is made accessible to user space where an
			
 
				+application can map it into its virtual address space and thus have
			
 
				+a region of memory with reduced average read latency.
			
 
				+
			
 
				+The creation of a cache pseudo-locked region is triggered by a request
			
 
				+from the user to do so that is accompanied by a schemata of the region
			
 
				+to be pseudo-locked. The cache pseudo-locked region is created as follows:
			
 
				+- Create a CAT allocation CLOSNEW with a CBM matching the schemata
			
 
				+  from the user of the cache region that will contain the pseudo-locked
			
 
				+  memory. This region must not overlap with any current CAT allocation/CLOS
			
 
				+  on the system and no future overlap with this cache region is allowed
			
 
				+  while the pseudo-locked region exists.
			
 
				+- Create a contiguous region of memory of the same size as the cache
			
 
				+  region.
			
 
				+- Flush the cache, disable hardware prefetchers, disable preemption.
			
 
				+- Make CLOSNEW the active CLOS and touch the allocated memory to load
			
 
				+  it into the cache.
			
 
				+- Set the previous CLOS as active.
			
 
				+- At this point the closid CLOSNEW can be released - the cache
			
 
				+  pseudo-locked region is protected as long as its CBM does not appear in
			
 
				+  any CAT allocation. Even though the cache pseudo-locked region will from
			
 
				+  this point on not appear in any CBM of any CLOS an application running with
			
 
				+  any CLOS will be able to access the memory in the pseudo-locked region since
			
 
				+  the region continues to serve cache hits.
			
 
				+- The contiguous region of memory loaded into the cache is exposed to
			
 
				+  user-space as a character device.
			
 
				+
			
 
				+Cache pseudo-locking increases the probability that data will remain
			
 
				+in the cache via carefully configuring the CAT feature and controlling
			
 
				+application behavior. There is no guarantee that data is placed in
			
 
				+cache. Instructions like INVD, WBINVD, CLFLUSH, etc. can still evict
			
 
				+“locked” data from cache. Power management C-states may shrink or
			
 
				+power off cache. Deeper C-states will automatically be restricted on
			
 
				+pseudo-locked region creation.
			
 
				+
			
 
				+It is required that an application using a pseudo-locked region runs
			
 
				+with affinity to the cores (or a subset of the cores) associated
			
 
				+with the cache on which the pseudo-locked region resides. A sanity check
			
 
				+within the code will not allow an application to map pseudo-locked memory
			
 
				+unless it runs with affinity to cores associated with the cache on which the
			
 
				+pseudo-locked region resides. The sanity check is only done during the
			
 
				+initial mmap() handling; there is no enforcement afterwards and the
			
 
				+application itself needs to ensure it remains affine to the correct cores.
			
 
				+
			
 
				+Pseudo-locking is accomplished in two stages:
			
 
				+1) During the first stage the system administrator allocates a portion
			
 
				+   of cache that should be dedicated to pseudo-locking. At this time an
			
 
				+   equivalent portion of memory is allocated, loaded into allocated
			
 
				+   cache portion, and exposed as a character device.
			
 
				+2) During the second stage a user-space application maps (mmap()) the
			
 
				+   pseudo-locked memory into its address space.
			
 
				+
			
 
				+Cache Pseudo-Locking Interface
			
 
				+------------------------------
			
 
				+A pseudo-locked region is created using the resctrl interface as follows:
			
 
				+
			
 
				+1) Create a new resource group by creating a new directory in /sys/fs/resctrl.
			
 
				+2) Change the new resource group's mode to "pseudo-locksetup" by writing
			
 
				+   "pseudo-locksetup" to the "mode" file.
			
 
				+3) Write the schemata of the pseudo-locked region to the "schemata" file. All
			
 
				+   bits within the schemata should be "unused" according to the "bit_usage"
			
 
				+   file.
			
 
				+
			
 
				+On successful pseudo-locked region creation the "mode" file will contain
			
 
				+"pseudo-locked" and a new character device with the same name as the resource
			
 
				+group will exist in /dev/pseudo_lock. This character device can be mmap()'ed
			
 
				+by user space in order to obtain access to the pseudo-locked memory region.
			
 
				+
			
 
				+An example of cache pseudo-locked region creation and usage can be found below.
			
 
				+
			
 
				+Cache Pseudo-Locking Debugging Interface
			
 
				+---------------------------------------
			
 
				+The pseudo-locking debugging interface is enabled by default (if
			
 
				+CONFIG_DEBUG_FS is enabled) and can be found in /sys/kernel/debug/resctrl.
			
 
				+
			
 
				+There is no explicit way for the kernel to test if a provided memory
			
 
				+location is present in the cache. The pseudo-locking debugging interface uses
			
 
				+the tracing infrastructure to provide two ways to measure cache residency of
			
 
				+the pseudo-locked region:
			
 
				+1) Memory access latency using the pseudo_lock_mem_latency tracepoint. Data
			
 
				+   from these measurements are best visualized using a hist trigger (see
			
 
				+   example below). In this test the pseudo-locked region is traversed at
			
 
				+   a stride of 32 bytes while hardware prefetchers and preemption
			
 
				+   are disabled. This also provides a substitute visualization of cache
			
 
				+   hits and misses.
			
 
				+2) Cache hit and miss measurements using model specific precision counters if
			
 
				+   available. Depending on the levels of cache on the system the pseudo_lock_l2
			
 
				+   and pseudo_lock_l3 tracepoints are available.
			
 
				+   WARNING: triggering this measurement uses from two (for just L2
			
 
				+   measurements) to four (for L2 and L3 measurements) precision counters on
			
 
				+   the system, if any other measurements are in progress the counters and
			
 
				+   their corresponding event registers will be clobbered.
			
 
				+
			
 
				+When a pseudo-locked region is created a new debugfs directory is created for
			
 
				+it in debugfs as /sys/kernel/debug/resctrl/<newdir>. A single
			
 
				+write-only file, pseudo_lock_measure, is present in this directory. The
			
 
				+measurement on the pseudo-locked region depends on the number, 1 or 2,
			
 
				+written to this debugfs file. Since the measurements are recorded with the
			
 
				+tracing infrastructure the relevant tracepoints need to be enabled before the
			
 
				+measurement is triggered.
			
 
				+
			
 
				+Example of latency debugging interface:
			
 
				+In this example a pseudo-locked region named "newlock" was created. Here is
			
 
				+how we can measure the latency in cycles of reading from this region and
			
 
				+visualize this data with a histogram that is available if CONFIG_HIST_TRIGGERS
			
 
				+is set:
			
 
				+# :> /sys/kernel/debug/tracing/trace
			
 
				+# echo 'hist:keys=latency' > /sys/kernel/debug/tracing/events/resctrl/pseudo_lock_mem_latency/trigger
			
 
				+# echo 1 > /sys/kernel/debug/tracing/events/resctrl/pseudo_lock_mem_latency/enable
			
 
				+# echo 1 > /sys/kernel/debug/resctrl/newlock/pseudo_lock_measure
			
 
				+# echo 0 > /sys/kernel/debug/tracing/events/resctrl/pseudo_lock_mem_latency/enable
			
 
				+# cat /sys/kernel/debug/tracing/events/resctrl/pseudo_lock_mem_latency/hist
			
 
				+
			
 
				+# event histogram
			
 
				+#
			
 
				+# trigger info: hist:keys=latency:vals=hitcount:sort=hitcount:size=2048 [active]
			
 
				+#
			
 
				+
			
 
				+{ latency:        456 } hitcount:          1
			
 
				+{ latency:         50 } hitcount:         83
			
 
				+{ latency:         36 } hitcount:         96
			
 
				+{ latency:         44 } hitcount:        174
			
 
				+{ latency:         48 } hitcount:        195
			
 
				+{ latency:         46 } hitcount:        262
			
 
				+{ latency:         42 } hitcount:        693
			
 
				+{ latency:         40 } hitcount:       3204
			
 
				+{ latency:         38 } hitcount:       3484
			
 
				+
			
 
				+Totals:
			
 
				+    Hits: 8192
			
 
				+    Entries: 9
			
 
				+   Dropped: 0
			
 
				+
			
 
				+Example of cache hits/misses debugging:
			
 
				+In this example a pseudo-locked region named "newlock" was created on the L2
			
 
				+cache of a platform. Here is how we can obtain details of the cache hits
			
 
				+and misses using the platform's precision counters.
			
 
				+
			
 
				+# :> /sys/kernel/debug/tracing/trace
			
 
				+# echo 1 > /sys/kernel/debug/tracing/events/resctrl/pseudo_lock_l2/enable
			
 
				+# echo 2 > /sys/kernel/debug/resctrl/newlock/pseudo_lock_measure
			
 
				+# echo 0 > /sys/kernel/debug/tracing/events/resctrl/pseudo_lock_l2/enable
			
 
				+# cat /sys/kernel/debug/tracing/trace
			
 
				+
			
 
				+# tracer: nop
			
 
				+#
			
 
				+#                              _-----=> irqs-off
			
 
				+#                             / _----=> need-resched
			
 
				+#                            | / _---=> hardirq/softirq
			
 
				+#                            || / _--=> preempt-depth
			
 
				+#                            ||| /     delay
			
 
				+#           TASK-PID   CPU#  ||||    TIMESTAMP  FUNCTION
			
 
				+#              | |       |   ||||       |         |
			
 
				+ pseudo_lock_mea-1672  [002] ....  3132.860500: pseudo_lock_l2: hits=4097 miss=0
			
 
				+
			
 
				+
			
 
				 Examples for RDT allocation usage:
			
 
				 
			
 
				 Example 1
			
@@ -502,7 +711,172 @@ siblings and only the real time threads are scheduled on the cores 4-7.
 
				 
			
 
				 # echo F0 > p0/cpus
			
 
				 
			
 
				-4) Locking between applications
			
 
				+Example 4
			
 
				+---------
			
 
				+
			
 
				+The resource groups in previous examples were all in the default "shareable"
			
 
				+mode allowing sharing of their cache allocations. If one resource group
			
 
				+configures a cache allocation then nothing prevents another resource group
			
 
				+from overlapping with that allocation.
			
 
				+
			
 
				+In this example a new exclusive resource group will be created on a L2 CAT
			
 
				+system with two L2 cache instances that can be configured with an 8-bit
			
 
				+capacity bitmask. The new exclusive resource group will be configured to use
			
 
				+25% of each cache instance.
			
 
				+
			
 
				+# mount -t resctrl resctrl /sys/fs/resctrl/
			
 
				+# cd /sys/fs/resctrl
			
 
				+
			
 
				+First, we observe that the default group is configured to allocate to all L2
			
 
				+cache:
			
 
				+
			
 
				+# cat schemata
			
 
				+L2:0=ff;1=ff
			
 
				+
			
 
				+We could attempt to create the new resource group at this point, but it will
			
 
				+fail because of the overlap with the schemata of the default group:
			
 
				+# mkdir p0
			
 
				+# echo 'L2:0=0x3;1=0x3' > p0/schemata
			
 
				+# cat p0/mode
			
 
				+shareable
			
 
				+# echo exclusive > p0/mode
			
 
				+-sh: echo: write error: Invalid argument
			
 
				+# cat info/last_cmd_status
			
 
				+schemata overlaps
			
 
				+
			
 
				+To ensure that there is no overlap with another resource group the default
			
 
				+resource group's schemata has to change, making it possible for the new
			
 
				+resource group to become exclusive.
			
 
				+# echo 'L2:0=0xfc;1=0xfc' > schemata
			
 
				+# echo exclusive > p0/mode
			
 
				+# grep . p0/*
			
 
				+p0/cpus:0
			
 
				+p0/mode:exclusive
			
 
				+p0/schemata:L2:0=03;1=03
			
 
				+p0/size:L2:0=262144;1=262144
			
 
				+
			
 
				+A new resource group will on creation not overlap with an exclusive resource
			
 
				+group:
			
 
				+# mkdir p1
			
 
				+# grep . p1/*
			
 
				+p1/cpus:0
			
 
				+p1/mode:shareable
			
 
				+p1/schemata:L2:0=fc;1=fc
			
 
				+p1/size:L2:0=786432;1=786432
			
 
				+
			
 
				+The bit_usage will reflect how the cache is used:
			
 
				+# cat info/L2/bit_usage
			
 
				+0=SSSSSSEE;1=SSSSSSEE
			
 
				+
			
 
				+A resource group cannot be forced to overlap with an exclusive resource group:
			
 
				+# echo 'L2:0=0x1;1=0x1' > p1/schemata
			
 
				+-sh: echo: write error: Invalid argument
			
 
				+# cat info/last_cmd_status
			
 
				+overlaps with exclusive group
			
 
				+
			
 
				+Example of Cache Pseudo-Locking
			
 
				+-------------------------------
			
 
				+Lock portion of L2 cache from cache id 1 using CBM 0x3. Pseudo-locked
			
 
				+region is exposed at /dev/pseudo_lock/newlock that can be provided to
			
 
				+application for argument to mmap().
			
 
				+
			
 
				+# mount -t resctrl resctrl /sys/fs/resctrl/
			
 
				+# cd /sys/fs/resctrl
			
 
				+
			
 
				+Ensure that there are bits available that can be pseudo-locked, since only
			
 
				+unused bits can be pseudo-locked the bits to be pseudo-locked need to be
			
 
				+removed from the default resource group's schemata:
			
 
				+# cat info/L2/bit_usage
			
 
				+0=SSSSSSSS;1=SSSSSSSS
			
 
				+# echo 'L2:1=0xfc' > schemata
			
 
				+# cat info/L2/bit_usage
			
 
				+0=SSSSSSSS;1=SSSSSS00
			
 
				+
			
 
				+Create a new resource group that will be associated with the pseudo-locked
			
 
				+region, indicate that it will be used for a pseudo-locked region, and
			
 
				+configure the requested pseudo-locked region capacity bitmask:
			
 
				+
			
 
				+# mkdir newlock
			
 
				+# echo pseudo-locksetup > newlock/mode
			
 
				+# echo 'L2:1=0x3' > newlock/schemata
			
 
				+
			
 
				+On success the resource group's mode will change to pseudo-locked, the
			
 
				+bit_usage will reflect the pseudo-locked region, and the character device
			
 
				+exposing the pseudo-locked region will exist:
			
 
				+
			
 
				+# cat newlock/mode
			
 
				+pseudo-locked
			
 
				+# cat info/L2/bit_usage
			
 
				+0=SSSSSSSS;1=SSSSSSPP
			
 
				+# ls -l /dev/pseudo_lock/newlock
			
 
				+crw------- 1 root root 243, 0 Apr  3 05:01 /dev/pseudo_lock/newlock
			
 
				+
			
 
				+/*
			
 
				+ * Example code to access one page of pseudo-locked cache region
			
 
				+ * from user space.
			
 
				+ */
			
 
				+#define _GNU_SOURCE
			
 
				+#include <fcntl.h>
			
 
				+#include <sched.h>
			
 
				+#include <stdio.h>
			
 
				+#include <stdlib.h>
			
 
				+#include <unistd.h>
			
 
				+#include <sys/mman.h>
			
 
				+
			
 
				+/*
			
 
				+ * It is required that the application runs with affinity to only
			
 
				+ * cores associated with the pseudo-locked region. Here the cpu
			
 
				+ * is hardcoded for convenience of example.
			
 
				+ */
			
 
				+static int cpuid = 2;
			
 
				+
			
 
				+int main(int argc, char *argv[])
			
 
				+{
			
 
				+	cpu_set_t cpuset;
			
 
				+	long page_size;
			
 
				+	void *mapping;
			
 
				+	int dev_fd;
			
 
				+	int ret;
			
 
				+
			
 
				+	page_size = sysconf(_SC_PAGESIZE);
			
 
				+
			
 
				+	CPU_ZERO(&cpuset);
			
 
				+	CPU_SET(cpuid, &cpuset);
			
 
				+	ret = sched_setaffinity(0, sizeof(cpuset), &cpuset);
			
 
				+	if (ret < 0) {
			
 
				+		perror("sched_setaffinity");
			
 
				+		exit(EXIT_FAILURE);
			
 
				+	}
			
 
				+
			
 
				+	dev_fd = open("/dev/pseudo_lock/newlock", O_RDWR);
			
 
				+	if (dev_fd < 0) {
			
 
				+		perror("open");
			
 
				+		exit(EXIT_FAILURE);
			
 
				+	}
			
 
				+
			
 
				+	mapping = mmap(0, page_size, PROT_READ | PROT_WRITE, MAP_SHARED,
			
 
				+		       dev_fd, 0);
			
 
				+	if (mapping == MAP_FAILED) {
			
 
				+		perror("mmap");
			
 
				+		close(dev_fd);
			
 
				+		exit(EXIT_FAILURE);
			
 
				+	}
			
 
				+
			
 
				+	/* Application interacts with pseudo-locked memory @mapping */
			
 
				+
			
 
				+	ret = munmap(mapping, page_size);
			
 
				+	if (ret < 0) {
			
 
				+		perror("munmap");
			
 
				+		close(dev_fd);
			
 
				+		exit(EXIT_FAILURE);
			
 
				+	}
			
 
				+
			
 
				+	close(dev_fd);
			
 
				+	exit(EXIT_SUCCESS);
			
 
				+}
			
 
				+
			
 
				+Locking between applications
			
 
				+----------------------------
			
 
				 
			
 
				 Certain operations on the resctrl filesystem, composed of read/writes
			
 
				 to/from multiple files, must be atomic.
			
@@ -510,7 +884,7 @@ to/from multiple files, must be atomic.
 
				 As an example, the allocation of an exclusive reservation of L3 cache
			
 
				 involves:
			
 
				 
			
 
				-  1. Read the cbmmasks from each directory
			
 
				+  1. Read the cbmmasks from each directory or the per-resource "bit_usage"
			
 
				   2. Find a contiguous set of bits in the global CBM bitmask that is clear
			
 
				      in any of the directory cbmmasks
			
 
				   3. Create a new directory
			
--- a/arch/x86/kernel/cpu/Makefile
+++ b/arch/x86/kernel/cpu/Makefile
@@ -35,7 +35,9 @@ obj-$(CONFIG_CPU_SUP_CENTAUR)		+= centaur.o
 
				 obj-$(CONFIG_CPU_SUP_TRANSMETA_32)	+= transmeta.o
			
 
				 obj-$(CONFIG_CPU_SUP_UMC_32)		+= umc.o
			
 
				 
			
 
				-obj-$(CONFIG_INTEL_RDT)	+= intel_rdt.o intel_rdt_rdtgroup.o intel_rdt_monitor.o intel_rdt_ctrlmondata.o
			
 
				+obj-$(CONFIG_INTEL_RDT)	+= intel_rdt.o intel_rdt_rdtgroup.o intel_rdt_monitor.o
			
 
				+obj-$(CONFIG_INTEL_RDT)	+= intel_rdt_ctrlmondata.o intel_rdt_pseudo_lock.o
			
 
				+CFLAGS_intel_rdt_pseudo_lock.o = -I$(src)
			
 
				 
			
 
				 obj-$(CONFIG_X86_MCE)			+= mcheck/
			
 
				 obj-$(CONFIG_MTRR)			+= mtrr/
			
--- a/arch/x86/kernel/cpu/intel_rdt.c
+++ b/arch/x86/kernel/cpu/intel_rdt.c
@@ -859,6 +859,8 @@ static __init bool get_rdt_resources(void)
 
				 	return (rdt_mon_capable || rdt_alloc_capable);
			
 
				 }
			
 
				 
			
 
				+static enum cpuhp_state rdt_online;
			
 
				+
			
 
				 static int __init intel_rdt_late_init(void)
			
 
				 {
			
 
				 	struct rdt_resource *r;
			
@@ -880,6 +882,7 @@ static int __init intel_rdt_late_init(void)
 
				 		cpuhp_remove_state(state);
			
 
				 		return ret;
			
 
				 	}
			
 
				+	rdt_online = state;
			
 
				 
			
 
				 	for_each_alloc_capable_rdt_resource(r)
			
 
				 		pr_info("Intel RDT %s allocation detected\n", r->name);
			
@@ -891,3 +894,11 @@ static int __init intel_rdt_late_init(void)
 
				 }
			
 
				 
			
 
				 late_initcall(intel_rdt_late_init);
			
 
				+
			
 
				+static void __exit intel_rdt_exit(void)
			
 
				+{
			
 
				+	cpuhp_remove_state(rdt_online);
			
 
				+	rdtgroup_exit();
			
 
				+}
			
 
				+
			
 
				+__exitcall(intel_rdt_exit);
			
--- a/arch/x86/kernel/cpu/intel_rdt.h
+++ b/arch/x86/kernel/cpu/intel_rdt.h
@@ -80,6 +80,34 @@ enum rdt_group_type {
 
				 	RDT_NUM_GROUP,
			
 
				 };
			
 
				 
			
 
				+/**
			
 
				+ * enum rdtgrp_mode - Mode of a RDT resource group
			
 
				+ * @RDT_MODE_SHAREABLE: This resource group allows sharing of its allocations
			
 
				+ * @RDT_MODE_EXCLUSIVE: No sharing of this resource group's allocations allowed
			
 
				+ * @RDT_MODE_PSEUDO_LOCKSETUP: Resource group will be used for Pseudo-Locking
			
 
				+ * @RDT_MODE_PSEUDO_LOCKED: No sharing of this resource group's allocations
			
 
				+ *                          allowed AND the allocations are Cache Pseudo-Locked
			
 
				+ *
			
 
				+ * The mode of a resource group enables control over the allowed overlap
			
 
				+ * between allocations associated with different resource groups (classes
			
 
				+ * of service). User is able to modify the mode of a resource group by
			
 
				+ * writing to the "mode" resctrl file associated with the resource group.
			
 
				+ *
			
 
				+ * The "shareable", "exclusive", and "pseudo-locksetup" modes are set by
			
 
				+ * writing the appropriate text to the "mode" file. A resource group enters
			
 
				+ * "pseudo-locked" mode after the schemata is written while the resource
			
 
				+ * group is in "pseudo-locksetup" mode.
			
 
				+ */
			
 
				+enum rdtgrp_mode {
			
 
				+	RDT_MODE_SHAREABLE = 0,
			
 
				+	RDT_MODE_EXCLUSIVE,
			
 
				+	RDT_MODE_PSEUDO_LOCKSETUP,
			
 
				+	RDT_MODE_PSEUDO_LOCKED,
			
 
				+
			
 
				+	/* Must be last */
			
 
				+	RDT_NUM_MODES,
			
 
				+};
			
 
				+
			
 
				 /**
			
 
				  * struct mongroup - store mon group's data in resctrl fs.
			
 
				  * @mon_data_kn		kernlfs node for the mon_data directory
			
@@ -94,6 +122,43 @@ struct mongroup {
 
				 	u32			rmid;
			
 
				 };
			
 
				 
			
 
				+/**
			
 
				+ * struct pseudo_lock_region - pseudo-lock region information
			
 
				+ * @r:			RDT resource to which this pseudo-locked region
			
 
				+ *			belongs
			
 
				+ * @d:			RDT domain to which this pseudo-locked region
			
 
				+ *			belongs
			
 
				+ * @cbm:		bitmask of the pseudo-locked region
			
 
				+ * @lock_thread_wq:	waitqueue used to wait on the pseudo-locking thread
			
 
				+ *			completion
			
 
				+ * @thread_done:	variable used by waitqueue to test if pseudo-locking
			
 
				+ *			thread completed
			
 
				+ * @cpu:		core associated with the cache on which the setup code
			
 
				+ *			will be run
			
 
				+ * @line_size:		size of the cache lines
			
 
				+ * @size:		size of pseudo-locked region in bytes
			
 
				+ * @kmem:		the kernel memory associated with pseudo-locked region
			
 
				+ * @minor:		minor number of character device associated with this
			
 
				+ *			region
			
 
				+ * @debugfs_dir:	pointer to this region's directory in the debugfs
			
 
				+ *			filesystem
			
 
				+ * @pm_reqs:		Power management QoS requests related to this region
			
 
				+ */
			
 
				+struct pseudo_lock_region {
			
 
				+	struct rdt_resource	*r;
			
 
				+	struct rdt_domain	*d;
			
 
				+	u32			cbm;
			
 
				+	wait_queue_head_t	lock_thread_wq;
			
 
				+	int			thread_done;
			
 
				+	int			cpu;
			
 
				+	unsigned int		line_size;
			
 
				+	unsigned int		size;
			
 
				+	void			*kmem;
			
 
				+	unsigned int		minor;
			
 
				+	struct dentry		*debugfs_dir;
			
 
				+	struct list_head	pm_reqs;
			
 
				+};
			
 
				+
			
 
				 /**
			
 
				  * struct rdtgroup - store rdtgroup's data in resctrl file system.
			
 
				  * @kn:				kernfs node
			
@@ -106,16 +171,20 @@ struct mongroup {
 
				  * @type:			indicates type of this rdtgroup - either
			
 
				  *				monitor only or ctrl_mon group
			
 
				  * @mon:			mongroup related data
			
 
				+ * @mode:			mode of resource group
			
 
				+ * @plr:			pseudo-locked region
			
 
				  */
			
 
				 struct rdtgroup {
			
 
				-	struct kernfs_node	*kn;
			
 
				-	struct list_head	rdtgroup_list;
			
 
				-	u32			closid;
			
 
				-	struct cpumask		cpu_mask;
			
 
				-	int			flags;
			
 
				-	atomic_t		waitcount;
			
 
				-	enum rdt_group_type	type;
			
 
				-	struct mongroup		mon;
			
 
				+	struct kernfs_node		*kn;
			
 
				+	struct list_head		rdtgroup_list;
			
 
				+	u32				closid;
			
 
				+	struct cpumask			cpu_mask;
			
 
				+	int				flags;
			
 
				+	atomic_t			waitcount;
			
 
				+	enum rdt_group_type		type;
			
 
				+	struct mongroup			mon;
			
 
				+	enum rdtgrp_mode		mode;
			
 
				+	struct pseudo_lock_region	*plr;
			
 
				 };
			
 
				 
			
 
				 /* rdtgroup.flags */
			
@@ -148,6 +217,7 @@ extern struct list_head rdt_all_groups;
 
				 extern int max_name_width, max_data_width;
			
 
				 
			
 
				 int __init rdtgroup_init(void);
			
 
				+void __exit rdtgroup_exit(void);
			
 
				 
			
 
				 /**
			
 
				  * struct rftype - describe each file in the resctrl file system
			
@@ -216,22 +286,24 @@ struct mbm_state {
 
				  * @mbps_val:	When mba_sc is enabled, this holds the bandwidth in MBps
			
 
				  * @new_ctrl:	new ctrl value to be loaded
			
 
				  * @have_new_ctrl: did user provide new_ctrl for this domain
			
 
				+ * @plr:	pseudo-locked region (if any) associated with domain
			
 
				  */
			
 
				 struct rdt_domain {
			
 
				-	struct list_head	list;
			
 
				-	int			id;
			
 
				-	struct cpumask		cpu_mask;
			
 
				-	unsigned long		*rmid_busy_llc;
			
 
				-	struct mbm_state	*mbm_total;
			
 
				-	struct mbm_state	*mbm_local;
			
 
				-	struct delayed_work	mbm_over;
			
 
				-	struct delayed_work	cqm_limbo;
			
 
				-	int			mbm_work_cpu;
			
 
				-	int			cqm_work_cpu;
			
 
				-	u32			*ctrl_val;
			
 
				-	u32			*mbps_val;
			
 
				-	u32			new_ctrl;
			
 
				-	bool			have_new_ctrl;
			
 
				+	struct list_head		list;
			
 
				+	int				id;
			
 
				+	struct cpumask			cpu_mask;
			
 
				+	unsigned long			*rmid_busy_llc;
			
 
				+	struct mbm_state		*mbm_total;
			
 
				+	struct mbm_state		*mbm_local;
			
 
				+	struct delayed_work		mbm_over;
			
 
				+	struct delayed_work		cqm_limbo;
			
 
				+	int				mbm_work_cpu;
			
 
				+	int				cqm_work_cpu;
			
 
				+	u32				*ctrl_val;
			
 
				+	u32				*mbps_val;
			
 
				+	u32				new_ctrl;
			
 
				+	bool				have_new_ctrl;
			
 
				+	struct pseudo_lock_region	*plr;
			
 
				 };
			
 
				 
			
 
				 /**
			
@@ -351,7 +423,7 @@ struct rdt_resource {
 
				 	struct rdt_cache	cache;
			
 
				 	struct rdt_membw	membw;
			
 
				 	const char		*format_str;
			
 
				-	int (*parse_ctrlval)	(char *buf, struct rdt_resource *r,
			
 
				+	int (*parse_ctrlval)	(void *data, struct rdt_resource *r,
			
 
				 				 struct rdt_domain *d);
			
 
				 	struct list_head	evt_list;
			
 
				 	int			num_rmid;
			
@@ -359,8 +431,8 @@ struct rdt_resource {
 
				 	unsigned long		fflags;
			
 
				 };
			
 
				 
			
 
				-int parse_cbm(char *buf, struct rdt_resource *r, struct rdt_domain *d);
			
 
				-int parse_bw(char *buf, struct rdt_resource *r,  struct rdt_domain *d);
			
 
				+int parse_cbm(void *_data, struct rdt_resource *r, struct rdt_domain *d);
			
 
				+int parse_bw(void *_buf, struct rdt_resource *r,  struct rdt_domain *d);
			
 
				 
			
 
				 extern struct mutex rdtgroup_mutex;
			
 
				 
			
@@ -368,7 +440,7 @@ extern struct rdt_resource rdt_resources_all[];
 
				 extern struct rdtgroup rdtgroup_default;
			
 
				 DECLARE_STATIC_KEY_FALSE(rdt_alloc_enable_key);
			
 
				 
			
 
				-int __init rdtgroup_init(void);
			
 
				+extern struct dentry *debugfs_resctrl;
			
 
				 
			
 
				 enum {
			
 
				 	RDT_RESOURCE_L3,
			
@@ -439,13 +511,32 @@ void rdt_last_cmd_printf(const char *fmt, ...);
 
				 void rdt_ctrl_update(void *arg);
			
 
				 struct rdtgroup *rdtgroup_kn_lock_live(struct kernfs_node *kn);
			
 
				 void rdtgroup_kn_unlock(struct kernfs_node *kn);
			
 
				+int rdtgroup_kn_mode_restrict(struct rdtgroup *r, const char *name);
			
 
				+int rdtgroup_kn_mode_restore(struct rdtgroup *r, const char *name,
			
 
				+			     umode_t mask);
			
 
				 struct rdt_domain *rdt_find_domain(struct rdt_resource *r, int id,
			
 
				 				   struct list_head **pos);
			
 
				 ssize_t rdtgroup_schemata_write(struct kernfs_open_file *of,
			
 
				 				char *buf, size_t nbytes, loff_t off);
			
 
				 int rdtgroup_schemata_show(struct kernfs_open_file *of,
			
 
				 			   struct seq_file *s, void *v);
			
 
				+bool rdtgroup_cbm_overlaps(struct rdt_resource *r, struct rdt_domain *d,
			
 
				+			   u32 _cbm, int closid, bool exclusive);
			
 
				+unsigned int rdtgroup_cbm_to_size(struct rdt_resource *r, struct rdt_domain *d,
			
 
				+				  u32 cbm);
			
 
				+enum rdtgrp_mode rdtgroup_mode_by_closid(int closid);
			
 
				+int rdtgroup_tasks_assigned(struct rdtgroup *r);
			
 
				+int rdtgroup_locksetup_enter(struct rdtgroup *rdtgrp);
			
 
				+int rdtgroup_locksetup_exit(struct rdtgroup *rdtgrp);
			
 
				+bool rdtgroup_cbm_overlaps_pseudo_locked(struct rdt_domain *d, u32 _cbm);
			
 
				+bool rdtgroup_pseudo_locked_in_hierarchy(struct rdt_domain *d);
			
 
				+int rdt_pseudo_lock_init(void);
			
 
				+void rdt_pseudo_lock_release(void);
			
 
				+int rdtgroup_pseudo_lock_create(struct rdtgroup *rdtgrp);
			
 
				+void rdtgroup_pseudo_lock_remove(struct rdtgroup *rdtgrp);
			
 
				 struct rdt_domain *get_domain_from_cpu(int cpu, struct rdt_resource *r);
			
 
				+int update_domains(struct rdt_resource *r, int closid);
			
 
				+void closid_free(int closid);
			
 
				 int alloc_rmid(void);
			
 
				 void free_rmid(u32 rmid);
			
 
				 int rdt_get_mon_l3_config(struct rdt_resource *r);
			
--- a/arch/x86/kernel/cpu/intel_rdt_ctrlmondata.c
+++ b/arch/x86/kernel/cpu/intel_rdt_ctrlmondata.c
@@ -64,9 +64,10 @@ static bool bw_validate(char *buf, unsigned long *data, struct rdt_resource *r)
 
				 	return true;
			
 
				 }
			
 
				 
			
 
				-int parse_bw(char *buf, struct rdt_resource *r, struct rdt_domain *d)
			
 
				+int parse_bw(void *_buf, struct rdt_resource *r, struct rdt_domain *d)
			
 
				 {
			
 
				 	unsigned long data;
			
 
				+	char *buf = _buf;
			
 
				 
			
 
				 	if (d->have_new_ctrl) {
			
 
				 		rdt_last_cmd_printf("duplicate domain %d\n", d->id);
			
@@ -87,7 +88,7 @@ int parse_bw(char *buf, struct rdt_resource *r, struct rdt_domain *d)
 
				  *	are allowed (e.g. FFFFH, 0FF0H, 003CH, etc.).
			
 
				  * Additionally Haswell requires at least two bits set.
			
 
				  */
			
 
				-static bool cbm_validate(char *buf, unsigned long *data, struct rdt_resource *r)
			
 
				+static bool cbm_validate(char *buf, u32 *data, struct rdt_resource *r)
			
 
				 {
			
 
				 	unsigned long first_bit, zero_bit, val;
			
 
				 	unsigned int cbm_len = r->cache.cbm_len;
			
@@ -122,22 +123,64 @@ static bool cbm_validate(char *buf, unsigned long *data, struct rdt_resource *r)
 
				 	return true;
			
 
				 }
			
 
				 
			
 
				+struct rdt_cbm_parse_data {
			
 
				+	struct rdtgroup		*rdtgrp;	/* group the schemata line is parsed for */
			
 
				+	char			*buf;		/* text handed to cbm_validate() */
			
 
				+};
			
 
				+
			
 
				 /*
			
 
				  * Read one cache bit mask (hex). Check that it is valid for the current
			
 
				  * resource type.
			
 
				  */
			
 
				-int parse_cbm(char *buf, struct rdt_resource *r, struct rdt_domain *d)
			
 
				+int parse_cbm(void *_data, struct rdt_resource *r, struct rdt_domain *d)
			
 
				 {
			
 
				-	unsigned long data;
			
 
				+	struct rdt_cbm_parse_data *data = _data;
			
 
				+	struct rdtgroup *rdtgrp = data->rdtgrp;
			
 
				+	u32 cbm_val;
			
 
				 
			
 
				 	if (d->have_new_ctrl) {
			
 
				 		rdt_last_cmd_printf("duplicate domain %d\n", d->id);
			
 
				 		return -EINVAL;
			
 
				 	}
			
 
				 
			
 
				-	if(!cbm_validate(buf, &data, r))
			
 
				+	/*
			
 
				+	 * Cannot set up more than one pseudo-locked region in a cache
			
 
				+	 * hierarchy.
			
 
				+	 */
			
 
				+	if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP &&
			
 
				+	    rdtgroup_pseudo_locked_in_hierarchy(d)) {
			
 
				+		rdt_last_cmd_printf("pseudo-locked region in hierarchy\n");
			
 
				 		return -EINVAL;
			
 
				-	d->new_ctrl = data;
			
 
				+	}
			
 
				+
			
 
				+	if (!cbm_validate(data->buf, &cbm_val, r))
			
 
				+		return -EINVAL;
			
 
				+
			
 
				+	if ((rdtgrp->mode == RDT_MODE_EXCLUSIVE ||
			
 
				+	     rdtgrp->mode == RDT_MODE_SHAREABLE) &&
			
 
				+	    rdtgroup_cbm_overlaps_pseudo_locked(d, cbm_val)) {
			
 
				+		rdt_last_cmd_printf("CBM overlaps with pseudo-locked region\n");
			
 
				+		return -EINVAL;
			
 
				+	}
			
 
				+
			
 
				+	/*
			
 
				+	 * The CBM may not overlap with the CBM of another closid if
			
 
				+	 * either is exclusive.
			
 
				+	 */
			
 
				+	if (rdtgroup_cbm_overlaps(r, d, cbm_val, rdtgrp->closid, true)) {
			
 
				+		rdt_last_cmd_printf("overlaps with exclusive group\n");
			
 
				+		return -EINVAL;
			
 
				+	}
			
 
				+
			
 
				+	if (rdtgroup_cbm_overlaps(r, d, cbm_val, rdtgrp->closid, false)) {
			
 
				+		if (rdtgrp->mode == RDT_MODE_EXCLUSIVE ||
			
 
				+		    rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP) {
			
 
				+			rdt_last_cmd_printf("overlaps with other group\n");
			
 
				+			return -EINVAL;
			
 
				+		}
			
 
				+	}
			
 
				+
			
 
				+	d->new_ctrl = cbm_val;
			
 
				 	d->have_new_ctrl = true;
			
 
				 
			
 
				 	return 0;
			
@@ -149,8 +192,10 @@ int parse_cbm(char *buf, struct rdt_resource *r, struct rdt_domain *d)
 
				  * separated by ";". The "id" is in decimal, and must match one of
			
 
				  * the "id"s for this resource.
			
 
				  */
			
 
				-static int parse_line(char *line, struct rdt_resource *r)
			
 
				+static int parse_line(char *line, struct rdt_resource *r,
			
 
				+		      struct rdtgroup *rdtgrp)
			
 
				 {
			
 
				+	struct rdt_cbm_parse_data data;
			
 
				 	char *dom = NULL, *id;
			
 
				 	struct rdt_domain *d;
			
 
				 	unsigned long dom_id;
			
@@ -167,15 +212,32 @@ next:
 
				 	dom = strim(dom);
			
 
				 	list_for_each_entry(d, &r->domains, list) {
			
 
				 		if (d->id == dom_id) {
			
 
				-			if (r->parse_ctrlval(dom, r, d))
			
 
				+			data.buf = dom;
			
 
				+			data.rdtgrp = rdtgrp;
			
 
				+			if (r->parse_ctrlval(&data, r, d))
			
 
				 				return -EINVAL;
			
 
				+			if (rdtgrp->mode ==  RDT_MODE_PSEUDO_LOCKSETUP) {
			
 
				+				/*
			
 
				+				 * In pseudo-locking setup mode and just
			
 
				+				 * parsed a valid CBM that should be
			
 
				+				 * pseudo-locked. Only one locked region per
			
 
				+				 * resource group and domain so just do
			
 
				+				 * the required initialization for single
			
 
				+				 * region and return.
			
 
				+				 */
			
 
				+				rdtgrp->plr->r = r;
			
 
				+				rdtgrp->plr->d = d;
			
 
				+				rdtgrp->plr->cbm = d->new_ctrl;
			
 
				+				d->plr = rdtgrp->plr;
			
 
				+				return 0;
			
 
				+			}
			
 
				 			goto next;
			
 
				 		}
			
 
				 	}
			
 
				 	return -EINVAL;
			
 
				 }
			
 
				 
			
 
				-static int update_domains(struct rdt_resource *r, int closid)
			
 
				+int update_domains(struct rdt_resource *r, int closid)
			
 
				 {
			
 
				 	struct msr_param msr_param;
			
 
				 	cpumask_var_t cpu_mask;
			
@@ -220,13 +282,14 @@ done:
 
				 	return 0;
			
 
				 }
			
 
				 
			
 
				-static int rdtgroup_parse_resource(char *resname, char *tok, int closid)
			
 
				+static int rdtgroup_parse_resource(char *resname, char *tok,
			
 
				+				   struct rdtgroup *rdtgrp)
			
 
				 {
			
 
				 	struct rdt_resource *r;
			
 
				 
			
 
				 	for_each_alloc_enabled_rdt_resource(r) {
			
 
				-		if (!strcmp(resname, r->name) && closid < r->num_closid)
			
 
				-			return parse_line(tok, r);
			
 
				+		if (!strcmp(resname, r->name) && rdtgrp->closid < r->num_closid)
			
 
				+			return parse_line(tok, r, rdtgrp);
			
 
				 	}
			
 
				 	rdt_last_cmd_printf("unknown/unsupported resource name '%s'\n", resname);
			
 
				 	return -EINVAL;
			
@@ -239,7 +302,7 @@ ssize_t rdtgroup_schemata_write(struct kernfs_open_file *of,
 
				 	struct rdt_domain *dom;
			
 
				 	struct rdt_resource *r;
			
 
				 	char *tok, *resname;
			
 
				-	int closid, ret = 0;
			
 
				+	int ret = 0;
			
 
				 
			
 
				 	/* Valid input requires a trailing newline */
			
 
				 	if (nbytes == 0 || buf[nbytes - 1] != '\n')
			
@@ -253,7 +316,15 @@ ssize_t rdtgroup_schemata_write(struct kernfs_open_file *of,
 
				 	}
			
 
				 	rdt_last_cmd_clear();
			
 
				 
			
 
				-	closid = rdtgrp->closid;
			
 
				+	/*
			
 
				+	 * No changes to pseudo-locked region allowed. It has to be removed
			
 
				+	 * and re-created instead.
			
 
				+	 */
			
 
				+	if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKED) {
			
 
				+		ret = -EINVAL;
			
 
				+		rdt_last_cmd_puts("resource group is pseudo-locked\n");
			
 
				+		goto out;
			
 
				+	}
			
 
				 
			
 
				 	for_each_alloc_enabled_rdt_resource(r) {
			
 
				 		list_for_each_entry(dom, &r->domains, list)
			
@@ -272,17 +343,27 @@ ssize_t rdtgroup_schemata_write(struct kernfs_open_file *of,
 
				 			ret = -EINVAL;
			
 
				 			goto out;
			
 
				 		}
			
 
				-		ret = rdtgroup_parse_resource(resname, tok, closid);
			
 
				+		ret = rdtgroup_parse_resource(resname, tok, rdtgrp);
			
 
				 		if (ret)
			
 
				 			goto out;
			
 
				 	}
			
 
				 
			
 
				 	for_each_alloc_enabled_rdt_resource(r) {
			
 
				-		ret = update_domains(r, closid);
			
 
				+		ret = update_domains(r, rdtgrp->closid);
			
 
				 		if (ret)
			
 
				 			goto out;
			
 
				 	}
			
 
				 
			
 
				+	if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP) {
			
 
				+		/*
			
 
				+		 * If pseudo-locking fails we keep the resource group in
			
 
				+		 * mode RDT_MODE_PSEUDO_LOCKSETUP with its class of service
			
 
				+		 * active and updated for just the domain the pseudo-locked
			
 
				+		 * region was requested for.
			
 
				+		 */
			
 
				+		ret = rdtgroup_pseudo_lock_create(rdtgrp);
			
 
				+	}
			
 
				+
			
 
				 out:
			
 
				 	rdtgroup_kn_unlock(of->kn);
			
 
				 	return ret ?: nbytes;
			
@@ -318,10 +399,18 @@ int rdtgroup_schemata_show(struct kernfs_open_file *of,
 
				 
			
 
				 	rdtgrp = rdtgroup_kn_lock_live(of->kn);
			
 
				 	if (rdtgrp) {
			
 
				-		closid = rdtgrp->closid;
			
 
				-		for_each_alloc_enabled_rdt_resource(r) {
			
 
				-			if (closid < r->num_closid)
			
 
				-				show_doms(s, r, closid);
			
 
				+		if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP) {
			
 
				+			for_each_alloc_enabled_rdt_resource(r)
			
 
				+				seq_printf(s, "%s:uninitialized\n", r->name);
			
 
				+		} else if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKED) {
			
 
				+			seq_printf(s, "%s:%d=%x\n", rdtgrp->plr->r->name,
			
 
				+				   rdtgrp->plr->d->id, rdtgrp->plr->cbm);
			
 
				+		} else {
			
 
				+			closid = rdtgrp->closid;
			
 
				+			for_each_alloc_enabled_rdt_resource(r) {
			
 
				+				if (closid < r->num_closid)
			
 
				+					show_doms(s, r, closid);
			
 
				+			}
			
 
				 		}
			
 
				 	} else {
			
 
				 		ret = -ENOENT;
			
--- a/arch/x86/kernel/cpu/intel_rdt_pseudo_lock.c
+++ b/arch/x86/kernel/cpu/intel_rdt_pseudo_lock.c
@@ -0,0 +1,1522 @@
 
				+// SPDX-License-Identifier: GPL-2.0
			
 
				+/*
			
 
				+ * Resource Director Technology (RDT)
			
 
				+ *
			
 
				+ * Pseudo-locking support built on top of Cache Allocation Technology (CAT)
			
 
				+ *
			
 
				+ * Copyright (C) 2018 Intel Corporation
			
 
				+ *
			
 
				+ * Author: Reinette Chatre <reinette.chatre@intel.com>
			
 
				+ */
			
 
				+
			
 
				+#define pr_fmt(fmt)	KBUILD_MODNAME ": " fmt
			
 
				+
			
 
				+#include <linux/cacheinfo.h>
			
 
				+#include <linux/cpu.h>
			
 
				+#include <linux/cpumask.h>
			
 
				+#include <linux/debugfs.h>
			
 
				+#include <linux/kthread.h>
			
 
				+#include <linux/mman.h>
			
 
				+#include <linux/pm_qos.h>
			
 
				+#include <linux/slab.h>
			
 
				+#include <linux/uaccess.h>
			
 
				+
			
 
				+#include <asm/cacheflush.h>
			
 
				+#include <asm/intel-family.h>
			
 
				+#include <asm/intel_rdt_sched.h>
			
 
				+#include <asm/perf_event.h>
			
 
				+
			
 
				+#include "intel_rdt.h"
			
 
				+
			
 
				+#define CREATE_TRACE_POINTS
			
 
				+#include "intel_rdt_pseudo_lock_event.h"
			
 
				+
			
 
				+/*
			
 
				+ * MSR_MISC_FEATURE_CONTROL register enables the modification of hardware
			
 
				+ * prefetcher state. Details about this register can be found in the MSR
			
 
				+ * tables for specific platforms found in Intel's SDM.
			
 
				+ */
			
 
				+#define MSR_MISC_FEATURE_CONTROL	0x000001a4
			
 
				+
			
 
				+/*
			
 
				+ * The bits needed to disable hardware prefetching vary based on the
			
 
				+ * platform. During initialization we will discover which bits to use.
			
 
				+ */
			
 
				+static u64 prefetch_disable_bits;
			
 
				+
			
 
				+/*
			
 
				+ * Major number assigned to and shared by all devices exposing
			
 
				+ * pseudo-locked regions.
			
 
				+ */
			
 
				+static unsigned int pseudo_lock_major;
			
 
				+static unsigned long pseudo_lock_minor_avail = GENMASK(MINORBITS, 0);
			
 
				+static struct class *pseudo_lock_class;
			
 
				+
			
 
				+/**
			
 
				+ * get_prefetch_disable_bits - prefetch disable bits of supported platforms
			
 
				+ *
			
 
				+ * Capture the list of platforms that have been validated to support
			
 
				+ * pseudo-locking. This includes testing to ensure pseudo-locked regions
			
 
				+ * with low cache miss rates can be created under a variety of load
			
 
				+ * conditions as well as that these pseudo-locked regions can maintain
			
 
				+ * their low cache miss rates under a variety of load conditions for
			
 
				+ * significant lengths of time.
			
 
				+ *
			
 
				+ * After a platform has been validated to support pseudo-locking its
			
 
				+ * hardware prefetch disable bits are included here as they are documented
			
 
				+ * in the SDM.
			
 
				+ *
			
 
				+ * Only Intel family 6 processors are considered; the decision is based on
			
 
				+ * the vendor, family and model recorded in boot_cpu_data.
			
 
				+ *
			
 
				+ * When adding a platform here also add support for its cache events to
			
 
				+ * measure_cycles_perf_fn().
			
 
				+ *
			
 
				+ * Return:
			
 
				+ * If platform is supported, the bits to disable hardware prefetchers, 0
			
 
				+ * if platform is not supported.
			
 
				+ */
			
 
				+static u64 get_prefetch_disable_bits(void)
			
 
				+{
			
 
				+	if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL ||
			
 
				+	    boot_cpu_data.x86 != 6)
			
 
				+		return 0;
			
 
				+
			
 
				+	switch (boot_cpu_data.x86_model) {
			
 
				+	case INTEL_FAM6_BROADWELL_X:
			
 
				+		/*
			
 
				+		 * SDM defines bits of MSR_MISC_FEATURE_CONTROL register
			
 
				+		 * as:
			
 
				+		 * 0    L2 Hardware Prefetcher Disable (R/W)
			
 
				+		 * 1    L2 Adjacent Cache Line Prefetcher Disable (R/W)
			
 
				+		 * 2    DCU Hardware Prefetcher Disable (R/W)
			
 
				+		 * 3    DCU IP Prefetcher Disable (R/W)
			
 
				+		 * 63:4 Reserved
			
 
				+		 */
			
 
				+		return 0xF;
			
 
				+	case INTEL_FAM6_ATOM_GOLDMONT:
			
 
				+	case INTEL_FAM6_ATOM_GEMINI_LAKE:
			
 
				+		/*
			
 
				+		 * SDM defines bits of MSR_MISC_FEATURE_CONTROL register
			
 
				+		 * as:
			
 
				+		 * 0     L2 Hardware Prefetcher Disable (R/W)
			
 
				+		 * 1     Reserved
			
 
				+		 * 2     DCU Hardware Prefetcher Disable (R/W)
			
 
				+		 * 63:3  Reserved
			
 
				+		 */
			
 
				+		return 0x5;
			
 
				+	}
			
 
				+
			
 
				+	return 0;
			
 
				+}
			
 
				+
			
 
				+/*
			
 
				+ * Helper to write a 64-bit value to an MSR without tracing. Used when
			
 
				+ * use of the cache should be restricted and use of registers used
			
 
				+ * for local variables avoided.
			
 
				+ *
			
 
				+ * @val is split into its low and high 32-bit halves, which are handed
			
 
				+ * to __wrmsr() in that order.
			
 
				+ */
			
 
				+static inline void pseudo_wrmsrl_notrace(unsigned int msr, u64 val)
			
 
				+{
			
 
				+	__wrmsr(msr, (u32)(val & 0xffffffffULL), (u32)(val >> 32));
			
 
				+}
			
 
				+
			
 
				+/**
			
 
				+ * pseudo_lock_minor_get - Obtain available minor number
			
 
				+ * @minor: Pointer to where new minor number will be stored
			
 
				+ *
			
 
				+ * A bitmask (pseudo_lock_minor_avail) is used to track available minor
			
 
				+ * numbers; a set bit marks a minor number that is free. Here the lowest
			
 
				+ * free minor number among the first MINORBITS bits is marked as
			
 
				+ * unavailable and stored in @minor.
			
 
				+ *
			
 
				+ * Return: 0 on success, -ENOSPC if no minor number is available.
			
 
				+ */
			
 
				+static int pseudo_lock_minor_get(unsigned int *minor)
			
 
				+{
			
 
				+	unsigned long first_bit;
			
 
				+
			
 
				+	first_bit = find_first_bit(&pseudo_lock_minor_avail, MINORBITS);
			
 
				+
			
 
				+	if (first_bit == MINORBITS)
			
 
				+		return -ENOSPC;
			
 
				+
			
 
				+	__clear_bit(first_bit, &pseudo_lock_minor_avail);
			
 
				+	*minor = first_bit;
			
 
				+
			
 
				+	return 0;
			
 
				+}
			
 
				+
			
 
				+/**
			
 
				+ * pseudo_lock_minor_release - Return minor number to the available pool
			
 
				+ * @minor: The minor number made available
			
 
				+ *
			
 
				+ * Sets the bit for @minor in pseudo_lock_minor_avail again. @minor is
			
 
				+ * not range checked here.
			
 
				+ */
			
 
				+static void pseudo_lock_minor_release(unsigned int minor)
			
 
				+{
			
 
				+	__set_bit(minor, &pseudo_lock_minor_avail);
			
 
				+}
			
 
				+
			
 
				+/**
			
 
				+ * region_find_by_minor - Locate a pseudo-lock region by inode minor number
			
 
				+ * @minor: The minor number of the device representing pseudo-locked region
			
 
				+ *
			
 
				+ * When the character device is accessed we need to determine which
			
 
				+ * pseudo-locked region it belongs to. This is done by matching the minor
			
 
				+ * number of the device to the pseudo-locked region it belongs to. The
			
 
				+ * resource groups on rdt_all_groups are walked and the first group whose
			
 
				+ * pseudo-lock region carries @minor is used.
			
 
				+ *
			
 
				+ * Minor numbers are assigned at the time a pseudo-locked region is associated
			
 
				+ * with a cache instance.
			
 
				+ *
			
 
				+ * NOTE(review): no lock is taken here while walking rdt_all_groups;
			
 
				+ * presumably callers hold rdtgroup_mutex - confirm against the callers.
			
 
				+ *
			
 
				+ * Return: On success return pointer to resource group owning the pseudo-locked
			
 
				+ *         region, NULL on failure.
			
 
				+ */
			
 
				+static struct rdtgroup *region_find_by_minor(unsigned int minor)
			
 
				+{
			
 
				+	struct rdtgroup *rdtgrp, *rdtgrp_match = NULL;
			
 
				+
			
 
				+	list_for_each_entry(rdtgrp, &rdt_all_groups, rdtgroup_list) {
			
 
				+		if (rdtgrp->plr && rdtgrp->plr->minor == minor) {
			
 
				+			rdtgrp_match = rdtgrp;
			
 
				+			break;
			
 
				+		}
			
 
				+	}
			
 
				+	return rdtgrp_match;
			
 
				+}
			
 
				+
			
 
				+/**
			
 
				+ * struct pseudo_lock_pm_req - A power management QoS request list entry
			
 
				+ * @list:	Entry within the @pm_reqs list for a pseudo-locked region
			
 
				+ * @req:	PM QoS request
			
 
				+ */
			
 
				+struct pseudo_lock_pm_req {
			
 
				+	struct list_head list;
			
 
				+	struct dev_pm_qos_request req;
			
 
				+};
			
 
				+
			
 
				+static void pseudo_lock_cstates_relax(struct pseudo_lock_region *plr)
			
 
				+{
			
 
				+	struct pseudo_lock_pm_req *pm_req, *next;
			
 
				+
			
 
				+	list_for_each_entry_safe(pm_req, next, &plr->pm_reqs, list) {
			
 
				+		dev_pm_qos_remove_request(&pm_req->req);
			
 
				+		list_del(&pm_req->list);	/* _safe walk: entry is freed below */
			
 
				+		kfree(pm_req);
			
 
				+	}
			
 
				+}
			
 
				+
			
 
				+/**
			
 
				+ * pseudo_lock_cstates_constrain - Restrict cores from entering C6
			
 
				+ * @plr: pseudo-lock region whose domain's CPUs are constrained
			
 
				+ *
			
 
				+ * To prevent the cache from being affected by power management, entering
			
 
				+ * C6 has to be avoided. This is accomplished by requesting a latency
			
 
				+ * requirement lower than the lowest C6 exit latency of all supported
			
 
				+ * platforms as found in the cpuidle state tables in the intel_idle driver.
			
 
				+ * At this time it is possible to do so with a single latency requirement
			
 
				+ * for all supported platforms.
			
 
				+ *
			
 
				+ * Since Goldmont is supported, which is affected by X86_BUG_MONITOR,
			
 
				+ * the ACPI latencies need to be considered while keeping in mind that C2
			
 
				+ * may be set to map to deeper sleep states. In this case the latency
			
 
				+ * requirement needs to prevent entering C2 also.
			
 
				+ *
			
 
				+ * One DEV_PM_QOS_RESUME_LATENCY request is added for every CPU in the
			
 
				+ * cpu_mask of the region's domain and tracked on @plr's pm_reqs list.
			
 
				+ *
			
 
				+ * Return: 0 on success, -ENOMEM if a request entry cannot be allocated,
			
 
				+ * -1 if a request cannot be added. On failure the requests added so far
			
 
				+ * are removed again with pseudo_lock_cstates_relax(). Descriptive error
			
 
				+ * will be written to last_cmd_status buffer.
			
 
				+ */
			
 
				+static int pseudo_lock_cstates_constrain(struct pseudo_lock_region *plr)
			
 
				+{
			
 
				+	struct pseudo_lock_pm_req *pm_req;
			
 
				+	int cpu;
			
 
				+	int ret;
			
 
				+
			
 
				+	for_each_cpu(cpu, &plr->d->cpu_mask) {
			
 
				+		pm_req = kzalloc(sizeof(*pm_req), GFP_KERNEL);
			
 
				+		if (!pm_req) {
			
 
				+			rdt_last_cmd_puts("fail allocating mem for PM QoS\n");
			
 
				+			ret = -ENOMEM;
			
 
				+			goto out_err;
			
 
				+		}
			
 
				+		ret = dev_pm_qos_add_request(get_cpu_device(cpu),
			
 
				+					     &pm_req->req,
			
 
				+					     DEV_PM_QOS_RESUME_LATENCY,
			
 
				+					     30);
			
 
				+		if (ret < 0) {
			
 
				+			rdt_last_cmd_printf("fail to add latency req cpu%d\n",
			
 
				+					    cpu);
			
 
				+			kfree(pm_req);
			
 
				+			ret = -1;
			
 
				+			goto out_err;
			
 
				+		}
			
 
				+		list_add(&pm_req->list, &plr->pm_reqs);
			
 
				+	}
			
 
				+
			
 
				+	return 0;
			
 
				+
			
 
				+out_err:
			
 
				+	pseudo_lock_cstates_relax(plr);
			
 
				+	return ret;
			
 
				+}
			
 
				+
			
 
				+/**
			
 
				+ * pseudo_lock_region_clear - Reset pseudo-lock region data
			
 
				+ * @plr: pseudo-lock region
			
 
				+ *
			
 
				+ * All content of the pseudo-locked region is reset - any memory allocated
			
 
				+ * for it (plr->kmem) is freed. If the region is associated with a domain,
			
 
				+ * the domain's pointer back to this region is cleared as well. The
			
 
				+ * debugfs_dir pointer is only reset to NULL here.
			
 
				+ *
			
 
				+ * Return: void
			
 
				+ */
			
 
				+static void pseudo_lock_region_clear(struct pseudo_lock_region *plr)
			
 
				+{
			
 
				+	plr->size = 0;
			
 
				+	plr->line_size = 0;
			
 
				+	kfree(plr->kmem);
			
 
				+	plr->kmem = NULL;
			
 
				+	plr->r = NULL;
			
 
				+	if (plr->d)
			
 
				+		plr->d->plr = NULL;
			
 
				+	plr->d = NULL;
			
 
				+	plr->cbm = 0;
			
 
				+	plr->debugfs_dir = NULL;
			
 
				+}
			
 
				+
			
 
				+/**
			
 
				+ * pseudo_lock_region_init - Initialize pseudo-lock region information
			
 
				+ * @plr: pseudo-lock region
			
 
				+ *
			
 
				+ * Called after user provided a schemata to be pseudo-locked. From the
			
 
				+ * schemata the &struct pseudo_lock_region is on entry already initialized
			
 
				+ * with the resource, domain, and capacity bitmask. Here the information
			
 
				+ * required for pseudo-locking is deduced from this data and &struct
			
 
				+ * pseudo_lock_region initialized further. This information includes:
			
 
				+ * - size in bytes of the region to be pseudo-locked
			
 
				+ * - cache line size to know the stride with which data needs to be accessed
			
 
				+ *   to be pseudo-locked
			
 
				+ * - a cpu associated with the cache instance on which the pseudo-locking
			
 
				+ *   flow can be executed
			
 
				+ *
			
 
				+ * Return: 0 on success, <0 on failure: -ENODEV if the first cpu of the
			
 
				+ * domain is not online, -1 if no cache leaf matching the resource's cache
			
 
				+ * level is found. On failure the region is reset with
			
 
				+ * pseudo_lock_region_clear(). Descriptive error will be written
			
 
				+ * to last_cmd_status buffer.
			
 
				+ */
			
 
				+static int pseudo_lock_region_init(struct pseudo_lock_region *plr)
			
 
				+{
			
 
				+	struct cpu_cacheinfo *ci;
			
 
				+	int ret;
			
 
				+	int i;
			
 
				+
			
 
				+	/* Pick the first cpu we find that is associated with the cache. */
			
 
				+	plr->cpu = cpumask_first(&plr->d->cpu_mask);
			
 
				+
			
 
				+	if (!cpu_online(plr->cpu)) {
			
 
				+		rdt_last_cmd_printf("cpu %u associated with cache not online\n",
			
 
				+				    plr->cpu);
			
 
				+		ret = -ENODEV;
			
 
				+		goto out_region;
			
 
				+	}
			
 
				+
			
 
				+	ci = get_cpu_cacheinfo(plr->cpu);
			
 
				+
			
 
				+	plr->size = rdtgroup_cbm_to_size(plr->r, plr->d, plr->cbm);
			
 
				+
			
 
				+	for (i = 0; i < ci->num_leaves; i++) {
			
 
				+		if (ci->info_list[i].level == plr->r->cache_level) {
			
 
				+			plr->line_size = ci->info_list[i].coherency_line_size;
			
 
				+			return 0;
			
 
				+		}
			
 
				+	}
			
 
				+
			
 
				+	ret = -1;
			
 
				+	rdt_last_cmd_puts("unable to determine cache line size\n");
			
 
				+out_region:
			
 
				+	pseudo_lock_region_clear(plr);
			
 
				+	return ret;
			
 
				+}
			
 
				+
			
 
				+/**
			
 
				+ * pseudo_lock_init - Initialize a pseudo-lock region
			
 
				+ * @rdtgrp: resource group to which new pseudo-locked region will belong
			
 
				+ *
			
 
				+ * A pseudo-locked region is associated with a resource group. When this
			
 
				+ * association is created the pseudo-locked region is initialized. The
			
 
				+ * details of the pseudo-locked region are not known at this time so only
			
 
				+ * allocation is done and association established: a zeroed region is
			
 
				+ * allocated, its wait queue and pm_reqs list are initialized and it is
			
 
				+ * stored in @rdtgrp->plr.
			
 
				+ *
			
 
				+ * Return: 0 on success, -ENOMEM if the region cannot be allocated
			
 
				+ */
			
 
				+static int pseudo_lock_init(struct rdtgroup *rdtgrp)
			
 
				+{
			
 
				+	struct pseudo_lock_region *plr;
			
 
				+
			
 
				+	plr = kzalloc(sizeof(*plr), GFP_KERNEL);
			
 
				+	if (!plr)
			
 
				+		return -ENOMEM;
			
 
				+
			
 
				+	init_waitqueue_head(&plr->lock_thread_wq);
			
 
				+	INIT_LIST_HEAD(&plr->pm_reqs);
			
 
				+	rdtgrp->plr = plr;
			
 
				+	return 0;
			
 
				+}
			
 
				+
			
 
				+/**
			
 
				+ * pseudo_lock_region_alloc - Allocate kernel memory that will be pseudo-locked
			
 
				+ * @plr: pseudo-lock region
			
 
				+ *
			
 
				+ * Initialize the details required to set up the pseudo-locked region and
			
 
				+ * allocate the contiguous memory that will be pseudo-locked to the cache.
			
 
				+ *
			
 
				+ * Return: 0 on success, <0 on failure: the error from
			
 
				+ * pseudo_lock_region_init(), -E2BIG if the region is larger than
			
 
				+ * KMALLOC_MAX_SIZE, or -ENOMEM if the memory cannot be allocated. On
			
 
				+ * failure the region is reset with pseudo_lock_region_clear().
			
 
				+ * Descriptive error will be written to last_cmd_status buffer.
			
 
				+ */
			
 
				+static int pseudo_lock_region_alloc(struct pseudo_lock_region *plr)
			
 
				+{
			
 
				+	int ret;
			
 
				+
			
 
				+	ret = pseudo_lock_region_init(plr);
			
 
				+	if (ret < 0)
			
 
				+		return ret;
			
 
				+
			
 
				+	/*
			
 
				+	 * We do not yet support contiguous regions larger than
			
 
				+	 * KMALLOC_MAX_SIZE.
			
 
				+	 */
			
 
				+	if (plr->size > KMALLOC_MAX_SIZE) {
			
 
				+		rdt_last_cmd_puts("requested region exceeds maximum size\n");
			
 
				+		ret = -E2BIG;
			
 
				+		goto out_region;
			
 
				+	}
			
 
				+
			
 
				+	plr->kmem = kzalloc(plr->size, GFP_KERNEL);
			
 
				+	if (!plr->kmem) {
			
 
				+		rdt_last_cmd_puts("unable to allocate memory\n");
			
 
				+		ret = -ENOMEM;
			
 
				+		goto out_region;
			
 
				+	}
			
 
				+
			
 
				+	ret = 0;
			
 
				+	goto out;
			
 
				+out_region:
			
 
				+	pseudo_lock_region_clear(plr);
			
 
				+out:
			
 
				+	return ret;
			
 
				+}
			
 
				+
			
 
				+/**
			
 
				+ * pseudo_lock_free - Free a pseudo-locked region
			
 
				+ * @rdtgrp: resource group to which pseudo-locked region belonged
			
 
				+ *
			
 
				+ * The pseudo-locked region's resources have already been released, or not
			
 
				+ * yet created at this point. Now it can be freed and disassociated from the
			
 
				+ * resource group: the region is reset with pseudo_lock_region_clear(),
			
 
				+ * freed, and @rdtgrp->plr is set to NULL.
			
 
				+ *
			
 
				+ * Return: void
			
 
				+ */
			
 
				+static void pseudo_lock_free(struct rdtgroup *rdtgrp)
			
 
				+{
			
 
				+	pseudo_lock_region_clear(rdtgrp->plr);
			
 
				+	kfree(rdtgrp->plr);
			
 
				+	rdtgrp->plr = NULL;
			
 
				+}
			
 
				+
			
 
				+/**
			
 
				+ * pseudo_lock_fn - Load kernel memory into cache
			
 
				+ * @_rdtgrp: resource group to which pseudo-lock region belongs
			
 
				+ *
			
 
				+ * This is the core pseudo-locking flow.
			
 
				+ *
			
 
				+ * First we ensure that the kernel memory cannot be found in the cache.
			
 
				+ * Then, while taking care that there will be as little interference as
			
 
				+ * possible, the memory to be loaded is accessed while core is running
			
 
				+ * with class of service set to the bitmask of the pseudo-locked region.
			
 
				+ * After this is complete no future CAT allocations will be allowed to
			
 
				+ * overlap with this bitmask.
			
 
				+ *
			
 
				+ * Local register variables are utilized to ensure that the memory region
			
 
				+ * to be locked is the only memory access made during the critical locking
			
 
				+ * loop.
			
 
				+ *
			
 
				+ * Return: 0. On completion plr->thread_done is set and the waiter on
			
 
				+ * plr->lock_thread_wq is woken.
			
 
				+ */
			
 
				+static int pseudo_lock_fn(void *_rdtgrp)
			
 
				+{
			
 
				+	struct rdtgroup *rdtgrp = _rdtgrp;
			
 
				+	struct pseudo_lock_region *plr = rdtgrp->plr;
			
 
				+	u32 rmid_p, closid_p;
			
 
				+	unsigned long i;
			
 
				+#ifdef CONFIG_KASAN
			
 
				+	/*
			
 
				+	 * The registers used for local register variables are also used
			
 
				+	 * when KASAN is active. When KASAN is active we use a regular
			
 
				+	 * variable to ensure we always use a valid pointer, but the cost
			
 
				+	 * is that this variable will enter the cache through evicting the
			
 
				+	 * memory we are trying to lock into the cache. Thus expect lower
			
 
				+	 * pseudo-locking success rate when KASAN is active.
			
 
				+	 */
			
 
				+	unsigned int line_size;
			
 
				+	unsigned int size;
			
 
				+	void *mem_r;
			
 
				+#else
			
 
				+	register unsigned int line_size asm("esi");
			
 
				+	register unsigned int size asm("edi");
			
 
				+#ifdef CONFIG_X86_64
			
 
				+	register void *mem_r asm("rbx");
			
 
				+#else
			
 
				+	register void *mem_r asm("ebx");
			
 
				+#endif /* CONFIG_X86_64 */
			
 
				+#endif /* CONFIG_KASAN */
			
 
				+
			
 
				+	/*
			
 
				+	 * Make sure none of the allocated memory is cached. If it is we
			
 
				+	 * will get a cache hit in below loop from outside of pseudo-locked
			
 
				+	 * region.
			
 
				+	 * wbinvd (as opposed to clflush/clflushopt) is required to
			
 
				+	 * increase likelihood that allocated cache portion will be filled
			
 
				+	 * with associated memory.
			
 
				+	 */
			
 
				+	native_wbinvd();
			
 
				+
			
 
				+	/*
			
 
				+	 * Always called with interrupts enabled. By disabling interrupts
			
 
				+	 * ensure that we will not be preempted during this critical section.
			
 
				+	 */
			
 
				+	local_irq_disable();
			
 
				+
			
 
				+	/*
			
 
				+	 * Call wrmsr and rdmsr as directly as possible to avoid tracing
			
 
				+	 * clobbering local register variables or affecting cache accesses.
			
 
				+	 *
			
 
				+	 * Disable the hardware prefetcher so that when the end of the memory
			
 
				+	 * being pseudo-locked is reached the hardware will not read beyond
			
 
				+	 * the buffer and evict pseudo-locked memory read earlier from the
			
 
				+	 * cache.
			
 
				+	 */
			
 
				+	__wrmsr(MSR_MISC_FEATURE_CONTROL, prefetch_disable_bits, 0x0);
			
 
				+	closid_p = this_cpu_read(pqr_state.cur_closid);
			
 
				+	rmid_p = this_cpu_read(pqr_state.cur_rmid);
			
 
				+	mem_r = plr->kmem;
			
 
				+	size = plr->size;
			
 
				+	line_size = plr->line_size;
			
 
				+	/*
			
 
				+	 * Critical section begin: start by writing the closid associated
			
 
				+	 * with the capacity bitmask of the cache region being
			
 
				+	 * pseudo-locked followed by reading of kernel memory to load it
			
 
				+	 * into the cache.
			
 
				+	 */
			
 
				+	__wrmsr(IA32_PQR_ASSOC, rmid_p, rdtgrp->closid);
			
 
				+	/*
			
 
				+	 * Cache was flushed earlier. Now access kernel memory to read it
			
 
				+	 * into cache region associated with just activated rdtgrp->closid.
			
 
				+	 * Loop over data twice:
			
 
				+	 * - In first loop the cache region is shared with the page walker
			
 
				+	 *   as it populates the paging structure caches (including TLB).
			
 
				+	 * - In the second loop the paging structure caches are used and
			
 
				+	 *   cache region is populated with the memory being referenced.
			
 
				+	 */
			
 
				+	for (i = 0; i < size; i += PAGE_SIZE) {
			
 
				+		/*
			
 
				+		 * Add a barrier to prevent speculative execution of this
			
 
				+		 * loop reading beyond the end of the buffer.
			
 
				+		 */
			
 
				+		rmb();
			
 
				+		asm volatile("mov (%0,%1,1), %%eax\n\t"
			
 
				+			:
			
 
				+			: "r" (mem_r), "r" (i)
			
 
				+			: "%eax", "memory");
			
 
				+	}
			
 
				+	for (i = 0; i < size; i += line_size) {
			
 
				+		/*
			
 
				+		 * Add a barrier to prevent speculative execution of this
			
 
				+		 * loop reading beyond the end of the buffer.
			
 
				+		 */
			
 
				+		rmb();
			
 
				+		asm volatile("mov (%0,%1,1), %%eax\n\t"
			
 
				+			:
			
 
				+			: "r" (mem_r), "r" (i)
			
 
				+			: "%eax", "memory");
			
 
				+	}
			
 
				+	/*
			
 
				+	 * Critical section end: restore closid with capacity bitmask that
			
 
				+	 * does not overlap with pseudo-locked region.
			
 
				+	 */
			
 
				+	__wrmsr(IA32_PQR_ASSOC, rmid_p, closid_p);
			
 
				+
			
 
				+	/*
			
 
				+	 * Re-enable the hardware prefetcher(s).
			
 
				+	 *
			
 
				+	 * NOTE(review): this writes 0 to MSR_MISC_FEATURE_CONTROL instead
			
 
				+	 * of restoring a saved value; the register is not read before it
			
 
				+	 * is modified above, so a prefetcher that was already disabled
			
 
				+	 * before this function ran would end up enabled - confirm this is
			
 
				+	 * intended.
			
 
				+	 */
			
 
				+	wrmsr(MSR_MISC_FEATURE_CONTROL, 0x0, 0x0);
			
 
				+	local_irq_enable();
			
 
				+
			
 
				+	plr->thread_done = 1;
			
 
				+	wake_up_interruptible(&plr->lock_thread_wq);
			
 
				+	return 0;
			
 
				+}
			
 
				+
			
 
				+/**
				+ * rdtgroup_monitor_in_progress - Check whether monitor groups exist
				+ * @rdtgrp: resource group being queried
				+ *
				+ * Return: 1 if the monitor group list of @rdtgrp (mon.crdtgrp_list) is
				+ * not empty, 0 if it is empty.
				+ */
				+static int rdtgroup_monitor_in_progress(struct rdtgroup *rdtgrp)
				+{
				+	if (list_empty(&rdtgrp->mon.crdtgrp_list))
				+		return 0;
				+
				+	return 1;
				+}
			
 
				+
			
 
				+/**
				+ * rdtgroup_locksetup_user_restrict - Restrict user access to group
				+ * @rdtgrp: resource group needing access restricted
				+ *
				+ * No cpus or tasks may be assigned to a resource group that is used for
				+ * cache pseudo-locking. The user is told so by restricting access to
				+ * every file through which such a change could be made: "tasks", "cpus",
				+ * "cpus_list" and, on systems capable of monitoring, "mon_groups".
				+ *
				+ * Undone with rdtgroup_locksetup_user_restore().
				+ *
				+ * Return: 0 on success, <0 on failure. On failure the files restricted
				+ * so far are restored to mode 0777 in reverse order, but the results of
				+ * those restore calls are not checked so the final mode of the files is
				+ * uncertain.
				+ */
				+static int rdtgroup_locksetup_user_restrict(struct rdtgroup *rdtgrp)
				+{
				+	/* Always restricted, in this order; NULL terminates the table. */
				+	static const char * const files[] = {
				+		"tasks", "cpus", "cpus_list", NULL
				+	};
				+	int idx;
				+	int ret;
				+
				+	for (idx = 0; files[idx]; idx++) {
				+		ret = rdtgroup_kn_mode_restrict(rdtgrp, files[idx]);
				+		if (ret)
				+			goto unwind;
				+	}
				+
				+	if (rdt_mon_capable) {
				+		ret = rdtgroup_kn_mode_restrict(rdtgrp, "mon_groups");
				+		if (ret)
				+			goto unwind;
				+	}
				+
				+	return 0;
				+
				+unwind:
				+	/* files[idx] is the one that failed; undo everything before it. */
				+	while (--idx >= 0)
				+		rdtgroup_kn_mode_restore(rdtgrp, files[idx], 0777);
				+
				+	return ret;
				+}
			
 
				+
			
 
				+/**
				+ * rdtgroup_locksetup_user_restore - Restore user access to group
				+ * @rdtgrp: resource group needing access restored
				+ *
				+ * Gives mode 0777 back to every file whose access was removed by
				+ * rdtgroup_locksetup_user_restrict(): "tasks", "cpus", "cpus_list" and,
				+ * on systems capable of monitoring, "mon_groups".
				+ *
				+ * Return: 0 on success, <0 on failure. On failure the files restored so
				+ * far are restricted again in reverse order, but the results of those
				+ * restrict calls are not checked so the final mode of the files is
				+ * uncertain.
				+ */
				+static int rdtgroup_locksetup_user_restore(struct rdtgroup *rdtgrp)
				+{
				+	/* Always restored, in this order; NULL terminates the table. */
				+	static const char * const files[] = {
				+		"tasks", "cpus", "cpus_list", NULL
				+	};
				+	int idx;
				+	int ret;
				+
				+	for (idx = 0; files[idx]; idx++) {
				+		ret = rdtgroup_kn_mode_restore(rdtgrp, files[idx], 0777);
				+		if (ret)
				+			goto unwind;
				+	}
				+
				+	if (rdt_mon_capable) {
				+		ret = rdtgroup_kn_mode_restore(rdtgrp, "mon_groups", 0777);
				+		if (ret)
				+			goto unwind;
				+	}
				+
				+	return 0;
				+
				+unwind:
				+	/* files[idx] is the one that failed; undo everything before it. */
				+	while (--idx >= 0)
				+		rdtgroup_kn_mode_restrict(rdtgrp, files[idx]);
				+
				+	return ret;
				+}
			
 
				+
			
 
				+/**
				+ * rdtgroup_locksetup_enter - Resource group enters locksetup mode
				+ * @rdtgrp: resource group requested to enter locksetup mode
				+ *
				+ * Locksetup mode marks a resource group as one that will represent a
				+ * pseudo-locked region and is currently being prepared for that. Such a
				+ * group loses its closid, so it may not have tasks or cpus assigned now
				+ * and none may be assigned later. A pseudo-locked region cannot be
				+ * monitored either.
				+ *
				+ * These and further restrictions are verified and enforced here before
				+ * the group is allowed into locksetup mode.
				+ *
				+ * Returns: 0 if the resource group successfully entered locksetup mode, <0
				+ * on failure. On failure the last_cmd_status buffer is updated with text
				+ * describing the failure to the user.
				+ */
				+int rdtgroup_locksetup_enter(struct rdtgroup *rdtgrp)
				+{
				+	int ret;
				+
				+	/*
				+	 * The default group can never be removed and must keep the default
				+	 * closid, so it cannot become a pseudo-locked region.
				+	 */
				+	if (rdtgrp == &rdtgroup_default) {
				+		rdt_last_cmd_puts("cannot pseudo-lock default group\n");
				+		return -EINVAL;
				+	}
				+
				+	/*
				+	 * Cache Pseudo-locking is not supported while CDP is enabled.
				+	 *
				+	 * Points to consider before adding that support (L3 CDP used as
				+	 * the example):
				+	 * - With CDP enabled two resources, L3DATA and L3CODE, are exposed
				+	 *   although both live on the same cache. Creating a
				+	 *   pseudo-locked region involves a wbinvd that affects all
				+	 *   cache allocations on the domain, so once a region exists on
				+	 *   a domain of one resource (eg. L3CODE) no region can be
				+	 *   created on the same domain of the other (eg. L3DATA).
				+	 * - Given the above, only one of the CDP resources could be
				+	 *   exposed to pseudo-locking and the other hidden. Exposing
				+	 *   only L3DATA would still allow instructions to be placed
				+	 *   there and executed since the L3 cache is unified.
				+	 * - Even with a single resource exposed, the availability of a
				+	 *   portion of cache for pseudo-locking has to take both
				+	 *   resources into account, and the portion used by a
				+	 *   pseudo-locked region has to become unavailable to all future
				+	 *   allocations from both resources.
				+	 */
				+	if (rdt_resources_all[RDT_RESOURCE_L3DATA].alloc_enabled ||
				+	    rdt_resources_all[RDT_RESOURCE_L2DATA].alloc_enabled) {
				+		rdt_last_cmd_puts("CDP enabled\n");
				+		return -EINVAL;
				+	}
				+
				+	/*
				+	 * A platform for which the prefetch disable bits are unknown
				+	 * (zero) is treated as not supporting Cache Pseudo-Locking.
				+	 */
				+	prefetch_disable_bits = get_prefetch_disable_bits();
				+	if (prefetch_disable_bits == 0) {
				+		rdt_last_cmd_puts("pseudo-locking not supported\n");
				+		return -EINVAL;
				+	}
				+
				+	if (rdtgroup_monitor_in_progress(rdtgrp)) {
				+		rdt_last_cmd_puts("monitoring in progress\n");
				+		return -EINVAL;
				+	}
				+
				+	if (rdtgroup_tasks_assigned(rdtgrp)) {
				+		rdt_last_cmd_puts("tasks assigned to resource group\n");
				+		return -EINVAL;
				+	}
				+
				+	if (!cpumask_empty(&rdtgrp->cpu_mask)) {
				+		rdt_last_cmd_puts("CPUs assigned to resource group\n");
				+		return -EINVAL;
				+	}
				+
				+	if (rdtgroup_locksetup_user_restrict(rdtgrp)) {
				+		rdt_last_cmd_puts("unable to modify resctrl permissions\n");
				+		return -EIO;
				+	}
				+
				+	ret = pseudo_lock_init(rdtgrp);
				+	if (ret) {
				+		rdt_last_cmd_puts("unable to init pseudo-lock region\n");
				+		/* Hand the files back to the user before giving up. */
				+		rdtgroup_locksetup_user_restore(rdtgrp);
				+		return ret;
				+	}
				+
				+	/*
				+	 * On a system capable of monitoring a rmid was allocated when
				+	 * the control group was created. A group used for pseudo-locking
				+	 * no longer needs it. The call is safe on platforms that are not
				+	 * capable of monitoring.
				+	 */
				+	free_rmid(rdtgrp->mon.rmid);
				+
				+	return 0;
				+}
			
 
				+
			
 
				+/**
				+ * rdtgroup_locksetup_exit - resource group exits locksetup mode
				+ * @rdtgrp: resource group
				+ *
				+ * Lifts the restrictions that were put in place when the resource group
				+ * entered locksetup mode: on systems capable of monitoring a new rmid is
				+ * allocated, user access to the group's files is restored and the
				+ * pseudo-lock region is freed.
				+ *
				+ * Return: 0 on success, <0 on failure
				+ */
				+int rdtgroup_locksetup_exit(struct rdtgroup *rdtgrp)
				+{
				+	int err;
				+
				+	if (rdt_mon_capable) {
				+		int rmid = alloc_rmid();
				+
				+		if (rmid < 0) {
				+			rdt_last_cmd_puts("out of RMIDs\n");
				+			return rmid;
				+		}
				+		rdtgrp->mon.rmid = rmid;
				+	}
				+
				+	err = rdtgroup_locksetup_user_restore(rdtgrp);
				+	if (!err) {
				+		pseudo_lock_free(rdtgrp);
				+		return 0;
				+	}
				+
				+	/* Access could not be restored: give the rmid back again. */
				+	free_rmid(rdtgrp->mon.rmid);
				+	return err;
				+}
			
 
				+
			
 
				+/**
				+ * rdtgroup_cbm_overlaps_pseudo_locked - Test if CBM or portion is pseudo-locked
				+ * @d: RDT domain
				+ * @_cbm: CBM to test
				+ *
				+ * @d represents a cache instance and @_cbm a capacity bitmask that is
				+ * considered for it. Determine if @_cbm overlaps with any existing
				+ * pseudo-locked region on @d.
				+ *
				+ * Both bitmasks are copied into unsigned long locals before they are
				+ * handed to bitmap_intersects(). The bitmap helpers operate on arrays of
				+ * unsigned long, so passing the address of a 32-bit object cast to
				+ * unsigned long * makes them read beyond that object on 64-bit builds.
				+ *
				+ * Return: true if @_cbm overlaps with pseudo-locked region on @d, false
				+ * otherwise (including when @d has no pseudo-locked region).
				+ */
				+bool rdtgroup_cbm_overlaps_pseudo_locked(struct rdt_domain *d, u32 _cbm)
				+{
				+	unsigned long cbm = _cbm;
				+	unsigned long cbm_b;
				+	unsigned int cbm_len;
				+
				+	if (!d->plr)
				+		return false;
				+
				+	cbm_len = d->plr->r->cache.cbm_len;
				+	/* NOTE(review): assumes plr->cbm is an integer no wider than long */
				+	cbm_b = d->plr->cbm;
				+
				+	return bitmap_intersects(&cbm, &cbm_b, cbm_len);
				+}
			
 
				+
			
 
				+/**
				+ * rdtgroup_pseudo_locked_in_hierarchy - Pseudo-locked region in cache hierarchy
				+ * @d: RDT domain under test
				+ *
				+ * Setting up a pseudo-locked region affects every cache instance in the
				+ * hierarchy of that region. New pseudo-locked regions must therefore not
				+ * be created in a hierarchy that already contains one, which makes it
				+ * essential to know whether such a region exists.
				+ *
				+ * Return: true if a pseudo-locked region exists in the hierarchy of @d or
				+ *         if it is not possible to test due to memory allocation issue,
				+ *         false otherwise.
				+ */
				+bool rdtgroup_pseudo_locked_in_hierarchy(struct rdt_domain *d)
				+{
				+	cpumask_var_t psl_cpus;
				+	struct rdt_resource *res;
				+	struct rdt_domain *dom;
				+	bool found;
				+
				+	/* Without the scratch mask no test is possible: report "exists". */
				+	if (!zalloc_cpumask_var(&psl_cpus, GFP_KERNEL))
				+		return true;
				+
				+	/*
				+	 * Collect the cpus of every domain, across all alloc enabled
				+	 * resources, that has a pseudo-locked region associated with it.
				+	 */
				+	for_each_alloc_enabled_rdt_resource(res) {
				+		list_for_each_entry(dom, &res->domains, list) {
				+			if (!dom->plr)
				+				continue;
				+			cpumask_or(psl_cpus, psl_cpus, &dom->cpu_mask);
				+		}
				+	}
				+
				+	/* A shared cpu means @d is in the hierarchy of an existing region. */
				+	found = cpumask_intersects(&d->cpu_mask, psl_cpus);
				+
				+	free_cpumask_var(psl_cpus);
				+	return found;
				+}
			
 
				+
			
 
				+/**
			
 
				+ * measure_cycles_lat_fn - Measure cycle latency to read pseudo-locked memory
			
 
				+ * @_plr: pseudo-lock region to measure
			
 
				+ *
			
 
				+ * There is no deterministic way to test if a memory region is cached. One
			
 
				+ * way is to measure how long it takes to read the memory: the speed of
			
 
				+ * access indicates how close to the CPU the data was. Furthermore, with
			
 
				+ * the prefetcher disabled and the memory read at a stride of half a cache
			
 
				+ * line (the loop steps 32 bytes, which assumes 64-byte lines), a cache
			
 
				+ * miss is easy to spot since the read of the first half is significantly
			
 
				+ * slower than the read of the second half.
			
 
				+ *
			
 
				+ * Return: 0. Waiter on waitqueue will be woken on completion.
			
 
				+ */
			
 
				+static int measure_cycles_lat_fn(void *_plr)
			
 
				+{
			
 
				+	struct pseudo_lock_region *plr = _plr;
			
 
				+	unsigned long i;
			
 
				+	u64 start, end;
			
 
				+#ifdef CONFIG_KASAN
			
 
				+	/*
			
 
				+	 * The registers used for local register variables are also used
			
 
				+	 * when KASAN is active. When KASAN is active we use a regular
			
 
				+	 * variable to ensure we always use a valid pointer to access memory.
			
 
				+	 * The cost is that accessing this pointer, which could be in
			
 
				+	 * cache, will be included in the measurement of memory read latency.
			
 
				+	 */
			
 
				+	void *mem_r;
			
 
				+#else
			
 
				+#ifdef CONFIG_X86_64
			
 
				+	register void *mem_r asm("rbx");
			
 
				+#else
			
 
				+	register void *mem_r asm("ebx");
			
 
				+#endif /* CONFIG_X86_64 */
			
 
				+#endif /* CONFIG_KASAN */
			
 
				+
			
 
				+	local_irq_disable();
			
 
				+	/*
			
 
				+	 * The wrmsr call may be reordered with the assignment below it.
			
 
				+	 * Call wrmsr as directly as possible so that tracing cannot clobber
			
 
				+	 * the local register variable used for the memory pointer.
			
 
				+	 */
			
 
				+	__wrmsr(MSR_MISC_FEATURE_CONTROL, prefetch_disable_bits, 0x0);
			
 
				+	mem_r = plr->kmem;
			
 
				+	/*
			
 
				+	 * Dummy execution of the time measurement to load the needed
			
 
				+	 * instructions into the L1 instruction cache.
			
 
				+	 */
			
 
				+	start = rdtsc_ordered();
			
 
				+	for (i = 0; i < plr->size; i += 32) {
			
 
				+		start = rdtsc_ordered();
			
 
				+		asm volatile("mov (%0,%1,1), %%eax\n\t"
			
 
				+			     :
			
 
				+			     : "r" (mem_r), "r" (i)
			
 
				+			     : "%eax", "memory");
			
 
				+		end = rdtsc_ordered();
			
 
				+		trace_pseudo_lock_mem_latency((u32)(end - start));
			
 
				+	}
			
 
				+	wrmsr(MSR_MISC_FEATURE_CONTROL, 0x0, 0x0);
			
 
				+	local_irq_enable();
			
 
				+	plr->thread_done = 1;
			
 
				+	wake_up_interruptible(&plr->lock_thread_wq);
			
 
				+	return 0;
			
 
				+}
			
 
				+
			
 
				+static int measure_cycles_perf_fn(void *_plr)
			
 
				+{
			
 
				+	unsigned long long l3_hits = 0, l3_miss = 0;
			
 
				+	u64 l3_hit_bits = 0, l3_miss_bits = 0;
			
 
				+	struct pseudo_lock_region *plr = _plr;
			
 
				+	unsigned long long l2_hits, l2_miss;
			
 
				+	u64 l2_hit_bits, l2_miss_bits;
			
 
				+	unsigned long i;
			
 
				+#ifdef CONFIG_KASAN
			
 
				+	/*
			
 
				+	 * The registers used for local register variables are also used
			
 
				+	 * when KASAN is active. When KASAN is active we use regular variables
			
 
				+	 * at the cost of including cache access latency to these variables
			
 
				+	 * in the measurements.
			
 
				+	 */
			
 
				+	unsigned int line_size;
			
 
				+	unsigned int size;
			
 
				+	void *mem_r;
			
 
				+#else
			
 
				+	register unsigned int line_size asm("esi");
			
 
				+	register unsigned int size asm("edi");
			
 
				+#ifdef CONFIG_X86_64
			
 
				+	register void *mem_r asm("rbx");
			
 
				+#else
			
 
				+	register void *mem_r asm("ebx");
			
 
				+#endif /* CONFIG_X86_64 */
			
 
				+#endif /* CONFIG_KASAN */
			
 
				+
			
 
				+	/*
			
 
				+	 * Non-architectural event for the Goldmont Microarchitecture
			
 
				+	 * from Intel x86 Architecture Software Developer Manual (SDM):
			
 
				+	 * MEM_LOAD_UOPS_RETIRED D1H (event number)
			
 
				+	 * Umask values:
			
 
				+	 *     L1_HIT   01H
			
 
				+	 *     L2_HIT   02H
			
 
				+	 *     L1_MISS  08H
			
 
				+	 *     L2_MISS  10H
			
 
				+	 *
			
 
				+	 * On Broadwell Microarchitecture the MEM_LOAD_UOPS_RETIRED event
			
 
				+	 * has two "no fix" errata associated with it: BDM35 and BDM100. On
			
 
				+	 * this platform we use the following events instead:
			
 
				+	 *  L2_RQSTS 24H (Documented in https://download.01.org/perfmon/BDW/)
			
 
				+	 *       REFERENCES FFH
			
 
				+	 *       MISS       3FH
			
 
				+	 *  LONGEST_LAT_CACHE 2EH (Documented in SDM)
			
 
				+	 *       REFERENCE 4FH
			
 
				+	 *       MISS      41H
			
 
				+	 */
			
 
				+
			
 
				+	/*
			
 
				+	 * Start by setting flags for IA32_PERFEVTSELx:
			
 
				+	 *     OS  (Operating system mode)  0x2
			
 
				+	 *     INT (APIC interrupt enable)  0x10
			
 
				+	 *     EN  (Enable counter)         0x40
			
 
				+	 *
			
 
				+	 * Then add the Umask value and event number to select performance
			
 
				+	 * event.
			
 
				+	 */
			
 
				+
			
 
				+	switch (boot_cpu_data.x86_model) {
			
 
				+	case INTEL_FAM6_ATOM_GOLDMONT:
			
 
				+	case INTEL_FAM6_ATOM_GEMINI_LAKE:
			
 
				+		l2_hit_bits = (0x52ULL << 16) | (0x2 << 8) | 0xd1;
			
 
				+		l2_miss_bits = (0x52ULL << 16) | (0x10 << 8) | 0xd1;
			
 
				+		break;
			
 
				+	case INTEL_FAM6_BROADWELL_X:
			
 
				+		/* On BDW the l2_hit_bits count references, not hits */
			
 
				+		l2_hit_bits = (0x52ULL << 16) | (0xff << 8) | 0x24;
			
 
				+		l2_miss_bits = (0x52ULL << 16) | (0x3f << 8) | 0x24;
			
 
				+		/* On BDW the l3_hit_bits count references, not hits */
			
 
				+		l3_hit_bits = (0x52ULL << 16) | (0x4f << 8) | 0x2e;
			
 
				+		l3_miss_bits = (0x52ULL << 16) | (0x41 << 8) | 0x2e;
			
 
				+		break;
			
 
				+	default:
			
 
				+		goto out;
			
 
				+	}
			
 
				+
			
 
				+	local_irq_disable();
			
 
				+	/*
			
 
				+	 * Call wrmsr directly to avoid the local register variables from
			
 
				+	 * being overwritten due to reordering of their assignment with
			
 
				+	 * the wrmsr calls.
			
 
				+	 */
			
 
				+	__wrmsr(MSR_MISC_FEATURE_CONTROL, prefetch_disable_bits, 0x0);
			
 
				+	/* Disable events and reset counters */
			
 
				+	pseudo_wrmsrl_notrace(MSR_ARCH_PERFMON_EVENTSEL0, 0x0);
			
 
				+	pseudo_wrmsrl_notrace(MSR_ARCH_PERFMON_EVENTSEL0 + 1, 0x0);
			
 
				+	pseudo_wrmsrl_notrace(MSR_ARCH_PERFMON_PERFCTR0, 0x0);
			
 
				+	pseudo_wrmsrl_notrace(MSR_ARCH_PERFMON_PERFCTR0 + 1, 0x0);
			
 
				+	if (l3_hit_bits > 0) {
			
 
				+		pseudo_wrmsrl_notrace(MSR_ARCH_PERFMON_EVENTSEL0 + 2, 0x0);
			
 
				+		pseudo_wrmsrl_notrace(MSR_ARCH_PERFMON_EVENTSEL0 + 3, 0x0);
			
 
				+		pseudo_wrmsrl_notrace(MSR_ARCH_PERFMON_PERFCTR0 + 2, 0x0);
			
 
				+		pseudo_wrmsrl_notrace(MSR_ARCH_PERFMON_PERFCTR0 + 3, 0x0);
			
 
				+	}
			
 
				+	/* Set and enable the L2 counters (and the L3 counters if selected) */
			
 
				+	pseudo_wrmsrl_notrace(MSR_ARCH_PERFMON_EVENTSEL0, l2_hit_bits);
			
 
				+	pseudo_wrmsrl_notrace(MSR_ARCH_PERFMON_EVENTSEL0 + 1, l2_miss_bits);
			
 
				+	if (l3_hit_bits > 0) {
			
 
				+		pseudo_wrmsrl_notrace(MSR_ARCH_PERFMON_EVENTSEL0 + 2,
			
 
				+				      l3_hit_bits);
			
 
				+		pseudo_wrmsrl_notrace(MSR_ARCH_PERFMON_EVENTSEL0 + 3,
			
 
				+				      l3_miss_bits);
			
 
				+	}
			
 
				+	mem_r = plr->kmem;
			
 
				+	size = plr->size;
			
 
				+	line_size = plr->line_size;
			
 
				+	for (i = 0; i < size; i += line_size) {
			
 
				+		asm volatile("mov (%0,%1,1), %%eax\n\t"
			
 
				+			     :
			
 
				+			     : "r" (mem_r), "r" (i)
			
 
				+			     : "%eax", "memory");
			
 
				+	}
			
 
				+	/*
			
 
				+	 * Call wrmsr directly (no tracing) to not influence
			
 
				+	 * the cache access counters as they are disabled.
			
 
				+	 */
			
 
				+	pseudo_wrmsrl_notrace(MSR_ARCH_PERFMON_EVENTSEL0,
			
 
				+			      l2_hit_bits & ~(0x40ULL << 16));
			
 
				+	pseudo_wrmsrl_notrace(MSR_ARCH_PERFMON_EVENTSEL0 + 1,
			
 
				+			      l2_miss_bits & ~(0x40ULL << 16));
			
 
				+	if (l3_hit_bits > 0) {
			
 
				+		pseudo_wrmsrl_notrace(MSR_ARCH_PERFMON_EVENTSEL0 + 2,
			
 
				+				      l3_hit_bits & ~(0x40ULL << 16));
			
 
				+		pseudo_wrmsrl_notrace(MSR_ARCH_PERFMON_EVENTSEL0 + 3,
			
 
				+				      l3_miss_bits & ~(0x40ULL << 16));
			
 
				+	}
			
 
				+	l2_hits = native_read_pmc(0);
			
 
				+	l2_miss = native_read_pmc(1);
			
 
				+	if (l3_hit_bits > 0) {
			
 
				+		l3_hits = native_read_pmc(2);
			
 
				+		l3_miss = native_read_pmc(3);
			
 
				+	}
			
 
				+	wrmsr(MSR_MISC_FEATURE_CONTROL, 0x0, 0x0);
			
 
				+	local_irq_enable();
			
 
				+	/*
			
 
				+	 * On BDW we count references and misses, need to adjust. Sometimes
			
 
				+	 * the "miss" counter is a bit more than the references, for
			
 
				+	 * example, x references but x + 1 misses. To not report invalid
			
 
				+	 * hit values in this case we treat that as misses equal to
			
 
				+	 * references.
			
 
				+	 */
			
 
				+	if (boot_cpu_data.x86_model == INTEL_FAM6_BROADWELL_X)
			
 
				+		l2_hits -= (l2_miss > l2_hits ? l2_hits : l2_miss);
			
 
				+	trace_pseudo_lock_l2(l2_hits, l2_miss);
			
 
				+	if (l3_hit_bits > 0) {
			
 
				+		if (boot_cpu_data.x86_model == INTEL_FAM6_BROADWELL_X)
			
 
				+			l3_hits -= (l3_miss > l3_hits ? l3_hits : l3_miss);
			
 
				+		trace_pseudo_lock_l3(l3_hits, l3_miss);
			
 
				+	}
			
 
				+
			
 
				+out:
			
 
				+	plr->thread_done = 1;
			
 
				+	wake_up_interruptible(&plr->lock_thread_wq);
			
 
				+	return 0;
			
 
				+}
			
 
				+
			
 
				+/**
				+ * pseudo_lock_measure_cycles - Trigger latency measure to pseudo-locked region
				+ * @rdtgrp: resource group owning the pseudo-locked region to measure
				+ * @sel: measurement selector: 1 runs measure_cycles_lat_fn(), 2 runs
				+ *       measure_cycles_perf_fn()
				+ *
				+ * A measurement of the latency to access a pseudo-locked region has to
				+ * run on a cpu associated with that region. The first cpu of the region's
				+ * domain is picked, a kernel thread bound to that cpu is started to do the
				+ * measurement, and this function waits for the thread to complete.
				+ *
				+ * Return: 0 on success, <0 on failure (-ENODEV if the group was deleted or
				+ * the cpu is offline, -1 for any other value of @sel).
				+ */
				+static int pseudo_lock_measure_cycles(struct rdtgroup *rdtgrp, int sel)
				+{
				+	struct pseudo_lock_region *plr = rdtgrp->plr;
				+	int (*measure_fn)(void *data);
				+	struct task_struct *worker;
				+	unsigned int cpu;
				+	int ret = -1;
				+
				+	cpus_read_lock();
				+	mutex_lock(&rdtgroup_mutex);
				+
				+	if (rdtgrp->flags & RDT_DELETED) {
				+		ret = -ENODEV;
				+		goto out;
				+	}
				+
				+	plr->thread_done = 0;
				+	cpu = cpumask_first(&plr->d->cpu_mask);
				+	if (!cpu_online(cpu)) {
				+		ret = -ENODEV;
				+		goto out;
				+	}
				+
				+	switch (sel) {
				+	case 1:
				+		measure_fn = measure_cycles_lat_fn;
				+		break;
				+	case 2:
				+		measure_fn = measure_cycles_perf_fn;
				+		break;
				+	default:
				+		/* Unknown selector: leave with the initial ret of -1. */
				+		goto out;
				+	}
				+
				+	worker = kthread_create_on_node(measure_fn, plr, cpu_to_node(cpu),
				+					"pseudo_lock_measure/%u", cpu);
				+	if (IS_ERR(worker)) {
				+		ret = PTR_ERR(worker);
				+		goto out;
				+	}
				+	kthread_bind(worker, cpu);
				+	wake_up_process(worker);
				+
				+	/* The measurement thread sets thread_done and wakes this queue. */
				+	ret = wait_event_interruptible(plr->lock_thread_wq,
				+				       plr->thread_done == 1);
				+	if (ret >= 0)
				+		ret = 0;
				+
				+out:
				+	mutex_unlock(&rdtgroup_mutex);
				+	cpus_read_unlock();
				+	return ret;
				+}
			
 
				+
			
 
				+/*
				+ * Write handler of the "pseudo_lock_measure" debugfs file.
				+ *
				+ * The user writes a decimal measurement selector. Both selectors that
				+ * pseudo_lock_measure_cycles() dispatches on are accepted: 1 (latency
				+ * measurement) and 2 (performance counter measurement). Accepting only 1
				+ * would leave the performance counter measurement unreachable.
				+ *
				+ * Returns @count when the measurement ran, -EFAULT if the user buffer
				+ * could not be copied, the kstrtoint() error if the input is not a valid
				+ * integer, -EINVAL for an unsupported selector, or the error returned by
				+ * debugfs_file_get() or pseudo_lock_measure_cycles().
				+ */
				+static ssize_t pseudo_lock_measure_trigger(struct file *file,
				+					   const char __user *user_buf,
				+					   size_t count, loff_t *ppos)
				+{
				+	struct rdtgroup *rdtgrp = file->private_data;
				+	size_t buf_size;
				+	char buf[32];
				+	int ret;
				+	int sel;
				+
				+	/* Leave room for the terminating NUL added below. */
				+	buf_size = min(count, (sizeof(buf) - 1));
				+	if (copy_from_user(buf, user_buf, buf_size))
				+		return -EFAULT;
				+
				+	buf[buf_size] = '\0';
				+	ret = kstrtoint(buf, 10, &sel);
				+	if (ret)
				+		return ret;
				+
				+	if (sel != 1 && sel != 2)
				+		return -EINVAL;
				+
				+	ret = debugfs_file_get(file->f_path.dentry);
				+	if (ret)
				+		return ret;
				+
				+	ret = pseudo_lock_measure_cycles(rdtgrp, sel);
				+	if (ret == 0)
				+		ret = count;
				+
				+	debugfs_file_put(file->f_path.dentry);
				+
				+	return ret;
				+}
			
 
				+
			
 
				+/* File operations of the "pseudo_lock_measure" debugfs file. */
				+static const struct file_operations pseudo_measure_fops = {
				+	.open	= simple_open,
				+	.llseek	= default_llseek,
				+	.write	= pseudo_lock_measure_trigger,
				+};
			
 
				+
			
 
				+/**
				+ * rdtgroup_pseudo_lock_create - Create a pseudo-locked region
				+ * @rdtgrp: resource group to which pseudo-lock region belongs
				+ *
				+ * Called when a resource group in pseudo-locksetup mode receives a valid
				+ * schemata that should be pseudo-locked. Because the group is in
				+ * pseudo-locksetup mode its &struct pseudo_lock_region has already been
				+ * allocated and initialized with the essential information. On failure
				+ * the group stays in pseudo-locksetup mode with its
				+ * &struct pseudo_lock_region still attached but cleared of all
				+ * information, so the user can retry by writing the schemata again.
				+ *
				+ * rdtgroup_mutex is dropped and re-acquired while the debugfs entries and
				+ * the character device are created.
				+ *
				+ * Return: 0 if the pseudo-locked region was successfully pseudo-locked, <0
				+ * on failure. Descriptive error will be written to last_cmd_status buffer.
				+ */
				+int rdtgroup_pseudo_lock_create(struct rdtgroup *rdtgrp)
				+{
				+	struct pseudo_lock_region *plr = rdtgrp->plr;
				+	struct task_struct *locker;
				+	unsigned int minor;
				+	struct device *cdev;
				+	int ret;
				+
				+	ret = pseudo_lock_region_alloc(plr);
				+	if (ret < 0)
				+		return ret;
				+
				+	if (pseudo_lock_cstates_constrain(plr) < 0) {
				+		ret = -EINVAL;
				+		goto out_region;
				+	}
				+
				+	plr->thread_done = 0;
				+
				+	locker = kthread_create_on_node(pseudo_lock_fn, rdtgrp,
				+					cpu_to_node(plr->cpu),
				+					"pseudo_lock/%u", plr->cpu);
				+	if (IS_ERR(locker)) {
				+		ret = PTR_ERR(locker);
				+		rdt_last_cmd_printf("locking thread returned error %d\n", ret);
				+		goto out_cstates;
				+	}
				+
				+	kthread_bind(locker, plr->cpu);
				+	wake_up_process(locker);
				+
				+	ret = wait_event_interruptible(plr->lock_thread_wq,
				+				       plr->thread_done == 1);
				+	if (ret < 0) {
				+		/*
				+		 * If the thread never got on the CPU and the process
				+		 * setting up the region is interrupted, the thread is
				+		 * left runnable. Once it does run it will dereference
				+		 * the cleared, but not freed, plr struct, which results
				+		 * in an empty pseudo-locking loop.
				+		 */
				+		rdt_last_cmd_puts("locking thread interrupted\n");
				+		goto out_cstates;
				+	}
				+
				+	ret = pseudo_lock_minor_get(&minor);
				+	if (ret < 0) {
				+		rdt_last_cmd_puts("unable to obtain a new minor number\n");
				+		goto out_cstates;
				+	}
				+
				+	/*
				+	 * Unlock access but keep the reference: the pseudo-locked
				+	 * region will still be here on return.
				+	 *
				+	 * The mutex is released temporarily to avoid a potential
				+	 * deadlock with the mm->mmap_sem semaphore, which is taken in
				+	 * the device_create() and debugfs_create_dir() callpath below
				+	 * as well as before the mmap() callback is called.
				+	 */
				+	mutex_unlock(&rdtgroup_mutex);
				+
				+	if (!IS_ERR_OR_NULL(debugfs_resctrl)) {
				+		plr->debugfs_dir = debugfs_create_dir(rdtgrp->kn->name,
				+						      debugfs_resctrl);
				+		if (!IS_ERR_OR_NULL(plr->debugfs_dir))
				+			debugfs_create_file("pseudo_lock_measure", 0200,
				+					    plr->debugfs_dir, rdtgrp,
				+					    &pseudo_measure_fops);
				+	}
				+
				+	cdev = device_create(pseudo_lock_class, NULL,
				+			     MKDEV(pseudo_lock_major, minor),
				+			     rdtgrp, "%s", rdtgrp->kn->name);
				+
				+	mutex_lock(&rdtgroup_mutex);
				+
				+	if (IS_ERR(cdev)) {
				+		ret = PTR_ERR(cdev);
				+		rdt_last_cmd_printf("failed to create character device: %d\n",
				+				    ret);
				+		goto out_debugfs;
				+	}
				+
				+	/* The mutex was released: the group may have been removed meanwhile. */
				+	if (rdtgrp->flags & RDT_DELETED) {
				+		ret = -ENODEV;
				+		goto out_device;
				+	}
				+
				+	plr->minor = minor;
				+
				+	rdtgrp->mode = RDT_MODE_PSEUDO_LOCKED;
				+	closid_free(rdtgrp->closid);
				+	rdtgroup_kn_mode_restore(rdtgrp, "cpus", 0444);
				+	rdtgroup_kn_mode_restore(rdtgrp, "cpus_list", 0444);
				+
				+	return 0;
				+
				+out_device:
				+	device_destroy(pseudo_lock_class, MKDEV(pseudo_lock_major, minor));
				+out_debugfs:
				+	debugfs_remove_recursive(plr->debugfs_dir);
				+	pseudo_lock_minor_release(minor);
				+out_cstates:
				+	pseudo_lock_cstates_relax(plr);
				+out_region:
				+	pseudo_lock_region_clear(plr);
				+	return ret;
				+}
			
 
				+
			
 
				+/**
				+ * rdtgroup_pseudo_lock_remove - Remove a pseudo-locked region
				+ * @rdtgrp: resource group to which the pseudo-locked region belongs
				+ *
				+ * Removal of a pseudo-locked region is initiated when the resource group
				+ * is removed via a "rmdir" from user space or when the resctrl filesystem
				+ * is unmounted. The resource group does not return to pseudo-locksetup
				+ * mode first; it is removed directly. Creation and removal are therefore
				+ * asymmetric: the &struct pseudo_lock_region is freed here although it was
				+ * not allocated in rdtgroup_pseudo_lock_create().
				+ *
				+ * Return: void
				+ */
				+void rdtgroup_pseudo_lock_remove(struct rdtgroup *rdtgrp)
				+{
				+	struct pseudo_lock_region *plr = rdtgrp->plr;
				+
				+	if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP) {
				+		/*
				+		 * Default group cannot be a pseudo-locked region so the
				+		 * closid can be freed here.
				+		 */
				+		closid_free(rdtgrp->closid);
				+	} else {
				+		/* Undo everything set up for a locked region. */
				+		pseudo_lock_cstates_relax(plr);
				+		debugfs_remove_recursive(plr->debugfs_dir);
				+		device_destroy(pseudo_lock_class,
				+			       MKDEV(pseudo_lock_major, plr->minor));
				+		pseudo_lock_minor_release(plr->minor);
				+	}
				+
				+	pseudo_lock_free(rdtgrp);
				+}
			
 
				+
			
 
				+/*
				+ * open() handler of the pseudo-lock character device: look up the resource
				+ * group registered for the inode's minor number, store it in the file's
				+ * private data and increment the group's waitcount.
				+ *
				+ * Return: 0 on success, -ENODEV if no group matches the minor number.
				+ */
				+static int pseudo_lock_dev_open(struct inode *inode, struct file *filp)
				+{
				+	struct rdtgroup *rdtgrp;
				+	int ret = -ENODEV;
				+
				+	mutex_lock(&rdtgroup_mutex);
				+
				+	rdtgrp = region_find_by_minor(iminor(inode));
				+	if (rdtgrp) {
				+		filp->private_data = rdtgrp;
				+		atomic_inc(&rdtgrp->waitcount);
				+		/* Perform a non-seekable open - llseek is not supported */
				+		filp->f_mode &= ~(FMODE_LSEEK | FMODE_PREAD | FMODE_PWRITE);
				+		ret = 0;
				+	}
				+
				+	mutex_unlock(&rdtgroup_mutex);
				+
				+	return ret;
				+}
			
 
				+
			
 
				+/*
				+ * release() handler of the pseudo-lock character device: detach the
				+ * resource group from the file and decrement the group's waitcount.
				+ *
				+ * Return: 0 on success, -ENODEV (with a warning) if the file carries no
				+ * resource group.
				+ */
				+static int pseudo_lock_dev_release(struct inode *inode, struct file *filp)
				+{
				+	struct rdtgroup *rdtgrp;
				+	int ret = 0;
				+
				+	mutex_lock(&rdtgroup_mutex);
				+
				+	rdtgrp = filp->private_data;
				+	if (WARN_ON(!rdtgrp)) {
				+		ret = -ENODEV;
				+	} else {
				+		filp->private_data = NULL;
				+		atomic_dec(&rdtgrp->waitcount);
				+	}
				+
				+	mutex_unlock(&rdtgroup_mutex);
				+
				+	return ret;
				+}
			
 
				+
			
 
				+/* ->mremap() callback: remapping is not supported, always fails. */
				+static int pseudo_lock_dev_mremap(struct vm_area_struct *area)
				+{
				+	return -EINVAL;
				+}
			
 
				+
			
 
				+/*
				+ * VMA operations for mappings of a pseudo-locked region. Only ->mremap()
				+ * is provided, and it rejects every request.
				+ */
				+static const struct vm_operations_struct pseudo_mmap_ops = {
				+	.mremap = pseudo_lock_dev_mremap,
				+};
			
 
				+
			
 
				+/*
				+ * mmap() handler of the pseudo-lock character device.
				+ *
				+ * Maps the requested window of the region's kernel memory into the caller
				+ * after zeroing that window. The checks below are performed in order and
				+ * the first failing one determines the error:
				+ *   -ENODEV  no resource group attached to @filp
				+ *   -EINVAL  caller may run on CPUs not associated with the region
				+ *   -ENOSPC  offset lies beyond the region
				+ *   -EINVAL  mapping is not shared
				+ *   -ENOSPC  requested length does not fit behind the offset
				+ *   -EAGAIN  remap_pfn_range() failed
				+ *
				+ * Return: 0 on success, negative errno as listed above otherwise.
				+ */
				+static int pseudo_lock_dev_mmap(struct file *filp, struct vm_area_struct *vma)
				+{
				+	unsigned long vsize = vma->vm_end - vma->vm_start;
				+	unsigned long off = vma->vm_pgoff << PAGE_SHIFT;
				+	struct pseudo_lock_region *plr;
				+	struct rdtgroup *rdtgrp;
				+	unsigned long pfn;
				+	int ret;
				+
				+	mutex_lock(&rdtgroup_mutex);
				+
				+	rdtgrp = filp->private_data;
				+	if (WARN_ON(!rdtgrp)) {
				+		ret = -ENODEV;
				+		goto out_unlock;
				+	}
				+
				+	plr = rdtgrp->plr;
				+
				+	/*
				+	 * Task is required to run with affinity to the cpus associated
				+	 * with the pseudo-locked region. If this is not the case the task
				+	 * may be scheduled elsewhere and invalidate entries in the
				+	 * pseudo-locked region.
				+	 */
				+	if (!cpumask_subset(&current->cpus_allowed, &plr->d->cpu_mask)) {
				+		ret = -EINVAL;
				+		goto out_unlock;
				+	}
				+
				+	if (off > plr->size) {
				+		ret = -ENOSPC;
				+		goto out_unlock;
				+	}
				+
				+	/*
				+	 * Ensure changes are carried directly to the memory being mapped,
				+	 * do not allow copy-on-write mapping.
				+	 */
				+	if (!(vma->vm_flags & VM_SHARED)) {
				+		ret = -EINVAL;
				+		goto out_unlock;
				+	}
				+
				+	/* off <= plr->size was verified above, the subtraction cannot wrap */
				+	if (vsize > plr->size - off) {
				+		ret = -ENOSPC;
				+		goto out_unlock;
				+	}
				+
				+	memset(plr->kmem + off, 0, vsize);
				+
				+	pfn = (__pa(plr->kmem) >> PAGE_SHIFT) + vma->vm_pgoff;
				+	if (remap_pfn_range(vma, vma->vm_start, pfn, vsize,
				+			    vma->vm_page_prot)) {
				+		ret = -EAGAIN;
				+		goto out_unlock;
				+	}
				+
				+	vma->vm_ops = &pseudo_mmap_ops;
				+	ret = 0;
				+
				+out_unlock:
				+	mutex_unlock(&rdtgroup_mutex);
				+	return ret;
				+}
			
 
				+
			
 
				+/*
				+ * File operations of the pseudo-lock character device. Only open, release
				+ * and mmap are implemented; read and write are explicitly left unset and
				+ * seeking is routed to no_llseek.
				+ */
				+static const struct file_operations pseudo_lock_dev_fops = {
				+	.owner =	THIS_MODULE,
				+	.llseek =	no_llseek,
				+	.read =		NULL,
				+	.write =	NULL,
				+	.open =		pseudo_lock_dev_open,
				+	.release =	pseudo_lock_dev_release,
				+	.mmap =		pseudo_lock_dev_mmap,
				+};
			
 
				+
			
 
				+/*
				+ * ->devnode() callback of the pseudo-lock device class: name the device
				+ * node "pseudo_lock/<resource group name>" and, when the caller asks for a
				+ * mode, set it to 0600.
				+ *
				+ * Return: string allocated with kasprintf(GFP_KERNEL, ...).
				+ */
				+static char *pseudo_lock_devnode(struct device *dev, umode_t *mode)
				+{
				+	struct rdtgroup *rdtgrp = dev_get_drvdata(dev);
				+
				+	if (mode)
				+		*mode = 0600;
				+
				+	return kasprintf(GFP_KERNEL, "pseudo_lock/%s", rdtgrp->kn->name);
				+}
			
 
				+
			
 
				+/*
				+ * Register the "pseudo_lock" character device with a dynamically assigned
				+ * major number and create the matching device class.
				+ *
				+ * Return: 0 on success, negative errno if either registration fails. When
				+ * class creation fails the character device is unregistered again.
				+ */
				+int rdt_pseudo_lock_init(void)
				+{
				+	int major, err;
				+
				+	major = register_chrdev(0, "pseudo_lock", &pseudo_lock_dev_fops);
				+	if (major < 0)
				+		return major;
				+
				+	pseudo_lock_major = major;
				+
				+	pseudo_lock_class = class_create(THIS_MODULE, "pseudo_lock");
				+	if (IS_ERR(pseudo_lock_class)) {
				+		err = PTR_ERR(pseudo_lock_class);
				+		unregister_chrdev(pseudo_lock_major, "pseudo_lock");
				+		return err;
				+	}
				+
				+	pseudo_lock_class->devnode = pseudo_lock_devnode;
				+
				+	return 0;
				+}
			
 
				+
			
 
				+/*
				+ * Counterpart of rdt_pseudo_lock_init(): destroy the device class, then
				+ * unregister the character device, resetting pseudo_lock_class and
				+ * pseudo_lock_major afterwards.
				+ */
				+void rdt_pseudo_lock_release(void)
				+{
				+	class_destroy(pseudo_lock_class);
				+	pseudo_lock_class = NULL;
				+	unregister_chrdev(pseudo_lock_major, "pseudo_lock");
				+	pseudo_lock_major = 0;
				+}
			
--- a/arch/x86/kernel/cpu/intel_rdt_pseudo_lock_event.h
+++ b/arch/x86/kernel/cpu/intel_rdt_pseudo_lock_event.h
@@ -0,0 +1,43 @@
 
				+/* SPDX-License-Identifier: GPL-2.0 */
			
 
				+#undef TRACE_SYSTEM
			
 
				+#define TRACE_SYSTEM resctrl
			
 
				+
			
 
				+#if !defined(_TRACE_PSEUDO_LOCK_H) || defined(TRACE_HEADER_MULTI_READ)
			
 
				+#define _TRACE_PSEUDO_LOCK_H
			
 
				+
			
 
				+#include <linux/tracepoint.h>
			
 
				+
			
 
				+/* Records a single 32-bit latency value, printed as "latency=<n>". */
				+TRACE_EVENT(pseudo_lock_mem_latency,
				+	    TP_PROTO(u32 latency),
				+	    TP_ARGS(latency),
				+	    TP_STRUCT__entry(__field(u32, latency)),
				+	    TP_fast_assign(__entry->latency = latency),
				+	    TP_printk("latency=%u", __entry->latency)
				+	   );
				+
				+/* Records a pair of 64-bit L2 hit/miss counts, printed as "hits=<n> miss=<n>". */
				+TRACE_EVENT(pseudo_lock_l2,
				+	    TP_PROTO(u64 l2_hits, u64 l2_miss),
				+	    TP_ARGS(l2_hits, l2_miss),
				+	    TP_STRUCT__entry(__field(u64, l2_hits)
				+			     __field(u64, l2_miss)),
				+	    TP_fast_assign(__entry->l2_hits = l2_hits;
				+			   __entry->l2_miss = l2_miss;),
				+	    TP_printk("hits=%llu miss=%llu",
				+		      __entry->l2_hits, __entry->l2_miss));
				+
				+/* Records a pair of 64-bit L3 hit/miss counts, printed as "hits=<n> miss=<n>". */
				+TRACE_EVENT(pseudo_lock_l3,
				+	    TP_PROTO(u64 l3_hits, u64 l3_miss),
				+	    TP_ARGS(l3_hits, l3_miss),
				+	    TP_STRUCT__entry(__field(u64, l3_hits)
				+			     __field(u64, l3_miss)),
				+	    TP_fast_assign(__entry->l3_hits = l3_hits;
				+			   __entry->l3_miss = l3_miss;),
				+	    TP_printk("hits=%llu miss=%llu",
				+		      __entry->l3_hits, __entry->l3_miss));
			
 
				+
			
 
				+#endif /* _TRACE_PSEUDO_LOCK_H */
			
 
				+
			
 
				+#undef TRACE_INCLUDE_PATH
			
 
				+#define TRACE_INCLUDE_PATH .
			
 
				+#define TRACE_INCLUDE_FILE intel_rdt_pseudo_lock_event
			
 
				+#include <trace/define_trace.h>
			
--- a/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c
+++ b/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c
@@ -20,7 +20,9 @@
 
				 
			
 
				 #define pr_fmt(fmt)	KBUILD_MODNAME ": " fmt
			
 
				 
			
 
				+#include <linux/cacheinfo.h>
			
 
				 #include <linux/cpu.h>
			
 
				+#include <linux/debugfs.h>
			
 
				 #include <linux/fs.h>
			
 
				 #include <linux/sysfs.h>
			
 
				 #include <linux/kernfs.h>
			
@@ -55,6 +57,8 @@ static struct kernfs_node *kn_mondata;
 
				 static struct seq_buf last_cmd_status;
			
 
				 static char last_cmd_status_buf[512];
			
 
				 
			
 
				+struct dentry *debugfs_resctrl;
			
 
				+
			
 
				 void rdt_last_cmd_clear(void)
			
 
				 {
			
 
				 	lockdep_assert_held(&rdtgroup_mutex);
			
@@ -121,11 +125,65 @@ static int closid_alloc(void)
 
				 	return closid;
			
 
				 }
			
 
				 
			
 
				-static void closid_free(int closid)
			
 
				+void closid_free(int closid)
			
 
				 {
			
 
				 	closid_free_map |= 1 << closid;
			
 
				 }
			
 
				 
			
 
				+/**
				+ * closid_allocated - test if provided closid is in use
				+ * @closid: closid to be tested
				+ *
				+ * A closid is in use when its bit is cleared in closid_free_map.
				+ *
				+ * Return: true if @closid is currently associated with a resource group,
				+ * false if @closid is free
				+ */
				+static bool closid_allocated(unsigned int closid)
				+{
				+	return !(closid_free_map & (1 << closid));
				+}
			
 
				+
			
 
				+/**
				+ * rdtgroup_mode_by_closid - Return mode of resource group with closid
				+ * @closid: closid of the resource group
				+ *
				+ * Each resource group is associated with a @closid. The mode of a resource
				+ * group is looked up here by walking rdt_all_groups and matching on closid.
				+ *
				+ * Return: mode as &enum rdtgrp_mode of the resource group with closid
				+ * @closid, RDT_NUM_MODES if no resource group uses @closid
				+ */
				+enum rdtgrp_mode rdtgroup_mode_by_closid(int closid)
				+{
				+	enum rdtgrp_mode found = RDT_NUM_MODES;
				+	struct rdtgroup *grp;
				+
				+	list_for_each_entry(grp, &rdt_all_groups, rdtgroup_list) {
				+		if (grp->closid != closid)
				+			continue;
				+		found = grp->mode;
				+		break;
				+	}
				+
				+	return found;
				+}
			
 
				+
			
 
				+/* User-visible name of each resource group mode, indexed by the mode value. */
				+static const char * const rdt_mode_str[] = {
				+	[RDT_MODE_SHAREABLE]		= "shareable",
				+	[RDT_MODE_EXCLUSIVE]		= "exclusive",
				+	[RDT_MODE_PSEUDO_LOCKSETUP]	= "pseudo-locksetup",
				+	[RDT_MODE_PSEUDO_LOCKED]	= "pseudo-locked",
				+};
			
 
				+
			
 
				+/**
				+ * rdtgroup_mode_str - Return the string representation of mode
				+ * @mode: the resource group mode as &enum rdtgrp_mode
				+ *
				+ * Return: string representation of valid mode, "unknown" otherwise
				+ */
				+static const char *rdtgroup_mode_str(enum rdtgrp_mode mode)
				+{
				+	if (mode >= RDT_MODE_SHAREABLE && mode < RDT_NUM_MODES)
				+		return rdt_mode_str[mode];
				+
				+	return "unknown";
				+}
			
 
				+
			
 
				 /* set uid and gid of rdtgroup dirs and files to that of the creator */
			
 
				 static int rdtgroup_kn_set_ugid(struct kernfs_node *kn)
			
 
				 {
			
@@ -207,8 +265,12 @@ static int rdtgroup_cpus_show(struct kernfs_open_file *of,
 
				 	rdtgrp = rdtgroup_kn_lock_live(of->kn);
			
 
				 
			
 
				 	if (rdtgrp) {
			
 
				-		seq_printf(s, is_cpu_list(of) ? "%*pbl\n" : "%*pb\n",
			
 
				-			   cpumask_pr_args(&rdtgrp->cpu_mask));
			
 
				+		if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKED)
			
 
				+			seq_printf(s, is_cpu_list(of) ? "%*pbl\n" : "%*pb\n",
			
 
				+				   cpumask_pr_args(&rdtgrp->plr->d->cpu_mask));
			
 
				+		else
			
 
				+			seq_printf(s, is_cpu_list(of) ? "%*pbl\n" : "%*pb\n",
			
 
				+				   cpumask_pr_args(&rdtgrp->cpu_mask));
			
 
				 	} else {
			
 
				 		ret = -ENOENT;
			
 
				 	}
			
@@ -394,6 +456,13 @@ static ssize_t rdtgroup_cpus_write(struct kernfs_open_file *of,
 
				 		goto unlock;
			
 
				 	}
			
 
				 
			
 
				+	if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKED ||
			
 
				+	    rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP) {
			
 
				+		ret = -EINVAL;
			
 
				+		rdt_last_cmd_puts("pseudo-locking in progress\n");
			
 
				+		goto unlock;
			
 
				+	}
			
 
				+
			
 
				 	if (is_cpu_list(of))
			
 
				 		ret = cpulist_parse(buf, newmask);
			
 
				 	else
			
@@ -509,6 +578,32 @@ static int __rdtgroup_move_task(struct task_struct *tsk,
 
				 	return ret;
			
 
				 }
			
 
				 
			
 
				+/**
				+ * rdtgroup_tasks_assigned - Test if tasks have been assigned to resource group
				+ * @r: Resource group
				+ *
				+ * A task belongs to a control group when its closid matches the group's
				+ * closid, and to a monitor group when its rmid matches the group's rmid.
				+ * Must be called with rdtgroup_mutex held.
				+ *
				+ * Return: 1 if tasks have been assigned to @r, 0 otherwise
				+ */
				+int rdtgroup_tasks_assigned(struct rdtgroup *r)
				+{
				+	struct task_struct *p, *t;
				+	int assigned = 0;
				+
				+	lockdep_assert_held(&rdtgroup_mutex);
				+
				+	rcu_read_lock();
				+	for_each_process_thread(p, t) {
				+		bool ctrl_hit = r->type == RDTCTRL_GROUP &&
				+				t->closid == r->closid;
				+		bool mon_hit = r->type == RDTMON_GROUP &&
				+			       t->rmid == r->mon.rmid;
				+
				+		if (!ctrl_hit && !mon_hit)
				+			continue;
				+
				+		assigned = 1;
				+		break;
				+	}
				+	rcu_read_unlock();
				+
				+	return assigned;
				+}
			
 
				+
			
 
				 static int rdtgroup_task_write_permission(struct task_struct *task,
			
 
				 					  struct kernfs_open_file *of)
			
 
				 {
			
@@ -570,13 +665,22 @@ static ssize_t rdtgroup_tasks_write(struct kernfs_open_file *of,
 
				 	if (kstrtoint(strstrip(buf), 0, &pid) || pid < 0)
			
 
				 		return -EINVAL;
			
 
				 	rdtgrp = rdtgroup_kn_lock_live(of->kn);
			
 
				+	if (!rdtgrp) {
			
 
				+		rdtgroup_kn_unlock(of->kn);
			
 
				+		return -ENOENT;
			
 
				+	}
			
 
				 	rdt_last_cmd_clear();
			
 
				 
			
 
				-	if (rdtgrp)
			
 
				-		ret = rdtgroup_move_task(pid, rdtgrp, of);
			
 
				-	else
			
 
				-		ret = -ENOENT;
			
 
				+	if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKED ||
			
 
				+	    rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP) {
			
 
				+		ret = -EINVAL;
			
 
				+		rdt_last_cmd_puts("pseudo-locking in progress\n");
			
 
				+		goto unlock;
			
 
				+	}
			
 
				 
			
 
				+	ret = rdtgroup_move_task(pid, rdtgrp, of);
			
 
				+
			
 
				+unlock:
			
 
				 	rdtgroup_kn_unlock(of->kn);
			
 
				 
			
 
				 	return ret ?: nbytes;
			
@@ -662,6 +766,94 @@ static int rdt_shareable_bits_show(struct kernfs_open_file *of,
 
				 	return 0;
			
 
				 }
			
 
				 
			
 
				+/**
				+ * rdt_bit_usage_show - Display current usage of resources
				+ * @of: kernfs open file; the parent node's private data is the resource
				+ * @seq: seq_file the annotated bitmasks are written to
				+ * @v: unused
				+ *
				+ * A domain is a shared resource that can now be allocated differently. Here
				+ * we display the current regions of the domain as an annotated bitmask.
				+ * For each domain of this resource its allocation bitmask
				+ * is annotated as below to indicate the current usage of the corresponding bit:
				+ *   0 - currently unused
				+ *   X - currently available for sharing and used by software and hardware
				+ *   H - currently used by hardware only but available for software use
				+ *   S - currently used and shareable by software only
				+ *   E - currently used exclusively by one resource group
				+ *   P - currently pseudo-locked by one resource group
				+ *
				+ * Domains are separated by ';' and printed as "<id>=<annotation>", most
				+ * significant bit first.
				+ *
				+ * Return: always 0
				+ */
				+static int rdt_bit_usage_show(struct kernfs_open_file *of,
				+			      struct seq_file *seq, void *v)
				+{
				+	struct rdt_resource *r = of->kn->parent->priv;
				+	/*
				+	 * Use unsigned long even though only 32 bits are used: test_bit()
				+	 * accesses whole unsigned longs, so pointing it at a u32 would
				+	 * read beyond the end of the variable.
				+	 */
				+	unsigned long sw_shareable = 0, hw_shareable = 0;
				+	unsigned long exclusive = 0, pseudo_locked = 0;
				+	struct rdt_domain *dom;
				+	int i, hwb, swb, excl, psl;
				+	enum rdtgrp_mode mode;
				+	bool sep = false;
				+	u32 *ctrl;
				+
				+	mutex_lock(&rdtgroup_mutex);
				+	hw_shareable = r->cache.shareable_bits;
				+	list_for_each_entry(dom, &r->domains, list) {
				+		if (sep)
				+			seq_putc(seq, ';');
				+		ctrl = dom->ctrl_val;
				+		sw_shareable = 0;
				+		exclusive = 0;
				+		seq_printf(seq, "%d=", dom->id);
				+		/* Accumulate the CBMs of all allocated closids by mode. */
				+		for (i = 0; i < r->num_closid; i++, ctrl++) {
				+			if (!closid_allocated(i))
				+				continue;
				+			mode = rdtgroup_mode_by_closid(i);
				+			switch (mode) {
				+			case RDT_MODE_SHAREABLE:
				+				sw_shareable |= *ctrl;
				+				break;
				+			case RDT_MODE_EXCLUSIVE:
				+				exclusive |= *ctrl;
				+				break;
				+			case RDT_MODE_PSEUDO_LOCKSETUP:
				+			/*
				+			 * RDT_MODE_PSEUDO_LOCKSETUP is possible
				+			 * here but not included since the CBM
				+			 * associated with this CLOSID in this mode
				+			 * is not initialized and no task or cpu can be
				+			 * assigned this CLOSID.
				+			 */
				+				break;
				+			case RDT_MODE_PSEUDO_LOCKED:
				+			case RDT_NUM_MODES:
				+				WARN(1,
				+				     "invalid mode for closid %d\n", i);
				+				break;
				+			}
				+		}
				+		/* The pseudo-locked CBM is per domain, not per bit. */
				+		pseudo_locked = dom->plr ? dom->plr->cbm : 0;
				+		for (i = r->cache.cbm_len - 1; i >= 0; i--) {
				+			hwb = test_bit(i, &hw_shareable);
				+			swb = test_bit(i, &sw_shareable);
				+			excl = test_bit(i, &exclusive);
				+			psl = test_bit(i, &pseudo_locked);
				+			if (hwb && swb)
				+				seq_putc(seq, 'X');
				+			else if (hwb && !swb)
				+				seq_putc(seq, 'H');
				+			else if (!hwb && swb)
				+				seq_putc(seq, 'S');
				+			else if (excl)
				+				seq_putc(seq, 'E');
				+			else if (psl)
				+				seq_putc(seq, 'P');
				+			else /* Unused bits remain */
				+				seq_putc(seq, '0');
				+		}
				+		sep = true;
				+	}
				+	seq_putc(seq, '\n');
				+	mutex_unlock(&rdtgroup_mutex);
				+	return 0;
				+}
			
 
				+
			
 
				 static int rdt_min_bw_show(struct kernfs_open_file *of,
			
 
				 			     struct seq_file *seq, void *v)
			
 
				 {
			
@@ -740,6 +932,269 @@ static ssize_t max_threshold_occ_write(struct kernfs_open_file *of,
 
				 	return nbytes;
			
 
				 }
			
 
				 
			
 
				+/*
				+ * rdtgroup_mode_show - Display mode of this resource group
				+ *
				+ * Prints the mode name followed by a newline. Returns 0 on success and
				+ * -ENOENT if the resource group could not be locked live.
				+ */
				+static int rdtgroup_mode_show(struct kernfs_open_file *of,
				+			      struct seq_file *s, void *v)
				+{
				+	struct rdtgroup *rdtgrp = rdtgroup_kn_lock_live(of->kn);
				+	int ret = 0;
				+
				+	if (rdtgrp)
				+		seq_printf(s, "%s\n", rdtgroup_mode_str(rdtgrp->mode));
				+	else
				+		ret = -ENOENT;
				+
				+	rdtgroup_kn_unlock(of->kn);
				+	return ret;
				+}
			
 
				+
			
 
				+/**
				+ * rdtgroup_cbm_overlaps - Does CBM for intended closid overlap with other
				+ * @r: Resource to which domain instance @d belongs.
				+ * @d: The domain instance for which @closid is being tested.
				+ * @_cbm: Capacity bitmask being tested.
				+ * @closid: Intended closid for @_cbm.
				+ * @exclusive: Only check if overlaps with exclusive resource groups
				+ *
				+ * Checks if provided @_cbm intended to be used for @closid on domain
				+ * @d overlaps with any other closids or other hardware usage associated
				+ * with this domain. If @exclusive is true then only overlaps with
				+ * resource groups in exclusive mode will be considered. If @exclusive
				+ * is false then overlaps with any resource group or hardware entities
				+ * will be considered.
				+ *
				+ * Closids that are not allocated, @closid itself and groups in
				+ * pseudo-locksetup mode are never considered.
				+ *
				+ * Return: false if CBM does not overlap, true if it does.
				+ */
				+bool rdtgroup_cbm_overlaps(struct rdt_resource *r, struct rdt_domain *d,
				+			   u32 _cbm, int closid, bool exclusive)
				+{
				+	/*
				+	 * The bitmap API reads whole unsigned longs. Every 32-bit mask is
				+	 * therefore copied into an unsigned long before it is tested;
				+	 * casting a u32 pointer instead would read past the object (and
				+	 * past the end of the ctrl_val array for its last entry).
				+	 */
				+	unsigned long cbm = _cbm;
				+	unsigned long ctrl_b;
				+	enum rdtgrp_mode mode;
				+	u32 *ctrl;
				+	int i;
				+
				+	/* Check for any overlap with regions used by hardware directly */
				+	if (!exclusive) {
				+		ctrl_b = r->cache.shareable_bits;
				+		if (bitmap_intersects(&cbm, &ctrl_b, r->cache.cbm_len))
				+			return true;
				+	}
				+
				+	/* Check for overlap with other resource groups */
				+	ctrl = d->ctrl_val;
				+	for (i = 0; i < r->num_closid; i++, ctrl++) {
				+		ctrl_b = *ctrl;
				+		mode = rdtgroup_mode_by_closid(i);
				+		if (closid_allocated(i) && i != closid &&
				+		    mode != RDT_MODE_PSEUDO_LOCKSETUP) {
				+			if (bitmap_intersects(&cbm, &ctrl_b, r->cache.cbm_len)) {
				+				if (exclusive) {
				+					if (mode == RDT_MODE_EXCLUSIVE)
				+						return true;
				+					continue;
				+				}
				+				return true;
				+			}
				+		}
				+	}
				+
				+	return false;
				+}
			
 
				+
			
 
				+/**
				+ * rdtgroup_mode_test_exclusive - Test if this resource group can be exclusive
				+ * @rdtgrp: resource group whose current schemata is tested
				+ *
				+ * An exclusive resource group implies that there should be no sharing of
				+ * its allocated resources. At the time this group is considered to be
				+ * exclusive this test can determine if its current schemata supports this
				+ * setting by testing for overlap with all other resource groups.
				+ *
				+ * Return: true if resource group can be exclusive, false if there is overlap
				+ * with allocations of other resource groups and thus this resource group
				+ * cannot be exclusive.
				+ */
				+static bool rdtgroup_mode_test_exclusive(struct rdtgroup *rdtgrp)
				+{
				+	int closid = rdtgrp->closid;
				+	struct rdt_resource *r;
				+	struct rdt_domain *d;
				+	u32 cbm;
				+
				+	for_each_alloc_enabled_rdt_resource(r) {
				+		list_for_each_entry(d, &r->domains, list) {
				+			/* The group's own CBM in this domain */
				+			cbm = d->ctrl_val[closid];
				+			if (rdtgroup_cbm_overlaps(r, d, cbm, closid, false))
				+				return false;
				+		}
				+	}
				+
				+	return true;
				+}
			
 
				+
			
 
				+/**
				+ * rdtgroup_mode_write - Modify the resource group's mode
				+ * @of: kernfs open file of the "mode" file being written
				+ * @buf: user input; must end in a newline, which is replaced by a NUL
				+ * @nbytes: number of bytes in @buf
				+ * @off: unused
				+ *
				+ * Accepts "shareable", "exclusive" and "pseudo-locksetup". Writing the
				+ * mode the group already has succeeds without doing anything. A group in
				+ * pseudo-locked mode cannot be changed, and "pseudo-locked" cannot be
				+ * requested through this file: it is reported as unknown/unsupported.
				+ *
				+ * Return: @nbytes on success, -EINVAL on malformed or rejected input,
				+ * -ENOENT if the resource group could not be locked live, or the error
				+ * returned by the locksetup enter/exit helpers.
				+ */
				+static ssize_t rdtgroup_mode_write(struct kernfs_open_file *of,
				+				   char *buf, size_t nbytes, loff_t off)
				+{
				+	struct rdtgroup *rdtgrp;
				+	enum rdtgrp_mode mode;
				+	int ret = 0;
				+
				+	/* Valid input requires a trailing newline */
				+	if (nbytes == 0 || buf[nbytes - 1] != '\n')
				+		return -EINVAL;
				+	buf[nbytes - 1] = '\0';
				+
				+	rdtgrp = rdtgroup_kn_lock_live(of->kn);
				+	if (!rdtgrp) {
				+		rdtgroup_kn_unlock(of->kn);
				+		return -ENOENT;
				+	}
				+
				+	rdt_last_cmd_clear();
				+
				+	mode = rdtgrp->mode;
				+
				+	/* Requesting the current mode is a successful no-op. */
				+	if ((!strcmp(buf, "shareable") && mode == RDT_MODE_SHAREABLE) ||
				+	    (!strcmp(buf, "exclusive") && mode == RDT_MODE_EXCLUSIVE) ||
				+	    (!strcmp(buf, "pseudo-locksetup") &&
				+	     mode == RDT_MODE_PSEUDO_LOCKSETUP) ||
				+	    (!strcmp(buf, "pseudo-locked") && mode == RDT_MODE_PSEUDO_LOCKED))
				+		goto out;
				+
				+	/* Checked before parsing: any other input is refused for a locked group. */
				+	if (mode == RDT_MODE_PSEUDO_LOCKED) {
				+		rdt_last_cmd_printf("cannot change pseudo-locked group\n");
				+		ret = -EINVAL;
				+		goto out;
				+	}
				+
				+	if (!strcmp(buf, "shareable")) {
				+		/* Leaving locksetup mode requires its setup to be undone first. */
				+		if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP) {
				+			ret = rdtgroup_locksetup_exit(rdtgrp);
				+			if (ret)
				+				goto out;
				+		}
				+		rdtgrp->mode = RDT_MODE_SHAREABLE;
				+	} else if (!strcmp(buf, "exclusive")) {
				+		/* The overlap test runs before locksetup mode is exited. */
				+		if (!rdtgroup_mode_test_exclusive(rdtgrp)) {
				+			rdt_last_cmd_printf("schemata overlaps\n");
				+			ret = -EINVAL;
				+			goto out;
				+		}
				+		if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP) {
				+			ret = rdtgroup_locksetup_exit(rdtgrp);
				+			if (ret)
				+				goto out;
				+		}
				+		rdtgrp->mode = RDT_MODE_EXCLUSIVE;
				+	} else if (!strcmp(buf, "pseudo-locksetup")) {
				+		ret = rdtgroup_locksetup_enter(rdtgrp);
				+		if (ret)
				+			goto out;
				+		rdtgrp->mode = RDT_MODE_PSEUDO_LOCKSETUP;
				+	} else {
				+		rdt_last_cmd_printf("unknown/unsupported mode\n");
				+		ret = -EINVAL;
				+	}
				+
				+out:
				+	rdtgroup_kn_unlock(of->kn);
				+	return ret ?: nbytes;
				+}
			
 
				+
			
 
				+/**
				+ * rdtgroup_cbm_to_size - Translate CBM to size in bytes
				+ * @r: RDT resource to which @d belongs.
				+ * @d: RDT domain instance.
				+ * @cbm: bitmask for which the size should be computed.
				+ *
				+ * The bitmask provided associated with the RDT domain instance @d will be
				+ * translated into how many bytes it represents. The size in bytes is
				+ * computed by first dividing the total cache size by the CBM length to
				+ * determine how many bytes each bit in the bitmask represents. The result
				+ * is multiplied with the number of bits set in the bitmask.
				+ *
				+ * Return: size in bytes, 0 if no cache leaf at the resource's cache level
				+ * is found for a CPU of @d.
				+ */
				+unsigned int rdtgroup_cbm_to_size(struct rdt_resource *r,
				+				  struct rdt_domain *d, u32 cbm)
				+{
				+	/*
				+	 * bitmap_weight() reads whole unsigned longs; count the bits in a
				+	 * properly sized copy instead of casting the address of a u32.
				+	 */
				+	unsigned long cbm_bits = cbm;
				+	struct cpu_cacheinfo *ci;
				+	unsigned int size = 0;
				+	int num_b, i;
				+
				+	num_b = bitmap_weight(&cbm_bits, r->cache.cbm_len);
				+	ci = get_cpu_cacheinfo(cpumask_any(&d->cpu_mask));
				+	for (i = 0; i < ci->num_leaves; i++) {
				+		if (ci->info_list[i].level == r->cache_level) {
				+			size = ci->info_list[i].size / r->cache.cbm_len * num_b;
				+			break;
				+		}
				+	}
				+
				+	return size;
				+}
			
 
				+
			
 
				+/**
				+ * rdtgroup_size_show - Display size in bytes of allocated regions
				+ * @of: kernfs open file of the "size" file being read
				+ * @s: seq_file the sizes are written to
				+ * @v: unused
				+ *
				+ * The "size" file mirrors the layout of the "schemata" file, printing the
				+ * size in bytes of each region instead of the capacity bitmask: one line
				+ * per resource, with "<domain id>=<size>" entries separated by ';'.
				+ *
				+ * A pseudo-locked group prints only the single region it has locked. A
				+ * group in pseudo-locksetup mode reports a size of 0 for every domain.
				+ *
				+ * Return: 0 on success, -ENOENT if the resource group could not be locked
				+ * live.
				+ */
				+static int rdtgroup_size_show(struct kernfs_open_file *of,
				+			      struct seq_file *s, void *v)
				+{
				+	struct rdtgroup *rdtgrp;
				+	struct rdt_resource *r;
				+	struct rdt_domain *d;
				+	unsigned int size;
				+	bool sep;
				+	u32 cbm;
				+
				+	rdtgrp = rdtgroup_kn_lock_live(of->kn);
				+	if (!rdtgrp) {
				+		rdtgroup_kn_unlock(of->kn);
				+		return -ENOENT;
				+	}
				+
				+	if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKED) {
				+		seq_printf(s, "%*s:", max_name_width, rdtgrp->plr->r->name);
				+		size = rdtgroup_cbm_to_size(rdtgrp->plr->r,
				+					    rdtgrp->plr->d,
				+					    rdtgrp->plr->cbm);
				+		seq_printf(s, "%d=%u\n", rdtgrp->plr->d->id, size);
				+		goto out;
				+	}
				+
				+	for_each_alloc_enabled_rdt_resource(r) {
				+		/*
				+		 * Every resource starts a new line, so the separator state
				+		 * must start over as well. Otherwise the first domain of
				+		 * each subsequent resource is preceded by a stray ';'.
				+		 */
				+		sep = false;
				+		seq_printf(s, "%*s:", max_name_width, r->name);
				+		list_for_each_entry(d, &r->domains, list) {
				+			if (sep)
				+				seq_putc(s, ';');
				+			if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP) {
				+				size = 0;
				+			} else {
				+				cbm = d->ctrl_val[rdtgrp->closid];
				+				size = rdtgroup_cbm_to_size(r, d, cbm);
				+			}
				+			seq_printf(s, "%d=%u", d->id, size);
				+			sep = true;
				+		}
				+		seq_putc(s, '\n');
				+	}
				+
				+out:
				+	rdtgroup_kn_unlock(of->kn);
				+
				+	return 0;
				+}
			
 
				+
			
 
				 /* rdtgroup information files for one cache resource. */
			
 
				 static struct rftype res_common_files[] = {
			
 
				 	{
			
@@ -791,6 +1246,13 @@ static struct rftype res_common_files[] = {
 
				 		.seq_show	= rdt_shareable_bits_show,
			
 
				 		.fflags		= RF_CTRL_INFO | RFTYPE_RES_CACHE,
			
 
				 	},
			
 
				+	{
			
 
				+		.name		= "bit_usage",
			
 
				+		.mode		= 0444,
			
 
				+		.kf_ops		= &rdtgroup_kf_single_ops,
			
 
				+		.seq_show	= rdt_bit_usage_show,
			
 
				+		.fflags		= RF_CTRL_INFO | RFTYPE_RES_CACHE,
			
 
				+	},
			
 
				 	{
			
 
				 		.name		= "min_bandwidth",
			
 
				 		.mode		= 0444,
			
@@ -853,6 +1315,22 @@ static struct rftype res_common_files[] = {
 
				 		.seq_show	= rdtgroup_schemata_show,
			
 
				 		.fflags		= RF_CTRL_BASE,
			
 
				 	},
			
 
				+	{
			
 
				+		.name		= "mode",
			
 
				+		.mode		= 0644,
			
 
				+		.kf_ops		= &rdtgroup_kf_single_ops,
			
 
				+		.write		= rdtgroup_mode_write,
			
 
				+		.seq_show	= rdtgroup_mode_show,
			
 
				+		.fflags		= RF_CTRL_BASE,
			
 
				+	},
			
 
				+	{
			
 
				+		.name		= "size",
			
 
				+		.mode		= 0444,
			
 
				+		.kf_ops		= &rdtgroup_kf_single_ops,
			
 
				+		.seq_show	= rdtgroup_size_show,
			
 
				+		.fflags		= RF_CTRL_BASE,
			
 
				+	},
			
 
				+
			
 
				 };
			
 
				 
			
 
				 static int rdtgroup_add_files(struct kernfs_node *kn, unsigned long fflags)
			
@@ -883,6 +1361,103 @@ error:
 
				 	return ret;
			
 
				 }
			
 
				 
			
 
				+/**
			
 
				+ * rdtgroup_kn_mode_restrict - Restrict user access to named resctrl file
			
 
				+ * @r: The resource group with which the file is associated.
			
 
				+ * @name: Name of the file
			
 
				+ *
			
 
				+ * The permissions of named resctrl file, directory, or link are modified
			
 
				+ * to not allow read, write, or execute by any user.
			
 
				+ *
			
 
				+ * WARNING: This function is intended to communicate to the user that the
			
 
				+ * resctrl file has been locked down - that it is not relevant to the
			
 
				+ * particular state the system finds itself in. It should not be relied
			
 
				+ * on to protect from user access because after the file's permissions
			
 
				+ * are restricted the user can still change the permissions using chmod
			
 
				+ * from the command line.
			
 
				+ *
			
 
				+ * Return: 0 on success, <0 on failure.
			
 
				+ */
			
 
				+int rdtgroup_kn_mode_restrict(struct rdtgroup *r, const char *name)
			
 
				+{
			
 
				+	struct iattr iattr = {.ia_valid = ATTR_MODE,};
			
 
				+	struct kernfs_node *kn;
			
 
				+	int ret = 0;
			
 
				+
			
 
				+	kn = kernfs_find_and_get_ns(r->kn, name, NULL);
			
 
				+	if (!kn)
			
 
				+		return -ENOENT;
			
 
				+
			
 
				+	switch (kernfs_type(kn)) {
			
 
				+	case KERNFS_DIR:
			
 
				+		iattr.ia_mode = S_IFDIR;
			
 
				+		break;
			
 
				+	case KERNFS_FILE:
			
 
				+		iattr.ia_mode = S_IFREG;
			
 
				+		break;
			
 
				+	case KERNFS_LINK:
			
 
				+		iattr.ia_mode = S_IFLNK;
			
 
				+		break;
			
 
				+	}
			
 
				+
			
 
				+	ret = kernfs_setattr(kn, &iattr);
			
 
				+	kernfs_put(kn);
			
 
				+	return ret;
			
 
				+}
			
 
				+
			
 
				+/**
			
 
				+ * rdtgroup_kn_mode_restore - Restore user access to named resctrl file
			
 
				+ * @r: The resource group with which the file is associated.
			
 
				+ * @name: Name of the file
			
 
				+ * @mask: Mask of permissions that should be restored
			
 
				+ *
			
 
				+ * Restore the permissions of the named file. If @name is a directory the
			
 
				+ * permissions of its parent will be used.
			
 
				+ *
			
 
				+ * Return: 0 on success, <0 on failure.
			
 
				+ */
			
 
				+int rdtgroup_kn_mode_restore(struct rdtgroup *r, const char *name,
			
 
				+			     umode_t mask)
			
 
				+{
			
 
				+	struct iattr iattr = {.ia_valid = ATTR_MODE,};
			
 
				+	struct kernfs_node *kn, *parent;
			
 
				+	struct rftype *rfts, *rft;
			
 
				+	int ret, len;
			
 
				+
			
 
				+	rfts = res_common_files;
			
 
				+	len = ARRAY_SIZE(res_common_files);
			
 
				+
			
 
				+	for (rft = rfts; rft < rfts + len; rft++) {
			
 
				+		if (!strcmp(rft->name, name))
			
 
				+			iattr.ia_mode = rft->mode & mask;
			
 
				+	}
			
 
				+
			
 
				+	kn = kernfs_find_and_get_ns(r->kn, name, NULL);
			
 
				+	if (!kn)
			
 
				+		return -ENOENT;
			
 
				+
			
 
				+	switch (kernfs_type(kn)) {
			
 
				+	case KERNFS_DIR:
			
 
				+		parent = kernfs_get_parent(kn);
			
 
				+		if (parent) {
			
 
				+			iattr.ia_mode |= parent->mode;
			
 
				+			kernfs_put(parent);
			
 
				+		}
			
 
				+		iattr.ia_mode |= S_IFDIR;
			
 
				+		break;
			
 
				+	case KERNFS_FILE:
			
 
				+		iattr.ia_mode |= S_IFREG;
			
 
				+		break;
			
 
				+	case KERNFS_LINK:
			
 
				+		iattr.ia_mode |= S_IFLNK;
			
 
				+		break;
			
 
				+	}
			
 
				+
			
 
				+	ret = kernfs_setattr(kn, &iattr);
			
 
				+	kernfs_put(kn);
			
 
				+	return ret;
			
 
				+}
			
 
				+
			
 
				 static int rdtgroup_mkdir_info_resdir(struct rdt_resource *r, char *name,
			
 
				 				      unsigned long fflags)
			
 
				 {
			
@@ -1224,6 +1799,9 @@ void rdtgroup_kn_unlock(struct kernfs_node *kn)
 
				 
			
 
				 	if (atomic_dec_and_test(&rdtgrp->waitcount) &&
			
 
				 	    (rdtgrp->flags & RDT_DELETED)) {
			
 
				+		if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP ||
			
 
				+		    rdtgrp->mode == RDT_MODE_PSEUDO_LOCKED)
			
 
				+			rdtgroup_pseudo_lock_remove(rdtgrp);
			
 
				 		kernfs_unbreak_active_protection(kn);
			
 
				 		kernfs_put(rdtgrp->kn);
			
 
				 		kfree(rdtgrp);
			
@@ -1289,10 +1867,16 @@ static struct dentry *rdt_mount(struct file_system_type *fs_type,
 
				 		rdtgroup_default.mon.mon_data_kn = kn_mondata;
			
 
				 	}
			
 
				 
			
 
				+	ret = rdt_pseudo_lock_init();
			
 
				+	if (ret) {
			
 
				+		dentry = ERR_PTR(ret);
			
 
				+		goto out_mondata;
			
 
				+	}
			
 
				+
			
 
				 	dentry = kernfs_mount(fs_type, flags, rdt_root,
			
 
				 			      RDTGROUP_SUPER_MAGIC, NULL);
			
 
				 	if (IS_ERR(dentry))
			
 
				-		goto out_mondata;
			
 
				+		goto out_psl;
			
 
				 
			
 
				 	if (rdt_alloc_capable)
			
 
				 		static_branch_enable_cpuslocked(&rdt_alloc_enable_key);
			
@@ -1310,6 +1894,8 @@ static struct dentry *rdt_mount(struct file_system_type *fs_type,
 
				 
			
 
				 	goto out;
			
 
				 
			
 
				+out_psl:
			
 
				+	rdt_pseudo_lock_release();
			
 
				 out_mondata:
			
 
				 	if (rdt_mon_capable)
			
 
				 		kernfs_remove(kn_mondata);
			
@@ -1447,6 +2033,10 @@ static void rmdir_all_sub(void)
 
				 		if (rdtgrp == &rdtgroup_default)
			
 
				 			continue;
			
 
				 
			
 
				+		if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP ||
			
 
				+		    rdtgrp->mode == RDT_MODE_PSEUDO_LOCKED)
			
 
				+			rdtgroup_pseudo_lock_remove(rdtgrp);
			
 
				+
			
 
				 		/*
			
 
				 		 * Give any CPUs back to the default group. We cannot copy
			
 
				 		 * cpu_online_mask because a CPU might have executed the
			
@@ -1483,6 +2073,8 @@ static void rdt_kill_sb(struct super_block *sb)
 
				 		reset_all_ctrls(r);
			
 
				 	cdp_disable_all();
			
 
				 	rmdir_all_sub();
			
 
				+	rdt_pseudo_lock_release();
			
 
				+	rdtgroup_default.mode = RDT_MODE_SHAREABLE;
			
 
				 	static_branch_disable_cpuslocked(&rdt_alloc_enable_key);
			
 
				 	static_branch_disable_cpuslocked(&rdt_mon_enable_key);
			
 
				 	static_branch_disable_cpuslocked(&rdt_enable_key);
			
@@ -1682,6 +2274,114 @@ out_destroy:
 
				 	return ret;
			
 
				 }
			
 
				 
			
 
				+/**
			
 
				+ * cbm_ensure_valid - Enforce validity on provided CBM
			
 
				+ * @_val:	Candidate CBM
			
 
				+ * @r:		RDT resource to which the CBM belongs
			
 
				+ *
			
 
				+ * The provided CBM represents all cache portions available for use. This
			
 
				+ * may be represented by a bitmap that does not consist of contiguous ones
			
 
				+ * and thus be an invalid CBM.
			
 
				+ * Here the provided CBM is forced to be a valid CBM by only considering
			
 
				+ * the first set of contiguous bits as valid and clearing all bits.
			
 
				+ * The intention here is to provide a valid default CBM with which a new
			
 
				+ * resource group is initialized. The user can follow this with a
			
 
				+ * modification to the CBM if the default does not satisfy the
			
 
				+ * requirements.
			
 
				+ */
			
 
				+static void cbm_ensure_valid(u32 *_val, struct rdt_resource *r)
			
 
				+{
			
 
				+	/*
			
 
				+	 * Convert the u32 _val to an unsigned long required by all the bit
			
 
				+	 * operations within this function. No more than 32 bits of this
			
 
				+	 * converted value can be accessed because all bit operations are
			
 
				+	 * additionally provided with cbm_len that is initialized during
			
 
				+	 * hardware enumeration using five bits from the EAX register and
			
 
				+	 * thus never can exceed 32 bits.
			
 
				+	 */
			
 
				+	unsigned long *val = (unsigned long *)_val;
			
 
				+	unsigned int cbm_len = r->cache.cbm_len;
			
 
				+	unsigned long first_bit, zero_bit;
			
 
				+
			
 
				+	if (*val == 0)
			
 
				+		return;
			
 
				+
			
 
				+	first_bit = find_first_bit(val, cbm_len);
			
 
				+	zero_bit = find_next_zero_bit(val, cbm_len, first_bit);
			
 
				+
			
 
				+	/* Clear any remaining bits to ensure contiguous region */
			
 
				+	bitmap_clear(val, zero_bit, cbm_len - zero_bit);
			
 
				+}
			
 
				+
			
 
				+/**
			
 
				+ * rdtgroup_init_alloc - Initialize the new RDT group's allocations
			
 
				+ *
			
 
				+ * A new RDT group is being created on an allocation capable (CAT)
			
 
				+ * supporting system. Set this group up to start off with all usable
			
 
				+ * allocations. That is, all shareable and unused bits.
			
 
				+ *
			
 
				+ * All-zero CBM is invalid. If there are no more shareable bits available
			
 
				+ * on any domain then the entire allocation will fail.
			
 
				+ */
			
 
				+static int rdtgroup_init_alloc(struct rdtgroup *rdtgrp)
			
 
				+{
			
 
				+	u32 used_b = 0, unused_b = 0;
			
 
				+	u32 closid = rdtgrp->closid;
			
 
				+	struct rdt_resource *r;
			
 
				+	enum rdtgrp_mode mode;
			
 
				+	struct rdt_domain *d;
			
 
				+	int i, ret;
			
 
				+	u32 *ctrl;
			
 
				+
			
 
				+	for_each_alloc_enabled_rdt_resource(r) {
			
 
				+		list_for_each_entry(d, &r->domains, list) {
			
 
				+			d->have_new_ctrl = false;
			
 
				+			d->new_ctrl = r->cache.shareable_bits;
			
 
				+			used_b = r->cache.shareable_bits;
			
 
				+			ctrl = d->ctrl_val;
			
 
				+			for (i = 0; i < r->num_closid; i++, ctrl++) {
			
 
				+				if (closid_allocated(i) && i != closid) {
			
 
				+					mode = rdtgroup_mode_by_closid(i);
			
 
				+					if (mode == RDT_MODE_PSEUDO_LOCKSETUP)
			
 
				+						break;
			
 
				+					used_b |= *ctrl;
			
 
				+					if (mode == RDT_MODE_SHAREABLE)
			
 
				+						d->new_ctrl |= *ctrl;
			
 
				+				}
			
 
				+			}
			
 
				+			if (d->plr && d->plr->cbm > 0)
			
 
				+				used_b |= d->plr->cbm;
			
 
				+			unused_b = used_b ^ (BIT_MASK(r->cache.cbm_len) - 1);
			
 
				+			unused_b &= BIT_MASK(r->cache.cbm_len) - 1;
			
 
				+			d->new_ctrl |= unused_b;
			
 
				+			/*
			
 
				+			 * Force the initial CBM to be valid, user can
			
 
				+			 * modify the CBM based on system availability.
			
 
				+			 */
			
 
				+			cbm_ensure_valid(&d->new_ctrl, r);
			
 
				+			if (bitmap_weight((unsigned long *) &d->new_ctrl,
			
 
				+					  r->cache.cbm_len) <
			
 
				+					r->cache.min_cbm_bits) {
			
 
				+				rdt_last_cmd_printf("no space on %s:%d\n",
			
 
				+						    r->name, d->id);
			
 
				+				return -ENOSPC;
			
 
				+			}
			
 
				+			d->have_new_ctrl = true;
			
 
				+		}
			
 
				+	}
			
 
				+
			
 
				+	for_each_alloc_enabled_rdt_resource(r) {
			
 
				+		ret = update_domains(r, rdtgrp->closid);
			
 
				+		if (ret < 0) {
			
 
				+			rdt_last_cmd_puts("failed to initialize allocations\n");
			
 
				+			return ret;
			
 
				+		}
			
 
				+		rdtgrp->mode = RDT_MODE_SHAREABLE;
			
 
				+	}
			
 
				+
			
 
				+	return 0;
			
 
				+}
			
 
				+
			
 
				 static int mkdir_rdt_prepare(struct kernfs_node *parent_kn,
			
 
				 			     struct kernfs_node *prgrp_kn,
			
 
				 			     const char *name, umode_t mode,
			
@@ -1700,6 +2400,14 @@ static int mkdir_rdt_prepare(struct kernfs_node *parent_kn,
 
				 		goto out_unlock;
			
 
				 	}
			
 
				 
			
 
				+	if (rtype == RDTMON_GROUP &&
			
 
				+	    (prdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP ||
			
 
				+	     prdtgrp->mode == RDT_MODE_PSEUDO_LOCKED)) {
			
 
				+		ret = -EINVAL;
			
 
				+		rdt_last_cmd_puts("pseudo-locking in progress\n");
			
 
				+		goto out_unlock;
			
 
				+	}
			
 
				+
			
 
				 	/* allocate the rdtgroup. */
			
 
				 	rdtgrp = kzalloc(sizeof(*rdtgrp), GFP_KERNEL);
			
 
				 	if (!rdtgrp) {
			
@@ -1840,6 +2548,10 @@ static int rdtgroup_mkdir_ctrl_mon(struct kernfs_node *parent_kn,
 
				 	ret = 0;
			
 
				 
			
 
				 	rdtgrp->closid = closid;
			
 
				+	ret = rdtgroup_init_alloc(rdtgrp);
			
 
				+	if (ret < 0)
			
 
				+		goto out_id_free;
			
 
				+
			
 
				 	list_add(&rdtgrp->rdtgroup_list, &rdt_all_groups);
			
 
				 
			
 
				 	if (rdt_mon_capable) {
			
@@ -1850,15 +2562,16 @@ static int rdtgroup_mkdir_ctrl_mon(struct kernfs_node *parent_kn,
 
				 		ret = mongroup_create_dir(kn, NULL, "mon_groups", NULL);
			
 
				 		if (ret) {
			
 
				 			rdt_last_cmd_puts("kernfs subdir error\n");
			
 
				-			goto out_id_free;
			
 
				+			goto out_del_list;
			
 
				 		}
			
 
				 	}
			
 
				 
			
 
				 	goto out_unlock;
			
 
				 
			
 
				+out_del_list:
			
 
				+	list_del(&rdtgrp->rdtgroup_list);
			
 
				 out_id_free:
			
 
				 	closid_free(closid);
			
 
				-	list_del(&rdtgrp->rdtgroup_list);
			
 
				 out_common_fail:
			
 
				 	mkdir_rdt_prepare_clean(rdtgrp);
			
 
				 out_unlock:
			
@@ -1945,6 +2658,21 @@ static int rdtgroup_rmdir_mon(struct kernfs_node *kn, struct rdtgroup *rdtgrp,
 
				 	return 0;
			
 
				 }
			
 
				 
			
 
				+static int rdtgroup_ctrl_remove(struct kernfs_node *kn,
			
 
				+				struct rdtgroup *rdtgrp)
			
 
				+{
			
 
				+	rdtgrp->flags = RDT_DELETED;
			
 
				+	list_del(&rdtgrp->rdtgroup_list);
			
 
				+
			
 
				+	/*
			
 
				+	 * one extra hold on this, will drop when we kfree(rdtgrp)
			
 
				+	 * in rdtgroup_kn_unlock()
			
 
				+	 */
			
 
				+	kernfs_get(kn);
			
 
				+	kernfs_remove(rdtgrp->kn);
			
 
				+	return 0;
			
 
				+}
			
 
				+
			
 
				 static int rdtgroup_rmdir_ctrl(struct kernfs_node *kn, struct rdtgroup *rdtgrp,
			
 
				 			       cpumask_var_t tmpmask)
			
 
				 {
			
@@ -1970,7 +2698,6 @@ static int rdtgroup_rmdir_ctrl(struct kernfs_node *kn, struct rdtgroup *rdtgrp,
 
				 	cpumask_or(tmpmask, tmpmask, &rdtgrp->cpu_mask);
			
 
				 	update_closid_rmid(tmpmask, NULL);
			
 
				 
			
 
				-	rdtgrp->flags = RDT_DELETED;
			
 
				 	closid_free(rdtgrp->closid);
			
 
				 	free_rmid(rdtgrp->mon.rmid);
			
 
				 
			
@@ -1979,14 +2706,7 @@ static int rdtgroup_rmdir_ctrl(struct kernfs_node *kn, struct rdtgroup *rdtgrp,
 
				 	 */
			
 
				 	free_all_child_rdtgrp(rdtgrp);
			
 
				 
			
 
				-	list_del(&rdtgrp->rdtgroup_list);
			
 
				-
			
 
				-	/*
			
 
				-	 * one extra hold on this, will drop when we kfree(rdtgrp)
			
 
				-	 * in rdtgroup_kn_unlock()
			
 
				-	 */
			
 
				-	kernfs_get(kn);
			
 
				-	kernfs_remove(rdtgrp->kn);
			
 
				+	rdtgroup_ctrl_remove(kn, rdtgrp);
			
 
				 
			
 
				 	return 0;
			
 
				 }
			
@@ -2014,13 +2734,19 @@ static int rdtgroup_rmdir(struct kernfs_node *kn)
 
				 	 * If the rdtgroup is a mon group and parent directory
			
 
				 	 * is a valid "mon_groups" directory, remove the mon group.
			
 
				 	 */
			
 
				-	if (rdtgrp->type == RDTCTRL_GROUP && parent_kn == rdtgroup_default.kn)
			
 
				-		ret = rdtgroup_rmdir_ctrl(kn, rdtgrp, tmpmask);
			
 
				-	else if (rdtgrp->type == RDTMON_GROUP &&
			
 
				-		 is_mon_groups(parent_kn, kn->name))
			
 
				+	if (rdtgrp->type == RDTCTRL_GROUP && parent_kn == rdtgroup_default.kn) {
			
 
				+		if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP ||
			
 
				+		    rdtgrp->mode == RDT_MODE_PSEUDO_LOCKED) {
			
 
				+			ret = rdtgroup_ctrl_remove(kn, rdtgrp);
			
 
				+		} else {
			
 
				+			ret = rdtgroup_rmdir_ctrl(kn, rdtgrp, tmpmask);
			
 
				+		}
			
 
				+	} else if (rdtgrp->type == RDTMON_GROUP &&
			
 
				+		 is_mon_groups(parent_kn, kn->name)) {
			
 
				 		ret = rdtgroup_rmdir_mon(kn, rdtgrp, tmpmask);
			
 
				-	else
			
 
				+	} else {
			
 
				 		ret = -EPERM;
			
 
				+	}
			
 
				 
			
 
				 out:
			
 
				 	rdtgroup_kn_unlock(kn);
			
@@ -2046,7 +2772,8 @@ static int __init rdtgroup_setup_root(void)
 
				 	int ret;
			
 
				 
			
 
				 	rdt_root = kernfs_create_root(&rdtgroup_kf_syscall_ops,
			
 
				-				      KERNFS_ROOT_CREATE_DEACTIVATED,
			
 
				+				      KERNFS_ROOT_CREATE_DEACTIVATED |
			
 
				+				      KERNFS_ROOT_EXTRA_OPEN_PERM_CHECK,
			
 
				 				      &rdtgroup_default);
			
 
				 	if (IS_ERR(rdt_root))
			
 
				 		return PTR_ERR(rdt_root);
			
@@ -2102,6 +2829,29 @@ int __init rdtgroup_init(void)
 
				 	if (ret)
			
 
				 		goto cleanup_mountpoint;
			
 
				 
			
 
				+	/*
			
 
				+	 * Adding the resctrl debugfs directory here may not be ideal since
			
 
				+	 * it would let the resctrl debugfs directory appear on the debugfs
			
 
				+	 * filesystem before the resctrl filesystem is mounted.
			
 
				+	 * It may also be ok since that would enable debugging of RDT before
			
 
				+	 * resctrl is mounted.
			
 
				+	 * The reason why the debugfs directory is created here and not in
			
 
				+	 * rdt_mount() is because rdt_mount() takes rdtgroup_mutex and
			
 
				+	 * during the debugfs directory creation also &sb->s_type->i_mutex_key
			
 
				+	 * (the lockdep class of inode->i_rwsem). Other filesystem
			
 
				+	 * interactions (eg. SyS_getdents) have the lock ordering:
			
 
				+	 * &sb->s_type->i_mutex_key --> &mm->mmap_sem
			
 
				+	 * During mmap(), called with &mm->mmap_sem, the rdtgroup_mutex
			
 
				+	 * is taken, thus creating dependency:
			
 
				+	 * &mm->mmap_sem --> rdtgroup_mutex for the latter that can cause
			
 
				+	 * issues considering the other two lock dependencies.
			
 
				+	 * By creating the debugfs directory here we avoid a dependency
			
 
				+	 * that may cause deadlock (even though file operations cannot
			
 
				+	 * occur until the filesystem is mounted, but I do not know how to
			
 
				+	 * tell lockdep that).
			
 
				+	 */
			
 
				+	debugfs_resctrl = debugfs_create_dir("resctrl", NULL);
			
 
				+
			
 
				 	return 0;
			
 
				 
			
 
				 cleanup_mountpoint:
			
@@ -2111,3 +2861,11 @@ cleanup_root:
 
				 
			
 
				 	return ret;
			
 
				 }
			
 
				+
			
 
				+void __exit rdtgroup_exit(void)
			
 
				+{
			
 
				+	debugfs_remove_recursive(debugfs_resctrl);
			
 
				+	unregister_filesystem(&rdt_fs_type);
			
 
				+	sysfs_remove_mount_point(fs_kobj, "resctrl");
			
 
				+	kernfs_destroy_root(rdt_root);
			
 
				+}