
Merge 4.15-rc6 into driver-core-next

We want the fixes in here as well.

Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Greg Kroah-Hartman 7 years ago
commit 8c9076b07c
100 files changed, 1325 additions and 1216 deletions
  1. 1 0
      Documentation/admin-guide/kernel-parameters.rst
  2. 15 3
      Documentation/admin-guide/kernel-parameters.txt
  3. 1 1
      Documentation/admin-guide/thunderbolt.rst
  4. 1 0
      Documentation/arm64/silicon-errata.txt
  5. 7 0
      Documentation/cgroup-v2.txt
  6. 0 2
      Documentation/devicetree/bindings/mtd/jedec,spi-nor.txt
  7. 1 1
      Documentation/devicetree/bindings/sound/da7218.txt
  8. 1 1
      Documentation/devicetree/bindings/sound/da7219.txt
  9. 12 6
      Documentation/devicetree/bindings/spi/fsl-imx-cspi.txt
  10. 34 0
      Documentation/filesystems/overlayfs.txt
  11. 0 874
      Documentation/locking/crossrelease.txt
  12. 21 1
      Documentation/vm/zswap.txt
  13. 14 15
      Documentation/x86/x86_64/mm.txt
  14. 10 10
      MAINTAINERS
  15. 4 1
      Makefile
  16. 2 2
      arch/arm/boot/dts/vf610-zii-dev-rev-c.dts
  17. 4 0
      arch/arm/lib/csumpartialcopyuser.S
  18. 11 1
      arch/arm64/Kconfig
  19. 10 0
      arch/arm64/include/asm/assembler.h
  20. 3 0
      arch/arm64/include/asm/cpufeature.h
  21. 2 0
      arch/arm64/include/asm/cputype.h
  22. 22 19
      arch/arm64/include/asm/pgtable.h
  23. 1 0
      arch/arm64/kernel/cpu-reset.S
  24. 2 1
      arch/arm64/kernel/cpufeature.c
  25. 2 0
      arch/arm64/kernel/efi-entry.S
  26. 1 1
      arch/arm64/kernel/fpsimd.c
  27. 1 0
      arch/arm64/kernel/head.S
  28. 1 1
      arch/arm64/kernel/hw_breakpoint.c
  29. 1 0
      arch/arm64/kernel/relocate_kernel.S
  30. 1 0
      arch/arm64/kvm/hyp-init.S
  31. 3 0
      arch/arm64/kvm/hyp/debug-sr.c
  32. 1 1
      arch/arm64/mm/dump.c
  33. 2 3
      arch/arm64/mm/fault.c
  34. 2 1
      arch/arm64/mm/init.c
  35. 2 2
      arch/parisc/boot/compressed/misc.c
  36. 5 0
      arch/parisc/include/asm/thread_info.h
  37. 9 3
      arch/parisc/kernel/entry.S
  38. 1 0
      arch/parisc/kernel/hpmc.S
  39. 0 1
      arch/parisc/kernel/unwind.c
  40. 0 2
      arch/parisc/lib/delay.c
  41. 3 2
      arch/powerpc/include/asm/mmu_context.h
  42. 1 1
      arch/powerpc/kernel/process.c
  43. 4 3
      arch/powerpc/kvm/book3s_xive.c
  44. 4 2
      arch/powerpc/net/bpf_jit_comp64.c
  45. 6 2
      arch/powerpc/perf/core-book3s.c
  46. 16 1
      arch/powerpc/perf/imc-pmu.c
  47. 3 1
      arch/powerpc/sysdev/fsl_msi.c
  48. 19 0
      arch/riscv/include/asm/barrier.h
  49. 0 11
      arch/riscv/kernel/setup.c
  50. 1 1
      arch/riscv/kernel/sys_riscv.c
  51. 0 6
      arch/s390/include/asm/pgtable.h
  52. 1 0
      arch/s390/kernel/compat_linux.c
  53. 5 6
      arch/s390/net/bpf_jit_comp.c
  54. 2 2
      arch/sparc/lib/hweight.S
  55. 1 1
      arch/sparc/mm/fault_32.c
  56. 1 1
      arch/sparc/mm/fault_64.c
  57. 2 2
      arch/sparc/mm/gup.c
  58. 4 2
      arch/sparc/net/bpf_jit_comp_64.c
  59. 1 0
      arch/um/include/asm/Kbuild
  60. 2 1
      arch/um/include/asm/mmu_context.h
  61. 1 1
      arch/um/kernel/trap.c
  62. 3 2
      arch/unicore32/include/asm/mmu_context.h
  63. 2 1
      arch/x86/Kconfig
  64. 1 0
      arch/x86/Kconfig.debug
  65. 1 0
      arch/x86/boot/compressed/Makefile
  66. 12 4
      arch/x86/boot/compressed/head_64.S
  67. 16 0
      arch/x86/boot/compressed/misc.c
  68. 3 0
      arch/x86/boot/compressed/pagetable.c
  69. 28 0
      arch/x86/boot/compressed/pgtable_64.c
  70. 19 13
      arch/x86/boot/genimage.sh
  71. 0 7
      arch/x86/crypto/salsa20_glue.c
  72. 145 0
      arch/x86/entry/calling.h
  73. 8 6
      arch/x86/entry/entry_32.S
  74. 205 32
      arch/x86/entry/entry_64.S
  75. 28 3
      arch/x86/entry/entry_64_compat.S
  76. 37 1
      arch/x86/entry/vsyscall/vsyscall_64.c
  77. 4 1
      arch/x86/events/intel/core.c
  78. 83 47
      arch/x86/events/intel/ds.c
  79. 4 19
      arch/x86/events/perf_event.h
  80. 2 0
      arch/x86/include/asm/asm.h
  81. 81 0
      arch/x86/include/asm/cpu_entry_area.h
  82. 2 0
      arch/x86/include/asm/cpufeature.h
  83. 3 1
      arch/x86/include/asm/cpufeatures.h
  84. 5 9
      arch/x86/include/asm/desc.h
  85. 7 1
      arch/x86/include/asm/disabled-features.h
  86. 4 3
      arch/x86/include/asm/espfix.h
  87. 1 6
      arch/x86/include/asm/fixmap.h
  88. 15 10
      arch/x86/include/asm/hypervisor.h
  89. 36 0
      arch/x86/include/asm/intel_ds.h
  90. 53 0
      arch/x86/include/asm/invpcid.h
  91. 1 1
      arch/x86/include/asm/irqdomain.h
  92. 3 0
      arch/x86/include/asm/irqflags.h
  93. 1 0
      arch/x86/include/asm/kdebug.h
  94. 3 1
      arch/x86/include/asm/mmu.h
  95. 71 42
      arch/x86/include/asm/mmu_context.h
  96. 9 0
      arch/x86/include/asm/paravirt.h
  97. 11 0
      arch/x86/include/asm/pgalloc.h
  98. 26 4
      arch/x86/include/asm/pgtable.h
  99. 12 3
      arch/x86/include/asm/pgtable_32_types.h
  100. 92 0
      arch/x86/include/asm/pgtable_64.h

+ 1 - 0
Documentation/admin-guide/kernel-parameters.rst

@@ -109,6 +109,7 @@ parameter is applicable::
 	IPV6	IPv6 support is enabled.
 	ISAPNP	ISA PnP code is enabled.
 	ISDN	Appropriate ISDN support is enabled.
+	ISOL	CPU Isolation is enabled.
 	JOY	Appropriate joystick support is enabled.
 	KGDB	Kernel debugger support is enabled.
 	KVM	Kernel Virtual Machine support is enabled.

+ 15 - 3
Documentation/admin-guide/kernel-parameters.txt

@@ -328,11 +328,15 @@
 			not play well with APC CPU idle - disable it if you have
 			APC and your system crashes randomly.
 
-	apic=		[APIC,X86-32] Advanced Programmable Interrupt Controller
+	apic=		[APIC,X86] Advanced Programmable Interrupt Controller
 			Change the output verbosity whilst booting
 			Format: { quiet (default) | verbose | debug }
 			Change the amount of debugging information output
 			when initialising the APIC and IO-APIC components.
+			For X86-32, this can also be used to specify an APIC
+			driver name.
+			Format: apic=driver_name
+			Examples: apic=bigsmp
 
 	apic_extnmi=	[APIC,X86] External NMI delivery setting
 			Format: { bsp (default) | all | none }
@@ -1737,7 +1741,7 @@
 	isapnp=		[ISAPNP]
 			Format: <RDP>,<reset>,<pci_scan>,<verbosity>
 
-	isolcpus=	[KNL,SMP] Isolate a given set of CPUs from disturbance.
+	isolcpus=	[KNL,SMP,ISOL] Isolate a given set of CPUs from disturbance.
 			[Deprecated - use cpusets instead]
 			Format: [flag-list,]<cpu-list>
 
@@ -2662,7 +2666,7 @@
 			Valid arguments: on, off
 			Default: on
 
-	nohz_full=	[KNL,BOOT]
+	nohz_full=	[KNL,BOOT,SMP,ISOL]
 			The argument is a cpu list, as described above.
 			In kernels built with CONFIG_NO_HZ_FULL=y, set
 			the specified list of CPUs whose tick will be stopped
@@ -2708,6 +2712,8 @@
 			steal time is computed, but won't influence scheduler
 			behaviour
 
+	nopti		[X86-64] Disable kernel page table isolation
+
 	nolapic		[X86-32,APIC] Do not enable or use the local APIC.
 
 	nolapic_timer	[X86-32,APIC] Do not use the local APIC timer.
@@ -3282,6 +3288,12 @@
 	pt.		[PARIDE]
 			See Documentation/blockdev/paride.txt.
 
+	pti=		[X86_64]
+			Control user/kernel address space isolation:
+			on - enable
+			off - disable
+			auto - default setting
+
 	pty.legacy_count=
 			[KNL] Number of legacy pty's. Overwrites compiled-in
 			default number.
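
As an aside (not part of this commit), the new nopti/pti= switches documented above are ordinary kernel command line options; the hedged C sketch below merely scans /proc/cmdline for them from userspace. Only the /proc/cmdline path is standard kernel behaviour; everything else here is illustrative.

/* Illustrative only: report whether the running kernel was booted with
 * "nopti" or "pti=off" (naive substring match on the command line). */
#include <stdio.h>
#include <string.h>

int main(void)
{
    char cmdline[4096] = "";
    FILE *f = fopen("/proc/cmdline", "r");

    if (!f) {
        perror("/proc/cmdline");
        return 1;
    }
    if (!fgets(cmdline, sizeof(cmdline), f))
        cmdline[0] = '\0';
    fclose(f);

    if (strstr(cmdline, "nopti") || strstr(cmdline, "pti=off"))
        printf("page table isolation disabled on the command line\n");
    else
        printf("no pti override found\n");
    return 0;
}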

+ 1 - 1
Documentation/admin-guide/thunderbolt.rst

@@ -230,7 +230,7 @@ If supported by your machine this will be exposed by the WMI bus with
 a sysfs attribute called "force_power".
 
 For example the intel-wmi-thunderbolt driver exposes this attribute in:
-  /sys/devices/platform/PNP0C14:00/wmi_bus/wmi_bus-PNP0C14:00/86CCFD48-205E-4A77-9C48-2021CBEDE341/force_power
+  /sys/bus/wmi/devices/86CCFD48-205E-4A77-9C48-2021CBEDE341/force_power
 
   To force the power to on, write 1 to this attribute file.
   To disable force power, write 0 to this attribute file.

+ 1 - 0
Documentation/arm64/silicon-errata.txt

@@ -75,3 +75,4 @@ stable kernels.
 | Qualcomm Tech. | Falkor v1       | E1003           | QCOM_FALKOR_ERRATUM_1003    |
 | Qualcomm Tech. | Falkor v1       | E1009           | QCOM_FALKOR_ERRATUM_1009    |
 | Qualcomm Tech. | QDF2400 ITS     | E0065           | QCOM_QDF2400_ERRATUM_0065   |
+| Qualcomm Tech. | Falkor v{1,2}   | E1041           | QCOM_FALKOR_ERRATUM_1041    |

+ 7 - 0
Documentation/cgroup-v2.txt

@@ -898,6 +898,13 @@ controller implements weight and absolute bandwidth limit models for
 normal scheduling policy and absolute bandwidth allocation model for
 realtime scheduling policy.
 
+WARNING: cgroup2 doesn't yet support control of realtime processes and
+the cpu controller can only be enabled when all RT processes are in
+the root cgroup.  Be aware that system management software may already
+have placed RT processes into nonroot cgroups during the system boot
+process, and these processes may need to be moved to the root cgroup
+before the cpu controller can be enabled.
+
 
 CPU Interface Files
 ~~~~~~~~~~~~~~~~~~~
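
As an aside (not part of this commit), the warning above can be seen in practice with the hedged C sketch below: it tries to enable the cpu controller by writing "+cpu" to cgroup.subtree_control on a cgroup2 hierarchy assumed to be mounted at /sys/fs/cgroup, and the write is expected to fail while realtime processes still live outside the root cgroup.

/* Illustrative only: attempt to enable the cpu controller for child cgroups. */
#include <stdio.h>

int main(void)
{
    const char *path = "/sys/fs/cgroup/cgroup.subtree_control";
    FILE *f = fopen(path, "w");

    if (!f) {
        perror(path);
        return 1;
    }
    /* The write fails while RT tasks sit in non-root cgroups, per the warning. */
    if (fprintf(f, "+cpu\n") < 0 || fclose(f) != 0) {
        perror(path);
        return 1;
    }
    printf("cpu controller enabled\n");
    return 0;
}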

+ 0 - 2
Documentation/devicetree/bindings/mtd/jedec,spi-nor.txt

@@ -13,7 +13,6 @@ Required properties:
                  at25df321a
                  at25df641
                  at26df081a
-                 en25s64
                  mr25h128
                  mr25h256
                  mr25h10
@@ -33,7 +32,6 @@ Required properties:
                  s25fl008k
                  s25fl064k
                  sst25vf040b
-                 sst25wf040b
                  m25p40
                  m25p80
                  m25p16

+ 1 - 1
Documentation/devicetree/bindings/sound/da7218.txt

@@ -73,7 +73,7 @@ Example:
 		compatible = "dlg,da7218";
 		reg = <0x1a>;
 		interrupt-parent = <&gpio6>;
-		interrupts = <11 IRQ_TYPE_LEVEL_HIGH>;
+		interrupts = <11 IRQ_TYPE_LEVEL_LOW>;
 		wakeup-source;
 
 		VDD-supply = <&reg_audio>;

+ 1 - 1
Documentation/devicetree/bindings/sound/da7219.txt

@@ -77,7 +77,7 @@ Example:
 		reg = <0x1a>;
 
 		interrupt-parent = <&gpio6>;
-		interrupts = <11 IRQ_TYPE_LEVEL_HIGH>;
+		interrupts = <11 IRQ_TYPE_LEVEL_LOW>;
 
 		VDD-supply = <&reg_audio>;
 		VDDMIC-supply = <&reg_audio>;

+ 12 - 6
Documentation/devicetree/bindings/spi/fsl-imx-cspi.txt

@@ -12,24 +12,30 @@ Required properties:
   - "fsl,imx53-ecspi" for SPI compatible with the one integrated on i.MX53 and later Soc
 - reg : Offset and length of the register set for the device
 - interrupts : Should contain CSPI/eCSPI interrupt
-- cs-gpios : Specifies the gpio pins to be used for chipselects.
 - clocks : Clock specifiers for both ipg and per clocks.
 - clock-names : Clock names should include both "ipg" and "per"
 See the clock consumer binding,
 	Documentation/devicetree/bindings/clock/clock-bindings.txt
-- dmas: DMA specifiers for tx and rx dma. See the DMA client binding,
-		Documentation/devicetree/bindings/dma/dma.txt
-- dma-names: DMA request names should include "tx" and "rx" if present.
 
-Obsolete properties:
-- fsl,spi-num-chipselects : Contains the number of the chipselect
+Recommended properties:
+- cs-gpios : GPIOs to use as chip selects, see spi-bus.txt.  While the native chip
+select lines can be used, they appear to always generate a pulse between each
+word of a transfer.  Most use cases will require GPIO based chip selects to
+generate a valid transaction.
 
 Optional properties:
+- num-cs :  Number of total chip selects, see spi-bus.txt.
+- dmas: DMA specifiers for tx and rx dma. See the DMA client binding,
+Documentation/devicetree/bindings/dma/dma.txt.
+- dma-names: DMA request names, if present, should include "tx" and "rx".
 - fsl,spi-rdy-drctl: Integer, representing the value of DRCTL, the register
 controlling the SPI_READY handling. Note that to enable the DRCTL consideration,
 the SPI_READY mode-flag needs to be set too.
 Valid values are: 0 (disabled), 1 (edge-triggered burst) and 2 (level-triggered burst).
 
+Obsolete properties:
+- fsl,spi-num-chipselects : Contains the number of the chipselect
+
 Example:
 
 ecspi@70010000 {

+ 34 - 0
Documentation/filesystems/overlayfs.txt

@@ -156,6 +156,40 @@ handle it in two different ways:
    root of the overlay.  Finally the directory is moved to the new
    location.
 
+There are several ways to tune the "redirect_dir" feature.
+
+Kernel config options:
+
+- OVERLAY_FS_REDIRECT_DIR:
+    If this is enabled, then redirect_dir is turned on by  default.
+- OVERLAY_FS_REDIRECT_ALWAYS_FOLLOW:
+    If this is enabled, then redirects are always followed by default. Enabling
+    this results in a less secure configuration.  Enable this option only when
+    worried about backward compatibility with kernels that have the redirect_dir
+    feature and follow redirects even if turned off.
+
+Module options (can also be changed through /sys/module/overlay/parameters/*):
+
+- "redirect_dir=BOOL":
+    See OVERLAY_FS_REDIRECT_DIR kernel config option above.
+- "redirect_always_follow=BOOL":
+    See OVERLAY_FS_REDIRECT_ALWAYS_FOLLOW kernel config option above.
+- "redirect_max=NUM":
+    The maximum number of bytes in an absolute redirect (default is 256).
+
+Mount options:
+
+- "redirect_dir=on":
+    Redirects are enabled.
+- "redirect_dir=follow":
+    Redirects are not created, but followed.
+- "redirect_dir=off":
+    Redirects are not created and only followed if "redirect_always_follow"
+    feature is enabled in the kernel/module config.
+- "redirect_dir=nofollow":
+    Redirects are not created and not followed (equivalent to "redirect_dir=off"
+    if "redirect_always_follow" feature is not enabled).
+
 Non-directories
 ---------------
 
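
As an aside (not part of this commit), the mount options documented above can be exercised with the hedged C sketch below, which mounts an overlay with redirect_dir=on through mount(2); the /lower, /upper, /work and /merged paths are made-up placeholders.

/* Illustrative only: overlay mount with the redirect_dir feature enabled. */
#include <stdio.h>
#include <sys/mount.h>

int main(void)
{
    const char *opts =
        "lowerdir=/lower,upperdir=/upper,workdir=/work,redirect_dir=on";

    if (mount("overlay", "/merged", "overlay", 0, opts) != 0) {
        perror("mount");
        return 1;
    }
    printf("overlay mounted with redirect_dir=on\n");
    return 0;
}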

+ 0 - 874
Documentation/locking/crossrelease.txt

@@ -1,874 +0,0 @@
-Crossrelease
-============
-
-Started by Byungchul Park <byungchul.park@lge.com>
-
-Contents:
-
- (*) Background
-
-     - What causes deadlock
-     - How lockdep works
-
- (*) Limitation
-
-     - Limit lockdep
-     - Pros from the limitation
-     - Cons from the limitation
-     - Relax the limitation
-
- (*) Crossrelease
-
-     - Introduce crossrelease
-     - Introduce commit
-
- (*) Implementation
-
-     - Data structures
-     - How crossrelease works
-
- (*) Optimizations
-
-     - Avoid duplication
-     - Lockless for hot paths
-
- (*) APPENDIX A: What lockdep does to work aggresively
-
- (*) APPENDIX B: How to avoid adding false dependencies
-
-
-==========
-Background
-==========
-
-What causes deadlock
---------------------
-
-A deadlock occurs when a context is waiting for an event to happen,
-which is impossible because another (or the) context who can trigger the
-event is also waiting for another (or the) event to happen, which is
-also impossible due to the same reason.
-
-For example:
-
-   A context going to trigger event C is waiting for event A to happen.
-   A context going to trigger event A is waiting for event B to happen.
-   A context going to trigger event B is waiting for event C to happen.
-
-A deadlock occurs when these three wait operations run at the same time,
-because event C cannot be triggered if event A does not happen, which in
-turn cannot be triggered if event B does not happen, which in turn
-cannot be triggered if event C does not happen. After all, no event can
-be triggered since any of them never meets its condition to wake up.
-
-A dependency might exist between two waiters and a deadlock might happen
-due to an incorrect releationship between dependencies. Thus, we must
-define what a dependency is first. A dependency exists between them if:
-
-   1. There are two waiters waiting for each event at a given time.
-   2. The only way to wake up each waiter is to trigger its event.
-   3. Whether one can be woken up depends on whether the other can.
-
-Each wait in the example creates its dependency like:
-
-   Event C depends on event A.
-   Event A depends on event B.
-   Event B depends on event C.
-
-   NOTE: Precisely speaking, a dependency is one between whether a
-   waiter for an event can be woken up and whether another waiter for
-   another event can be woken up. However from now on, we will describe
-   a dependency as if it's one between an event and another event for
-   simplicity.
-
-And they form circular dependencies like:
-
-    -> C -> A -> B -
-   /                \
-   \                /
-    ----------------
-
-   where 'A -> B' means that event A depends on event B.
-
-Such circular dependencies lead to a deadlock since no waiter can meet
-its condition to wake up as described.
-
-CONCLUSION
-
-Circular dependencies cause a deadlock.
-
-
-How lockdep works
------------------
-
-Lockdep tries to detect a deadlock by checking dependencies created by
-lock operations, acquire and release. Waiting for a lock corresponds to
-waiting for an event, and releasing a lock corresponds to triggering an
-event in the previous section.
-
-In short, lockdep does:
-
-   1. Detect a new dependency.
-   2. Add the dependency into a global graph.
-   3. Check if that makes dependencies circular.
-   4. Report a deadlock or its possibility if so.
-
-For example, consider a graph built by lockdep that looks like:
-
-   A -> B -
-           \
-            -> E
-           /
-   C -> D -
-
-   where A, B,..., E are different lock classes.
-
-Lockdep will add a dependency into the graph on detection of a new
-dependency. For example, it will add a dependency 'E -> C' when a new
-dependency between lock E and lock C is detected. Then the graph will be:
-
-       A -> B -
-               \
-                -> E -
-               /      \
-    -> C -> D -        \
-   /                   /
-   \                  /
-    ------------------
-
-   where A, B,..., E are different lock classes.
-
-This graph contains a subgraph which demonstrates circular dependencies:
-
-                -> E -
-               /      \
-    -> C -> D -        \
-   /                   /
-   \                  /
-    ------------------
-
-   where C, D and E are different lock classes.
-
-This is the condition under which a deadlock might occur. Lockdep
-reports it on detection after adding a new dependency. This is the way
-how lockdep works.
-
-CONCLUSION
-
-Lockdep detects a deadlock or its possibility by checking if circular
-dependencies were created after adding each new dependency.
-
-
-==========
-Limitation
-==========
-
-Limit lockdep
--------------
-
-Limiting lockdep to work on only typical locks e.g. spin locks and
-mutexes, which are released within the acquire context, the
-implementation becomes simple but its capacity for detection becomes
-limited. Let's check pros and cons in next section.
-
-
-Pros from the limitation
-------------------------
-
-Given the limitation, when acquiring a lock, locks in a held_locks
-cannot be released if the context cannot acquire it so has to wait to
-acquire it, which means all waiters for the locks in the held_locks are
-stuck. It's an exact case to create dependencies between each lock in
-the held_locks and the lock to acquire.
-
-For example:
-
-   CONTEXT X
-   ---------
-   acquire A
-   acquire B /* Add a dependency 'A -> B' */
-   release B
-   release A
-
-   where A and B are different lock classes.
-
-When acquiring lock A, the held_locks of CONTEXT X is empty thus no
-dependency is added. But when acquiring lock B, lockdep detects and adds
-a new dependency 'A -> B' between lock A in the held_locks and lock B.
-They can be simply added whenever acquiring each lock.
-
-And data required by lockdep exists in a local structure, held_locks
-embedded in task_struct. Forcing to access the data within the context,
-lockdep can avoid racy problems without explicit locks while handling
-the local data.
-
-Lastly, lockdep only needs to keep locks currently being held, to build
-a dependency graph. However, relaxing the limitation, it needs to keep
-even locks already released, because a decision whether they created
-dependencies might be long-deferred.
-
-To sum up, we can expect several advantages from the limitation:
-
-   1. Lockdep can easily identify a dependency when acquiring a lock.
-   2. Races are avoidable while accessing local locks in a held_locks.
-   3. Lockdep only needs to keep locks currently being held.
-
-CONCLUSION
-
-Given the limitation, the implementation becomes simple and efficient.
-
-
-Cons from the limitation
-------------------------
-
-Given the limitation, lockdep is applicable only to typical locks. For
-example, page locks for page access or completions for synchronization
-cannot work with lockdep.
-
-Can we detect deadlocks below, under the limitation?
-
-Example 1:
-
-   CONTEXT X	   CONTEXT Y	   CONTEXT Z
-   ---------	   ---------	   ----------
-		   mutex_lock A
-   lock_page B
-		   lock_page B
-				   mutex_lock A /* DEADLOCK */
-				   unlock_page B held by X
-		   unlock_page B
-		   mutex_unlock A
-				   mutex_unlock A
-
-   where A and B are different lock classes.
-
-No, we cannot.
-
-Example 2:
-
-   CONTEXT X		   CONTEXT Y
-   ---------		   ---------
-			   mutex_lock A
-   mutex_lock A
-			   wait_for_complete B /* DEADLOCK */
-   complete B
-			   mutex_unlock A
-   mutex_unlock A
-
-   where A is a lock class and B is a completion variable.
-
-No, we cannot.
-
-CONCLUSION
-
-Given the limitation, lockdep cannot detect a deadlock or its
-possibility caused by page locks or completions.
-
-
-Relax the limitation
---------------------
-
-Under the limitation, things to create dependencies are limited to
-typical locks. However, synchronization primitives like page locks and
-completions, which are allowed to be released in any context, also
-create dependencies and can cause a deadlock. So lockdep should track
-these locks to do a better job. We have to relax the limitation for
-these locks to work with lockdep.
-
-Detecting dependencies is very important for lockdep to work because
-adding a dependency means adding an opportunity to check whether it
-causes a deadlock. The more lockdep adds dependencies, the more it
-thoroughly works. Thus Lockdep has to do its best to detect and add as
-many true dependencies into a graph as possible.
-
-For example, considering only typical locks, lockdep builds a graph like:
-
-   A -> B -
-           \
-            -> E
-           /
-   C -> D -
-
-   where A, B,..., E are different lock classes.
-
-On the other hand, under the relaxation, additional dependencies might
-be created and added. Assuming additional 'FX -> C' and 'E -> GX' are
-added thanks to the relaxation, the graph will be:
-
-         A -> B -
-                 \
-                  -> E -> GX
-                 /
-   FX -> C -> D -
-
-   where A, B,..., E, FX and GX are different lock classes, and a suffix
-   'X' is added on non-typical locks.
-
-The latter graph gives us more chances to check circular dependencies
-than the former. However, it might suffer performance degradation since
-relaxing the limitation, with which design and implementation of lockdep
-can be efficient, might introduce inefficiency inevitably. So lockdep
-should provide two options, strong detection and efficient detection.
-
-Choosing efficient detection:
-
-   Lockdep works with only locks restricted to be released within the
-   acquire context. However, lockdep works efficiently.
-
-Choosing strong detection:
-
-   Lockdep works with all synchronization primitives. However, lockdep
-   suffers performance degradation.
-
-CONCLUSION
-
-Relaxing the limitation, lockdep can add additional dependencies giving
-additional opportunities to check circular dependencies.
-
-
-============
-Crossrelease
-============
-
-Introduce crossrelease
-----------------------
-
-In order to allow lockdep to handle additional dependencies by what
-might be released in any context, namely 'crosslock', we have to be able
-to identify those created by crosslocks. The proposed 'crossrelease'
-feature provoides a way to do that.
-
-Crossrelease feature has to do:
-
-   1. Identify dependencies created by crosslocks.
-   2. Add the dependencies into a dependency graph.
-
-That's all. Once a meaningful dependency is added into graph, then
-lockdep would work with the graph as it did. The most important thing
-crossrelease feature has to do is to correctly identify and add true
-dependencies into the global graph.
-
-A dependency e.g. 'A -> B' can be identified only in the A's release
-context because a decision required to identify the dependency can be
-made only in the release context. That is to decide whether A can be
-released so that a waiter for A can be woken up. It cannot be made in
-other than the A's release context.
-
-It's no matter for typical locks because each acquire context is same as
-its release context, thus lockdep can decide whether a lock can be
-released in the acquire context. However for crosslocks, lockdep cannot
-make the decision in the acquire context but has to wait until the
-release context is identified.
-
-Therefore, deadlocks by crosslocks cannot be detected just when it
-happens, because those cannot be identified until the crosslocks are
-released. However, deadlock possibilities can be detected and it's very
-worth. See 'APPENDIX A' section to check why.
-
-CONCLUSION
-
-Using crossrelease feature, lockdep can work with what might be released
-in any context, namely crosslock.
-
-
-Introduce commit
-----------------
-
-Since crossrelease defers the work adding true dependencies of
-crosslocks until they are actually released, crossrelease has to queue
-all acquisitions which might create dependencies with the crosslocks.
-Then it identifies dependencies using the queued data in batches at a
-proper time. We call it 'commit'.
-
-There are four types of dependencies:
-
-1. TT type: 'typical lock A -> typical lock B'
-
-   Just when acquiring B, lockdep can see it's in the A's release
-   context. So the dependency between A and B can be identified
-   immediately. Commit is unnecessary.
-
-2. TC type: 'typical lock A -> crosslock BX'
-
-   Just when acquiring BX, lockdep can see it's in the A's release
-   context. So the dependency between A and BX can be identified
-   immediately. Commit is unnecessary, too.
-
-3. CT type: 'crosslock AX -> typical lock B'
-
-   When acquiring B, lockdep cannot identify the dependency because
-   there's no way to know if it's in the AX's release context. It has
-   to wait until the decision can be made. Commit is necessary.
-
-4. CC type: 'crosslock AX -> crosslock BX'
-
-   When acquiring BX, lockdep cannot identify the dependency because
-   there's no way to know if it's in the AX's release context. It has
-   to wait until the decision can be made. Commit is necessary.
-   But, handling CC type is not implemented yet. It's a future work.
-
-Lockdep can work without commit for typical locks, but commit step is
-necessary once crosslocks are involved. Introducing commit, lockdep
-performs three steps. What lockdep does in each step is:
-
-1. Acquisition: For typical locks, lockdep does what it originally did
-   and queues the lock so that CT type dependencies can be checked using
-   it at the commit step. For crosslocks, it saves data which will be
-   used at the commit step and increases a reference count for it.
-
-2. Commit: No action is reauired for typical locks. For crosslocks,
-   lockdep adds CT type dependencies using the data saved at the
-   acquisition step.
-
-3. Release: No changes are required for typical locks. When a crosslock
-   is released, it decreases a reference count for it.
-
-CONCLUSION
-
-Crossrelease introduces commit step to handle dependencies of crosslocks
-in batches at a proper time.
-
-
-==============
-Implementation
-==============
-
-Data structures
----------------
-
-Crossrelease introduces two main data structures.
-
-1. hist_lock
-
-   This is an array embedded in task_struct, for keeping lock history so
-   that dependencies can be added using them at the commit step. Since
-   it's local data, it can be accessed locklessly in the owner context.
-   The array is filled at the acquisition step and consumed at the
-   commit step. And it's managed in circular manner.
-
-2. cross_lock
-
-   One per lockdep_map exists. This is for keeping data of crosslocks
-   and used at the commit step.
-
-
-How crossrelease works
-----------------------
-
-It's the key of how crossrelease works, to defer necessary works to an
-appropriate point in time and perform in at once at the commit step.
-Let's take a look with examples step by step, starting from how lockdep
-works without crossrelease for typical locks.
-
-   acquire A /* Push A onto held_locks */
-   acquire B /* Push B onto held_locks and add 'A -> B' */
-   acquire C /* Push C onto held_locks and add 'B -> C' */
-   release C /* Pop C from held_locks */
-   release B /* Pop B from held_locks */
-   release A /* Pop A from held_locks */
-
-   where A, B and C are different lock classes.
-
-   NOTE: This document assumes that readers already understand how
-   lockdep works without crossrelease thus omits details. But there's
-   one thing to note. Lockdep pretends to pop a lock from held_locks
-   when releasing it. But it's subtly different from the original pop
-   operation because lockdep allows other than the top to be poped.
-
-In this case, lockdep adds 'the top of held_locks -> the lock to acquire'
-dependency every time acquiring a lock.
-
-After adding 'A -> B', a dependency graph will be:
-
-   A -> B
-
-   where A and B are different lock classes.
-
-And after adding 'B -> C', the graph will be:
-
-   A -> B -> C
-
-   where A, B and C are different lock classes.
-
-Let's performs commit step even for typical locks to add dependencies.
-Of course, commit step is not necessary for them, however, it would work
-well because this is a more general way.
-
-   acquire A
-   /*
-    * Queue A into hist_locks
-    *
-    * In hist_locks: A
-    * In graph: Empty
-    */
-
-   acquire B
-   /*
-    * Queue B into hist_locks
-    *
-    * In hist_locks: A, B
-    * In graph: Empty
-    */
-
-   acquire C
-   /*
-    * Queue C into hist_locks
-    *
-    * In hist_locks: A, B, C
-    * In graph: Empty
-    */
-
-   commit C
-   /*
-    * Add 'C -> ?'
-    * Answer the following to decide '?'
-    * What has been queued since acquire C: Nothing
-    *
-    * In hist_locks: A, B, C
-    * In graph: Empty
-    */
-
-   release C
-
-   commit B
-   /*
-    * Add 'B -> ?'
-    * Answer the following to decide '?'
-    * What has been queued since acquire B: C
-    *
-    * In hist_locks: A, B, C
-    * In graph: 'B -> C'
-    */
-
-   release B
-
-   commit A
-   /*
-    * Add 'A -> ?'
-    * Answer the following to decide '?'
-    * What has been queued since acquire A: B, C
-    *
-    * In hist_locks: A, B, C
-    * In graph: 'B -> C', 'A -> B', 'A -> C'
-    */
-
-   release A
-
-   where A, B and C are different lock classes.
-
-In this case, dependencies are added at the commit step as described.
-
-After commits for A, B and C, the graph will be:
-
-   A -> B -> C
-
-   where A, B and C are different lock classes.
-
-   NOTE: A dependency 'A -> C' is optimized out.
-
-We can see the former graph built without commit step is same as the
-latter graph built using commit steps. Of course the former way leads to
-earlier finish for building the graph, which means we can detect a
-deadlock or its possibility sooner. So the former way would be prefered
-when possible. But we cannot avoid using the latter way for crosslocks.
-
-Let's look at how commit steps work for crosslocks. In this case, the
-commit step is performed only on crosslock AX as real. And it assumes
-that the AX release context is different from the AX acquire context.
-
-   BX RELEASE CONTEXT		   BX ACQUIRE CONTEXT
-   ------------------		   ------------------
-				   acquire A
-				   /*
-				    * Push A onto held_locks
-				    * Queue A into hist_locks
-				    *
-				    * In held_locks: A
-				    * In hist_locks: A
-				    * In graph: Empty
-				    */
-
-				   acquire BX
-				   /*
-				    * Add 'the top of held_locks -> BX'
-				    *
-				    * In held_locks: A
-				    * In hist_locks: A
-				    * In graph: 'A -> BX'
-				    */
-
-   ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-   It must be guaranteed that the following operations are seen after
-   acquiring BX globally. It can be done by things like barrier.
-   ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-   acquire C
-   /*
-    * Push C onto held_locks
-    * Queue C into hist_locks
-    *
-    * In held_locks: C
-    * In hist_locks: C
-    * In graph: 'A -> BX'
-    */
-
-   release C
-   /*
-    * Pop C from held_locks
-    *
-    * In held_locks: Empty
-    * In hist_locks: C
-    * In graph: 'A -> BX'
-    */
-				   acquire D
-				   /*
-				    * Push D onto held_locks
-				    * Queue D into hist_locks
-				    * Add 'the top of held_locks -> D'
-				    *
-				    * In held_locks: A, D
-				    * In hist_locks: A, D
-				    * In graph: 'A -> BX', 'A -> D'
-				    */
-   acquire E
-   /*
-    * Push E onto held_locks
-    * Queue E into hist_locks
-    *
-    * In held_locks: E
-    * In hist_locks: C, E
-    * In graph: 'A -> BX', 'A -> D'
-    */
-
-   release E
-   /*
-    * Pop E from held_locks
-    *
-    * In held_locks: Empty
-    * In hist_locks: D, E
-    * In graph: 'A -> BX', 'A -> D'
-    */
-				   release D
-				   /*
-				    * Pop D from held_locks
-				    *
-				    * In held_locks: A
-				    * In hist_locks: A, D
-				    * In graph: 'A -> BX', 'A -> D'
-				    */
-   commit BX
-   /*
-    * Add 'BX -> ?'
-    * What has been queued since acquire BX: C, E
-    *
-    * In held_locks: Empty
-    * In hist_locks: D, E
-    * In graph: 'A -> BX', 'A -> D',
-    *           'BX -> C', 'BX -> E'
-    */
-
-   release BX
-   /*
-    * In held_locks: Empty
-    * In hist_locks: D, E
-    * In graph: 'A -> BX', 'A -> D',
-    *           'BX -> C', 'BX -> E'
-    */
-				   release A
-				   /*
-				    * Pop A from held_locks
-				    *
-				    * In held_locks: Empty
-				    * In hist_locks: A, D
-				    * In graph: 'A -> BX', 'A -> D',
-				    *           'BX -> C', 'BX -> E'
-				    */
-
-   where A, BX, C,..., E are different lock classes, and a suffix 'X' is
-   added on crosslocks.
-
-Crossrelease considers all acquisitions after acqiuring BX are
-candidates which might create dependencies with BX. True dependencies
-will be determined when identifying the release context of BX. Meanwhile,
-all typical locks are queued so that they can be used at the commit step.
-And then two dependencies 'BX -> C' and 'BX -> E' are added at the
-commit step when identifying the release context.
-
-The final graph will be, with crossrelease:
-
-               -> C
-              /
-       -> BX -
-      /       \
-   A -         -> E
-      \
-       -> D
-
-   where A, BX, C,..., E are different lock classes, and a suffix 'X' is
-   added on crosslocks.
-
-However, the final graph will be, without crossrelease:
-
-   A -> D
-
-   where A and D are different lock classes.
-
-The former graph has three more dependencies, 'A -> BX', 'BX -> C' and
-'BX -> E' giving additional opportunities to check if they cause
-deadlocks. This way lockdep can detect a deadlock or its possibility
-caused by crosslocks.
-
-CONCLUSION
-
-We checked how crossrelease works with several examples.
-
-
-=============
-Optimizations
-=============
-
-Avoid duplication
------------------
-
-Crossrelease feature uses a cache like what lockdep already uses for
-dependency chains, but this time it's for caching CT type dependencies.
-Once that dependency is cached, the same will never be added again.
-
-
-Lockless for hot paths
-----------------------
-
-To keep all locks for later use at the commit step, crossrelease adopts
-a local array embedded in task_struct, which makes access to the data
-lockless by forcing it to happen only within the owner context. It's
-like how lockdep handles held_locks. Lockless implmentation is important
-since typical locks are very frequently acquired and released.
-
-
-=================================================
-APPENDIX A: What lockdep does to work aggresively
-=================================================
-
-A deadlock actually occurs when all wait operations creating circular
-dependencies run at the same time. Even though they don't, a potential
-deadlock exists if the problematic dependencies exist. Thus it's
-meaningful to detect not only an actual deadlock but also its potential
-possibility. The latter is rather valuable. When a deadlock occurs
-actually, we can identify what happens in the system by some means or
-other even without lockdep. However, there's no way to detect possiblity
-without lockdep unless the whole code is parsed in head. It's terrible.
-Lockdep does the both, and crossrelease only focuses on the latter.
-
-Whether or not a deadlock actually occurs depends on several factors.
-For example, what order contexts are switched in is a factor. Assuming
-circular dependencies exist, a deadlock would occur when contexts are
-switched so that all wait operations creating the dependencies run
-simultaneously. Thus to detect a deadlock possibility even in the case
-that it has not occured yet, lockdep should consider all possible
-combinations of dependencies, trying to:
-
-1. Use a global dependency graph.
-
-   Lockdep combines all dependencies into one global graph and uses them,
-   regardless of which context generates them or what order contexts are
-   switched in. Aggregated dependencies are only considered so they are
-   prone to be circular if a problem exists.
-
-2. Check dependencies between classes instead of instances.
-
-   What actually causes a deadlock are instances of lock. However,
-   lockdep checks dependencies between classes instead of instances.
-   This way lockdep can detect a deadlock which has not happened but
-   might happen in future by others but the same class.
-
-3. Assume all acquisitions lead to waiting.
-
-   Although locks might be acquired without waiting which is essential
-   to create dependencies, lockdep assumes all acquisitions lead to
-   waiting since it might be true some time or another.
-
-CONCLUSION
-
-Lockdep detects not only an actual deadlock but also its possibility,
-and the latter is more valuable.
-
-
-==================================================
-APPENDIX B: How to avoid adding false dependencies
-==================================================
-
-Remind what a dependency is. A dependency exists if:
-
-   1. There are two waiters waiting for each event at a given time.
-   2. The only way to wake up each waiter is to trigger its event.
-   3. Whether one can be woken up depends on whether the other can.
-
-For example:
-
-   acquire A
-   acquire B /* A dependency 'A -> B' exists */
-   release B
-   release A
-
-   where A and B are different lock classes.
-
-A depedency 'A -> B' exists since:
-
-   1. A waiter for A and a waiter for B might exist when acquiring B.
-   2. Only way to wake up each is to release what it waits for.
-   3. Whether the waiter for A can be woken up depends on whether the
-      other can. IOW, TASK X cannot release A if it fails to acquire B.
-
-For another example:
-
-   TASK X			   TASK Y
-   ------			   ------
-				   acquire AX
-   acquire B /* A dependency 'AX -> B' exists */
-   release B
-   release AX held by Y
-
-   where AX and B are different lock classes, and a suffix 'X' is added
-   on crosslocks.
-
-Even in this case involving crosslocks, the same rule can be applied. A
-depedency 'AX -> B' exists since:
-
-   1. A waiter for AX and a waiter for B might exist when acquiring B.
-   2. Only way to wake up each is to release what it waits for.
-   3. Whether the waiter for AX can be woken up depends on whether the
-      other can. IOW, TASK X cannot release AX if it fails to acquire B.
-
-Let's take a look at more complicated example:
-
-   TASK X			   TASK Y
-   ------			   ------
-   acquire B
-   release B
-   fork Y
-				   acquire AX
-   acquire C /* A dependency 'AX -> C' exists */
-   release C
-   release AX held by Y
-
-   where AX, B and C are different lock classes, and a suffix 'X' is
-   added on crosslocks.
-
-Does a dependency 'AX -> B' exist? Nope.
-
-Two waiters are essential to create a dependency. However, waiters for
-AX and B to create 'AX -> B' cannot exist at the same time in this
-example. Thus the dependency 'AX -> B' cannot be created.
-
-It would be ideal if the full set of true ones can be considered. But
-we can ensure nothing but what actually happened. Relying on what
-actually happens at runtime, we can anyway add only true ones, though
-they might be a subset of true ones. It's similar to how lockdep works
-for typical locks. There might be more true dependencies than what
-lockdep has detected in runtime. Lockdep has no choice but to rely on
-what actually happens. Crossrelease also relies on it.
-
-CONCLUSION
-
-Relying on what actually happens, lockdep can avoid adding false
-dependencies.

+ 21 - 1
Documentation/vm/zswap.txt

@@ -98,5 +98,25 @@ request is made for a page in an old zpool, it is uncompressed using its
 original compressor.  Once all pages are removed from an old zpool, the zpool
 and its compressor are freed.
 
+Some of the pages in zswap are same-value filled pages (i.e. contents of the
+page have same value or repetitive pattern). These pages include zero-filled
+pages and they are handled differently. During store operation, a page is
+checked if it is a same-value filled page before compressing it. If true, the
+compressed length of the page is set to zero and the pattern or same-filled
+value is stored.
+
+Same-value filled pages identification feature is enabled by default and can be
+disabled at boot time by setting the "same_filled_pages_enabled" attribute to 0,
+e.g. zswap.same_filled_pages_enabled=0. It can also be enabled and disabled at
+runtime using the sysfs "same_filled_pages_enabled" attribute, e.g.
+
+echo 1 > /sys/module/zswap/parameters/same_filled_pages_enabled
+
+When zswap same-filled page identification is disabled at runtime, it will stop
+checking for the same-value filled pages during store operation. However, the
+existing pages which are marked as same-value filled pages remain stored
+unchanged in zswap until they are either loaded or invalidated.
+
 A debugfs interface is provided for various statistic about pool size, number
-of pages stored, and various counters for the reasons pages are rejected.
+of pages stored, same-value filled pages and various counters for the reasons
+pages are rejected.
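
As an aside (not part of this commit), the same-value filled page check described above amounts to scanning the page word by word; the hedged C sketch below is an illustration, not the kernel's actual code, and assumes a 4096-byte page.

/* Illustrative only: detect a same-value filled page. */
#include <stdbool.h>
#include <stddef.h>

#define PAGE_SIZE 4096

static bool page_same_filled(const void *page, unsigned long *value)
{
    const unsigned long *word = page;
    size_t i, nwords = PAGE_SIZE / sizeof(unsigned long);

    for (i = 1; i < nwords; i++) {
        if (word[i] != word[0])
            return false;        /* not same-filled: compress normally */
    }
    *value = word[0];            /* only this repeating value needs storing */
    return true;
}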

+ 14 - 15
Documentation/x86/x86_64/mm.txt

@@ -1,6 +1,4 @@
 
-<previous description obsolete, deleted>
-
 Virtual memory map with 4 level page tables:
 
 0000000000000000 - 00007fffffffffff (=47 bits) user space, different per mm
@@ -14,13 +12,16 @@ ffffea0000000000 - ffffeaffffffffff (=40 bits) virtual memory map (1TB)
 ... unused hole ...
 ffffec0000000000 - fffffbffffffffff (=44 bits) kasan shadow memory (16TB)
 ... unused hole ...
+fffffe0000000000 - fffffe7fffffffff (=39 bits) LDT remap for PTI
+fffffe8000000000 - fffffeffffffffff (=39 bits) cpu_entry_area mapping
 ffffff0000000000 - ffffff7fffffffff (=39 bits) %esp fixup stacks
 ... unused hole ...
 ffffffef00000000 - fffffffeffffffff (=64 GB) EFI region mapping space
 ... unused hole ...
 ffffffff80000000 - ffffffff9fffffff (=512 MB)  kernel text mapping, from phys 0
-ffffffffa0000000 - ffffffffff5fffff (=1526 MB) module mapping space (variable)
-ffffffffff600000 - ffffffffffdfffff (=8 MB) vsyscalls
+ffffffffa0000000 - [fixmap start]   (~1526 MB) module mapping space (variable)
+[fixmap start]   - ffffffffff5fffff kernel-internal fixmap range
+ffffffffff600000 - ffffffffff600fff (=4 kB) legacy vsyscall ABI
 ffffffffffe00000 - ffffffffffffffff (=2 MB) unused hole
 
 Virtual memory map with 5 level page tables:
@@ -29,26 +30,29 @@ Virtual memory map with 5 level page tables:
 hole caused by [56:63] sign extension
 ff00000000000000 - ff0fffffffffffff (=52 bits) guard hole, reserved for hypervisor
 ff10000000000000 - ff8fffffffffffff (=55 bits) direct mapping of all phys. memory
-ff90000000000000 - ff91ffffffffffff (=49 bits) hole
-ff92000000000000 - ffd1ffffffffffff (=54 bits) vmalloc/ioremap space
+ff90000000000000 - ff9fffffffffffff (=52 bits) LDT remap for PTI
+ffa0000000000000 - ffd1ffffffffffff (=54 bits) vmalloc/ioremap space (12800 TB)
 ffd2000000000000 - ffd3ffffffffffff (=49 bits) hole
 ffd4000000000000 - ffd5ffffffffffff (=49 bits) virtual memory map (512TB)
 ... unused hole ...
 ffdf000000000000 - fffffc0000000000 (=53 bits) kasan shadow memory (8PB)
 ... unused hole ...
+fffffe8000000000 - fffffeffffffffff (=39 bits) cpu_entry_area mapping
 ffffff0000000000 - ffffff7fffffffff (=39 bits) %esp fixup stacks
 ... unused hole ...
 ffffffef00000000 - fffffffeffffffff (=64 GB) EFI region mapping space
 ... unused hole ...
 ffffffff80000000 - ffffffff9fffffff (=512 MB)  kernel text mapping, from phys 0
-ffffffffa0000000 - ffffffffff5fffff (=1526 MB) module mapping space
-ffffffffff600000 - ffffffffffdfffff (=8 MB) vsyscalls
+ffffffffa0000000 - [fixmap start]   (~1526 MB) module mapping space
+[fixmap start]   - ffffffffff5fffff kernel-internal fixmap range
+ffffffffff600000 - ffffffffff600fff (=4 kB) legacy vsyscall ABI
 ffffffffffe00000 - ffffffffffffffff (=2 MB) unused hole
 
 Architecture defines a 64-bit virtual address. Implementations can support
 less. Currently supported are 48- and 57-bit virtual addresses. Bits 63
-through to the most-significant implemented bit are set to either all ones
-or all zero. This causes hole between user space and kernel addresses.
+through to the most-significant implemented bit are sign extended.
+This causes hole between user space and kernel addresses if you interpret them
+as unsigned.
 
 The direct mapping covers all memory in the system up to the highest
 memory address (this means in some cases it can also include PCI memory
@@ -58,9 +62,6 @@ vmalloc space is lazily synchronized into the different PML4/PML5 pages of
 the processes using the page fault handler, with init_top_pgt as
 reference.
 
-Current X86-64 implementations support up to 46 bits of address space (64 TB),
-which is our current limit. This expands into MBZ space in the page tables.
-
 We map EFI runtime services in the 'efi_pgd' PGD in a 64Gb large virtual
 memory window (this size is arbitrary, it can be raised later if needed).
 The mappings are not part of any other kernel PGD and are only available
@@ -72,5 +73,3 @@ following fixmap section.
 Note that if CONFIG_RANDOMIZE_MEMORY is enabled, the direct mapping of all
 physical memory, vmalloc/ioremap space and virtual memory map are randomized.
 Their order is preserved but their base will be offset early at boot time.
-
--Andi Kleen, Jul 2004
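
As an aside (not part of this commit), the sign-extension rule stated above can be checked with the hedged C sketch below; it assumes 48-bit (4-level) virtual addresses, where bits 63..47 must replicate bit 47 for an address to be canonical.

/* Illustrative only: canonical-address test for a 48-bit virtual address. */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

static bool is_canonical_48(uint64_t va)
{
    /* Shift the 48 significant bits to the top, arithmetic-shift them back,
     * and compare: equal only when bits 63..47 are a sign extension of bit 47. */
    return (uint64_t)(((int64_t)(va << 16)) >> 16) == va;
}

int main(void)
{
    printf("%d\n", is_canonical_48(0x00007fffffffffffULL)); /* 1: top of user space */
    printf("%d\n", is_canonical_48(0xffff800000000000ULL)); /* 1: start of kernel range */
    printf("%d\n", is_canonical_48(0x0000800000000000ULL)); /* 0: inside the hole */
    return 0;
}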

+ 10 - 10
MAINTAINERS

@@ -2621,24 +2621,22 @@ F:	fs/bfs/
 F:	include/uapi/linux/bfs_fs.h
 
 BLACKFIN ARCHITECTURE
-M:	Steven Miao <realmz6@gmail.com>
 L:	adi-buildroot-devel@lists.sourceforge.net (moderated for non-subscribers)
 T:	git git://git.code.sf.net/p/adi-linux/code
 W:	http://blackfin.uclinux.org
-S:	Supported
+S:	Orphan
 F:	arch/blackfin/
 
 BLACKFIN EMAC DRIVER
 L:	adi-buildroot-devel@lists.sourceforge.net (moderated for non-subscribers)
 W:	http://blackfin.uclinux.org
-S:	Supported
+S:	Orphan
 F:	drivers/net/ethernet/adi/
 
 BLACKFIN MEDIA DRIVER
-M:	Scott Jiang <scott.jiang.linux@gmail.com>
 L:	adi-buildroot-devel@lists.sourceforge.net (moderated for non-subscribers)
 W:	http://blackfin.uclinux.org/
-S:	Supported
+S:	Orphan
 F:	drivers/media/platform/blackfin/
 F:	drivers/media/i2c/adv7183*
 F:	drivers/media/i2c/vs6624*
@@ -2646,25 +2644,25 @@ F:	drivers/media/i2c/vs6624*
 BLACKFIN RTC DRIVER
 L:	adi-buildroot-devel@lists.sourceforge.net (moderated for non-subscribers)
 W:	http://blackfin.uclinux.org
-S:	Supported
+S:	Orphan
 F:	drivers/rtc/rtc-bfin.c
 
 BLACKFIN SDH DRIVER
 L:	adi-buildroot-devel@lists.sourceforge.net (moderated for non-subscribers)
 W:	http://blackfin.uclinux.org
-S:	Supported
+S:	Orphan
 F:	drivers/mmc/host/bfin_sdh.c
 
 BLACKFIN SERIAL DRIVER
 L:	adi-buildroot-devel@lists.sourceforge.net (moderated for non-subscribers)
 W:	http://blackfin.uclinux.org
-S:	Supported
+S:	Orphan
 F:	drivers/tty/serial/bfin_uart.c
 
 BLACKFIN WATCHDOG DRIVER
 L:	adi-buildroot-devel@lists.sourceforge.net (moderated for non-subscribers)
 W:	http://blackfin.uclinux.org
-S:	Supported
+S:	Orphan
 F:	drivers/watchdog/bfin_wdt.c
 
 BLINKM RGB LED DRIVER
@@ -5431,7 +5429,7 @@ F:	drivers/media/tuners/fc2580*
 
 FCOE SUBSYSTEM (libfc, libfcoe, fcoe)
 M:	Johannes Thumshirn <jth@kernel.org>
-L:	fcoe-devel@open-fcoe.org
+L:	linux-scsi@vger.kernel.org
 W:	www.Open-FCoE.org
 S:	Supported
 F:	drivers/scsi/libfc/
@@ -13117,6 +13115,7 @@ F:	drivers/dma/dw/
 
 SYNOPSYS DESIGNWARE ENTERPRISE ETHERNET DRIVER
 M:	Jie Deng <jiedeng@synopsys.com>
+M:	Jose Abreu <Jose.Abreu@synopsys.com>
 L:	netdev@vger.kernel.org
 S:	Supported
 F:	drivers/net/ethernet/synopsys/
@@ -13492,6 +13491,7 @@ M:	Mika Westerberg <mika.westerberg@linux.intel.com>
 M:	Yehezkel Bernat <yehezkel.bernat@intel.com>
 T:	git git://git.kernel.org/pub/scm/linux/kernel/git/westeri/thunderbolt.git
 S:	Maintained
+F:	Documentation/admin-guide/thunderbolt.rst
 F:	drivers/thunderbolt/
 F:	drivers/thunderbolt/
 F:	include/linux/thunderbolt.h
 F:	include/linux/thunderbolt.h
 
 

+ 4 - 1
Makefile

@@ -2,7 +2,7 @@
 VERSION = 4
 PATCHLEVEL = 15
 SUBLEVEL = 0
-EXTRAVERSION = -rc3
+EXTRAVERSION = -rc6
 NAME = Fearless Coyote

 # *DOCUMENTATION*
@@ -789,6 +789,9 @@ KBUILD_CFLAGS += $(call cc-disable-warning, pointer-sign)
 # disable invalid "can't wrap" optimizations for signed / pointers
 KBUILD_CFLAGS	+= $(call cc-option,-fno-strict-overflow)

+# Make sure -fstack-check isn't enabled (like gentoo apparently did)
+KBUILD_CFLAGS  += $(call cc-option,-fno-stack-check,)
+
 # conserve stack if available
 KBUILD_CFLAGS   += $(call cc-option,-fconserve-stack)


+ 2 - 2
arch/arm/boot/dts/vf610-zii-dev-rev-c.dts

@@ -121,7 +121,7 @@
 					switch0port10: port@10 {
 						reg = <10>;
 						label = "dsa";
-						phy-mode = "xgmii";
+						phy-mode = "xaui";
 						link = <&switch1port10>;
 					};
 				};
@@ -208,7 +208,7 @@
 					switch1port10: port@10 {
 						reg = <10>;
 						label = "dsa";
-						phy-mode = "xgmii";
+						phy-mode = "xaui";
 						link = <&switch0port10>;
 					};
 				};

+ 4 - 0
arch/arm/lib/csumpartialcopyuser.S

@@ -85,7 +85,11 @@
 		.pushsection .text.fixup,"ax"
 		.align	4
 9001:		mov	r4, #-EFAULT
+#ifdef CONFIG_CPU_SW_DOMAIN_PAN
+		ldr	r5, [sp, #9*4]		@ *err_ptr
+#else
 		ldr	r5, [sp, #8*4]		@ *err_ptr
+#endif
 		str	r4, [r5]
 		ldmia	sp, {r1, r2}		@ retrieve dst, len
 		add	r2, r2, r1

+ 11 - 1
arch/arm64/Kconfig

@@ -557,7 +557,6 @@ config QCOM_QDF2400_ERRATUM_0065

 	  If unsure, say Y.

-
 config SOCIONEXT_SYNQUACER_PREITS
 	bool "Socionext Synquacer: Workaround for GICv3 pre-ITS"
 	default y
@@ -576,6 +575,17 @@ config HISILICON_ERRATUM_161600802
 	  a 128kB offset to be applied to the target address in this commands.

 	  If unsure, say Y.
+
+config QCOM_FALKOR_ERRATUM_E1041
+	bool "Falkor E1041: Speculative instruction fetches might cause errant memory access"
+	default y
+	help
+	  Falkor CPU may speculatively fetch instructions from an improper
+	  memory location when MMU translation is changed from SCTLR_ELn[M]=1
+	  to SCTLR_ELn[M]=0. Prefix an ISB instruction to fix the problem.
+
+	  If unsure, say Y.
+
 endmenu



+ 10 - 0
arch/arm64/include/asm/assembler.h

@@ -512,4 +512,14 @@ alternative_else_nop_endif
 #endif
 	.endm

+/**
+ * Errata workaround prior to disable MMU. Insert an ISB immediately prior
+ * to executing the MSR that will change SCTLR_ELn[M] from a value of 1 to 0.
+ */
+	.macro pre_disable_mmu_workaround
+#ifdef CONFIG_QCOM_FALKOR_ERRATUM_E1041
+	isb
+#endif
+	.endm
+
 #endif	/* __ASM_ASSEMBLER_H */

+ 3 - 0
arch/arm64/include/asm/cpufeature.h

@@ -60,6 +60,9 @@ enum ftr_type {
 #define FTR_VISIBLE	true	/* Feature visible to the user space */
 #define FTR_HIDDEN	false	/* Feature is hidden from the user */

+#define FTR_VISIBLE_IF_IS_ENABLED(config)		\
+	(IS_ENABLED(config) ? FTR_VISIBLE : FTR_HIDDEN)
+
 struct arm64_ftr_bits {
 	bool		sign;	/* Value is signed ? */
 	bool		visible;

+ 2 - 0
arch/arm64/include/asm/cputype.h

@@ -91,6 +91,7 @@
 #define BRCM_CPU_PART_VULCAN		0x516

 #define QCOM_CPU_PART_FALKOR_V1		0x800
+#define QCOM_CPU_PART_FALKOR		0xC00

 #define MIDR_CORTEX_A53 MIDR_CPU_MODEL(ARM_CPU_IMP_ARM, ARM_CPU_PART_CORTEX_A53)
 #define MIDR_CORTEX_A57 MIDR_CPU_MODEL(ARM_CPU_IMP_ARM, ARM_CPU_PART_CORTEX_A57)
@@ -99,6 +100,7 @@
 #define MIDR_THUNDERX_81XX MIDR_CPU_MODEL(ARM_CPU_IMP_CAVIUM, CAVIUM_CPU_PART_THUNDERX_81XX)
 #define MIDR_THUNDERX_83XX MIDR_CPU_MODEL(ARM_CPU_IMP_CAVIUM, CAVIUM_CPU_PART_THUNDERX_83XX)
 #define MIDR_QCOM_FALKOR_V1 MIDR_CPU_MODEL(ARM_CPU_IMP_QCOM, QCOM_CPU_PART_FALKOR_V1)
+#define MIDR_QCOM_FALKOR MIDR_CPU_MODEL(ARM_CPU_IMP_QCOM, QCOM_CPU_PART_FALKOR)

 #ifndef __ASSEMBLY__


+ 22 - 19
arch/arm64/include/asm/pgtable.h

@@ -42,6 +42,8 @@
 #include <asm/cmpxchg.h>
 #include <asm/fixmap.h>
 #include <linux/mmdebug.h>
+#include <linux/mm_types.h>
+#include <linux/sched.h>

 extern void __pte_error(const char *file, int line, unsigned long val);
 extern void __pmd_error(const char *file, int line, unsigned long val);
@@ -149,12 +151,20 @@ static inline pte_t pte_mkwrite(pte_t pte)

 static inline pte_t pte_mkclean(pte_t pte)
 {
-	return clear_pte_bit(pte, __pgprot(PTE_DIRTY));
+	pte = clear_pte_bit(pte, __pgprot(PTE_DIRTY));
+	pte = set_pte_bit(pte, __pgprot(PTE_RDONLY));
+
+	return pte;
 }

 static inline pte_t pte_mkdirty(pte_t pte)
 {
-	return set_pte_bit(pte, __pgprot(PTE_DIRTY));
+	pte = set_pte_bit(pte, __pgprot(PTE_DIRTY));
+
+	if (pte_write(pte))
+		pte = clear_pte_bit(pte, __pgprot(PTE_RDONLY));
+
+	return pte;
 }

 static inline pte_t pte_mkold(pte_t pte)
@@ -207,9 +217,6 @@ static inline void set_pte(pte_t *ptep, pte_t pte)
 	}
 }

-struct mm_struct;
-struct vm_area_struct;
-
 extern void __sync_icache_dcache(pte_t pteval, unsigned long addr);

 /*
@@ -238,7 +245,8 @@ static inline void set_pte_at(struct mm_struct *mm, unsigned long addr,
 	 * hardware updates of the pte (ptep_set_access_flags safely changes
 	 * valid ptes without going through an invalid entry).
 	 */
-	if (pte_valid(*ptep) && pte_valid(pte)) {
+	if (IS_ENABLED(CONFIG_DEBUG_VM) && pte_valid(*ptep) && pte_valid(pte) &&
+	   (mm == current->active_mm || atomic_read(&mm->mm_users) > 1)) {
 		VM_WARN_ONCE(!pte_young(pte),
 			     "%s: racy access flag clearing: 0x%016llx -> 0x%016llx",
 			     __func__, pte_val(*ptep), pte_val(pte));
@@ -641,28 +649,23 @@ static inline pmd_t pmdp_huge_get_and_clear(struct mm_struct *mm,
 #endif /* CONFIG_TRANSPARENT_HUGEPAGE */

 /*
- * ptep_set_wrprotect - mark read-only while preserving the hardware update of
- * the Access Flag.
+ * ptep_set_wrprotect - mark read-only while trasferring potential hardware
+ * dirty status (PTE_DBM && !PTE_RDONLY) to the software PTE_DIRTY bit.
  */
 #define __HAVE_ARCH_PTEP_SET_WRPROTECT
 static inline void ptep_set_wrprotect(struct mm_struct *mm, unsigned long address, pte_t *ptep)
 {
 	pte_t old_pte, pte;

-	/*
-	 * ptep_set_wrprotect() is only called on CoW mappings which are
-	 * private (!VM_SHARED) with the pte either read-only (!PTE_WRITE &&
-	 * PTE_RDONLY) or writable and software-dirty (PTE_WRITE &&
-	 * !PTE_RDONLY && PTE_DIRTY); see is_cow_mapping() and
-	 * protection_map[]. There is no race with the hardware update of the
-	 * dirty state: clearing of PTE_RDONLY when PTE_WRITE (a.k.a. PTE_DBM)
-	 * is set.
-	 */
-	VM_WARN_ONCE(pte_write(*ptep) && !pte_dirty(*ptep),
-		     "%s: potential race with hardware DBM", __func__);
 	pte = READ_ONCE(*ptep);
 	do {
 		old_pte = pte;
+		/*
+		 * If hardware-dirty (PTE_WRITE/DBM bit set and PTE_RDONLY
+		 * clear), set the PTE_DIRTY bit.
+		 */
+		if (pte_hw_dirty(pte))
+			pte = pte_mkdirty(pte);
 		pte = pte_wrprotect(pte);
 		pte_val(pte) = cmpxchg_relaxed(&pte_val(*ptep),
 					       pte_val(old_pte), pte_val(pte));

+ 1 - 0
arch/arm64/kernel/cpu-reset.S

@@ -37,6 +37,7 @@ ENTRY(__cpu_soft_restart)
 	mrs	x12, sctlr_el1
 	ldr	x13, =SCTLR_ELx_FLAGS
 	bic	x12, x12, x13
+	pre_disable_mmu_workaround
 	msr	sctlr_el1, x12
 	isb


+ 2 - 1
arch/arm64/kernel/cpufeature.c

@@ -145,7 +145,8 @@ static const struct arm64_ftr_bits ftr_id_aa64isar1[] = {
 };

 static const struct arm64_ftr_bits ftr_id_aa64pfr0[] = {
-	ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64PFR0_SVE_SHIFT, 4, 0),
+	ARM64_FTR_BITS(FTR_VISIBLE_IF_IS_ENABLED(CONFIG_ARM64_SVE),
+				   FTR_STRICT, FTR_LOWER_SAFE, ID_AA64PFR0_SVE_SHIFT, 4, 0),
 	ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64PFR0_GIC_SHIFT, 4, 0),
 	S_ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64PFR0_ASIMD_SHIFT, 4, ID_AA64PFR0_ASIMD_NI),
 	S_ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64PFR0_FP_SHIFT, 4, ID_AA64PFR0_FP_NI),

+ 2 - 0
arch/arm64/kernel/efi-entry.S

@@ -96,6 +96,7 @@ ENTRY(entry)
 	mrs	x0, sctlr_el2
 	bic	x0, x0, #1 << 0	// clear SCTLR.M
 	bic	x0, x0, #1 << 2	// clear SCTLR.C
+	pre_disable_mmu_workaround
 	msr	sctlr_el2, x0
 	isb
 	b	2f
@@ -103,6 +104,7 @@ ENTRY(entry)
 	mrs	x0, sctlr_el1
 	bic	x0, x0, #1 << 0	// clear SCTLR.M
 	bic	x0, x0, #1 << 2	// clear SCTLR.C
+	pre_disable_mmu_workaround
 	msr	sctlr_el1, x0
 	isb
 2:

+ 1 - 1
arch/arm64/kernel/fpsimd.c

@@ -1043,7 +1043,7 @@ void fpsimd_update_current_state(struct fpsimd_state *state)

 	local_bh_disable();

-	current->thread.fpsimd_state = *state;
+	current->thread.fpsimd_state.user_fpsimd = state->user_fpsimd;
 	if (system_supports_sve() && test_thread_flag(TIF_SVE))
 		fpsimd_to_sve(current);


+ 1 - 0
arch/arm64/kernel/head.S

@@ -750,6 +750,7 @@ __primary_switch:
 	 * to take into account by discarding the current kernel mapping and
 	 * creating a new one.
 	 */
+	pre_disable_mmu_workaround
 	msr	sctlr_el1, x20			// disable the MMU
 	isb
 	bl	__create_page_tables		// recreate kernel mapping

+ 1 - 1
arch/arm64/kernel/hw_breakpoint.c

@@ -28,6 +28,7 @@
 #include <linux/perf_event.h>
 #include <linux/ptrace.h>
 #include <linux/smp.h>
+#include <linux/uaccess.h>

 #include <asm/compat.h>
 #include <asm/current.h>
@@ -36,7 +37,6 @@
 #include <asm/traps.h>
 #include <asm/cputype.h>
 #include <asm/system_misc.h>
-#include <asm/uaccess.h>

 /* Breakpoint currently in use for each BRP. */
 static DEFINE_PER_CPU(struct perf_event *, bp_on_reg[ARM_MAX_BRP]);

+ 1 - 0
arch/arm64/kernel/relocate_kernel.S

@@ -45,6 +45,7 @@ ENTRY(arm64_relocate_new_kernel)
 	mrs	x0, sctlr_el2
 	ldr	x1, =SCTLR_ELx_FLAGS
 	bic	x0, x0, x1
+	pre_disable_mmu_workaround
 	msr	sctlr_el2, x0
 	isb
 1:

+ 1 - 0
arch/arm64/kvm/hyp-init.S

@@ -151,6 +151,7 @@ reset:
 	mrs	x5, sctlr_el2
 	ldr	x6, =SCTLR_ELx_FLAGS
 	bic	x5, x5, x6		// Clear SCTL_M and etc
+	pre_disable_mmu_workaround
 	msr	sctlr_el2, x5
 	isb


+ 3 - 0
arch/arm64/kvm/hyp/debug-sr.c

@@ -74,6 +74,9 @@ static void __hyp_text __debug_save_spe_nvhe(u64 *pmscr_el1)
 {
 	u64 reg;

+	/* Clear pmscr in case of early return */
+	*pmscr_el1 = 0;
+
 	/* SPE present on this CPU? */
 	if (!cpuid_feature_extract_unsigned_field(read_sysreg(id_aa64dfr0_el1),
 						  ID_AA64DFR0_PMSVER_SHIFT))

+ 1 - 1
arch/arm64/mm/dump.c

@@ -389,7 +389,7 @@ void ptdump_check_wx(void)
 		.check_wx = true,
 	};

-	walk_pgd(&st, &init_mm, 0);
+	walk_pgd(&st, &init_mm, VA_START);
 	note_page(&st, 0, 0, 0);
 	if (st.wx_pages || st.uxn_pages)
 		pr_warn("Checked W+X mappings: FAILED, %lu W+X pages found, %lu non-UXN pages found\n",

+ 2 - 3
arch/arm64/mm/fault.c

@@ -574,7 +574,6 @@ static int do_sea(unsigned long addr, unsigned int esr, struct pt_regs *regs)
 {
 	struct siginfo info;
 	const struct fault_info *inf;
-	int ret = 0;

 	inf = esr_to_fault_info(esr);
 	pr_err("Synchronous External Abort: %s (0x%08x) at 0x%016lx\n",
@@ -589,7 +588,7 @@ static int do_sea(unsigned long addr, unsigned int esr, struct pt_regs *regs)
 		if (interrupts_enabled(regs))
 			nmi_enter();

-		ret = ghes_notify_sea();
+		ghes_notify_sea();

 		if (interrupts_enabled(regs))
 			nmi_exit();
@@ -604,7 +603,7 @@ static int do_sea(unsigned long addr, unsigned int esr, struct pt_regs *regs)
 		info.si_addr  = (void __user *)addr;
 	arm64_notify_die("", regs, &info, esr);

-	return ret;
+	return 0;
 }

 static const struct fault_info fault_info[] = {

+ 2 - 1
arch/arm64/mm/init.c

@@ -476,6 +476,8 @@ void __init arm64_memblock_init(void)

 	reserve_elfcorehdr();

+	high_memory = __va(memblock_end_of_DRAM() - 1) + 1;
+
 	dma_contiguous_reserve(arm64_dma_phys_limit);

 	memblock_allow_resize();
@@ -502,7 +504,6 @@ void __init bootmem_init(void)
 	sparse_init();
 	zone_sizes_init(min, max);

-	high_memory = __va((max << PAGE_SHIFT) - 1) + 1;
 	memblock_dump_all();
 }


+ 2 - 2
arch/parisc/boot/compressed/misc.c

@@ -123,8 +123,8 @@ int puts(const char *s)
 	while ((nuline = strchr(s, '\n')) != NULL) {
 		if (nuline != s)
 			pdc_iodc_print(s, nuline - s);
-			pdc_iodc_print("\r\n", 2);
-			s = nuline + 1;
+		pdc_iodc_print("\r\n", 2);
+		s = nuline + 1;
 	}
 	if (*s != '\0')
 		pdc_iodc_print(s, strlen(s));

+ 5 - 0
arch/parisc/include/asm/thread_info.h

@@ -35,7 +35,12 @@ struct thread_info {

 /* thread information allocation */

+#ifdef CONFIG_IRQSTACKS
+#define THREAD_SIZE_ORDER	2 /* PA-RISC requires at least 16k stack */
+#else
 #define THREAD_SIZE_ORDER	3 /* PA-RISC requires at least 32k stack */
+#endif
+
 /* Be sure to hunt all references to this down when you change the size of
  * the kernel stack */
 #define THREAD_SIZE             (PAGE_SIZE << THREAD_SIZE_ORDER)

+ 9 - 3
arch/parisc/kernel/entry.S

@@ -878,9 +878,6 @@ ENTRY_CFI(syscall_exit_rfi)
 	STREG   %r19,PT_SR7(%r16)

 intr_return:
-	/* NOTE: Need to enable interrupts incase we schedule. */
-	ssm     PSW_SM_I, %r0
-
 	/* check for reschedule */
 	mfctl   %cr30,%r1
 	LDREG   TI_FLAGS(%r1),%r19	/* sched.h: TIF_NEED_RESCHED */
@@ -907,6 +904,11 @@ intr_check_sig:
 	LDREG	PT_IASQ1(%r16), %r20
 	cmpib,COND(=),n 0,%r20,intr_restore /* backward */

+	/* NOTE: We need to enable interrupts if we have to deliver
+	 * signals. We used to do this earlier but it caused kernel
+	 * stack overflows. */
+	ssm     PSW_SM_I, %r0
+
 	copy	%r0, %r25			/* long in_syscall = 0 */
 #ifdef CONFIG_64BIT
 	ldo	-16(%r30),%r29			/* Reference param save area */
@@ -958,6 +960,10 @@ intr_do_resched:
 	cmpib,COND(=)	0, %r20, intr_do_preempt
 	nop

+	/* NOTE: We need to enable interrupts if we schedule.  We used
+	 * to do this earlier but it caused kernel stack overflows. */
+	ssm     PSW_SM_I, %r0
+
 #ifdef CONFIG_64BIT
 	ldo	-16(%r30),%r29		/* Reference param save area */
 #endif

+ 1 - 0
arch/parisc/kernel/hpmc.S

@@ -305,6 +305,7 @@ ENDPROC_CFI(os_hpmc)


 	__INITRODATA
+	.align 4
 	.export os_hpmc_size
 os_hpmc_size:
 	.word .os_hpmc_end-.os_hpmc

+ 0 - 1
arch/parisc/kernel/unwind.c

@@ -15,7 +15,6 @@
 #include <linux/slab.h>
 #include <linux/kallsyms.h>
 #include <linux/sort.h>
-#include <linux/sched.h>

 #include <linux/uaccess.h>
 #include <asm/assembly.h>

+ 0 - 2
arch/parisc/lib/delay.c

@@ -16,9 +16,7 @@
 #include <linux/preempt.h>
 #include <linux/init.h>

-#include <asm/processor.h>
 #include <asm/delay.h>
-
 #include <asm/special_insns.h>    /* for mfctl() */
 #include <asm/processor.h> /* for boot_cpu_data */


+ 3 - 2
arch/powerpc/include/asm/mmu_context.h

@@ -160,9 +160,10 @@ static inline void enter_lazy_tlb(struct mm_struct *mm,
 #endif
 }

-static inline void arch_dup_mmap(struct mm_struct *oldmm,
-				 struct mm_struct *mm)
+static inline int arch_dup_mmap(struct mm_struct *oldmm,
+				struct mm_struct *mm)
 {
+	return 0;
 }

 #ifndef CONFIG_PPC_BOOK3S_64

+ 1 - 1
arch/powerpc/kernel/process.c

@@ -1403,7 +1403,7 @@ void show_regs(struct pt_regs * regs)

 	printk("NIP:  "REG" LR: "REG" CTR: "REG"\n",
 	       regs->nip, regs->link, regs->ctr);
-	printk("REGS: %p TRAP: %04lx   %s  (%s)\n",
+	printk("REGS: %px TRAP: %04lx   %s  (%s)\n",
 	       regs, regs->trap, print_tainted(), init_utsname()->release);
 	printk("MSR:  "REG" ", regs->msr);
 	print_msr_bits(regs->msr);

+ 4 - 3
arch/powerpc/kvm/book3s_xive.c

@@ -725,7 +725,8 @@ u64 kvmppc_xive_get_icp(struct kvm_vcpu *vcpu)

 	/* Return the per-cpu state for state saving/migration */
 	return (u64)xc->cppr << KVM_REG_PPC_ICP_CPPR_SHIFT |
-	       (u64)xc->mfrr << KVM_REG_PPC_ICP_MFRR_SHIFT;
+	       (u64)xc->mfrr << KVM_REG_PPC_ICP_MFRR_SHIFT |
+	       (u64)0xff << KVM_REG_PPC_ICP_PPRI_SHIFT;
 }

 int kvmppc_xive_set_icp(struct kvm_vcpu *vcpu, u64 icpval)
@@ -1558,7 +1559,7 @@ static int xive_set_source(struct kvmppc_xive *xive, long irq, u64 addr)

 	/*
 	 * Restore P and Q. If the interrupt was pending, we
-	 * force both P and Q, which will trigger a resend.
+	 * force Q and !P, which will trigger a resend.
 	 *
 	 * That means that a guest that had both an interrupt
 	 * pending (queued) and Q set will restore with only
@@ -1566,7 +1567,7 @@ static int xive_set_source(struct kvmppc_xive *xive, long irq, u64 addr)
 	 * is perfectly fine as coalescing interrupts that haven't
 	 * been presented yet is always allowed.
 	 */
-	if (val & KVM_XICS_PRESENTED || val & KVM_XICS_PENDING)
+	if (val & KVM_XICS_PRESENTED && !(val & KVM_XICS_PENDING))
 		state->old_p = true;
 	if (val & KVM_XICS_QUEUED || val & KVM_XICS_PENDING)
 		state->old_q = true;

+ 4 - 2
arch/powerpc/net/bpf_jit_comp64.c

@@ -763,7 +763,8 @@ emit_clear:
 			func = (u8 *) __bpf_call_base + imm;

 			/* Save skb pointer if we need to re-cache skb data */
-			if (bpf_helper_changes_pkt_data(func))
+			if ((ctx->seen & SEEN_SKB) &&
+			    bpf_helper_changes_pkt_data(func))
 				PPC_BPF_STL(3, 1, bpf_jit_stack_local(ctx));

 			bpf_jit_emit_func_call(image, ctx, (u64)func);
@@ -772,7 +773,8 @@ emit_clear:
 			PPC_MR(b2p[BPF_REG_0], 3);

 			/* refresh skb cache */
-			if (bpf_helper_changes_pkt_data(func)) {
+			if ((ctx->seen & SEEN_SKB) &&
+			    bpf_helper_changes_pkt_data(func)) {
 				/* reload skb pointer to r3 */
 				PPC_BPF_LL(3, 1, bpf_jit_stack_local(ctx));
 				bpf_jit_emit_skb_loads(image, ctx);

+ 6 - 2
arch/powerpc/perf/core-book3s.c

@@ -410,8 +410,12 @@ static __u64 power_pmu_bhrb_to(u64 addr)
 	int ret;
 	__u64 target;

-	if (is_kernel_addr(addr))
-		return branch_target((unsigned int *)addr);
+	if (is_kernel_addr(addr)) {
+		if (probe_kernel_read(&instr, (void *)addr, sizeof(instr)))
+			return 0;
+
+		return branch_target(&instr);
+	}

 	/* Userspace: need copy instruction here then translate it */
 	pagefault_disable();

+ 16 - 1
arch/powerpc/perf/imc-pmu.c

@@ -309,6 +309,19 @@ static int ppc_nest_imc_cpu_offline(unsigned int cpu)
 	if (!cpumask_test_and_clear_cpu(cpu, &nest_imc_cpumask))
 		return 0;

+	/*
+	 * Check whether nest_imc is registered. We could end up here if the
+	 * cpuhotplug callback registration fails. i.e, callback invokes the
+	 * offline path for all successfully registered nodes. At this stage,
+	 * nest_imc pmu will not be registered and we should return here.
+	 *
+	 * We return with a zero since this is not an offline failure. And
+	 * cpuhp_setup_state() returns the actual failure reason to the caller,
+	 * which in turn will call the cleanup routine.
+	 */
+	if (!nest_pmus)
+		return 0;
+
 	/*
 	 * Now that this cpu is one of the designated,
 	 * find a next cpu a) which is online and b) in same chip.
@@ -1171,6 +1184,7 @@ static void imc_common_cpuhp_mem_free(struct imc_pmu *pmu_ptr)
 		if (nest_pmus == 1) {
 			cpuhp_remove_state(CPUHP_AP_PERF_POWERPC_NEST_IMC_ONLINE);
 			kfree(nest_imc_refc);
+			kfree(per_nest_pmu_arr);
 		}

 		if (nest_pmus > 0)
@@ -1195,7 +1209,6 @@ static void imc_common_cpuhp_mem_free(struct imc_pmu *pmu_ptr)
 		kfree(pmu_ptr->attr_groups[IMC_EVENT_ATTR]->attrs);
 	kfree(pmu_ptr->attr_groups[IMC_EVENT_ATTR]);
 	kfree(pmu_ptr);
-	kfree(per_nest_pmu_arr);
 	return;
 }

@@ -1309,6 +1322,8 @@ int init_imc_pmu(struct device_node *parent, struct imc_pmu *pmu_ptr, int pmu_id
 			ret = nest_pmu_cpumask_init();
 			if (ret) {
 				mutex_unlock(&nest_init_lock);
+				kfree(nest_imc_refc);
+				kfree(per_nest_pmu_arr);
 				goto err_free;
 			}
 		}

+ 3 - 1
arch/powerpc/sysdev/fsl_msi.c

@@ -354,6 +354,7 @@ static int fsl_of_msi_remove(struct platform_device *ofdev)
 }

 static struct lock_class_key fsl_msi_irq_class;
+static struct lock_class_key fsl_msi_irq_request_class;

 static int fsl_msi_setup_hwirq(struct fsl_msi *msi, struct platform_device *dev,
 			       int offset, int irq_index)
@@ -373,7 +374,8 @@ static int fsl_msi_setup_hwirq(struct fsl_msi *msi, struct platform_device *dev,
 		dev_err(&dev->dev, "No memory for MSI cascade data\n");
 		return -ENOMEM;
 	}
-	irq_set_lockdep_class(virt_msir, &fsl_msi_irq_class);
+	irq_set_lockdep_class(virt_msir, &fsl_msi_irq_class,
+			      &fsl_msi_irq_request_class);
 	cascade_data->index = offset;
 	cascade_data->msi_data = msi;
 	cascade_data->virq = virt_msir;

+ 19 - 0
arch/riscv/include/asm/barrier.h

@@ -38,6 +38,25 @@
 #define smp_rmb()	RISCV_FENCE(r,r)
 #define smp_wmb()	RISCV_FENCE(w,w)

+/*
+ * This is a very specific barrier: it's currently only used in two places in
+ * the kernel, both in the scheduler.  See include/linux/spinlock.h for the two
+ * orderings it guarantees, but the "critical section is RCsc" guarantee
+ * mandates a barrier on RISC-V.  The sequence looks like:
+ *
+ *    lr.aq lock
+ *    sc    lock <= LOCKED
+ *    smp_mb__after_spinlock()
+ *    // critical section
+ *    lr    lock
+ *    sc.rl lock <= UNLOCKED
+ *
+ * The AQ/RL pair provides a RCpc critical section, but there's not really any
+ * way we can take advantage of that here because the ordering is only enforced
+ * on that one lock.  Thus, we're just doing a full fence.
+ */
+#define smp_mb__after_spinlock()	RISCV_FENCE(rw,rw)
+
 #include <asm-generic/barrier.h>

 #endif /* __ASSEMBLY__ */

+ 0 - 11
arch/riscv/kernel/setup.c

@@ -38,10 +38,6 @@
 #include <asm/tlbflush.h>
 #include <asm/thread_info.h>

-#ifdef CONFIG_HVC_RISCV_SBI
-#include <asm/hvc_riscv_sbi.h>
-#endif
-
 #ifdef CONFIG_DUMMY_CONSOLE
 struct screen_info screen_info = {
 	.orig_video_lines	= 30,
@@ -212,13 +208,6 @@ static void __init setup_bootmem(void)

 void __init setup_arch(char **cmdline_p)
 {
-#if defined(CONFIG_HVC_RISCV_SBI)
-	if (likely(early_console == NULL)) {
-		early_console = &riscv_sbi_early_console_dev;
-		register_console(early_console);
-	}
-#endif
-
 #ifdef CONFIG_CMDLINE_BOOL
 #ifdef CONFIG_CMDLINE_OVERRIDE
 	strlcpy(boot_command_line, builtin_cmdline, COMMAND_LINE_SIZE);

+ 1 - 1
arch/riscv/kernel/sys_riscv.c

@@ -70,7 +70,7 @@ SYSCALL_DEFINE3(riscv_flush_icache, uintptr_t, start, uintptr_t, end,
 	bool local = (flags & SYS_RISCV_FLUSH_ICACHE_LOCAL) != 0;

 	/* Check the reserved flags. */
-	if (unlikely(flags & !SYS_RISCV_FLUSH_ICACHE_ALL))
+	if (unlikely(flags & ~SYS_RISCV_FLUSH_ICACHE_ALL))
 		return -EINVAL;

 	flush_icache_mm(mm, local);

+ 0 - 6
arch/s390/include/asm/pgtable.h

@@ -1264,12 +1264,6 @@ static inline pud_t pud_mkwrite(pud_t pud)
 	return pud;
 }

-#define pud_write pud_write
-static inline int pud_write(pud_t pud)
-{
-	return (pud_val(pud) & _REGION3_ENTRY_WRITE) != 0;
-}
-
 static inline pud_t pud_mkclean(pud_t pud)
 {
 	if (pud_large(pud)) {

+ 1 - 0
arch/s390/kernel/compat_linux.c

@@ -263,6 +263,7 @@ COMPAT_SYSCALL_DEFINE2(s390_setgroups16, int, gidsetsize, u16 __user *, grouplis
 		return retval;
 	}

+	groups_sort(group_info);
 	retval = set_current_groups(group_info);
 	put_group_info(group_info);


+ 5 - 6
arch/s390/net/bpf_jit_comp.c

@@ -55,8 +55,7 @@ struct bpf_jit {
 #define SEEN_LITERAL	8	/* code uses literals */
 #define SEEN_FUNC	16	/* calls C functions */
 #define SEEN_TAIL_CALL	32	/* code uses tail calls */
-#define SEEN_SKB_CHANGE	64	/* code changes skb data */
-#define SEEN_REG_AX	128	/* code uses constant blinding */
+#define SEEN_REG_AX	64	/* code uses constant blinding */
 #define SEEN_STACK	(SEEN_FUNC | SEEN_MEM | SEEN_SKB)

 /*
@@ -448,12 +447,12 @@ static void bpf_jit_prologue(struct bpf_jit *jit, u32 stack_depth)
 			EMIT6_DISP_LH(0xe3000000, 0x0024, REG_W1, REG_0,
 				      REG_15, 152);
 	}
-	if (jit->seen & SEEN_SKB)
+	if (jit->seen & SEEN_SKB) {
 		emit_load_skb_data_hlen(jit);
-	if (jit->seen & SEEN_SKB_CHANGE)
 		/* stg %b1,ST_OFF_SKBP(%r0,%r15) */
 		EMIT6_DISP_LH(0xe3000000, 0x0024, BPF_REG_1, REG_0, REG_15,
 			      STK_OFF_SKBP);
+	}
 }

 /*
@@ -983,8 +982,8 @@ static noinline int bpf_jit_insn(struct bpf_jit *jit, struct bpf_prog *fp, int i
 		EMIT2(0x0d00, REG_14, REG_W1);
 		/* lgr %b0,%r2: load return value into %b0 */
 		EMIT4(0xb9040000, BPF_REG_0, REG_2);
-		if (bpf_helper_changes_pkt_data((void *)func)) {
-			jit->seen |= SEEN_SKB_CHANGE;
+		if ((jit->seen & SEEN_SKB) &&
+		    bpf_helper_changes_pkt_data((void *)func)) {
 			/* lg %b1,ST_OFF_SKBP(%r15) */
 			EMIT6_DISP_LH(0xe3000000, 0x0004, BPF_REG_1, REG_0,
 				      REG_15, STK_OFF_SKBP);

+ 2 - 2
arch/sparc/lib/hweight.S

@@ -44,8 +44,8 @@ EXPORT_SYMBOL(__arch_hweight32)
 	.previous

 ENTRY(__arch_hweight64)
-	sethi	%hi(__sw_hweight16), %g1
-	jmpl	%g1 + %lo(__sw_hweight16), %g0
+	sethi	%hi(__sw_hweight64), %g1
+	jmpl	%g1 + %lo(__sw_hweight64), %g0
 	 nop
 ENDPROC(__arch_hweight64)
 EXPORT_SYMBOL(__arch_hweight64)

+ 1 - 1
arch/sparc/mm/fault_32.c

@@ -113,7 +113,7 @@ show_signal_msg(struct pt_regs *regs, int sig, int code,
 	if (!printk_ratelimit())
 		return;

-	printk("%s%s[%d]: segfault at %lx ip %p (rpc %p) sp %p error %x",
+	printk("%s%s[%d]: segfault at %lx ip %px (rpc %px) sp %px error %x",
 	       task_pid_nr(tsk) > 1 ? KERN_INFO : KERN_EMERG,
 	       tsk->comm, task_pid_nr(tsk), address,
 	       (void *)regs->pc, (void *)regs->u_regs[UREG_I7],

+ 1 - 1
arch/sparc/mm/fault_64.c

@@ -154,7 +154,7 @@ show_signal_msg(struct pt_regs *regs, int sig, int code,
 	if (!printk_ratelimit())
 		return;

-	printk("%s%s[%d]: segfault at %lx ip %p (rpc %p) sp %p error %x",
+	printk("%s%s[%d]: segfault at %lx ip %px (rpc %px) sp %px error %x",
 	       task_pid_nr(tsk) > 1 ? KERN_INFO : KERN_EMERG,
 	       tsk->comm, task_pid_nr(tsk), address,
 	       (void *)regs->tpc, (void *)regs->u_regs[UREG_I7],

+ 2 - 2
arch/sparc/mm/gup.c

@@ -75,7 +75,7 @@ static int gup_huge_pmd(pmd_t *pmdp, pmd_t pmd, unsigned long addr,
 	if (!(pmd_val(pmd) & _PAGE_VALID))
 		return 0;

-	if (!pmd_access_permitted(pmd, write))
+	if (write && !pmd_write(pmd))
 		return 0;

 	refs = 0;
@@ -114,7 +114,7 @@ static int gup_huge_pud(pud_t *pudp, pud_t pud, unsigned long addr,
 	if (!(pud_val(pud) & _PAGE_VALID))
 		return 0;

-	if (!pud_access_permitted(pud, write))
+	if (write && !pud_write(pud))
 		return 0;

 	refs = 0;

+ 4 - 2
arch/sparc/net/bpf_jit_comp_64.c

@@ -1245,14 +1245,16 @@ static int build_insn(const struct bpf_insn *insn, struct jit_ctx *ctx)
 		u8 *func = ((u8 *)__bpf_call_base) + imm;

 		ctx->saw_call = true;
+		if (ctx->saw_ld_abs_ind && bpf_helper_changes_pkt_data(func))
+			emit_reg_move(bpf2sparc[BPF_REG_1], L7, ctx);

 		emit_call((u32 *)func, ctx);
 		emit_nop(ctx);

 		emit_reg_move(O0, bpf2sparc[BPF_REG_0], ctx);

-		if (bpf_helper_changes_pkt_data(func) && ctx->saw_ld_abs_ind)
-			load_skb_regs(ctx, bpf2sparc[BPF_REG_6]);
+		if (ctx->saw_ld_abs_ind && bpf_helper_changes_pkt_data(func))
+			load_skb_regs(ctx, L7);
 		break;
 	}


+ 1 - 0
arch/um/include/asm/Kbuild

@@ -1,4 +1,5 @@
 generic-y += barrier.h
+generic-y += bpf_perf_event.h
 generic-y += bug.h
 generic-y += clkdev.h
 generic-y += current.h

+ 2 - 1
arch/um/include/asm/mmu_context.h

@@ -15,9 +15,10 @@ extern void uml_setup_stubs(struct mm_struct *mm);
 /*
  * Needed since we do not use the asm-generic/mm_hooks.h:
  */
-static inline void arch_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm)
+static inline int arch_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm)
 {
 	uml_setup_stubs(mm);
+	return 0;
 }
 extern void arch_exit_mmap(struct mm_struct *mm);
 static inline void arch_unmap(struct mm_struct *mm,

+ 1 - 1
arch/um/kernel/trap.c

@@ -150,7 +150,7 @@ static void show_segv_info(struct uml_pt_regs *regs)
 	if (!printk_ratelimit())
 		return;

-	printk("%s%s[%d]: segfault at %lx ip %p sp %p error %x",
+	printk("%s%s[%d]: segfault at %lx ip %px sp %px error %x",
 		task_pid_nr(tsk) > 1 ? KERN_INFO : KERN_EMERG,
 		tsk->comm, task_pid_nr(tsk), FAULT_ADDRESS(*fi),
 		(void *)UPT_IP(regs), (void *)UPT_SP(regs),

+ 3 - 2
arch/unicore32/include/asm/mmu_context.h

@@ -81,9 +81,10 @@ do { \
 	} \
 } while (0)

-static inline void arch_dup_mmap(struct mm_struct *oldmm,
-				 struct mm_struct *mm)
+static inline int arch_dup_mmap(struct mm_struct *oldmm,
+				struct mm_struct *mm)
 {
+	return 0;
 }

 static inline void arch_unmap(struct mm_struct *mm,

+ 2 - 1
arch/x86/Kconfig

@@ -926,7 +926,8 @@ config MAXSMP
 config NR_CPUS
 	int "Maximum number of CPUs" if SMP && !MAXSMP
 	range 2 8 if SMP && X86_32 && !X86_BIGSMP
-	range 2 512 if SMP && !MAXSMP && !CPUMASK_OFFSTACK
+	range 2 64 if SMP && X86_32 && X86_BIGSMP
+	range 2 512 if SMP && !MAXSMP && !CPUMASK_OFFSTACK && X86_64
 	range 2 8192 if SMP && !MAXSMP && CPUMASK_OFFSTACK && X86_64
 	default "1" if !SMP
 	default "8192" if MAXSMP

+ 1 - 0
arch/x86/Kconfig.debug

@@ -400,6 +400,7 @@ config UNWINDER_FRAME_POINTER
 config UNWINDER_GUESS
 	bool "Guess unwinder"
 	depends on EXPERT
+	depends on !STACKDEPOT
 	---help---
 	  This option enables the "guess" unwinder for unwinding kernel stack
 	  traces.  It scans the stack and reports every kernel text address it

+ 1 - 0
arch/x86/boot/compressed/Makefile

@@ -80,6 +80,7 @@ vmlinux-objs-$(CONFIG_RANDOMIZE_BASE) += $(obj)/kaslr.o
 ifdef CONFIG_X86_64
 	vmlinux-objs-$(CONFIG_RANDOMIZE_BASE) += $(obj)/pagetable.o
 	vmlinux-objs-y += $(obj)/mem_encrypt.o
+	vmlinux-objs-y += $(obj)/pgtable_64.o
 endif

 $(obj)/eboot.o: KBUILD_CFLAGS += -fshort-wchar -mno-red-zone

+ 12 - 4
arch/x86/boot/compressed/head_64.S

@@ -305,10 +305,18 @@ ENTRY(startup_64)
 	leaq	boot_stack_end(%rbx), %rsp

 #ifdef CONFIG_X86_5LEVEL
-	/* Check if 5-level paging has already enabled */
-	movq	%cr4, %rax
-	testl	$X86_CR4_LA57, %eax
-	jnz	lvl5
+	/*
+	 * Check if we need to enable 5-level paging.
+	 * RSI holds real mode data and need to be preserved across
+	 * a function call.
+	 */
+	pushq	%rsi
+	call	l5_paging_required
+	popq	%rsi
+
+	/* If l5_paging_required() returned zero, we're done here. */
+	cmpq	$0, %rax
+	je	lvl5

 	/*
 	 * At this point we are in long mode with 4-level paging enabled,

+ 16 - 0
arch/x86/boot/compressed/misc.c

@@ -169,6 +169,16 @@ void __puthex(unsigned long value)
 	}
 }

+static bool l5_supported(void)
+{
+	/* Check if leaf 7 is supported. */
+	if (native_cpuid_eax(0) < 7)
+		return 0;
+
+	/* Check if la57 is supported. */
+	return native_cpuid_ecx(7) & (1 << (X86_FEATURE_LA57 & 31));
+}
+
 #if CONFIG_X86_NEED_RELOCS
 static void handle_relocations(void *output, unsigned long output_len,
 			       unsigned long virt_addr)
@@ -362,6 +372,12 @@ asmlinkage __visible void *extract_kernel(void *rmode, memptr heap,
 	console_init();
 	debug_putstr("early console in extract_kernel\n");

+	if (IS_ENABLED(CONFIG_X86_5LEVEL) && !l5_supported()) {
+		error("This linux kernel as configured requires 5-level paging\n"
+			"This CPU does not support the required 'cr4.la57' feature\n"
+			"Unable to boot - please use a kernel appropriate for your CPU\n");
+	}
+
 	free_mem_ptr     = heap;	/* Heap */
 	free_mem_end_ptr = heap + BOOT_HEAP_SIZE;


+ 3 - 0
arch/x86/boot/compressed/pagetable.c

@@ -23,6 +23,9 @@
  */
 #undef CONFIG_AMD_MEM_ENCRYPT

+/* No PAGE_TABLE_ISOLATION support needed either: */
+#undef CONFIG_PAGE_TABLE_ISOLATION
+
 #include "misc.h"

 /* These actually do the work of building the kernel identity maps. */

+ 28 - 0
arch/x86/boot/compressed/pgtable_64.c

@@ -0,0 +1,28 @@
+#include <asm/processor.h>
+
+/*
+ * __force_order is used by special_insns.h asm code to force instruction
+ * serialization.
+ *
+ * It is not referenced from the code, but GCC < 5 with -fPIE would fail
+ * due to an undefined symbol. Define it to make these ancient GCCs work.
+ */
+unsigned long __force_order;
+
+int l5_paging_required(void)
+{
+	/* Check if leaf 7 is supported. */
+
+	if (native_cpuid_eax(0) < 7)
+		return 0;
+
+	/* Check if la57 is supported. */
+	if (!(native_cpuid_ecx(7) & (1 << (X86_FEATURE_LA57 & 31))))
+		return 0;
+
+	/* Check if 5-level paging has already been enabled. */
+	if (native_read_cr4() & X86_CR4_LA57)
+		return 0;
+
+	return 1;
+}

+ 19 - 13
arch/x86/boot/genimage.sh

@@ -44,9 +44,9 @@ FDINITRD=$6

 # Make sure the files actually exist
 verify "$FBZIMAGE"
-verify "$MTOOLSRC"

 genbzdisk() {
+	verify "$MTOOLSRC"
 	mformat a:
 	syslinux $FIMAGE
 	echo "$KCMDLINE" | mcopy - a:syslinux.cfg
@@ -57,6 +57,7 @@ genbzdisk() {
 }

 genfdimage144() {
+	verify "$MTOOLSRC"
 	dd if=/dev/zero of=$FIMAGE bs=1024 count=1440 2> /dev/null
 	mformat v:
 	syslinux $FIMAGE
@@ -68,6 +69,7 @@ genfdimage144() {
 }

 genfdimage288() {
+	verify "$MTOOLSRC"
 	dd if=/dev/zero of=$FIMAGE bs=1024 count=2880 2> /dev/null
 	mformat w:
 	syslinux $FIMAGE
@@ -78,39 +80,43 @@ genfdimage288() {
 	mcopy $FBZIMAGE w:linux
 }

-genisoimage() {
+geniso() {
 	tmp_dir=`dirname $FIMAGE`/isoimage
 	rm -rf $tmp_dir
 	mkdir $tmp_dir
-	for i in lib lib64 share end ; do
+	for i in lib lib64 share ; do
 		for j in syslinux ISOLINUX ; do
 			if [ -f /usr/$i/$j/isolinux.bin ] ; then
 				isolinux=/usr/$i/$j/isolinux.bin
-				cp $isolinux $tmp_dir
 			fi
 		done
 		for j in syslinux syslinux/modules/bios ; do
 			if [ -f /usr/$i/$j/ldlinux.c32 ]; then
 				ldlinux=/usr/$i/$j/ldlinux.c32
-				cp $ldlinux $tmp_dir
 			fi
 		done
 		if [ -n "$isolinux" -a -n "$ldlinux" ] ; then
 			break
 		fi
-		if [ $i = end -a -z "$isolinux" ] ; then
-			echo 'Need an isolinux.bin file, please install syslinux/isolinux.'
-			exit 1
-		fi
 	done
 	done
+	if [ -z "$isolinux" ] ; then
+		echo 'Need an isolinux.bin file, please install syslinux/isolinux.'
+		exit 1
+	fi
+	if [ -z "$ldlinux" ] ; then
+		echo 'Need an ldlinux.c32 file, please install syslinux/isolinux.'
+		exit 1
+	fi
+	cp $isolinux $tmp_dir
+	cp $ldlinux $tmp_dir
 	cp $FBZIMAGE $tmp_dir/linux
 	echo "$KCMDLINE" > $tmp_dir/isolinux.cfg
 	if [ -f "$FDINITRD" ] ; then
 		cp "$FDINITRD" $tmp_dir/initrd.img
 	fi
-	mkisofs -J -r -input-charset=utf-8 -quiet -o $FIMAGE -b isolinux.bin \
-		-c boot.cat -no-emul-boot -boot-load-size 4 -boot-info-table \
-		$tmp_dir
+	genisoimage -J -r -input-charset=utf-8 -quiet -o $FIMAGE \
+		-b isolinux.bin -c boot.cat -no-emul-boot -boot-load-size 4 \
+		-boot-info-table $tmp_dir
 	isohybrid $FIMAGE 2>/dev/null || true
 	rm -rf $tmp_dir
 }
@@ -119,6 +125,6 @@ case $1 in
 	bzdisk)     genbzdisk;;
 	fdimage144) genfdimage144;;
 	fdimage288) genfdimage288;;
-	isoimage)   genisoimage;;
+	isoimage)   geniso;;
 	*)          echo 'Unknown image format'; exit 1;
 esac

+ 0 - 7
arch/x86/crypto/salsa20_glue.c

@@ -59,13 +59,6 @@ static int encrypt(struct blkcipher_desc *desc,

 	salsa20_ivsetup(ctx, walk.iv);

-	if (likely(walk.nbytes == nbytes))
-	{
-		salsa20_encrypt_bytes(ctx, walk.src.virt.addr,
-				      walk.dst.virt.addr, nbytes);
-		return blkcipher_walk_done(desc, &walk, 0);
-	}
-
 	while (walk.nbytes >= 64) {
 		salsa20_encrypt_bytes(ctx, walk.src.virt.addr,
 				      walk.dst.virt.addr,

+ 145 - 0
arch/x86/entry/calling.h

@@ -1,6 +1,11 @@
 /* SPDX-License-Identifier: GPL-2.0 */
 #include <linux/jump_label.h>
 #include <asm/unwind_hints.h>
+#include <asm/cpufeatures.h>
+#include <asm/page_types.h>
+#include <asm/percpu.h>
+#include <asm/asm-offsets.h>
+#include <asm/processor-flags.h>

 /*

@@ -187,6 +192,146 @@ For 32-bit we have the following conventions - kernel is built with
 #endif
 .endm

+#ifdef CONFIG_PAGE_TABLE_ISOLATION
+
+/*
+ * PAGE_TABLE_ISOLATION PGDs are 8k.  Flip bit 12 to switch between the two
+ * halves:
+ */
+#define PTI_SWITCH_PGTABLES_MASK	(1<<PAGE_SHIFT)
+#define PTI_SWITCH_MASK		(PTI_SWITCH_PGTABLES_MASK|(1<<X86_CR3_PTI_SWITCH_BIT))
+
+.macro SET_NOFLUSH_BIT	reg:req
+	bts	$X86_CR3_PCID_NOFLUSH_BIT, \reg
+.endm
+
+.macro ADJUST_KERNEL_CR3 reg:req
+	ALTERNATIVE "", "SET_NOFLUSH_BIT \reg", X86_FEATURE_PCID
+	/* Clear PCID and "PAGE_TABLE_ISOLATION bit", point CR3 at kernel pagetables: */
+	andq    $(~PTI_SWITCH_MASK), \reg
+.endm
+
+.macro SWITCH_TO_KERNEL_CR3 scratch_reg:req
+	ALTERNATIVE "jmp .Lend_\@", "", X86_FEATURE_PTI
+	mov	%cr3, \scratch_reg
+	ADJUST_KERNEL_CR3 \scratch_reg
+	mov	\scratch_reg, %cr3
+.Lend_\@:
+.endm
+
+#define THIS_CPU_user_pcid_flush_mask   \
+	PER_CPU_VAR(cpu_tlbstate) + TLB_STATE_user_pcid_flush_mask
+
+.macro SWITCH_TO_USER_CR3_NOSTACK scratch_reg:req scratch_reg2:req
+	ALTERNATIVE "jmp .Lend_\@", "", X86_FEATURE_PTI
+	mov	%cr3, \scratch_reg
+
+	ALTERNATIVE "jmp .Lwrcr3_\@", "", X86_FEATURE_PCID
+
+	/*
+	 * Test if the ASID needs a flush.
+	 */
+	movq	\scratch_reg, \scratch_reg2
+	andq	$(0x7FF), \scratch_reg		/* mask ASID */
+	bt	\scratch_reg, THIS_CPU_user_pcid_flush_mask
+	jnc	.Lnoflush_\@
+
+	/* Flush needed, clear the bit */
+	btr	\scratch_reg, THIS_CPU_user_pcid_flush_mask
+	movq	\scratch_reg2, \scratch_reg
+	jmp	.Lwrcr3_\@
+
+.Lnoflush_\@:
+	movq	\scratch_reg2, \scratch_reg
+	SET_NOFLUSH_BIT \scratch_reg
+
+.Lwrcr3_\@:
+	/* Flip the PGD and ASID to the user version */
+	orq     $(PTI_SWITCH_MASK), \scratch_reg
+	mov	\scratch_reg, %cr3
+.Lend_\@:
+.endm
+
+.macro SWITCH_TO_USER_CR3_STACK	scratch_reg:req
+	pushq	%rax
+	SWITCH_TO_USER_CR3_NOSTACK scratch_reg=\scratch_reg scratch_reg2=%rax
+	popq	%rax
+.endm
+
+.macro SAVE_AND_SWITCH_TO_KERNEL_CR3 scratch_reg:req save_reg:req
+	ALTERNATIVE "jmp .Ldone_\@", "", X86_FEATURE_PTI
+	movq	%cr3, \scratch_reg
+	movq	\scratch_reg, \save_reg
+	/*
+	 * Is the "switch mask" all zero?  That means that both of
+	 * these are zero:
+	 *
+	 *	1. The user/kernel PCID bit, and
+	 *	2. The user/kernel "bit" that points CR3 to the
+	 *	   bottom half of the 8k PGD
+	 *
+	 * That indicates a kernel CR3 value, not a user CR3.
+	 */
+	testq	$(PTI_SWITCH_MASK), \scratch_reg
+	jz	.Ldone_\@
+
+	ADJUST_KERNEL_CR3 \scratch_reg
+	movq	\scratch_reg, %cr3
+
+.Ldone_\@:
+.endm
+
+.macro RESTORE_CR3 scratch_reg:req save_reg:req
+	ALTERNATIVE "jmp .Lend_\@", "", X86_FEATURE_PTI
+
+	ALTERNATIVE "jmp .Lwrcr3_\@", "", X86_FEATURE_PCID
+
+	/*
+	 * KERNEL pages can always resume with NOFLUSH as we do
+	 * explicit flushes.
+	 */
+	bt	$X86_CR3_PTI_SWITCH_BIT, \save_reg
+	jnc	.Lnoflush_\@
+
+	/*
+	 * Check if there's a pending flush for the user ASID we're
+	 * about to set.
+	 */
+	movq	\save_reg, \scratch_reg
+	andq	$(0x7FF), \scratch_reg
+	bt	\scratch_reg, THIS_CPU_user_pcid_flush_mask
+	jnc	.Lnoflush_\@
+
+	btr	\scratch_reg, THIS_CPU_user_pcid_flush_mask
+	jmp	.Lwrcr3_\@
+
+.Lnoflush_\@:
+	SET_NOFLUSH_BIT \save_reg
+
+.Lwrcr3_\@:
+	/*
+	 * The CR3 write could be avoided when not changing its value,
+	 * but would require a CR3 read *and* a scratch register.
+	 */
+	movq	\save_reg, %cr3
+.Lend_\@:
+.endm
+
+#else /* CONFIG_PAGE_TABLE_ISOLATION=n: */
+
+.macro SWITCH_TO_KERNEL_CR3 scratch_reg:req
+.endm
+.macro SWITCH_TO_USER_CR3_NOSTACK scratch_reg:req scratch_reg2:req
+.endm
+.macro SWITCH_TO_USER_CR3_STACK scratch_reg:req
+.endm
+.macro SAVE_AND_SWITCH_TO_KERNEL_CR3 scratch_reg:req save_reg:req
+.endm
+.macro RESTORE_CR3 scratch_reg:req save_reg:req
+.endm
+
+#endif
+
 #endif /* CONFIG_X86_64 */

 /*

+ 8 - 6
arch/x86/entry/entry_32.S

@@ -941,9 +941,10 @@ ENTRY(debug)
 	movl	%esp, %eax			# pt_regs pointer

 	/* Are we currently on the SYSENTER stack? */
-	PER_CPU(cpu_tss + CPU_TSS_SYSENTER_stack + SIZEOF_SYSENTER_stack, %ecx)
-	subl	%eax, %ecx	/* ecx = (end of SYSENTER_stack) - esp */
-	cmpl	$SIZEOF_SYSENTER_stack, %ecx
+	movl	PER_CPU_VAR(cpu_entry_area), %ecx
+	addl	$CPU_ENTRY_AREA_entry_stack + SIZEOF_entry_stack, %ecx
+	subl	%eax, %ecx	/* ecx = (end of entry_stack) - esp */
+	cmpl	$SIZEOF_entry_stack, %ecx
 	jb	.Ldebug_from_sysenter_stack

 	TRACE_IRQS_OFF
@@ -984,9 +985,10 @@ ENTRY(nmi)
 	movl	%esp, %eax			# pt_regs pointer

 	/* Are we currently on the SYSENTER stack? */
-	PER_CPU(cpu_tss + CPU_TSS_SYSENTER_stack + SIZEOF_SYSENTER_stack, %ecx)
-	subl	%eax, %ecx	/* ecx = (end of SYSENTER_stack) - esp */
-	cmpl	$SIZEOF_SYSENTER_stack, %ecx
+	movl	PER_CPU_VAR(cpu_entry_area), %ecx
+	addl	$CPU_ENTRY_AREA_entry_stack + SIZEOF_entry_stack, %ecx
+	subl	%eax, %ecx	/* ecx = (end of entry_stack) - esp */
+	cmpl	$SIZEOF_entry_stack, %ecx
 	jb	.Lnmi_from_sysenter_stack

 	/* Not on SYSENTER stack. */

+ 205 - 32
arch/x86/entry/entry_64.S

@@ -23,7 +23,6 @@
 #include <asm/segment.h>
 #include <asm/cache.h>
 #include <asm/errno.h>
-#include "calling.h"
 #include <asm/asm-offsets.h>
 #include <asm/msr.h>
 #include <asm/unistd.h>
@@ -40,6 +39,8 @@
 #include <asm/frame.h>
 #include <linux/err.h>

+#include "calling.h"
+
 .code64
 .section .entry.text, "ax"

@@ -140,6 +141,67 @@ END(native_usergs_sysret64)
  * with them due to bugs in both AMD and Intel CPUs.
  */

+	.pushsection .entry_trampoline, "ax"
+
+/*
+ * The code in here gets remapped into cpu_entry_area's trampoline.  This means
+ * that the assembler and linker have the wrong idea as to where this code
+ * lives (and, in fact, it's mapped more than once, so it's not even at a
+ * fixed address).  So we can't reference any symbols outside the entry
+ * trampoline and expect it to work.
+ *
+ * Instead, we carefully abuse %rip-relative addressing.
+ * _entry_trampoline(%rip) refers to the start of the remapped) entry
+ * trampoline.  We can thus find cpu_entry_area with this macro:
+ */
+
+#define CPU_ENTRY_AREA \
+	_entry_trampoline - CPU_ENTRY_AREA_entry_trampoline(%rip)
+
+/* The top word of the SYSENTER stack is hot and is usable as scratch space. */
+#define RSP_SCRATCH	CPU_ENTRY_AREA_entry_stack + \
+			SIZEOF_entry_stack - 8 + CPU_ENTRY_AREA
+
+ENTRY(entry_SYSCALL_64_trampoline)
+	UNWIND_HINT_EMPTY
+	swapgs
+
+	/* Stash the user RSP. */
+	movq	%rsp, RSP_SCRATCH
+
+	/* Note: using %rsp as a scratch reg. */
+	SWITCH_TO_KERNEL_CR3 scratch_reg=%rsp
+
+	/* Load the top of the task stack into RSP */
+	movq	CPU_ENTRY_AREA_tss + TSS_sp1 + CPU_ENTRY_AREA, %rsp
+
+	/* Start building the simulated IRET frame. */
+	pushq	$__USER_DS			/* pt_regs->ss */
+	pushq	RSP_SCRATCH			/* pt_regs->sp */
+	pushq	%r11				/* pt_regs->flags */
+	pushq	$__USER_CS			/* pt_regs->cs */
+	pushq	%rcx				/* pt_regs->ip */
+
+	/*
+	 * x86 lacks a near absolute jump, and we can't jump to the real
+	 * entry text with a relative jump.  We could push the target
+	 * address and then use retq, but this destroys the pipeline on
+	 * many CPUs (wasting over 20 cycles on Sandy Bridge).  Instead,
+	 * spill RDI and restore it in a second-stage trampoline.
+	 */
+	pushq	%rdi
+	movq	$entry_SYSCALL_64_stage2, %rdi
+	jmp	*%rdi
+END(entry_SYSCALL_64_trampoline)
+
+	.popsection
+
+ENTRY(entry_SYSCALL_64_stage2)
+	UNWIND_HINT_EMPTY
+	popq	%rdi
+	jmp	entry_SYSCALL_64_after_hwframe
+END(entry_SYSCALL_64_stage2)
+
 ENTRY(entry_SYSCALL_64)
 ENTRY(entry_SYSCALL_64)
 	UNWIND_HINT_EMPTY
 	UNWIND_HINT_EMPTY
 	/*
 	/*
@@ -149,6 +211,10 @@ ENTRY(entry_SYSCALL_64)
 	 */
 	 */
 
 
 	swapgs
 	swapgs
+	/*
+	 * This path is not taken when PAGE_TABLE_ISOLATION is disabled so it
+	 * is not required to switch CR3.
+	 */
 	movq	%rsp, PER_CPU_VAR(rsp_scratch)
 	movq	%rsp, PER_CPU_VAR(rsp_scratch)
 	movq	PER_CPU_VAR(cpu_current_top_of_stack), %rsp
 	movq	PER_CPU_VAR(cpu_current_top_of_stack), %rsp
 
 
@@ -330,8 +396,25 @@ syscall_return_via_sysret:
 	popq	%rsi	/* skip rcx */
 	popq	%rsi	/* skip rcx */
 	popq	%rdx
 	popq	%rdx
 	popq	%rsi
 	popq	%rsi
+
+	/*
+	 * Now all regs are restored except RSP and RDI.
+	 * Save old stack pointer and switch to trampoline stack.
+	 */
+	movq	%rsp, %rdi
+	movq	PER_CPU_VAR(cpu_tss_rw + TSS_sp0), %rsp
+
+	pushq	RSP-RDI(%rdi)	/* RSP */
+	pushq	(%rdi)		/* RDI */
+
+	/*
+	 * We are on the trampoline stack.  All regs except RDI are live.
+	 * We can do future final exit work right here.
+	 */
+	SWITCH_TO_USER_CR3_STACK scratch_reg=%rdi
+
 	popq	%rdi
 	popq	%rdi
-	movq	RSP-ORIG_RAX(%rsp), %rsp
+	popq	%rsp
 	USERGS_SYSRET64
 	USERGS_SYSRET64
 END(entry_SYSCALL_64)
 END(entry_SYSCALL_64)
 
 
@@ -466,12 +549,13 @@ END(irq_entries_start)
 
 
 .macro DEBUG_ENTRY_ASSERT_IRQS_OFF
 .macro DEBUG_ENTRY_ASSERT_IRQS_OFF
 #ifdef CONFIG_DEBUG_ENTRY
 #ifdef CONFIG_DEBUG_ENTRY
-	pushfq
-	testl $X86_EFLAGS_IF, (%rsp)
+	pushq %rax
+	SAVE_FLAGS(CLBR_RAX)
+	testl $X86_EFLAGS_IF, %eax
 	jz .Lokay_\@
 	jz .Lokay_\@
 	ud2
 	ud2
 .Lokay_\@:
 .Lokay_\@:
-	addq $8, %rsp
+	popq %rax
 #endif
 #endif
 .endm
 .endm
 
 
@@ -563,6 +647,13 @@ END(irq_entries_start)
 /* 0(%rsp): ~(interrupt number) */
 /* 0(%rsp): ~(interrupt number) */
 	.macro interrupt func
 	.macro interrupt func
 	cld
 	cld
+
+	testb	$3, CS-ORIG_RAX(%rsp)
+	jz	1f
+	SWAPGS
+	call	switch_to_thread_stack
+1:
+
 	ALLOC_PT_GPREGS_ON_STACK
 	ALLOC_PT_GPREGS_ON_STACK
 	SAVE_C_REGS
 	SAVE_C_REGS
 	SAVE_EXTRA_REGS
 	SAVE_EXTRA_REGS
@@ -572,12 +663,8 @@ END(irq_entries_start)
 	jz	1f
 	jz	1f
 
 
 	/*
 	/*
-	 * IRQ from user mode.  Switch to kernel gsbase and inform context
-	 * tracking that we're in kernel mode.
-	 */
-	SWAPGS
-
-	/*
+	 * IRQ from user mode.
+	 *
 	 * We need to tell lockdep that IRQs are off.  We can't do this until
 	 * We need to tell lockdep that IRQs are off.  We can't do this until
 	 * we fix gsbase, and we should do it before enter_from_user_mode
 	 * we fix gsbase, and we should do it before enter_from_user_mode
 	 * (which can take locks).  Since TRACE_IRQS_OFF idempotent,
 	 * (which can take locks).  Since TRACE_IRQS_OFF idempotent,
@@ -630,10 +717,43 @@ GLOBAL(swapgs_restore_regs_and_return_to_usermode)
 	ud2
 	ud2
 1:
 1:
 #endif
 #endif
-	SWAPGS
 	POP_EXTRA_REGS
 	POP_EXTRA_REGS
-	POP_C_REGS
-	addq	$8, %rsp	/* skip regs->orig_ax */
+	popq	%r11
+	popq	%r10
+	popq	%r9
+	popq	%r8
+	popq	%rax
+	popq	%rcx
+	popq	%rdx
+	popq	%rsi
+
+	/*
+	 * The stack is now user RDI, orig_ax, RIP, CS, EFLAGS, RSP, SS.
+	 * Save old stack pointer and switch to trampoline stack.
+	 */
+	movq	%rsp, %rdi
+	movq	PER_CPU_VAR(cpu_tss_rw + TSS_sp0), %rsp
+
+	/* Copy the IRET frame to the trampoline stack. */
+	pushq	6*8(%rdi)	/* SS */
+	pushq	5*8(%rdi)	/* RSP */
+	pushq	4*8(%rdi)	/* EFLAGS */
+	pushq	3*8(%rdi)	/* CS */
+	pushq	2*8(%rdi)	/* RIP */
+
+	/* Push user RDI on the trampoline stack. */
+	pushq	(%rdi)
+
+	/*
+	 * We are on the trampoline stack.  All regs except RDI are live.
+	 * We can do future final exit work right here.
+	 */
+
+	SWITCH_TO_USER_CR3_STACK scratch_reg=%rdi
+
+	/* Restore RDI. */
+	popq	%rdi
+	SWAPGS
 	INTERRUPT_RETURN
 	INTERRUPT_RETURN
 
 
 
 
@@ -713,7 +833,9 @@ native_irq_return_ldt:
 	 */
 	 */
 
 
 	pushq	%rdi				/* Stash user RDI */
 	pushq	%rdi				/* Stash user RDI */
-	SWAPGS
+	SWAPGS					/* to kernel GS */
+	SWITCH_TO_KERNEL_CR3 scratch_reg=%rdi	/* to kernel CR3 */
+
 	movq	PER_CPU_VAR(espfix_waddr), %rdi
 	movq	PER_CPU_VAR(espfix_waddr), %rdi
 	movq	%rax, (0*8)(%rdi)		/* user RAX */
 	movq	%rax, (0*8)(%rdi)		/* user RAX */
 	movq	(1*8)(%rsp), %rax		/* user RIP */
 	movq	(1*8)(%rsp), %rax		/* user RIP */
@@ -729,7 +851,6 @@ native_irq_return_ldt:
 	/* Now RAX == RSP. */
 	/* Now RAX == RSP. */
 
 
 	andl	$0xffff0000, %eax		/* RAX = (RSP & 0xffff0000) */
 	andl	$0xffff0000, %eax		/* RAX = (RSP & 0xffff0000) */
-	popq	%rdi				/* Restore user RDI */
 
 
 	/*
 	/*
 	 * espfix_stack[31:16] == 0.  The page tables are set up such that
 	 * espfix_stack[31:16] == 0.  The page tables are set up such that
@@ -740,7 +861,11 @@ native_irq_return_ldt:
 	 * still points to an RO alias of the ESPFIX stack.
 	 * still points to an RO alias of the ESPFIX stack.
 	 */
 	 */
 	orq	PER_CPU_VAR(espfix_stack), %rax
 	orq	PER_CPU_VAR(espfix_stack), %rax
-	SWAPGS
+
+	SWITCH_TO_USER_CR3_STACK scratch_reg=%rdi
+	SWAPGS					/* to user GS */
+	popq	%rdi				/* Restore user RDI */
+
 	movq	%rax, %rsp
 	movq	%rax, %rsp
 	UNWIND_HINT_IRET_REGS offset=8
 	UNWIND_HINT_IRET_REGS offset=8
 
 
@@ -829,7 +954,35 @@ apicinterrupt IRQ_WORK_VECTOR			irq_work_interrupt		smp_irq_work_interrupt
 /*
 /*
  * Exception entry points.
  * Exception entry points.
  */
  */
-#define CPU_TSS_IST(x) PER_CPU_VAR(cpu_tss) + (TSS_ist + ((x) - 1) * 8)
+#define CPU_TSS_IST(x) PER_CPU_VAR(cpu_tss_rw) + (TSS_ist + ((x) - 1) * 8)
+
+/*
+ * Switch to the thread stack.  This is called with the IRET frame and
+ * orig_ax on the stack.  (That is, RDI..R12 are not on the stack and
+ * space has not been allocated for them.)
+ */
+ENTRY(switch_to_thread_stack)
+	UNWIND_HINT_FUNC
+
+	pushq	%rdi
+	/* Need to switch before accessing the thread stack. */
+	SWITCH_TO_KERNEL_CR3 scratch_reg=%rdi
+	movq	%rsp, %rdi
+	movq	PER_CPU_VAR(cpu_current_top_of_stack), %rsp
+	UNWIND_HINT sp_offset=16 sp_reg=ORC_REG_DI
+
+	pushq	7*8(%rdi)		/* regs->ss */
+	pushq	6*8(%rdi)		/* regs->rsp */
+	pushq	5*8(%rdi)		/* regs->eflags */
+	pushq	4*8(%rdi)		/* regs->cs */
+	pushq	3*8(%rdi)		/* regs->ip */
+	pushq	2*8(%rdi)		/* regs->orig_ax */
+	pushq	8(%rdi)			/* return address */
+	UNWIND_HINT_FUNC
+
+	movq	(%rdi), %rdi
+	ret
+END(switch_to_thread_stack)
 
 
 .macro idtentry sym do_sym has_error_code:req paranoid=0 shift_ist=-1
 .macro idtentry sym do_sym has_error_code:req paranoid=0 shift_ist=-1
 ENTRY(\sym)
 ENTRY(\sym)
@@ -848,11 +1001,12 @@ ENTRY(\sym)
 
 
 	ALLOC_PT_GPREGS_ON_STACK
 	ALLOC_PT_GPREGS_ON_STACK
 
 
-	.if \paranoid
-	.if \paranoid == 1
+	.if \paranoid < 2
 	testb	$3, CS(%rsp)			/* If coming from userspace, switch stacks */
 	testb	$3, CS(%rsp)			/* If coming from userspace, switch stacks */
-	jnz	1f
+	jnz	.Lfrom_usermode_switch_stack_\@
 	.endif
 	.endif
+
+	.if \paranoid
 	call	paranoid_entry
 	call	paranoid_entry
 	.else
 	.else
 	call	error_entry
 	call	error_entry
@@ -894,20 +1048,15 @@ ENTRY(\sym)
 	jmp	error_exit
 	jmp	error_exit
 	.endif
 	.endif
 
 
-	.if \paranoid == 1
+	.if \paranoid < 2
 	/*
 	/*
-	 * Paranoid entry from userspace.  Switch stacks and treat it
+	 * Entry from userspace.  Switch stacks and treat it
 	 * as a normal entry.  This means that paranoid handlers
 	 * as a normal entry.  This means that paranoid handlers
 	 * run in real process context if user_mode(regs).
 	 * run in real process context if user_mode(regs).
 	 */
 	 */
-1:
+.Lfrom_usermode_switch_stack_\@:
 	call	error_entry
 	call	error_entry
 
 
-
-	movq	%rsp, %rdi			/* pt_regs pointer */
-	call	sync_regs
-	movq	%rax, %rsp			/* switch stack */
-
 	movq	%rsp, %rdi			/* pt_regs pointer */
 	movq	%rsp, %rdi			/* pt_regs pointer */
 
 
 	.if \has_error_code
 	.if \has_error_code
@@ -1119,7 +1268,11 @@ ENTRY(paranoid_entry)
 	js	1f				/* negative -> in kernel */
 	js	1f				/* negative -> in kernel */
 	SWAPGS
 	SWAPGS
 	xorl	%ebx, %ebx
 	xorl	%ebx, %ebx
-1:	ret
+
+1:
+	SAVE_AND_SWITCH_TO_KERNEL_CR3 scratch_reg=%rax save_reg=%r14
+
+	ret
 END(paranoid_entry)
 END(paranoid_entry)
 
 
 /*
 /*
@@ -1141,6 +1294,7 @@ ENTRY(paranoid_exit)
 	testl	%ebx, %ebx			/* swapgs needed? */
 	testl	%ebx, %ebx			/* swapgs needed? */
 	jnz	.Lparanoid_exit_no_swapgs
 	jnz	.Lparanoid_exit_no_swapgs
 	TRACE_IRQS_IRETQ
 	TRACE_IRQS_IRETQ
+	RESTORE_CR3	scratch_reg=%rbx save_reg=%r14
 	SWAPGS_UNSAFE_STACK
 	SWAPGS_UNSAFE_STACK
 	jmp	.Lparanoid_exit_restore
 	jmp	.Lparanoid_exit_restore
 .Lparanoid_exit_no_swapgs:
 .Lparanoid_exit_no_swapgs:
@@ -1168,8 +1322,18 @@ ENTRY(error_entry)
 	 * from user mode due to an IRET fault.
 	 * from user mode due to an IRET fault.
 	 */
 	 */
 	SWAPGS
 	SWAPGS
+	/* We have user CR3.  Change to kernel CR3. */
+	SWITCH_TO_KERNEL_CR3 scratch_reg=%rax
 
 
 .Lerror_entry_from_usermode_after_swapgs:
 .Lerror_entry_from_usermode_after_swapgs:
+	/* Put us onto the real thread stack. */
+	popq	%r12				/* save return addr in %12 */
+	movq	%rsp, %rdi			/* arg0 = pt_regs pointer */
+	call	sync_regs
+	movq	%rax, %rsp			/* switch stack */
+	ENCODE_FRAME_POINTER
+	pushq	%r12
+
 	/*
 	/*
 	 * We need to tell lockdep that IRQs are off.  We can't do this until
 	 * We need to tell lockdep that IRQs are off.  We can't do this until
 	 * we fix gsbase, and we should do it before enter_from_user_mode
 	 * we fix gsbase, and we should do it before enter_from_user_mode
@@ -1206,6 +1370,7 @@ ENTRY(error_entry)
 	 * .Lgs_change's error handler with kernel gsbase.
 	 * .Lgs_change's error handler with kernel gsbase.
 	 */
 	 */
 	SWAPGS
 	SWAPGS
+	SWITCH_TO_KERNEL_CR3 scratch_reg=%rax
 	jmp .Lerror_entry_done
 	jmp .Lerror_entry_done
 
 
 .Lbstep_iret:
 .Lbstep_iret:
@@ -1215,10 +1380,11 @@ ENTRY(error_entry)
 
 
 .Lerror_bad_iret:
 .Lerror_bad_iret:
 	/*
 	/*
-	 * We came from an IRET to user mode, so we have user gsbase.
-	 * Switch to kernel gsbase:
+	 * We came from an IRET to user mode, so we have user
+	 * gsbase and CR3.  Switch to kernel gsbase and CR3:
 	 */
 	 */
 	SWAPGS
 	SWAPGS
+	SWITCH_TO_KERNEL_CR3 scratch_reg=%rax
 
 
 	/*
 	/*
 	 * Pretend that the exception came from user mode: set up pt_regs
 	 * Pretend that the exception came from user mode: set up pt_regs
@@ -1250,6 +1416,10 @@ END(error_exit)
 /*
 /*
  * Runs on exception stack.  Xen PV does not go through this path at all,
  * Runs on exception stack.  Xen PV does not go through this path at all,
  * so we can use real assembly here.
  * so we can use real assembly here.
+ *
+ * Registers:
+ *	%r14: Used to save/restore the CR3 of the interrupted context
+ *	      when PAGE_TABLE_ISOLATION is in use.  Do not clobber.
  */
  */
 ENTRY(nmi)
 ENTRY(nmi)
 	UNWIND_HINT_IRET_REGS
 	UNWIND_HINT_IRET_REGS
@@ -1313,6 +1483,7 @@ ENTRY(nmi)
 
 
 	swapgs
 	swapgs
 	cld
 	cld
+	SWITCH_TO_KERNEL_CR3 scratch_reg=%rdx
 	movq	%rsp, %rdx
 	movq	%rsp, %rdx
 	movq	PER_CPU_VAR(cpu_current_top_of_stack), %rsp
 	movq	PER_CPU_VAR(cpu_current_top_of_stack), %rsp
 	UNWIND_HINT_IRET_REGS base=%rdx offset=8
 	UNWIND_HINT_IRET_REGS base=%rdx offset=8
@@ -1565,6 +1736,8 @@ end_repeat_nmi:
 	movq	$-1, %rsi
 	movq	$-1, %rsi
 	call	do_nmi
 	call	do_nmi
 
 
+	RESTORE_CR3 scratch_reg=%r15 save_reg=%r14
+
 	testl	%ebx, %ebx			/* swapgs needed? */
 	testl	%ebx, %ebx			/* swapgs needed? */
 	jnz	nmi_restore
 	jnz	nmi_restore
 nmi_swapgs:
 nmi_swapgs:

+ 28 - 3
arch/x86/entry/entry_64_compat.S

@@ -48,7 +48,11 @@
  */
  */
 ENTRY(entry_SYSENTER_compat)
 ENTRY(entry_SYSENTER_compat)
 	/* Interrupts are off on entry. */
 	/* Interrupts are off on entry. */
-	SWAPGS_UNSAFE_STACK
+	SWAPGS
+
+	/* We are about to clobber %rsp anyway, clobbering here is OK */
+	SWITCH_TO_KERNEL_CR3 scratch_reg=%rsp
+
 	movq	PER_CPU_VAR(cpu_current_top_of_stack), %rsp
 	movq	PER_CPU_VAR(cpu_current_top_of_stack), %rsp
 
 
 	/*
 	/*
@@ -215,6 +219,12 @@ GLOBAL(entry_SYSCALL_compat_after_hwframe)
 	pushq   $0			/* pt_regs->r14 = 0 */
 	pushq   $0			/* pt_regs->r14 = 0 */
 	pushq   $0			/* pt_regs->r15 = 0 */
 	pushq   $0			/* pt_regs->r15 = 0 */
 
 
+	/*
+	 * We just saved %rdi so it is safe to clobber.  It is not
+	 * preserved during the C calls inside TRACE_IRQS_OFF anyway.
+	 */
+	SWITCH_TO_KERNEL_CR3 scratch_reg=%rdi
+
 	/*
 	/*
 	 * User mode is traced as though IRQs are on, and SYSENTER
 	 * User mode is traced as though IRQs are on, and SYSENTER
 	 * turned them off.
 	 * turned them off.
@@ -256,10 +266,22 @@ sysret32_from_system_call:
 	 * when the system call started, which is already known to user
 	 * when the system call started, which is already known to user
 	 * code.  We zero R8-R10 to avoid info leaks.
 	 * code.  We zero R8-R10 to avoid info leaks.
          */
          */
+	movq	RSP-ORIG_RAX(%rsp), %rsp
+
+	/*
+	 * The original userspace %rsp (RSP-ORIG_RAX(%rsp)) is stored
+	 * on the process stack which is not mapped to userspace and
+	 * not readable after we SWITCH_TO_USER_CR3.  Delay the CR3
+	 * switch until after after the last reference to the process
+	 * stack.
+	 *
+	 * %r8/%r9 are zeroed before the sysret, thus safe to clobber.
+	 */
+	SWITCH_TO_USER_CR3_NOSTACK scratch_reg=%r8 scratch_reg2=%r9
+
 	xorq	%r8, %r8
 	xorq	%r8, %r8
 	xorq	%r9, %r9
 	xorq	%r9, %r9
 	xorq	%r10, %r10
 	xorq	%r10, %r10
-	movq	RSP-ORIG_RAX(%rsp), %rsp
 	swapgs
 	swapgs
 	sysretl
 	sysretl
 END(entry_SYSCALL_compat)
 END(entry_SYSCALL_compat)
@@ -306,8 +328,11 @@ ENTRY(entry_INT80_compat)
 	 */
 	 */
 	movl	%eax, %eax
 	movl	%eax, %eax
 
 
-	/* Construct struct pt_regs on stack (iret frame is already on stack) */
 	pushq	%rax			/* pt_regs->orig_ax */
 	pushq	%rax			/* pt_regs->orig_ax */
+
+	/* switch to thread stack expects orig_ax to be pushed */
+	call	switch_to_thread_stack
+
 	pushq	%rdi			/* pt_regs->di */
 	pushq	%rdi			/* pt_regs->di */
 	pushq	%rsi			/* pt_regs->si */
 	pushq	%rsi			/* pt_regs->si */
 	pushq	%rdx			/* pt_regs->dx */
 	pushq	%rdx			/* pt_regs->dx */

+ 37 - 1
arch/x86/entry/vsyscall/vsyscall_64.c

@@ -37,6 +37,7 @@
 #include <asm/unistd.h>
 #include <asm/unistd.h>
 #include <asm/fixmap.h>
 #include <asm/fixmap.h>
 #include <asm/traps.h>
 #include <asm/traps.h>
+#include <asm/paravirt.h>
 
 
 #define CREATE_TRACE_POINTS
 #define CREATE_TRACE_POINTS
 #include "vsyscall_trace.h"
 #include "vsyscall_trace.h"
@@ -138,6 +139,10 @@ bool emulate_vsyscall(struct pt_regs *regs, unsigned long address)
 
 
 	WARN_ON_ONCE(address != regs->ip);
 	WARN_ON_ONCE(address != regs->ip);
 
 
+	/* This should be unreachable in NATIVE mode. */
+	if (WARN_ON(vsyscall_mode == NATIVE))
+		return false;
+
 	if (vsyscall_mode == NONE) {
 	if (vsyscall_mode == NONE) {
 		warn_bad_vsyscall(KERN_INFO, regs,
 		warn_bad_vsyscall(KERN_INFO, regs,
 				  "vsyscall attempted with vsyscall=none");
 				  "vsyscall attempted with vsyscall=none");
@@ -329,16 +334,47 @@ int in_gate_area_no_mm(unsigned long addr)
 	return vsyscall_mode != NONE && (addr & PAGE_MASK) == VSYSCALL_ADDR;
 	return vsyscall_mode != NONE && (addr & PAGE_MASK) == VSYSCALL_ADDR;
 }
 }
 
 
+/*
+ * The VSYSCALL page is the only user-accessible page in the kernel address
+ * range.  Normally, the kernel page tables can have _PAGE_USER clear, but
+ * the tables covering VSYSCALL_ADDR need _PAGE_USER set if vsyscalls
+ * are enabled.
+ *
+ * Some day we may create a "minimal" vsyscall mode in which we emulate
+ * vsyscalls but leave the page not present.  If so, we skip calling
+ * this.
+ */
+void __init set_vsyscall_pgtable_user_bits(pgd_t *root)
+{
+	pgd_t *pgd;
+	p4d_t *p4d;
+	pud_t *pud;
+	pmd_t *pmd;
+
+	pgd = pgd_offset_pgd(root, VSYSCALL_ADDR);
+	set_pgd(pgd, __pgd(pgd_val(*pgd) | _PAGE_USER));
+	p4d = p4d_offset(pgd, VSYSCALL_ADDR);
+#if CONFIG_PGTABLE_LEVELS >= 5
+	p4d->p4d |= _PAGE_USER;
+#endif
+	pud = pud_offset(p4d, VSYSCALL_ADDR);
+	set_pud(pud, __pud(pud_val(*pud) | _PAGE_USER));
+	pmd = pmd_offset(pud, VSYSCALL_ADDR);
+	set_pmd(pmd, __pmd(pmd_val(*pmd) | _PAGE_USER));
+}
+
 void __init map_vsyscall(void)
 void __init map_vsyscall(void)
 {
 {
 	extern char __vsyscall_page;
 	extern char __vsyscall_page;
 	unsigned long physaddr_vsyscall = __pa_symbol(&__vsyscall_page);
 	unsigned long physaddr_vsyscall = __pa_symbol(&__vsyscall_page);
 
 
-	if (vsyscall_mode != NONE)
+	if (vsyscall_mode != NONE) {
 		__set_fixmap(VSYSCALL_PAGE, physaddr_vsyscall,
 		__set_fixmap(VSYSCALL_PAGE, physaddr_vsyscall,
 			     vsyscall_mode == NATIVE
 			     vsyscall_mode == NATIVE
 			     ? PAGE_KERNEL_VSYSCALL
 			     ? PAGE_KERNEL_VSYSCALL
 			     : PAGE_KERNEL_VVAR);
 			     : PAGE_KERNEL_VVAR);
+		set_vsyscall_pgtable_user_bits(swapper_pg_dir);
+	}
 
 
 	BUILD_BUG_ON((unsigned long)__fix_to_virt(VSYSCALL_PAGE) !=
 	BUILD_BUG_ON((unsigned long)__fix_to_virt(VSYSCALL_PAGE) !=
 		     (unsigned long)VSYSCALL_ADDR);
 		     (unsigned long)VSYSCALL_ADDR);

+ 4 - 1
arch/x86/events/intel/core.c

@@ -3847,6 +3847,8 @@ static struct attribute *intel_pmu_attrs[] = {
 
 
 __init int intel_pmu_init(void)
 __init int intel_pmu_init(void)
 {
 {
+	struct attribute **extra_attr = NULL;
+	struct attribute **to_free = NULL;
 	union cpuid10_edx edx;
 	union cpuid10_edx edx;
 	union cpuid10_eax eax;
 	union cpuid10_eax eax;
 	union cpuid10_ebx ebx;
 	union cpuid10_ebx ebx;
@@ -3854,7 +3856,6 @@ __init int intel_pmu_init(void)
 	unsigned int unused;
 	unsigned int unused;
 	struct extra_reg *er;
 	struct extra_reg *er;
 	int version, i;
 	int version, i;
-	struct attribute **extra_attr = NULL;
 	char *name;
 	char *name;
 
 
 	if (!cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON)) {
 	if (!cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON)) {
@@ -4294,6 +4295,7 @@ __init int intel_pmu_init(void)
 		extra_attr = boot_cpu_has(X86_FEATURE_RTM) ?
 		extra_attr = boot_cpu_has(X86_FEATURE_RTM) ?
 			hsw_format_attr : nhm_format_attr;
 			hsw_format_attr : nhm_format_attr;
 		extra_attr = merge_attr(extra_attr, skl_format_attr);
 		extra_attr = merge_attr(extra_attr, skl_format_attr);
+		to_free = extra_attr;
 		x86_pmu.cpu_events = get_hsw_events_attrs();
 		x86_pmu.cpu_events = get_hsw_events_attrs();
 		intel_pmu_pebs_data_source_skl(
 		intel_pmu_pebs_data_source_skl(
 			boot_cpu_data.x86_model == INTEL_FAM6_SKYLAKE_X);
 			boot_cpu_data.x86_model == INTEL_FAM6_SKYLAKE_X);
@@ -4401,6 +4403,7 @@ __init int intel_pmu_init(void)
 		pr_cont("full-width counters, ");
 		pr_cont("full-width counters, ");
 	}
 	}
 
 
+	kfree(to_free);
 	return 0;
 	return 0;
 }
 }
 
 

+ 83 - 47
arch/x86/events/intel/ds.c

@@ -3,16 +3,18 @@
 #include <linux/types.h>
 #include <linux/types.h>
 #include <linux/slab.h>
 #include <linux/slab.h>
 
 
+#include <asm/cpu_entry_area.h>
 #include <asm/perf_event.h>
 #include <asm/perf_event.h>
 #include <asm/insn.h>
 #include <asm/insn.h>
 
 
 #include "../perf_event.h"
 #include "../perf_event.h"
 
 
+/* Waste a full page so it can be mapped into the cpu_entry_area */
+DEFINE_PER_CPU_PAGE_ALIGNED(struct debug_store, cpu_debug_store);
+
 /* The size of a BTS record in bytes: */
 /* The size of a BTS record in bytes: */
 #define BTS_RECORD_SIZE		24
 #define BTS_RECORD_SIZE		24
 
 
-#define BTS_BUFFER_SIZE		(PAGE_SIZE << 4)
-#define PEBS_BUFFER_SIZE	(PAGE_SIZE << 4)
 #define PEBS_FIXUP_SIZE		PAGE_SIZE
 #define PEBS_FIXUP_SIZE		PAGE_SIZE
 
 
 /*
 /*
@@ -279,17 +281,52 @@ void fini_debug_store_on_cpu(int cpu)
 
 
 static DEFINE_PER_CPU(void *, insn_buffer);
 static DEFINE_PER_CPU(void *, insn_buffer);
 
 
-static int alloc_pebs_buffer(int cpu)
+static void ds_update_cea(void *cea, void *addr, size_t size, pgprot_t prot)
 {
 {
-	struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds;
+	phys_addr_t pa;
+	size_t msz = 0;
+
+	pa = virt_to_phys(addr);
+	for (; msz < size; msz += PAGE_SIZE, pa += PAGE_SIZE, cea += PAGE_SIZE)
+		cea_set_pte(cea, pa, prot);
+}
+
+static void ds_clear_cea(void *cea, size_t size)
+{
+	size_t msz = 0;
+
+	for (; msz < size; msz += PAGE_SIZE, cea += PAGE_SIZE)
+		cea_set_pte(cea, 0, PAGE_NONE);
+}
+
+static void *dsalloc_pages(size_t size, gfp_t flags, int cpu)
+{
+	unsigned int order = get_order(size);
 	int node = cpu_to_node(cpu);
 	int node = cpu_to_node(cpu);
-	int max;
-	void *buffer, *ibuffer;
+	struct page *page;
+
+	page = __alloc_pages_node(node, flags | __GFP_ZERO, order);
+	return page ? page_address(page) : NULL;
+}
+
+static void dsfree_pages(const void *buffer, size_t size)
+{
+	if (buffer)
+		free_pages((unsigned long)buffer, get_order(size));
+}
+
+static int alloc_pebs_buffer(int cpu)
+{
+	struct cpu_hw_events *hwev = per_cpu_ptr(&cpu_hw_events, cpu);
+	struct debug_store *ds = hwev->ds;
+	size_t bsiz = x86_pmu.pebs_buffer_size;
+	int max, node = cpu_to_node(cpu);
+	void *buffer, *ibuffer, *cea;
 
 
 	if (!x86_pmu.pebs)
 	if (!x86_pmu.pebs)
 		return 0;
 		return 0;
 
 
-	buffer = kzalloc_node(x86_pmu.pebs_buffer_size, GFP_KERNEL, node);
+	buffer = dsalloc_pages(bsiz, GFP_KERNEL, cpu);
 	if (unlikely(!buffer))
 	if (unlikely(!buffer))
 		return -ENOMEM;
 		return -ENOMEM;
 
 
@@ -300,25 +337,27 @@ static int alloc_pebs_buffer(int cpu)
 	if (x86_pmu.intel_cap.pebs_format < 2) {
 	if (x86_pmu.intel_cap.pebs_format < 2) {
 		ibuffer = kzalloc_node(PEBS_FIXUP_SIZE, GFP_KERNEL, node);
 		ibuffer = kzalloc_node(PEBS_FIXUP_SIZE, GFP_KERNEL, node);
 		if (!ibuffer) {
 		if (!ibuffer) {
-			kfree(buffer);
+			dsfree_pages(buffer, bsiz);
 			return -ENOMEM;
 			return -ENOMEM;
 		}
 		}
 		per_cpu(insn_buffer, cpu) = ibuffer;
 		per_cpu(insn_buffer, cpu) = ibuffer;
 	}
 	}
-
-	max = x86_pmu.pebs_buffer_size / x86_pmu.pebs_record_size;
-
-	ds->pebs_buffer_base = (u64)(unsigned long)buffer;
+	hwev->ds_pebs_vaddr = buffer;
+	/* Update the cpu entry area mapping */
+	cea = &get_cpu_entry_area(cpu)->cpu_debug_buffers.pebs_buffer;
+	ds->pebs_buffer_base = (unsigned long) cea;
+	ds_update_cea(cea, buffer, bsiz, PAGE_KERNEL);
 	ds->pebs_index = ds->pebs_buffer_base;
 	ds->pebs_index = ds->pebs_buffer_base;
-	ds->pebs_absolute_maximum = ds->pebs_buffer_base +
-		max * x86_pmu.pebs_record_size;
-
+	max = x86_pmu.pebs_record_size * (bsiz / x86_pmu.pebs_record_size);
+	ds->pebs_absolute_maximum = ds->pebs_buffer_base + max;
 	return 0;
 	return 0;
 }
 }
 
 
 static void release_pebs_buffer(int cpu)
 static void release_pebs_buffer(int cpu)
 {
 {
-	struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds;
+	struct cpu_hw_events *hwev = per_cpu_ptr(&cpu_hw_events, cpu);
+	struct debug_store *ds = hwev->ds;
+	void *cea;
 
 
 	if (!ds || !x86_pmu.pebs)
 	if (!ds || !x86_pmu.pebs)
 		return;
 		return;
@@ -326,73 +365,70 @@ static void release_pebs_buffer(int cpu)
 	kfree(per_cpu(insn_buffer, cpu));
 	kfree(per_cpu(insn_buffer, cpu));
 	per_cpu(insn_buffer, cpu) = NULL;
 	per_cpu(insn_buffer, cpu) = NULL;
 
 
-	kfree((void *)(unsigned long)ds->pebs_buffer_base);
+	/* Clear the fixmap */
+	cea = &get_cpu_entry_area(cpu)->cpu_debug_buffers.pebs_buffer;
+	ds_clear_cea(cea, x86_pmu.pebs_buffer_size);
 	ds->pebs_buffer_base = 0;
 	ds->pebs_buffer_base = 0;
+	dsfree_pages(hwev->ds_pebs_vaddr, x86_pmu.pebs_buffer_size);
+	hwev->ds_pebs_vaddr = NULL;
 }
 }
 
 
 static int alloc_bts_buffer(int cpu)
 static int alloc_bts_buffer(int cpu)
 {
 {
-	struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds;
-	int node = cpu_to_node(cpu);
-	int max, thresh;
-	void *buffer;
+	struct cpu_hw_events *hwev = per_cpu_ptr(&cpu_hw_events, cpu);
+	struct debug_store *ds = hwev->ds;
+	void *buffer, *cea;
+	int max;
 
 
 	if (!x86_pmu.bts)
 	if (!x86_pmu.bts)
 		return 0;
 		return 0;
 
 
-	buffer = kzalloc_node(BTS_BUFFER_SIZE, GFP_KERNEL | __GFP_NOWARN, node);
+	buffer = dsalloc_pages(BTS_BUFFER_SIZE, GFP_KERNEL | __GFP_NOWARN, cpu);
 	if (unlikely(!buffer)) {
 	if (unlikely(!buffer)) {
 		WARN_ONCE(1, "%s: BTS buffer allocation failure\n", __func__);
 		WARN_ONCE(1, "%s: BTS buffer allocation failure\n", __func__);
 		return -ENOMEM;
 		return -ENOMEM;
 	}
 	}
-
-	max = BTS_BUFFER_SIZE / BTS_RECORD_SIZE;
-	thresh = max / 16;
-
-	ds->bts_buffer_base = (u64)(unsigned long)buffer;
+	hwev->ds_bts_vaddr = buffer;
+	/* Update the fixmap */
+	cea = &get_cpu_entry_area(cpu)->cpu_debug_buffers.bts_buffer;
+	ds->bts_buffer_base = (unsigned long) cea;
+	ds_update_cea(cea, buffer, BTS_BUFFER_SIZE, PAGE_KERNEL);
 	ds->bts_index = ds->bts_buffer_base;
 	ds->bts_index = ds->bts_buffer_base;
-	ds->bts_absolute_maximum = ds->bts_buffer_base +
-		max * BTS_RECORD_SIZE;
-	ds->bts_interrupt_threshold = ds->bts_absolute_maximum -
-		thresh * BTS_RECORD_SIZE;
-
+	max = BTS_RECORD_SIZE * (BTS_BUFFER_SIZE / BTS_RECORD_SIZE);
+	ds->bts_absolute_maximum = ds->bts_buffer_base + max;
+	ds->bts_interrupt_threshold = ds->bts_absolute_maximum - (max / 16);
 	return 0;
 	return 0;
 }
 }
 
 
 static void release_bts_buffer(int cpu)
 static void release_bts_buffer(int cpu)
 {
 {
-	struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds;
+	struct cpu_hw_events *hwev = per_cpu_ptr(&cpu_hw_events, cpu);
+	struct debug_store *ds = hwev->ds;
+	void *cea;
 
 
 	if (!ds || !x86_pmu.bts)
 	if (!ds || !x86_pmu.bts)
 		return;
 		return;
 
 
-	kfree((void *)(unsigned long)ds->bts_buffer_base);
+	/* Clear the fixmap */
+	cea = &get_cpu_entry_area(cpu)->cpu_debug_buffers.bts_buffer;
+	ds_clear_cea(cea, BTS_BUFFER_SIZE);
 	ds->bts_buffer_base = 0;
 	ds->bts_buffer_base = 0;
+	dsfree_pages(hwev->ds_bts_vaddr, BTS_BUFFER_SIZE);
+	hwev->ds_bts_vaddr = NULL;
 }
 }
 
 
 static int alloc_ds_buffer(int cpu)
 static int alloc_ds_buffer(int cpu)
 {
 {
-	int node = cpu_to_node(cpu);
-	struct debug_store *ds;
-
-	ds = kzalloc_node(sizeof(*ds), GFP_KERNEL, node);
-	if (unlikely(!ds))
-		return -ENOMEM;
+	struct debug_store *ds = &get_cpu_entry_area(cpu)->cpu_debug_store;
 
 
+	memset(ds, 0, sizeof(*ds));
 	per_cpu(cpu_hw_events, cpu).ds = ds;
 	per_cpu(cpu_hw_events, cpu).ds = ds;
-
 	return 0;
 	return 0;
 }
 }
 
 
 static void release_ds_buffer(int cpu)
 static void release_ds_buffer(int cpu)
 {
 {
-	struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds;
-
-	if (!ds)
-		return;
-
 	per_cpu(cpu_hw_events, cpu).ds = NULL;
 	per_cpu(cpu_hw_events, cpu).ds = NULL;
-	kfree(ds);
 }
 }
 
 
 void release_ds_buffers(void)
 void release_ds_buffers(void)

+ 4 - 19
arch/x86/events/perf_event.h

@@ -14,6 +14,8 @@
 
 
 #include <linux/perf_event.h>
 #include <linux/perf_event.h>
 
 
+#include <asm/intel_ds.h>
+
 /* To enable MSR tracing please use the generic trace points. */
 /* To enable MSR tracing please use the generic trace points. */
 
 
 /*
 /*
@@ -77,8 +79,6 @@ struct amd_nb {
 	struct event_constraint event_constraints[X86_PMC_IDX_MAX];
 	struct event_constraint event_constraints[X86_PMC_IDX_MAX];
 };
 };
 
 
-/* The maximal number of PEBS events: */
-#define MAX_PEBS_EVENTS		8
 #define PEBS_COUNTER_MASK	((1ULL << MAX_PEBS_EVENTS) - 1)
 #define PEBS_COUNTER_MASK	((1ULL << MAX_PEBS_EVENTS) - 1)
 
 
 /*
 /*
@@ -95,23 +95,6 @@ struct amd_nb {
 	PERF_SAMPLE_TRANSACTION | PERF_SAMPLE_PHYS_ADDR | \
 	PERF_SAMPLE_TRANSACTION | PERF_SAMPLE_PHYS_ADDR | \
 	PERF_SAMPLE_REGS_INTR | PERF_SAMPLE_REGS_USER)
 	PERF_SAMPLE_REGS_INTR | PERF_SAMPLE_REGS_USER)
 
 
-/*
- * A debug store configuration.
- *
- * We only support architectures that use 64bit fields.
- */
-struct debug_store {
-	u64	bts_buffer_base;
-	u64	bts_index;
-	u64	bts_absolute_maximum;
-	u64	bts_interrupt_threshold;
-	u64	pebs_buffer_base;
-	u64	pebs_index;
-	u64	pebs_absolute_maximum;
-	u64	pebs_interrupt_threshold;
-	u64	pebs_event_reset[MAX_PEBS_EVENTS];
-};
-
 #define PEBS_REGS \
 #define PEBS_REGS \
 	(PERF_REG_X86_AX | \
 	(PERF_REG_X86_AX | \
 	 PERF_REG_X86_BX | \
 	 PERF_REG_X86_BX | \
@@ -216,6 +199,8 @@ struct cpu_hw_events {
 	 * Intel DebugStore bits
 	 * Intel DebugStore bits
 	 */
 	 */
 	struct debug_store	*ds;
 	struct debug_store	*ds;
+	void			*ds_pebs_vaddr;
+	void			*ds_bts_vaddr;
 	u64			pebs_enabled;
 	u64			pebs_enabled;
 	int			n_pebs;
 	int			n_pebs;
 	int			n_large_pebs;
 	int			n_large_pebs;

+ 2 - 0
arch/x86/include/asm/asm.h

@@ -136,6 +136,7 @@
 #endif
 #endif
 
 
 #ifndef __ASSEMBLY__
 #ifndef __ASSEMBLY__
+#ifndef __BPF__
 /*
 /*
  * This output constraint should be used for any inline asm which has a "call"
  * This output constraint should be used for any inline asm which has a "call"
  * instruction.  Otherwise the asm may be inserted before the frame pointer
  * instruction.  Otherwise the asm may be inserted before the frame pointer
@@ -145,5 +146,6 @@
 register unsigned long current_stack_pointer asm(_ASM_SP);
 register unsigned long current_stack_pointer asm(_ASM_SP);
 #define ASM_CALL_CONSTRAINT "+r" (current_stack_pointer)
 #define ASM_CALL_CONSTRAINT "+r" (current_stack_pointer)
 #endif
 #endif
+#endif
 
 
 #endif /* _ASM_X86_ASM_H */
 #endif /* _ASM_X86_ASM_H */

+ 81 - 0
arch/x86/include/asm/cpu_entry_area.h

@@ -0,0 +1,81 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#ifndef _ASM_X86_CPU_ENTRY_AREA_H
+#define _ASM_X86_CPU_ENTRY_AREA_H
+
+#include <linux/percpu-defs.h>
+#include <asm/processor.h>
+#include <asm/intel_ds.h>
+
+/*
+ * cpu_entry_area is a percpu region that contains things needed by the CPU
+ * and early entry/exit code.  Real types aren't used for all fields here
+ * to avoid circular header dependencies.
+ *
+ * Every field is a virtual alias of some other allocated backing store.
+ * There is no direct allocation of a struct cpu_entry_area.
+ */
+struct cpu_entry_area {
+	char gdt[PAGE_SIZE];
+
+	/*
+	 * The GDT is just below entry_stack and thus serves (on x86_64) as
+	 * a a read-only guard page.
+	 */
+	struct entry_stack_page entry_stack_page;
+
+	/*
+	 * On x86_64, the TSS is mapped RO.  On x86_32, it's mapped RW because
+	 * we need task switches to work, and task switches write to the TSS.
+	 */
+	struct tss_struct tss;
+
+	char entry_trampoline[PAGE_SIZE];
+
+#ifdef CONFIG_X86_64
+	/*
+	 * Exception stacks used for IST entries.
+	 *
+	 * In the future, this should have a separate slot for each stack
+	 * with guard pages between them.
+	 */
+	char exception_stacks[(N_EXCEPTION_STACKS - 1) * EXCEPTION_STKSZ + DEBUG_STKSZ];
+#endif
+#ifdef CONFIG_CPU_SUP_INTEL
+	/*
+	 * Per CPU debug store for Intel performance monitoring. Wastes a
+	 * full page at the moment.
+	 */
+	struct debug_store cpu_debug_store;
+	/*
+	 * The actual PEBS/BTS buffers must be mapped to user space
+	 * Reserve enough fixmap PTEs.
+	 */
+	struct debug_store_buffers cpu_debug_buffers;
+#endif
+};
+
+#define CPU_ENTRY_AREA_SIZE	(sizeof(struct cpu_entry_area))
+#define CPU_ENTRY_AREA_TOT_SIZE	(CPU_ENTRY_AREA_SIZE * NR_CPUS)
+
+DECLARE_PER_CPU(struct cpu_entry_area *, cpu_entry_area);
+
+extern void setup_cpu_entry_areas(void);
+extern void cea_set_pte(void *cea_vaddr, phys_addr_t pa, pgprot_t flags);
+
+#define	CPU_ENTRY_AREA_RO_IDT		CPU_ENTRY_AREA_BASE
+#define CPU_ENTRY_AREA_PER_CPU		(CPU_ENTRY_AREA_RO_IDT + PAGE_SIZE)
+
+#define CPU_ENTRY_AREA_RO_IDT_VADDR	((void *)CPU_ENTRY_AREA_RO_IDT)
+
+#define CPU_ENTRY_AREA_MAP_SIZE			\
+	(CPU_ENTRY_AREA_PER_CPU + CPU_ENTRY_AREA_TOT_SIZE - CPU_ENTRY_AREA_BASE)
+
+extern struct cpu_entry_area *get_cpu_entry_area(int cpu);
+
+static inline struct entry_stack *cpu_entry_stack(int cpu)
+{
+	return &get_cpu_entry_area(cpu)->entry_stack_page.stack;
+}
+
+#endif

+ 2 - 0
arch/x86/include/asm/cpufeature.h

@@ -135,6 +135,8 @@ extern void clear_cpu_cap(struct cpuinfo_x86 *c, unsigned int bit);
 	set_bit(bit, (unsigned long *)cpu_caps_set);	\
 	set_bit(bit, (unsigned long *)cpu_caps_set);	\
 } while (0)
 } while (0)
 
 
+#define setup_force_cpu_bug(bit) setup_force_cpu_cap(bit)
+
 #if defined(CC_HAVE_ASM_GOTO) && defined(CONFIG_X86_FAST_FEATURE_TESTS)
 #if defined(CC_HAVE_ASM_GOTO) && defined(CONFIG_X86_FAST_FEATURE_TESTS)
 /*
 /*
  * Static testing of CPU features.  Used the same as boot_cpu_has().
  * Static testing of CPU features.  Used the same as boot_cpu_has().

+ 3 - 1
arch/x86/include/asm/cpufeatures.h

@@ -197,11 +197,12 @@
 #define X86_FEATURE_CAT_L3		( 7*32+ 4) /* Cache Allocation Technology L3 */
 #define X86_FEATURE_CAT_L3		( 7*32+ 4) /* Cache Allocation Technology L3 */
 #define X86_FEATURE_CAT_L2		( 7*32+ 5) /* Cache Allocation Technology L2 */
 #define X86_FEATURE_CAT_L2		( 7*32+ 5) /* Cache Allocation Technology L2 */
 #define X86_FEATURE_CDP_L3		( 7*32+ 6) /* Code and Data Prioritization L3 */
 #define X86_FEATURE_CDP_L3		( 7*32+ 6) /* Code and Data Prioritization L3 */
+#define X86_FEATURE_INVPCID_SINGLE	( 7*32+ 7) /* Effectively INVPCID && CR4.PCIDE=1 */
 
 
 #define X86_FEATURE_HW_PSTATE		( 7*32+ 8) /* AMD HW-PState */
 #define X86_FEATURE_HW_PSTATE		( 7*32+ 8) /* AMD HW-PState */
 #define X86_FEATURE_PROC_FEEDBACK	( 7*32+ 9) /* AMD ProcFeedbackInterface */
 #define X86_FEATURE_PROC_FEEDBACK	( 7*32+ 9) /* AMD ProcFeedbackInterface */
 #define X86_FEATURE_SME			( 7*32+10) /* AMD Secure Memory Encryption */
 #define X86_FEATURE_SME			( 7*32+10) /* AMD Secure Memory Encryption */
-
+#define X86_FEATURE_PTI			( 7*32+11) /* Kernel Page Table Isolation enabled */
 #define X86_FEATURE_INTEL_PPIN		( 7*32+14) /* Intel Processor Inventory Number */
 #define X86_FEATURE_INTEL_PPIN		( 7*32+14) /* Intel Processor Inventory Number */
 #define X86_FEATURE_INTEL_PT		( 7*32+15) /* Intel Processor Trace */
 #define X86_FEATURE_INTEL_PT		( 7*32+15) /* Intel Processor Trace */
 #define X86_FEATURE_AVX512_4VNNIW	( 7*32+16) /* AVX-512 Neural Network Instructions */
 #define X86_FEATURE_AVX512_4VNNIW	( 7*32+16) /* AVX-512 Neural Network Instructions */
@@ -340,5 +341,6 @@
 #define X86_BUG_SWAPGS_FENCE		X86_BUG(11) /* SWAPGS without input dep on GS */
 #define X86_BUG_SWAPGS_FENCE		X86_BUG(11) /* SWAPGS without input dep on GS */
 #define X86_BUG_MONITOR			X86_BUG(12) /* IPI required to wake up remote CPU */
 #define X86_BUG_MONITOR			X86_BUG(12) /* IPI required to wake up remote CPU */
 #define X86_BUG_AMD_E400		X86_BUG(13) /* CPU is among the affected by Erratum 400 */
 #define X86_BUG_AMD_E400		X86_BUG(13) /* CPU is among the affected by Erratum 400 */
+#define X86_BUG_CPU_INSECURE		X86_BUG(14) /* CPU is insecure and needs kernel page table isolation */
 
 
 #endif /* _ASM_X86_CPUFEATURES_H */
 #endif /* _ASM_X86_CPUFEATURES_H */

+ 5 - 9
arch/x86/include/asm/desc.h

@@ -7,6 +7,7 @@
 #include <asm/mmu.h>
 #include <asm/mmu.h>
 #include <asm/fixmap.h>
 #include <asm/fixmap.h>
 #include <asm/irq_vectors.h>
 #include <asm/irq_vectors.h>
+#include <asm/cpu_entry_area.h>
 
 
 #include <linux/smp.h>
 #include <linux/smp.h>
 #include <linux/percpu.h>
 #include <linux/percpu.h>
@@ -20,6 +21,8 @@ static inline void fill_ldt(struct desc_struct *desc, const struct user_desc *in
 
 
 	desc->type		= (info->read_exec_only ^ 1) << 1;
 	desc->type		= (info->read_exec_only ^ 1) << 1;
 	desc->type	       |= info->contents << 2;
 	desc->type	       |= info->contents << 2;
+	/* Set the ACCESS bit so it can be mapped RO */
+	desc->type	       |= 1;
 
 
 	desc->s			= 1;
 	desc->s			= 1;
 	desc->dpl		= 0x3;
 	desc->dpl		= 0x3;
@@ -60,17 +63,10 @@ static inline struct desc_struct *get_current_gdt_rw(void)
 	return this_cpu_ptr(&gdt_page)->gdt;
 	return this_cpu_ptr(&gdt_page)->gdt;
 }
 }
 
 
-/* Get the fixmap index for a specific processor */
-static inline unsigned int get_cpu_gdt_ro_index(int cpu)
-{
-	return FIX_GDT_REMAP_BEGIN + cpu;
-}
-
 /* Provide the fixmap address of the remapped GDT */
 /* Provide the fixmap address of the remapped GDT */
 static inline struct desc_struct *get_cpu_gdt_ro(int cpu)
 static inline struct desc_struct *get_cpu_gdt_ro(int cpu)
 {
 {
-	unsigned int idx = get_cpu_gdt_ro_index(cpu);
-	return (struct desc_struct *)__fix_to_virt(idx);
+	return (struct desc_struct *)&get_cpu_entry_area(cpu)->gdt;
 }
 }
 
 
 /* Provide the current read-only GDT */
 /* Provide the current read-only GDT */
@@ -185,7 +181,7 @@ static inline void set_tssldt_descriptor(void *d, unsigned long addr,
 #endif
 #endif
 }
 }
 
 
-static inline void __set_tss_desc(unsigned cpu, unsigned int entry, void *addr)
+static inline void __set_tss_desc(unsigned cpu, unsigned int entry, struct x86_hw_tss *addr)
 {
 {
 	struct desc_struct *d = get_cpu_gdt_rw(cpu);
 	struct desc_struct *d = get_cpu_gdt_rw(cpu);
 	tss_desc tss;
 	tss_desc tss;

+ 7 - 1
arch/x86/include/asm/disabled-features.h

@@ -50,6 +50,12 @@
 # define DISABLE_LA57	(1<<(X86_FEATURE_LA57 & 31))
 # define DISABLE_LA57	(1<<(X86_FEATURE_LA57 & 31))
 #endif
 #endif
 
 
+#ifdef CONFIG_PAGE_TABLE_ISOLATION
+# define DISABLE_PTI		0
+#else
+# define DISABLE_PTI		(1 << (X86_FEATURE_PTI & 31))
+#endif
+
 /*
 /*
  * Make sure to add features to the correct mask
  * Make sure to add features to the correct mask
  */
  */
@@ -60,7 +66,7 @@
 #define DISABLED_MASK4	(DISABLE_PCID)
 #define DISABLED_MASK4	(DISABLE_PCID)
 #define DISABLED_MASK5	0
 #define DISABLED_MASK5	0
 #define DISABLED_MASK6	0
 #define DISABLED_MASK6	0
-#define DISABLED_MASK7	0
+#define DISABLED_MASK7	(DISABLE_PTI)
 #define DISABLED_MASK8	0
 #define DISABLED_MASK8	0
 #define DISABLED_MASK9	(DISABLE_MPX)
 #define DISABLED_MASK9	(DISABLE_MPX)
 #define DISABLED_MASK10	0
 #define DISABLED_MASK10	0

+ 4 - 3
arch/x86/include/asm/espfix.h

@@ -2,7 +2,7 @@
 #ifndef _ASM_X86_ESPFIX_H
 #ifndef _ASM_X86_ESPFIX_H
 #define _ASM_X86_ESPFIX_H
 #define _ASM_X86_ESPFIX_H
 
 
-#ifdef CONFIG_X86_64
+#ifdef CONFIG_X86_ESPFIX64
 
 
 #include <asm/percpu.h>
 #include <asm/percpu.h>
 
 
@@ -11,7 +11,8 @@ DECLARE_PER_CPU_READ_MOSTLY(unsigned long, espfix_waddr);
 
 
 extern void init_espfix_bsp(void);
 extern void init_espfix_bsp(void);
 extern void init_espfix_ap(int cpu);
 extern void init_espfix_ap(int cpu);
-
-#endif /* CONFIG_X86_64 */
+#else
+static inline void init_espfix_ap(int cpu) { }
+#endif
 
 
 #endif /* _ASM_X86_ESPFIX_H */
 #endif /* _ASM_X86_ESPFIX_H */

+ 1 - 6
arch/x86/include/asm/fixmap.h

@@ -44,7 +44,6 @@ extern unsigned long __FIXADDR_TOP;
 			 PAGE_SIZE)
 			 PAGE_SIZE)
 #endif
 #endif
 
 
-
 /*
 /*
  * Here we define all the compile-time 'special' virtual
  * Here we define all the compile-time 'special' virtual
  * addresses. The point is to have a constant address at
  * addresses. The point is to have a constant address at
@@ -84,7 +83,6 @@ enum fixed_addresses {
 	FIX_IO_APIC_BASE_0,
 	FIX_IO_APIC_BASE_0,
 	FIX_IO_APIC_BASE_END = FIX_IO_APIC_BASE_0 + MAX_IO_APICS - 1,
 	FIX_IO_APIC_BASE_END = FIX_IO_APIC_BASE_0 + MAX_IO_APICS - 1,
 #endif
 #endif
-	FIX_RO_IDT,	/* Virtual mapping for read-only IDT */
 #ifdef CONFIG_X86_32
 #ifdef CONFIG_X86_32
 	FIX_KMAP_BEGIN,	/* reserved pte's for temporary kernel mappings */
 	FIX_KMAP_BEGIN,	/* reserved pte's for temporary kernel mappings */
 	FIX_KMAP_END = FIX_KMAP_BEGIN+(KM_TYPE_NR*NR_CPUS)-1,
 	FIX_KMAP_END = FIX_KMAP_BEGIN+(KM_TYPE_NR*NR_CPUS)-1,
@@ -100,9 +98,6 @@ enum fixed_addresses {
 #ifdef	CONFIG_X86_INTEL_MID
 #ifdef	CONFIG_X86_INTEL_MID
 	FIX_LNW_VRTC,
 	FIX_LNW_VRTC,
 #endif
 #endif
-	/* Fixmap entries to remap the GDTs, one per processor. */
-	FIX_GDT_REMAP_BEGIN,
-	FIX_GDT_REMAP_END = FIX_GDT_REMAP_BEGIN + NR_CPUS - 1,
 
 
 #ifdef CONFIG_ACPI_APEI_GHES
 #ifdef CONFIG_ACPI_APEI_GHES
 	/* Used for GHES mapping from assorted contexts */
 	/* Used for GHES mapping from assorted contexts */
@@ -143,7 +138,7 @@ enum fixed_addresses {
 extern void reserve_top_address(unsigned long reserve);
 extern void reserve_top_address(unsigned long reserve);
 
 
 #define FIXADDR_SIZE	(__end_of_permanent_fixed_addresses << PAGE_SHIFT)
 #define FIXADDR_SIZE	(__end_of_permanent_fixed_addresses << PAGE_SHIFT)
-#define FIXADDR_START		(FIXADDR_TOP - FIXADDR_SIZE)
+#define FIXADDR_START	(FIXADDR_TOP - FIXADDR_SIZE)
 
 
 extern int fixmaps_set;
 extern int fixmaps_set;
 
 

+ 15 - 10
arch/x86/include/asm/hypervisor.h

@@ -20,16 +20,7 @@
 #ifndef _ASM_X86_HYPERVISOR_H
 #ifndef _ASM_X86_HYPERVISOR_H
 #define _ASM_X86_HYPERVISOR_H
 #define _ASM_X86_HYPERVISOR_H
 
 
-#ifdef CONFIG_HYPERVISOR_GUEST
-
-#include <asm/kvm_para.h>
-#include <asm/x86_init.h>
-#include <asm/xen/hypervisor.h>
-
-/*
- * x86 hypervisor information
- */
-
+/* x86 hypervisor types  */
 enum x86_hypervisor_type {
 enum x86_hypervisor_type {
 	X86_HYPER_NATIVE = 0,
 	X86_HYPER_NATIVE = 0,
 	X86_HYPER_VMWARE,
 	X86_HYPER_VMWARE,
@@ -39,6 +30,12 @@ enum x86_hypervisor_type {
 	X86_HYPER_KVM,
 	X86_HYPER_KVM,
 };
 };
 
 
+#ifdef CONFIG_HYPERVISOR_GUEST
+
+#include <asm/kvm_para.h>
+#include <asm/x86_init.h>
+#include <asm/xen/hypervisor.h>
+
 struct hypervisor_x86 {
 struct hypervisor_x86 {
 	/* Hypervisor name */
 	/* Hypervisor name */
 	const char	*name;
 	const char	*name;
@@ -58,7 +55,15 @@ struct hypervisor_x86 {
 
 
 extern enum x86_hypervisor_type x86_hyper_type;
 extern enum x86_hypervisor_type x86_hyper_type;
 extern void init_hypervisor_platform(void);
 extern void init_hypervisor_platform(void);
+static inline bool hypervisor_is_type(enum x86_hypervisor_type type)
+{
+	return x86_hyper_type == type;
+}
 #else
 #else
 static inline void init_hypervisor_platform(void) { }
 static inline void init_hypervisor_platform(void) { }
+static inline bool hypervisor_is_type(enum x86_hypervisor_type type)
+{
+	return type == X86_HYPER_NATIVE;
+}
 #endif /* CONFIG_HYPERVISOR_GUEST */
 #endif /* CONFIG_HYPERVISOR_GUEST */
 #endif /* _ASM_X86_HYPERVISOR_H */
 #endif /* _ASM_X86_HYPERVISOR_H */

+ 36 - 0
arch/x86/include/asm/intel_ds.h

@@ -0,0 +1,36 @@
+#ifndef _ASM_INTEL_DS_H
+#define _ASM_INTEL_DS_H
+
+#include <linux/percpu-defs.h>
+
+#define BTS_BUFFER_SIZE		(PAGE_SIZE << 4)
+#define PEBS_BUFFER_SIZE	(PAGE_SIZE << 4)
+
+/* The maximal number of PEBS events: */
+#define MAX_PEBS_EVENTS		8
+
+/*
+ * A debug store configuration.
+ *
+ * We only support architectures that use 64bit fields.
+ */
+struct debug_store {
+	u64	bts_buffer_base;
+	u64	bts_index;
+	u64	bts_absolute_maximum;
+	u64	bts_interrupt_threshold;
+	u64	pebs_buffer_base;
+	u64	pebs_index;
+	u64	pebs_absolute_maximum;
+	u64	pebs_interrupt_threshold;
+	u64	pebs_event_reset[MAX_PEBS_EVENTS];
+} __aligned(PAGE_SIZE);
+
+DECLARE_PER_CPU_PAGE_ALIGNED(struct debug_store, cpu_debug_store);
+
+struct debug_store_buffers {
+	char	bts_buffer[BTS_BUFFER_SIZE];
+	char	pebs_buffer[PEBS_BUFFER_SIZE];
+};
+
+#endif

+ 53 - 0
arch/x86/include/asm/invpcid.h

@@ -0,0 +1,53 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_X86_INVPCID
+#define _ASM_X86_INVPCID
+
+static inline void __invpcid(unsigned long pcid, unsigned long addr,
+			     unsigned long type)
+{
+	struct { u64 d[2]; } desc = { { pcid, addr } };
+
+	/*
+	 * The memory clobber is because the whole point is to invalidate
+	 * stale TLB entries and, especially if we're flushing global
+	 * mappings, we don't want the compiler to reorder any subsequent
+	 * memory accesses before the TLB flush.
+	 *
+	 * The hex opcode is invpcid (%ecx), %eax in 32-bit mode and
+	 * invpcid (%rcx), %rax in long mode.
+	 */
+	asm volatile (".byte 0x66, 0x0f, 0x38, 0x82, 0x01"
+		      : : "m" (desc), "a" (type), "c" (&desc) : "memory");
+}
+
+#define INVPCID_TYPE_INDIV_ADDR		0
+#define INVPCID_TYPE_SINGLE_CTXT	1
+#define INVPCID_TYPE_ALL_INCL_GLOBAL	2
+#define INVPCID_TYPE_ALL_NON_GLOBAL	3
+
+/* Flush all mappings for a given pcid and addr, not including globals. */
+static inline void invpcid_flush_one(unsigned long pcid,
+				     unsigned long addr)
+{
+	__invpcid(pcid, addr, INVPCID_TYPE_INDIV_ADDR);
+}
+
+/* Flush all mappings for a given PCID, not including globals. */
+static inline void invpcid_flush_single_context(unsigned long pcid)
+{
+	__invpcid(pcid, 0, INVPCID_TYPE_SINGLE_CTXT);
+}
+
+/* Flush all mappings, including globals, for all PCIDs. */
+static inline void invpcid_flush_all(void)
+{
+	__invpcid(0, 0, INVPCID_TYPE_ALL_INCL_GLOBAL);
+}
+
+/* Flush all mappings for all PCIDs except globals. */
+static inline void invpcid_flush_all_nonglobals(void)
+{
+	__invpcid(0, 0, INVPCID_TYPE_ALL_NON_GLOBAL);
+}
+
+#endif /* _ASM_X86_INVPCID */

+ 1 - 1
arch/x86/include/asm/irqdomain.h

@@ -44,7 +44,7 @@ extern int mp_irqdomain_alloc(struct irq_domain *domain, unsigned int virq,
 extern void mp_irqdomain_free(struct irq_domain *domain, unsigned int virq,
 extern void mp_irqdomain_free(struct irq_domain *domain, unsigned int virq,
 			      unsigned int nr_irqs);
 			      unsigned int nr_irqs);
 extern int mp_irqdomain_activate(struct irq_domain *domain,
 extern int mp_irqdomain_activate(struct irq_domain *domain,
-				 struct irq_data *irq_data, bool early);
+				 struct irq_data *irq_data, bool reserve);
 extern void mp_irqdomain_deactivate(struct irq_domain *domain,
 extern void mp_irqdomain_deactivate(struct irq_domain *domain,
 				    struct irq_data *irq_data);
 				    struct irq_data *irq_data);
 extern int mp_irqdomain_ioapic_idx(struct irq_domain *domain);
 extern int mp_irqdomain_ioapic_idx(struct irq_domain *domain);

+ 3 - 0
arch/x86/include/asm/irqflags.h

@@ -142,6 +142,9 @@ static inline notrace unsigned long arch_local_irq_save(void)
 	swapgs;					\
 	swapgs;					\
 	sysretl
 	sysretl
 
 
+#ifdef CONFIG_DEBUG_ENTRY
+#define SAVE_FLAGS(x)		pushfq; popq %rax
+#endif
 #else
 #else
 #define INTERRUPT_RETURN		iret
 #define INTERRUPT_RETURN		iret
 #define ENABLE_INTERRUPTS_SYSEXIT	sti; sysexit
 #define ENABLE_INTERRUPTS_SYSEXIT	sti; sysexit

+ 1 - 0
arch/x86/include/asm/kdebug.h

@@ -26,6 +26,7 @@ extern void die(const char *, struct pt_regs *,long);
 extern int __must_check __die(const char *, struct pt_regs *, long);
 extern int __must_check __die(const char *, struct pt_regs *, long);
 extern void show_stack_regs(struct pt_regs *regs);
 extern void show_stack_regs(struct pt_regs *regs);
 extern void __show_regs(struct pt_regs *regs, int all);
 extern void __show_regs(struct pt_regs *regs, int all);
+extern void show_iret_regs(struct pt_regs *regs);
 extern unsigned long oops_begin(void);
 extern unsigned long oops_begin(void);
 extern void oops_end(unsigned long, struct pt_regs *, int signr);
 extern void oops_end(unsigned long, struct pt_regs *, int signr);
 
 

+ 3 - 1
arch/x86/include/asm/mmu.h

@@ -3,6 +3,7 @@
 #define _ASM_X86_MMU_H
 #define _ASM_X86_MMU_H
 
 
 #include <linux/spinlock.h>
 #include <linux/spinlock.h>
+#include <linux/rwsem.h>
 #include <linux/mutex.h>
 #include <linux/mutex.h>
 #include <linux/atomic.h>
 #include <linux/atomic.h>
 
 
@@ -27,7 +28,8 @@ typedef struct {
 	atomic64_t tlb_gen;
 	atomic64_t tlb_gen;
 
 
 #ifdef CONFIG_MODIFY_LDT_SYSCALL
 #ifdef CONFIG_MODIFY_LDT_SYSCALL
-	struct ldt_struct *ldt;
+	struct rw_semaphore	ldt_usr_sem;
+	struct ldt_struct	*ldt;
 #endif
 #endif
 
 
 #ifdef CONFIG_X86_64
 #ifdef CONFIG_X86_64

+ 71 - 42
arch/x86/include/asm/mmu_context.h

@@ -50,22 +50,53 @@ struct ldt_struct {
 	 * call gates.  On native, we could merge the ldt_struct and LDT
 	 * call gates.  On native, we could merge the ldt_struct and LDT
 	 * allocations, but it's not worth trying to optimize.
 	 * allocations, but it's not worth trying to optimize.
 	 */
 	 */
-	struct desc_struct *entries;
-	unsigned int nr_entries;
+	struct desc_struct	*entries;
+	unsigned int		nr_entries;
+
+	/*
+	 * If PTI is in use, then the entries array is not mapped while we're
+	 * in user mode.  The whole array will be aliased at the addressed
+	 * given by ldt_slot_va(slot).  We use two slots so that we can allocate
+	 * and map, and enable a new LDT without invalidating the mapping
+	 * of an older, still-in-use LDT.
+	 *
+	 * slot will be -1 if this LDT doesn't have an alias mapping.
+	 */
+	int			slot;
 };
 };
 
 
+/* This is a multiple of PAGE_SIZE. */
+#define LDT_SLOT_STRIDE (LDT_ENTRIES * LDT_ENTRY_SIZE)
+
+static inline void *ldt_slot_va(int slot)
+{
+#ifdef CONFIG_X86_64
+	return (void *)(LDT_BASE_ADDR + LDT_SLOT_STRIDE * slot);
+#else
+	BUG();
+#endif
+}
+
 /*
 /*
  * Used for LDT copy/destruction.
  * Used for LDT copy/destruction.
  */
  */
-int init_new_context_ldt(struct task_struct *tsk, struct mm_struct *mm);
+static inline void init_new_context_ldt(struct mm_struct *mm)
+{
+	mm->context.ldt = NULL;
+	init_rwsem(&mm->context.ldt_usr_sem);
+}
+int ldt_dup_context(struct mm_struct *oldmm, struct mm_struct *mm);
 void destroy_context_ldt(struct mm_struct *mm);
 void destroy_context_ldt(struct mm_struct *mm);
+void ldt_arch_exit_mmap(struct mm_struct *mm);
 #else	/* CONFIG_MODIFY_LDT_SYSCALL */
 #else	/* CONFIG_MODIFY_LDT_SYSCALL */
-static inline int init_new_context_ldt(struct task_struct *tsk,
-				       struct mm_struct *mm)
+static inline void init_new_context_ldt(struct mm_struct *mm) { }
+static inline int ldt_dup_context(struct mm_struct *oldmm,
+				  struct mm_struct *mm)
 {
 {
 	return 0;
 	return 0;
 }
 }
-static inline void destroy_context_ldt(struct mm_struct *mm) {}
+static inline void destroy_context_ldt(struct mm_struct *mm) { }
+static inline void ldt_arch_exit_mmap(struct mm_struct *mm) { }
 #endif
 #endif
 
 
 static inline void load_mm_ldt(struct mm_struct *mm)
 static inline void load_mm_ldt(struct mm_struct *mm)
@@ -90,10 +121,31 @@ static inline void load_mm_ldt(struct mm_struct *mm)
 	 * that we can see.
 	 * that we can see.
 	 */
 	 */
 
 
-	if (unlikely(ldt))
-		set_ldt(ldt->entries, ldt->nr_entries);
-	else
+	if (unlikely(ldt)) {
+		if (static_cpu_has(X86_FEATURE_PTI)) {
+			if (WARN_ON_ONCE((unsigned long)ldt->slot > 1)) {
+				/*
+				 * Whoops -- either the new LDT isn't mapped
+				 * (if slot == -1) or is mapped into a bogus
+				 * slot (if slot > 1).
+				 */
+				clear_LDT();
+				return;
+			}
+
+			/*
+			 * If page table isolation is enabled, ldt->entries
+			 * will not be mapped in the userspace pagetables.
+			 * Tell the CPU to access the LDT through the alias
+			 * at ldt_slot_va(ldt->slot).
+			 */
+			set_ldt(ldt_slot_va(ldt->slot), ldt->nr_entries);
+		} else {
+			set_ldt(ldt->entries, ldt->nr_entries);
+		}
+	} else {
 		clear_LDT();
 		clear_LDT();
+	}
 #else
 #else
 	clear_LDT();
 	clear_LDT();
 #endif
 #endif
@@ -132,18 +184,21 @@ void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk);
 static inline int init_new_context(struct task_struct *tsk,
 static inline int init_new_context(struct task_struct *tsk,
 				   struct mm_struct *mm)
 				   struct mm_struct *mm)
 {
 {
+	mutex_init(&mm->context.lock);
+
 	mm->context.ctx_id = atomic64_inc_return(&last_mm_ctx_id);
 	mm->context.ctx_id = atomic64_inc_return(&last_mm_ctx_id);
 	atomic64_set(&mm->context.tlb_gen, 0);
 	atomic64_set(&mm->context.tlb_gen, 0);
 
 
-	#ifdef CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS
+#ifdef CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS
 	if (cpu_feature_enabled(X86_FEATURE_OSPKE)) {
 	if (cpu_feature_enabled(X86_FEATURE_OSPKE)) {
 		/* pkey 0 is the default and always allocated */
 		/* pkey 0 is the default and always allocated */
 		mm->context.pkey_allocation_map = 0x1;
 		mm->context.pkey_allocation_map = 0x1;
 		/* -1 means unallocated or invalid */
 		/* -1 means unallocated or invalid */
 		mm->context.execute_only_pkey = -1;
 		mm->context.execute_only_pkey = -1;
 	}
 	}
-	#endif
-	return init_new_context_ldt(tsk, mm);
+#endif
+	init_new_context_ldt(mm);
+	return 0;
 }
 }
 static inline void destroy_context(struct mm_struct *mm)
 static inline void destroy_context(struct mm_struct *mm)
 {
 {
@@ -176,15 +231,16 @@ do {						\
 } while (0)
 } while (0)
 #endif
 #endif
 
 
-static inline void arch_dup_mmap(struct mm_struct *oldmm,
-				 struct mm_struct *mm)
+static inline int arch_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm)
 {
 {
 	paravirt_arch_dup_mmap(oldmm, mm);
 	paravirt_arch_dup_mmap(oldmm, mm);
+	return ldt_dup_context(oldmm, mm);
 }
 }
 
 
 static inline void arch_exit_mmap(struct mm_struct *mm)
 static inline void arch_exit_mmap(struct mm_struct *mm)
 {
 {
 	paravirt_arch_exit_mmap(mm);
 	paravirt_arch_exit_mmap(mm);
+	ldt_arch_exit_mmap(mm);
 }
 }
 
 
 #ifdef CONFIG_X86_64
 #ifdef CONFIG_X86_64
@@ -281,33 +337,6 @@ static inline bool arch_vma_access_permitted(struct vm_area_struct *vma,
 	return __pkru_allows_pkey(vma_pkey(vma), write);
 	return __pkru_allows_pkey(vma_pkey(vma), write);
 }
 }
 
 
-/*
- * If PCID is on, ASID-aware code paths put the ASID+1 into the PCID
- * bits.  This serves two purposes.  It prevents a nasty situation in
- * which PCID-unaware code saves CR3, loads some other value (with PCID
- * == 0), and then restores CR3, thus corrupting the TLB for ASID 0 if
- * the saved ASID was nonzero.  It also means that any bugs involving
- * loading a PCID-enabled CR3 with CR4.PCIDE off will trigger
- * deterministically.
- */
-
-static inline unsigned long build_cr3(struct mm_struct *mm, u16 asid)
-{
-	if (static_cpu_has(X86_FEATURE_PCID)) {
-		VM_WARN_ON_ONCE(asid > 4094);
-		return __sme_pa(mm->pgd) | (asid + 1);
-	} else {
-		VM_WARN_ON_ONCE(asid != 0);
-		return __sme_pa(mm->pgd);
-	}
-}
-
-static inline unsigned long build_cr3_noflush(struct mm_struct *mm, u16 asid)
-{
-	VM_WARN_ON_ONCE(asid > 4094);
-	return __sme_pa(mm->pgd) | (asid + 1) | CR3_NOFLUSH;
-}
-
 /*
 /*
  * This can be used from process context to figure out what the value of
  * This can be used from process context to figure out what the value of
  * CR3 is without needing to do a (slow) __read_cr3().
  * CR3 is without needing to do a (slow) __read_cr3().
@@ -317,7 +346,7 @@ static inline unsigned long build_cr3_noflush(struct mm_struct *mm, u16 asid)
  */
  */
 static inline unsigned long __get_current_cr3_fast(void)
 static inline unsigned long __get_current_cr3_fast(void)
 {
 {
-	unsigned long cr3 = build_cr3(this_cpu_read(cpu_tlbstate.loaded_mm),
+	unsigned long cr3 = build_cr3(this_cpu_read(cpu_tlbstate.loaded_mm)->pgd,
 		this_cpu_read(cpu_tlbstate.loaded_mm_asid));
 		this_cpu_read(cpu_tlbstate.loaded_mm_asid));
 
 
 	/* For now, be very restrictive about when this can be called. */
 	/* For now, be very restrictive about when this can be called. */

+ 9 - 0
arch/x86/include/asm/paravirt.h

@@ -927,6 +927,15 @@ extern void default_banner(void);
 	PARA_SITE(PARA_PATCH(pv_cpu_ops, PV_CPU_usergs_sysret64),	\
 	PARA_SITE(PARA_PATCH(pv_cpu_ops, PV_CPU_usergs_sysret64),	\
 		  CLBR_NONE,						\
 		  CLBR_NONE,						\
 		  jmp PARA_INDIRECT(pv_cpu_ops+PV_CPU_usergs_sysret64))
 		  jmp PARA_INDIRECT(pv_cpu_ops+PV_CPU_usergs_sysret64))
+
+#ifdef CONFIG_DEBUG_ENTRY
+#define SAVE_FLAGS(clobbers)                                        \
+	PARA_SITE(PARA_PATCH(pv_irq_ops, PV_IRQ_save_fl), clobbers, \
+		  PV_SAVE_REGS(clobbers | CLBR_CALLEE_SAVE);        \
+		  call PARA_INDIRECT(pv_irq_ops+PV_IRQ_save_fl);    \
+		  PV_RESTORE_REGS(clobbers | CLBR_CALLEE_SAVE);)
+#endif
+
 #endif	/* CONFIG_X86_32 */
 #endif	/* CONFIG_X86_32 */
 
 
 #endif /* __ASSEMBLY__ */
 #endif /* __ASSEMBLY__ */

+ 11 - 0
arch/x86/include/asm/pgalloc.h

@@ -30,6 +30,17 @@ static inline void paravirt_release_p4d(unsigned long pfn) {}
  */
  */
 extern gfp_t __userpte_alloc_gfp;
 extern gfp_t __userpte_alloc_gfp;
 
 
+#ifdef CONFIG_PAGE_TABLE_ISOLATION
+/*
+ * Instead of one PGD, we acquire two PGDs.  Being order-1, it is
+ * both 8k in size and 8k-aligned.  That lets us just flip bit 12
+ * in a pointer to swap between the two 4k halves.
+ */
+#define PGD_ALLOCATION_ORDER 1
+#else
+#define PGD_ALLOCATION_ORDER 0
+#endif
+
 /*
 /*
  * Allocate and free page tables.
  * Allocate and free page tables.
  */
  */

+ 26 - 4
arch/x86/include/asm/pgtable.h

@@ -28,6 +28,7 @@ extern pgd_t early_top_pgt[PTRS_PER_PGD];
 int __init __early_make_pgtable(unsigned long address, pmdval_t pmd);
 int __init __early_make_pgtable(unsigned long address, pmdval_t pmd);
 
 
 void ptdump_walk_pgd_level(struct seq_file *m, pgd_t *pgd);
 void ptdump_walk_pgd_level(struct seq_file *m, pgd_t *pgd);
+void ptdump_walk_pgd_level_debugfs(struct seq_file *m, pgd_t *pgd, bool user);
 void ptdump_walk_pgd_level_checkwx(void);
 void ptdump_walk_pgd_level_checkwx(void);
 
 
 #ifdef CONFIG_DEBUG_WX
 #ifdef CONFIG_DEBUG_WX
@@ -841,7 +842,12 @@ static inline pud_t *pud_offset(p4d_t *p4d, unsigned long address)
 
 
 static inline int p4d_bad(p4d_t p4d)
 static inline int p4d_bad(p4d_t p4d)
 {
 {
-	return (p4d_flags(p4d) & ~(_KERNPG_TABLE | _PAGE_USER)) != 0;
+	unsigned long ignore_flags = _KERNPG_TABLE | _PAGE_USER;
+
+	if (IS_ENABLED(CONFIG_PAGE_TABLE_ISOLATION))
+		ignore_flags |= _PAGE_NX;
+
+	return (p4d_flags(p4d) & ~ignore_flags) != 0;
 }
 }
 #endif  /* CONFIG_PGTABLE_LEVELS > 3 */
 #endif  /* CONFIG_PGTABLE_LEVELS > 3 */
 
 
@@ -875,7 +881,12 @@ static inline p4d_t *p4d_offset(pgd_t *pgd, unsigned long address)
 
 
 static inline int pgd_bad(pgd_t pgd)
 static inline int pgd_bad(pgd_t pgd)
 {
 {
-	return (pgd_flags(pgd) & ~_PAGE_USER) != _KERNPG_TABLE;
+	unsigned long ignore_flags = _PAGE_USER;
+
+	if (IS_ENABLED(CONFIG_PAGE_TABLE_ISOLATION))
+		ignore_flags |= _PAGE_NX;
+
+	return (pgd_flags(pgd) & ~ignore_flags) != _KERNPG_TABLE;
 }
 }
 
 
 static inline int pgd_none(pgd_t pgd)
 static inline int pgd_none(pgd_t pgd)
@@ -904,7 +915,11 @@ static inline int pgd_none(pgd_t pgd)
  * pgd_offset() returns a (pgd_t *)
  * pgd_index() is used get the offset into the pgd page's array of pgd_t's;
  */
-#define pgd_offset(mm, address) ((mm)->pgd + pgd_index((address)))
+#define pgd_offset_pgd(pgd, address) (pgd + pgd_index((address)))
+/*
+ * a shortcut to get a pgd_t in a given mm
+ */
+#define pgd_offset(mm, address) pgd_offset_pgd((mm)->pgd, (address))
 /*
  * a shortcut which implies the use of the kernel's pgd, instead
  * of a process's
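
As a rough user-space model of the pgd_offset_pgd()/pgd_offset() split above: the entry index is derived purely from the upper address bits, and the new macro just makes the top-level table an explicit parameter so the same helper can be pointed at either the kernel or the user copy of the PGD. The shift and table size below assume 4-level x86-64 paging (PGDIR_SHIFT == 39, PTRS_PER_PGD == 512); this is an illustration, not the kernel implementation.

#include <stdio.h>
#include <stdint.h>

#define PGDIR_SHIFT	39		/* assumes 4-level paging */
#define PTRS_PER_PGD	512

typedef struct { uint64_t pgd; } pgd_t;

/* Mirror of pgd_index(): which of the 512 top-level slots covers the address. */
static unsigned long pgd_index(unsigned long address)
{
	return (address >> PGDIR_SHIFT) & (PTRS_PER_PGD - 1);
}

/* Mirror of pgd_offset_pgd(): index into whichever top-level table is passed in. */
static pgd_t *pgd_offset_pgd(pgd_t *pgd, unsigned long address)
{
	return pgd + pgd_index(address);
}

int main(void)
{
	static pgd_t table[PTRS_PER_PGD];
	unsigned long addr = 0x00007f1234567000UL;	/* arbitrary user address */

	printf("entry index: %lu\n", pgd_index(addr));
	printf("entry slot:  %p\n", (void *)pgd_offset_pgd(table, addr));
	return 0;
}
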
@@ -1106,7 +1121,14 @@ static inline int pud_write(pud_t pud)
  */
 static inline void clone_pgd_range(pgd_t *dst, pgd_t *src, int count)
 {
-       memcpy(dst, src, count * sizeof(pgd_t));
+	memcpy(dst, src, count * sizeof(pgd_t));
+#ifdef CONFIG_PAGE_TABLE_ISOLATION
+	if (!static_cpu_has(X86_FEATURE_PTI))
+		return;
+	/* Clone the user space pgd as well */
+	memcpy(kernel_to_user_pgdp(dst), kernel_to_user_pgdp(src),
+	       count * sizeof(pgd_t));
+#endif
 }
 
 #define PTE_SHIFT ilog2(PTRS_PER_PTE)
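
The clone_pgd_range() change above copies the requested entries twice when PTI is active: once into the destination's kernel half and once into the user half reached by flipping bit 12. A hedged user-space sketch of that mirroring follows; the 8 KiB-aligned allocations and the pti_enabled flag are stand-ins for the kernel's order-1 PGD pages and the X86_FEATURE_PTI check.

#include <stdint.h>
#include <stdlib.h>
#include <string.h>

typedef struct { uint64_t pgd; } pgd_t;

#define PTI_PGTABLE_SWITCH_BIT	12	/* PAGE_SHIFT on x86 */

static pgd_t *kernel_to_user_pgdp(pgd_t *pgdp)
{
	return (pgd_t *)((uintptr_t)pgdp | (1UL << PTI_PGTABLE_SWITCH_BIT));
}

/* Model of clone_pgd_range() with PTI enabled: copy kernel and user halves. */
static void clone_pgd_range(pgd_t *dst, pgd_t *src, int count, int pti_enabled)
{
	memcpy(dst, src, count * sizeof(pgd_t));
	if (!pti_enabled)
		return;
	memcpy(kernel_to_user_pgdp(dst), kernel_to_user_pgdp(src),
	       count * sizeof(pgd_t));
}

int main(void)
{
	/* Two order-1 "PGDs": 8 KiB each, 8 KiB-aligned, kernel half first. */
	pgd_t *src = aligned_alloc(8192, 8192);
	pgd_t *dst = aligned_alloc(8192, 8192);

	if (!src || !dst)
		return 1;
	memset(src, 0xaa, 8192);
	memset(dst, 0, 8192);
	clone_pgd_range(dst, src, 16, 1);
	return 0;
}
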

+ 12 - 3
arch/x86/include/asm/pgtable_32_types.h

@@ -38,13 +38,22 @@ extern bool __vmalloc_start_set; /* set once high_memory is set */
 #define LAST_PKMAP 1024
 #endif
 
-#define PKMAP_BASE ((FIXADDR_START - PAGE_SIZE * (LAST_PKMAP + 1))	\
-		    & PMD_MASK)
+/*
+ * Define this here and validate with BUILD_BUG_ON() in pgtable_32.c
+ * to avoid include recursion hell
+ */
+#define CPU_ENTRY_AREA_PAGES	(NR_CPUS * 40)
+
+#define CPU_ENTRY_AREA_BASE				\
+	((FIXADDR_START - PAGE_SIZE * (CPU_ENTRY_AREA_PAGES + 1)) & PMD_MASK)
+
+#define PKMAP_BASE		\
+	((CPU_ENTRY_AREA_BASE - PAGE_SIZE) & PMD_MASK)
 
 #ifdef CONFIG_HIGHMEM
 # define VMALLOC_END	(PKMAP_BASE - 2 * PAGE_SIZE)
 #else
-# define VMALLOC_END	(FIXADDR_START - 2 * PAGE_SIZE)
+# define VMALLOC_END	(CPU_ENTRY_AREA_BASE - 2 * PAGE_SIZE)
 #endif
 
 #define MODULES_VADDR	VMALLOC_START
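
To see what the reshuffled 32-bit layout above does, the macros can be evaluated with a concrete FIXADDR_START. Everything below is an assumption chosen only for illustration (FIXADDR_START, NR_CPUS and the 4 MiB PMD_MASK are all configuration dependent); what matters is the resulting ordering VMALLOC_END < PKMAP_BASE < CPU_ENTRY_AREA_BASE < FIXADDR_START, with the CPU entry area sized by NR_CPUS * 40 pages.

#include <stdio.h>

#define PAGE_SIZE		4096UL
#define PMD_MASK		(~((1UL << 22) - 1))	/* assumed: non-PAE 4 MiB PMD */
#define NR_CPUS			8			/* assumed config value */
#define FIXADDR_START		0xfff00000UL		/* assumed, config dependent */

#define CPU_ENTRY_AREA_PAGES	(NR_CPUS * 40)
#define CPU_ENTRY_AREA_BASE	\
	((FIXADDR_START - PAGE_SIZE * (CPU_ENTRY_AREA_PAGES + 1)) & PMD_MASK)
#define PKMAP_BASE		((CPU_ENTRY_AREA_BASE - PAGE_SIZE) & PMD_MASK)
#define VMALLOC_END		(PKMAP_BASE - 2 * PAGE_SIZE)	/* CONFIG_HIGHMEM case */

int main(void)
{
	printf("FIXADDR_START:       %#lx\n", FIXADDR_START);
	printf("CPU_ENTRY_AREA_BASE: %#lx\n", CPU_ENTRY_AREA_BASE);
	printf("PKMAP_BASE:          %#lx\n", PKMAP_BASE);
	printf("VMALLOC_END:         %#lx\n", VMALLOC_END);
	return 0;
}
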

+ 92 - 0
arch/x86/include/asm/pgtable_64.h

@@ -131,9 +131,97 @@ static inline pud_t native_pudp_get_and_clear(pud_t *xp)
 #endif
 }
 
+#ifdef CONFIG_PAGE_TABLE_ISOLATION
+/*
+ * All top-level PAGE_TABLE_ISOLATION page tables are order-1 pages
+ * (8k-aligned and 8k in size).  The kernel one is at the beginning 4k and
+ * the user one is in the last 4k.  To switch between them, you
+ * just need to flip the 12th bit in their addresses.
+ */
+#define PTI_PGTABLE_SWITCH_BIT	PAGE_SHIFT
+
+/*
+ * This generates better code than the inline assembly in
+ * __set_bit().
+ */
+static inline void *ptr_set_bit(void *ptr, int bit)
+{
+	unsigned long __ptr = (unsigned long)ptr;
+
+	__ptr |= BIT(bit);
+	return (void *)__ptr;
+}
+static inline void *ptr_clear_bit(void *ptr, int bit)
+{
+	unsigned long __ptr = (unsigned long)ptr;
+
+	__ptr &= ~BIT(bit);
+	return (void *)__ptr;
+}
+
+static inline pgd_t *kernel_to_user_pgdp(pgd_t *pgdp)
+{
+	return ptr_set_bit(pgdp, PTI_PGTABLE_SWITCH_BIT);
+}
+
+static inline pgd_t *user_to_kernel_pgdp(pgd_t *pgdp)
+{
+	return ptr_clear_bit(pgdp, PTI_PGTABLE_SWITCH_BIT);
+}
+
+static inline p4d_t *kernel_to_user_p4dp(p4d_t *p4dp)
+{
+	return ptr_set_bit(p4dp, PTI_PGTABLE_SWITCH_BIT);
+}
+
+static inline p4d_t *user_to_kernel_p4dp(p4d_t *p4dp)
+{
+	return ptr_clear_bit(p4dp, PTI_PGTABLE_SWITCH_BIT);
+}
+#endif /* CONFIG_PAGE_TABLE_ISOLATION */
+
+/*
+ * Page table pages are page-aligned.  The lower half of the top
+ * level is used for userspace and the top half for the kernel.
+ *
+ * Returns true for parts of the PGD that map userspace and
+ * false for the parts that map the kernel.
+ */
+static inline bool pgdp_maps_userspace(void *__ptr)
+{
+	unsigned long ptr = (unsigned long)__ptr;
+
+	return (ptr & ~PAGE_MASK) < (PAGE_SIZE / 2);
+}
+
+#ifdef CONFIG_PAGE_TABLE_ISOLATION
+pgd_t __pti_set_user_pgd(pgd_t *pgdp, pgd_t pgd);
+
+/*
+ * Take a PGD location (pgdp) and a pgd value that needs to be set there.
+ * Populates the user and returns the resulting PGD that must be set in
+ * the kernel copy of the page tables.
+ */
+static inline pgd_t pti_set_user_pgd(pgd_t *pgdp, pgd_t pgd)
+{
+	if (!static_cpu_has(X86_FEATURE_PTI))
+		return pgd;
+	return __pti_set_user_pgd(pgdp, pgd);
+}
+#else
+static inline pgd_t pti_set_user_pgd(pgd_t *pgdp, pgd_t pgd)
+{
+	return pgd;
+}
+#endif
+
 static inline void native_set_p4d(p4d_t *p4dp, p4d_t p4d)
 {
+#if defined(CONFIG_PAGE_TABLE_ISOLATION) && !defined(CONFIG_X86_5LEVEL)
+	p4dp->pgd = pti_set_user_pgd(&p4dp->pgd, p4d.pgd);
+#else
 	*p4dp = p4d;
+#endif
 }
 
 static inline void native_p4d_clear(p4d_t *p4d)
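
The helpers added in this hunk encode two conventions: the user copy of an order-1 PGD lives one 4 KiB page above the kernel copy (bit 12), and within a 4 KiB top-level page the first 256 of the 512 entries cover user addresses while the rest cover the kernel. Below is a self-contained user-space model of pgdp_maps_userspace() and the kernel/user pointer conversions; the aligned_alloc call is only a stand-in for a real PGD page.

#include <stdio.h>
#include <stdint.h>
#include <stdlib.h>

#define PAGE_SIZE		4096UL
#define PAGE_MASK		(~(PAGE_SIZE - 1))
#define PTI_PGTABLE_SWITCH_BIT	12	/* PAGE_SHIFT */

typedef struct { uint64_t pgd; } pgd_t;

static pgd_t *kernel_to_user_pgdp(pgd_t *pgdp)
{
	return (pgd_t *)((uintptr_t)pgdp | (1UL << PTI_PGTABLE_SWITCH_BIT));
}

static pgd_t *user_to_kernel_pgdp(pgd_t *pgdp)
{
	return (pgd_t *)((uintptr_t)pgdp & ~(1UL << PTI_PGTABLE_SWITCH_BIT));
}

/* Entries in the lower half of a 4 KiB top-level page map userspace. */
static int pgdp_maps_userspace(void *__ptr)
{
	uintptr_t ptr = (uintptr_t)__ptr;

	return (ptr & ~PAGE_MASK) < (PAGE_SIZE / 2);
}

int main(void)
{
	pgd_t *pgd = aligned_alloc(2 * PAGE_SIZE, 2 * PAGE_SIZE);	/* order-1 stand-in */

	if (!pgd)
		return 1;
	printf("entry   0 maps userspace: %d\n", pgdp_maps_userspace(&pgd[0]));
	printf("entry 256 maps userspace: %d\n", pgdp_maps_userspace(&pgd[256]));
	printf("user copy of entry 0:     %p\n", (void *)kernel_to_user_pgdp(&pgd[0]));
	printf("back to kernel copy:      %p\n",
	       (void *)user_to_kernel_pgdp(kernel_to_user_pgdp(&pgd[0])));
	return 0;
}
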
@@ -147,7 +235,11 @@ static inline void native_p4d_clear(p4d_t *p4d)
 
 static inline void native_set_pgd(pgd_t *pgdp, pgd_t pgd)
 {
+#ifdef CONFIG_PAGE_TABLE_ISOLATION
+	*pgdp = pti_set_user_pgd(pgdp, pgd);
+#else
 	*pgdp = pgd;
+#endif
 }
 
 static inline void native_pgd_clear(pgd_t *pgd)
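
native_set_pgd() above always writes the kernel entry, but with PTI compiled in it first routes the value through pti_set_user_pgd(), which is a no-op unless the X86_FEATURE_PTI feature is set. A hedged user-space model of that gate follows; the pti_enabled flag and the shadow-table mirroring are illustrative stand-ins, and any flag adjustments done by the real __pti_set_user_pgd() are deliberately not modelled.

#include <stdint.h>
#include <stdio.h>

typedef struct { uint64_t pgd; } pgd_t;

/* Illustrative stand-ins for the static cpu feature check and the two tables. */
static int pti_enabled = 1;
static pgd_t kernel_pgd[512];
static pgd_t user_pgd[512];

/* Model: mirror the new value into the user table, return what the kernel
 * table should hold. */
static pgd_t pti_set_user_pgd(pgd_t *pgdp, pgd_t pgd)
{
	if (!pti_enabled)
		return pgd;
	user_pgd[pgdp - kernel_pgd] = pgd;
	return pgd;
}

static void native_set_pgd(pgd_t *pgdp, pgd_t pgd)
{
	*pgdp = pti_set_user_pgd(pgdp, pgd);
}

int main(void)
{
	pgd_t val = { .pgd = 0x1234063 };	/* arbitrary entry value */

	native_set_pgd(&kernel_pgd[5], val);
	printf("kernel entry 5: %#llx\n", (unsigned long long)kernel_pgd[5].pgd);
	printf("user entry 5:   %#llx\n", (unsigned long long)user_pgd[5].pgd);
	return 0;
}
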

Too many files were changed in this diff, so some files are not shown.