
Merge tag 'asoc-v4.16-3' of https://git.kernel.org/pub/scm/linux/kernel/git/broonie/sound into for-next

ASoC: Updates for v4.16

Some final updates for the merge window, this brings in some
improvements to the ACPI GPIO handling for Intel and a bunch of fixes.
Takashi Iwai, 7 years ago (commit c86d95cb6b)
100 changed files with 1959 additions and 344 deletions
  1. 16 0
      Documentation/ABI/testing/sysfs-devices-system-cpu
  2. 48 10
      Documentation/admin-guide/kernel-parameters.txt
  3. 29 4
      Documentation/devicetree/bindings/sound/mxs-audio-sgtl5000.txt
  4. 2 2
      Documentation/filesystems/nilfs2.txt
  5. 15 8
      Documentation/kbuild/kconfig-language.txt
  6. 1 1
      Documentation/networking/index.rst
  7. 4 0
      Documentation/networking/msg_zerocopy.rst
  8. 1 1
      Documentation/usb/gadget-testing.txt
  9. 186 0
      Documentation/x86/pti.txt
  10. 3 3
      MAINTAINERS
  11. 24 21
      Makefile
  12. 1 1
      arch/ia64/kernel/time.c
  13. 2 0
      arch/mips/kernel/cps-vec.S
  14. 12 0
      arch/mips/kernel/process.c
  15. 122 25
      arch/mips/kernel/ptrace.c
  16. 6 0
      arch/powerpc/include/asm/exception-64e.h
  17. 55 2
      arch/powerpc/include/asm/exception-64s.h
  18. 13 0
      arch/powerpc/include/asm/feature-fixups.h
  19. 17 0
      arch/powerpc/include/asm/hvcall.h
  20. 10 0
      arch/powerpc/include/asm/paca.h
  21. 14 0
      arch/powerpc/include/asm/plpar_wrappers.h
  22. 13 0
      arch/powerpc/include/asm/setup.h
  23. 5 0
      arch/powerpc/kernel/asm-offsets.c
  24. 36 8
      arch/powerpc/kernel/entry_64.S
  25. 124 13
      arch/powerpc/kernel/exceptions-64s.S
  26. 101 0
      arch/powerpc/kernel/setup_64.c
  27. 9 0
      arch/powerpc/kernel/vmlinux.lds.S
  28. 1 0
      arch/powerpc/kvm/book3s_64_mmu.c
  29. 61 29
      arch/powerpc/kvm/book3s_64_mmu_hv.c
  30. 4 5
      arch/powerpc/kvm/book3s_hv_rmhandlers.S
  31. 2 0
      arch/powerpc/kvm/book3s_pr.c
  32. 5 2
      arch/powerpc/kvm/book3s_rmhandlers.S
  33. 2 2
      arch/powerpc/kvm/book3s_segment.S
  34. 41 0
      arch/powerpc/lib/feature-fixups.c
  35. 49 0
      arch/powerpc/platforms/powernv/setup.c
  36. 18 3
      arch/powerpc/platforms/pseries/dlpar.c
  37. 2 0
      arch/powerpc/platforms/pseries/pseries.h
  38. 2 1
      arch/powerpc/platforms/pseries/ras.c
  39. 35 0
      arch/powerpc/platforms/pseries/setup.c
  40. 75 0
      arch/riscv/configs/defconfig
  41. 4 4
      arch/riscv/include/asm/csr.h
  42. 0 4
      arch/riscv/include/asm/io.h
  43. 5 5
      arch/riscv/include/asm/irqflags.h
  44. 0 4
      arch/riscv/include/asm/pgtable.h
  45. 1 1
      arch/riscv/include/asm/ptrace.h
  46. 0 4
      arch/riscv/include/asm/tlbflush.h
  47. 0 12
      arch/riscv/include/asm/uaccess.h
  48. 1 0
      arch/riscv/include/asm/unistd.h
  49. 0 28
      arch/riscv/include/asm/vdso-syscalls.h
  50. 26 0
      arch/riscv/include/uapi/asm/syscalls.h
  51. 4 4
      arch/riscv/kernel/entry.S
  52. 2 2
      arch/riscv/kernel/process.c
  53. 0 1
      arch/riscv/kernel/syscall_table.c
  54. 0 1
      arch/riscv/kernel/vdso/flush_icache.S
  55. 1 1
      arch/riscv/mm/fault.c
  56. 20 4
      arch/sh/boards/mach-se/770x/setup.c
  57. 1 0
      arch/sh/include/mach-se/mach/se.h
  58. 14 1
      arch/x86/Kconfig
  59. 8 0
      arch/x86/Makefile
  60. 3 2
      arch/x86/crypto/aesni-intel_asm.S
  61. 2 1
      arch/x86/crypto/camellia-aesni-avx-asm_64.S
  62. 2 1
      arch/x86/crypto/camellia-aesni-avx2-asm_64.S
  63. 2 1
      arch/x86/crypto/crc32c-pcl-intel-asm_64.S
  64. 19 17
      arch/x86/entry/calling.h
  65. 3 2
      arch/x86/entry/entry_32.S
  66. 9 3
      arch/x86/entry/entry_64.S
  67. 18 0
      arch/x86/events/intel/bts.c
  68. 25 0
      arch/x86/include/asm/asm-prototypes.h
  69. 4 0
      arch/x86/include/asm/cpufeatures.h
  70. 10 8
      arch/x86/include/asm/mshyperv.h
  71. 3 0
      arch/x86/include/asm/msr-index.h
  72. 214 0
      arch/x86/include/asm/nospec-branch.h
  73. 1 0
      arch/x86/include/asm/pci_x86.h
  74. 1 1
      arch/x86/include/asm/processor-flags.h
  75. 3 3
      arch/x86/include/asm/tlbflush.h
  76. 3 2
      arch/x86/include/asm/xen/hypercall.h
  77. 5 2
      arch/x86/kernel/alternative.c
  78. 26 2
      arch/x86/kernel/cpu/amd.c
  79. 185 0
      arch/x86/kernel/cpu/bugs.c
  80. 3 0
      arch/x86/kernel/cpu/common.c
  81. 11 2
      arch/x86/kernel/cpu/microcode/intel.c
  82. 4 2
      arch/x86/kernel/ftrace_32.S
  83. 4 4
      arch/x86/kernel/ftrace_64.S
  84. 5 4
      arch/x86/kernel/irq_32.c
  85. 11 0
      arch/x86/kernel/tboot.c
  86. 12 7
      arch/x86/kvm/mmu.c
  87. 5 8
      arch/x86/kvm/svm.c
  88. 15 6
      arch/x86/kvm/vmx.c
  89. 1 0
      arch/x86/lib/Makefile
  90. 4 3
      arch/x86/lib/checksum_32.S
  91. 48 0
      arch/x86/lib/retpoline.S
  92. 6 26
      arch/x86/mm/pti.c
  93. 5 0
      arch/x86/pci/common.c
  94. 18 11
      arch/x86/pci/fixup.c
  95. 2 0
      arch/x86/platform/efi/efi_64.c
  96. 1 1
      arch/x86/platform/intel-mid/device_libs/platform_bt.c
  97. 3 5
      arch/x86/xen/mmu_pv.c
  98. 1 1
      arch/x86/xen/xen-ops.h
  99. 7 2
      block/blk-core.c
  100. 2 0
      block/blk-mq.c

+ 16 - 0
Documentation/ABI/testing/sysfs-devices-system-cpu

@@ -375,3 +375,19 @@ Contact:	Linux kernel mailing list <linux-kernel@vger.kernel.org>
 Description:	information about CPUs heterogeneity.

 		cpu_capacity: capacity of cpu#.
+
+What:		/sys/devices/system/cpu/vulnerabilities
+		/sys/devices/system/cpu/vulnerabilities/meltdown
+		/sys/devices/system/cpu/vulnerabilities/spectre_v1
+		/sys/devices/system/cpu/vulnerabilities/spectre_v2
+Date:		January 2018
+Contact:	Linux kernel mailing list <linux-kernel@vger.kernel.org>
+Description:	Information about CPU vulnerabilities
+
+		The files are named after the code names of CPU
+		vulnerabilities. The output of those files reflects the
+		state of the CPUs in the system. Possible output values:
+
+		"Not affected"	  CPU is not affected by the vulnerability
+		"Vulnerable"	  CPU is affected and no mitigation in effect
+		"Mitigation: $M"  CPU is affected and mitigation $M is in effect

+ 48 - 10
Documentation/admin-guide/kernel-parameters.txt

@@ -713,9 +713,6 @@
 			It will be ignored when crashkernel=X,high is not used
 			or memory reserved is below 4G.

-	crossrelease_fullstack
-			[KNL] Allow to record full stack trace in cross-release
-
 	cryptomgr.notests
                         [KNL] Disable crypto self-tests

@@ -2626,6 +2623,11 @@
 	nosmt		[KNL,S390] Disable symmetric multithreading (SMT).
 			Equivalent to smt=1.

+	nospectre_v2	[X86] Disable all mitigations for the Spectre variant 2
+			(indirect branch prediction) vulnerability. System may
+			allow data leaks with this option, which is equivalent
+			to spectre_v2=off.
+
 	noxsave		[BUGS=X86] Disables x86 extended register state save
 			and restore using xsave. The kernel will fallback to
 			enabling legacy floating-point and sse state.
@@ -2712,8 +2714,6 @@
 			steal time is computed, but won't influence scheduler
 			behaviour

-	nopti		[X86-64] Disable kernel page table isolation
-
 	nolapic		[X86-32,APIC] Do not enable or use the local APIC.

 	nolapic_timer	[X86-32,APIC] Do not use the local APIC timer.
@@ -3100,6 +3100,12 @@
 		pcie_scan_all	Scan all possible PCIe devices.  Otherwise we
 				only look for one device below a PCIe downstream
 				port.
+		big_root_window	Try to add a big 64bit memory window to the PCIe
+				root complex on AMD CPUs. Some GFX hardware
+				can resize a BAR to allow access to all VRAM.
+				Adding the window is slightly risky (it may
+				conflict with unreported devices), so this
+				taints the kernel.

 	pcie_aspm=	[PCIE] Forcibly enable or disable PCIe Active State Power
 			Management.
@@ -3288,11 +3294,20 @@
 	pt.		[PARIDE]
 			See Documentation/blockdev/paride.txt.

-	pti=		[X86_64]
-			Control user/kernel address space isolation:
-			on - enable
-			off - disable
-			auto - default setting
+	pti=		[X86_64] Control Page Table Isolation of user and
+			kernel address spaces.  Disabling this feature
+			removes hardening, but improves performance of
+			system calls and interrupts.
+
+			on   - unconditionally enable
+			off  - unconditionally disable
+			auto - kernel detects whether your CPU model is
+			       vulnerable to issues that PTI mitigates
+
+			Not specifying this option is equivalent to pti=auto.
+
+	nopti		[X86_64]
+			Equivalent to pti=off

 	pty.legacy_count=
 			[KNL] Number of legacy pty's. Overwrites compiled-in
@@ -3943,6 +3958,29 @@
 	sonypi.*=	[HW] Sony Programmable I/O Control Device driver
 			See Documentation/laptops/sonypi.txt

+	spectre_v2=	[X86] Control mitigation of Spectre variant 2
+			(indirect branch speculation) vulnerability.
+
+			on   - unconditionally enable
+			off  - unconditionally disable
+			auto - kernel detects whether your CPU model is
+			       vulnerable
+
+			Selecting 'on' will, and 'auto' may, choose a
+			mitigation method at run time according to the
+			CPU, the available microcode, the setting of the
+			CONFIG_RETPOLINE configuration option, and the
+			compiler with which the kernel was built.
+
+			Specific mitigations can also be selected manually:
+
+			retpoline	  - replace indirect branches
+			retpoline,generic - google's original retpoline
+			retpoline,amd     - AMD-specific minimal thunk
+
+			Not specifying this option is equivalent to
+			spectre_v2=auto.
+
 	spia_io_base=	[HW,MTD]
 	spia_fio_base=
 	spia_pedr=

+ 29 - 4
Documentation/devicetree/bindings/sound/mxs-audio-sgtl5000.txt

@@ -1,10 +1,31 @@
 * Freescale MXS audio complex with SGTL5000 codec

 Required properties:
-- compatible: "fsl,mxs-audio-sgtl5000"
-- model: The user-visible name of this sound complex
-- saif-controllers: The phandle list of the MXS SAIF controller
-- audio-codec: The phandle of the SGTL5000 audio codec
+- compatible		: "fsl,mxs-audio-sgtl5000"
+- model			: The user-visible name of this sound complex
+- saif-controllers	: The phandle list of the MXS SAIF controller
+- audio-codec		: The phandle of the SGTL5000 audio codec
+- audio-routing		: A list of the connections between audio components.
+			  Each entry is a pair of strings, the first being the
+			  connection's sink, the second being the connection's
+			  source. Valid names could be power supplies, SGTL5000
+			  pins, and the jacks on the board:
+
+			  Power supplies:
+			   * Mic Bias
+
+			  SGTL5000 pins:
+			   * MIC_IN
+			   * LINE_IN
+			   * HP_OUT
+			   * LINE_OUT
+
+			  Board connectors:
+			   * Mic Jack
+			   * Line In Jack
+			   * Headphone Jack
+			   * Line Out Jack
+			   * Ext Spk

 Example:

@@ -14,4 +35,8 @@ sound {
 	model = "imx28-evk-sgtl5000";
 	saif-controllers = <&saif0 &saif1>;
 	audio-codec = <&sgtl5000>;
+	audio-routing =
+		"MIC_IN", "Mic Jack",
+		"Mic Jack", "Mic Bias",
+		"Headphone Jack", "HP_OUT";
 };

+ 2 - 2
Documentation/filesystems/nilfs2.txt

@@ -25,8 +25,8 @@ available from the following download page.  At least "mkfs.nilfs2",
 cleaner or garbage collector) are required.  Details on the tools are
 described in the man pages included in the package.

-Project web page:    http://nilfs.sourceforge.net/
-Download page:       http://nilfs.sourceforge.net/en/download.html
+Project web page:    https://nilfs.sourceforge.io/
+Download page:       https://nilfs.sourceforge.io/en/download.html
 List info:           http://vger.kernel.org/vger-lists.html#linux-nilfs

 Caveats

+ 15 - 8
Documentation/kbuild/kconfig-language.txt

@@ -200,10 +200,14 @@ module state. Dependency expressions have the following syntax:
 <expr> ::= <symbol>                             (1)
            <symbol> '=' <symbol>                (2)
            <symbol> '!=' <symbol>               (3)
-           '(' <expr> ')'                       (4)
-           '!' <expr>                           (5)
-           <expr> '&&' <expr>                   (6)
-           <expr> '||' <expr>                   (7)
+           <symbol1> '<' <symbol2>              (4)
+           <symbol1> '>' <symbol2>              (4)
+           <symbol1> '<=' <symbol2>             (4)
+           <symbol1> '>=' <symbol2>             (4)
+           '(' <expr> ')'                       (5)
+           '!' <expr>                           (6)
+           <expr> '&&' <expr>                   (7)
+           <expr> '||' <expr>                   (8)

 Expressions are listed in decreasing order of precedence.

@@ -214,10 +218,13 @@ Expressions are listed in decreasing order of precedence.
     otherwise 'n'.
 (3) If the values of both symbols are equal, it returns 'n',
     otherwise 'y'.
-(4) Returns the value of the expression. Used to override precedence.
-(5) Returns the result of (2-/expr/).
-(6) Returns the result of min(/expr/, /expr/).
-(7) Returns the result of max(/expr/, /expr/).
+(4) If value of <symbol1> is respectively lower, greater, lower-or-equal,
+    or greater-or-equal than value of <symbol2>, it returns 'y',
+    otherwise 'n'.
+(5) Returns the value of the expression. Used to override precedence.
+(6) Returns the result of (2-/expr/).
+(7) Returns the result of min(/expr/, /expr/).
+(8) Returns the result of max(/expr/, /expr/).

 An expression can have a value of 'n', 'm' or 'y' (or 0, 1, 2
 respectively for calculations). A menu entry becomes visible when its

+ 1 - 1
Documentation/networking/index.rst

@@ -9,6 +9,7 @@ Contents:
    batman-adv
    kapi
    z8530book
+   msg_zerocopy

 .. only::  subproject

@@ -16,4 +17,3 @@ Contents:
    =======

    * :ref:`genindex`
-

+ 4 - 0
Documentation/networking/msg_zerocopy.rst

@@ -72,6 +72,10 @@ this flag, a process must first signal intent by setting a socket option:
 	if (setsockopt(fd, SOL_SOCKET, SO_ZEROCOPY, &one, sizeof(one)))
 		error(1, errno, "setsockopt zerocopy");

+Setting the socket option only works when the socket is in its initial
+(TCP_CLOSED) state.  Trying to set the option for a socket returned by accept(),
+for example, will lead to an EBUSY error. In this case, the option should be set
+to the listening socket and it will be inherited by the accepted sockets.

 Transmission
 ------------
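
A minimal sketch of the pattern described above (not part of the patch): the option is set while the socket is still in its initial state, before listen(), so sockets returned by accept() inherit it. The SO_ZEROCOPY fallback value below is an assumption for older userspace headers.

    #include <netinet/in.h>
    #include <sys/socket.h>

    #ifndef SO_ZEROCOPY
    #define SO_ZEROCOPY 60    /* value from asm-generic/socket.h */
    #endif

    /* Enable zerocopy on the listener; accepted sockets inherit the flag. */
    int make_zerocopy_listener(unsigned short port)
    {
        struct sockaddr_in addr = {
            .sin_family = AF_INET,
            .sin_port   = htons(port),
            .sin_addr   = { .s_addr = htonl(INADDR_ANY) },
        };
        int one = 1;
        int fd = socket(AF_INET, SOCK_STREAM, 0);

        if (fd < 0)
            return -1;
        /* Must happen before accept(); on an accepted socket this fails with EBUSY. */
        if (setsockopt(fd, SOL_SOCKET, SO_ZEROCOPY, &one, sizeof(one)) < 0)
            return -1;
        if (bind(fd, (struct sockaddr *)&addr, sizeof(addr)) < 0 ||
            listen(fd, 16) < 0)
            return -1;
        return fd;
    }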

+ 1 - 1
Documentation/usb/gadget-testing.txt

@@ -693,7 +693,7 @@ such specification consists of a number of lines with an inverval value
 in each line. The rules stated above are best illustrated with an example:

 # mkdir functions/uvc.usb0/control/header/h
-# cd functions/uvc.usb0/control/header/h
+# cd functions/uvc.usb0/control/
 # ln -s header/h class/fs
 # ln -s header/h class/ss
 # mkdir -p functions/uvc.usb0/streaming/uncompressed/u/360p

+ 186 - 0
Documentation/x86/pti.txt

@@ -0,0 +1,186 @@
+Overview
+========
+
+Page Table Isolation (pti, previously known as KAISER[1]) is a
+countermeasure against attacks on the shared user/kernel address
+space such as the "Meltdown" approach[2].
+
+To mitigate this class of attacks, we create an independent set of
+page tables for use only when running userspace applications.  When
+the kernel is entered via syscalls, interrupts or exceptions, the
+page tables are switched to the full "kernel" copy.  When the system
+switches back to user mode, the user copy is used again.
+
+The userspace page tables contain only a minimal amount of kernel
+data: only what is needed to enter/exit the kernel such as the
+entry/exit functions themselves and the interrupt descriptor table
+(IDT).  There are a few strictly unnecessary things that get mapped
+such as the first C function when entering an interrupt (see
+comments in pti.c).
+
+This approach helps to ensure that side-channel attacks leveraging
+the paging structures do not function when PTI is enabled.  It can be
+enabled by setting CONFIG_PAGE_TABLE_ISOLATION=y at compile time.
+Once enabled at compile-time, it can be disabled at boot with the
+'nopti' or 'pti=' kernel parameters (see kernel-parameters.txt).
+
+Page Table Management
+=====================
+
+When PTI is enabled, the kernel manages two sets of page tables.
+The first set is very similar to the single set which is present in
+kernels without PTI.  This includes a complete mapping of userspace
+that the kernel can use for things like copy_to_user().
+
+Although _complete_, the user portion of the kernel page tables is
+crippled by setting the NX bit in the top level.  This ensures
+that any missed kernel->user CR3 switch will immediately crash
+userspace upon executing its first instruction.
+
+The userspace page tables map only the kernel data needed to enter
+and exit the kernel.  This data is entirely contained in the 'struct
+cpu_entry_area' structure which is placed in the fixmap which gives
+each CPU's copy of the area a compile-time-fixed virtual address.
+
+For new userspace mappings, the kernel makes the entries in its
+page tables like normal.  The only difference is when the kernel
+makes entries in the top (PGD) level.  In addition to setting the
+entry in the main kernel PGD, a copy of the entry is made in the
+userspace page tables' PGD.
+
+This sharing at the PGD level also inherently shares all the lower
+layers of the page tables.  This leaves a single, shared set of
+userspace page tables to manage.  One PTE to lock, one set of
+accessed bits, dirty bits, etc...
+
+Overhead
+========
+
+Protection against side-channel attacks is important.  But,
+this protection comes at a cost:
+
+1. Increased Memory Use
+  a. Each process now needs an order-1 PGD instead of order-0.
+     (Consumes an additional 4k per process).
+  b. The 'cpu_entry_area' structure must be 2MB in size and 2MB
+     aligned so that it can be mapped by setting a single PMD
+     entry.  This consumes nearly 2MB of RAM once the kernel
+     is decompressed, but no space in the kernel image itself.
+
+2. Runtime Cost
+  a. CR3 manipulation to switch between the page table copies
+     must be done at interrupt, syscall, and exception entry
+     and exit (it can be skipped when the kernel is interrupted,
+     though.)  Moves to CR3 are on the order of a hundred
+     cycles, and are required at every entry and exit.
+  b. A "trampoline" must be used for SYSCALL entry.  This
+     trampoline depends on a smaller set of resources than the
+     non-PTI SYSCALL entry code, so requires mapping fewer
+     things into the userspace page tables.  The downside is
+     that stacks must be switched at entry time.
+  c. Global pages are disabled for all kernel structures not
+     mapped into both kernel and userspace page tables.  This
+     feature of the MMU allows different processes to share TLB
+     entries mapping the kernel.  Losing the feature means more
+     TLB misses after a context switch.  The actual loss of
+     performance is very small, however, never exceeding 1%.
+  d. Process Context IDentifiers (PCID) is a CPU feature that
+     allows us to skip flushing the entire TLB when switching page
+     tables by setting a special bit in CR3 when the page tables
+     are changed.  This makes switching the page tables (at context
+     switch, or kernel entry/exit) cheaper.  But, on systems with
+     PCID support, the context switch code must flush both the user
+     and kernel entries out of the TLB.  The user PCID TLB flush is
+     deferred until the exit to userspace, minimizing the cost.
+     See intel.com/sdm for the gory PCID/INVPCID details.
+  e. The userspace page tables must be populated for each new
+     process.  Even without PTI, the shared kernel mappings
+     are created by copying top-level (PGD) entries into each
+     new process.  But, with PTI, there are now *two* kernel
+     mappings: one in the kernel page tables that maps everything
+     and one for the entry/exit structures.  At fork(), we need to
+     copy both.
+  f. In addition to the fork()-time copying, there must also
+     be an update to the userspace PGD any time a set_pgd() is done
+     on a PGD used to map userspace.  This ensures that the kernel
+     and userspace copies always map the same userspace
+     memory.
+  g. On systems without PCID support, each CR3 write flushes
+     the entire TLB.  That means that each syscall, interrupt
+     or exception flushes the TLB.
+  h. INVPCID is a TLB-flushing instruction which allows flushing
+     of TLB entries for non-current PCIDs.  Some systems support
+     PCIDs, but do not support INVPCID.  On these systems, addresses
+     can only be flushed from the TLB for the current PCID.  When
+     flushing a kernel address, we need to flush all PCIDs, so a
+     single kernel address flush will require a TLB-flushing CR3
+     write upon the next use of every PCID.
+
+Possible Future Work
+====================
+1. We can be more careful about not actually writing to CR3
+   unless its value is actually changed.
+2. Allow PTI to be enabled/disabled at runtime in addition to the
+   boot-time switching.
+
+Testing
+========
+
+To test stability of PTI, the following test procedure is recommended,
+ideally doing all of these in parallel:
+
+1. Set CONFIG_DEBUG_ENTRY=y
+2. Run several copies of all of the tools/testing/selftests/x86/ tests
+   (excluding MPX and protection_keys) in a loop on multiple CPUs for
+   several minutes.  These tests frequently uncover corner cases in the
+   kernel entry code.  In general, old kernels might cause these tests
+   themselves to crash, but they should never crash the kernel.
+3. Run the 'perf' tool in a mode (top or record) that generates many
+   frequent performance monitoring non-maskable interrupts (see "NMI"
+   in /proc/interrupts).  This exercises the NMI entry/exit code which
+   is known to trigger bugs in code paths that did not expect to be
+   interrupted, including nested NMIs.  Using "-c" boosts the rate of
+   NMIs, and using two -c with separate counters encourages nested NMIs
+   and less deterministic behavior.
+
+	while true; do perf record -c 10000 -e instructions,cycles -a sleep 10; done
+
+4. Launch a KVM virtual machine.
+5. Run 32-bit binaries on systems supporting the SYSCALL instruction.
+   This has been a lightly-tested code path and needs extra scrutiny.
+
+Debugging
+=========
+
+Bugs in PTI cause a few different signatures of crashes
+that are worth noting here.
+
+ * Failures of the selftests/x86 code.  Usually a bug in one of the
+   more obscure corners of entry_64.S
+ * Crashes in early boot, especially around CPU bringup.  Bugs
+   in the trampoline code or mappings cause these.
+ * Crashes at the first interrupt.  Caused by bugs in entry_64.S,
+   like screwing up a page table switch.  Also caused by
+   incorrectly mapping the IRQ handler entry code.
+ * Crashes at the first NMI.  The NMI code is separate from main
+   interrupt handlers and can have bugs that do not affect
+   normal interrupts.  Also caused by incorrectly mapping NMI
+   code.  NMIs that interrupt the entry code must be very
+   careful and can be the cause of crashes that show up when
+   running perf.
+ * Kernel crashes at the first exit to userspace.  entry_64.S
+   bugs, or failing to map some of the exit code.
+ * Crashes at first interrupt that interrupts userspace. The paths
+   in entry_64.S that return to userspace are sometimes separate
+   from the ones that return to the kernel.
+ * Double faults: overflowing the kernel stack because of page
+   faults upon page faults.  Caused by touching non-pti-mapped
+   data in the entry code, or forgetting to switch to kernel
+   CR3 before calling into C functions which are not pti-mapped.
+ * Userspace segfaults early in boot, sometimes manifesting
+   as mount(8) failing to mount the rootfs.  These have
+   tended to be TLB invalidation issues.  Usually invalidating
+   the wrong PCID, or otherwise missing an invalidation.
+
+1. https://gruss.cc/files/kaiser.pdf
+2. https://meltdownattack.com/meltdown.pdf
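
The per-syscall entry/exit cost discussed under "Overhead" can be made visible with a trivial user-space loop (a sketch, not part of the patch): time it once with PTI enabled and once after booting with pti=off and compare.

    #include <stdio.h>
    #include <time.h>
    #include <unistd.h>
    #include <sys/syscall.h>

    /* Time many trivial syscalls; each one pays the kernel entry/exit cost. */
    int main(void)
    {
        const long iters = 10 * 1000 * 1000;
        struct timespec start, end;

        clock_gettime(CLOCK_MONOTONIC, &start);
        for (long i = 0; i < iters; i++)
            (void)syscall(SYS_getpid);    /* bypass glibc's cached getpid() */
        clock_gettime(CLOCK_MONOTONIC, &end);

        double ns = (end.tv_sec - start.tv_sec) * 1e9 +
                    (end.tv_nsec - start.tv_nsec);
        printf("%.1f ns per syscall\n", ns / iters);
        return 0;
    }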

+ 3 - 3
MAINTAINERS

@@ -9638,8 +9638,8 @@ F:	include/uapi/linux/sunrpc/
 NILFS2 FILESYSTEM
 M:	Ryusuke Konishi <konishi.ryusuke@lab.ntt.co.jp>
 L:	linux-nilfs@vger.kernel.org
-W:	http://nilfs.sourceforge.net/
-W:	http://nilfs.osdn.jp/
+W:	https://nilfs.sourceforge.io/
+W:	https://nilfs.osdn.jp/
 T:	git git://github.com/konis/nilfs2.git
 S:	Supported
 F:	Documentation/filesystems/nilfs2.txt
@@ -10135,7 +10135,7 @@ F:	drivers/irqchip/irq-ompic.c
 F:	drivers/irqchip/irq-or1k-*

 OPENVSWITCH
-M:	Pravin Shelar <pshelar@nicira.com>
+M:	Pravin B Shelar <pshelar@ovn.org>
 L:	netdev@vger.kernel.org
 L:	dev@openvswitch.org
 W:	http://openvswitch.org

+ 24 - 21
Makefile

@@ -2,7 +2,7 @@
 VERSION = 4
 PATCHLEVEL = 15
 SUBLEVEL = 0
-EXTRAVERSION = -rc7
+EXTRAVERSION = -rc8
 NAME = Fearless Coyote

 # *DOCUMENTATION*
@@ -484,26 +484,6 @@ CLANG_GCC_TC	:= --gcc-toolchain=$(GCC_TOOLCHAIN)
 endif
 KBUILD_CFLAGS += $(CLANG_TARGET) $(CLANG_GCC_TC)
 KBUILD_AFLAGS += $(CLANG_TARGET) $(CLANG_GCC_TC)
-KBUILD_CPPFLAGS += $(call cc-option,-Qunused-arguments,)
-KBUILD_CFLAGS += $(call cc-disable-warning, unused-variable)
-KBUILD_CFLAGS += $(call cc-disable-warning, format-invalid-specifier)
-KBUILD_CFLAGS += $(call cc-disable-warning, gnu)
-KBUILD_CFLAGS += $(call cc-disable-warning, address-of-packed-member)
-# Quiet clang warning: comparison of unsigned expression < 0 is always false
-KBUILD_CFLAGS += $(call cc-disable-warning, tautological-compare)
-# CLANG uses a _MergedGlobals as optimization, but this breaks modpost, as the
-# source of a reference will be _MergedGlobals and not on of the whitelisted names.
-# See modpost pattern 2
-KBUILD_CFLAGS += $(call cc-option, -mno-global-merge,)
-KBUILD_CFLAGS += $(call cc-option, -fcatch-undefined-behavior)
-KBUILD_CFLAGS += $(call cc-option, -no-integrated-as)
-KBUILD_AFLAGS += $(call cc-option, -no-integrated-as)
-else
-
-# These warnings generated too much noise in a regular build.
-# Use make W=1 to enable them (see scripts/Makefile.extrawarn)
-KBUILD_CFLAGS += $(call cc-disable-warning, unused-but-set-variable)
-KBUILD_CFLAGS += $(call cc-disable-warning, unused-const-variable)
 endif

 ifeq ($(config-targets),1)
@@ -716,6 +696,29 @@ ifdef CONFIG_CC_STACKPROTECTOR
 endif
 KBUILD_CFLAGS += $(stackp-flag)

+ifeq ($(cc-name),clang)
+KBUILD_CPPFLAGS += $(call cc-option,-Qunused-arguments,)
+KBUILD_CFLAGS += $(call cc-disable-warning, unused-variable)
+KBUILD_CFLAGS += $(call cc-disable-warning, format-invalid-specifier)
+KBUILD_CFLAGS += $(call cc-disable-warning, gnu)
+KBUILD_CFLAGS += $(call cc-disable-warning, address-of-packed-member)
+# Quiet clang warning: comparison of unsigned expression < 0 is always false
+KBUILD_CFLAGS += $(call cc-disable-warning, tautological-compare)
+# CLANG uses a _MergedGlobals as optimization, but this breaks modpost, as the
+# source of a reference will be _MergedGlobals and not on of the whitelisted names.
+# See modpost pattern 2
+KBUILD_CFLAGS += $(call cc-option, -mno-global-merge,)
+KBUILD_CFLAGS += $(call cc-option, -fcatch-undefined-behavior)
+KBUILD_CFLAGS += $(call cc-option, -no-integrated-as)
+KBUILD_AFLAGS += $(call cc-option, -no-integrated-as)
+else
+
+# These warnings generated too much noise in a regular build.
+# Use make W=1 to enable them (see scripts/Makefile.extrawarn)
+KBUILD_CFLAGS += $(call cc-disable-warning, unused-but-set-variable)
+KBUILD_CFLAGS += $(call cc-disable-warning, unused-const-variable)
+endif
+
 ifdef CONFIG_FRAME_POINTER
 KBUILD_CFLAGS	+= -fno-omit-frame-pointer -fno-optimize-sibling-calls
 else

+ 1 - 1
arch/ia64/kernel/time.c

@@ -88,7 +88,7 @@ void vtime_flush(struct task_struct *tsk)
 	}

 	if (ti->softirq_time) {
-		delta = cycle_to_nsec(ti->softirq_time));
+		delta = cycle_to_nsec(ti->softirq_time);
 		account_system_index_time(tsk, delta, CPUTIME_SOFTIRQ);
 	}


+ 2 - 0
arch/mips/kernel/cps-vec.S

@@ -235,6 +235,7 @@ LEAF(mips_cps_core_init)
 	has_mt	t0, 3f
 	has_mt	t0, 3f
 
 
 	.set	push
 	.set	push
+	.set	MIPS_ISA_LEVEL_RAW
 	.set	mt
 	.set	mt
 
 
 	/* Only allow 1 TC per VPE to execute... */
 	/* Only allow 1 TC per VPE to execute... */
@@ -388,6 +389,7 @@ LEAF(mips_cps_boot_vpes)
 #elif defined(CONFIG_MIPS_MT)
 #elif defined(CONFIG_MIPS_MT)
 
 
 	.set	push
 	.set	push
+	.set	MIPS_ISA_LEVEL_RAW
 	.set	mt
 	.set	mt
 
 
 	/* If the core doesn't support MT then return */
 	/* If the core doesn't support MT then return */

+ 12 - 0
arch/mips/kernel/process.c

@@ -705,6 +705,18 @@ int mips_set_process_fp_mode(struct task_struct *task, unsigned int value)
 	struct task_struct *t;
 	struct task_struct *t;
 	int max_users;
 	int max_users;
 
 
+	/* If nothing to change, return right away, successfully.  */
+	if (value == mips_get_process_fp_mode(task))
+		return 0;
+
+	/* Only accept a mode change if 64-bit FP enabled for o32.  */
+	if (!IS_ENABLED(CONFIG_MIPS_O32_FP64_SUPPORT))
+		return -EOPNOTSUPP;
+
+	/* And only for o32 tasks.  */
+	if (IS_ENABLED(CONFIG_64BIT) && !test_thread_flag(TIF_32BIT_REGS))
+		return -EOPNOTSUPP;
+
 	/* Check the value is valid */
 	/* Check the value is valid */
 	if (value & ~known_bits)
 	if (value & ~known_bits)
 		return -EOPNOTSUPP;
 		return -EOPNOTSUPP;
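
These checks sit behind the PR_SET_FP_MODE prctl; as a hedged user-space sketch (not part of the patch), the new early-success path can be exercised by re-applying the current mode, which now succeeds even in configurations where a real mode change would return EOPNOTSUPP. The fallback prctl constants below are an assumption for older headers.

    #include <errno.h>
    #include <stdio.h>
    #include <string.h>
    #include <sys/prctl.h>

    #ifndef PR_SET_FP_MODE
    #define PR_SET_FP_MODE 45
    #define PR_GET_FP_MODE 46
    #endif

    /* Query the current MIPS FP mode and re-request it unchanged. */
    int main(void)
    {
        int mode = prctl(PR_GET_FP_MODE, 0, 0, 0, 0);

        if (mode < 0) {
            perror("PR_GET_FP_MODE");
            return 1;
        }
        if (prctl(PR_SET_FP_MODE, mode, 0, 0, 0) < 0)
            printf("PR_SET_FP_MODE: %s\n", strerror(errno));
        else
            printf("FP mode 0x%x re-applied successfully\n", mode);
        return 0;
    }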

+ 122 - 25
arch/mips/kernel/ptrace.c

@@ -419,63 +419,160 @@ static int gpr64_set(struct task_struct *target,
 
 
 #endif /* CONFIG_64BIT */
 #endif /* CONFIG_64BIT */
 
 
+/*
+ * Copy the floating-point context to the supplied NT_PRFPREG buffer,
+ * !CONFIG_CPU_HAS_MSA variant.  FP context's general register slots
+ * correspond 1:1 to buffer slots.  Only general registers are copied.
+ */
+static int fpr_get_fpa(struct task_struct *target,
+		       unsigned int *pos, unsigned int *count,
+		       void **kbuf, void __user **ubuf)
+{
+	return user_regset_copyout(pos, count, kbuf, ubuf,
+				   &target->thread.fpu,
+				   0, NUM_FPU_REGS * sizeof(elf_fpreg_t));
+}
+
+/*
+ * Copy the floating-point context to the supplied NT_PRFPREG buffer,
+ * CONFIG_CPU_HAS_MSA variant.  Only lower 64 bits of FP context's
+ * general register slots are copied to buffer slots.  Only general
+ * registers are copied.
+ */
+static int fpr_get_msa(struct task_struct *target,
+		       unsigned int *pos, unsigned int *count,
+		       void **kbuf, void __user **ubuf)
+{
+	unsigned int i;
+	u64 fpr_val;
+	int err;
+
+	BUILD_BUG_ON(sizeof(fpr_val) != sizeof(elf_fpreg_t));
+	for (i = 0; i < NUM_FPU_REGS; i++) {
+		fpr_val = get_fpr64(&target->thread.fpu.fpr[i], 0);
+		err = user_regset_copyout(pos, count, kbuf, ubuf,
+					  &fpr_val, i * sizeof(elf_fpreg_t),
+					  (i + 1) * sizeof(elf_fpreg_t));
+		if (err)
+			return err;
+	}
+
+	return 0;
+}
+
+/*
+ * Copy the floating-point context to the supplied NT_PRFPREG buffer.
+ * Choose the appropriate helper for general registers, and then copy
+ * the FCSR register separately.
+ */
 static int fpr_get(struct task_struct *target,
 static int fpr_get(struct task_struct *target,
 		   const struct user_regset *regset,
 		   const struct user_regset *regset,
 		   unsigned int pos, unsigned int count,
 		   unsigned int pos, unsigned int count,
 		   void *kbuf, void __user *ubuf)
 		   void *kbuf, void __user *ubuf)
 {
 {
-	unsigned i;
+	const int fcr31_pos = NUM_FPU_REGS * sizeof(elf_fpreg_t);
 	int err;
 	int err;
-	u64 fpr_val;
 
 
-	/* XXX fcr31  */
+	if (sizeof(target->thread.fpu.fpr[0]) == sizeof(elf_fpreg_t))
+		err = fpr_get_fpa(target, &pos, &count, &kbuf, &ubuf);
+	else
+		err = fpr_get_msa(target, &pos, &count, &kbuf, &ubuf);
+	if (err)
+		return err;
 
 
-	if (sizeof(target->thread.fpu.fpr[i]) == sizeof(elf_fpreg_t))
-		return user_regset_copyout(&pos, &count, &kbuf, &ubuf,
-					   &target->thread.fpu,
-					   0, sizeof(elf_fpregset_t));
+	err = user_regset_copyout(&pos, &count, &kbuf, &ubuf,
+				  &target->thread.fpu.fcr31,
+				  fcr31_pos, fcr31_pos + sizeof(u32));
 
 
-	for (i = 0; i < NUM_FPU_REGS; i++) {
-		fpr_val = get_fpr64(&target->thread.fpu.fpr[i], 0);
-		err = user_regset_copyout(&pos, &count, &kbuf, &ubuf,
-					  &fpr_val, i * sizeof(elf_fpreg_t),
-					  (i + 1) * sizeof(elf_fpreg_t));
+	return err;
+}
+
+/*
+ * Copy the supplied NT_PRFPREG buffer to the floating-point context,
+ * !CONFIG_CPU_HAS_MSA variant.   Buffer slots correspond 1:1 to FP
+ * context's general register slots.  Only general registers are copied.
+ */
+static int fpr_set_fpa(struct task_struct *target,
+		       unsigned int *pos, unsigned int *count,
+		       const void **kbuf, const void __user **ubuf)
+{
+	return user_regset_copyin(pos, count, kbuf, ubuf,
+				  &target->thread.fpu,
+				  0, NUM_FPU_REGS * sizeof(elf_fpreg_t));
+}
+
+/*
+ * Copy the supplied NT_PRFPREG buffer to the floating-point context,
+ * CONFIG_CPU_HAS_MSA variant.  Buffer slots are copied to lower 64
+ * bits only of FP context's general register slots.  Only general
+ * registers are copied.
+ */
+static int fpr_set_msa(struct task_struct *target,
+		       unsigned int *pos, unsigned int *count,
+		       const void **kbuf, const void __user **ubuf)
+{
+	unsigned int i;
+	u64 fpr_val;
+	int err;
+
+	BUILD_BUG_ON(sizeof(fpr_val) != sizeof(elf_fpreg_t));
+	for (i = 0; i < NUM_FPU_REGS && *count > 0; i++) {
+		err = user_regset_copyin(pos, count, kbuf, ubuf,
+					 &fpr_val, i * sizeof(elf_fpreg_t),
+					 (i + 1) * sizeof(elf_fpreg_t));
 		if (err)
 		if (err)
 			return err;
 			return err;
+		set_fpr64(&target->thread.fpu.fpr[i], 0, fpr_val);
 	}
 	}
 
 
 	return 0;
 	return 0;
 }
 }
 
 
+/*
+ * Copy the supplied NT_PRFPREG buffer to the floating-point context.
+ * Choose the appropriate helper for general registers, and then copy
+ * the FCSR register separately.
+ *
+ * We optimize for the case where `count % sizeof(elf_fpreg_t) == 0',
+ * which is supposed to have been guaranteed by the kernel before
+ * calling us, e.g. in `ptrace_regset'.  We enforce that requirement,
+ * so that we can safely avoid preinitializing temporaries for
+ * partial register writes.
+ */
 static int fpr_set(struct task_struct *target,
 static int fpr_set(struct task_struct *target,
 		   const struct user_regset *regset,
 		   const struct user_regset *regset,
 		   unsigned int pos, unsigned int count,
 		   unsigned int pos, unsigned int count,
 		   const void *kbuf, const void __user *ubuf)
 		   const void *kbuf, const void __user *ubuf)
 {
 {
-	unsigned i;
+	const int fcr31_pos = NUM_FPU_REGS * sizeof(elf_fpreg_t);
+	u32 fcr31;
 	int err;
 	int err;
-	u64 fpr_val;
 
 
-	/* XXX fcr31  */
+	BUG_ON(count % sizeof(elf_fpreg_t));
+
+	if (pos + count > sizeof(elf_fpregset_t))
+		return -EIO;
 
 
 	init_fp_ctx(target);
 	init_fp_ctx(target);
 
 
-	if (sizeof(target->thread.fpu.fpr[i]) == sizeof(elf_fpreg_t))
-		return user_regset_copyin(&pos, &count, &kbuf, &ubuf,
-					  &target->thread.fpu,
-					  0, sizeof(elf_fpregset_t));
+	if (sizeof(target->thread.fpu.fpr[0]) == sizeof(elf_fpreg_t))
+		err = fpr_set_fpa(target, &pos, &count, &kbuf, &ubuf);
+	else
+		err = fpr_set_msa(target, &pos, &count, &kbuf, &ubuf);
+	if (err)
+		return err;
 
 
-	BUILD_BUG_ON(sizeof(fpr_val) != sizeof(elf_fpreg_t));
-	for (i = 0; i < NUM_FPU_REGS && count >= sizeof(elf_fpreg_t); i++) {
+	if (count > 0) {
 		err = user_regset_copyin(&pos, &count, &kbuf, &ubuf,
 		err = user_regset_copyin(&pos, &count, &kbuf, &ubuf,
-					 &fpr_val, i * sizeof(elf_fpreg_t),
-					 (i + 1) * sizeof(elf_fpreg_t));
+					 &fcr31,
+					 fcr31_pos, fcr31_pos + sizeof(u32));
 		if (err)
 		if (err)
 			return err;
 			return err;
-		set_fpr64(&target->thread.fpu.fpr[i], 0, fpr_val);
+
+		ptrace_setfcr31(target, fcr31);
 	}
 	}
 
 
-	return 0;
+	return err;
 }
 }
 
 
 enum mips_regset {
 enum mips_regset {
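
For context, and only as a sketch that is not part of the patch, the regset reworked above is what a debugger sees through PTRACE_GETREGSET with NT_PRFPREG: 32 doubleword general FP slots followed by the 32-bit FCSR, matching the fcr31_pos offset used in fpr_get()/fpr_set(). A rough user-space readout of an already ptrace-stopped tracee:

    #include <elf.h>
    #include <stdint.h>
    #include <stdio.h>
    #include <sys/ptrace.h>
    #include <sys/types.h>
    #include <sys/uio.h>

    /* Layout mirrored from the kernel regset: 32 general FP slots, then FCSR. */
    struct mips_fpregs {
        uint64_t fp_r[32];
        uint32_t fcr31;
        uint32_t pad;
    };

    int dump_fp(pid_t pid)
    {
        struct mips_fpregs regs;
        struct iovec iov = { .iov_base = &regs, .iov_len = sizeof(regs) };

        /* The tracee must already be stopped under ptrace. */
        if (ptrace(PTRACE_GETREGSET, pid, NT_PRFPREG, &iov) < 0)
            return -1;
        for (int i = 0; i < 32; i++)
            printf("f%-2d  = %016llx\n", i, (unsigned long long)regs.fp_r[i]);
        printf("fcsr = %08x\n", regs.fcr31);
        return 0;
    }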

+ 6 - 0
arch/powerpc/include/asm/exception-64e.h

@@ -209,5 +209,11 @@ exc_##label##_book3e:
 	ori	r3,r3,vector_offset@l;		\
 	ori	r3,r3,vector_offset@l;		\
 	mtspr	SPRN_IVOR##vector_number,r3;
 	mtspr	SPRN_IVOR##vector_number,r3;
 
 
+#define RFI_TO_KERNEL							\
+	rfi
+
+#define RFI_TO_USER							\
+	rfi
+
 #endif /* _ASM_POWERPC_EXCEPTION_64E_H */
 #endif /* _ASM_POWERPC_EXCEPTION_64E_H */
 
 

+ 55 - 2
arch/powerpc/include/asm/exception-64s.h

@@ -74,6 +74,59 @@
  */
  */
 #define EX_R3		EX_DAR
 #define EX_R3		EX_DAR
 
 
+/*
+ * Macros for annotating the expected destination of (h)rfid
+ *
+ * The nop instructions allow us to insert one or more instructions to flush the
+ * L1-D cache when returning to userspace or a guest.
+ */
+#define RFI_FLUSH_SLOT							\
+	RFI_FLUSH_FIXUP_SECTION;					\
+	nop;								\
+	nop;								\
+	nop
+
+#define RFI_TO_KERNEL							\
+	rfid
+
+#define RFI_TO_USER							\
+	RFI_FLUSH_SLOT;							\
+	rfid;								\
+	b	rfi_flush_fallback
+
+#define RFI_TO_USER_OR_KERNEL						\
+	RFI_FLUSH_SLOT;							\
+	rfid;								\
+	b	rfi_flush_fallback
+
+#define RFI_TO_GUEST							\
+	RFI_FLUSH_SLOT;							\
+	rfid;								\
+	b	rfi_flush_fallback
+
+#define HRFI_TO_KERNEL							\
+	hrfid
+
+#define HRFI_TO_USER							\
+	RFI_FLUSH_SLOT;							\
+	hrfid;								\
+	b	hrfi_flush_fallback
+
+#define HRFI_TO_USER_OR_KERNEL						\
+	RFI_FLUSH_SLOT;							\
+	hrfid;								\
+	b	hrfi_flush_fallback
+
+#define HRFI_TO_GUEST							\
+	RFI_FLUSH_SLOT;							\
+	hrfid;								\
+	b	hrfi_flush_fallback
+
+#define HRFI_TO_UNKNOWN							\
+	RFI_FLUSH_SLOT;							\
+	hrfid;								\
+	b	hrfi_flush_fallback
+
 #ifdef CONFIG_RELOCATABLE
 #ifdef CONFIG_RELOCATABLE
 #define __EXCEPTION_RELON_PROLOG_PSERIES_1(label, h)			\
 #define __EXCEPTION_RELON_PROLOG_PSERIES_1(label, h)			\
 	mfspr	r11,SPRN_##h##SRR0;	/* save SRR0 */			\
 	mfspr	r11,SPRN_##h##SRR0;	/* save SRR0 */			\
@@ -218,7 +271,7 @@ END_FTR_SECTION_NESTED(ftr,ftr,943)
 	mtspr	SPRN_##h##SRR0,r12;					\
 	mtspr	SPRN_##h##SRR0,r12;					\
 	mfspr	r12,SPRN_##h##SRR1;	/* and SRR1 */			\
 	mfspr	r12,SPRN_##h##SRR1;	/* and SRR1 */			\
 	mtspr	SPRN_##h##SRR1,r10;					\
 	mtspr	SPRN_##h##SRR1,r10;					\
-	h##rfid;							\
+	h##RFI_TO_KERNEL;						\
 	b	.	/* prevent speculative execution */
 	b	.	/* prevent speculative execution */
 #define EXCEPTION_PROLOG_PSERIES_1(label, h)				\
 #define EXCEPTION_PROLOG_PSERIES_1(label, h)				\
 	__EXCEPTION_PROLOG_PSERIES_1(label, h)
 	__EXCEPTION_PROLOG_PSERIES_1(label, h)
@@ -232,7 +285,7 @@ END_FTR_SECTION_NESTED(ftr,ftr,943)
 	mtspr	SPRN_##h##SRR0,r12;					\
 	mtspr	SPRN_##h##SRR0,r12;					\
 	mfspr	r12,SPRN_##h##SRR1;	/* and SRR1 */			\
 	mfspr	r12,SPRN_##h##SRR1;	/* and SRR1 */			\
 	mtspr	SPRN_##h##SRR1,r10;					\
 	mtspr	SPRN_##h##SRR1,r10;					\
-	h##rfid;							\
+	h##RFI_TO_KERNEL;						\
 	b	.	/* prevent speculative execution */
 	b	.	/* prevent speculative execution */
 
 
 #define EXCEPTION_PROLOG_PSERIES_1_NORI(label, h)			\
 #define EXCEPTION_PROLOG_PSERIES_1_NORI(label, h)			\

+ 13 - 0
arch/powerpc/include/asm/feature-fixups.h

@@ -187,7 +187,20 @@ label##3:					       	\
 	FTR_ENTRY_OFFSET label##1b-label##3b;		\
 	FTR_ENTRY_OFFSET label##1b-label##3b;		\
 	.popsection;
 	.popsection;
 
 
+#define RFI_FLUSH_FIXUP_SECTION				\
+951:							\
+	.pushsection __rfi_flush_fixup,"a";		\
+	.align 2;					\
+952:							\
+	FTR_ENTRY_OFFSET 951b-952b;			\
+	.popsection;
+
+
 #ifndef __ASSEMBLY__
 #ifndef __ASSEMBLY__
+#include <linux/types.h>
+
+extern long __start___rfi_flush_fixup, __stop___rfi_flush_fixup;
+
 void apply_feature_fixups(void);
 void apply_feature_fixups(void);
 void setup_feature_keys(void);
 void setup_feature_keys(void);
 #endif
 #endif

+ 17 - 0
arch/powerpc/include/asm/hvcall.h

@@ -241,6 +241,7 @@
 #define H_GET_HCA_INFO          0x1B8
 #define H_GET_HCA_INFO          0x1B8
 #define H_GET_PERF_COUNT        0x1BC
 #define H_GET_PERF_COUNT        0x1BC
 #define H_MANAGE_TRACE          0x1C0
 #define H_MANAGE_TRACE          0x1C0
+#define H_GET_CPU_CHARACTERISTICS 0x1C8
 #define H_FREE_LOGICAL_LAN_BUFFER 0x1D4
 #define H_FREE_LOGICAL_LAN_BUFFER 0x1D4
 #define H_QUERY_INT_STATE       0x1E4
 #define H_QUERY_INT_STATE       0x1E4
 #define H_POLL_PENDING		0x1D8
 #define H_POLL_PENDING		0x1D8
@@ -330,6 +331,17 @@
 #define H_SIGNAL_SYS_RESET_ALL_OTHERS		-2
 #define H_SIGNAL_SYS_RESET_ALL_OTHERS		-2
 /* >= 0 values are CPU number */
 /* >= 0 values are CPU number */
 
 
+/* H_GET_CPU_CHARACTERISTICS return values */
+#define H_CPU_CHAR_SPEC_BAR_ORI31	(1ull << 63) // IBM bit 0
+#define H_CPU_CHAR_BCCTRL_SERIALISED	(1ull << 62) // IBM bit 1
+#define H_CPU_CHAR_L1D_FLUSH_ORI30	(1ull << 61) // IBM bit 2
+#define H_CPU_CHAR_L1D_FLUSH_TRIG2	(1ull << 60) // IBM bit 3
+#define H_CPU_CHAR_L1D_THREAD_PRIV	(1ull << 59) // IBM bit 4
+
+#define H_CPU_BEHAV_FAVOUR_SECURITY	(1ull << 63) // IBM bit 0
+#define H_CPU_BEHAV_L1D_FLUSH_PR	(1ull << 62) // IBM bit 1
+#define H_CPU_BEHAV_BNDS_CHK_SPEC_BAR	(1ull << 61) // IBM bit 2
+
 /* Flag values used in H_REGISTER_PROC_TBL hcall */
 /* Flag values used in H_REGISTER_PROC_TBL hcall */
 #define PROC_TABLE_OP_MASK	0x18
 #define PROC_TABLE_OP_MASK	0x18
 #define PROC_TABLE_DEREG	0x10
 #define PROC_TABLE_DEREG	0x10
@@ -436,6 +448,11 @@ static inline unsigned int get_longbusy_msecs(int longbusy_rc)
 	}
 	}
 }
 }
 
 
+struct h_cpu_char_result {
+	u64 character;
+	u64 behaviour;
+};
+
 #endif /* __ASSEMBLY__ */
 #endif /* __ASSEMBLY__ */
 #endif /* __KERNEL__ */
 #endif /* __KERNEL__ */
 #endif /* _ASM_POWERPC_HVCALL_H */
 #endif /* _ASM_POWERPC_HVCALL_H */

+ 10 - 0
arch/powerpc/include/asm/paca.h

@@ -232,6 +232,16 @@ struct paca_struct {
 	struct sibling_subcore_state *sibling_subcore_state;
 	struct sibling_subcore_state *sibling_subcore_state;
 #endif
 #endif
 #endif
 #endif
+#ifdef CONFIG_PPC_BOOK3S_64
+	/*
+	 * rfi fallback flush must be in its own cacheline to prevent
+	 * other paca data leaking into the L1d
+	 */
+	u64 exrfi[EX_SIZE] __aligned(0x80);
+	void *rfi_flush_fallback_area;
+	u64 l1d_flush_congruence;
+	u64 l1d_flush_sets;
+#endif
 };
 };
 
 
 extern void copy_mm_to_paca(struct mm_struct *mm);
 extern void copy_mm_to_paca(struct mm_struct *mm);

+ 14 - 0
arch/powerpc/include/asm/plpar_wrappers.h

@@ -326,4 +326,18 @@ static inline long plapr_signal_sys_reset(long cpu)
 	return plpar_hcall_norets(H_SIGNAL_SYS_RESET, cpu);
 	return plpar_hcall_norets(H_SIGNAL_SYS_RESET, cpu);
 }
 }
 
 
+static inline long plpar_get_cpu_characteristics(struct h_cpu_char_result *p)
+{
+	unsigned long retbuf[PLPAR_HCALL_BUFSIZE];
+	long rc;
+
+	rc = plpar_hcall(H_GET_CPU_CHARACTERISTICS, retbuf);
+	if (rc == H_SUCCESS) {
+		p->character = retbuf[0];
+		p->behaviour = retbuf[1];
+	}
+
+	return rc;
+}
+
 #endif /* _ASM_POWERPC_PLPAR_WRAPPERS_H */
 #endif /* _ASM_POWERPC_PLPAR_WRAPPERS_H */
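
The commit's file list includes arch/powerpc/platforms/pseries/setup.c, which is not shown in this excerpt. Purely as a hedged sketch, a caller of the wrapper above might combine the H_CPU_CHAR_*/H_CPU_BEHAV_* bits with the setup_rfi_flush() interface added in asm/setup.h roughly like this; the real code may differ in detail.

    #include <linux/types.h>
    #include <asm/hvcall.h>
    #include <asm/plpar_wrappers.h>
    #include <asm/setup.h>

    /* Illustrative only: pick an L1D flush type from the hcall results. */
    static void pseries_pick_rfi_flush(void)
    {
        struct h_cpu_char_result result;
        enum l1d_flush_type types = L1D_FLUSH_FALLBACK;
        bool enable = true;
        long rc;

        rc = plpar_get_cpu_characteristics(&result);
        if (rc == H_SUCCESS) {
            if (result.character & H_CPU_CHAR_L1D_FLUSH_TRIG2)
                types |= L1D_FLUSH_MTTRIG;
            if (result.character & H_CPU_CHAR_L1D_FLUSH_ORI30)
                types |= L1D_FLUSH_ORI;
            /* Only flush on kernel exit if the hypervisor asks for it. */
            if (!(result.behaviour & H_CPU_BEHAV_L1D_FLUSH_PR))
                enable = false;
        }

        setup_rfi_flush(types, enable);
    }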

+ 13 - 0
arch/powerpc/include/asm/setup.h

@@ -39,6 +39,19 @@ static inline void pseries_big_endian_exceptions(void) {}
 static inline void pseries_little_endian_exceptions(void) {}
 static inline void pseries_little_endian_exceptions(void) {}
 #endif /* CONFIG_PPC_PSERIES */
 #endif /* CONFIG_PPC_PSERIES */
 
 
+void rfi_flush_enable(bool enable);
+
+/* These are bit flags */
+enum l1d_flush_type {
+	L1D_FLUSH_NONE		= 0x1,
+	L1D_FLUSH_FALLBACK	= 0x2,
+	L1D_FLUSH_ORI		= 0x4,
+	L1D_FLUSH_MTTRIG	= 0x8,
+};
+
+void __init setup_rfi_flush(enum l1d_flush_type, bool enable);
+void do_rfi_flush_fixups(enum l1d_flush_type types);
+
 #endif /* !__ASSEMBLY__ */
 #endif /* !__ASSEMBLY__ */
 
 
 #endif	/* _ASM_POWERPC_SETUP_H */
 #endif	/* _ASM_POWERPC_SETUP_H */

+ 5 - 0
arch/powerpc/kernel/asm-offsets.c

@@ -237,6 +237,11 @@ int main(void)
 	OFFSET(PACA_NMI_EMERG_SP, paca_struct, nmi_emergency_sp);
 	OFFSET(PACA_NMI_EMERG_SP, paca_struct, nmi_emergency_sp);
 	OFFSET(PACA_IN_MCE, paca_struct, in_mce);
 	OFFSET(PACA_IN_MCE, paca_struct, in_mce);
 	OFFSET(PACA_IN_NMI, paca_struct, in_nmi);
 	OFFSET(PACA_IN_NMI, paca_struct, in_nmi);
+	OFFSET(PACA_RFI_FLUSH_FALLBACK_AREA, paca_struct, rfi_flush_fallback_area);
+	OFFSET(PACA_EXRFI, paca_struct, exrfi);
+	OFFSET(PACA_L1D_FLUSH_CONGRUENCE, paca_struct, l1d_flush_congruence);
+	OFFSET(PACA_L1D_FLUSH_SETS, paca_struct, l1d_flush_sets);
+
 #endif
 #endif
 	OFFSET(PACAHWCPUID, paca_struct, hw_cpu_id);
 	OFFSET(PACAHWCPUID, paca_struct, hw_cpu_id);
 	OFFSET(PACAKEXECSTATE, paca_struct, kexec_state);
 	OFFSET(PACAKEXECSTATE, paca_struct, kexec_state);

+ 36 - 8
arch/powerpc/kernel/entry_64.S

@@ -37,6 +37,11 @@
 #include <asm/tm.h>
 #include <asm/tm.h>
 #include <asm/ppc-opcode.h>
 #include <asm/ppc-opcode.h>
 #include <asm/export.h>
 #include <asm/export.h>
+#ifdef CONFIG_PPC_BOOK3S
+#include <asm/exception-64s.h>
+#else
+#include <asm/exception-64e.h>
+#endif
 
 
 /*
 /*
  * System calls.
  * System calls.
@@ -262,13 +267,23 @@ BEGIN_FTR_SECTION
 END_FTR_SECTION_IFSET(CPU_FTR_HAS_PPR)
 END_FTR_SECTION_IFSET(CPU_FTR_HAS_PPR)
 
 
 	ld	r13,GPR13(r1)	/* only restore r13 if returning to usermode */
 	ld	r13,GPR13(r1)	/* only restore r13 if returning to usermode */
+	ld	r2,GPR2(r1)
+	ld	r1,GPR1(r1)
+	mtlr	r4
+	mtcr	r5
+	mtspr	SPRN_SRR0,r7
+	mtspr	SPRN_SRR1,r8
+	RFI_TO_USER
+	b	.	/* prevent speculative execution */
+
+	/* exit to kernel */
 1:	ld	r2,GPR2(r1)
 1:	ld	r2,GPR2(r1)
 	ld	r1,GPR1(r1)
 	ld	r1,GPR1(r1)
 	mtlr	r4
 	mtlr	r4
 	mtcr	r5
 	mtcr	r5
 	mtspr	SPRN_SRR0,r7
 	mtspr	SPRN_SRR0,r7
 	mtspr	SPRN_SRR1,r8
 	mtspr	SPRN_SRR1,r8
-	RFI
+	RFI_TO_KERNEL
 	b	.	/* prevent speculative execution */
 	b	.	/* prevent speculative execution */
 
 
 .Lsyscall_error:
 .Lsyscall_error:
@@ -397,8 +412,7 @@ END_FTR_SECTION_IFSET(CPU_FTR_HAS_PPR)
 	mtmsrd	r10, 1
 	mtmsrd	r10, 1
 	mtspr	SPRN_SRR0, r11
 	mtspr	SPRN_SRR0, r11
 	mtspr	SPRN_SRR1, r12
 	mtspr	SPRN_SRR1, r12
-
-	rfid
+	RFI_TO_USER
 	b	.	/* prevent speculative execution */
 	b	.	/* prevent speculative execution */
 #endif
 #endif
 _ASM_NOKPROBE_SYMBOL(system_call_common);
 _ASM_NOKPROBE_SYMBOL(system_call_common);
@@ -878,7 +892,7 @@ BEGIN_FTR_SECTION
 END_FTR_SECTION_IFSET(CPU_FTR_HAS_PPR)
 END_FTR_SECTION_IFSET(CPU_FTR_HAS_PPR)
 	ACCOUNT_CPU_USER_EXIT(r13, r2, r4)
 	ACCOUNT_CPU_USER_EXIT(r13, r2, r4)
 	REST_GPR(13, r1)
 	REST_GPR(13, r1)
-1:
+
 	mtspr	SPRN_SRR1,r3
 	mtspr	SPRN_SRR1,r3
 
 
 	ld	r2,_CCR(r1)
 	ld	r2,_CCR(r1)
@@ -891,8 +905,22 @@ END_FTR_SECTION_IFSET(CPU_FTR_HAS_PPR)
 	ld	r3,GPR3(r1)
 	ld	r3,GPR3(r1)
 	ld	r4,GPR4(r1)
 	ld	r4,GPR4(r1)
 	ld	r1,GPR1(r1)
 	ld	r1,GPR1(r1)
+	RFI_TO_USER
+	b	.	/* prevent speculative execution */
 
 
-	rfid
+1:	mtspr	SPRN_SRR1,r3
+
+	ld	r2,_CCR(r1)
+	mtcrf	0xFF,r2
+	ld	r2,_NIP(r1)
+	mtspr	SPRN_SRR0,r2
+
+	ld	r0,GPR0(r1)
+	ld	r2,GPR2(r1)
+	ld	r3,GPR3(r1)
+	ld	r4,GPR4(r1)
+	ld	r1,GPR1(r1)
+	RFI_TO_KERNEL
 	b	.	/* prevent speculative execution */
 	b	.	/* prevent speculative execution */
 
 
 #endif /* CONFIG_PPC_BOOK3E */
 #endif /* CONFIG_PPC_BOOK3E */
@@ -1073,7 +1101,7 @@ __enter_rtas:
 	
 	
 	mtspr	SPRN_SRR0,r5
 	mtspr	SPRN_SRR0,r5
 	mtspr	SPRN_SRR1,r6
 	mtspr	SPRN_SRR1,r6
-	rfid
+	RFI_TO_KERNEL
 	b	.	/* prevent speculative execution */
 	b	.	/* prevent speculative execution */
 
 
 rtas_return_loc:
 rtas_return_loc:
@@ -1098,7 +1126,7 @@ rtas_return_loc:
 
 
 	mtspr	SPRN_SRR0,r3
 	mtspr	SPRN_SRR0,r3
 	mtspr	SPRN_SRR1,r4
 	mtspr	SPRN_SRR1,r4
-	rfid
+	RFI_TO_KERNEL
 	b	.	/* prevent speculative execution */
 	b	.	/* prevent speculative execution */
 _ASM_NOKPROBE_SYMBOL(__enter_rtas)
 _ASM_NOKPROBE_SYMBOL(__enter_rtas)
 _ASM_NOKPROBE_SYMBOL(rtas_return_loc)
 _ASM_NOKPROBE_SYMBOL(rtas_return_loc)
@@ -1171,7 +1199,7 @@ _GLOBAL(enter_prom)
 	LOAD_REG_IMMEDIATE(r12, MSR_SF | MSR_ISF | MSR_LE)
 	LOAD_REG_IMMEDIATE(r12, MSR_SF | MSR_ISF | MSR_LE)
 	andc	r11,r11,r12
 	andc	r11,r11,r12
 	mtsrr1	r11
 	mtsrr1	r11
-	rfid
+	RFI_TO_KERNEL
 #endif /* CONFIG_PPC_BOOK3E */
 #endif /* CONFIG_PPC_BOOK3E */
 
 
 1:	/* Return from OF */
 1:	/* Return from OF */

+ 124 - 13
arch/powerpc/kernel/exceptions-64s.S

@@ -256,7 +256,7 @@ BEGIN_FTR_SECTION
 	LOAD_HANDLER(r12, machine_check_handle_early)
 	LOAD_HANDLER(r12, machine_check_handle_early)
 1:	mtspr	SPRN_SRR0,r12
 1:	mtspr	SPRN_SRR0,r12
 	mtspr	SPRN_SRR1,r11
 	mtspr	SPRN_SRR1,r11
-	rfid
+	RFI_TO_KERNEL
 	b	.	/* prevent speculative execution */
 	b	.	/* prevent speculative execution */
 2:
 2:
 	/* Stack overflow. Stay on emergency stack and panic.
 	/* Stack overflow. Stay on emergency stack and panic.
@@ -445,7 +445,7 @@ EXC_COMMON_BEGIN(machine_check_handle_early)
 	li	r3,MSR_ME
 	li	r3,MSR_ME
 	andc	r10,r10,r3		/* Turn off MSR_ME */
 	andc	r10,r10,r3		/* Turn off MSR_ME */
 	mtspr	SPRN_SRR1,r10
 	mtspr	SPRN_SRR1,r10
-	rfid
+	RFI_TO_KERNEL
 	b	.
 	b	.
 2:
 2:
 	/*
 	/*
@@ -463,7 +463,7 @@ EXC_COMMON_BEGIN(machine_check_handle_early)
 	 */
 	 */
 	bl	machine_check_queue_event
 	bl	machine_check_queue_event
 	MACHINE_CHECK_HANDLER_WINDUP
 	MACHINE_CHECK_HANDLER_WINDUP
-	rfid
+	RFI_TO_USER_OR_KERNEL
 9:
 9:
 	/* Deliver the machine check to host kernel in V mode. */
 	/* Deliver the machine check to host kernel in V mode. */
 	MACHINE_CHECK_HANDLER_WINDUP
 	MACHINE_CHECK_HANDLER_WINDUP
@@ -598,6 +598,9 @@ EXC_COMMON_BEGIN(slb_miss_common)
 	stw	r9,PACA_EXSLB+EX_CCR(r13)	/* save CR in exc. frame */
 	stw	r9,PACA_EXSLB+EX_CCR(r13)	/* save CR in exc. frame */
 	std	r10,PACA_EXSLB+EX_LR(r13)	/* save LR */
 	std	r10,PACA_EXSLB+EX_LR(r13)	/* save LR */
 
 
+	andi.	r9,r11,MSR_PR	// Check for exception from userspace
+	cmpdi	cr4,r9,MSR_PR	// And save the result in CR4 for later
+
 	/*
 	/*
 	 * Test MSR_RI before calling slb_allocate_realmode, because the
 	 * Test MSR_RI before calling slb_allocate_realmode, because the
 	 * MSR in r11 gets clobbered. However we still want to allocate
 	 * MSR in r11 gets clobbered. However we still want to allocate
@@ -624,9 +627,12 @@ END_MMU_FTR_SECTION_IFCLR(MMU_FTR_TYPE_RADIX)
 
 
 	/* All done -- return from exception. */
 	/* All done -- return from exception. */
 
 
+	bne	cr4,1f		/* returning to kernel */
+
 .machine	push
 .machine	push
 .machine	"power4"
 .machine	"power4"
 	mtcrf	0x80,r9
 	mtcrf	0x80,r9
+	mtcrf	0x08,r9		/* MSR[PR] indication is in cr4 */
 	mtcrf	0x04,r9		/* MSR[RI] indication is in cr5 */
 	mtcrf	0x04,r9		/* MSR[RI] indication is in cr5 */
 	mtcrf	0x02,r9		/* I/D indication is in cr6 */
 	mtcrf	0x02,r9		/* I/D indication is in cr6 */
 	mtcrf	0x01,r9		/* slb_allocate uses cr0 and cr7 */
 	mtcrf	0x01,r9		/* slb_allocate uses cr0 and cr7 */
@@ -640,9 +646,30 @@ END_MMU_FTR_SECTION_IFCLR(MMU_FTR_TYPE_RADIX)
 	ld	r11,PACA_EXSLB+EX_R11(r13)
 	ld	r11,PACA_EXSLB+EX_R11(r13)
 	ld	r12,PACA_EXSLB+EX_R12(r13)
 	ld	r12,PACA_EXSLB+EX_R12(r13)
 	ld	r13,PACA_EXSLB+EX_R13(r13)
 	ld	r13,PACA_EXSLB+EX_R13(r13)
-	rfid
+	RFI_TO_USER
+	b	.	/* prevent speculative execution */
+1:
+.machine	push
+.machine	"power4"
+	mtcrf	0x80,r9
+	mtcrf	0x08,r9		/* MSR[PR] indication is in cr4 */
+	mtcrf	0x04,r9		/* MSR[RI] indication is in cr5 */
+	mtcrf	0x02,r9		/* I/D indication is in cr6 */
+	mtcrf	0x01,r9		/* slb_allocate uses cr0 and cr7 */
+.machine	pop
+
+	RESTORE_CTR(r9, PACA_EXSLB)
+	RESTORE_PPR_PACA(PACA_EXSLB, r9)
+	mr	r3,r12
+	ld	r9,PACA_EXSLB+EX_R9(r13)
+	ld	r10,PACA_EXSLB+EX_R10(r13)
+	ld	r11,PACA_EXSLB+EX_R11(r13)
+	ld	r12,PACA_EXSLB+EX_R12(r13)
+	ld	r13,PACA_EXSLB+EX_R13(r13)
+	RFI_TO_KERNEL
 	b	.	/* prevent speculative execution */
 	b	.	/* prevent speculative execution */
 
 
+
 2:	std     r3,PACA_EXSLB+EX_DAR(r13)
 2:	std     r3,PACA_EXSLB+EX_DAR(r13)
 	mr	r3,r12
 	mr	r3,r12
 	mfspr	r11,SPRN_SRR0
 	mfspr	r11,SPRN_SRR0
@@ -651,7 +678,7 @@ END_MMU_FTR_SECTION_IFCLR(MMU_FTR_TYPE_RADIX)
 	mtspr	SPRN_SRR0,r10
 	mtspr	SPRN_SRR0,r10
 	ld	r10,PACAKMSR(r13)
 	ld	r10,PACAKMSR(r13)
 	mtspr	SPRN_SRR1,r10
 	mtspr	SPRN_SRR1,r10
-	rfid
+	RFI_TO_KERNEL
 	b	.
 	b	.
 
 
 8:	std     r3,PACA_EXSLB+EX_DAR(r13)
 8:	std     r3,PACA_EXSLB+EX_DAR(r13)
@@ -662,7 +689,7 @@ END_MMU_FTR_SECTION_IFCLR(MMU_FTR_TYPE_RADIX)
 	mtspr	SPRN_SRR0,r10
 	mtspr	SPRN_SRR0,r10
 	ld	r10,PACAKMSR(r13)
 	ld	r10,PACAKMSR(r13)
 	mtspr	SPRN_SRR1,r10
 	mtspr	SPRN_SRR1,r10
-	rfid
+	RFI_TO_KERNEL
 	b	.
 	b	.
 
 
 EXC_COMMON_BEGIN(unrecov_slb)
 EXC_COMMON_BEGIN(unrecov_slb)
@@ -901,7 +928,7 @@ EXC_COMMON(trap_0b_common, 0xb00, unknown_exception)
 	mtspr	SPRN_SRR0,r10 ; 				\
 	mtspr	SPRN_SRR0,r10 ; 				\
 	ld	r10,PACAKMSR(r13) ;				\
 	ld	r10,PACAKMSR(r13) ;				\
 	mtspr	SPRN_SRR1,r10 ; 				\
 	mtspr	SPRN_SRR1,r10 ; 				\
-	rfid ; 							\
+	RFI_TO_KERNEL ;						\
 	b	. ;	/* prevent speculative execution */
 	b	. ;	/* prevent speculative execution */
 
 
 #ifdef CONFIG_PPC_FAST_ENDIAN_SWITCH
 #ifdef CONFIG_PPC_FAST_ENDIAN_SWITCH
@@ -917,7 +944,7 @@ END_FTR_SECTION_IFSET(CPU_FTR_REAL_LE)				\
 	xori	r12,r12,MSR_LE ;				\
 	xori	r12,r12,MSR_LE ;				\
 	mtspr	SPRN_SRR1,r12 ;					\
 	mtspr	SPRN_SRR1,r12 ;					\
 	mr	r13,r9 ;					\
 	mr	r13,r9 ;					\
-	rfid ;		/* return to userspace */		\
+	RFI_TO_USER ;	/* return to userspace */		\
 	b	. ;	/* prevent speculative execution */
 	b	. ;	/* prevent speculative execution */
 #else
 #else
 #define SYSCALL_FASTENDIAN_TEST
 #define SYSCALL_FASTENDIAN_TEST
@@ -1063,7 +1090,7 @@ TRAMP_REAL_BEGIN(hmi_exception_early)
 	mtcr	r11
 	mtcr	r11
 	REST_GPR(11, r1)
 	REST_GPR(11, r1)
 	ld	r1,GPR1(r1)
 	ld	r1,GPR1(r1)
-	hrfid
+	HRFI_TO_USER_OR_KERNEL
 
 
 1:	mtcr	r11
 1:	mtcr	r11
 	REST_GPR(11, r1)
 	REST_GPR(11, r1)
@@ -1314,7 +1341,7 @@ END_FTR_SECTION_IFSET(CPU_FTR_CFAR)
 	ld	r11,PACA_EXGEN+EX_R11(r13)
 	ld	r11,PACA_EXGEN+EX_R11(r13)
 	ld	r12,PACA_EXGEN+EX_R12(r13)
 	ld	r12,PACA_EXGEN+EX_R12(r13)
 	ld	r13,PACA_EXGEN+EX_R13(r13)
 	ld	r13,PACA_EXGEN+EX_R13(r13)
-	HRFID
+	HRFI_TO_UNKNOWN
 	b	.
 	b	.
 #endif
 #endif
 
 
@@ -1418,10 +1445,94 @@ masked_##_H##interrupt:					\
 	ld	r10,PACA_EXGEN+EX_R10(r13);		\
 	ld	r10,PACA_EXGEN+EX_R10(r13);		\
 	ld	r11,PACA_EXGEN+EX_R11(r13);		\
 	ld	r11,PACA_EXGEN+EX_R11(r13);		\
 	/* returns to kernel where r13 must be set up, so don't restore it */ \
 	/* returns to kernel where r13 must be set up, so don't restore it */ \
-	##_H##rfid;					\
+	##_H##RFI_TO_KERNEL;				\
 	b	.;					\
 	b	.;					\
 	MASKED_DEC_HANDLER(_H)
 	MASKED_DEC_HANDLER(_H)
 
 
+TRAMP_REAL_BEGIN(rfi_flush_fallback)
+	SET_SCRATCH0(r13);
+	GET_PACA(r13);
+	std	r9,PACA_EXRFI+EX_R9(r13)
+	std	r10,PACA_EXRFI+EX_R10(r13)
+	std	r11,PACA_EXRFI+EX_R11(r13)
+	std	r12,PACA_EXRFI+EX_R12(r13)
+	std	r8,PACA_EXRFI+EX_R13(r13)
+	mfctr	r9
+	ld	r10,PACA_RFI_FLUSH_FALLBACK_AREA(r13)
+	ld	r11,PACA_L1D_FLUSH_SETS(r13)
+	ld	r12,PACA_L1D_FLUSH_CONGRUENCE(r13)
+	/*
+	 * The load addresses are at staggered offsets within cachelines,
+	 * which suits some pipelines better (on others it should not
+	 * hurt).
+	 */
+	addi	r12,r12,8
+	mtctr	r11
+	DCBT_STOP_ALL_STREAM_IDS(r11) /* Stop prefetch streams */
+
+	/* order ld/st prior to dcbt stop all streams with flushing */
+	sync
+1:	li	r8,0
+	.rept	8 /* 8-way set associative */
+	ldx	r11,r10,r8
+	add	r8,r8,r12
+	xor	r11,r11,r11	// Ensure r11 is 0 even if the fallback area is not zeroed
+	add	r8,r8,r11	// Add 0, this creates a dependency on the ldx
+	.endr
+	addi	r10,r10,128 /* 128 byte cache line */
+	bdnz	1b
+
+	mtctr	r9
+	ld	r9,PACA_EXRFI+EX_R9(r13)
+	ld	r10,PACA_EXRFI+EX_R10(r13)
+	ld	r11,PACA_EXRFI+EX_R11(r13)
+	ld	r12,PACA_EXRFI+EX_R12(r13)
+	ld	r8,PACA_EXRFI+EX_R13(r13)
+	GET_SCRATCH0(r13);
+	rfid
+
+TRAMP_REAL_BEGIN(hrfi_flush_fallback)
+	SET_SCRATCH0(r13);
+	GET_PACA(r13);
+	std	r9,PACA_EXRFI+EX_R9(r13)
+	std	r10,PACA_EXRFI+EX_R10(r13)
+	std	r11,PACA_EXRFI+EX_R11(r13)
+	std	r12,PACA_EXRFI+EX_R12(r13)
+	std	r8,PACA_EXRFI+EX_R13(r13)
+	mfctr	r9
+	ld	r10,PACA_RFI_FLUSH_FALLBACK_AREA(r13)
+	ld	r11,PACA_L1D_FLUSH_SETS(r13)
+	ld	r12,PACA_L1D_FLUSH_CONGRUENCE(r13)
+	/*
+	 * The load addresses are at staggered offsets within cachelines,
+	 * which suits some pipelines better (on others it should not
+	 * hurt).
+	 */
+	addi	r12,r12,8
+	mtctr	r11
+	DCBT_STOP_ALL_STREAM_IDS(r11) /* Stop prefetch streams */
+
+	/* order ld/st prior to dcbt stop all streams with flushing */
+	sync
+1:	li	r8,0
+	.rept	8 /* 8-way set associative */
+	ldx	r11,r10,r8
+	add	r8,r8,r12
+	xor	r11,r11,r11	// Ensure r11 is 0 even if the fallback area is not zeroed
+	add	r8,r8,r11	// Add 0, this creates a dependency on the ldx
+	.endr
+	addi	r10,r10,128 /* 128 byte cache line */
+	bdnz	1b
+
+	mtctr	r9
+	ld	r9,PACA_EXRFI+EX_R9(r13)
+	ld	r10,PACA_EXRFI+EX_R10(r13)
+	ld	r11,PACA_EXRFI+EX_R11(r13)
+	ld	r12,PACA_EXRFI+EX_R12(r13)
+	ld	r8,PACA_EXRFI+EX_R13(r13)
+	GET_SCRATCH0(r13);
+	hrfid
+
 /*
 /*
  * Real mode exceptions actually use this too, but alternate
  * Real mode exceptions actually use this too, but alternate
  * instruction code patches (which end up in the common .text area)
  * instruction code patches (which end up in the common .text area)
@@ -1441,7 +1552,7 @@ TRAMP_REAL_BEGIN(kvmppc_skip_interrupt)
 	addi	r13, r13, 4
 	addi	r13, r13, 4
 	mtspr	SPRN_SRR0, r13
 	mtspr	SPRN_SRR0, r13
 	GET_SCRATCH0(r13)
 	GET_SCRATCH0(r13)
-	rfid
+	RFI_TO_KERNEL
 	b	.
 	b	.
 
 
 TRAMP_REAL_BEGIN(kvmppc_skip_Hinterrupt)
 TRAMP_REAL_BEGIN(kvmppc_skip_Hinterrupt)
@@ -1453,7 +1564,7 @@ TRAMP_REAL_BEGIN(kvmppc_skip_Hinterrupt)
 	addi	r13, r13, 4
 	addi	r13, r13, 4
 	mtspr	SPRN_HSRR0, r13
 	mtspr	SPRN_HSRR0, r13
 	GET_SCRATCH0(r13)
 	GET_SCRATCH0(r13)
-	hrfid
+	HRFI_TO_KERNEL
 	b	.
 	b	.
 #endif
 #endif
 
 

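As an aside before the next file: the two fallback trampolines above perform a displacement flush, loading one line from every set and way of a private buffer so the whole L1d ends up holding harmless data. A minimal user-space model of that loop, under the 8-way associativity and 128-byte line assumptions stated in the code (illustrative only, not kernel code):

#include <stdint.h>

/* nsets and congruence correspond to the PACA_L1D_FLUSH_SETS and
 * PACA_L1D_FLUSH_CONGRUENCE values loaded by the trampolines above;
 * the extra 8 bytes per way reproduce the staggered offsets. */
void displacement_flush(const char *fallback_area, unsigned long nsets,
			unsigned long congruence)
{
	volatile uint64_t sink = 0;
	unsigned long set, way;

	for (set = 0; set < nsets; set++) {
		for (way = 0; way < 8; way++)	/* 8-way set associative */
			sink += *(const uint64_t *)(fallback_area +
					set * 128 + way * (congruence + 8));
	}
	(void)sink;	/* keep the loads from being optimised away */
}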
+ 101 - 0
arch/powerpc/kernel/setup_64.c

@@ -801,3 +801,104 @@ static int __init disable_hardlockup_detector(void)
 	return 0;
 	return 0;
 }
 }
 early_initcall(disable_hardlockup_detector);
 early_initcall(disable_hardlockup_detector);
+
+#ifdef CONFIG_PPC_BOOK3S_64
+static enum l1d_flush_type enabled_flush_types;
+static void *l1d_flush_fallback_area;
+static bool no_rfi_flush;
+bool rfi_flush;
+
+static int __init handle_no_rfi_flush(char *p)
+{
+	pr_info("rfi-flush: disabled on command line.");
+	no_rfi_flush = true;
+	return 0;
+}
+early_param("no_rfi_flush", handle_no_rfi_flush);
+
+/*
+ * The RFI flush is not KPTI, but because users will see documentation that says to use
+ * nopti we hijack that option here to also disable the RFI flush.
+ */
+static int __init handle_no_pti(char *p)
+{
+	pr_info("rfi-flush: disabling due to 'nopti' on command line.\n");
+	handle_no_rfi_flush(NULL);
+	return 0;
+}
+early_param("nopti", handle_no_pti);
+
+static void do_nothing(void *unused)
+{
+	/*
+	 * We don't need to do the flush explicitly; just entering and exiting
+	 * the kernel is sufficient, as the RFI exit handlers will do the right thing.
+	 */
+}
+
+void rfi_flush_enable(bool enable)
+{
+	if (rfi_flush == enable)
+		return;
+
+	if (enable) {
+		do_rfi_flush_fixups(enabled_flush_types);
+		on_each_cpu(do_nothing, NULL, 1);
+	} else
+		do_rfi_flush_fixups(L1D_FLUSH_NONE);
+
+	rfi_flush = enable;
+}
+
+static void init_fallback_flush(void)
+{
+	u64 l1d_size, limit;
+	int cpu;
+
+	l1d_size = ppc64_caches.l1d.size;
+	limit = min(safe_stack_limit(), ppc64_rma_size);
+
+	/*
+	 * Align to L1d size, and size it at 2x L1d size, to catch possible
+	 * hardware prefetch runoff. We don't have a recipe for load patterns to
+	 * reliably avoid the prefetcher.
+	 */
+	l1d_flush_fallback_area = __va(memblock_alloc_base(l1d_size * 2, l1d_size, limit));
+	memset(l1d_flush_fallback_area, 0, l1d_size * 2);
+
+	for_each_possible_cpu(cpu) {
+		/*
+		 * The fallback flush is currently coded for 8-way
+		 * associativity. Different associativity is possible, but it
+		 * will be treated as 8-way and may not evict the lines as
+		 * effectively.
+		 *
+		 * 128 byte lines are mandatory.
+		 */
+		u64 c = l1d_size / 8;
+
+		paca[cpu].rfi_flush_fallback_area = l1d_flush_fallback_area;
+		paca[cpu].l1d_flush_congruence = c;
+		paca[cpu].l1d_flush_sets = c / 128;
+	}
+}
+
+void __init setup_rfi_flush(enum l1d_flush_type types, bool enable)
+{
+	if (types & L1D_FLUSH_FALLBACK) {
+		pr_info("rfi-flush: Using fallback displacement flush\n");
+		init_fallback_flush();
+	}
+
+	if (types & L1D_FLUSH_ORI)
+		pr_info("rfi-flush: Using ori type flush\n");
+
+	if (types & L1D_FLUSH_MTTRIG)
+		pr_info("rfi-flush: Using mttrig type flush\n");
+
+	enabled_flush_types = types;
+
+	if (!no_rfi_flush)
+		rfi_flush_enable(enable);
+}
+#endif /* CONFIG_PPC_BOOK3S_64 */

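To make the geometry computed in init_fallback_flush() concrete, here is a small stand-alone sketch of the same arithmetic; the 32 KB L1d size is just an assumed example value, not something the patch specifies.

#include <stdio.h>

int main(void)
{
	unsigned long l1d_size = 32 * 1024;		/* assumed example: 32KB L1d */
	unsigned long congruence = l1d_size / 8;	/* bytes per way, 8-way assumed */
	unsigned long sets = congruence / 128;		/* 128-byte lines are mandatory */

	/* The fallback area itself is sized at 2x L1d to absorb prefetch runoff */
	printf("fallback area %lu bytes, congruence %lu, sets %lu\n",
	       l1d_size * 2, congruence, sets);
	return 0;
}

For that example the per-CPU values come out to a 4096-byte congruence and 32 sets, which is what the fallback loop iterates over.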
+ 9 - 0
arch/powerpc/kernel/vmlinux.lds.S

@@ -132,6 +132,15 @@ SECTIONS
 	/* Read-only data */
 	/* Read-only data */
 	RO_DATA(PAGE_SIZE)
 	RO_DATA(PAGE_SIZE)
 
 
+#ifdef CONFIG_PPC64
+	. = ALIGN(8);
+	__rfi_flush_fixup : AT(ADDR(__rfi_flush_fixup) - LOAD_OFFSET) {
+		__start___rfi_flush_fixup = .;
+		*(__rfi_flush_fixup)
+		__stop___rfi_flush_fixup = .;
+	}
+#endif
+
 	EXCEPTION_TABLE(0)
 	EXCEPTION_TABLE(0)
 
 
 	NOTES :kernel :notes
 	NOTES :kernel :notes

+ 1 - 0
arch/powerpc/kvm/book3s_64_mmu.c

@@ -235,6 +235,7 @@ static int kvmppc_mmu_book3s_64_xlate(struct kvm_vcpu *vcpu, gva_t eaddr,
 		gpte->may_read = true;
 		gpte->may_read = true;
 		gpte->may_write = true;
 		gpte->may_write = true;
 		gpte->page_size = MMU_PAGE_4K;
 		gpte->page_size = MMU_PAGE_4K;
+		gpte->wimg = HPTE_R_M;
 
 
 		return 0;
 		return 0;
 	}
 	}

+ 61 - 29
arch/powerpc/kvm/book3s_64_mmu_hv.c

@@ -65,11 +65,17 @@ struct kvm_resize_hpt {
 	u32 order;
 	u32 order;
 
 
 	/* These fields protected by kvm->lock */
 	/* These fields protected by kvm->lock */
+
+	/* Possible values and their usage:
+	 *  <0     an error occurred during allocation,
+	 *  -EBUSY allocation is in progress,
+	 *  0      allocation made successfully.
+	 */
 	int error;
 	int error;
-	bool prepare_done;
 
 
-	/* Private to the work thread, until prepare_done is true,
-	 * then protected by kvm->resize_hpt_sem */
+	/* Private to the work thread, until error != -EBUSY,
+	 * then protected by kvm->lock.
+	 */
 	struct kvm_hpt_info hpt;
 	struct kvm_hpt_info hpt;
 };
 };
 
 
@@ -159,8 +165,6 @@ long kvmppc_alloc_reset_hpt(struct kvm *kvm, int order)
 		 * Reset all the reverse-mapping chains for all memslots
 		 * Reset all the reverse-mapping chains for all memslots
 		 */
 		 */
 		kvmppc_rmap_reset(kvm);
 		kvmppc_rmap_reset(kvm);
-		/* Ensure that each vcpu will flush its TLB on next entry. */
-		cpumask_setall(&kvm->arch.need_tlb_flush);
 		err = 0;
 		err = 0;
 		goto out;
 		goto out;
 	}
 	}
@@ -176,6 +180,10 @@ long kvmppc_alloc_reset_hpt(struct kvm *kvm, int order)
 	kvmppc_set_hpt(kvm, &info);
 	kvmppc_set_hpt(kvm, &info);
 
 
 out:
 out:
+	if (err == 0)
+		/* Ensure that each vcpu will flush its TLB on next entry. */
+		cpumask_setall(&kvm->arch.need_tlb_flush);
+
 	mutex_unlock(&kvm->lock);
 	mutex_unlock(&kvm->lock);
 	return err;
 	return err;
 }
 }
@@ -1413,16 +1421,20 @@ static void resize_hpt_pivot(struct kvm_resize_hpt *resize)
 
 
 static void resize_hpt_release(struct kvm *kvm, struct kvm_resize_hpt *resize)
 static void resize_hpt_release(struct kvm *kvm, struct kvm_resize_hpt *resize)
 {
 {
-	BUG_ON(kvm->arch.resize_hpt != resize);
+	if (WARN_ON(!mutex_is_locked(&kvm->lock)))
+		return;
 
 
 	if (!resize)
 	if (!resize)
 		return;
 		return;
 
 
-	if (resize->hpt.virt)
-		kvmppc_free_hpt(&resize->hpt);
+	if (resize->error != -EBUSY) {
+		if (resize->hpt.virt)
+			kvmppc_free_hpt(&resize->hpt);
+		kfree(resize);
+	}
 
 
-	kvm->arch.resize_hpt = NULL;
-	kfree(resize);
+	if (kvm->arch.resize_hpt == resize)
+		kvm->arch.resize_hpt = NULL;
 }
 }
 
 
 static void resize_hpt_prepare_work(struct work_struct *work)
 static void resize_hpt_prepare_work(struct work_struct *work)
@@ -1431,17 +1443,41 @@ static void resize_hpt_prepare_work(struct work_struct *work)
 						     struct kvm_resize_hpt,
 						     struct kvm_resize_hpt,
 						     work);
 						     work);
 	struct kvm *kvm = resize->kvm;
 	struct kvm *kvm = resize->kvm;
-	int err;
+	int err = 0;
 
 
-	resize_hpt_debug(resize, "resize_hpt_prepare_work(): order = %d\n",
-			 resize->order);
-
-	err = resize_hpt_allocate(resize);
+	if (WARN_ON(resize->error != -EBUSY))
+		return;
 
 
 	mutex_lock(&kvm->lock);
 	mutex_lock(&kvm->lock);
 
 
+	/* Request is still current? */
+	if (kvm->arch.resize_hpt == resize) {
+		/* We may request large allocations here:
+		 * do not sleep with kvm->lock held for a while.
+		 */
+		mutex_unlock(&kvm->lock);
+
+		resize_hpt_debug(resize, "resize_hpt_prepare_work(): order = %d\n",
+				 resize->order);
+
+		err = resize_hpt_allocate(resize);
+
+		/* We make a strict assumption about -EBUSY
+		 * when preparing for HPT resize.
+		 */
+		if (WARN_ON(err == -EBUSY))
+			err = -EINPROGRESS;
+
+		mutex_lock(&kvm->lock);
+		/* It is possible that kvm->arch.resize_hpt != resize
+		 * after we grab kvm->lock again.
+		 */
+	}
+
 	resize->error = err;
 	resize->error = err;
-	resize->prepare_done = true;
+
+	if (kvm->arch.resize_hpt != resize)
+		resize_hpt_release(kvm, resize);
 
 
 	mutex_unlock(&kvm->lock);
 	mutex_unlock(&kvm->lock);
 }
 }
@@ -1466,14 +1502,12 @@ long kvm_vm_ioctl_resize_hpt_prepare(struct kvm *kvm,
 
 
 	if (resize) {
 	if (resize) {
 		if (resize->order == shift) {
 		if (resize->order == shift) {
-			/* Suitable resize in progress */
-			if (resize->prepare_done) {
-				ret = resize->error;
-				if (ret != 0)
-					resize_hpt_release(kvm, resize);
-			} else {
+			/* Suitable resize in progress? */
+			ret = resize->error;
+			if (ret == -EBUSY)
 				ret = 100; /* estimated time in ms */
 				ret = 100; /* estimated time in ms */
-			}
+			else if (ret)
+				resize_hpt_release(kvm, resize);
 
 
 			goto out;
 			goto out;
 		}
 		}
@@ -1493,6 +1527,8 @@ long kvm_vm_ioctl_resize_hpt_prepare(struct kvm *kvm,
 		ret = -ENOMEM;
 		ret = -ENOMEM;
 		goto out;
 		goto out;
 	}
 	}
+
+	resize->error = -EBUSY;
 	resize->order = shift;
 	resize->order = shift;
 	resize->kvm = kvm;
 	resize->kvm = kvm;
 	INIT_WORK(&resize->work, resize_hpt_prepare_work);
 	INIT_WORK(&resize->work, resize_hpt_prepare_work);
@@ -1547,16 +1583,12 @@ long kvm_vm_ioctl_resize_hpt_commit(struct kvm *kvm,
 	if (!resize || (resize->order != shift))
 	if (!resize || (resize->order != shift))
 		goto out;
 		goto out;
 
 
-	ret = -EBUSY;
-	if (!resize->prepare_done)
-		goto out;
-
 	ret = resize->error;
 	ret = resize->error;
-	if (ret != 0)
+	if (ret)
 		goto out;
 		goto out;
 
 
 	ret = resize_hpt_rehash(resize);
 	ret = resize_hpt_rehash(resize);
-	if (ret != 0)
+	if (ret)
 		goto out;
 		goto out;
 
 
 	resize_hpt_pivot(resize);
 	resize_hpt_pivot(resize);

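For context on the -EBUSY convention introduced above, this is a rough sketch of how user space (a VMM such as QEMU) is expected to drive the resize: KVM_PPC_RESIZE_HPT_PREPARE returns a positive estimate in milliseconds while the allocation is still running, 0 once it is ready, and a negative errno on failure, after which KVM_PPC_RESIZE_HPT_COMMIT rehashes and pivots to the new HPT. The struct usage and error handling here are simplified assumptions, not taken from this patch.

#include <sys/ioctl.h>
#include <unistd.h>
#include <linux/kvm.h>

static long resize_hpt(int vm_fd, unsigned int shift)
{
	struct kvm_ppc_resize_hpt rhpt = { .flags = 0, .shift = shift };
	long ret;

	do {
		/* >0 means "still allocating, retry in roughly that many ms" */
		ret = ioctl(vm_fd, KVM_PPC_RESIZE_HPT_PREPARE, &rhpt);
		if (ret > 0)
			usleep(ret * 1000);
	} while (ret > 0);

	if (ret < 0)
		return ret;

	/* Allocation complete: rehash and pivot to the new HPT */
	return ioctl(vm_fd, KVM_PPC_RESIZE_HPT_COMMIT, &rhpt);
}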
+ 4 - 5
arch/powerpc/kvm/book3s_hv_rmhandlers.S

@@ -79,7 +79,7 @@ _GLOBAL_TOC(kvmppc_hv_entry_trampoline)
 	mtmsrd	r0,1		/* clear RI in MSR */
 	mtmsrd	r0,1		/* clear RI in MSR */
 	mtsrr0	r5
 	mtsrr0	r5
 	mtsrr1	r6
 	mtsrr1	r6
-	RFI
+	RFI_TO_KERNEL
 
 
 kvmppc_call_hv_entry:
 kvmppc_call_hv_entry:
 BEGIN_FTR_SECTION
 BEGIN_FTR_SECTION
@@ -199,7 +199,7 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
 	mtmsrd	r6, 1			/* Clear RI in MSR */
 	mtmsrd	r6, 1			/* Clear RI in MSR */
 	mtsrr0	r8
 	mtsrr0	r8
 	mtsrr1	r7
 	mtsrr1	r7
-	RFI
+	RFI_TO_KERNEL
 
 
 	/* Virtual-mode return */
 	/* Virtual-mode return */
 .Lvirt_return:
 .Lvirt_return:
@@ -1167,8 +1167,7 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_300)
 
 
 	ld	r0, VCPU_GPR(R0)(r4)
 	ld	r0, VCPU_GPR(R0)(r4)
 	ld	r4, VCPU_GPR(R4)(r4)
 	ld	r4, VCPU_GPR(R4)(r4)
-
-	hrfid
+	HRFI_TO_GUEST
 	b	.
 	b	.
 
 
 secondary_too_late:
 secondary_too_late:
@@ -3320,7 +3319,7 @@ END_MMU_FTR_SECTION_IFSET(MMU_FTR_TYPE_RADIX)
 	ld	r4, PACAKMSR(r13)
 	ld	r4, PACAKMSR(r13)
 	mtspr	SPRN_SRR0, r3
 	mtspr	SPRN_SRR0, r3
 	mtspr	SPRN_SRR1, r4
 	mtspr	SPRN_SRR1, r4
-	rfid
+	RFI_TO_KERNEL
 9:	addi	r3, r1, STACK_FRAME_OVERHEAD
 9:	addi	r3, r1, STACK_FRAME_OVERHEAD
 	bl	kvmppc_bad_interrupt
 	bl	kvmppc_bad_interrupt
 	b	9b
 	b	9b

+ 2 - 0
arch/powerpc/kvm/book3s_pr.c

@@ -60,6 +60,7 @@ static void kvmppc_giveup_fac(struct kvm_vcpu *vcpu, ulong fac);
 #define MSR_USER32 MSR_USER
 #define MSR_USER32 MSR_USER
 #define MSR_USER64 MSR_USER
 #define MSR_USER64 MSR_USER
 #define HW_PAGE_SIZE PAGE_SIZE
 #define HW_PAGE_SIZE PAGE_SIZE
+#define HPTE_R_M   _PAGE_COHERENT
 #endif
 #endif
 
 
 static bool kvmppc_is_split_real(struct kvm_vcpu *vcpu)
 static bool kvmppc_is_split_real(struct kvm_vcpu *vcpu)
@@ -557,6 +558,7 @@ int kvmppc_handle_pagefault(struct kvm_run *run, struct kvm_vcpu *vcpu,
 		pte.eaddr = eaddr;
 		pte.eaddr = eaddr;
 		pte.vpage = eaddr >> 12;
 		pte.vpage = eaddr >> 12;
 		pte.page_size = MMU_PAGE_64K;
 		pte.page_size = MMU_PAGE_64K;
+		pte.wimg = HPTE_R_M;
 	}
 	}
 
 
 	switch (kvmppc_get_msr(vcpu) & (MSR_DR|MSR_IR)) {
 	switch (kvmppc_get_msr(vcpu) & (MSR_DR|MSR_IR)) {

+ 5 - 2
arch/powerpc/kvm/book3s_rmhandlers.S

@@ -46,6 +46,9 @@
 
 
 #define FUNC(name)		name
 #define FUNC(name)		name
 
 
+#define RFI_TO_KERNEL	RFI
+#define RFI_TO_GUEST	RFI
+
 .macro INTERRUPT_TRAMPOLINE intno
 .macro INTERRUPT_TRAMPOLINE intno
 
 
 .global kvmppc_trampoline_\intno
 .global kvmppc_trampoline_\intno
@@ -141,7 +144,7 @@ kvmppc_handler_skip_ins:
 	GET_SCRATCH0(r13)
 	GET_SCRATCH0(r13)
 
 
 	/* And get back into the code */
 	/* And get back into the code */
-	RFI
+	RFI_TO_KERNEL
 #endif
 #endif
 
 
 /*
 /*
@@ -164,6 +167,6 @@ _GLOBAL_TOC(kvmppc_entry_trampoline)
 	ori	r5, r5, MSR_EE
 	ori	r5, r5, MSR_EE
 	mtsrr0	r7
 	mtsrr0	r7
 	mtsrr1	r6
 	mtsrr1	r6
-	RFI
+	RFI_TO_KERNEL
 
 
 #include "book3s_segment.S"
 #include "book3s_segment.S"

+ 2 - 2
arch/powerpc/kvm/book3s_segment.S

@@ -156,7 +156,7 @@ no_dcbz32_on:
 	PPC_LL	r9, SVCPU_R9(r3)
 	PPC_LL	r9, SVCPU_R9(r3)
 	PPC_LL	r3, (SVCPU_R3)(r3)
 	PPC_LL	r3, (SVCPU_R3)(r3)
 
 
-	RFI
+	RFI_TO_GUEST
 kvmppc_handler_trampoline_enter_end:
 kvmppc_handler_trampoline_enter_end:
 
 
 
 
@@ -407,5 +407,5 @@ END_FTR_SECTION_IFSET(CPU_FTR_HVMODE)
 	cmpwi	r12, BOOK3S_INTERRUPT_DOORBELL
 	cmpwi	r12, BOOK3S_INTERRUPT_DOORBELL
 	beqa	BOOK3S_INTERRUPT_DOORBELL
 	beqa	BOOK3S_INTERRUPT_DOORBELL
 
 
-	RFI
+	RFI_TO_KERNEL
 kvmppc_handler_trampoline_exit_end:
 kvmppc_handler_trampoline_exit_end:

+ 41 - 0
arch/powerpc/lib/feature-fixups.c

@@ -116,6 +116,47 @@ void do_feature_fixups(unsigned long value, void *fixup_start, void *fixup_end)
 	}
 	}
 }
 }
 
 
+#ifdef CONFIG_PPC_BOOK3S_64
+void do_rfi_flush_fixups(enum l1d_flush_type types)
+{
+	unsigned int instrs[3], *dest;
+	long *start, *end;
+	int i;
+
+	start = PTRRELOC(&__start___rfi_flush_fixup),
+	end = PTRRELOC(&__stop___rfi_flush_fixup);
+
+	instrs[0] = 0x60000000; /* nop */
+	instrs[1] = 0x60000000; /* nop */
+	instrs[2] = 0x60000000; /* nop */
+
+	if (types & L1D_FLUSH_FALLBACK)
+		/* b .+16 to fallback flush */
+		instrs[0] = 0x48000010;
+
+	i = 0;
+	if (types & L1D_FLUSH_ORI) {
+		instrs[i++] = 0x63ff0000; /* ori 31,31,0 speculation barrier */
+		instrs[i++] = 0x63de0000; /* ori 30,30,0 L1d flush */
+	}
+
+	if (types & L1D_FLUSH_MTTRIG)
+		instrs[i++] = 0x7c12dba6; /* mtspr TRIG2,r0 (SPR #882) */
+
+	for (i = 0; start < end; start++, i++) {
+		dest = (void *)start + *start;
+
+		pr_devel("patching dest %lx\n", (unsigned long)dest);
+
+		patch_instruction(dest, instrs[0]);
+		patch_instruction(dest + 1, instrs[1]);
+		patch_instruction(dest + 2, instrs[2]);
+	}
+
+	printk(KERN_DEBUG "rfi-flush: patched %d locations\n", i);
+}
+#endif /* CONFIG_PPC_BOOK3S_64 */
+
 void do_lwsync_fixups(unsigned long value, void *fixup_start, void *fixup_end)
 void do_lwsync_fixups(unsigned long value, void *fixup_start, void *fixup_end)
 {
 {
 	long *start, *end;
 	long *start, *end;

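The dest = (void *)start + *start computation above relies on the fixup section storing self-relative offsets: each entry holds the distance from the entry itself to the patch site. A tiny user-space sketch of the same addressing scheme (all names here are illustrative, not from the patch):

#include <stdio.h>
#include <stdint.h>

static unsigned int patch_site[3];	/* stands in for the three patched slots */
static long fixup_entry;		/* stands in for one __rfi_flush_fixup entry */

int main(void)
{
	/* Build the entry the way the linker section would: a self-relative offset */
	fixup_entry = (long)((uintptr_t)patch_site - (uintptr_t)&fixup_entry);

	/* Recover the patch site the same way do_rfi_flush_fixups() does */
	unsigned int *dest = (unsigned int *)((char *)&fixup_entry + fixup_entry);

	printf("recovered %p, expected %p\n", (void *)dest, (void *)patch_site);
	return 0;
}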
+ 49 - 0
arch/powerpc/platforms/powernv/setup.c

@@ -37,13 +37,62 @@
 #include <asm/kexec.h>
 #include <asm/kexec.h>
 #include <asm/smp.h>
 #include <asm/smp.h>
 #include <asm/tm.h>
 #include <asm/tm.h>
+#include <asm/setup.h>
 
 
 #include "powernv.h"
 #include "powernv.h"
 
 
+static void pnv_setup_rfi_flush(void)
+{
+	struct device_node *np, *fw_features;
+	enum l1d_flush_type type;
+	int enable;
+
+	/* Default to fallback in case fw-features are not available */
+	type = L1D_FLUSH_FALLBACK;
+	enable = 1;
+
+	np = of_find_node_by_name(NULL, "ibm,opal");
+	fw_features = of_get_child_by_name(np, "fw-features");
+	of_node_put(np);
+
+	if (fw_features) {
+		np = of_get_child_by_name(fw_features, "inst-l1d-flush-trig2");
+		if (np && of_property_read_bool(np, "enabled"))
+			type = L1D_FLUSH_MTTRIG;
+
+		of_node_put(np);
+
+		np = of_get_child_by_name(fw_features, "inst-l1d-flush-ori30,30,0");
+		if (np && of_property_read_bool(np, "enabled"))
+			type = L1D_FLUSH_ORI;
+
+		of_node_put(np);
+
+		/* Enable unless firmware says NOT to */
+		enable = 2;
+		np = of_get_child_by_name(fw_features, "needs-l1d-flush-msr-hv-1-to-0");
+		if (np && of_property_read_bool(np, "disabled"))
+			enable--;
+
+		of_node_put(np);
+
+		np = of_get_child_by_name(fw_features, "needs-l1d-flush-msr-pr-0-to-1");
+		if (np && of_property_read_bool(np, "disabled"))
+			enable--;
+
+		of_node_put(np);
+		of_node_put(fw_features);
+	}
+
+	setup_rfi_flush(type, enable > 0);
+}
+
 static void __init pnv_setup_arch(void)
 static void __init pnv_setup_arch(void)
 {
 {
 	set_arch_panic_timeout(10, ARCH_PANIC_TIMEOUT);
 	set_arch_panic_timeout(10, ARCH_PANIC_TIMEOUT);
 
 
+	pnv_setup_rfi_flush();
+
 	/* Initialize SMP */
 	/* Initialize SMP */
 	pnv_smp_init();
 	pnv_smp_init();
 
 

+ 18 - 3
arch/powerpc/platforms/pseries/dlpar.c

@@ -574,11 +574,26 @@ static ssize_t dlpar_show(struct class *class, struct class_attribute *attr,
 
 
 static CLASS_ATTR_RW(dlpar);
 static CLASS_ATTR_RW(dlpar);
 
 
-static int __init pseries_dlpar_init(void)
+int __init dlpar_workqueue_init(void)
 {
 {
+	if (pseries_hp_wq)
+		return 0;
+
 	pseries_hp_wq = alloc_workqueue("pseries hotplug workqueue",
 	pseries_hp_wq = alloc_workqueue("pseries hotplug workqueue",
-					WQ_UNBOUND, 1);
+			WQ_UNBOUND, 1);
+
+	return pseries_hp_wq ? 0 : -ENOMEM;
+}
+
+static int __init dlpar_sysfs_init(void)
+{
+	int rc;
+
+	rc = dlpar_workqueue_init();
+	if (rc)
+		return rc;
+
 	return sysfs_create_file(kernel_kobj, &class_attr_dlpar.attr);
 	return sysfs_create_file(kernel_kobj, &class_attr_dlpar.attr);
 }
 }
-machine_device_initcall(pseries, pseries_dlpar_init);
+machine_device_initcall(pseries, dlpar_sysfs_init);
 
 

+ 2 - 0
arch/powerpc/platforms/pseries/pseries.h

@@ -98,4 +98,6 @@ static inline unsigned long cmo_get_page_size(void)
 	return CMO_PageSize;
 	return CMO_PageSize;
 }
 }
 
 
+int dlpar_workqueue_init(void);
+
 #endif /* _PSERIES_PSERIES_H */
 #endif /* _PSERIES_PSERIES_H */

+ 2 - 1
arch/powerpc/platforms/pseries/ras.c

@@ -69,7 +69,8 @@ static int __init init_ras_IRQ(void)
 	/* Hotplug Events */
 	/* Hotplug Events */
 	np = of_find_node_by_path("/event-sources/hot-plug-events");
 	np = of_find_node_by_path("/event-sources/hot-plug-events");
 	if (np != NULL) {
 	if (np != NULL) {
-		request_event_sources_irqs(np, ras_hotplug_interrupt,
+		if (dlpar_workqueue_init() == 0)
+			request_event_sources_irqs(np, ras_hotplug_interrupt,
 					   "RAS_HOTPLUG");
 					   "RAS_HOTPLUG");
 		of_node_put(np);
 		of_node_put(np);
 	}
 	}

+ 35 - 0
arch/powerpc/platforms/pseries/setup.c

@@ -459,6 +459,39 @@ static void __init find_and_init_phbs(void)
 	of_pci_check_probe_only();
 	of_pci_check_probe_only();
 }
 }
 
 
+static void pseries_setup_rfi_flush(void)
+{
+	struct h_cpu_char_result result;
+	enum l1d_flush_type types;
+	bool enable;
+	long rc;
+
+	/* Enable by default */
+	enable = true;
+
+	rc = plpar_get_cpu_characteristics(&result);
+	if (rc == H_SUCCESS) {
+		types = L1D_FLUSH_NONE;
+
+		if (result.character & H_CPU_CHAR_L1D_FLUSH_TRIG2)
+			types |= L1D_FLUSH_MTTRIG;
+		if (result.character & H_CPU_CHAR_L1D_FLUSH_ORI30)
+			types |= L1D_FLUSH_ORI;
+
+		/* Use fallback if nothing set in hcall */
+		if (types == L1D_FLUSH_NONE)
+			types = L1D_FLUSH_FALLBACK;
+
+		if (!(result.behaviour & H_CPU_BEHAV_L1D_FLUSH_PR))
+			enable = false;
+	} else {
+		/* Default to fallback in case the hcall is not available */
+		types = L1D_FLUSH_FALLBACK;
+	}
+
+	setup_rfi_flush(types, enable);
+}
+
 static void __init pSeries_setup_arch(void)
 static void __init pSeries_setup_arch(void)
 {
 {
 	set_arch_panic_timeout(10, ARCH_PANIC_TIMEOUT);
 	set_arch_panic_timeout(10, ARCH_PANIC_TIMEOUT);
@@ -476,6 +509,8 @@ static void __init pSeries_setup_arch(void)
 
 
 	fwnmi_init();
 	fwnmi_init();
 
 
+	pseries_setup_rfi_flush();
+
 	/* By default, only probe PCI (can be overridden by rtas_pci) */
 	/* By default, only probe PCI (can be overridden by rtas_pci) */
 	pci_add_flags(PCI_PROBE_ONLY);
 	pci_add_flags(PCI_PROBE_ONLY);
 
 

+ 75 - 0
arch/riscv/configs/defconfig

@@ -0,0 +1,75 @@
+CONFIG_SMP=y
+CONFIG_PCI=y
+CONFIG_PCIE_XILINX=y
+CONFIG_SYSVIPC=y
+CONFIG_POSIX_MQUEUE=y
+CONFIG_IKCONFIG=y
+CONFIG_IKCONFIG_PROC=y
+CONFIG_CGROUPS=y
+CONFIG_CGROUP_SCHED=y
+CONFIG_CFS_BANDWIDTH=y
+CONFIG_CGROUP_BPF=y
+CONFIG_NAMESPACES=y
+CONFIG_USER_NS=y
+CONFIG_BLK_DEV_INITRD=y
+CONFIG_EXPERT=y
+CONFIG_CHECKPOINT_RESTORE=y
+CONFIG_BPF_SYSCALL=y
+CONFIG_NET=y
+CONFIG_PACKET=y
+CONFIG_UNIX=y
+CONFIG_INET=y
+CONFIG_IP_MULTICAST=y
+CONFIG_IP_ADVANCED_ROUTER=y
+CONFIG_IP_PNP=y
+CONFIG_IP_PNP_DHCP=y
+CONFIG_IP_PNP_BOOTP=y
+CONFIG_IP_PNP_RARP=y
+CONFIG_NETLINK_DIAG=y
+CONFIG_DEVTMPFS=y
+CONFIG_BLK_DEV_LOOP=y
+CONFIG_VIRTIO_BLK=y
+CONFIG_BLK_DEV_SD=y
+CONFIG_BLK_DEV_SR=y
+CONFIG_ATA=y
+CONFIG_SATA_AHCI=y
+CONFIG_SATA_AHCI_PLATFORM=y
+CONFIG_NETDEVICES=y
+CONFIG_VIRTIO_NET=y
+CONFIG_MACB=y
+CONFIG_E1000E=y
+CONFIG_R8169=y
+CONFIG_MICROSEMI_PHY=y
+CONFIG_INPUT_MOUSEDEV=y
+CONFIG_SERIAL_8250=y
+CONFIG_SERIAL_8250_CONSOLE=y
+CONFIG_SERIAL_OF_PLATFORM=y
+# CONFIG_PTP_1588_CLOCK is not set
+CONFIG_DRM=y
+CONFIG_DRM_RADEON=y
+CONFIG_FRAMEBUFFER_CONSOLE=y
+CONFIG_USB=y
+CONFIG_USB_XHCI_HCD=y
+CONFIG_USB_XHCI_PLATFORM=y
+CONFIG_USB_EHCI_HCD=y
+CONFIG_USB_EHCI_HCD_PLATFORM=y
+CONFIG_USB_OHCI_HCD=y
+CONFIG_USB_OHCI_HCD_PLATFORM=y
+CONFIG_USB_STORAGE=y
+CONFIG_USB_UAS=y
+CONFIG_VIRTIO_MMIO=y
+CONFIG_RAS=y
+CONFIG_EXT4_FS=y
+CONFIG_EXT4_FS_POSIX_ACL=y
+CONFIG_AUTOFS4_FS=y
+CONFIG_MSDOS_FS=y
+CONFIG_VFAT_FS=y
+CONFIG_TMPFS=y
+CONFIG_TMPFS_POSIX_ACL=y
+CONFIG_NFS_FS=y
+CONFIG_NFS_V4=y
+CONFIG_NFS_V4_1=y
+CONFIG_NFS_V4_2=y
+CONFIG_ROOT_NFS=y
+# CONFIG_RCU_TRACE is not set
+CONFIG_CRYPTO_USER_API_HASH=y

+ 4 - 4
arch/riscv/include/asm/csr.h

@@ -17,10 +17,10 @@
 #include <linux/const.h>
 #include <linux/const.h>
 
 
 /* Status register flags */
 /* Status register flags */
-#define SR_IE   _AC(0x00000002, UL) /* Interrupt Enable */
-#define SR_PIE  _AC(0x00000020, UL) /* Previous IE */
-#define SR_PS   _AC(0x00000100, UL) /* Previously Supervisor */
-#define SR_SUM  _AC(0x00040000, UL) /* Supervisor may access User Memory */
+#define SR_SIE	_AC(0x00000002, UL) /* Supervisor Interrupt Enable */
+#define SR_SPIE	_AC(0x00000020, UL) /* Previous Supervisor IE */
+#define SR_SPP	_AC(0x00000100, UL) /* Previously Supervisor */
+#define SR_SUM	_AC(0x00040000, UL) /* Supervisor may access User Memory */
 
 
 #define SR_FS           _AC(0x00006000, UL) /* Floating-point Status */
 #define SR_FS           _AC(0x00006000, UL) /* Floating-point Status */
 #define SR_FS_OFF       _AC(0x00000000, UL)
 #define SR_FS_OFF       _AC(0x00000000, UL)

+ 0 - 4
arch/riscv/include/asm/io.h

@@ -21,8 +21,6 @@
 
 
 #include <linux/types.h>
 #include <linux/types.h>
 
 
-#ifdef CONFIG_MMU
-
 extern void __iomem *ioremap(phys_addr_t offset, unsigned long size);
 extern void __iomem *ioremap(phys_addr_t offset, unsigned long size);
 
 
 /*
 /*
@@ -36,8 +34,6 @@ extern void __iomem *ioremap(phys_addr_t offset, unsigned long size);
 
 
 extern void iounmap(volatile void __iomem *addr);
 extern void iounmap(volatile void __iomem *addr);
 
 
-#endif /* CONFIG_MMU */
-
 /* Generic IO read/write.  These perform native-endian accesses. */
 /* Generic IO read/write.  These perform native-endian accesses. */
 #define __raw_writeb __raw_writeb
 #define __raw_writeb __raw_writeb
 static inline void __raw_writeb(u8 val, volatile void __iomem *addr)
 static inline void __raw_writeb(u8 val, volatile void __iomem *addr)

+ 5 - 5
arch/riscv/include/asm/irqflags.h

@@ -27,25 +27,25 @@ static inline unsigned long arch_local_save_flags(void)
 /* unconditionally enable interrupts */
 /* unconditionally enable interrupts */
 static inline void arch_local_irq_enable(void)
 static inline void arch_local_irq_enable(void)
 {
 {
-	csr_set(sstatus, SR_IE);
+	csr_set(sstatus, SR_SIE);
 }
 }
 
 
 /* unconditionally disable interrupts */
 /* unconditionally disable interrupts */
 static inline void arch_local_irq_disable(void)
 static inline void arch_local_irq_disable(void)
 {
 {
-	csr_clear(sstatus, SR_IE);
+	csr_clear(sstatus, SR_SIE);
 }
 }
 
 
 /* get status and disable interrupts */
 /* get status and disable interrupts */
 static inline unsigned long arch_local_irq_save(void)
 static inline unsigned long arch_local_irq_save(void)
 {
 {
-	return csr_read_clear(sstatus, SR_IE);
+	return csr_read_clear(sstatus, SR_SIE);
 }
 }
 
 
 /* test flags */
 /* test flags */
 static inline int arch_irqs_disabled_flags(unsigned long flags)
 static inline int arch_irqs_disabled_flags(unsigned long flags)
 {
 {
-	return !(flags & SR_IE);
+	return !(flags & SR_SIE);
 }
 }
 
 
 /* test hardware interrupt enable bit */
 /* test hardware interrupt enable bit */
@@ -57,7 +57,7 @@ static inline int arch_irqs_disabled(void)
 /* set interrupt enabled status */
 /* set interrupt enabled status */
 static inline void arch_local_irq_restore(unsigned long flags)
 static inline void arch_local_irq_restore(unsigned long flags)
 {
 {
-	csr_set(sstatus, flags & SR_IE);
+	csr_set(sstatus, flags & SR_SIE);
 }
 }
 
 
 #endif /* _ASM_RISCV_IRQFLAGS_H */
 #endif /* _ASM_RISCV_IRQFLAGS_H */

+ 0 - 4
arch/riscv/include/asm/pgtable.h

@@ -20,8 +20,6 @@
 
 
 #ifndef __ASSEMBLY__
 #ifndef __ASSEMBLY__
 
 
-#ifdef CONFIG_MMU
-
 /* Page Upper Directory not used in RISC-V */
 /* Page Upper Directory not used in RISC-V */
 #include <asm-generic/pgtable-nopud.h>
 #include <asm-generic/pgtable-nopud.h>
 #include <asm/page.h>
 #include <asm/page.h>
@@ -413,8 +411,6 @@ static inline void pgtable_cache_init(void)
 	/* No page table caches to initialize */
 	/* No page table caches to initialize */
 }
 }
 
 
-#endif /* CONFIG_MMU */
-
 #define VMALLOC_SIZE     (KERN_VIRT_SIZE >> 1)
 #define VMALLOC_SIZE     (KERN_VIRT_SIZE >> 1)
 #define VMALLOC_END      (PAGE_OFFSET - 1)
 #define VMALLOC_END      (PAGE_OFFSET - 1)
 #define VMALLOC_START    (PAGE_OFFSET - VMALLOC_SIZE)
 #define VMALLOC_START    (PAGE_OFFSET - VMALLOC_SIZE)

+ 1 - 1
arch/riscv/include/asm/ptrace.h

@@ -66,7 +66,7 @@ struct pt_regs {
 #define REG_FMT "%08lx"
 #define REG_FMT "%08lx"
 #endif
 #endif
 
 
-#define user_mode(regs) (((regs)->sstatus & SR_PS) == 0)
+#define user_mode(regs) (((regs)->sstatus & SR_SPP) == 0)
 
 
 
 
 /* Helpers for working with the instruction pointer */
 /* Helpers for working with the instruction pointer */

+ 0 - 4
arch/riscv/include/asm/tlbflush.h

@@ -15,8 +15,6 @@
 #ifndef _ASM_RISCV_TLBFLUSH_H
 #ifndef _ASM_RISCV_TLBFLUSH_H
 #define _ASM_RISCV_TLBFLUSH_H
 #define _ASM_RISCV_TLBFLUSH_H
 
 
-#ifdef CONFIG_MMU
-
 #include <linux/mm_types.h>
 #include <linux/mm_types.h>
 
 
 /*
 /*
@@ -64,6 +62,4 @@ static inline void flush_tlb_kernel_range(unsigned long start,
 	flush_tlb_all();
 	flush_tlb_all();
 }
 }
 
 
-#endif /* CONFIG_MMU */
-
 #endif /* _ASM_RISCV_TLBFLUSH_H */
 #endif /* _ASM_RISCV_TLBFLUSH_H */

+ 0 - 12
arch/riscv/include/asm/uaccess.h

@@ -127,7 +127,6 @@ extern int fixup_exception(struct pt_regs *state);
  * call.
  * call.
  */
  */
 
 
-#ifdef CONFIG_MMU
 #define __get_user_asm(insn, x, ptr, err)			\
 #define __get_user_asm(insn, x, ptr, err)			\
 do {								\
 do {								\
 	uintptr_t __tmp;					\
 	uintptr_t __tmp;					\
@@ -153,13 +152,11 @@ do {								\
 	__disable_user_access();				\
 	__disable_user_access();				\
 	(x) = __x;						\
 	(x) = __x;						\
 } while (0)
 } while (0)
-#endif /* CONFIG_MMU */
 
 
 #ifdef CONFIG_64BIT
 #ifdef CONFIG_64BIT
 #define __get_user_8(x, ptr, err) \
 #define __get_user_8(x, ptr, err) \
 	__get_user_asm("ld", x, ptr, err)
 	__get_user_asm("ld", x, ptr, err)
 #else /* !CONFIG_64BIT */
 #else /* !CONFIG_64BIT */
-#ifdef CONFIG_MMU
 #define __get_user_8(x, ptr, err)				\
 #define __get_user_8(x, ptr, err)				\
 do {								\
 do {								\
 	u32 __user *__ptr = (u32 __user *)(ptr);		\
 	u32 __user *__ptr = (u32 __user *)(ptr);		\
@@ -193,7 +190,6 @@ do {								\
 	(x) = (__typeof__(x))((__typeof__((x)-(x)))(		\
 	(x) = (__typeof__(x))((__typeof__((x)-(x)))(		\
 		(((u64)__hi << 32) | __lo)));			\
 		(((u64)__hi << 32) | __lo)));			\
 } while (0)
 } while (0)
-#endif /* CONFIG_MMU */
 #endif /* CONFIG_64BIT */
 #endif /* CONFIG_64BIT */
 
 
 
 
@@ -267,8 +263,6 @@ do {								\
 		((x) = 0, -EFAULT);				\
 		((x) = 0, -EFAULT);				\
 })
 })
 
 
-
-#ifdef CONFIG_MMU
 #define __put_user_asm(insn, x, ptr, err)			\
 #define __put_user_asm(insn, x, ptr, err)			\
 do {								\
 do {								\
 	uintptr_t __tmp;					\
 	uintptr_t __tmp;					\
@@ -292,14 +286,11 @@ do {								\
 		: "rJ" (__x), "i" (-EFAULT));			\
 		: "rJ" (__x), "i" (-EFAULT));			\
 	__disable_user_access();				\
 	__disable_user_access();				\
 } while (0)
 } while (0)
-#endif /* CONFIG_MMU */
-
 
 
 #ifdef CONFIG_64BIT
 #ifdef CONFIG_64BIT
 #define __put_user_8(x, ptr, err) \
 #define __put_user_8(x, ptr, err) \
 	__put_user_asm("sd", x, ptr, err)
 	__put_user_asm("sd", x, ptr, err)
 #else /* !CONFIG_64BIT */
 #else /* !CONFIG_64BIT */
-#ifdef CONFIG_MMU
 #define __put_user_8(x, ptr, err)				\
 #define __put_user_8(x, ptr, err)				\
 do {								\
 do {								\
 	u32 __user *__ptr = (u32 __user *)(ptr);		\
 	u32 __user *__ptr = (u32 __user *)(ptr);		\
@@ -329,7 +320,6 @@ do {								\
 		: "rJ" (__x), "rJ" (__x >> 32), "i" (-EFAULT));	\
 		: "rJ" (__x), "rJ" (__x >> 32), "i" (-EFAULT));	\
 	__disable_user_access();				\
 	__disable_user_access();				\
 } while (0)
 } while (0)
-#endif /* CONFIG_MMU */
 #endif /* CONFIG_64BIT */
 #endif /* CONFIG_64BIT */
 
 
 
 
@@ -438,7 +428,6 @@ unsigned long __must_check clear_user(void __user *to, unsigned long n)
  * will set "err" to -EFAULT, while successful accesses return the previous
  * will set "err" to -EFAULT, while successful accesses return the previous
  * value.
  * value.
  */
  */
-#ifdef CONFIG_MMU
 #define __cmpxchg_user(ptr, old, new, err, size, lrb, scb)	\
 #define __cmpxchg_user(ptr, old, new, err, size, lrb, scb)	\
 ({								\
 ({								\
 	__typeof__(ptr) __ptr = (ptr);				\
 	__typeof__(ptr) __ptr = (ptr);				\
@@ -508,6 +497,5 @@ unsigned long __must_check clear_user(void __user *to, unsigned long n)
 	(err) = __err;						\
 	(err) = __err;						\
 	__ret;							\
 	__ret;							\
 })
 })
-#endif /* CONFIG_MMU */
 
 
 #endif /* _ASM_RISCV_UACCESS_H */
 #endif /* _ASM_RISCV_UACCESS_H */

+ 1 - 0
arch/riscv/include/asm/unistd.h

@@ -14,3 +14,4 @@
 #define __ARCH_HAVE_MMU
 #define __ARCH_HAVE_MMU
 #define __ARCH_WANT_SYS_CLONE
 #define __ARCH_WANT_SYS_CLONE
 #include <uapi/asm/unistd.h>
 #include <uapi/asm/unistd.h>
+#include <uapi/asm/syscalls.h>

+ 0 - 28
arch/riscv/include/asm/vdso-syscalls.h

@@ -1,28 +0,0 @@
-/*
- * Copyright (C) 2017 SiFive
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program.  If not, see <http://www.gnu.org/licenses/>.
- */
-
-#ifndef _ASM_RISCV_VDSO_SYSCALLS_H
-#define _ASM_RISCV_VDSO_SYSCALLS_H
-
-#ifdef CONFIG_SMP
-
-/* These syscalls are only used by the vDSO and are not in the uapi. */
-#define __NR_riscv_flush_icache (__NR_arch_specific_syscall + 15)
-__SYSCALL(__NR_riscv_flush_icache, sys_riscv_flush_icache)
-
-#endif
-
-#endif /* _ASM_RISCV_VDSO_H */

+ 26 - 0
arch/riscv/include/uapi/asm/syscalls.h

@@ -0,0 +1,26 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Copyright (C) 2017 SiFive
+ */
+
+#ifndef _ASM__UAPI__SYSCALLS_H
+#define _ASM__UAPI__SYSCALLS_H
+
+/*
+ * Allows the instruction cache to be flushed from userspace.  Despite RISC-V
+ * having a direct 'fence.i' instruction available to userspace (which we
+ * can't trap!), that's not actually viable when running on Linux because the
+ * kernel might schedule a process on another hart.  There is no way for
+ * userspace to handle this without invoking the kernel (as it doesn't know the
+ * thread->hart mappings), so we've defined a RISC-V specific system call to
+ * flush the instruction cache.
+ *
+ * __NR_riscv_flush_icache is defined to flush the instruction cache over an
+ * address range, with the flush applying to either all threads or just the
+ * caller.  We don't currently do anything with the address range; that's just
+ * in there for forwards compatibility.
+ */
+#define __NR_riscv_flush_icache (__NR_arch_specific_syscall + 15)
+__SYSCALL(__NR_riscv_flush_icache, sys_riscv_flush_icache)
+
+#endif

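A hypothetical user-space caller of the syscall declared above might look like the sketch below; the fallback syscall number and the meaning of flags == 0 are assumptions for illustration, not guarantees made by this header.

#include <unistd.h>
#include <sys/syscall.h>

#ifndef __NR_riscv_flush_icache
/* __NR_arch_specific_syscall is 244 in the asm-generic table (assumption) */
#define __NR_riscv_flush_icache (244 + 15)
#endif

static long flush_icache(void *start, void *end)
{
	/* flags == 0 assumed to mean "flush for all threads"; the range is
	 * currently ignored by the kernel but passed for forwards compatibility */
	return syscall(__NR_riscv_flush_icache, start, end, 0UL);
}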
+ 4 - 4
arch/riscv/kernel/entry.S

@@ -196,7 +196,7 @@ handle_syscall:
 	addi s2, s2, 0x4
 	addi s2, s2, 0x4
 	REG_S s2, PT_SEPC(sp)
 	REG_S s2, PT_SEPC(sp)
 	/* System calls run with interrupts enabled */
 	/* System calls run with interrupts enabled */
-	csrs sstatus, SR_IE
+	csrs sstatus, SR_SIE
 	/* Trace syscalls, but only if requested by the user. */
 	/* Trace syscalls, but only if requested by the user. */
 	REG_L t0, TASK_TI_FLAGS(tp)
 	REG_L t0, TASK_TI_FLAGS(tp)
 	andi t0, t0, _TIF_SYSCALL_TRACE
 	andi t0, t0, _TIF_SYSCALL_TRACE
@@ -224,8 +224,8 @@ ret_from_syscall:
 
 
 ret_from_exception:
 ret_from_exception:
 	REG_L s0, PT_SSTATUS(sp)
 	REG_L s0, PT_SSTATUS(sp)
-	csrc sstatus, SR_IE
-	andi s0, s0, SR_PS
+	csrc sstatus, SR_SIE
+	andi s0, s0, SR_SPP
 	bnez s0, restore_all
 	bnez s0, restore_all
 
 
 resume_userspace:
 resume_userspace:
@@ -255,7 +255,7 @@ work_pending:
 	bnez s1, work_resched
 	bnez s1, work_resched
 work_notifysig:
 work_notifysig:
 	/* Handle pending signals and notify-resume requests */
 	/* Handle pending signals and notify-resume requests */
-	csrs sstatus, SR_IE /* Enable interrupts for do_notify_resume() */
+	csrs sstatus, SR_SIE /* Enable interrupts for do_notify_resume() */
 	move a0, sp /* pt_regs */
 	move a0, sp /* pt_regs */
 	move a1, s0 /* current_thread_info->flags */
 	move a1, s0 /* current_thread_info->flags */
 	tail do_notify_resume
 	tail do_notify_resume

+ 2 - 2
arch/riscv/kernel/process.c

@@ -76,7 +76,7 @@ void show_regs(struct pt_regs *regs)
 void start_thread(struct pt_regs *regs, unsigned long pc,
 void start_thread(struct pt_regs *regs, unsigned long pc,
 	unsigned long sp)
 	unsigned long sp)
 {
 {
-	regs->sstatus = SR_PIE /* User mode, irqs on */ | SR_FS_INITIAL;
+	regs->sstatus = SR_SPIE /* User mode, irqs on */ | SR_FS_INITIAL;
 	regs->sepc = pc;
 	regs->sepc = pc;
 	regs->sp = sp;
 	regs->sp = sp;
 	set_fs(USER_DS);
 	set_fs(USER_DS);
@@ -110,7 +110,7 @@ int copy_thread(unsigned long clone_flags, unsigned long usp,
 		const register unsigned long gp __asm__ ("gp");
 		const register unsigned long gp __asm__ ("gp");
 		memset(childregs, 0, sizeof(struct pt_regs));
 		memset(childregs, 0, sizeof(struct pt_regs));
 		childregs->gp = gp;
 		childregs->gp = gp;
-		childregs->sstatus = SR_PS | SR_PIE; /* Supervisor, irqs on */
+		childregs->sstatus = SR_SPP | SR_SPIE; /* Supervisor, irqs on */
 
 
 		p->thread.ra = (unsigned long)ret_from_kernel_thread;
 		p->thread.ra = (unsigned long)ret_from_kernel_thread;
 		p->thread.s[0] = usp; /* fn */
 		p->thread.s[0] = usp; /* fn */

+ 0 - 1
arch/riscv/kernel/syscall_table.c

@@ -23,5 +23,4 @@
 void *sys_call_table[__NR_syscalls] = {
 void *sys_call_table[__NR_syscalls] = {
 	[0 ... __NR_syscalls - 1] = sys_ni_syscall,
 	[0 ... __NR_syscalls - 1] = sys_ni_syscall,
 #include <asm/unistd.h>
 #include <asm/unistd.h>
-#include <asm/vdso-syscalls.h>
 };
 };

+ 0 - 1
arch/riscv/kernel/vdso/flush_icache.S

@@ -13,7 +13,6 @@
 
 
 #include <linux/linkage.h>
 #include <linux/linkage.h>
 #include <asm/unistd.h>
 #include <asm/unistd.h>
-#include <asm/vdso-syscalls.h>
 
 
 	.text
 	.text
 /* int __vdso_flush_icache(void *start, void *end, unsigned long flags); */
 /* int __vdso_flush_icache(void *start, void *end, unsigned long flags); */

+ 1 - 1
arch/riscv/mm/fault.c

@@ -63,7 +63,7 @@ asmlinkage void do_page_fault(struct pt_regs *regs)
 		goto vmalloc_fault;
 		goto vmalloc_fault;
 
 
 	/* Enable interrupts if they were enabled in the parent context. */
 	/* Enable interrupts if they were enabled in the parent context. */
-	if (likely(regs->sstatus & SR_PIE))
+	if (likely(regs->sstatus & SR_SPIE))
 		local_irq_enable();
 		local_irq_enable();
 
 
 	/*
 	/*

+ 20 - 4
arch/sh/boards/mach-se/770x/setup.c

@@ -9,6 +9,7 @@
  */
  */
 #include <linux/init.h>
 #include <linux/init.h>
 #include <linux/platform_device.h>
 #include <linux/platform_device.h>
+#include <linux/sh_eth.h>
 #include <mach-se/mach/se.h>
 #include <mach-se/mach/se.h>
 #include <mach-se/mach/mrshpc.h>
 #include <mach-se/mach/mrshpc.h>
 #include <asm/machvec.h>
 #include <asm/machvec.h>
@@ -115,13 +116,23 @@ static struct platform_device heartbeat_device = {
 #if defined(CONFIG_CPU_SUBTYPE_SH7710) ||\
 #if defined(CONFIG_CPU_SUBTYPE_SH7710) ||\
 	defined(CONFIG_CPU_SUBTYPE_SH7712)
 	defined(CONFIG_CPU_SUBTYPE_SH7712)
 /* SH771X Ethernet driver */
 /* SH771X Ethernet driver */
+static struct sh_eth_plat_data sh_eth_plat = {
+	.phy = PHY_ID,
+	.phy_interface = PHY_INTERFACE_MODE_MII,
+};
+
 static struct resource sh_eth0_resources[] = {
 static struct resource sh_eth0_resources[] = {
 	[0] = {
 	[0] = {
 		.start = SH_ETH0_BASE,
 		.start = SH_ETH0_BASE,
-		.end = SH_ETH0_BASE + 0x1B8,
+		.end = SH_ETH0_BASE + 0x1B8 - 1,
 		.flags = IORESOURCE_MEM,
 		.flags = IORESOURCE_MEM,
 	},
 	},
 	[1] = {
 	[1] = {
+		.start = SH_TSU_BASE,
+		.end = SH_TSU_BASE + 0x200 - 1,
+		.flags = IORESOURCE_MEM,
+	},
+	[2] = {
 		.start = SH_ETH0_IRQ,
 		.start = SH_ETH0_IRQ,
 		.end = SH_ETH0_IRQ,
 		.end = SH_ETH0_IRQ,
 		.flags = IORESOURCE_IRQ,
 		.flags = IORESOURCE_IRQ,
@@ -132,7 +143,7 @@ static struct platform_device sh_eth0_device = {
 	.name = "sh771x-ether",
 	.name = "sh771x-ether",
 	.id = 0,
 	.id = 0,
 	.dev = {
 	.dev = {
-		.platform_data = PHY_ID,
+		.platform_data = &sh_eth_plat,
 	},
 	},
 	.num_resources = ARRAY_SIZE(sh_eth0_resources),
 	.num_resources = ARRAY_SIZE(sh_eth0_resources),
 	.resource = sh_eth0_resources,
 	.resource = sh_eth0_resources,
@@ -141,10 +152,15 @@ static struct platform_device sh_eth0_device = {
 static struct resource sh_eth1_resources[] = {
 static struct resource sh_eth1_resources[] = {
 	[0] = {
 	[0] = {
 		.start = SH_ETH1_BASE,
 		.start = SH_ETH1_BASE,
-		.end = SH_ETH1_BASE + 0x1B8,
+		.end = SH_ETH1_BASE + 0x1B8 - 1,
 		.flags = IORESOURCE_MEM,
 		.flags = IORESOURCE_MEM,
 	},
 	},
 	[1] = {
 	[1] = {
+		.start = SH_TSU_BASE,
+		.end = SH_TSU_BASE + 0x200 - 1,
+		.flags = IORESOURCE_MEM,
+	},
+	[2] = {
 		.start = SH_ETH1_IRQ,
 		.start = SH_ETH1_IRQ,
 		.end = SH_ETH1_IRQ,
 		.end = SH_ETH1_IRQ,
 		.flags = IORESOURCE_IRQ,
 		.flags = IORESOURCE_IRQ,
@@ -155,7 +171,7 @@ static struct platform_device sh_eth1_device = {
 	.name = "sh771x-ether",
 	.name = "sh771x-ether",
 	.id = 1,
 	.id = 1,
 	.dev = {
 	.dev = {
-		.platform_data = PHY_ID,
+		.platform_data = &sh_eth_plat,
 	},
 	},
 	.num_resources = ARRAY_SIZE(sh_eth1_resources),
 	.num_resources = ARRAY_SIZE(sh_eth1_resources),
 	.resource = sh_eth1_resources,
 	.resource = sh_eth1_resources,

+ 1 - 0
arch/sh/include/mach-se/mach/se.h

@@ -100,6 +100,7 @@
 /* Base address */
 /* Base address */
 #define SH_ETH0_BASE 0xA7000000
 #define SH_ETH0_BASE 0xA7000000
 #define SH_ETH1_BASE 0xA7000400
 #define SH_ETH1_BASE 0xA7000400
+#define SH_TSU_BASE  0xA7000800
 /* PHY ID */
 /* PHY ID */
 #if defined(CONFIG_CPU_SUBTYPE_SH7710)
 #if defined(CONFIG_CPU_SUBTYPE_SH7710)
 # define PHY_ID 0x00
 # define PHY_ID 0x00

+ 14 - 1
arch/x86/Kconfig

@@ -55,7 +55,6 @@ config X86
 	select ARCH_HAS_GCOV_PROFILE_ALL
 	select ARCH_HAS_GCOV_PROFILE_ALL
 	select ARCH_HAS_KCOV			if X86_64
 	select ARCH_HAS_KCOV			if X86_64
 	select ARCH_HAS_PMEM_API		if X86_64
 	select ARCH_HAS_PMEM_API		if X86_64
-	# Causing hangs/crashes, see the commit that added this change for details.
 	select ARCH_HAS_REFCOUNT
 	select ARCH_HAS_REFCOUNT
 	select ARCH_HAS_UACCESS_FLUSHCACHE	if X86_64
 	select ARCH_HAS_UACCESS_FLUSHCACHE	if X86_64
 	select ARCH_HAS_SET_MEMORY
 	select ARCH_HAS_SET_MEMORY
@@ -89,6 +88,7 @@ config X86
 	select GENERIC_CLOCKEVENTS_MIN_ADJUST
 	select GENERIC_CLOCKEVENTS_MIN_ADJUST
 	select GENERIC_CMOS_UPDATE
 	select GENERIC_CMOS_UPDATE
 	select GENERIC_CPU_AUTOPROBE
 	select GENERIC_CPU_AUTOPROBE
+	select GENERIC_CPU_VULNERABILITIES
 	select GENERIC_EARLY_IOREMAP
 	select GENERIC_EARLY_IOREMAP
 	select GENERIC_FIND_FIRST_BIT
 	select GENERIC_FIND_FIRST_BIT
 	select GENERIC_IOMAP
 	select GENERIC_IOMAP
@@ -429,6 +429,19 @@ config GOLDFISH
        def_bool y
        def_bool y
        depends on X86_GOLDFISH
        depends on X86_GOLDFISH
 
 
+config RETPOLINE
+	bool "Avoid speculative indirect branches in kernel"
+	default y
+	help
+	  Compile kernel with the retpoline compiler options to guard against
+	  kernel-to-user data leaks by avoiding speculative indirect
+	  branches. Requires a compiler with -mindirect-branch=thunk-extern
+	  support for full protection. The kernel may run slower.
+
+	  Without compiler support, at least indirect branches in assembler
+	  code are eliminated. Since this includes the syscall entry path,
+	  it is not entirely pointless.
+
 config INTEL_RDT
 config INTEL_RDT
 	bool "Intel Resource Director Technology support"
 	bool "Intel Resource Director Technology support"
 	default n
 	default n

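For readers unfamiliar with the term, an "indirect branch" in the sense of the new option above is any call or jump through a register or memory operand. A trivial C example (not from the patch) that contains one:

/* Illustrative only: one indirect call of the kind CONFIG_RETPOLINE rewrites. */
typedef int (*handler_t)(int);

int dispatch(handler_t h, int arg)
{
	/* Without retpolines this compiles to a bare "call *%reg"; with
	 * -mindirect-branch=thunk-extern the compiler instead calls an
	 * out-of-line __x86_indirect_thunk_<reg> helper. */
	return h(arg);
}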
+ 8 - 0
arch/x86/Makefile

@@ -230,6 +230,14 @@ KBUILD_CFLAGS += -Wno-sign-compare
 #
 #
 KBUILD_CFLAGS += -fno-asynchronous-unwind-tables
 KBUILD_CFLAGS += -fno-asynchronous-unwind-tables
 
 
+# Avoid indirect branches in kernel to deal with Spectre
+ifdef CONFIG_RETPOLINE
+    RETPOLINE_CFLAGS += $(call cc-option,-mindirect-branch=thunk-extern -mindirect-branch-register)
+    ifneq ($(RETPOLINE_CFLAGS),)
+        KBUILD_CFLAGS += $(RETPOLINE_CFLAGS) -DRETPOLINE
+    endif
+endif
+
 archscripts: scripts_basic
 archscripts: scripts_basic
 	$(Q)$(MAKE) $(build)=arch/x86/tools relocs
 	$(Q)$(MAKE) $(build)=arch/x86/tools relocs
 
 

+ 3 - 2
arch/x86/crypto/aesni-intel_asm.S

@@ -32,6 +32,7 @@
 #include <linux/linkage.h>
 #include <linux/linkage.h>
 #include <asm/inst.h>
 #include <asm/inst.h>
 #include <asm/frame.h>
 #include <asm/frame.h>
+#include <asm/nospec-branch.h>
 
 
 /*
 /*
  * The following macros are used to move an (un)aligned 16 byte value to/from
  * The following macros are used to move an (un)aligned 16 byte value to/from
@@ -2884,7 +2885,7 @@ ENTRY(aesni_xts_crypt8)
 	pxor INC, STATE4
 	pxor INC, STATE4
 	movdqu IV, 0x30(OUTP)
 	movdqu IV, 0x30(OUTP)
 
 
-	call *%r11
+	CALL_NOSPEC %r11
 
 
 	movdqu 0x00(OUTP), INC
 	movdqu 0x00(OUTP), INC
 	pxor INC, STATE1
 	pxor INC, STATE1
@@ -2929,7 +2930,7 @@ ENTRY(aesni_xts_crypt8)
 	_aesni_gf128mul_x_ble()
 	_aesni_gf128mul_x_ble()
 	movups IV, (IVP)
 	movups IV, (IVP)
 
 
-	call *%r11
+	CALL_NOSPEC %r11
 
 
 	movdqu 0x40(OUTP), INC
 	movdqu 0x40(OUTP), INC
 	pxor INC, STATE1
 	pxor INC, STATE1

+ 2 - 1
arch/x86/crypto/camellia-aesni-avx-asm_64.S

@@ -17,6 +17,7 @@
 
 
 #include <linux/linkage.h>
 #include <linux/linkage.h>
 #include <asm/frame.h>
 #include <asm/frame.h>
+#include <asm/nospec-branch.h>
 
 
 #define CAMELLIA_TABLE_BYTE_LEN 272
 #define CAMELLIA_TABLE_BYTE_LEN 272
 
 
@@ -1227,7 +1228,7 @@ camellia_xts_crypt_16way:
 	vpxor 14 * 16(%rax), %xmm15, %xmm14;
 	vpxor 14 * 16(%rax), %xmm15, %xmm14;
 	vpxor 15 * 16(%rax), %xmm15, %xmm15;
 	vpxor 15 * 16(%rax), %xmm15, %xmm15;
 
 
-	call *%r9;
+	CALL_NOSPEC %r9;
 
 
 	addq $(16 * 16), %rsp;
 	addq $(16 * 16), %rsp;
 
 

+ 2 - 1
arch/x86/crypto/camellia-aesni-avx2-asm_64.S

@@ -12,6 +12,7 @@
 
 
 #include <linux/linkage.h>
 #include <linux/linkage.h>
 #include <asm/frame.h>
 #include <asm/frame.h>
+#include <asm/nospec-branch.h>
 
 
 #define CAMELLIA_TABLE_BYTE_LEN 272
 #define CAMELLIA_TABLE_BYTE_LEN 272
 
 
@@ -1343,7 +1344,7 @@ camellia_xts_crypt_32way:
 	vpxor 14 * 32(%rax), %ymm15, %ymm14;
 	vpxor 14 * 32(%rax), %ymm15, %ymm14;
 	vpxor 15 * 32(%rax), %ymm15, %ymm15;
 	vpxor 15 * 32(%rax), %ymm15, %ymm15;
 
 
-	call *%r9;
+	CALL_NOSPEC %r9;
 
 
 	addq $(16 * 32), %rsp;
 	addq $(16 * 32), %rsp;
 
 

+ 2 - 1
arch/x86/crypto/crc32c-pcl-intel-asm_64.S

@@ -45,6 +45,7 @@
 
 
 #include <asm/inst.h>
 #include <asm/inst.h>
 #include <linux/linkage.h>
 #include <linux/linkage.h>
+#include <asm/nospec-branch.h>
 
 
 ## ISCSI CRC 32 Implementation with crc32 and pclmulqdq Instruction
 ## ISCSI CRC 32 Implementation with crc32 and pclmulqdq Instruction
 
 
@@ -172,7 +173,7 @@ continue_block:
 	movzxw  (bufp, %rax, 2), len
 	movzxw  (bufp, %rax, 2), len
 	lea	crc_array(%rip), bufp
 	lea	crc_array(%rip), bufp
 	lea     (bufp, len, 1), bufp
 	lea     (bufp, len, 1), bufp
-	jmp     *bufp
+	JMP_NOSPEC bufp
 
 
 	################################################################
 	################################################################
 	## 2a) PROCESS FULL BLOCKS:
 	## 2a) PROCESS FULL BLOCKS:

+ 19 - 17
arch/x86/entry/calling.h

@@ -198,8 +198,11 @@ For 32-bit we have the following conventions - kernel is built with
  * PAGE_TABLE_ISOLATION PGDs are 8k.  Flip bit 12 to switch between the two
  * PAGE_TABLE_ISOLATION PGDs are 8k.  Flip bit 12 to switch between the two
  * halves:
  * halves:
  */
  */
-#define PTI_SWITCH_PGTABLES_MASK	(1<<PAGE_SHIFT)
-#define PTI_SWITCH_MASK		(PTI_SWITCH_PGTABLES_MASK|(1<<X86_CR3_PTI_SWITCH_BIT))
+#define PTI_USER_PGTABLE_BIT		PAGE_SHIFT
+#define PTI_USER_PGTABLE_MASK		(1 << PTI_USER_PGTABLE_BIT)
+#define PTI_USER_PCID_BIT		X86_CR3_PTI_PCID_USER_BIT
+#define PTI_USER_PCID_MASK		(1 << PTI_USER_PCID_BIT)
+#define PTI_USER_PGTABLE_AND_PCID_MASK  (PTI_USER_PCID_MASK | PTI_USER_PGTABLE_MASK)
 
 
 .macro SET_NOFLUSH_BIT	reg:req
 .macro SET_NOFLUSH_BIT	reg:req
 	bts	$X86_CR3_PCID_NOFLUSH_BIT, \reg
 	bts	$X86_CR3_PCID_NOFLUSH_BIT, \reg
@@ -208,7 +211,7 @@ For 32-bit we have the following conventions - kernel is built with
 .macro ADJUST_KERNEL_CR3 reg:req
 .macro ADJUST_KERNEL_CR3 reg:req
 	ALTERNATIVE "", "SET_NOFLUSH_BIT \reg", X86_FEATURE_PCID
 	ALTERNATIVE "", "SET_NOFLUSH_BIT \reg", X86_FEATURE_PCID
 	/* Clear PCID and "PAGE_TABLE_ISOLATION bit", point CR3 at kernel pagetables: */
 	/* Clear PCID and "PAGE_TABLE_ISOLATION bit", point CR3 at kernel pagetables: */
-	andq    $(~PTI_SWITCH_MASK), \reg
+	andq    $(~PTI_USER_PGTABLE_AND_PCID_MASK), \reg
 .endm
 .endm
 
 
 .macro SWITCH_TO_KERNEL_CR3 scratch_reg:req
 .macro SWITCH_TO_KERNEL_CR3 scratch_reg:req
@@ -239,15 +242,19 @@ For 32-bit we have the following conventions - kernel is built with
 	/* Flush needed, clear the bit */
 	/* Flush needed, clear the bit */
 	btr	\scratch_reg, THIS_CPU_user_pcid_flush_mask
 	btr	\scratch_reg, THIS_CPU_user_pcid_flush_mask
 	movq	\scratch_reg2, \scratch_reg
 	movq	\scratch_reg2, \scratch_reg
-	jmp	.Lwrcr3_\@
+	jmp	.Lwrcr3_pcid_\@
 
 
 .Lnoflush_\@:
 .Lnoflush_\@:
 	movq	\scratch_reg2, \scratch_reg
 	movq	\scratch_reg2, \scratch_reg
 	SET_NOFLUSH_BIT \scratch_reg
 	SET_NOFLUSH_BIT \scratch_reg
 
 
+.Lwrcr3_pcid_\@:
+	/* Flip the ASID to the user version */
+	orq	$(PTI_USER_PCID_MASK), \scratch_reg
+
 .Lwrcr3_\@:
 .Lwrcr3_\@:
-	/* Flip the PGD and ASID to the user version */
-	orq     $(PTI_SWITCH_MASK), \scratch_reg
+	/* Flip the PGD to the user version */
+	orq     $(PTI_USER_PGTABLE_MASK), \scratch_reg
 	mov	\scratch_reg, %cr3
 	mov	\scratch_reg, %cr3
 .Lend_\@:
 .Lend_\@:
 .endm
 .endm
@@ -263,17 +270,12 @@ For 32-bit we have the following conventions - kernel is built with
 	movq	%cr3, \scratch_reg
 	movq	%cr3, \scratch_reg
 	movq	\scratch_reg, \save_reg
 	movq	\scratch_reg, \save_reg
 	/*
 	/*
-	 * Is the "switch mask" all zero?  That means that both of
-	 * these are zero:
-	 *
-	 *	1. The user/kernel PCID bit, and
-	 *	2. The user/kernel "bit" that points CR3 to the
-	 *	   bottom half of the 8k PGD
-	 *
-	 * That indicates a kernel CR3 value, not a user CR3.
+	 * Test the user pagetable bit. If set, then the user page tables
+	 * are active. If clear CR3 already has the kernel page table
+	 * active.
 	 */
 	 */
-	testq	$(PTI_SWITCH_MASK), \scratch_reg
-	jz	.Ldone_\@
+	bt	$PTI_USER_PGTABLE_BIT, \scratch_reg
+	jnc	.Ldone_\@
 
 
 	ADJUST_KERNEL_CR3 \scratch_reg
 	ADJUST_KERNEL_CR3 \scratch_reg
 	movq	\scratch_reg, %cr3
 	movq	\scratch_reg, %cr3
@@ -290,7 +292,7 @@ For 32-bit we have the following conventions - kernel is built with
 	 * KERNEL pages can always resume with NOFLUSH as we do
 	 * KERNEL pages can always resume with NOFLUSH as we do
 	 * explicit flushes.
 	 * explicit flushes.
 	 */
 	 */
-	bt	$X86_CR3_PTI_SWITCH_BIT, \save_reg
+	bt	$PTI_USER_PGTABLE_BIT, \save_reg
 	jnc	.Lnoflush_\@
 	jnc	.Lnoflush_\@
 
 
 	/*
 	/*

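To summarise the split the renamed macros express, a small stand-alone sketch of the CR3 bit arithmetic follows; bit 12 is PAGE_SHIFT as stated above, while the PCID bit position (11) is an assumption about X86_CR3_PTI_PCID_USER_BIT rather than something shown in this hunk.

#include <stdio.h>

#define PTI_USER_PGTABLE_BIT	12	/* PAGE_SHIFT: selects the user half of the 8k PGD */
#define PTI_USER_PCID_BIT	11	/* assumed value of X86_CR3_PTI_PCID_USER_BIT */

int main(void)
{
	unsigned long long kernel_cr3 = 0x180000000ULL;	/* made-up, page-aligned CR3 */
	unsigned long long user_cr3;

	/* SWITCH_TO_USER_CR3 now sets the PCID bit and the PGD bit in separate
	 * steps, but the combined effect is still to flip both: */
	user_cr3 = kernel_cr3 | (1ULL << PTI_USER_PCID_BIT) | (1ULL << PTI_USER_PGTABLE_BIT);

	/* The restore path only tests the PGD bit to tell user from kernel CR3 */
	printf("user CR3? %s\n", (user_cr3 >> PTI_USER_PGTABLE_BIT) & 1 ? "yes" : "no");
	return 0;
}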
+ 3 - 2
arch/x86/entry/entry_32.S

@@ -44,6 +44,7 @@
 #include <asm/asm.h>
 #include <asm/asm.h>
 #include <asm/smap.h>
 #include <asm/smap.h>
 #include <asm/frame.h>
 #include <asm/frame.h>
+#include <asm/nospec-branch.h>
 
 
 	.section .entry.text, "ax"
 	.section .entry.text, "ax"
 
 
@@ -290,7 +291,7 @@ ENTRY(ret_from_fork)
 
 
 	/* kernel thread */
 	/* kernel thread */
 1:	movl	%edi, %eax
 1:	movl	%edi, %eax
-	call	*%ebx
+	CALL_NOSPEC %ebx
 	/*
 	/*
 	 * A kernel thread is allowed to return here after successfully
 	 * A kernel thread is allowed to return here after successfully
 	 * calling do_execve().  Exit to userspace to complete the execve()
 	 * calling do_execve().  Exit to userspace to complete the execve()
@@ -919,7 +920,7 @@ common_exception:
 	movl	%ecx, %es
 	movl	%ecx, %es
 	TRACE_IRQS_OFF
 	TRACE_IRQS_OFF
 	movl	%esp, %eax			# pt_regs pointer
 	movl	%esp, %eax			# pt_regs pointer
-	call	*%edi
+	CALL_NOSPEC %edi
 	jmp	ret_from_exception
 	jmp	ret_from_exception
 END(common_exception)
 END(common_exception)
 
 

+ 9 - 3
arch/x86/entry/entry_64.S

@@ -37,6 +37,7 @@
 #include <asm/pgtable_types.h>
 #include <asm/pgtable_types.h>
 #include <asm/export.h>
 #include <asm/export.h>
 #include <asm/frame.h>
 #include <asm/frame.h>
+#include <asm/nospec-branch.h>
 #include <linux/err.h>
 #include <linux/err.h>
 
 
 #include "calling.h"
 #include "calling.h"
@@ -191,7 +192,7 @@ ENTRY(entry_SYSCALL_64_trampoline)
 	 */
 	 */
 	pushq	%rdi
 	pushq	%rdi
 	movq	$entry_SYSCALL_64_stage2, %rdi
 	movq	$entry_SYSCALL_64_stage2, %rdi
-	jmp	*%rdi
+	JMP_NOSPEC %rdi
 END(entry_SYSCALL_64_trampoline)
 END(entry_SYSCALL_64_trampoline)
 
 
 	.popsection
 	.popsection
@@ -270,7 +271,12 @@ entry_SYSCALL_64_fastpath:
 	 * It might end up jumping to the slow path.  If it jumps, RAX
 	 * It might end up jumping to the slow path.  If it jumps, RAX
 	 * and all argument registers are clobbered.
 	 * and all argument registers are clobbered.
 	 */
 	 */
+#ifdef CONFIG_RETPOLINE
+	movq	sys_call_table(, %rax, 8), %rax
+	call	__x86_indirect_thunk_rax
+#else
 	call	*sys_call_table(, %rax, 8)
 	call	*sys_call_table(, %rax, 8)
+#endif
 .Lentry_SYSCALL_64_after_fastpath_call:
 .Lentry_SYSCALL_64_after_fastpath_call:
 
 
 	movq	%rax, RAX(%rsp)
 	movq	%rax, RAX(%rsp)
@@ -442,7 +448,7 @@ ENTRY(stub_ptregs_64)
 	jmp	entry_SYSCALL64_slow_path
 	jmp	entry_SYSCALL64_slow_path
 
 
 1:
 1:
-	jmp	*%rax				/* Called from C */
+	JMP_NOSPEC %rax				/* Called from C */
 END(stub_ptregs_64)
 END(stub_ptregs_64)
 
 
 .macro ptregs_stub func
 .macro ptregs_stub func
@@ -521,7 +527,7 @@ ENTRY(ret_from_fork)
 1:
 1:
 	/* kernel thread */
 	/* kernel thread */
 	movq	%r12, %rdi
 	movq	%r12, %rdi
-	call	*%rbx
+	CALL_NOSPEC %rbx
 	/*
 	/*
 	 * A kernel thread is allowed to return here after successfully
 	 * A kernel thread is allowed to return here after successfully
 	 * calling do_execve().  Exit to userspace to complete the execve()
 	 * calling do_execve().  Exit to userspace to complete the execve()

+ 18 - 0
arch/x86/events/intel/bts.c

@@ -582,6 +582,24 @@ static __init int bts_init(void)
 	if (!boot_cpu_has(X86_FEATURE_DTES64) || !x86_pmu.bts)
 	if (!boot_cpu_has(X86_FEATURE_DTES64) || !x86_pmu.bts)
 		return -ENODEV;
 		return -ENODEV;
 
 
+	if (boot_cpu_has(X86_FEATURE_PTI)) {
+		/*
+		 * BTS hardware writes through a virtual memory map; we must
+		 * either use the kernel physical map, or the user mapping of
+		 * the AUX buffer.
+		 *
+		 * However, since this driver supports per-CPU and per-task inherit,
+		 * we cannot use the user mapping since it will not be available
+		 * if we're not running the owning process.
+		 *
+		 * With PTI we can't use the kernel map either, because it's not
+		 * there when we run userspace.
+		 *
+		 * For now, disable this driver when using PTI.
+		 */
+		return -ENODEV;
+	}
+
 	bts_pmu.capabilities	= PERF_PMU_CAP_AUX_NO_SG | PERF_PMU_CAP_ITRACE |
 	bts_pmu.capabilities	= PERF_PMU_CAP_AUX_NO_SG | PERF_PMU_CAP_ITRACE |
 				  PERF_PMU_CAP_EXCLUSIVE;
 				  PERF_PMU_CAP_EXCLUSIVE;
 	bts_pmu.task_ctx_nr	= perf_sw_context;
 	bts_pmu.task_ctx_nr	= perf_sw_context;

+ 25 - 0
arch/x86/include/asm/asm-prototypes.h

@@ -11,7 +11,32 @@
 #include <asm/pgtable.h>
 #include <asm/pgtable.h>
 #include <asm/special_insns.h>
 #include <asm/special_insns.h>
 #include <asm/preempt.h>
 #include <asm/preempt.h>
+#include <asm/asm.h>
 
 
 #ifndef CONFIG_X86_CMPXCHG64
 #ifndef CONFIG_X86_CMPXCHG64
 extern void cmpxchg8b_emu(void);
 extern void cmpxchg8b_emu(void);
 #endif
 #endif
+
+#ifdef CONFIG_RETPOLINE
+#ifdef CONFIG_X86_32
+#define INDIRECT_THUNK(reg) extern asmlinkage void __x86_indirect_thunk_e ## reg(void);
+#else
+#define INDIRECT_THUNK(reg) extern asmlinkage void __x86_indirect_thunk_r ## reg(void);
+INDIRECT_THUNK(8)
+INDIRECT_THUNK(9)
+INDIRECT_THUNK(10)
+INDIRECT_THUNK(11)
+INDIRECT_THUNK(12)
+INDIRECT_THUNK(13)
+INDIRECT_THUNK(14)
+INDIRECT_THUNK(15)
+#endif
+INDIRECT_THUNK(ax)
+INDIRECT_THUNK(bx)
+INDIRECT_THUNK(cx)
+INDIRECT_THUNK(dx)
+INDIRECT_THUNK(si)
+INDIRECT_THUNK(di)
+INDIRECT_THUNK(bp)
+INDIRECT_THUNK(sp)
+#endif /* CONFIG_RETPOLINE */

+ 4 - 0
arch/x86/include/asm/cpufeatures.h

@@ -203,6 +203,8 @@
 #define X86_FEATURE_PROC_FEEDBACK	( 7*32+ 9) /* AMD ProcFeedbackInterface */
 #define X86_FEATURE_SME			( 7*32+10) /* AMD Secure Memory Encryption */
 #define X86_FEATURE_PTI			( 7*32+11) /* Kernel Page Table Isolation enabled */
+#define X86_FEATURE_RETPOLINE		( 7*32+12) /* Generic Retpoline mitigation for Spectre variant 2 */
+#define X86_FEATURE_RETPOLINE_AMD	( 7*32+13) /* AMD Retpoline mitigation for Spectre variant 2 */
 #define X86_FEATURE_INTEL_PPIN		( 7*32+14) /* Intel Processor Inventory Number */
 #define X86_FEATURE_INTEL_PT		( 7*32+15) /* Intel Processor Trace */
 #define X86_FEATURE_AVX512_4VNNIW	( 7*32+16) /* AVX-512 Neural Network Instructions */
@@ -342,5 +344,7 @@
 #define X86_BUG_MONITOR			X86_BUG(12) /* IPI required to wake up remote CPU */
 #define X86_BUG_AMD_E400		X86_BUG(13) /* CPU is among the affected by Erratum 400 */
 #define X86_BUG_CPU_MELTDOWN		X86_BUG(14) /* CPU is affected by meltdown attack and needs kernel page table isolation */
+#define X86_BUG_SPECTRE_V1		X86_BUG(15) /* CPU is affected by Spectre variant 1 attack with conditional branches */
+#define X86_BUG_SPECTRE_V2		X86_BUG(16) /* CPU is affected by Spectre variant 2 attack with indirect branches */
 
 #endif /* _ASM_X86_CPUFEATURES_H */

+ 10 - 8
arch/x86/include/asm/mshyperv.h

@@ -7,6 +7,7 @@
 #include <linux/nmi.h>
 #include <asm/io.h>
 #include <asm/hyperv.h>
+#include <asm/nospec-branch.h>
 
 /*
  * The below CPUID leaves are present if VersionAndFeatures.HypervisorPresent
@@ -186,10 +187,11 @@ static inline u64 hv_do_hypercall(u64 control, void *input, void *output)
 		return U64_MAX;
 
 	__asm__ __volatile__("mov %4, %%r8\n"
-			     "call *%5"
+			     CALL_NOSPEC
 			     : "=a" (hv_status), ASM_CALL_CONSTRAINT,
 			       "+c" (control), "+d" (input_address)
-			     :  "r" (output_address), "m" (hv_hypercall_pg)
+			     :  "r" (output_address),
+				THUNK_TARGET(hv_hypercall_pg)
 			     : "cc", "memory", "r8", "r9", "r10", "r11");
 #else
 	u32 input_address_hi = upper_32_bits(input_address);
@@ -200,13 +202,13 @@ static inline u64 hv_do_hypercall(u64 control, void *input, void *output)
 	if (!hv_hypercall_pg)
 		return U64_MAX;
 
-	__asm__ __volatile__("call *%7"
+	__asm__ __volatile__(CALL_NOSPEC
 			     : "=A" (hv_status),
 			       "+c" (input_address_lo), ASM_CALL_CONSTRAINT
 			     : "A" (control),
 			       "b" (input_address_hi),
 			       "D"(output_address_hi), "S"(output_address_lo),
-			       "m" (hv_hypercall_pg)
+			       THUNK_TARGET(hv_hypercall_pg)
 			     : "cc", "memory");
 #endif /* !x86_64 */
 	return hv_status;
@@ -227,10 +229,10 @@ static inline u64 hv_do_fast_hypercall8(u16 code, u64 input1)
 
 #ifdef CONFIG_X86_64
 	{
-		__asm__ __volatile__("call *%4"
+		__asm__ __volatile__(CALL_NOSPEC
 				     : "=a" (hv_status), ASM_CALL_CONSTRAINT,
 				       "+c" (control), "+d" (input1)
-				     : "m" (hv_hypercall_pg)
+				     : THUNK_TARGET(hv_hypercall_pg)
 				     : "cc", "r8", "r9", "r10", "r11");
 	}
 #else
@@ -238,13 +240,13 @@ static inline u64 hv_do_fast_hypercall8(u16 code, u64 input1)
 		u32 input1_hi = upper_32_bits(input1);
 		u32 input1_lo = lower_32_bits(input1);
 
-		__asm__ __volatile__ ("call *%5"
+		__asm__ __volatile__ (CALL_NOSPEC
 				      : "=A"(hv_status),
 					"+c"(input1_lo),
 					ASM_CALL_CONSTRAINT
 				      :	"A" (control),
 					"b" (input1_hi),
-					"m" (hv_hypercall_pg)
+					THUNK_TARGET(hv_hypercall_pg)
 				      : "cc", "edi", "esi");
 	}
 #endif

+ 3 - 0
arch/x86/include/asm/msr-index.h

@@ -355,6 +355,9 @@
 #define FAM10H_MMIO_CONF_BASE_MASK	0xfffffffULL
 #define FAM10H_MMIO_CONF_BASE_SHIFT	20
 #define MSR_FAM10H_NODE_ID		0xc001100c
+#define MSR_F10H_DECFG			0xc0011029
+#define MSR_F10H_DECFG_LFENCE_SERIALIZE_BIT	1
+#define MSR_F10H_DECFG_LFENCE_SERIALIZE		BIT_ULL(MSR_F10H_DECFG_LFENCE_SERIALIZE_BIT)
 
 /* K8 MSRs */
 #define MSR_K8_TOP_MEM1			0xc001001a

+ 214 - 0
arch/x86/include/asm/nospec-branch.h

@@ -0,0 +1,214 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+#ifndef __NOSPEC_BRANCH_H__
+#define __NOSPEC_BRANCH_H__
+
+#include <asm/alternative.h>
+#include <asm/alternative-asm.h>
+#include <asm/cpufeatures.h>
+
+/*
+ * Fill the CPU return stack buffer.
+ *
+ * Each entry in the RSB, if used for a speculative 'ret', contains an
+ * infinite 'pause; jmp' loop to capture speculative execution.
+ *
+ * This is required in various cases for retpoline and IBRS-based
+ * mitigations for the Spectre variant 2 vulnerability. Sometimes to
+ * eliminate potentially bogus entries from the RSB, and sometimes
+ * purely to ensure that it doesn't get empty, which on some CPUs would
+ * allow predictions from other (unwanted!) sources to be used.
+ *
+ * We define a CPP macro such that it can be used from both .S files and
+ * inline assembly. It's possible to do a .macro and then include that
+ * from C via asm(".include <asm/nospec-branch.h>") but let's not go there.
+ */
+
+#define RSB_CLEAR_LOOPS		32	/* To forcibly overwrite all entries */
+#define RSB_FILL_LOOPS		16	/* To avoid underflow */
+
+/*
+ * Google experimented with loop-unrolling and this turned out to be
+ * the optimal version — two calls, each with their own speculation
+ * trap should their return address end up getting used, in a loop.
+ */
+#define __FILL_RETURN_BUFFER(reg, nr, sp)	\
+	mov	$(nr/2), reg;			\
+771:						\
+	call	772f;				\
+773:	/* speculation trap */			\
+	pause;					\
+	jmp	773b;				\
+772:						\
+	call	774f;				\
+775:	/* speculation trap */			\
+	pause;					\
+	jmp	775b;				\
+774:						\
+	dec	reg;				\
+	jnz	771b;				\
+	add	$(BITS_PER_LONG/8) * nr, sp;
+
+#ifdef __ASSEMBLY__
+
+/*
+ * This should be used immediately before a retpoline alternative.  It tells
+ * objtool where the retpolines are so that it can make sense of the control
+ * flow by just reading the original instruction(s) and ignoring the
+ * alternatives.
+ */
+.macro ANNOTATE_NOSPEC_ALTERNATIVE
+	.Lannotate_\@:
+	.pushsection .discard.nospec
+	.long .Lannotate_\@ - .
+	.popsection
+.endm
+
+/*
+ * These are the bare retpoline primitives for indirect jmp and call.
+ * Do not use these directly; they only exist to make the ALTERNATIVE
+ * invocation below less ugly.
+ */
+.macro RETPOLINE_JMP reg:req
+	call	.Ldo_rop_\@
+.Lspec_trap_\@:
+	pause
+	jmp	.Lspec_trap_\@
+.Ldo_rop_\@:
+	mov	\reg, (%_ASM_SP)
+	ret
+.endm
+
+/*
+ * This is a wrapper around RETPOLINE_JMP so the called function in reg
+ * returns to the instruction after the macro.
+ */
+.macro RETPOLINE_CALL reg:req
+	jmp	.Ldo_call_\@
+.Ldo_retpoline_jmp_\@:
+	RETPOLINE_JMP \reg
+.Ldo_call_\@:
+	call	.Ldo_retpoline_jmp_\@
+.endm
+
+/*
+ * JMP_NOSPEC and CALL_NOSPEC macros can be used instead of a simple
+ * indirect jmp/call which may be susceptible to the Spectre variant 2
+ * attack.
+ */
+.macro JMP_NOSPEC reg:req
+#ifdef CONFIG_RETPOLINE
+	ANNOTATE_NOSPEC_ALTERNATIVE
+	ALTERNATIVE_2 __stringify(jmp *\reg),				\
+		__stringify(RETPOLINE_JMP \reg), X86_FEATURE_RETPOLINE,	\
+		__stringify(lfence; jmp *\reg), X86_FEATURE_RETPOLINE_AMD
+#else
+	jmp	*\reg
+#endif
+.endm
+
+.macro CALL_NOSPEC reg:req
+#ifdef CONFIG_RETPOLINE
+	ANNOTATE_NOSPEC_ALTERNATIVE
+	ALTERNATIVE_2 __stringify(call *\reg),				\
+		__stringify(RETPOLINE_CALL \reg), X86_FEATURE_RETPOLINE,\
+		__stringify(lfence; call *\reg), X86_FEATURE_RETPOLINE_AMD
+#else
+	call	*\reg
+#endif
+.endm
+
+ /*
+  * A simpler FILL_RETURN_BUFFER macro. Don't make people use the CPP
+  * monstrosity above, manually.
+  */
+.macro FILL_RETURN_BUFFER reg:req nr:req ftr:req
+#ifdef CONFIG_RETPOLINE
+	ANNOTATE_NOSPEC_ALTERNATIVE
+	ALTERNATIVE "jmp .Lskip_rsb_\@",				\
+		__stringify(__FILL_RETURN_BUFFER(\reg,\nr,%_ASM_SP))	\
+		\ftr
+.Lskip_rsb_\@:
+#endif
+.endm
+
+#else /* __ASSEMBLY__ */
+
+#define ANNOTATE_NOSPEC_ALTERNATIVE				\
+	"999:\n\t"						\
+	".pushsection .discard.nospec\n\t"			\
+	".long 999b - .\n\t"					\
+	".popsection\n\t"
+
+#if defined(CONFIG_X86_64) && defined(RETPOLINE)
+
+/*
+ * Since the inline asm uses the %V modifier which is only in newer GCC,
+ * the 64-bit one is dependent on RETPOLINE not CONFIG_RETPOLINE.
+ */
+# define CALL_NOSPEC						\
+	ANNOTATE_NOSPEC_ALTERNATIVE				\
+	ALTERNATIVE(						\
+	"call *%[thunk_target]\n",				\
+	"call __x86_indirect_thunk_%V[thunk_target]\n",		\
+	X86_FEATURE_RETPOLINE)
+# define THUNK_TARGET(addr) [thunk_target] "r" (addr)
+
+#elif defined(CONFIG_X86_32) && defined(CONFIG_RETPOLINE)
+/*
+ * For i386 we use the original ret-equivalent retpoline, because
+ * otherwise we'll run out of registers. We don't care about CET
+ * here, anyway.
+ */
+# define CALL_NOSPEC ALTERNATIVE("call *%[thunk_target]\n",	\
+	"       jmp    904f;\n"					\
+	"       .align 16\n"					\
+	"901:	call   903f;\n"					\
+	"902:	pause;\n"					\
+	"       jmp    902b;\n"					\
+	"       .align 16\n"					\
+	"903:	addl   $4, %%esp;\n"				\
+	"       pushl  %[thunk_target];\n"			\
+	"       ret;\n"						\
+	"       .align 16\n"					\
+	"904:	call   901b;\n",				\
+	X86_FEATURE_RETPOLINE)
+
+# define THUNK_TARGET(addr) [thunk_target] "rm" (addr)
+#else /* No retpoline for C / inline asm */
+# define CALL_NOSPEC "call *%[thunk_target]\n"
+# define THUNK_TARGET(addr) [thunk_target] "rm" (addr)
+#endif
+
+/* The Spectre V2 mitigation variants */
+enum spectre_v2_mitigation {
+	SPECTRE_V2_NONE,
+	SPECTRE_V2_RETPOLINE_MINIMAL,
+	SPECTRE_V2_RETPOLINE_MINIMAL_AMD,
+	SPECTRE_V2_RETPOLINE_GENERIC,
+	SPECTRE_V2_RETPOLINE_AMD,
+	SPECTRE_V2_IBRS,
+};
+
+/*
+ * On VMEXIT we must ensure that no RSB predictions learned in the guest
+ * can be followed in the host, by overwriting the RSB completely. Both
+ * retpoline and IBRS mitigations for Spectre v2 need this; only on future
+ * CPUs with IBRS_ATT *might* it be avoided.
+ */
+static inline void vmexit_fill_RSB(void)
+{
+#ifdef CONFIG_RETPOLINE
+	unsigned long loops = RSB_CLEAR_LOOPS / 2;
+
+	asm volatile (ANNOTATE_NOSPEC_ALTERNATIVE
+		      ALTERNATIVE("jmp 910f",
+				  __stringify(__FILL_RETURN_BUFFER(%0, RSB_CLEAR_LOOPS, %1)),
+				  X86_FEATURE_RETPOLINE)
+		      "910:"
+		      : "=&r" (loops), ASM_CALL_CONSTRAINT
+		      : "r" (loops) : "memory" );
+#endif
+}
+#endif /* __ASSEMBLY__ */
+#endif /* __NOSPEC_BRANCH_H__ */
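
A minimal usage sketch of the C-side CALL_NOSPEC/THUNK_TARGET pair defined above (hypothetical call site, 64-bit kernel and SysV register usage assumed; the real conversions in this series are the hv_do_hypercall(), privcmd_call() and call_on_stack() hunks below). An indirect "call *%[thunk_target]" becomes CALL_NOSPEC, and the pointer goes through THUNK_TARGET() so the retpoline thunk can be patched in when X86_FEATURE_RETPOLINE is set:

	#include <asm/asm.h>
	#include <asm/nospec-branch.h>

	/* Hypothetical example, not part of the patch. */
	static unsigned long example_indirect_call(unsigned long (*fn)(void))
	{
		unsigned long ret;

		/* Was: asm volatile("call *%[thunk_target]" ...) */
		asm volatile(CALL_NOSPEC
			     : "=a" (ret), ASM_CALL_CONSTRAINT
			     : THUNK_TARGET(fn)
			     : "memory", "cc", "rcx", "rdx", "rsi", "rdi",
			       "r8", "r9", "r10", "r11");
		return ret;
	}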

+ 1 - 0
arch/x86/include/asm/pci_x86.h

@@ -38,6 +38,7 @@ do {						\
 #define PCI_NOASSIGN_ROMS	0x80000
 #define PCI_ROOT_NO_CRS		0x100000
 #define PCI_NOASSIGN_BARS	0x200000
+#define PCI_BIG_ROOT_WINDOW	0x400000
 
 extern unsigned int pci_probe;
 extern unsigned long pirq_table_addr;

+ 1 - 1
arch/x86/include/asm/processor-flags.h

@@ -40,7 +40,7 @@
 #define CR3_NOFLUSH	BIT_ULL(63)
 
 #ifdef CONFIG_PAGE_TABLE_ISOLATION
-# define X86_CR3_PTI_SWITCH_BIT	11
+# define X86_CR3_PTI_PCID_USER_BIT	11
 #endif
 
 #else

+ 3 - 3
arch/x86/include/asm/tlbflush.h

@@ -81,13 +81,13 @@ static inline u16 kern_pcid(u16 asid)
 	 * Make sure that the dynamic ASID space does not conflict with the
 	 * bit we are using to switch between user and kernel ASIDs.
 	 */
-	BUILD_BUG_ON(TLB_NR_DYN_ASIDS >= (1 << X86_CR3_PTI_SWITCH_BIT));
+	BUILD_BUG_ON(TLB_NR_DYN_ASIDS >= (1 << X86_CR3_PTI_PCID_USER_BIT));
 
 	/*
 	 * The ASID being passed in here should have respected the
 	 * MAX_ASID_AVAILABLE and thus never have the switch bit set.
 	 */
-	VM_WARN_ON_ONCE(asid & (1 << X86_CR3_PTI_SWITCH_BIT));
+	VM_WARN_ON_ONCE(asid & (1 << X86_CR3_PTI_PCID_USER_BIT));
 #endif
 	/*
 	 * The dynamically-assigned ASIDs that get passed in are small
@@ -112,7 +112,7 @@ static inline u16 user_pcid(u16 asid)
 {
 	u16 ret = kern_pcid(asid);
 #ifdef CONFIG_PAGE_TABLE_ISOLATION
-	ret |= 1 << X86_CR3_PTI_SWITCH_BIT;
+	ret |= 1 << X86_CR3_PTI_PCID_USER_BIT;
 #endif
 	return ret;
 }

+ 3 - 2
arch/x86/include/asm/xen/hypercall.h

@@ -44,6 +44,7 @@
 #include <asm/page.h>
 #include <asm/pgtable.h>
 #include <asm/smap.h>
+#include <asm/nospec-branch.h>
 
 #include <xen/interface/xen.h>
 #include <xen/interface/sched.h>
@@ -217,9 +218,9 @@ privcmd_call(unsigned call,
 	__HYPERCALL_5ARG(a1, a2, a3, a4, a5);
 
 	stac();
-	asm volatile("call *%[call]"
+	asm volatile(CALL_NOSPEC
 		     : __HYPERCALL_5PARAM
-		     : [call] "a" (&hypercall_page[call])
+		     : [thunk_target] "a" (&hypercall_page[call])
 		     : __HYPERCALL_CLOBBER5);
 	clac();
 

+ 5 - 2
arch/x86/kernel/alternative.c

@@ -344,9 +344,12 @@ done:
 static void __init_or_module noinline optimize_nops(struct alt_instr *a, u8 *instr)
 {
 	unsigned long flags;
+	int i;
 
-	if (instr[0] != 0x90)
-		return;
+	for (i = 0; i < a->padlen; i++) {
+		if (instr[i] != 0x90)
+			return;
+	}
 
 	local_irq_save(flags);
 	add_nops(instr + (a->instrlen - a->padlen), a->padlen);

+ 26 - 2
arch/x86/kernel/cpu/amd.c

@@ -829,8 +829,32 @@ static void init_amd(struct cpuinfo_x86 *c)
 		set_cpu_cap(c, X86_FEATURE_K8);
 
 	if (cpu_has(c, X86_FEATURE_XMM2)) {
-		/* MFENCE stops RDTSC speculation */
-		set_cpu_cap(c, X86_FEATURE_MFENCE_RDTSC);
+		unsigned long long val;
+		int ret;
+
+		/*
+		 * A serializing LFENCE has less overhead than MFENCE, so
+		 * use it for execution serialization.  On families which
+		 * don't have that MSR, LFENCE is already serializing.
+		 * msr_set_bit() uses the safe accessors, too, even if the MSR
+		 * is not present.
+		 */
+		msr_set_bit(MSR_F10H_DECFG,
+			    MSR_F10H_DECFG_LFENCE_SERIALIZE_BIT);
+
+		/*
+		 * Verify that the MSR write was successful (could be running
+		 * under a hypervisor) and only then assume that LFENCE is
+		 * serializing.
+		 */
+		ret = rdmsrl_safe(MSR_F10H_DECFG, &val);
+		if (!ret && (val & MSR_F10H_DECFG_LFENCE_SERIALIZE)) {
+			/* A serializing LFENCE stops RDTSC speculation */
+			set_cpu_cap(c, X86_FEATURE_LFENCE_RDTSC);
+		} else {
+			/* MFENCE stops RDTSC speculation */
+			set_cpu_cap(c, X86_FEATURE_MFENCE_RDTSC);
+		}
 	}
 
 	/*
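
A hedged userspace sketch for double-checking the effect of the init_amd() change above on an AMD system (assumes the msr driver is loaded and root privileges; MSR_F10H_DECFG and its LFENCE-serialize bit are the constants added to msr-index.h earlier in this diff):

	#include <fcntl.h>
	#include <stdint.h>
	#include <stdio.h>
	#include <unistd.h>

	int main(void)
	{
		uint64_t val;
		int fd = open("/dev/cpu/0/msr", O_RDONLY);

		/* pread() at an offset equal to the MSR number reads that MSR on CPU 0 */
		if (fd < 0 || pread(fd, &val, sizeof(val), 0xc0011029) != sizeof(val))
			return 1;
		printf("LFENCE serializing: %s\n", (val & (1ULL << 1)) ? "yes" : "no");
		close(fd);
		return 0;
	}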

+ 185 - 0
arch/x86/kernel/cpu/bugs.c

@@ -10,6 +10,10 @@
  */
 #include <linux/init.h>
 #include <linux/utsname.h>
+#include <linux/cpu.h>
+
+#include <asm/nospec-branch.h>
+#include <asm/cmdline.h>
 #include <asm/bugs.h>
 #include <asm/processor.h>
 #include <asm/processor-flags.h>
@@ -20,6 +24,8 @@
 #include <asm/pgtable.h>
 #include <asm/set_memory.h>
 
+static void __init spectre_v2_select_mitigation(void);
+
 void __init check_bugs(void)
 {
 	identify_boot_cpu();
@@ -29,6 +35,9 @@ void __init check_bugs(void)
 		print_cpu_info(&boot_cpu_data);
 	}
 
+	/* Select the proper spectre mitigation before patching alternatives */
+	spectre_v2_select_mitigation();
+
 #ifdef CONFIG_X86_32
 	/*
 	 * Check whether we are able to run this kernel safely on SMP.
@@ -60,3 +69,179 @@ void __init check_bugs(void)
 		set_memory_4k((unsigned long)__va(0), 1);
 #endif
 }
+
+/* The kernel command line selection */
+enum spectre_v2_mitigation_cmd {
+	SPECTRE_V2_CMD_NONE,
+	SPECTRE_V2_CMD_AUTO,
+	SPECTRE_V2_CMD_FORCE,
+	SPECTRE_V2_CMD_RETPOLINE,
+	SPECTRE_V2_CMD_RETPOLINE_GENERIC,
+	SPECTRE_V2_CMD_RETPOLINE_AMD,
+};
+
+static const char *spectre_v2_strings[] = {
+	[SPECTRE_V2_NONE]			= "Vulnerable",
+	[SPECTRE_V2_RETPOLINE_MINIMAL]		= "Vulnerable: Minimal generic ASM retpoline",
+	[SPECTRE_V2_RETPOLINE_MINIMAL_AMD]	= "Vulnerable: Minimal AMD ASM retpoline",
+	[SPECTRE_V2_RETPOLINE_GENERIC]		= "Mitigation: Full generic retpoline",
+	[SPECTRE_V2_RETPOLINE_AMD]		= "Mitigation: Full AMD retpoline",
+};
+
+#undef pr_fmt
+#define pr_fmt(fmt)     "Spectre V2 mitigation: " fmt
+
+static enum spectre_v2_mitigation spectre_v2_enabled = SPECTRE_V2_NONE;
+
+static void __init spec2_print_if_insecure(const char *reason)
+{
+	if (boot_cpu_has_bug(X86_BUG_SPECTRE_V2))
+		pr_info("%s\n", reason);
+}
+
+static void __init spec2_print_if_secure(const char *reason)
+{
+	if (!boot_cpu_has_bug(X86_BUG_SPECTRE_V2))
+		pr_info("%s\n", reason);
+}
+
+static inline bool retp_compiler(void)
+{
+	return __is_defined(RETPOLINE);
+}
+
+static inline bool match_option(const char *arg, int arglen, const char *opt)
+{
+	int len = strlen(opt);
+
+	return len == arglen && !strncmp(arg, opt, len);
+}
+
+static enum spectre_v2_mitigation_cmd __init spectre_v2_parse_cmdline(void)
+{
+	char arg[20];
+	int ret;
+
+	ret = cmdline_find_option(boot_command_line, "spectre_v2", arg,
+				  sizeof(arg));
+	if (ret > 0)  {
+		if (match_option(arg, ret, "off")) {
+			goto disable;
+		} else if (match_option(arg, ret, "on")) {
+			spec2_print_if_secure("force enabled on command line.");
+			return SPECTRE_V2_CMD_FORCE;
+		} else if (match_option(arg, ret, "retpoline")) {
+			spec2_print_if_insecure("retpoline selected on command line.");
+			return SPECTRE_V2_CMD_RETPOLINE;
+		} else if (match_option(arg, ret, "retpoline,amd")) {
+			if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD) {
+				pr_err("retpoline,amd selected but CPU is not AMD. Switching to AUTO select\n");
+				return SPECTRE_V2_CMD_AUTO;
+			}
+			spec2_print_if_insecure("AMD retpoline selected on command line.");
+			return SPECTRE_V2_CMD_RETPOLINE_AMD;
+		} else if (match_option(arg, ret, "retpoline,generic")) {
+			spec2_print_if_insecure("generic retpoline selected on command line.");
+			return SPECTRE_V2_CMD_RETPOLINE_GENERIC;
+		} else if (match_option(arg, ret, "auto")) {
+			return SPECTRE_V2_CMD_AUTO;
+		}
+	}
+
+	if (!cmdline_find_option_bool(boot_command_line, "nospectre_v2"))
+		return SPECTRE_V2_CMD_AUTO;
+disable:
+	spec2_print_if_insecure("disabled on command line.");
+	return SPECTRE_V2_CMD_NONE;
+}
+
+static void __init spectre_v2_select_mitigation(void)
+{
+	enum spectre_v2_mitigation_cmd cmd = spectre_v2_parse_cmdline();
+	enum spectre_v2_mitigation mode = SPECTRE_V2_NONE;
+
+	/*
+	 * If the CPU is not affected and the command line mode is NONE or AUTO
+	 * then nothing to do.
+	 */
+	if (!boot_cpu_has_bug(X86_BUG_SPECTRE_V2) &&
+	    (cmd == SPECTRE_V2_CMD_NONE || cmd == SPECTRE_V2_CMD_AUTO))
+		return;
+
+	switch (cmd) {
+	case SPECTRE_V2_CMD_NONE:
+		return;
+
+	case SPECTRE_V2_CMD_FORCE:
+		/* FALLTHRU */
+	case SPECTRE_V2_CMD_AUTO:
+		goto retpoline_auto;
+
+	case SPECTRE_V2_CMD_RETPOLINE_AMD:
+		if (IS_ENABLED(CONFIG_RETPOLINE))
+			goto retpoline_amd;
+		break;
+	case SPECTRE_V2_CMD_RETPOLINE_GENERIC:
+		if (IS_ENABLED(CONFIG_RETPOLINE))
+			goto retpoline_generic;
+		break;
+	case SPECTRE_V2_CMD_RETPOLINE:
+		if (IS_ENABLED(CONFIG_RETPOLINE))
+			goto retpoline_auto;
+		break;
+	}
+	pr_err("kernel not compiled with retpoline; no mitigation available!");
+	return;
+
+retpoline_auto:
+	if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD) {
+	retpoline_amd:
+		if (!boot_cpu_has(X86_FEATURE_LFENCE_RDTSC)) {
+			pr_err("LFENCE not serializing. Switching to generic retpoline\n");
+			goto retpoline_generic;
+		}
+		mode = retp_compiler() ? SPECTRE_V2_RETPOLINE_AMD :
+					 SPECTRE_V2_RETPOLINE_MINIMAL_AMD;
+		setup_force_cpu_cap(X86_FEATURE_RETPOLINE_AMD);
+		setup_force_cpu_cap(X86_FEATURE_RETPOLINE);
+	} else {
+	retpoline_generic:
+		mode = retp_compiler() ? SPECTRE_V2_RETPOLINE_GENERIC :
+					 SPECTRE_V2_RETPOLINE_MINIMAL;
+		setup_force_cpu_cap(X86_FEATURE_RETPOLINE);
+	}
+
+	spectre_v2_enabled = mode;
+	pr_info("%s\n", spectre_v2_strings[mode]);
+}
+
+#undef pr_fmt
+
+#ifdef CONFIG_SYSFS
+ssize_t cpu_show_meltdown(struct device *dev,
+			  struct device_attribute *attr, char *buf)
+{
+	if (!boot_cpu_has_bug(X86_BUG_CPU_MELTDOWN))
+		return sprintf(buf, "Not affected\n");
+	if (boot_cpu_has(X86_FEATURE_PTI))
+		return sprintf(buf, "Mitigation: PTI\n");
+	return sprintf(buf, "Vulnerable\n");
+}
+
+ssize_t cpu_show_spectre_v1(struct device *dev,
+			    struct device_attribute *attr, char *buf)
+{
+	if (!boot_cpu_has_bug(X86_BUG_SPECTRE_V1))
+		return sprintf(buf, "Not affected\n");
+	return sprintf(buf, "Vulnerable\n");
+}
+
+ssize_t cpu_show_spectre_v2(struct device *dev,
+			    struct device_attribute *attr, char *buf)
+{
+	if (!boot_cpu_has_bug(X86_BUG_SPECTRE_V2))
+		return sprintf(buf, "Not affected\n");
+
+	return sprintf(buf, "%s\n", spectre_v2_strings[spectre_v2_enabled]);
+}
+#endif
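
The cpu_show_*() handlers above back the per-vulnerability sysfs files; a small usage sketch, assuming the standard sysfs mount point and path:

	#include <stdio.h>

	int main(void)
	{
		char buf[80];
		FILE *f = fopen("/sys/devices/system/cpu/vulnerabilities/spectre_v2", "r");

		if (!f)
			return 1;
		/* Prints one of the spectre_v2_strings[] entries above,
		 * e.g. "Mitigation: Full generic retpoline". */
		if (fgets(buf, sizeof(buf), f))
			fputs(buf, stdout);
		fclose(f);
		return 0;
	}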

+ 3 - 0
arch/x86/kernel/cpu/common.c

@@ -926,6 +926,9 @@ static void __init early_identify_cpu(struct cpuinfo_x86 *c)
 	if (c->x86_vendor != X86_VENDOR_AMD)
 		setup_force_cpu_bug(X86_BUG_CPU_MELTDOWN);
 
+	setup_force_cpu_bug(X86_BUG_SPECTRE_V1);
+	setup_force_cpu_bug(X86_BUG_SPECTRE_V2);
+
 	fpu__init_system(c);
 
 #ifdef CONFIG_X86_32

+ 11 - 2
arch/x86/kernel/cpu/microcode/intel.c

@@ -910,8 +910,17 @@ static bool is_blacklisted(unsigned int cpu)
 {
 	struct cpuinfo_x86 *c = &cpu_data(cpu);
 
-	if (c->x86 == 6 && c->x86_model == INTEL_FAM6_BROADWELL_X) {
-		pr_err_once("late loading on model 79 is disabled.\n");
+	/*
+	 * Late loading on model 79 with microcode revision less than 0x0b000021
+	 * may result in a system hang. This behavior is documented in item
+	 * BDF90, #334165 (Intel Xeon Processor E7-8800/4800 v4 Product Family).
+	 */
+	if (c->x86 == 6 &&
+	    c->x86_model == INTEL_FAM6_BROADWELL_X &&
+	    c->x86_mask == 0x01 &&
+	    c->microcode < 0x0b000021) {
+		pr_err_once("Erratum BDF90: late loading with revision < 0x0b000021 (0x%x) disabled.\n", c->microcode);
+		pr_err_once("Please consider either early loading through initrd/built-in or a potential BIOS update.\n");
 		return true;
 	}
 

+ 4 - 2
arch/x86/kernel/ftrace_32.S

@@ -8,6 +8,7 @@
 #include <asm/segment.h>
 #include <asm/export.h>
 #include <asm/ftrace.h>
+#include <asm/nospec-branch.h>
 
 #ifdef CC_USING_FENTRY
 # define function_hook	__fentry__
@@ -197,7 +198,8 @@ ftrace_stub:
 	movl	0x4(%ebp), %edx
 	subl	$MCOUNT_INSN_SIZE, %eax
 
-	call	*ftrace_trace_function
+	movl	ftrace_trace_function, %ecx
+	CALL_NOSPEC %ecx
 
 	popl	%edx
 	popl	%ecx
@@ -241,5 +243,5 @@ return_to_handler:
 	movl	%eax, %ecx
 	popl	%edx
 	popl	%eax
-	jmp	*%ecx
+	JMP_NOSPEC %ecx
 #endif

+ 4 - 4
arch/x86/kernel/ftrace_64.S

@@ -7,7 +7,7 @@
 #include <asm/ptrace.h>
 #include <asm/ftrace.h>
 #include <asm/export.h>
-
+#include <asm/nospec-branch.h>
 
 	.code64
 	.section .entry.text, "ax"
@@ -286,8 +286,8 @@ trace:
 	 * ip and parent ip are used and the list function is called when
 	 * function tracing is enabled.
 	 */
-	call   *ftrace_trace_function
-
+	movq ftrace_trace_function, %r8
+	CALL_NOSPEC %r8
 	restore_mcount_regs
 
 	jmp fgraph_trace
@@ -329,5 +329,5 @@ GLOBAL(return_to_handler)
 	movq 8(%rsp), %rdx
 	movq (%rsp), %rax
 	addq $24, %rsp
-	jmp *%rdi
+	JMP_NOSPEC %rdi
 #endif

+ 5 - 4
arch/x86/kernel/irq_32.c

@@ -20,6 +20,7 @@
 #include <linux/mm.h>
 
 #include <asm/apic.h>
+#include <asm/nospec-branch.h>
 
 #ifdef CONFIG_DEBUG_STACKOVERFLOW
 
@@ -55,11 +56,11 @@ DEFINE_PER_CPU(struct irq_stack *, softirq_stack);
 static void call_on_stack(void *func, void *stack)
 {
 	asm volatile("xchgl	%%ebx,%%esp	\n"
-		     "call	*%%edi		\n"
+		     CALL_NOSPEC
 		     "movl	%%ebx,%%esp	\n"
 		     : "=b" (stack)
 		     : "0" (stack),
-		       "D"(func)
+		       [thunk_target] "D"(func)
 		     : "memory", "cc", "edx", "ecx", "eax");
 }
 
@@ -95,11 +96,11 @@ static inline int execute_on_irq_stack(int overflow, struct irq_desc *desc)
 		call_on_stack(print_stack_overflow, isp);
 
 	asm volatile("xchgl	%%ebx,%%esp	\n"
-		     "call	*%%edi		\n"
+		     CALL_NOSPEC
 		     "movl	%%ebx,%%esp	\n"
 		     : "=a" (arg1), "=b" (isp)
 		     :  "0" (desc),   "1" (isp),
-			"D" (desc->handle_irq)
+			[thunk_target] "D" (desc->handle_irq)
 		     : "memory", "cc", "ecx");
 	return 1;
 }

+ 11 - 0
arch/x86/kernel/tboot.c

@@ -138,6 +138,17 @@ static int map_tboot_page(unsigned long vaddr, unsigned long pfn,
 		return -1;
 	set_pte_at(&tboot_mm, vaddr, pte, pfn_pte(pfn, prot));
 	pte_unmap(pte);
+
+	/*
+	 * PTI poisons low addresses in the kernel page tables in the
+	 * name of making them unusable for userspace.  To execute
+	 * code at such a low address, the poison must be cleared.
+	 *
+	 * Note: 'pgd' actually gets set in p4d_alloc() _or_
+	 * pud_alloc() depending on 4/5-level paging.
+	 */
+	pgd->pgd &= ~_PAGE_NX;
+
 	return 0;
 }
 

+ 12 - 7
arch/x86/kvm/mmu.c

@@ -3781,7 +3781,8 @@ static int kvm_arch_setup_async_pf(struct kvm_vcpu *vcpu, gva_t gva, gfn_t gfn)
 bool kvm_can_do_async_pf(struct kvm_vcpu *vcpu)
 {
 	if (unlikely(!lapic_in_kernel(vcpu) ||
-		     kvm_event_needs_reinjection(vcpu)))
+		     kvm_event_needs_reinjection(vcpu) ||
+		     vcpu->arch.exception.pending))
 		return false;
 
 	if (!vcpu->arch.apf.delivery_as_pf_vmexit && is_guest_mode(vcpu))
@@ -5465,30 +5466,34 @@ static void mmu_destroy_caches(void)
 
 int kvm_mmu_module_init(void)
 {
+	int ret = -ENOMEM;
+
 	kvm_mmu_clear_all_pte_masks();
 
 	pte_list_desc_cache = kmem_cache_create("pte_list_desc",
 					    sizeof(struct pte_list_desc),
 					    0, SLAB_ACCOUNT, NULL);
 	if (!pte_list_desc_cache)
-		goto nomem;
+		goto out;
 
 	mmu_page_header_cache = kmem_cache_create("kvm_mmu_page_header",
 						  sizeof(struct kvm_mmu_page),
 						  0, SLAB_ACCOUNT, NULL);
 	if (!mmu_page_header_cache)
-		goto nomem;
+		goto out;
 
 	if (percpu_counter_init(&kvm_total_used_mmu_pages, 0, GFP_KERNEL))
-		goto nomem;
+		goto out;
 
-	register_shrinker(&mmu_shrinker);
+	ret = register_shrinker(&mmu_shrinker);
+	if (ret)
+		goto out;
 
 	return 0;
 
-nomem:
+out:
 	mmu_destroy_caches();
-	return -ENOMEM;
+	return ret;
 }
 
 /*

+ 5 - 8
arch/x86/kvm/svm.c

@@ -45,6 +45,7 @@
 #include <asm/debugreg.h>
 #include <asm/kvm_para.h>
 #include <asm/irq_remapping.h>
+#include <asm/nospec-branch.h>
 
 #include <asm/virtext.h>
 #include "trace.h"
@@ -361,7 +362,6 @@ static void recalc_intercepts(struct vcpu_svm *svm)
 {
 	struct vmcb_control_area *c, *h;
 	struct nested_state *g;
-	u32 h_intercept_exceptions;
 
 	mark_dirty(svm->vmcb, VMCB_INTERCEPTS);
 
@@ -372,14 +372,9 @@ static void recalc_intercepts(struct vcpu_svm *svm)
 	h = &svm->nested.hsave->control;
 	g = &svm->nested;
 
-	/* No need to intercept #UD if L1 doesn't intercept it */
-	h_intercept_exceptions =
-		h->intercept_exceptions & ~(1U << UD_VECTOR);
-
 	c->intercept_cr = h->intercept_cr | g->intercept_cr;
 	c->intercept_dr = h->intercept_dr | g->intercept_dr;
-	c->intercept_exceptions =
-		h_intercept_exceptions | g->intercept_exceptions;
+	c->intercept_exceptions = h->intercept_exceptions | g->intercept_exceptions;
 	c->intercept = h->intercept | g->intercept;
 }
 
@@ -2202,7 +2197,6 @@ static int ud_interception(struct vcpu_svm *svm)
 {
 	int er;
 
-	WARN_ON_ONCE(is_guest_mode(&svm->vcpu));
 	er = emulate_instruction(&svm->vcpu, EMULTYPE_TRAP_UD);
 	if (er == EMULATE_USER_EXIT)
 		return 0;
@@ -5034,6 +5028,9 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu)
 #endif
 		);
 
+	/* Eliminate branch target predictions from guest mode */
+	vmexit_fill_RSB();
+
 #ifdef CONFIG_X86_64
 	wrmsrl(MSR_GS_BASE, svm->host.gs_base);
 #else

+ 15 - 6
arch/x86/kvm/vmx.c

@@ -50,6 +50,7 @@
 #include <asm/apic.h>
 #include <asm/irq_remapping.h>
 #include <asm/mmu_context.h>
+#include <asm/nospec-branch.h>
 
 #include "trace.h"
 #include "pmu.h"
@@ -899,8 +900,16 @@ static inline short vmcs_field_to_offset(unsigned long field)
 {
 	BUILD_BUG_ON(ARRAY_SIZE(vmcs_field_to_offset_table) > SHRT_MAX);
 
-	if (field >= ARRAY_SIZE(vmcs_field_to_offset_table) ||
-	    vmcs_field_to_offset_table[field] == 0)
+	if (field >= ARRAY_SIZE(vmcs_field_to_offset_table))
+		return -ENOENT;
+
+	/*
+	 * FIXME: Mitigation for CVE-2017-5753.  To be replaced with a
+	 * generic mechanism.
+	 */
+	asm("lfence");
+
+	if (vmcs_field_to_offset_table[field] == 0)
 		return -ENOENT;
 
 	return vmcs_field_to_offset_table[field];
@@ -1887,7 +1896,7 @@ static void update_exception_bitmap(struct kvm_vcpu *vcpu)
 {
 	u32 eb;
 
-	eb = (1u << PF_VECTOR) | (1u << MC_VECTOR) |
+	eb = (1u << PF_VECTOR) | (1u << UD_VECTOR) | (1u << MC_VECTOR) |
 	     (1u << DB_VECTOR) | (1u << AC_VECTOR);
 	if ((vcpu->guest_debug &
 	     (KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_SW_BP)) ==
@@ -1905,8 +1914,6 @@ static void update_exception_bitmap(struct kvm_vcpu *vcpu)
 	 */
 	if (is_guest_mode(vcpu))
 		eb |= get_vmcs12(vcpu)->exception_bitmap;
-	else
-		eb |= 1u << UD_VECTOR;
 
 	vmcs_write32(EXCEPTION_BITMAP, eb);
 }
@@ -5917,7 +5924,6 @@ static int handle_exception(struct kvm_vcpu *vcpu)
 		return 1;  /* already handled by vmx_vcpu_run() */
 
 	if (is_invalid_opcode(intr_info)) {
-		WARN_ON_ONCE(is_guest_mode(vcpu));
 		er = emulate_instruction(vcpu, EMULTYPE_TRAP_UD);
 		if (er == EMULATE_USER_EXIT)
 			return 0;
@@ -9485,6 +9491,9 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
 #endif
 	      );
 
+	/* Eliminate branch target predictions from guest mode */
+	vmexit_fill_RSB();
+
 	/* MSR_IA32_DEBUGCTLMSR is zeroed on vmexit. Restore it if needed */
 	if (debugctlmsr)
 		update_debugctlmsr(debugctlmsr);
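
The lfence added to vmcs_field_to_offset() above is a stopgap Spectre variant 1 barrier: it keeps a mispredicted bounds check from speculatively indexing the offset table with a guest-controlled field. A generic, hedged illustration of the same pattern (not a kernel API; the FIXME's "generic mechanism" is what would eventually replace it):

	#include <stddef.h>

	/* Returns table[idx], or -1 if idx is out of range.  The lfence keeps
	 * speculative execution from running ahead with an out-of-bounds idx. */
	static inline int lookup_serialized(const short *table, size_t size, size_t idx)
	{
		if (idx >= size)
			return -1;
		asm volatile("lfence" ::: "memory");	/* speculation barrier */
		return table[idx];
	}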

+ 1 - 0
arch/x86/lib/Makefile

@@ -26,6 +26,7 @@ lib-y += memcpy_$(BITS).o
 lib-$(CONFIG_RWSEM_XCHGADD_ALGORITHM) += rwsem.o
 lib-$(CONFIG_INSTRUCTION_DECODER) += insn.o inat.o insn-eval.o
 lib-$(CONFIG_RANDOMIZE_BASE) += kaslr.o
+lib-$(CONFIG_RETPOLINE) += retpoline.o
 
 obj-y += msr.o msr-reg.o msr-reg-export.o hweight.o
 

+ 4 - 3
arch/x86/lib/checksum_32.S

@@ -29,7 +29,8 @@
 #include <asm/errno.h>
 #include <asm/asm.h>
 #include <asm/export.h>
-				
+#include <asm/nospec-branch.h>
+
 /*
  * computes a partial checksum, e.g. for TCP/UDP fragments
  */
@@ -156,7 +157,7 @@ ENTRY(csum_partial)
 	negl %ebx
 	lea 45f(%ebx,%ebx,2), %ebx
 	testl %esi, %esi
-	jmp *%ebx
+	JMP_NOSPEC %ebx
 
 	# Handle 2-byte-aligned regions
 20:	addw (%esi), %ax
@@ -439,7 +440,7 @@ ENTRY(csum_partial_copy_generic)
 	andl $-32,%edx
 	lea 3f(%ebx,%ebx), %ebx
 	testl %esi, %esi 
-	jmp *%ebx
+	JMP_NOSPEC %ebx
 1:	addl $64,%esi
 	addl $64,%edi 
 	SRC(movb -32(%edx),%bl)	; SRC(movb (%edx),%bl)

+ 48 - 0
arch/x86/lib/retpoline.S

@@ -0,0 +1,48 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+#include <linux/stringify.h>
+#include <linux/linkage.h>
+#include <asm/dwarf2.h>
+#include <asm/cpufeatures.h>
+#include <asm/alternative-asm.h>
+#include <asm/export.h>
+#include <asm/nospec-branch.h>
+
+.macro THUNK reg
+	.section .text.__x86.indirect_thunk.\reg
+
+ENTRY(__x86_indirect_thunk_\reg)
+	CFI_STARTPROC
+	JMP_NOSPEC %\reg
+	CFI_ENDPROC
+ENDPROC(__x86_indirect_thunk_\reg)
+.endm
+
+/*
+ * Despite being an assembler file we can't just use .irp here
+ * because __KSYM_DEPS__ only uses the C preprocessor and would
+ * only see one instance of "__x86_indirect_thunk_\reg" rather
+ * than one per register with the correct names. So we do it
+ * the simple and nasty way...
+ */
+#define EXPORT_THUNK(reg) EXPORT_SYMBOL(__x86_indirect_thunk_ ## reg)
+#define GENERATE_THUNK(reg) THUNK reg ; EXPORT_THUNK(reg)
+
+GENERATE_THUNK(_ASM_AX)
+GENERATE_THUNK(_ASM_BX)
+GENERATE_THUNK(_ASM_CX)
+GENERATE_THUNK(_ASM_DX)
+GENERATE_THUNK(_ASM_SI)
+GENERATE_THUNK(_ASM_DI)
+GENERATE_THUNK(_ASM_BP)
+GENERATE_THUNK(_ASM_SP)
+#ifdef CONFIG_64BIT
+GENERATE_THUNK(r8)
+GENERATE_THUNK(r9)
+GENERATE_THUNK(r10)
+GENERATE_THUNK(r11)
+GENERATE_THUNK(r12)
+GENERATE_THUNK(r13)
+GENERATE_THUNK(r14)
+GENERATE_THUNK(r15)
+#endif

+ 6 - 26
arch/x86/mm/pti.c

@@ -149,7 +149,7 @@ pgd_t __pti_set_user_pgd(pgd_t *pgdp, pgd_t pgd)
  *
  * Returns a pointer to a P4D on success, or NULL on failure.
  */
-static p4d_t *pti_user_pagetable_walk_p4d(unsigned long address)
+static __init p4d_t *pti_user_pagetable_walk_p4d(unsigned long address)
 {
 	pgd_t *pgd = kernel_to_user_pgdp(pgd_offset_k(address));
 	gfp_t gfp = (GFP_KERNEL | __GFP_NOTRACK | __GFP_ZERO);
@@ -164,12 +164,7 @@ static p4d_t *pti_user_pagetable_walk_p4d(unsigned long address)
 		if (!new_p4d_page)
 			return NULL;
 
-		if (pgd_none(*pgd)) {
-			set_pgd(pgd, __pgd(_KERNPG_TABLE | __pa(new_p4d_page)));
-			new_p4d_page = 0;
-		}
-		if (new_p4d_page)
-			free_page(new_p4d_page);
+		set_pgd(pgd, __pgd(_KERNPG_TABLE | __pa(new_p4d_page)));
 	}
 	BUILD_BUG_ON(pgd_large(*pgd) != 0);
 
@@ -182,7 +177,7 @@ static p4d_t *pti_user_pagetable_walk_p4d(unsigned long address)
 *
 * Returns a pointer to a PMD on success, or NULL on failure.
 */
-static pmd_t *pti_user_pagetable_walk_pmd(unsigned long address)
+static __init pmd_t *pti_user_pagetable_walk_pmd(unsigned long address)
 {
 	gfp_t gfp = (GFP_KERNEL | __GFP_NOTRACK | __GFP_ZERO);
 	p4d_t *p4d = pti_user_pagetable_walk_p4d(address);
@@ -194,12 +189,7 @@ static pmd_t *pti_user_pagetable_walk_pmd(unsigned long address)
 		if (!new_pud_page)
 			return NULL;
 
-		if (p4d_none(*p4d)) {
-			set_p4d(p4d, __p4d(_KERNPG_TABLE | __pa(new_pud_page)));
-			new_pud_page = 0;
-		}
-		if (new_pud_page)
-			free_page(new_pud_page);
+		set_p4d(p4d, __p4d(_KERNPG_TABLE | __pa(new_pud_page)));
 	}
 
 	pud = pud_offset(p4d, address);
@@ -213,12 +203,7 @@ static pmd_t *pti_user_pagetable_walk_pmd(unsigned long address)
 		if (!new_pmd_page)
 			return NULL;
 
-		if (pud_none(*pud)) {
-			set_pud(pud, __pud(_KERNPG_TABLE | __pa(new_pmd_page)));
-			new_pmd_page = 0;
-		}
-		if (new_pmd_page)
-			free_page(new_pmd_page);
+		set_pud(pud, __pud(_KERNPG_TABLE | __pa(new_pmd_page)));
 	}
 
 	return pmd_offset(pud, address);
@@ -251,12 +236,7 @@ static __init pte_t *pti_user_pagetable_walk_pte(unsigned long address)
 		if (!new_pte_page)
 			return NULL;
 
-		if (pmd_none(*pmd)) {
-			set_pmd(pmd, __pmd(_KERNPG_TABLE | __pa(new_pte_page)));
-			new_pte_page = 0;
-		}
-		if (new_pte_page)
-			free_page(new_pte_page);
+		set_pmd(pmd, __pmd(_KERNPG_TABLE | __pa(new_pte_page)));
 	}
 
 	pte = pte_offset_kernel(pmd, address);

+ 5 - 0
arch/x86/pci/common.c

@@ -594,6 +594,11 @@ char *__init pcibios_setup(char *str)
 	} else if (!strcmp(str, "nocrs")) {
 		pci_probe |= PCI_ROOT_NO_CRS;
 		return NULL;
+#ifdef CONFIG_PHYS_ADDR_T_64BIT
+	} else if (!strcmp(str, "big_root_window")) {
+		pci_probe |= PCI_BIG_ROOT_WINDOW;
+		return NULL;
+#endif
 	} else if (!strcmp(str, "earlydump")) {
 		pci_early_dump_regs = 1;
 		return NULL;

+ 18 - 11
arch/x86/pci/fixup.c

@@ -662,10 +662,14 @@ DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_INTEL, 0x2033, quirk_no_aersid);
  */
 static void pci_amd_enable_64bit_bar(struct pci_dev *dev)
 {
-	unsigned i;
 	u32 base, limit, high;
-	struct resource *res, *conflict;
 	struct pci_dev *other;
+	struct resource *res;
+	unsigned i;
+	int r;
+
+	if (!(pci_probe & PCI_BIG_ROOT_WINDOW))
+		return;
 
 	/* Check that we are the only device of that type */
 	other = pci_get_device(dev->vendor, dev->device, NULL);
@@ -699,22 +703,25 @@ static void pci_amd_enable_64bit_bar(struct pci_dev *dev)
 	if (!res)
 		return;
 
+	/*
+	 * Allocate a 256GB window directly below the 0xfd00000000 hardware
+	 * limit (see AMD Family 15h Models 30h-3Fh BKDG, sec 2.4.6).
+	 */
 	res->name = "PCI Bus 0000:00";
 	res->flags = IORESOURCE_PREFETCH | IORESOURCE_MEM |
 		IORESOURCE_MEM_64 | IORESOURCE_WINDOW;
-	res->start = 0x100000000ull;
+	res->start = 0xbd00000000ull;
 	res->end = 0xfd00000000ull - 1;
 
-	/* Just grab the free area behind system memory for this */
-	while ((conflict = request_resource_conflict(&iomem_resource, res))) {
-		if (conflict->end >= res->end) {
-			kfree(res);
-			return;
-		}
-		res->start = conflict->end + 1;
+	r = request_resource(&iomem_resource, res);
+	if (r) {
+		kfree(res);
+		return;
 	}
 
-	dev_info(&dev->dev, "adding root bus resource %pR\n", res);
+	dev_info(&dev->dev, "adding root bus resource %pR (tainting kernel)\n",
+		 res);
+	add_taint(TAINT_FIRMWARE_WORKAROUND, LOCKDEP_STILL_OK);
 
 	base = ((res->start >> 8) & AMD_141b_MMIO_BASE_MMIOBASE_MASK) |
 		AMD_141b_MMIO_BASE_RE_MASK | AMD_141b_MMIO_BASE_WE_MASK;

+ 2 - 0
arch/x86/platform/efi/efi_64.c

@@ -135,7 +135,9 @@ pgd_t * __init efi_call_phys_prolog(void)
 				pud[j] = *pud_offset(p4d_k, vaddr);
 			}
 		}
+		pgd_offset_k(pgd * PGDIR_SIZE)->pgd &= ~_PAGE_NX;
 	}
+
 out:
 	__flush_tlb_all();
 

+ 1 - 1
arch/x86/platform/intel-mid/device_libs/platform_bt.c

@@ -60,7 +60,7 @@ static int __init tng_bt_sfi_setup(struct bt_sfi_data *ddata)
 	return 0;
 }
 
-static const struct bt_sfi_data tng_bt_sfi_data __initdata = {
+static struct bt_sfi_data tng_bt_sfi_data __initdata = {
 	.setup	= tng_bt_sfi_setup,
 };
 

+ 3 - 5
arch/x86/xen/mmu_pv.c

@@ -1325,20 +1325,18 @@ static void xen_flush_tlb_others(const struct cpumask *cpus,
 {
 	struct {
 		struct mmuext_op op;
-#ifdef CONFIG_SMP
-		DECLARE_BITMAP(mask, num_processors);
-#else
 		DECLARE_BITMAP(mask, NR_CPUS);
-#endif
 	} *args;
 	struct multicall_space mcs;
+	const size_t mc_entry_size = sizeof(args->op) +
+		sizeof(args->mask[0]) * BITS_TO_LONGS(num_possible_cpus());
 
 	trace_xen_mmu_flush_tlb_others(cpus, info->mm, info->start, info->end);
 
 	if (cpumask_empty(cpus))
 		return;		/* nothing to do */
 
-	mcs = xen_mc_entry(sizeof(*args));
+	mcs = xen_mc_entry(mc_entry_size);
 	args = mcs.args;
 	args->op.arg2.vcpumask = to_cpumask(args->mask);
 

+ 1 - 1
arch/x86/xen/xen-ops.h

@@ -72,7 +72,7 @@ u64 xen_clocksource_read(void);
 void xen_setup_cpu_clockevents(void);
 void xen_save_time_memory_area(void);
 void xen_restore_time_memory_area(void);
-void __init xen_init_time_ops(void);
+void __ref xen_init_time_ops(void);
 void __init xen_hvm_init_time_ops(void);
 
 irqreturn_t xen_debug_interrupt(int irq, void *dev_id);

+ 7 - 2
block/blk-core.c

@@ -562,6 +562,13 @@ static void __blk_drain_queue(struct request_queue *q, bool drain_all)
 	}
 }
 
+void blk_drain_queue(struct request_queue *q)
+{
+	spin_lock_irq(q->queue_lock);
+	__blk_drain_queue(q, true);
+	spin_unlock_irq(q->queue_lock);
+}
+
 /**
  * blk_queue_bypass_start - enter queue bypass mode
  * @q: queue of interest
@@ -689,8 +696,6 @@ void blk_cleanup_queue(struct request_queue *q)
 	 */
 	blk_freeze_queue(q);
 	spin_lock_irq(lock);
-	if (!q->mq_ops)
-		__blk_drain_queue(q, true);
 	queue_flag_set(QUEUE_FLAG_DEAD, q);
 	spin_unlock_irq(lock);
 

+ 2 - 0
block/blk-mq.c

@@ -161,6 +161,8 @@ void blk_freeze_queue(struct request_queue *q)
 	 * exported to drivers as the only user for unfreeze is blk_mq.
 	 */
 	blk_freeze_queue_start(q);
+	if (!q->mq_ops)
+		blk_drain_queue(q);
 	blk_mq_freeze_queue_wait(q);
 }
 

Some files were not shown because too many files changed in this diff