
Merge tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm

Pull KVM updates from Paolo Bonzini:
 "One of the largest releases for KVM...  Hardly any generic
  changes, but lots of architecture-specific updates.

  ARM:
   - VHE support so that we can run the kernel at EL2 on ARMv8.1 systems
   - PMU support for guests
   - 32bit world switch rewritten in C
   - various optimizations to the vgic save/restore code.

  PPC:
   - enabled KVM-VFIO integration ("VFIO device")
   - optimizations to speed up IPIs between vcpus
   - in-kernel handling of IOMMU hypercalls
   - support for dynamic DMA windows (DDW).

  s390:
   - provide the floating point registers via sync regs;
   - separated instruction vs.  data accesses
   - dirty log improvements for huge guests
   - bugfixes and documentation improvements.

  x86:
   - Hyper-V VMBus hypercall userspace exit
   - alternative implementation of lowest-priority interrupts using
     vector hashing (for better VT-d posted interrupt support)
   - fixed guest debugging with nested virtualizations
   - improved interrupt tracking in the in-kernel IOAPIC
   - generic infrastructure for tracking writes to guest
     memory - currently its only use is to speedup the legacy shadow
     paging (pre-EPT) case, but in the future it will be used for
     virtual GPUs as well
   - much cleanup (LAPIC, kvmclock, MMU, PIT), including ubsan fixes"

* tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm: (217 commits)
  KVM: x86: remove eager_fpu field of struct kvm_vcpu_arch
  KVM: x86: disable MPX if host did not enable MPX XSAVE features
  arm64: KVM: vgic-v3: Only wipe LRs on vcpu exit
  arm64: KVM: vgic-v3: Reset LRs at boot time
  arm64: KVM: vgic-v3: Do not save an LR known to be empty
  arm64: KVM: vgic-v3: Save maintenance interrupt state only if required
  arm64: KVM: vgic-v3: Avoid accessing ICH registers
  KVM: arm/arm64: vgic-v2: Make GICD_SGIR quicker to hit
  KVM: arm/arm64: vgic-v2: Only wipe LRs on vcpu exit
  KVM: arm/arm64: vgic-v2: Reset LRs at boot time
  KVM: arm/arm64: vgic-v2: Do not save an LR known to be empty
  KVM: arm/arm64: vgic-v2: Move GICH_ELRSR saving to its own function
  KVM: arm/arm64: vgic-v2: Save maintenance interrupt state only if required
  KVM: arm/arm64: vgic-v2: Avoid accessing GICH registers
  KVM: s390: allocate only one DMA page per VM
  KVM: s390: enable STFLE interpretation only if enabled for the guest
  KVM: s390: wake up when the VCPU cpu timer expires
  KVM: s390: step the VCPU timer while in enabled wait
  KVM: s390: protect VCPU cpu timer with a seqcount
  KVM: s390: step VCPU cpu timer during kvm_run ioctl
  ...
Linus Torvalds, 9 years ago
parent
commit 10dc374766
100 files changed, 4594 insertions(+), 2201 deletions(-)
  1. Documentation/virtual/kvm/api.txt (+95, -4)
  2. Documentation/virtual/kvm/devices/s390_flic.txt (+2, -0)
  3. Documentation/virtual/kvm/devices/vcpu.txt (+33, -0)
  4. Documentation/virtual/kvm/devices/vm.txt (+52, -0)
  5. Documentation/virtual/kvm/mmu.txt (+3, -3)
  6. arch/arm/include/asm/kvm_asm.h (+3, -38)
  7. arch/arm/include/asm/kvm_emulate.h (+10, -10)
  8. arch/arm/include/asm/kvm_host.h (+70, -10)
  9. arch/arm/include/asm/kvm_hyp.h (+139, -0)
  10. arch/arm/include/asm/kvm_mmu.h (+1, -1)
  11. arch/arm/include/asm/virt.h (+9, -0)
  12. arch/arm/kernel/asm-offsets.c (+5, -35)
  13. arch/arm/kernel/vmlinux.lds.S (+6, -0)
  14. arch/arm/kvm/Makefile (+1, -0)
  15. arch/arm/kvm/arm.c (+182, -62)
  16. arch/arm/kvm/coproc.c (+70, -56)
  17. arch/arm/kvm/coproc.h (+12, -12)
  18. arch/arm/kvm/emulate.c (+17, -17)
  19. arch/arm/kvm/guest.c (+2, -3)
  20. arch/arm/kvm/handle_exit.c (+0, -7)
  21. arch/arm/kvm/hyp/Makefile (+17, -0)
  22. arch/arm/kvm/hyp/banked-sr.c (+77, -0)
  23. arch/arm/kvm/hyp/cp15-sr.c (+84, -0)
  24. arch/arm/kvm/hyp/entry.S (+101, -0)
  25. arch/arm/kvm/hyp/hyp-entry.S (+169, -0)
  26. arch/arm/kvm/hyp/s2-setup.c (+33, -0)
  27. arch/arm/kvm/hyp/switch.c (+232, -0)
  28. arch/arm/kvm/hyp/tlb.c (+70, -0)
  29. arch/arm/kvm/hyp/vfp.S (+68, -0)
  30. arch/arm/kvm/init.S (+0, -8)
  31. arch/arm/kvm/interrupts.S (+3, -477)
  32. arch/arm/kvm/interrupts_head.S (+0, -648)
  33. arch/arm/kvm/mmu.c (+23, -0)
  34. arch/arm/kvm/reset.c (+1, -1)
  35. arch/arm64/Kconfig (+13, -0)
  36. arch/arm64/include/asm/cpufeature.h (+5, -1)
  37. arch/arm64/include/asm/hw_breakpoint.h (+13, -5)
  38. arch/arm64/include/asm/kvm_arm.h (+5, -1)
  39. arch/arm64/include/asm/kvm_asm.h (+3, -3)
  40. arch/arm64/include/asm/kvm_emulate.h (+8, -0)
  41. arch/arm64/include/asm/kvm_host.h (+33, -1)
  42. arch/arm64/include/asm/kvm_hyp.h (+181, -0)
  43. arch/arm64/include/asm/kvm_mmu.h (+11, -1)
  44. arch/arm64/include/asm/kvm_perf_event.h (+68, -0)
  45. arch/arm64/include/asm/virt.h (+10, -0)
  46. arch/arm64/include/uapi/asm/kvm.h (+6, -0)
  47. arch/arm64/kernel/asm-offsets.c (+0, -3)
  48. arch/arm64/kernel/cpufeature.c (+11, -0)
  49. arch/arm64/kernel/head.S (+27, -1)
  50. arch/arm64/kernel/perf_event.c (+5, -1)
  51. arch/arm64/kvm/Kconfig (+7, -0)
  52. arch/arm64/kvm/Makefile (+1, -0)
  53. arch/arm64/kvm/guest.c (+51, -0)
  54. arch/arm64/kvm/hyp-init.S (+1, -14)
  55. arch/arm64/kvm/hyp.S (+7, -0)
  56. arch/arm64/kvm/hyp/Makefile (+6, -2)
  57. arch/arm64/kvm/hyp/debug-sr.c (+1, -3)
  58. arch/arm64/kvm/hyp/entry.S (+6, -0)
  59. arch/arm64/kvm/hyp/hyp-entry.S (+36, -73)
  60. arch/arm64/kvm/hyp/hyp.h (+0, -90)
  61. arch/arm64/kvm/hyp/s2-setup.c (+43, -0)
  62. arch/arm64/kvm/hyp/switch.c (+186, -20)
  63. arch/arm64/kvm/hyp/sysreg-sr.c (+98, -51)
  64. arch/arm64/kvm/hyp/tlb.c (+1, -1)
  65. arch/arm64/kvm/hyp/vgic-v2-sr.c (+0, -84)
  66. arch/arm64/kvm/hyp/vgic-v3-sr.c (+225, -116)
  67. arch/arm64/kvm/reset.c (+7, -0)
  68. arch/arm64/kvm/sys_regs.c (+562, -47)
  69. arch/powerpc/include/asm/kvm_book3s_64.h (+0, -2)
  70. arch/powerpc/include/asm/kvm_host.h (+4, -1)
  71. arch/powerpc/include/asm/kvm_ppc.h (+50, -1)
  72. arch/powerpc/include/asm/pgtable.h (+3, -0)
  73. arch/powerpc/include/asm/smp.h (+4, -0)
  74. arch/powerpc/include/asm/xics.h (+1, -0)
  75. arch/powerpc/include/uapi/asm/kvm.h (+9, -0)
  76. arch/powerpc/kernel/smp.c (+23, -5)
  77. arch/powerpc/kvm/Makefile (+1, -1)
  78. arch/powerpc/kvm/book3s.c (+1, -1)
  79. arch/powerpc/kvm/book3s_64_vio.c (+136, -20)
  80. arch/powerpc/kvm/book3s_64_vio_hv.c (+288, -42)
  81. arch/powerpc/kvm/book3s_hv.c (+191, -1)
  82. arch/powerpc/kvm/book3s_hv_builtin.c (+3, -0)
  83. arch/powerpc/kvm/book3s_hv_rm_xics.c (+128, -3)
  84. arch/powerpc/kvm/book3s_hv_rmhandlers.S (+2, -2)
  85. arch/powerpc/kvm/book3s_pr_papr.c (+35, -0)
  86. arch/powerpc/kvm/powerpc.c (+37, -1)
  87. arch/powerpc/mm/pgtable.c (+8, -0)
  88. arch/powerpc/perf/hv-24x7.c (+0, -8)
  89. arch/powerpc/sysdev/xics/icp-native.c (+21, -0)
  90. arch/s390/include/asm/kvm_host.h (+26, -15)
  91. arch/s390/include/uapi/asm/kvm.h (+6, -2)
  92. arch/s390/include/uapi/asm/sie.h (+1, -0)
  93. arch/s390/kvm/gaccess.c (+30, -27)
  94. arch/s390/kvm/gaccess.h (+32, -6)
  95. arch/s390/kvm/intercept.c (+47, -31)
  96. arch/s390/kvm/interrupt.c (+55, -38)
  97. arch/s390/kvm/kvm-s390.c (+172, -63)
  98. arch/s390/kvm/kvm-s390.h (+25, -3)
  99. arch/s390/kvm/priv.c (+8, -7)
  100. arch/x86/include/asm/kvm_host.h (+20, -11)

+ 95 - 4
Documentation/virtual/kvm/api.txt

@@ -2507,8 +2507,9 @@ struct kvm_create_device {
 
 4.80 KVM_SET_DEVICE_ATTR/KVM_GET_DEVICE_ATTR
 
-Capability: KVM_CAP_DEVICE_CTRL, KVM_CAP_VM_ATTRIBUTES for vm device
-Type: device ioctl, vm ioctl
+Capability: KVM_CAP_DEVICE_CTRL, KVM_CAP_VM_ATTRIBUTES for vm device,
+  KVM_CAP_VCPU_ATTRIBUTES for vcpu device
+Type: device ioctl, vm ioctl, vcpu ioctl
 Parameters: struct kvm_device_attr
 Returns: 0 on success, -1 on error
 Errors:
@@ -2533,8 +2534,9 @@ struct kvm_device_attr {
 
 4.81 KVM_HAS_DEVICE_ATTR
 
-Capability: KVM_CAP_DEVICE_CTRL, KVM_CAP_VM_ATTRIBUTES for vm device
-Type: device ioctl, vm ioctl
+Capability: KVM_CAP_DEVICE_CTRL, KVM_CAP_VM_ATTRIBUTES for vm device,
+  KVM_CAP_VCPU_ATTRIBUTES for vcpu device
+Type: device ioctl, vm ioctl, vcpu ioctl
 Parameters: struct kvm_device_attr
 Returns: 0 on success, -1 on error
 Errors:
@@ -2577,6 +2579,8 @@ Possible features:
 	  Depends on KVM_CAP_ARM_EL1_32BIT (arm64 only).
 	- KVM_ARM_VCPU_PSCI_0_2: Emulate PSCI v0.2 for the CPU.
 	  Depends on KVM_CAP_ARM_PSCI_0_2.
+	- KVM_ARM_VCPU_PMU_V3: Emulate PMUv3 for the CPU.
+	  Depends on KVM_CAP_ARM_PMU_V3.
 
 
 4.83 KVM_ARM_PREFERRED_TARGET
@@ -3035,6 +3039,87 @@ Returns: 0 on success, -1 on error
 
 Queues an SMI on the thread's vcpu.
 
+4.97 KVM_CAP_PPC_MULTITCE
+
+Capability: KVM_CAP_PPC_MULTITCE
+Architectures: ppc
+Type: vm
+
+This capability means the kernel is capable of handling hypercalls
+H_PUT_TCE_INDIRECT and H_STUFF_TCE without passing those into the user
+space. This significantly accelerates DMA operations for PPC KVM guests.
+User space should expect that its handlers for these hypercalls
+are not going to be called if user space previously registered LIOBN
+in KVM (via KVM_CREATE_SPAPR_TCE or similar calls).
+
+In order to enable H_PUT_TCE_INDIRECT and H_STUFF_TCE use in the guest,
+user space might have to advertise it for the guest. For example,
+IBM pSeries (sPAPR) guest starts using them if "hcall-multi-tce" is
+present in the "ibm,hypertas-functions" device-tree property.
+
+The hypercalls mentioned above may or may not be processed successfully
+in the kernel based fast path. If they can not be handled by the kernel,
+they will get passed on to user space. So user space still has to have
+an implementation for these despite the in kernel acceleration.
+
+This capability is always enabled.
+
+4.98 KVM_CREATE_SPAPR_TCE_64
+
+Capability: KVM_CAP_SPAPR_TCE_64
+Architectures: powerpc
+Type: vm ioctl
+Parameters: struct kvm_create_spapr_tce_64 (in)
+Returns: file descriptor for manipulating the created TCE table
+
+This is an extension for KVM_CAP_SPAPR_TCE which only supports 32bit
+windows, described in 4.62 KVM_CREATE_SPAPR_TCE
+
+This capability uses extended struct in ioctl interface:
+
+/* for KVM_CAP_SPAPR_TCE_64 */
+struct kvm_create_spapr_tce_64 {
+	__u64 liobn;
+	__u32 page_shift;
+	__u32 flags;
+	__u64 offset;	/* in pages */
+	__u64 size; 	/* in pages */
+};
+
+The aim of extension is to support an additional bigger DMA window with
+a variable page size.
+KVM_CREATE_SPAPR_TCE_64 receives a 64bit window size, an IOMMU page shift and
+a bus offset of the corresponding DMA window, @size and @offset are numbers
+of IOMMU pages.
+
+@flags are not used at the moment.
+
+The rest of functionality is identical to KVM_CREATE_SPAPR_TCE.
+
+4.98 KVM_REINJECT_CONTROL
+
+Capability: KVM_CAP_REINJECT_CONTROL
+Architectures: x86
+Type: vm ioctl
+Parameters: struct kvm_reinject_control (in)
+Returns: 0 on success,
+         -EFAULT if struct kvm_reinject_control cannot be read,
+         -ENXIO if KVM_CREATE_PIT or KVM_CREATE_PIT2 didn't succeed earlier.
+
+i8254 (PIT) has two modes, reinject and !reinject.  The default is reinject,
+where KVM queues elapsed i8254 ticks and monitors completion of interrupt from
+vector(s) that i8254 injects.  Reinject mode dequeues a tick and injects its
+interrupt whenever there isn't a pending interrupt from i8254.
+!reinject mode injects an interrupt as soon as a tick arrives.
+
+struct kvm_reinject_control {
+	__u8 pit_reinject;
+	__u8 reserved[31];
+};
+
+pit_reinject = 0 (!reinject mode) is recommended, unless running an old
+operating system that uses the PIT for timing (e.g. Linux 2.4.x).
+
 5. The kvm_run structure
 ------------------------
 
@@ -3339,6 +3424,7 @@ EOI was received.
 
 		struct kvm_hyperv_exit {
 #define KVM_EXIT_HYPERV_SYNIC          1
+#define KVM_EXIT_HYPERV_HCALL          2
 			__u32 type;
 			union {
 				struct {
@@ -3347,6 +3433,11 @@ EOI was received.
 					__u64 evt_page;
 					__u64 msg_page;
 				} synic;
+				struct {
+					__u64 input;
+					__u64 result;
+					__u64 params[2];
+				} hcall;
 			} u;
 		};
 		/* KVM_EXIT_HYPERV */
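
As a usage illustration for the new KVM_CREATE_SPAPR_TCE_64 section above, the sketch below shows how userspace might create a 64-bit DMA window. It is not part of the patch; vm_fd, the LIOBN and the window geometry are made-up example values.

#include <linux/kvm.h>
#include <sys/ioctl.h>

/* Illustration only: create a huge DMA window for a sPAPR guest.
 * vm_fd is an already-created VM file descriptor; all values are examples.
 */
static int create_ddw_example(int vm_fd)
{
	struct kvm_create_spapr_tce_64 args = {
		.liobn      = 0x80000001ULL,	/* example LIOBN */
		.page_shift = 16,		/* 64K IOMMU pages */
		.flags      = 0,		/* unused, must be 0 */
		.offset     = 1ULL << 43,	/* example bus offset, in IOMMU pages */
		.size       = 1ULL << 15,	/* window size, in IOMMU pages */
	};

	/* On success this returns a file descriptor for the new TCE table. */
	return ioctl(vm_fd, KVM_CREATE_SPAPR_TCE_64, &args);
}

The KVM_REINJECT_CONTROL section above is driven the same way: a struct kvm_reinject_control with pit_reinject set to 0 or 1 is passed to ioctl() on the VM file descriptor.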

+ 2 - 0
Documentation/virtual/kvm/devices/s390_flic.txt

@@ -88,6 +88,8 @@ struct kvm_s390_io_adapter_req {
       perform a gmap translation for the guest address provided in addr,
       pin a userspace page for the translated address and add it to the
       list of mappings
+      Note: A new mapping will be created unconditionally; therefore,
+            the calling code should avoid making duplicate mappings.
 
     KVM_S390_IO_ADAPTER_UNMAP
       release a userspace page for the translated address specified in addr

+ 33 - 0
Documentation/virtual/kvm/devices/vcpu.txt

@@ -0,0 +1,33 @@
+Generic vcpu interface
+====================================
+
+The virtual cpu "device" also accepts the ioctls KVM_SET_DEVICE_ATTR,
+KVM_GET_DEVICE_ATTR, and KVM_HAS_DEVICE_ATTR. The interface uses the same struct
+kvm_device_attr as other devices, but targets VCPU-wide settings and controls.
+
+The groups and attributes per virtual cpu, if any, are architecture specific.
+
+1. GROUP: KVM_ARM_VCPU_PMU_V3_CTRL
+Architectures: ARM64
+
+1.1. ATTRIBUTE: KVM_ARM_VCPU_PMU_V3_IRQ
+Parameters: in kvm_device_attr.addr the address for PMU overflow interrupt is a
+            pointer to an int
+Returns: -EBUSY: The PMU overflow interrupt is already set
+         -ENXIO: The overflow interrupt not set when attempting to get it
+         -ENODEV: PMUv3 not supported
+         -EINVAL: Invalid PMU overflow interrupt number supplied
+
+A value describing the PMUv3 (Performance Monitor Unit v3) overflow interrupt
+number for this vcpu. This interrupt could be a PPI or SPI, but the interrupt
+type must be same for each vcpu. As a PPI, the interrupt number is the same for
+all vcpus, while as an SPI it must be a separate number per vcpu.
+
+1.2 ATTRIBUTE: KVM_ARM_VCPU_PMU_V3_INIT
+Parameters: no additional parameter in kvm_device_attr.addr
+Returns: -ENODEV: PMUv3 not supported
+         -ENXIO: PMUv3 not properly configured as required prior to calling this
+                 attribute
+         -EBUSY: PMUv3 already initialized
+
+Request the initialization of the PMUv3.
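
To make the flow above concrete, here is a hedged userspace sketch of configuring the PMUv3 overflow interrupt and then initializing the PMU through the vcpu device-attr ioctls. It is not part of the patch; vcpu_fd and the PPI number 23 are illustrative assumptions.

#include <linux/kvm.h>
#include <sys/ioctl.h>
#include <stdint.h>

/* Illustration only: set the PMU overflow interrupt, then init the PMU. */
static int enable_guest_pmu(int vcpu_fd)
{
	int irq = 23;				/* example PPI; same for all vcpus */
	struct kvm_device_attr attr = {
		.group = KVM_ARM_VCPU_PMU_V3_CTRL,
		.attr  = KVM_ARM_VCPU_PMU_V3_IRQ,
		.addr  = (uint64_t)(uintptr_t)&irq,	/* pointer to an int, per 1.1 */
	};

	if (ioctl(vcpu_fd, KVM_SET_DEVICE_ATTR, &attr))
		return -1;			/* -EBUSY, -ENODEV, -EINVAL ... */

	attr.attr = KVM_ARM_VCPU_PMU_V3_INIT;	/* no payload, per 1.2 */
	attr.addr = 0;
	return ioctl(vcpu_fd, KVM_SET_DEVICE_ATTR, &attr);
}

The vcpu is also expected to have been created with the KVM_ARM_VCPU_PMU_V3 feature bit (see the api.txt hunk earlier in this series); otherwise the attributes fail as described above.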

+ 52 - 0
Documentation/virtual/kvm/devices/vm.txt

@@ -84,3 +84,55 @@ Returns:    -EBUSY in case 1 or more vcpus are already activated (only in write
 	    -EFAULT if the given address is not accessible from kernel space
 	    -ENOMEM if not enough memory is available to process the ioctl
 	    0 in case of success
+
+3. GROUP: KVM_S390_VM_TOD
+Architectures: s390
+
+3.1. ATTRIBUTE: KVM_S390_VM_TOD_HIGH
+
+Allows user space to set/get the TOD clock extension (u8).
+
+Parameters: address of a buffer in user space to store the data (u8) to
+Returns:    -EFAULT if the given address is not accessible from kernel space
+	    -EINVAL if setting the TOD clock extension to != 0 is not supported
+
+3.2. ATTRIBUTE: KVM_S390_VM_TOD_LOW
+
+Allows user space to set/get bits 0-63 of the TOD clock register as defined in
+the POP (u64).
+
+Parameters: address of a buffer in user space to store the data (u64) to
+Returns:    -EFAULT if the given address is not accessible from kernel space
+
+4. GROUP: KVM_S390_VM_CRYPTO
+Architectures: s390
+
+4.1. ATTRIBUTE: KVM_S390_VM_CRYPTO_ENABLE_AES_KW (w/o)
+
+Allows user space to enable aes key wrapping, including generating a new
+wrapping key.
+
+Parameters: none
+Returns:    0
+
+4.2. ATTRIBUTE: KVM_S390_VM_CRYPTO_ENABLE_DEA_KW (w/o)
+
+Allows user space to enable dea key wrapping, including generating a new
+wrapping key.
+
+Parameters: none
+Returns:    0
+
+4.3. ATTRIBUTE: KVM_S390_VM_CRYPTO_DISABLE_AES_KW (w/o)
+
+Allows user space to disable aes key wrapping, clearing the wrapping key.
+
+Parameters: none
+Returns:    0
+
+4.4. ATTRIBUTE: KVM_S390_VM_CRYPTO_DISABLE_DEA_KW (w/o)
+
+Allows user space to disable dea key wrapping, clearing the wrapping key.
+
+Parameters: none
+Returns:    0
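
As with the vcpu attributes, the VM-wide groups above are exercised through the device-attr ioctls on the VM file descriptor. The sketch below is an illustration only, not code from the patch; vm_fd is assumed to be an open VM descriptor.

#include <linux/kvm.h>
#include <sys/ioctl.h>
#include <stdint.h>

/* Illustration only: fetch bits 0-63 of the guest TOD clock (group 3.2). */
static int get_guest_tod_low(int vm_fd, uint64_t *tod)
{
	struct kvm_device_attr attr = {
		.group = KVM_S390_VM_TOD,
		.attr  = KVM_S390_VM_TOD_LOW,
		.addr  = (uint64_t)(uintptr_t)tod,
	};

	return ioctl(vm_fd, KVM_GET_DEVICE_ATTR, &attr);	/* 0 or -EFAULT */
}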

+ 3 - 3
Documentation/virtual/kvm/mmu.txt

@@ -392,11 +392,11 @@ To instantiate a large spte, four constraints must be satisfied:
   write-protected pages
 - the guest page must be wholly contained by a single memory slot
 
-To check the last two conditions, the mmu maintains a ->write_count set of
+To check the last two conditions, the mmu maintains a ->disallow_lpage set of
 arrays for each memory slot and large page size.  Every write protected page
-causes its write_count to be incremented, thus preventing instantiation of
+causes its disallow_lpage to be incremented, thus preventing instantiation of
 a large spte.  The frames at the end of an unaligned memory slot have
-artificially inflated ->write_counts so they can never be instantiated.
+artificially inflated ->disallow_lpages so they can never be instantiated.
 
 Zapping all pages (page generation count)
 =========================================

+ 3 - 38
arch/arm/include/asm/kvm_asm.h

@@ -19,38 +19,7 @@
 #ifndef __ARM_KVM_ASM_H__
 #define __ARM_KVM_ASM_H__
 
-/* 0 is reserved as an invalid value. */
-#define c0_MPIDR	1	/* MultiProcessor ID Register */
-#define c0_CSSELR	2	/* Cache Size Selection Register */
-#define c1_SCTLR	3	/* System Control Register */
-#define c1_ACTLR	4	/* Auxiliary Control Register */
-#define c1_CPACR	5	/* Coprocessor Access Control */
-#define c2_TTBR0	6	/* Translation Table Base Register 0 */
-#define c2_TTBR0_high	7	/* TTBR0 top 32 bits */
-#define c2_TTBR1	8	/* Translation Table Base Register 1 */
-#define c2_TTBR1_high	9	/* TTBR1 top 32 bits */
-#define c2_TTBCR	10	/* Translation Table Base Control R. */
-#define c3_DACR		11	/* Domain Access Control Register */
-#define c5_DFSR		12	/* Data Fault Status Register */
-#define c5_IFSR		13	/* Instruction Fault Status Register */
-#define c5_ADFSR	14	/* Auxilary Data Fault Status R */
-#define c5_AIFSR	15	/* Auxilary Instrunction Fault Status R */
-#define c6_DFAR		16	/* Data Fault Address Register */
-#define c6_IFAR		17	/* Instruction Fault Address Register */
-#define c7_PAR		18	/* Physical Address Register */
-#define c7_PAR_high	19	/* PAR top 32 bits */
-#define c9_L2CTLR	20	/* Cortex A15/A7 L2 Control Register */
-#define c10_PRRR	21	/* Primary Region Remap Register */
-#define c10_NMRR	22	/* Normal Memory Remap Register */
-#define c12_VBAR	23	/* Vector Base Address Register */
-#define c13_CID		24	/* Context ID Register */
-#define c13_TID_URW	25	/* Thread ID, User R/W */
-#define c13_TID_URO	26	/* Thread ID, User R/O */
-#define c13_TID_PRIV	27	/* Thread ID, Privileged */
-#define c14_CNTKCTL	28	/* Timer Control Register (PL1) */
-#define c10_AMAIR0	29	/* Auxilary Memory Attribute Indirection Reg0 */
-#define c10_AMAIR1	30	/* Auxilary Memory Attribute Indirection Reg1 */
-#define NR_CP15_REGS	31	/* Number of regs (incl. invalid) */
+#include <asm/virt.h>
 
 #define ARM_EXCEPTION_RESET	  0
 #define ARM_EXCEPTION_UNDEFINED   1
@@ -86,19 +55,15 @@ struct kvm_vcpu;
 extern char __kvm_hyp_init[];
 extern char __kvm_hyp_init_end[];
 
-extern char __kvm_hyp_exit[];
-extern char __kvm_hyp_exit_end[];
-
 extern char __kvm_hyp_vector[];
 
-extern char __kvm_hyp_code_start[];
-extern char __kvm_hyp_code_end[];
-
 extern void __kvm_flush_vm_context(void);
 extern void __kvm_tlb_flush_vmid_ipa(struct kvm *kvm, phys_addr_t ipa);
 extern void __kvm_tlb_flush_vmid(struct kvm *kvm);
 
 extern int __kvm_vcpu_run(struct kvm_vcpu *vcpu);
+
+extern void __init_stage2_translation(void);
 #endif
 
 #endif /* __ARM_KVM_ASM_H__ */

+ 10 - 10
arch/arm/include/asm/kvm_emulate.h

@@ -68,12 +68,12 @@ static inline bool vcpu_mode_is_32bit(struct kvm_vcpu *vcpu)
 
 static inline unsigned long *vcpu_pc(struct kvm_vcpu *vcpu)
 {
-	return &vcpu->arch.regs.usr_regs.ARM_pc;
+	return &vcpu->arch.ctxt.gp_regs.usr_regs.ARM_pc;
 }
 
 static inline unsigned long *vcpu_cpsr(struct kvm_vcpu *vcpu)
 {
-	return &vcpu->arch.regs.usr_regs.ARM_cpsr;
+	return &vcpu->arch.ctxt.gp_regs.usr_regs.ARM_cpsr;
 }
 
 static inline void vcpu_set_thumb(struct kvm_vcpu *vcpu)
@@ -83,13 +83,13 @@ static inline void vcpu_set_thumb(struct kvm_vcpu *vcpu)
 
 static inline bool mode_has_spsr(struct kvm_vcpu *vcpu)
 {
-	unsigned long cpsr_mode = vcpu->arch.regs.usr_regs.ARM_cpsr & MODE_MASK;
+	unsigned long cpsr_mode = vcpu->arch.ctxt.gp_regs.usr_regs.ARM_cpsr & MODE_MASK;
 	return (cpsr_mode > USR_MODE && cpsr_mode < SYSTEM_MODE);
 }
 
 static inline bool vcpu_mode_priv(struct kvm_vcpu *vcpu)
 {
-	unsigned long cpsr_mode = vcpu->arch.regs.usr_regs.ARM_cpsr & MODE_MASK;
+	unsigned long cpsr_mode = vcpu->arch.ctxt.gp_regs.usr_regs.ARM_cpsr & MODE_MASK;
 	return cpsr_mode > USR_MODE;;
 }
 
@@ -108,11 +108,6 @@ static inline phys_addr_t kvm_vcpu_get_fault_ipa(struct kvm_vcpu *vcpu)
 	return ((phys_addr_t)vcpu->arch.fault.hpfar & HPFAR_MASK) << 8;
 }
 
-static inline unsigned long kvm_vcpu_get_hyp_pc(struct kvm_vcpu *vcpu)
-{
-	return vcpu->arch.fault.hyp_pc;
-}
-
 static inline bool kvm_vcpu_dabt_isvalid(struct kvm_vcpu *vcpu)
 {
 	return kvm_vcpu_get_hsr(vcpu) & HSR_ISV;
@@ -143,6 +138,11 @@ static inline bool kvm_vcpu_dabt_iss1tw(struct kvm_vcpu *vcpu)
 	return kvm_vcpu_get_hsr(vcpu) & HSR_DABT_S1PTW;
 }
 
+static inline bool kvm_vcpu_dabt_is_cm(struct kvm_vcpu *vcpu)
+{
+	return !!(kvm_vcpu_get_hsr(vcpu) & HSR_DABT_CM);
+}
+
 /* Get Access Size from a data abort */
 static inline int kvm_vcpu_dabt_get_as(struct kvm_vcpu *vcpu)
 {
@@ -192,7 +192,7 @@ static inline u32 kvm_vcpu_hvc_get_imm(struct kvm_vcpu *vcpu)
 
 static inline unsigned long kvm_vcpu_get_mpidr_aff(struct kvm_vcpu *vcpu)
 {
-	return vcpu->arch.cp15[c0_MPIDR] & MPIDR_HWID_BITMASK;
+	return vcpu_cp15(vcpu, c0_MPIDR) & MPIDR_HWID_BITMASK;
 }
 
 static inline void kvm_vcpu_set_be(struct kvm_vcpu *vcpu)

+ 70 - 10
arch/arm/include/asm/kvm_host.h

@@ -85,20 +85,61 @@ struct kvm_vcpu_fault_info {
 	u32 hsr;		/* Hyp Syndrome Register */
 	u32 hxfar;		/* Hyp Data/Inst. Fault Address Register */
 	u32 hpfar;		/* Hyp IPA Fault Address Register */
-	u32 hyp_pc;		/* PC when exception was taken from Hyp mode */
 };
 
-typedef struct vfp_hard_struct kvm_cpu_context_t;
+/*
+ * 0 is reserved as an invalid value.
+ * Order should be kept in sync with the save/restore code.
+ */
+enum vcpu_sysreg {
+	__INVALID_SYSREG__,
+	c0_MPIDR,		/* MultiProcessor ID Register */
+	c0_CSSELR,		/* Cache Size Selection Register */
+	c1_SCTLR,		/* System Control Register */
+	c1_ACTLR,		/* Auxiliary Control Register */
+	c1_CPACR,		/* Coprocessor Access Control */
+	c2_TTBR0,		/* Translation Table Base Register 0 */
+	c2_TTBR0_high,		/* TTBR0 top 32 bits */
+	c2_TTBR1,		/* Translation Table Base Register 1 */
+	c2_TTBR1_high,		/* TTBR1 top 32 bits */
+	c2_TTBCR,		/* Translation Table Base Control R. */
+	c3_DACR,		/* Domain Access Control Register */
+	c5_DFSR,		/* Data Fault Status Register */
+	c5_IFSR,		/* Instruction Fault Status Register */
+	c5_ADFSR,		/* Auxilary Data Fault Status R */
+	c5_AIFSR,		/* Auxilary Instrunction Fault Status R */
+	c6_DFAR,		/* Data Fault Address Register */
+	c6_IFAR,		/* Instruction Fault Address Register */
+	c7_PAR,			/* Physical Address Register */
+	c7_PAR_high,		/* PAR top 32 bits */
+	c9_L2CTLR,		/* Cortex A15/A7 L2 Control Register */
+	c10_PRRR,		/* Primary Region Remap Register */
+	c10_NMRR,		/* Normal Memory Remap Register */
+	c12_VBAR,		/* Vector Base Address Register */
+	c13_CID,		/* Context ID Register */
+	c13_TID_URW,		/* Thread ID, User R/W */
+	c13_TID_URO,		/* Thread ID, User R/O */
+	c13_TID_PRIV,		/* Thread ID, Privileged */
+	c14_CNTKCTL,		/* Timer Control Register (PL1) */
+	c10_AMAIR0,		/* Auxilary Memory Attribute Indirection Reg0 */
+	c10_AMAIR1,		/* Auxilary Memory Attribute Indirection Reg1 */
+	NR_CP15_REGS		/* Number of regs (incl. invalid) */
+};
+
+struct kvm_cpu_context {
+	struct kvm_regs	gp_regs;
+	struct vfp_hard_struct vfp;
+	u32 cp15[NR_CP15_REGS];
+};
+
+typedef struct kvm_cpu_context kvm_cpu_context_t;
 
 struct kvm_vcpu_arch {
-	struct kvm_regs regs;
+	struct kvm_cpu_context ctxt;
 
 	int target; /* Processor target */
 	DECLARE_BITMAP(features, KVM_VCPU_MAX_FEATURES);
 
-	/* System control coprocessor (cp15) */
-	u32 cp15[NR_CP15_REGS];
-
 	/* The CPU type we expose to the VM */
 	u32 midr;
 
@@ -111,9 +152,6 @@ struct kvm_vcpu_arch {
 	/* Exception Information */
 	struct kvm_vcpu_fault_info fault;
 
-	/* Floating point registers (VFP and Advanced SIMD/NEON) */
-	struct vfp_hard_struct vfp_guest;
-
 	/* Host FP context */
 	kvm_cpu_context_t *host_cpu_context;
 
@@ -158,12 +196,14 @@ struct kvm_vcpu_stat {
 	u64 exits;
 };
 
+#define vcpu_cp15(v,r)	(v)->arch.ctxt.cp15[r]
+
 int kvm_vcpu_preferred_target(struct kvm_vcpu_init *init);
 unsigned long kvm_arm_num_regs(struct kvm_vcpu *vcpu);
 int kvm_arm_copy_reg_indices(struct kvm_vcpu *vcpu, u64 __user *indices);
 int kvm_arm_get_reg(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg);
 int kvm_arm_set_reg(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg);
-u64 kvm_call_hyp(void *hypfn, ...);
+unsigned long kvm_call_hyp(void *hypfn, ...);
 void force_vm_exit(const cpumask_t *mask);
 
 #define KVM_ARCH_WANT_MMU_NOTIFIER
@@ -220,6 +260,11 @@ static inline void __cpu_init_hyp_mode(phys_addr_t boot_pgd_ptr,
 	kvm_call_hyp((void*)hyp_stack_ptr, vector_ptr, pgd_ptr);
 }
 
+static inline void __cpu_init_stage2(void)
+{
+	kvm_call_hyp(__init_stage2_translation);
+}
+
 static inline int kvm_arch_dev_ioctl_check_extension(long ext)
 {
 	return 0;
@@ -242,5 +287,20 @@ static inline void kvm_arm_init_debug(void) {}
 static inline void kvm_arm_setup_debug(struct kvm_vcpu *vcpu) {}
 static inline void kvm_arm_clear_debug(struct kvm_vcpu *vcpu) {}
 static inline void kvm_arm_reset_debug_ptr(struct kvm_vcpu *vcpu) {}
+static inline int kvm_arm_vcpu_arch_set_attr(struct kvm_vcpu *vcpu,
+					     struct kvm_device_attr *attr)
+{
+	return -ENXIO;
+}
+static inline int kvm_arm_vcpu_arch_get_attr(struct kvm_vcpu *vcpu,
+					     struct kvm_device_attr *attr)
+{
+	return -ENXIO;
+}
+static inline int kvm_arm_vcpu_arch_has_attr(struct kvm_vcpu *vcpu,
+					     struct kvm_device_attr *attr)
+{
+	return -ENXIO;
+}
 
 #endif /* __ARM_KVM_HOST_H__ */

+ 139 - 0
arch/arm/include/asm/kvm_hyp.h

@@ -0,0 +1,139 @@
+/*
+ * Copyright (C) 2015 - ARM Ltd
+ * Author: Marc Zyngier <marc.zyngier@arm.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef __ARM_KVM_HYP_H__
+#define __ARM_KVM_HYP_H__
+
+#include <linux/compiler.h>
+#include <linux/kvm_host.h>
+#include <asm/kvm_mmu.h>
+#include <asm/vfp.h>
+
+#define __hyp_text __section(.hyp.text) notrace
+
+#define kern_hyp_va(v) (v)
+#define hyp_kern_va(v) (v)
+
+#define __ACCESS_CP15(CRn, Op1, CRm, Op2)	\
+	"mrc", "mcr", __stringify(p15, Op1, %0, CRn, CRm, Op2), u32
+#define __ACCESS_CP15_64(Op1, CRm)		\
+	"mrrc", "mcrr", __stringify(p15, Op1, %Q0, %R0, CRm), u64
+#define __ACCESS_VFP(CRn)			\
+	"mrc", "mcr", __stringify(p10, 7, %0, CRn, cr0, 0), u32
+
+#define __write_sysreg(v, r, w, c, t)	asm volatile(w " " c : : "r" ((t)(v)))
+#define write_sysreg(v, ...)		__write_sysreg(v, __VA_ARGS__)
+
+#define __read_sysreg(r, w, c, t) ({				\
+	t __val;						\
+	asm volatile(r " " c : "=r" (__val));			\
+	__val;							\
+})
+#define read_sysreg(...)		__read_sysreg(__VA_ARGS__)
+
+#define write_special(v, r)					\
+	asm volatile("msr " __stringify(r) ", %0" : : "r" (v))
+#define read_special(r) ({					\
+	u32 __val;						\
+	asm volatile("mrs %0, " __stringify(r) : "=r" (__val));	\
+	__val;							\
+})
+
+#define TTBR0		__ACCESS_CP15_64(0, c2)
+#define TTBR1		__ACCESS_CP15_64(1, c2)
+#define VTTBR		__ACCESS_CP15_64(6, c2)
+#define PAR		__ACCESS_CP15_64(0, c7)
+#define CNTV_CVAL	__ACCESS_CP15_64(3, c14)
+#define CNTVOFF		__ACCESS_CP15_64(4, c14)
+
+#define MIDR		__ACCESS_CP15(c0, 0, c0, 0)
+#define CSSELR		__ACCESS_CP15(c0, 2, c0, 0)
+#define VPIDR		__ACCESS_CP15(c0, 4, c0, 0)
+#define VMPIDR		__ACCESS_CP15(c0, 4, c0, 5)
+#define SCTLR		__ACCESS_CP15(c1, 0, c0, 0)
+#define CPACR		__ACCESS_CP15(c1, 0, c0, 2)
+#define HCR		__ACCESS_CP15(c1, 4, c1, 0)
+#define HDCR		__ACCESS_CP15(c1, 4, c1, 1)
+#define HCPTR		__ACCESS_CP15(c1, 4, c1, 2)
+#define HSTR		__ACCESS_CP15(c1, 4, c1, 3)
+#define TTBCR		__ACCESS_CP15(c2, 0, c0, 2)
+#define HTCR		__ACCESS_CP15(c2, 4, c0, 2)
+#define VTCR		__ACCESS_CP15(c2, 4, c1, 2)
+#define DACR		__ACCESS_CP15(c3, 0, c0, 0)
+#define DFSR		__ACCESS_CP15(c5, 0, c0, 0)
+#define IFSR		__ACCESS_CP15(c5, 0, c0, 1)
+#define ADFSR		__ACCESS_CP15(c5, 0, c1, 0)
+#define AIFSR		__ACCESS_CP15(c5, 0, c1, 1)
+#define HSR		__ACCESS_CP15(c5, 4, c2, 0)
+#define DFAR		__ACCESS_CP15(c6, 0, c0, 0)
+#define IFAR		__ACCESS_CP15(c6, 0, c0, 2)
+#define HDFAR		__ACCESS_CP15(c6, 4, c0, 0)
+#define HIFAR		__ACCESS_CP15(c6, 4, c0, 2)
+#define HPFAR		__ACCESS_CP15(c6, 4, c0, 4)
+#define ICIALLUIS	__ACCESS_CP15(c7, 0, c1, 0)
+#define ATS1CPR		__ACCESS_CP15(c7, 0, c8, 0)
+#define TLBIALLIS	__ACCESS_CP15(c8, 0, c3, 0)
+#define TLBIALLNSNHIS	__ACCESS_CP15(c8, 4, c3, 4)
+#define PRRR		__ACCESS_CP15(c10, 0, c2, 0)
+#define NMRR		__ACCESS_CP15(c10, 0, c2, 1)
+#define AMAIR0		__ACCESS_CP15(c10, 0, c3, 0)
+#define AMAIR1		__ACCESS_CP15(c10, 0, c3, 1)
+#define VBAR		__ACCESS_CP15(c12, 0, c0, 0)
+#define CID		__ACCESS_CP15(c13, 0, c0, 1)
+#define TID_URW		__ACCESS_CP15(c13, 0, c0, 2)
+#define TID_URO		__ACCESS_CP15(c13, 0, c0, 3)
+#define TID_PRIV	__ACCESS_CP15(c13, 0, c0, 4)
+#define HTPIDR		__ACCESS_CP15(c13, 4, c0, 2)
+#define CNTKCTL		__ACCESS_CP15(c14, 0, c1, 0)
+#define CNTV_CTL	__ACCESS_CP15(c14, 0, c3, 1)
+#define CNTHCTL		__ACCESS_CP15(c14, 4, c1, 0)
+
+#define VFP_FPEXC	__ACCESS_VFP(FPEXC)
+
+/* AArch64 compatibility macros, only for the timer so far */
+#define read_sysreg_el0(r)		read_sysreg(r##_el0)
+#define write_sysreg_el0(v, r)		write_sysreg(v, r##_el0)
+
+#define cntv_ctl_el0			CNTV_CTL
+#define cntv_cval_el0			CNTV_CVAL
+#define cntvoff_el2			CNTVOFF
+#define cnthctl_el2			CNTHCTL
+
+void __timer_save_state(struct kvm_vcpu *vcpu);
+void __timer_restore_state(struct kvm_vcpu *vcpu);
+
+void __vgic_v2_save_state(struct kvm_vcpu *vcpu);
+void __vgic_v2_restore_state(struct kvm_vcpu *vcpu);
+
+void __sysreg_save_state(struct kvm_cpu_context *ctxt);
+void __sysreg_restore_state(struct kvm_cpu_context *ctxt);
+
+void asmlinkage __vfp_save_state(struct vfp_hard_struct *vfp);
+void asmlinkage __vfp_restore_state(struct vfp_hard_struct *vfp);
+static inline bool __vfp_enabled(void)
+{
+	return !(read_sysreg(HCPTR) & (HCPTR_TCP(11) | HCPTR_TCP(10)));
+}
+
+void __hyp_text __banked_save_state(struct kvm_cpu_context *ctxt);
+void __hyp_text __banked_restore_state(struct kvm_cpu_context *ctxt);
+
+int asmlinkage __guest_enter(struct kvm_vcpu *vcpu,
+			     struct kvm_cpu_context *host);
+int asmlinkage __hyp_do_panic(const char *, int, u32);
+
+#endif /* __ARM_KVM_HYP_H__ */
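
The register tokens defined above bundle the access instruction, its CP15 encoding and the value type, so hyp code can use one pair of accessors for 32-bit and 64-bit registers alike. A small sketch of the intended usage, modelled on the save/restore files added by this series but not itself part of the patch:

/* Illustration only: saving a couple of registers into the new
 * kvm_cpu_context from code placed in the .hyp.text section.
 */
static void __hyp_text example_save(struct kvm_cpu_context *ctxt)
{
	u64 par   = read_sysreg(PAR);	/* emits mrrc p15, 0, %Q0, %R0, c7 */
	u32 sctlr = read_sysreg(SCTLR);	/* emits mrc p15, 0, %0, c1, c0, 0 */

	ctxt->cp15[c7_PAR]      = par;		/* low 32 bits */
	ctxt->cp15[c7_PAR_high] = par >> 32;
	ctxt->cp15[c1_SCTLR]    = sctlr;

	write_sysreg(sctlr, SCTLR);	/* emits mcr p15, 0, %0, c1, c0, 0 */
}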

+ 1 - 1
arch/arm/include/asm/kvm_mmu.h

@@ -179,7 +179,7 @@ struct kvm;
 
 static inline bool vcpu_has_cache_enabled(struct kvm_vcpu *vcpu)
 {
-	return (vcpu->arch.cp15[c1_SCTLR] & 0b101) == 0b101;
+	return (vcpu_cp15(vcpu, c1_SCTLR) & 0b101) == 0b101;
 }
 
 static inline void __coherent_cache_guest_page(struct kvm_vcpu *vcpu,

+ 9 - 0
arch/arm/include/asm/virt.h

@@ -74,6 +74,15 @@ static inline bool is_hyp_mode_mismatched(void)
 {
 	return !!(__boot_cpu_mode & BOOT_CPU_MODE_MISMATCH);
 }
+
+static inline bool is_kernel_in_hyp_mode(void)
+{
+	return false;
+}
+
+/* The section containing the hypervisor text */
+extern char __hyp_text_start[];
+extern char __hyp_text_end[];
 #endif
 
 #endif /* __ASSEMBLY__ */

+ 5 - 35
arch/arm/kernel/asm-offsets.c

@@ -170,41 +170,11 @@ int main(void)
   DEFINE(CACHE_WRITEBACK_GRANULE, __CACHE_WRITEBACK_GRANULE);
   BLANK();
 #ifdef CONFIG_KVM_ARM_HOST
-  DEFINE(VCPU_KVM,		offsetof(struct kvm_vcpu, kvm));
-  DEFINE(VCPU_MIDR,		offsetof(struct kvm_vcpu, arch.midr));
-  DEFINE(VCPU_CP15,		offsetof(struct kvm_vcpu, arch.cp15));
-  DEFINE(VCPU_VFP_GUEST,	offsetof(struct kvm_vcpu, arch.vfp_guest));
-  DEFINE(VCPU_VFP_HOST,		offsetof(struct kvm_vcpu, arch.host_cpu_context));
-  DEFINE(VCPU_REGS,		offsetof(struct kvm_vcpu, arch.regs));
-  DEFINE(VCPU_USR_REGS,		offsetof(struct kvm_vcpu, arch.regs.usr_regs));
-  DEFINE(VCPU_SVC_REGS,		offsetof(struct kvm_vcpu, arch.regs.svc_regs));
-  DEFINE(VCPU_ABT_REGS,		offsetof(struct kvm_vcpu, arch.regs.abt_regs));
-  DEFINE(VCPU_UND_REGS,		offsetof(struct kvm_vcpu, arch.regs.und_regs));
-  DEFINE(VCPU_IRQ_REGS,		offsetof(struct kvm_vcpu, arch.regs.irq_regs));
-  DEFINE(VCPU_FIQ_REGS,		offsetof(struct kvm_vcpu, arch.regs.fiq_regs));
-  DEFINE(VCPU_PC,		offsetof(struct kvm_vcpu, arch.regs.usr_regs.ARM_pc));
-  DEFINE(VCPU_CPSR,		offsetof(struct kvm_vcpu, arch.regs.usr_regs.ARM_cpsr));
-  DEFINE(VCPU_HCR,		offsetof(struct kvm_vcpu, arch.hcr));
-  DEFINE(VCPU_IRQ_LINES,	offsetof(struct kvm_vcpu, arch.irq_lines));
-  DEFINE(VCPU_HSR,		offsetof(struct kvm_vcpu, arch.fault.hsr));
-  DEFINE(VCPU_HxFAR,		offsetof(struct kvm_vcpu, arch.fault.hxfar));
-  DEFINE(VCPU_HPFAR,		offsetof(struct kvm_vcpu, arch.fault.hpfar));
-  DEFINE(VCPU_HYP_PC,		offsetof(struct kvm_vcpu, arch.fault.hyp_pc));
-  DEFINE(VCPU_VGIC_CPU,		offsetof(struct kvm_vcpu, arch.vgic_cpu));
-  DEFINE(VGIC_V2_CPU_HCR,	offsetof(struct vgic_cpu, vgic_v2.vgic_hcr));
-  DEFINE(VGIC_V2_CPU_VMCR,	offsetof(struct vgic_cpu, vgic_v2.vgic_vmcr));
-  DEFINE(VGIC_V2_CPU_MISR,	offsetof(struct vgic_cpu, vgic_v2.vgic_misr));
-  DEFINE(VGIC_V2_CPU_EISR,	offsetof(struct vgic_cpu, vgic_v2.vgic_eisr));
-  DEFINE(VGIC_V2_CPU_ELRSR,	offsetof(struct vgic_cpu, vgic_v2.vgic_elrsr));
-  DEFINE(VGIC_V2_CPU_APR,	offsetof(struct vgic_cpu, vgic_v2.vgic_apr));
-  DEFINE(VGIC_V2_CPU_LR,	offsetof(struct vgic_cpu, vgic_v2.vgic_lr));
-  DEFINE(VGIC_CPU_NR_LR,	offsetof(struct vgic_cpu, nr_lr));
-  DEFINE(VCPU_TIMER_CNTV_CTL,	offsetof(struct kvm_vcpu, arch.timer_cpu.cntv_ctl));
-  DEFINE(VCPU_TIMER_CNTV_CVAL,	offsetof(struct kvm_vcpu, arch.timer_cpu.cntv_cval));
-  DEFINE(KVM_TIMER_CNTVOFF,	offsetof(struct kvm, arch.timer.cntvoff));
-  DEFINE(KVM_TIMER_ENABLED,	offsetof(struct kvm, arch.timer.enabled));
-  DEFINE(KVM_VGIC_VCTRL,	offsetof(struct kvm, arch.vgic.vctrl_base));
-  DEFINE(KVM_VTTBR,		offsetof(struct kvm, arch.vttbr));
+  DEFINE(VCPU_GUEST_CTXT,	offsetof(struct kvm_vcpu, arch.ctxt));
+  DEFINE(VCPU_HOST_CTXT,	offsetof(struct kvm_vcpu, arch.host_cpu_context));
+  DEFINE(CPU_CTXT_VFP,		offsetof(struct kvm_cpu_context, vfp));
+  DEFINE(CPU_CTXT_GP_REGS,	offsetof(struct kvm_cpu_context, gp_regs));
+  DEFINE(GP_REGS_USR,		offsetof(struct kvm_regs, usr_regs));
 #endif
   BLANK();
 #ifdef CONFIG_VDSO

+ 6 - 0
arch/arm/kernel/vmlinux.lds.S

@@ -18,6 +18,11 @@
 	*(.proc.info.init)						\
 	VMLINUX_SYMBOL(__proc_info_end) = .;
 
+#define HYPERVISOR_TEXT							\
+	VMLINUX_SYMBOL(__hyp_text_start) = .;				\
+	*(.hyp.text)							\
+	VMLINUX_SYMBOL(__hyp_text_end) = .;
+
 #define IDMAP_TEXT							\
 	ALIGN_FUNCTION();						\
 	VMLINUX_SYMBOL(__idmap_text_start) = .;				\
@@ -108,6 +113,7 @@ SECTIONS
 			TEXT_TEXT
 			SCHED_TEXT
 			LOCK_TEXT
+			HYPERVISOR_TEXT
 			KPROBES_TEXT
 			*(.gnu.warning)
 			*(.glue_7)

+ 1 - 0
arch/arm/kvm/Makefile

@@ -17,6 +17,7 @@ AFLAGS_interrupts.o := -Wa,-march=armv7-a$(plus_virt)
 KVM := ../../../virt/kvm
 kvm-arm-y = $(KVM)/kvm_main.o $(KVM)/coalesced_mmio.o $(KVM)/eventfd.o $(KVM)/vfio.o
 
+obj-$(CONFIG_KVM_ARM_HOST) += hyp/
 obj-y += kvm-arm.o init.o interrupts.o
 obj-y += arm.o handle_exit.o guest.o mmu.o emulate.o reset.o
 obj-y += coproc.o coproc_a15.o coproc_a7.o mmio.o psci.o perf.o

+ 182 - 62
arch/arm/kvm/arm.c

@@ -28,6 +28,7 @@
 #include <linux/sched.h>
 #include <linux/kvm.h>
 #include <trace/events/kvm.h>
+#include <kvm/arm_pmu.h>
 
 #define CREATE_TRACE_POINTS
 #include "trace.h"
@@ -265,6 +266,7 @@ void kvm_arch_vcpu_free(struct kvm_vcpu *vcpu)
 	kvm_mmu_free_memory_caches(vcpu);
 	kvm_timer_vcpu_terminate(vcpu);
 	kvm_vgic_vcpu_destroy(vcpu);
+	kvm_pmu_vcpu_destroy(vcpu);
 	kmem_cache_free(kvm_vcpu_cache, vcpu);
 }
 
@@ -320,6 +322,7 @@ void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
 	vcpu->cpu = -1;
 
 	kvm_arm_set_running_vcpu(NULL);
+	kvm_timer_vcpu_put(vcpu);
 }
 
 int kvm_arch_vcpu_ioctl_get_mpstate(struct kvm_vcpu *vcpu,
@@ -577,6 +580,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run)
 		 * non-preemptible context.
 		 */
 		preempt_disable();
+		kvm_pmu_flush_hwstate(vcpu);
 		kvm_timer_flush_hwstate(vcpu);
 		kvm_vgic_flush_hwstate(vcpu);
 
@@ -593,6 +597,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run)
 		if (ret <= 0 || need_new_vmid_gen(vcpu->kvm) ||
 			vcpu->arch.power_off || vcpu->arch.pause) {
 			local_irq_enable();
+			kvm_pmu_sync_hwstate(vcpu);
 			kvm_timer_sync_hwstate(vcpu);
 			kvm_vgic_sync_hwstate(vcpu);
 			preempt_enable();
@@ -642,10 +647,11 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run)
 		trace_kvm_exit(ret, kvm_vcpu_trap_get_class(vcpu), *vcpu_pc(vcpu));
 
 		/*
-		 * We must sync the timer state before the vgic state so that
-		 * the vgic can properly sample the updated state of the
+		 * We must sync the PMU and timer state before the vgic state so
+		 * that the vgic can properly sample the updated state of the
 		 * interrupt line.
 		 */
+		kvm_pmu_sync_hwstate(vcpu);
 		kvm_timer_sync_hwstate(vcpu);
 
 		kvm_vgic_sync_hwstate(vcpu);
@@ -823,11 +829,54 @@ static int kvm_arch_vcpu_ioctl_vcpu_init(struct kvm_vcpu *vcpu,
 	return 0;
 }
 
+static int kvm_arm_vcpu_set_attr(struct kvm_vcpu *vcpu,
+				 struct kvm_device_attr *attr)
+{
+	int ret = -ENXIO;
+
+	switch (attr->group) {
+	default:
+		ret = kvm_arm_vcpu_arch_set_attr(vcpu, attr);
+		break;
+	}
+
+	return ret;
+}
+
+static int kvm_arm_vcpu_get_attr(struct kvm_vcpu *vcpu,
+				 struct kvm_device_attr *attr)
+{
+	int ret = -ENXIO;
+
+	switch (attr->group) {
+	default:
+		ret = kvm_arm_vcpu_arch_get_attr(vcpu, attr);
+		break;
+	}
+
+	return ret;
+}
+
+static int kvm_arm_vcpu_has_attr(struct kvm_vcpu *vcpu,
+				 struct kvm_device_attr *attr)
+{
+	int ret = -ENXIO;
+
+	switch (attr->group) {
+	default:
+		ret = kvm_arm_vcpu_arch_has_attr(vcpu, attr);
+		break;
+	}
+
+	return ret;
+}
+
 long kvm_arch_vcpu_ioctl(struct file *filp,
 			 unsigned int ioctl, unsigned long arg)
 {
 	struct kvm_vcpu *vcpu = filp->private_data;
 	void __user *argp = (void __user *)arg;
+	struct kvm_device_attr attr;
 
 	switch (ioctl) {
 	case KVM_ARM_VCPU_INIT: {
@@ -870,6 +919,21 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
 			return -E2BIG;
 		return kvm_arm_copy_reg_indices(vcpu, user_list->reg);
 	}
+	case KVM_SET_DEVICE_ATTR: {
+		if (copy_from_user(&attr, argp, sizeof(attr)))
+			return -EFAULT;
+		return kvm_arm_vcpu_set_attr(vcpu, &attr);
+	}
+	case KVM_GET_DEVICE_ATTR: {
+		if (copy_from_user(&attr, argp, sizeof(attr)))
+			return -EFAULT;
+		return kvm_arm_vcpu_get_attr(vcpu, &attr);
+	}
+	case KVM_HAS_DEVICE_ATTR: {
+		if (copy_from_user(&attr, argp, sizeof(attr)))
+			return -EFAULT;
+		return kvm_arm_vcpu_has_attr(vcpu, &attr);
+	}
 	default:
 		return -EINVAL;
 	}
@@ -967,6 +1031,11 @@ long kvm_arch_vm_ioctl(struct file *filp,
 	}
 }
 
+static void cpu_init_stage2(void *dummy)
+{
+	__cpu_init_stage2();
+}
+
 static void cpu_init_hyp_mode(void *dummy)
 {
 	phys_addr_t boot_pgd_ptr;
@@ -985,6 +1054,7 @@ static void cpu_init_hyp_mode(void *dummy)
 	vector_ptr = (unsigned long)__kvm_hyp_vector;
 
 	__cpu_init_hyp_mode(boot_pgd_ptr, pgd_ptr, hyp_stack_ptr, vector_ptr);
+	__cpu_init_stage2();
 
 	kvm_arm_init_debug();
 }
@@ -1035,6 +1105,82 @@ static inline void hyp_cpu_pm_init(void)
 }
 #endif
 
+static void teardown_common_resources(void)
+{
+	free_percpu(kvm_host_cpu_state);
+}
+
+static int init_common_resources(void)
+{
+	kvm_host_cpu_state = alloc_percpu(kvm_cpu_context_t);
+	if (!kvm_host_cpu_state) {
+		kvm_err("Cannot allocate host CPU state\n");
+		return -ENOMEM;
+	}
+
+	return 0;
+}
+
+static int init_subsystems(void)
+{
+	int err;
+
+	/*
+	 * Init HYP view of VGIC
+	 */
+	err = kvm_vgic_hyp_init();
+	switch (err) {
+	case 0:
+		vgic_present = true;
+		break;
+	case -ENODEV:
+	case -ENXIO:
+		vgic_present = false;
+		break;
+	default:
+		return err;
+	}
+
+	/*
+	 * Init HYP architected timer support
+	 */
+	err = kvm_timer_hyp_init();
+	if (err)
+		return err;
+
+	kvm_perf_init();
+	kvm_coproc_table_init();
+
+	return 0;
+}
+
+static void teardown_hyp_mode(void)
+{
+	int cpu;
+
+	if (is_kernel_in_hyp_mode())
+		return;
+
+	free_hyp_pgds();
+	for_each_possible_cpu(cpu)
+		free_page(per_cpu(kvm_arm_hyp_stack_page, cpu));
+}
+
+static int init_vhe_mode(void)
+{
+	/*
+	 * Execute the init code on each CPU.
+	 */
+	on_each_cpu(cpu_init_stage2, NULL, 1);
+
+	/* set size of VMID supported by CPU */
+	kvm_vmid_bits = kvm_get_vmid_bits();
+	kvm_info("%d-bit VMID\n", kvm_vmid_bits);
+
+	kvm_info("VHE mode initialized successfully\n");
+	return 0;
+}
+
 /**
  * Inits Hyp-mode on all online CPUs
  */
@@ -1065,7 +1211,7 @@ static int init_hyp_mode(void)
 		stack_page = __get_free_page(GFP_KERNEL);
 		if (!stack_page) {
 			err = -ENOMEM;
-			goto out_free_stack_pages;
+			goto out_err;
 		}
 
 		per_cpu(kvm_arm_hyp_stack_page, cpu) = stack_page;
@@ -1074,16 +1220,16 @@ static int init_hyp_mode(void)
 	/*
 	 * Map the Hyp-code called directly from the host
 	 */
-	err = create_hyp_mappings(__kvm_hyp_code_start, __kvm_hyp_code_end);
+	err = create_hyp_mappings(__hyp_text_start, __hyp_text_end);
 	if (err) {
 		kvm_err("Cannot map world-switch code\n");
-		goto out_free_mappings;
+		goto out_err;
 	}
 
 	err = create_hyp_mappings(__start_rodata, __end_rodata);
 	if (err) {
 		kvm_err("Cannot map rodata section\n");
-		goto out_free_mappings;
+		goto out_err;
 	}
 
 	/*
@@ -1095,20 +1241,10 @@ static int init_hyp_mode(void)
 
 		if (err) {
 			kvm_err("Cannot map hyp stack\n");
-			goto out_free_mappings;
+			goto out_err;
 		}
 	}
 
-	/*
-	 * Map the host CPU structures
-	 */
-	kvm_host_cpu_state = alloc_percpu(kvm_cpu_context_t);
-	if (!kvm_host_cpu_state) {
-		err = -ENOMEM;
-		kvm_err("Cannot allocate host CPU state\n");
-		goto out_free_mappings;
-	}
-
 	for_each_possible_cpu(cpu) {
 		kvm_cpu_context_t *cpu_ctxt;
 
@@ -1117,7 +1253,7 @@ static int init_hyp_mode(void)
 
 		if (err) {
 			kvm_err("Cannot map host CPU state: %d\n", err);
-			goto out_free_context;
+			goto out_err;
 		}
 	}
 
@@ -1126,34 +1262,22 @@ static int init_hyp_mode(void)
 	 */
 	on_each_cpu(cpu_init_hyp_mode, NULL, 1);
 
-	/*
-	 * Init HYP view of VGIC
-	 */
-	err = kvm_vgic_hyp_init();
-	switch (err) {
-	case 0:
-		vgic_present = true;
-		break;
-	case -ENODEV:
-	case -ENXIO:
-		vgic_present = false;
-		break;
-	default:
-		goto out_free_context;
-	}
-
-	/*
-	 * Init HYP architected timer support
-	 */
-	err = kvm_timer_hyp_init();
-	if (err)
-		goto out_free_context;
-
 #ifndef CONFIG_HOTPLUG_CPU
 	free_boot_hyp_pgd();
 #endif
 
-	kvm_perf_init();
+	cpu_notifier_register_begin();
+
+	err = __register_cpu_notifier(&hyp_init_cpu_nb);
+
+	cpu_notifier_register_done();
+
+	if (err) {
+		kvm_err("Cannot register HYP init CPU notifier (%d)\n", err);
+		goto out_err;
+	}
+
+	hyp_cpu_pm_init();
 
 	/* set size of VMID supported by CPU */
 	kvm_vmid_bits = kvm_get_vmid_bits();
@@ -1162,14 +1286,9 @@ static int init_hyp_mode(void)
 	kvm_info("Hyp mode initialized successfully\n");
 
 	return 0;
-out_free_context:
-	free_percpu(kvm_host_cpu_state);
-out_free_mappings:
-	free_hyp_pgds();
-out_free_stack_pages:
-	for_each_possible_cpu(cpu)
-		free_page(per_cpu(kvm_arm_hyp_stack_page, cpu));
+
 out_err:
+	teardown_hyp_mode();
 	kvm_err("error initializing Hyp mode: %d\n", err);
 	return err;
 }
@@ -1213,26 +1332,27 @@ int kvm_arch_init(void *opaque)
 		}
 	}
 
-	cpu_notifier_register_begin();
-
-	err = init_hyp_mode();
+	err = init_common_resources();
 	if (err)
-		goto out_err;
+		return err;
 
-	err = __register_cpu_notifier(&hyp_init_cpu_nb);
-	if (err) {
-		kvm_err("Cannot register HYP init CPU notifier (%d)\n", err);
+	if (is_kernel_in_hyp_mode())
+		err = init_vhe_mode();
+	else
+		err = init_hyp_mode();
+	if (err)
 		goto out_err;
-	}
-
-	cpu_notifier_register_done();
 
-	hyp_cpu_pm_init();
+	err = init_subsystems();
+	if (err)
+		goto out_hyp;
 
-	kvm_coproc_table_init();
 	return 0;
+
+out_hyp:
+	teardown_hyp_mode();
 out_err:
-	cpu_notifier_register_done();
+	teardown_common_resources();
 	return err;
 }
 

+ 70 - 56
arch/arm/kvm/coproc.c

@@ -16,6 +16,8 @@
  * along with this program; if not, write to the Free Software
  * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
  */
+
+#include <linux/bsearch.h>
 #include <linux/mm.h>
 #include <linux/kvm_host.h>
 #include <linux/uaccess.h>
@@ -54,8 +56,8 @@ static inline void vcpu_cp15_reg64_set(struct kvm_vcpu *vcpu,
 				       const struct coproc_reg *r,
 				       u64 val)
 {
-	vcpu->arch.cp15[r->reg] = val & 0xffffffff;
-	vcpu->arch.cp15[r->reg + 1] = val >> 32;
+	vcpu_cp15(vcpu, r->reg) = val & 0xffffffff;
+	vcpu_cp15(vcpu, r->reg + 1) = val >> 32;
 }
 
 static inline u64 vcpu_cp15_reg64_get(struct kvm_vcpu *vcpu,
@@ -63,9 +65,9 @@ static inline u64 vcpu_cp15_reg64_get(struct kvm_vcpu *vcpu,
 {
 	u64 val;
 
-	val = vcpu->arch.cp15[r->reg + 1];
+	val = vcpu_cp15(vcpu, r->reg + 1);
 	val = val << 32;
-	val = val | vcpu->arch.cp15[r->reg];
+	val = val | vcpu_cp15(vcpu, r->reg);
 	return val;
 }
 
@@ -104,7 +106,7 @@ static void reset_mpidr(struct kvm_vcpu *vcpu, const struct coproc_reg *r)
 	 * vcpu_id, but we read the 'U' bit from the underlying
 	 * hardware directly.
 	 */
-	vcpu->arch.cp15[c0_MPIDR] = ((read_cpuid_mpidr() & MPIDR_SMP_BITMASK) |
+	vcpu_cp15(vcpu, c0_MPIDR) = ((read_cpuid_mpidr() & MPIDR_SMP_BITMASK) |
 				     ((vcpu->vcpu_id >> 2) << MPIDR_LEVEL_BITS) |
 				     (vcpu->vcpu_id & 3));
 }
@@ -117,7 +119,7 @@ static bool access_actlr(struct kvm_vcpu *vcpu,
 	if (p->is_write)
 		return ignore_write(vcpu, p);
 
-	*vcpu_reg(vcpu, p->Rt1) = vcpu->arch.cp15[c1_ACTLR];
+	*vcpu_reg(vcpu, p->Rt1) = vcpu_cp15(vcpu, c1_ACTLR);
 	return true;
 }
 
@@ -139,7 +141,7 @@ static bool access_l2ctlr(struct kvm_vcpu *vcpu,
 	if (p->is_write)
 		return ignore_write(vcpu, p);
 
-	*vcpu_reg(vcpu, p->Rt1) = vcpu->arch.cp15[c9_L2CTLR];
+	*vcpu_reg(vcpu, p->Rt1) = vcpu_cp15(vcpu, c9_L2CTLR);
 	return true;
 }
 
@@ -156,7 +158,7 @@ static void reset_l2ctlr(struct kvm_vcpu *vcpu, const struct coproc_reg *r)
 	ncores = min(ncores, 3U);
 	l2ctlr |= (ncores & 3) << 24;
 
-	vcpu->arch.cp15[c9_L2CTLR] = l2ctlr;
+	vcpu_cp15(vcpu, c9_L2CTLR) = l2ctlr;
 }
 
 static void reset_actlr(struct kvm_vcpu *vcpu, const struct coproc_reg *r)
@@ -171,7 +173,7 @@ static void reset_actlr(struct kvm_vcpu *vcpu, const struct coproc_reg *r)
 	else
 		actlr &= ~(1U << 6);
 
-	vcpu->arch.cp15[c1_ACTLR] = actlr;
+	vcpu_cp15(vcpu, c1_ACTLR) = actlr;
 }
 
 /*
@@ -218,9 +220,9 @@ bool access_vm_reg(struct kvm_vcpu *vcpu,
 
 	BUG_ON(!p->is_write);
 
-	vcpu->arch.cp15[r->reg] = *vcpu_reg(vcpu, p->Rt1);
+	vcpu_cp15(vcpu, r->reg) = *vcpu_reg(vcpu, p->Rt1);
 	if (p->is_64bit)
-		vcpu->arch.cp15[r->reg + 1] = *vcpu_reg(vcpu, p->Rt2);
+		vcpu_cp15(vcpu, r->reg + 1) = *vcpu_reg(vcpu, p->Rt2);
 
 	kvm_toggle_cache(vcpu, was_enabled);
 	return true;
@@ -381,17 +383,26 @@ static const struct coproc_reg cp15_regs[] = {
 	{ CRn(15), CRm( 0), Op1( 4), Op2( 0), is32, access_cbar},
 };
 
+static int check_reg_table(const struct coproc_reg *table, unsigned int n)
+{
+	unsigned int i;
+
+	for (i = 1; i < n; i++) {
+		if (cmp_reg(&table[i-1], &table[i]) >= 0) {
+			kvm_err("reg table %p out of order (%d)\n", table, i - 1);
+			return 1;
+		}
+	}
+
+	return 0;
+}
+
 /* Target specific emulation tables */
 static struct kvm_coproc_target_table *target_tables[KVM_ARM_NUM_TARGETS];
 
 void kvm_register_target_coproc_table(struct kvm_coproc_target_table *table)
 {
-	unsigned int i;
-
-	for (i = 1; i < table->num; i++)
-		BUG_ON(cmp_reg(&table->table[i-1],
-			       &table->table[i]) >= 0);
-
+	BUG_ON(check_reg_table(table->table, table->num));
 	target_tables[table->target] = table;
 }
 
@@ -405,29 +416,32 @@ static const struct coproc_reg *get_target_table(unsigned target, size_t *num)
 	return table->table;
 }
 
+#define reg_to_match_value(x)						\
+	({								\
+		unsigned long val;					\
+		val  = (x)->CRn << 11;					\
+		val |= (x)->CRm << 7;					\
+		val |= (x)->Op1 << 4;					\
+		val |= (x)->Op2 << 1;					\
+		val |= !(x)->is_64bit;					\
+		val;							\
+	 })
+
+static int match_reg(const void *key, const void *elt)
+{
+	const unsigned long pval = (unsigned long)key;
+	const struct coproc_reg *r = elt;
+
+	return pval - reg_to_match_value(r);
+}
+
 static const struct coproc_reg *find_reg(const struct coproc_params *params,
 					 const struct coproc_reg table[],
 					 unsigned int num)
 {
-	unsigned int i;
-
-	for (i = 0; i < num; i++) {
-		const struct coproc_reg *r = &table[i];
-
-		if (params->is_64bit != r->is_64)
-			continue;
-		if (params->CRn != r->CRn)
-			continue;
-		if (params->CRm != r->CRm)
-			continue;
-		if (params->Op1 != r->Op1)
-			continue;
-		if (params->Op2 != r->Op2)
-			continue;
+	unsigned long pval = reg_to_match_value(params);
 
-		return r;
-	}
-	return NULL;
+	return bsearch((void *)pval, table, num, sizeof(table[0]), match_reg);
 }
 
 static int emulate_cp15(struct kvm_vcpu *vcpu,
@@ -645,6 +659,9 @@ static struct coproc_reg invariant_cp15[] = {
 	{ CRn( 0), CRm( 0), Op1( 0), Op2( 3), is32, NULL, get_TLBTR },
 	{ CRn( 0), CRm( 0), Op1( 0), Op2( 6), is32, NULL, get_REVIDR },
 
+	{ CRn( 0), CRm( 0), Op1( 1), Op2( 1), is32, NULL, get_CLIDR },
+	{ CRn( 0), CRm( 0), Op1( 1), Op2( 7), is32, NULL, get_AIDR },
+
 	{ CRn( 0), CRm( 1), Op1( 0), Op2( 0), is32, NULL, get_ID_PFR0 },
 	{ CRn( 0), CRm( 1), Op1( 0), Op2( 1), is32, NULL, get_ID_PFR1 },
 	{ CRn( 0), CRm( 1), Op1( 0), Op2( 2), is32, NULL, get_ID_DFR0 },
@@ -660,9 +677,6 @@ static struct coproc_reg invariant_cp15[] = {
 	{ CRn( 0), CRm( 2), Op1( 0), Op2( 3), is32, NULL, get_ID_ISAR3 },
 	{ CRn( 0), CRm( 2), Op1( 0), Op2( 4), is32, NULL, get_ID_ISAR4 },
 	{ CRn( 0), CRm( 2), Op1( 0), Op2( 5), is32, NULL, get_ID_ISAR5 },
-
-	{ CRn( 0), CRm( 0), Op1( 1), Op2( 1), is32, NULL, get_CLIDR },
-	{ CRn( 0), CRm( 0), Op1( 1), Op2( 7), is32, NULL, get_AIDR },
 };
 
 /*
@@ -901,7 +915,7 @@ static int vfp_get_reg(const struct kvm_vcpu *vcpu, u64 id, void __user *uaddr)
 	if (vfpid < num_fp_regs()) {
 		if (KVM_REG_SIZE(id) != 8)
 			return -ENOENT;
-		return reg_to_user(uaddr, &vcpu->arch.vfp_guest.fpregs[vfpid],
+		return reg_to_user(uaddr, &vcpu->arch.ctxt.vfp.fpregs[vfpid],
 				   id);
 	}
 
@@ -911,13 +925,13 @@ static int vfp_get_reg(const struct kvm_vcpu *vcpu, u64 id, void __user *uaddr)
 
 	switch (vfpid) {
 	case KVM_REG_ARM_VFP_FPEXC:
-		return reg_to_user(uaddr, &vcpu->arch.vfp_guest.fpexc, id);
+		return reg_to_user(uaddr, &vcpu->arch.ctxt.vfp.fpexc, id);
 	case KVM_REG_ARM_VFP_FPSCR:
-		return reg_to_user(uaddr, &vcpu->arch.vfp_guest.fpscr, id);
+		return reg_to_user(uaddr, &vcpu->arch.ctxt.vfp.fpscr, id);
 	case KVM_REG_ARM_VFP_FPINST:
-		return reg_to_user(uaddr, &vcpu->arch.vfp_guest.fpinst, id);
+		return reg_to_user(uaddr, &vcpu->arch.ctxt.vfp.fpinst, id);
 	case KVM_REG_ARM_VFP_FPINST2:
-		return reg_to_user(uaddr, &vcpu->arch.vfp_guest.fpinst2, id);
+		return reg_to_user(uaddr, &vcpu->arch.ctxt.vfp.fpinst2, id);
 	case KVM_REG_ARM_VFP_MVFR0:
 		val = fmrx(MVFR0);
 		return reg_to_user(uaddr, &val, id);
@@ -945,7 +959,7 @@ static int vfp_set_reg(struct kvm_vcpu *vcpu, u64 id, const void __user *uaddr)
 	if (vfpid < num_fp_regs()) {
 		if (KVM_REG_SIZE(id) != 8)
 			return -ENOENT;
-		return reg_from_user(&vcpu->arch.vfp_guest.fpregs[vfpid],
+		return reg_from_user(&vcpu->arch.ctxt.vfp.fpregs[vfpid],
 				     uaddr, id);
 	}
 
@@ -955,13 +969,13 @@ static int vfp_set_reg(struct kvm_vcpu *vcpu, u64 id, const void __user *uaddr)
 
 	switch (vfpid) {
 	case KVM_REG_ARM_VFP_FPEXC:
-		return reg_from_user(&vcpu->arch.vfp_guest.fpexc, uaddr, id);
+		return reg_from_user(&vcpu->arch.ctxt.vfp.fpexc, uaddr, id);
 	case KVM_REG_ARM_VFP_FPSCR:
-		return reg_from_user(&vcpu->arch.vfp_guest.fpscr, uaddr, id);
+		return reg_from_user(&vcpu->arch.ctxt.vfp.fpscr, uaddr, id);
 	case KVM_REG_ARM_VFP_FPINST:
-		return reg_from_user(&vcpu->arch.vfp_guest.fpinst, uaddr, id);
+		return reg_from_user(&vcpu->arch.ctxt.vfp.fpinst, uaddr, id);
 	case KVM_REG_ARM_VFP_FPINST2:
-		return reg_from_user(&vcpu->arch.vfp_guest.fpinst2, uaddr, id);
+		return reg_from_user(&vcpu->arch.ctxt.vfp.fpinst2, uaddr, id);
 	/* These are invariant. */
 	case KVM_REG_ARM_VFP_MVFR0:
 		if (reg_from_user(&val, uaddr, id))
@@ -1030,7 +1044,7 @@ int kvm_arm_coproc_get_reg(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg)
 		val = vcpu_cp15_reg64_get(vcpu, r);
 		ret = reg_to_user(uaddr, &val, reg->id);
 	} else if (KVM_REG_SIZE(reg->id) == 4) {
-		ret = reg_to_user(uaddr, &vcpu->arch.cp15[r->reg], reg->id);
+		ret = reg_to_user(uaddr, &vcpu_cp15(vcpu, r->reg), reg->id);
 	}
 
 	return ret;
@@ -1060,7 +1074,7 @@ int kvm_arm_coproc_set_reg(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg)
 		if (!ret)
 			vcpu_cp15_reg64_set(vcpu, r, val);
 	} else if (KVM_REG_SIZE(reg->id) == 4) {
-		ret = reg_from_user(&vcpu->arch.cp15[r->reg], uaddr, reg->id);
+		ret = reg_from_user(&vcpu_cp15(vcpu, r->reg), uaddr, reg->id);
 	}
 
 	return ret;
@@ -1096,7 +1110,7 @@ static int write_demux_regids(u64 __user *uindices)
 static u64 cp15_to_index(const struct coproc_reg *reg)
 {
 	u64 val = KVM_REG_ARM | (15 << KVM_REG_ARM_COPROC_SHIFT);
-	if (reg->is_64) {
+	if (reg->is_64bit) {
 		val |= KVM_REG_SIZE_U64;
 		val |= (reg->Op1 << KVM_REG_ARM_OPC1_SHIFT);
 		/*
@@ -1210,8 +1224,8 @@ void kvm_coproc_table_init(void)
 	unsigned int i;
 
 	/* Make sure tables are unique and in order. */
-	for (i = 1; i < ARRAY_SIZE(cp15_regs); i++)
-		BUG_ON(cmp_reg(&cp15_regs[i-1], &cp15_regs[i]) >= 0);
+	BUG_ON(check_reg_table(cp15_regs, ARRAY_SIZE(cp15_regs)));
+	BUG_ON(check_reg_table(invariant_cp15, ARRAY_SIZE(invariant_cp15)));
 
 	/* We abuse the reset function to overwrite the table itself. */
 	for (i = 0; i < ARRAY_SIZE(invariant_cp15); i++)
@@ -1248,7 +1262,7 @@ void kvm_reset_coprocs(struct kvm_vcpu *vcpu)
 	const struct coproc_reg *table;
 
 	/* Catch someone adding a register without putting in reset entry. */
-	memset(vcpu->arch.cp15, 0x42, sizeof(vcpu->arch.cp15));
+	memset(vcpu->arch.ctxt.cp15, 0x42, sizeof(vcpu->arch.ctxt.cp15));
 
 	/* Generic chip reset first (so target could override). */
 	reset_coproc_regs(vcpu, cp15_regs, ARRAY_SIZE(cp15_regs));
@@ -1257,6 +1271,6 @@ void kvm_reset_coprocs(struct kvm_vcpu *vcpu)
 	reset_coproc_regs(vcpu, table, num);
 
 	for (num = 1; num < NR_CP15_REGS; num++)
-		if (vcpu->arch.cp15[num] == 0x42424242)
-			panic("Didn't reset vcpu->arch.cp15[%zi]", num);
+		if (vcpu_cp15(vcpu, num) == 0x42424242)
+			panic("Didn't reset vcpu_cp15(vcpu, %zi)", num);
 }
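
One subtlety worth spelling out about the find_reg() change above: bsearch() is only valid because the packed key preserves the ordering that cmp_reg() imposes on the (now verified-sorted) tables. A worked example, written as a comment rather than patch code:

/* Illustration only.  For a 32-bit access to SCTLR (CRn = 1, CRm = 0,
 * Op1 = 0, Op2 = 0), reg_to_match_value() gives
 *
 *	(1 << 11) | (0 << 7) | (0 << 4) | (0 << 1) | !is_64bit
 *	 = 2048   |    0     |    0     |    0     |    1       = 2049
 *
 * A 64-bit register with the same CRn/CRm/Op1/Op2 encodes the low bit
 * as 0 and therefore compares lower, matching the tie-break
 * "return i2->is_64bit - i1->is_64bit" in cmp_reg().  Because the
 * fields occupy disjoint bit ranges, comparing keys is equivalent to
 * the field-by-field comparison of the old linear scan, so bsearch()
 * over a table that passes check_reg_table() finds the same entry.
 */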

+ 12 - 12
arch/arm/kvm/coproc.h

@@ -37,7 +37,7 @@ struct coproc_reg {
 	unsigned long Op1;
 	unsigned long Op2;
 
-	bool is_64;
+	bool is_64bit;
 
 	/* Trapped access from guest, if non-NULL. */
 	bool (*access)(struct kvm_vcpu *,
@@ -47,7 +47,7 @@ struct coproc_reg {
 	/* Initialization for vcpu. */
 	void (*reset)(struct kvm_vcpu *, const struct coproc_reg *);
 
-	/* Index into vcpu->arch.cp15[], or 0 if we don't need to save it. */
+	/* Index into vcpu_cp15(vcpu, ...), or 0 if we don't need to save it. */
 	unsigned long reg;
 
 	/* Value (usually reset value) */
@@ -104,25 +104,25 @@ static inline void reset_unknown(struct kvm_vcpu *vcpu,
 				 const struct coproc_reg *r)
 {
 	BUG_ON(!r->reg);
-	BUG_ON(r->reg >= ARRAY_SIZE(vcpu->arch.cp15));
-	vcpu->arch.cp15[r->reg] = 0xdecafbad;
+	BUG_ON(r->reg >= ARRAY_SIZE(vcpu->arch.ctxt.cp15));
+	vcpu_cp15(vcpu, r->reg) = 0xdecafbad;
 }
 
 static inline void reset_val(struct kvm_vcpu *vcpu, const struct coproc_reg *r)
 {
 	BUG_ON(!r->reg);
-	BUG_ON(r->reg >= ARRAY_SIZE(vcpu->arch.cp15));
-	vcpu->arch.cp15[r->reg] = r->val;
+	BUG_ON(r->reg >= ARRAY_SIZE(vcpu->arch.ctxt.cp15));
+	vcpu_cp15(vcpu, r->reg) = r->val;
 }
 
 static inline void reset_unknown64(struct kvm_vcpu *vcpu,
 				   const struct coproc_reg *r)
 {
 	BUG_ON(!r->reg);
-	BUG_ON(r->reg + 1 >= ARRAY_SIZE(vcpu->arch.cp15));
+	BUG_ON(r->reg + 1 >= ARRAY_SIZE(vcpu->arch.ctxt.cp15));
 
-	vcpu->arch.cp15[r->reg] = 0xdecafbad;
-	vcpu->arch.cp15[r->reg+1] = 0xd0c0ffee;
+	vcpu_cp15(vcpu, r->reg) = 0xdecafbad;
+	vcpu_cp15(vcpu, r->reg+1) = 0xd0c0ffee;
 }
 
 static inline int cmp_reg(const struct coproc_reg *i1,
@@ -141,7 +141,7 @@ static inline int cmp_reg(const struct coproc_reg *i1,
 		return i1->Op1 - i2->Op1;
 	if (i1->Op2 != i2->Op2)
 		return i1->Op2 - i2->Op2;
-	return i2->is_64 - i1->is_64;
+	return i2->is_64bit - i1->is_64bit;
 }
 
 
@@ -150,8 +150,8 @@ static inline int cmp_reg(const struct coproc_reg *i1,
 #define CRm64(_x)       .CRn = _x, .CRm = 0
 #define Op1(_x) 	.Op1 = _x
 #define Op2(_x) 	.Op2 = _x
-#define is64		.is_64 = true
-#define is32		.is_64 = false
+#define is64		.is_64bit = true
+#define is32		.is_64bit = false
 
 bool access_vm_reg(struct kvm_vcpu *vcpu,
 		   const struct coproc_params *p,

+ 17 - 17
arch/arm/kvm/emulate.c

@@ -112,7 +112,7 @@ static const unsigned long vcpu_reg_offsets[VCPU_NR_MODES][15] = {
  */
 unsigned long *vcpu_reg(struct kvm_vcpu *vcpu, u8 reg_num)
 {
-	unsigned long *reg_array = (unsigned long *)&vcpu->arch.regs;
+	unsigned long *reg_array = (unsigned long *)&vcpu->arch.ctxt.gp_regs;
 	unsigned long mode = *vcpu_cpsr(vcpu) & MODE_MASK;
 
 	switch (mode) {
@@ -147,15 +147,15 @@ unsigned long *vcpu_spsr(struct kvm_vcpu *vcpu)
 	unsigned long mode = *vcpu_cpsr(vcpu) & MODE_MASK;
 	switch (mode) {
 	case SVC_MODE:
-		return &vcpu->arch.regs.KVM_ARM_SVC_spsr;
+		return &vcpu->arch.ctxt.gp_regs.KVM_ARM_SVC_spsr;
 	case ABT_MODE:
-		return &vcpu->arch.regs.KVM_ARM_ABT_spsr;
+		return &vcpu->arch.ctxt.gp_regs.KVM_ARM_ABT_spsr;
 	case UND_MODE:
-		return &vcpu->arch.regs.KVM_ARM_UND_spsr;
+		return &vcpu->arch.ctxt.gp_regs.KVM_ARM_UND_spsr;
 	case IRQ_MODE:
-		return &vcpu->arch.regs.KVM_ARM_IRQ_spsr;
+		return &vcpu->arch.ctxt.gp_regs.KVM_ARM_IRQ_spsr;
 	case FIQ_MODE:
-		return &vcpu->arch.regs.KVM_ARM_FIQ_spsr;
+		return &vcpu->arch.ctxt.gp_regs.KVM_ARM_FIQ_spsr;
 	default:
 		BUG();
 	}
@@ -266,8 +266,8 @@ void kvm_skip_instr(struct kvm_vcpu *vcpu, bool is_wide_instr)
 
 static u32 exc_vector_base(struct kvm_vcpu *vcpu)
 {
-	u32 sctlr = vcpu->arch.cp15[c1_SCTLR];
-	u32 vbar = vcpu->arch.cp15[c12_VBAR];
+	u32 sctlr = vcpu_cp15(vcpu, c1_SCTLR);
+	u32 vbar = vcpu_cp15(vcpu, c12_VBAR);
 
 	if (sctlr & SCTLR_V)
 		return 0xffff0000;
@@ -282,7 +282,7 @@ static u32 exc_vector_base(struct kvm_vcpu *vcpu)
 static void kvm_update_psr(struct kvm_vcpu *vcpu, unsigned long mode)
 {
 	unsigned long cpsr = *vcpu_cpsr(vcpu);
-	u32 sctlr = vcpu->arch.cp15[c1_SCTLR];
+	u32 sctlr = vcpu_cp15(vcpu, c1_SCTLR);
 
 	*vcpu_cpsr(vcpu) = (cpsr & ~MODE_MASK) | mode;
 
@@ -357,22 +357,22 @@ static void inject_abt(struct kvm_vcpu *vcpu, bool is_pabt, unsigned long addr)
 
 	if (is_pabt) {
 		/* Set IFAR and IFSR */
-		vcpu->arch.cp15[c6_IFAR] = addr;
-		is_lpae = (vcpu->arch.cp15[c2_TTBCR] >> 31);
+		vcpu_cp15(vcpu, c6_IFAR) = addr;
+		is_lpae = (vcpu_cp15(vcpu, c2_TTBCR) >> 31);
 		/* Always give debug fault for now - should give guest a clue */
 		if (is_lpae)
-			vcpu->arch.cp15[c5_IFSR] = 1 << 9 | 0x22;
+			vcpu_cp15(vcpu, c5_IFSR) = 1 << 9 | 0x22;
 		else
-			vcpu->arch.cp15[c5_IFSR] = 2;
+			vcpu_cp15(vcpu, c5_IFSR) = 2;
 	} else { /* !iabt */
 		/* Set DFAR and DFSR */
-		vcpu->arch.cp15[c6_DFAR] = addr;
-		is_lpae = (vcpu->arch.cp15[c2_TTBCR] >> 31);
+		vcpu_cp15(vcpu, c6_DFAR) = addr;
+		is_lpae = (vcpu_cp15(vcpu, c2_TTBCR) >> 31);
 		/* Always give debug fault for now - should give guest a clue */
 		if (is_lpae)
-			vcpu->arch.cp15[c5_DFSR] = 1 << 9 | 0x22;
+			vcpu_cp15(vcpu, c5_DFSR) = 1 << 9 | 0x22;
 		else
-			vcpu->arch.cp15[c5_DFSR] = 2;
+			vcpu_cp15(vcpu, c5_DFSR) = 2;
 	}
 
 }
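
The fault injection code above keys the FSR format on bit 31 of TTBCR (TTBCR.EAE), which selects the long-descriptor (LPAE) translation table format. A hedged helper sketch; guest_uses_lpae() is hypothetical and not part of the patch, it only restates the is_lpae computation through the new accessor:

/* Hypothetical helper mirroring the is_lpae check in inject_abt():
 * TTBCR.EAE (bit 31) set means the guest uses the long-descriptor
 * format, so the LPAE FSR encoding (1 << 9 | 0x22) applies. */
static bool guest_uses_lpae(struct kvm_vcpu *vcpu)
{
	return vcpu_cp15(vcpu, c2_TTBCR) >> 31;
}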

+ 2 - 3
arch/arm/kvm/guest.c

@@ -25,7 +25,6 @@
 #include <asm/cputype.h>
 #include <asm/uaccess.h>
 #include <asm/kvm.h>
-#include <asm/kvm_asm.h>
 #include <asm/kvm_emulate.h>
 #include <asm/kvm_coproc.h>
 
@@ -55,7 +54,7 @@ static u64 core_reg_offset_from_id(u64 id)
 static int get_core_reg(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg)
 {
 	u32 __user *uaddr = (u32 __user *)(long)reg->addr;
-	struct kvm_regs *regs = &vcpu->arch.regs;
+	struct kvm_regs *regs = &vcpu->arch.ctxt.gp_regs;
 	u64 off;
 
 	if (KVM_REG_SIZE(reg->id) != 4)
@@ -72,7 +71,7 @@ static int get_core_reg(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg)
 static int set_core_reg(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg)
 {
 	u32 __user *uaddr = (u32 __user *)(long)reg->addr;
-	struct kvm_regs *regs = &vcpu->arch.regs;
+	struct kvm_regs *regs = &vcpu->arch.ctxt.gp_regs;
 	u64 off, val;
 
 	if (KVM_REG_SIZE(reg->id) != 4)
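
get_core_reg()/set_core_reg() treat the register id as a word offset into struct kvm_regs. From userspace the same registers are reached through the ONE_REG ioctls; a rough sketch, assuming the usual KVM_REG_ARM_CORE encoding from the uapi headers (vcpu_fd and read_guest_pc() are illustrative, and the snippet is not a drop-in program):

#include <linux/kvm.h>
#include <sys/ioctl.h>

/* Userspace sketch: fetch the guest PC as one 32-bit core register. */
static int read_guest_pc(int vcpu_fd, __u32 *val)
{
	struct kvm_one_reg one = {
		.id   = KVM_REG_ARM | KVM_REG_SIZE_U32 | KVM_REG_ARM_CORE |
			KVM_REG_ARM_CORE_REG(usr_regs.ARM_pc),
		.addr = (__u64)(unsigned long)val,
	};

	return ioctl(vcpu_fd, KVM_GET_ONE_REG, &one);
}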

+ 0 - 7
arch/arm/kvm/handle_exit.c

@@ -147,13 +147,6 @@ int handle_exit(struct kvm_vcpu *vcpu, struct kvm_run *run,
 	switch (exception_index) {
 	case ARM_EXCEPTION_IRQ:
 		return 1;
-	case ARM_EXCEPTION_UNDEFINED:
-		kvm_err("Undefined exception in Hyp mode at: %#08lx\n",
-			kvm_vcpu_get_hyp_pc(vcpu));
-		BUG();
-		panic("KVM: Hypervisor undefined exception!\n");
-	case ARM_EXCEPTION_DATA_ABORT:
-	case ARM_EXCEPTION_PREF_ABORT:
 	case ARM_EXCEPTION_HVC:
 		/*
 		 * See ARM ARM B1.14.1: "Hyp traps on instructions

+ 17 - 0
arch/arm/kvm/hyp/Makefile

@@ -0,0 +1,17 @@
+#
+# Makefile for Kernel-based Virtual Machine module, HYP part
+#
+
+KVM=../../../../virt/kvm
+
+obj-$(CONFIG_KVM_ARM_HOST) += $(KVM)/arm/hyp/vgic-v2-sr.o
+obj-$(CONFIG_KVM_ARM_HOST) += $(KVM)/arm/hyp/timer-sr.o
+
+obj-$(CONFIG_KVM_ARM_HOST) += tlb.o
+obj-$(CONFIG_KVM_ARM_HOST) += cp15-sr.o
+obj-$(CONFIG_KVM_ARM_HOST) += vfp.o
+obj-$(CONFIG_KVM_ARM_HOST) += banked-sr.o
+obj-$(CONFIG_KVM_ARM_HOST) += entry.o
+obj-$(CONFIG_KVM_ARM_HOST) += hyp-entry.o
+obj-$(CONFIG_KVM_ARM_HOST) += switch.o
+obj-$(CONFIG_KVM_ARM_HOST) += s2-setup.o

+ 77 - 0
arch/arm/kvm/hyp/banked-sr.c

@@ -0,0 +1,77 @@
+/*
+ * Original code:
+ * Copyright (C) 2012 - Virtual Open Systems and Columbia University
+ * Author: Christoffer Dall <c.dall@virtualopensystems.com>
+ *
+ * Mostly rewritten in C by Marc Zyngier <marc.zyngier@arm.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <asm/kvm_hyp.h>
+
+__asm__(".arch_extension     virt");
+
+void __hyp_text __banked_save_state(struct kvm_cpu_context *ctxt)
+{
+	ctxt->gp_regs.usr_regs.ARM_sp	= read_special(SP_usr);
+	ctxt->gp_regs.usr_regs.ARM_pc	= read_special(ELR_hyp);
+	ctxt->gp_regs.usr_regs.ARM_cpsr	= read_special(SPSR);
+	ctxt->gp_regs.KVM_ARM_SVC_sp	= read_special(SP_svc);
+	ctxt->gp_regs.KVM_ARM_SVC_lr	= read_special(LR_svc);
+	ctxt->gp_regs.KVM_ARM_SVC_spsr	= read_special(SPSR_svc);
+	ctxt->gp_regs.KVM_ARM_ABT_sp	= read_special(SP_abt);
+	ctxt->gp_regs.KVM_ARM_ABT_lr	= read_special(LR_abt);
+	ctxt->gp_regs.KVM_ARM_ABT_spsr	= read_special(SPSR_abt);
+	ctxt->gp_regs.KVM_ARM_UND_sp	= read_special(SP_und);
+	ctxt->gp_regs.KVM_ARM_UND_lr	= read_special(LR_und);
+	ctxt->gp_regs.KVM_ARM_UND_spsr	= read_special(SPSR_und);
+	ctxt->gp_regs.KVM_ARM_IRQ_sp	= read_special(SP_irq);
+	ctxt->gp_regs.KVM_ARM_IRQ_lr	= read_special(LR_irq);
+	ctxt->gp_regs.KVM_ARM_IRQ_spsr	= read_special(SPSR_irq);
+	ctxt->gp_regs.KVM_ARM_FIQ_r8	= read_special(R8_fiq);
+	ctxt->gp_regs.KVM_ARM_FIQ_r9	= read_special(R9_fiq);
+	ctxt->gp_regs.KVM_ARM_FIQ_r10	= read_special(R10_fiq);
+	ctxt->gp_regs.KVM_ARM_FIQ_fp	= read_special(R11_fiq);
+	ctxt->gp_regs.KVM_ARM_FIQ_ip	= read_special(R12_fiq);
+	ctxt->gp_regs.KVM_ARM_FIQ_sp	= read_special(SP_fiq);
+	ctxt->gp_regs.KVM_ARM_FIQ_lr	= read_special(LR_fiq);
+	ctxt->gp_regs.KVM_ARM_FIQ_spsr	= read_special(SPSR_fiq);
+}
+
+void __hyp_text __banked_restore_state(struct kvm_cpu_context *ctxt)
+{
+	write_special(ctxt->gp_regs.usr_regs.ARM_sp,	SP_usr);
+	write_special(ctxt->gp_regs.usr_regs.ARM_pc,	ELR_hyp);
+	write_special(ctxt->gp_regs.usr_regs.ARM_cpsr,	SPSR_cxsf);
+	write_special(ctxt->gp_regs.KVM_ARM_SVC_sp,	SP_svc);
+	write_special(ctxt->gp_regs.KVM_ARM_SVC_lr,	LR_svc);
+	write_special(ctxt->gp_regs.KVM_ARM_SVC_spsr,	SPSR_svc);
+	write_special(ctxt->gp_regs.KVM_ARM_ABT_sp,	SP_abt);
+	write_special(ctxt->gp_regs.KVM_ARM_ABT_lr,	LR_abt);
+	write_special(ctxt->gp_regs.KVM_ARM_ABT_spsr,	SPSR_abt);
+	write_special(ctxt->gp_regs.KVM_ARM_UND_sp,	SP_und);
+	write_special(ctxt->gp_regs.KVM_ARM_UND_lr,	LR_und);
+	write_special(ctxt->gp_regs.KVM_ARM_UND_spsr,	SPSR_und);
+	write_special(ctxt->gp_regs.KVM_ARM_IRQ_sp,	SP_irq);
+	write_special(ctxt->gp_regs.KVM_ARM_IRQ_lr,	LR_irq);
+	write_special(ctxt->gp_regs.KVM_ARM_IRQ_spsr,	SPSR_irq);
+	write_special(ctxt->gp_regs.KVM_ARM_FIQ_r8,	R8_fiq);
+	write_special(ctxt->gp_regs.KVM_ARM_FIQ_r9,	R9_fiq);
+	write_special(ctxt->gp_regs.KVM_ARM_FIQ_r10,	R10_fiq);
+	write_special(ctxt->gp_regs.KVM_ARM_FIQ_fp,	R11_fiq);
+	write_special(ctxt->gp_regs.KVM_ARM_FIQ_ip,	R12_fiq);
+	write_special(ctxt->gp_regs.KVM_ARM_FIQ_sp,	SP_fiq);
+	write_special(ctxt->gp_regs.KVM_ARM_FIQ_lr,	LR_fiq);
+	write_special(ctxt->gp_regs.KVM_ARM_FIQ_spsr,	SPSR_fiq);
+}
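
__banked_save_state()/__banked_restore_state() lean on the read_special()/write_special() helpers from the new asm/kvm_hyp.h added in this series. Their definitions are not shown in these hunks; a plausible sketch is a thin mrs/msr wrapper that pastes the banked register name into the instruction (__stringify is from linux/stringify.h):

/* Plausible shape of the helpers; the real ones in asm/kvm_hyp.h may
 * differ in detail. */
#define read_special(r) ({						\
	u32 __val;							\
	asm volatile("mrs %0, " __stringify(r) : "=r" (__val));	\
	__val;								\
})

#define write_special(v, r)						\
	asm volatile("msr " __stringify(r) ", %0" : : "r" (v))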

+ 84 - 0
arch/arm/kvm/hyp/cp15-sr.c

@@ -0,0 +1,84 @@
+/*
+ * Original code:
+ * Copyright (C) 2012 - Virtual Open Systems and Columbia University
+ * Author: Christoffer Dall <c.dall@virtualopensystems.com>
+ *
+ * Mostly rewritten in C by Marc Zyngier <marc.zyngier@arm.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <asm/kvm_hyp.h>
+
+static u64 *cp15_64(struct kvm_cpu_context *ctxt, int idx)
+{
+	return (u64 *)(ctxt->cp15 + idx);
+}
+
+void __hyp_text __sysreg_save_state(struct kvm_cpu_context *ctxt)
+{
+	ctxt->cp15[c0_MPIDR]		= read_sysreg(VMPIDR);
+	ctxt->cp15[c0_CSSELR]		= read_sysreg(CSSELR);
+	ctxt->cp15[c1_SCTLR]		= read_sysreg(SCTLR);
+	ctxt->cp15[c1_CPACR]		= read_sysreg(CPACR);
+	*cp15_64(ctxt, c2_TTBR0)	= read_sysreg(TTBR0);
+	*cp15_64(ctxt, c2_TTBR1)	= read_sysreg(TTBR1);
+	ctxt->cp15[c2_TTBCR]		= read_sysreg(TTBCR);
+	ctxt->cp15[c3_DACR]		= read_sysreg(DACR);
+	ctxt->cp15[c5_DFSR]		= read_sysreg(DFSR);
+	ctxt->cp15[c5_IFSR]		= read_sysreg(IFSR);
+	ctxt->cp15[c5_ADFSR]		= read_sysreg(ADFSR);
+	ctxt->cp15[c5_AIFSR]		= read_sysreg(AIFSR);
+	ctxt->cp15[c6_DFAR]		= read_sysreg(DFAR);
+	ctxt->cp15[c6_IFAR]		= read_sysreg(IFAR);
+	*cp15_64(ctxt, c7_PAR)		= read_sysreg(PAR);
+	ctxt->cp15[c10_PRRR]		= read_sysreg(PRRR);
+	ctxt->cp15[c10_NMRR]		= read_sysreg(NMRR);
+	ctxt->cp15[c10_AMAIR0]		= read_sysreg(AMAIR0);
+	ctxt->cp15[c10_AMAIR1]		= read_sysreg(AMAIR1);
+	ctxt->cp15[c12_VBAR]		= read_sysreg(VBAR);
+	ctxt->cp15[c13_CID]		= read_sysreg(CID);
+	ctxt->cp15[c13_TID_URW]		= read_sysreg(TID_URW);
+	ctxt->cp15[c13_TID_URO]		= read_sysreg(TID_URO);
+	ctxt->cp15[c13_TID_PRIV]	= read_sysreg(TID_PRIV);
+	ctxt->cp15[c14_CNTKCTL]		= read_sysreg(CNTKCTL);
+}
+
+void __hyp_text __sysreg_restore_state(struct kvm_cpu_context *ctxt)
+{
+	write_sysreg(ctxt->cp15[c0_MPIDR],	VMPIDR);
+	write_sysreg(ctxt->cp15[c0_CSSELR],	CSSELR);
+	write_sysreg(ctxt->cp15[c1_SCTLR],	SCTLR);
+	write_sysreg(ctxt->cp15[c1_CPACR],	CPACR);
+	write_sysreg(*cp15_64(ctxt, c2_TTBR0),	TTBR0);
+	write_sysreg(*cp15_64(ctxt, c2_TTBR1),	TTBR1);
+	write_sysreg(ctxt->cp15[c2_TTBCR],	TTBCR);
+	write_sysreg(ctxt->cp15[c3_DACR],	DACR);
+	write_sysreg(ctxt->cp15[c5_DFSR],	DFSR);
+	write_sysreg(ctxt->cp15[c5_IFSR],	IFSR);
+	write_sysreg(ctxt->cp15[c5_ADFSR],	ADFSR);
+	write_sysreg(ctxt->cp15[c5_AIFSR],	AIFSR);
+	write_sysreg(ctxt->cp15[c6_DFAR],	DFAR);
+	write_sysreg(ctxt->cp15[c6_IFAR],	IFAR);
+	write_sysreg(*cp15_64(ctxt, c7_PAR),	PAR);
+	write_sysreg(ctxt->cp15[c10_PRRR],	PRRR);
+	write_sysreg(ctxt->cp15[c10_NMRR],	NMRR);
+	write_sysreg(ctxt->cp15[c10_AMAIR0],	AMAIR0);
+	write_sysreg(ctxt->cp15[c10_AMAIR1],	AMAIR1);
+	write_sysreg(ctxt->cp15[c12_VBAR],	VBAR);
+	write_sysreg(ctxt->cp15[c13_CID],	CID);
+	write_sysreg(ctxt->cp15[c13_TID_URW],	TID_URW);
+	write_sysreg(ctxt->cp15[c13_TID_URO],	TID_URO);
+	write_sysreg(ctxt->cp15[c13_TID_PRIV],	TID_PRIV);
+	write_sysreg(ctxt->cp15[c14_CNTKCTL],	CNTKCTL);
+}
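
read_sysreg()/write_sysreg() take a symbolic register name (VMPIDR, SCTLR, TTBCR, ...) rather than a raw mrc/mcr encoding. A sketch of how such accessors can be built; the encodings shown are the ones visible in the old interrupts_head.S removed later in this series (TTBCR is p15, 0, c2, c0, 2), while the actual macro plumbing in asm/kvm_hyp.h, including the mrrc/mcrr handling for the 64-bit TTBRx/PAR accesses, may differ:

/* Sketch: map a symbolic name to its cp15 encoding, then expand it
 * into an mrc/mcr. 64-bit registers would need mrrc/mcrr variants. */
#define __ACCESS_CP15(CRn, Op1, CRm, Op2)	p15, Op1, %0, CRn, CRm, Op2

#define SCTLR	__ACCESS_CP15(c1, 0, c0, 0)
#define TTBCR	__ACCESS_CP15(c2, 0, c0, 2)

#define read_sysreg(r) ({					\
	u32 __val;						\
	asm volatile("mrc " __stringify(r) : "=r" (__val));	\
	__val;							\
})

#define write_sysreg(v, r)					\
	asm volatile("mcr " __stringify(r) : : "r" (v))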

+ 101 - 0
arch/arm/kvm/hyp/entry.S

@@ -0,0 +1,101 @@
+/*
+ * Copyright (C) 2016 - ARM Ltd
+ * Author: Marc Zyngier <marc.zyngier@arm.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include <linux/linkage.h>
+#include <asm/asm-offsets.h>
+#include <asm/kvm_arm.h>
+
+	.arch_extension     virt
+
+	.text
+	.pushsection	.hyp.text, "ax"
+
+#define USR_REGS_OFFSET		(CPU_CTXT_GP_REGS + GP_REGS_USR)
+
+/* int __guest_enter(struct kvm_vcpu *vcpu, struct kvm_cpu_context *host) */
+ENTRY(__guest_enter)
+	@ Save host registers
+	add	r1, r1, #(USR_REGS_OFFSET + S_R4)
+	stm	r1!, {r4-r12}
+	str	lr, [r1, #4]	@ Skip SP_usr (already saved)
+
+	@ Restore guest registers
+	add	r0, r0,  #(VCPU_GUEST_CTXT + USR_REGS_OFFSET + S_R0)
+	ldr	lr, [r0, #S_LR]
+	ldm	r0, {r0-r12}
+
+	clrex
+	eret
+ENDPROC(__guest_enter)
+
+ENTRY(__guest_exit)
+	/*
+	 * return convention:
+	 * guest r0, r1, r2 saved on the stack
+	 * r0: vcpu pointer
+	 * r1: exception code
+	 */
+
+	add	r2, r0, #(VCPU_GUEST_CTXT + USR_REGS_OFFSET + S_R3)
+	stm	r2!, {r3-r12}
+	str	lr, [r2, #4]
+	add	r2, r0, #(VCPU_GUEST_CTXT + USR_REGS_OFFSET + S_R0)
+	pop	{r3, r4, r5}		@ r0, r1, r2
+	stm	r2, {r3-r5}
+
+	ldr	r0, [r0, #VCPU_HOST_CTXT]
+	add	r0, r0, #(USR_REGS_OFFSET + S_R4)
+	ldm	r0!, {r4-r12}
+	ldr	lr, [r0, #4]
+
+	mov	r0, r1
+	bx	lr
+ENDPROC(__guest_exit)
+
+/*
+ * If VFPv3 support is not available, then we will not switch the VFP
+ * registers; however cp10 and cp11 accesses will still trap and fallback
+ * to the regular coprocessor emulation code, which currently will
+ * inject an undefined exception to the guest.
+ */
+#ifdef CONFIG_VFPv3
+ENTRY(__vfp_guest_restore)
+	push	{r3, r4, lr}
+
+	@ NEON/VFP used.  Turn on VFP access.
+	mrc	p15, 4, r1, c1, c1, 2		@ HCPTR
+	bic	r1, r1, #(HCPTR_TCP(10) | HCPTR_TCP(11))
+	mcr	p15, 4, r1, c1, c1, 2		@ HCPTR
+	isb
+
+	@ Switch VFP/NEON hardware state to the guest's
+	mov	r4, r0
+	ldr	r0, [r0, #VCPU_HOST_CTXT]
+	add	r0, r0, #CPU_CTXT_VFP
+	bl	__vfp_save_state
+	add	r0, r4, #(VCPU_GUEST_CTXT + CPU_CTXT_VFP)
+	bl	__vfp_restore_state
+
+	pop	{r3, r4, lr}
+	pop	{r0, r1, r2}
+	clrex
+	eret
+ENDPROC(__vfp_guest_restore)
+#endif
+
+	.popsection
+
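
__guest_enter/__guest_exit address struct kvm_vcpu and struct kvm_cpu_context through asm-offsets constants (VCPU_GUEST_CTXT, VCPU_HOST_CTXT, CPU_CTXT_GP_REGS, CPU_CTXT_VFP, GP_REGS_USR). Those are generated from arch/arm/kernel/asm-offsets.c; a rough sketch of the entries the assembly implies, expressed the usual DEFINE()/offsetof() way (illustrative, the real file may differ):

/* Sketch of asm-offsets entries matching the constants used above. */
DEFINE(VCPU_GUEST_CTXT,  offsetof(struct kvm_vcpu, arch.ctxt));
DEFINE(VCPU_HOST_CTXT,   offsetof(struct kvm_vcpu, arch.host_cpu_context));
DEFINE(CPU_CTXT_GP_REGS, offsetof(struct kvm_cpu_context, gp_regs));
DEFINE(CPU_CTXT_VFP,     offsetof(struct kvm_cpu_context, vfp));
DEFINE(GP_REGS_USR,      offsetof(struct kvm_regs, usr_regs));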

+ 169 - 0
arch/arm/kvm/hyp/hyp-entry.S

@@ -0,0 +1,169 @@
+/*
+ * Copyright (C) 2012 - Virtual Open Systems and Columbia University
+ * Author: Christoffer Dall <c.dall@virtualopensystems.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License, version 2, as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
+ */
+
+#include <linux/linkage.h>
+#include <asm/kvm_arm.h>
+#include <asm/kvm_asm.h>
+
+	.arch_extension     virt
+
+	.text
+	.pushsection	.hyp.text, "ax"
+
+.macro load_vcpu	reg
+	mrc	p15, 4, \reg, c13, c0, 2	@ HTPIDR
+.endm
+
+/********************************************************************
+ * Hypervisor exception vector and handlers
+ *
+ *
+ * The KVM/ARM Hypervisor ABI is defined as follows:
+ *
+ * Entry to Hyp mode from the host kernel will happen _only_ when an HVC
+ * instruction is issued since all traps are disabled when running the host
+ * kernel as per the Hyp-mode initialization at boot time.
+ *
+ * HVC instructions cause a trap to the vector page + offset 0x14 (see hyp_hvc
+ * below) when the HVC instruction is called from SVC mode (i.e. a guest or the
+ * host kernel) and they cause a trap to the vector page + offset 0x8 when HVC
+ * instructions are called from within Hyp-mode.
+ *
+ * Hyp-ABI: Calling HYP-mode functions from host (in SVC mode):
+ *    Switching to Hyp mode is done through a simple HVC #0 instruction. The
+ *    exception vector code will check that the HVC comes from VMID==0.
+ *    - r0 contains a pointer to a HYP function
+ *    - r1, r2, and r3 contain arguments to the above function.
+ *    - The HYP function will be called with its arguments in r0, r1 and r2.
+ *    On HYP function return, we return directly to SVC.
+ *
+ * Note that the above is used to execute code in Hyp-mode from a host-kernel
+ * point of view, and is a different concept from performing a world-switch and
+ * executing guest code SVC mode (with a VMID != 0).
+ */
+
+	.align 5
+__kvm_hyp_vector:
+	.global __kvm_hyp_vector
+
+	@ Hyp-mode exception vector
+	W(b)	hyp_reset
+	W(b)	hyp_undef
+	W(b)	hyp_svc
+	W(b)	hyp_pabt
+	W(b)	hyp_dabt
+	W(b)	hyp_hvc
+	W(b)	hyp_irq
+	W(b)	hyp_fiq
+
+.macro invalid_vector label, cause
+	.align
+\label:	mov	r0, #\cause
+	b	__hyp_panic
+.endm
+
+	invalid_vector	hyp_reset	ARM_EXCEPTION_RESET
+	invalid_vector	hyp_undef	ARM_EXCEPTION_UNDEFINED
+	invalid_vector	hyp_svc		ARM_EXCEPTION_SOFTWARE
+	invalid_vector	hyp_pabt	ARM_EXCEPTION_PREF_ABORT
+	invalid_vector	hyp_dabt	ARM_EXCEPTION_DATA_ABORT
+	invalid_vector	hyp_fiq		ARM_EXCEPTION_FIQ
+
+ENTRY(__hyp_do_panic)
+	mrs	lr, cpsr
+	bic	lr, lr, #MODE_MASK
+	orr	lr, lr, #SVC_MODE
+THUMB(	orr	lr, lr, #PSR_T_BIT	)
+	msr	spsr_cxsf, lr
+	ldr	lr, =panic
+	msr	ELR_hyp, lr
+	ldr	lr, =kvm_call_hyp
+	clrex
+	eret
+ENDPROC(__hyp_do_panic)
+
+hyp_hvc:
+	/*
+	 * Getting here is either because of a trap from a guest,
+	 * or from executing HVC from the host kernel, which means
+	 * "do something in Hyp mode".
+	 */
+	push	{r0, r1, r2}
+
+	@ Check syndrome register
+	mrc	p15, 4, r1, c5, c2, 0	@ HSR
+	lsr	r0, r1, #HSR_EC_SHIFT
+	cmp	r0, #HSR_EC_HVC
+	bne	guest_trap		@ Not HVC instr.
+
+	/*
+	 * Let's check if the HVC came from VMID 0 and allow simple
+	 * switch to Hyp mode
+	 */
+	mrrc    p15, 6, r0, r2, c2
+	lsr     r2, r2, #16
+	and     r2, r2, #0xff
+	cmp     r2, #0
+	bne	guest_trap		@ Guest called HVC
+
+	/*
+	 * Getting here means host called HVC, we shift parameters and branch
+	 * to Hyp function.
+	 */
+	pop	{r0, r1, r2}
+
+	/* Check for __hyp_get_vectors */
+	cmp	r0, #-1
+	mrceq	p15, 4, r0, c12, c0, 0	@ get HVBAR
+	beq	1f
+
+	push	{lr}
+
+	mov	lr, r0
+	mov	r0, r1
+	mov	r1, r2
+	mov	r2, r3
+
+THUMB(	orr	lr, #1)
+	blx	lr			@ Call the HYP function
+
+	pop	{lr}
+1:	eret
+
+guest_trap:
+	load_vcpu r0			@ Load VCPU pointer to r0
+
+#ifdef CONFIG_VFPv3
+	@ Check for a VFP access
+	lsr	r1, r1, #HSR_EC_SHIFT
+	cmp	r1, #HSR_EC_CP_0_13
+	beq	__vfp_guest_restore
+#endif
+
+	mov	r1, #ARM_EXCEPTION_HVC
+	b	__guest_exit
+
+hyp_irq:
+	push	{r0, r1, r2}
+	mov	r1, #ARM_EXCEPTION_IRQ
+	load_vcpu r0			@ Load VCPU pointer to r0
+	b	__guest_exit
+
+	.ltorg
+
+	.popsection
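
The Hyp-ABI comment above is the contract behind kvm_call_hyp() (kept in interrupts.S further down): the HYP function pointer goes in r0, up to three arguments in r1-r3, and hyp_hvc shifts them into r0-r2 before branching. Illustrative host-side calls, using prototypes that appear elsewhere in this series:

/* Host side of the ABI documented above: issue HVC #0 with the
 * function pointer and arguments; execution resumes in SVC on return. */
kvm_call_hyp(__kvm_flush_vm_context);
kvm_call_hyp(__kvm_tlb_flush_vmid_ipa, kvm, ipa);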

+ 33 - 0
arch/arm/kvm/hyp/s2-setup.c

@@ -0,0 +1,33 @@
+/*
+ * Copyright (C) 2016 - ARM Ltd
+ * Author: Marc Zyngier <marc.zyngier@arm.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <linux/types.h>
+#include <asm/kvm_arm.h>
+#include <asm/kvm_asm.h>
+#include <asm/kvm_hyp.h>
+
+void __hyp_text __init_stage2_translation(void)
+{
+	u64 val;
+
+	val = read_sysreg(VTCR) & ~VTCR_MASK;
+
+	val |= read_sysreg(HTCR) & VTCR_HTCR_SH;
+	val |= KVM_VTCR_SL0 | KVM_VTCR_T0SZ | KVM_VTCR_S;
+
+	write_sysreg(val, VTCR);
+}

+ 232 - 0
arch/arm/kvm/hyp/switch.c

@@ -0,0 +1,232 @@
+/*
+ * Copyright (C) 2015 - ARM Ltd
+ * Author: Marc Zyngier <marc.zyngier@arm.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <asm/kvm_asm.h>
+#include <asm/kvm_hyp.h>
+
+__asm__(".arch_extension     virt");
+
+/*
+ * Activate the traps, saving the host's fpexc register before
+ * overwriting it. We'll restore it on VM exit.
+ */
+static void __hyp_text __activate_traps(struct kvm_vcpu *vcpu, u32 *fpexc_host)
+{
+	u32 val;
+
+	/*
+	 * We are about to set HCPTR.TCP10/11 to trap all floating point
+	 * register accesses to HYP, however, the ARM ARM clearly states that
+	 * traps are only taken to HYP if the operation would not otherwise
+	 * trap to SVC.  Therefore, always make sure that for 32-bit guests,
+	 * we set FPEXC.EN to prevent traps to SVC, when setting the TCP bits.
+	 */
+	val = read_sysreg(VFP_FPEXC);
+	*fpexc_host = val;
+	if (!(val & FPEXC_EN)) {
+		write_sysreg(val | FPEXC_EN, VFP_FPEXC);
+		isb();
+	}
+
+	write_sysreg(vcpu->arch.hcr | vcpu->arch.irq_lines, HCR);
+	/* Trap on AArch32 cp15 c15 accesses (EL1 or EL0) */
+	write_sysreg(HSTR_T(15), HSTR);
+	write_sysreg(HCPTR_TTA | HCPTR_TCP(10) | HCPTR_TCP(11), HCPTR);
+	val = read_sysreg(HDCR);
+	write_sysreg(val | HDCR_TPM | HDCR_TPMCR, HDCR);
+}
+
+static void __hyp_text __deactivate_traps(struct kvm_vcpu *vcpu)
+{
+	u32 val;
+
+	write_sysreg(0, HCR);
+	write_sysreg(0, HSTR);
+	val = read_sysreg(HDCR);
+	write_sysreg(val & ~(HDCR_TPM | HDCR_TPMCR), HDCR);
+	write_sysreg(0, HCPTR);
+}
+
+static void __hyp_text __activate_vm(struct kvm_vcpu *vcpu)
+{
+	struct kvm *kvm = kern_hyp_va(vcpu->kvm);
+	write_sysreg(kvm->arch.vttbr, VTTBR);
+	write_sysreg(vcpu->arch.midr, VPIDR);
+}
+
+static void __hyp_text __deactivate_vm(struct kvm_vcpu *vcpu)
+{
+	write_sysreg(0, VTTBR);
+	write_sysreg(read_sysreg(MIDR), VPIDR);
+}
+
+static void __hyp_text __vgic_save_state(struct kvm_vcpu *vcpu)
+{
+	__vgic_v2_save_state(vcpu);
+}
+
+static void __hyp_text __vgic_restore_state(struct kvm_vcpu *vcpu)
+{
+	__vgic_v2_restore_state(vcpu);
+}
+
+static bool __hyp_text __populate_fault_info(struct kvm_vcpu *vcpu)
+{
+	u32 hsr = read_sysreg(HSR);
+	u8 ec = hsr >> HSR_EC_SHIFT;
+	u32 hpfar, far;
+
+	vcpu->arch.fault.hsr = hsr;
+
+	if (ec == HSR_EC_IABT)
+		far = read_sysreg(HIFAR);
+	else if (ec == HSR_EC_DABT)
+		far = read_sysreg(HDFAR);
+	else
+		return true;
+
+	/*
+	 * B3.13.5 Reporting exceptions taken to the Non-secure PL2 mode:
+	 *
+	 * Abort on the stage 2 translation for a memory access from a
+	 * Non-secure PL1 or PL0 mode:
+	 *
+	 * For any Access flag fault or Translation fault, and also for any
+	 * Permission fault on the stage 2 translation of a memory access
+	 * made as part of a translation table walk for a stage 1 translation,
+	 * the HPFAR holds the IPA that caused the fault. Otherwise, the HPFAR
+	 * is UNKNOWN.
+	 */
+	if (!(hsr & HSR_DABT_S1PTW) && (hsr & HSR_FSC_TYPE) == FSC_PERM) {
+		u64 par, tmp;
+
+		par = read_sysreg(PAR);
+		write_sysreg(far, ATS1CPR);
+		isb();
+
+		tmp = read_sysreg(PAR);
+		write_sysreg(par, PAR);
+
+		if (unlikely(tmp & 1))
+			return false; /* Translation failed, back to guest */
+
+		hpfar = ((tmp >> 12) & ((1UL << 28) - 1)) << 4;
+	} else {
+		hpfar = read_sysreg(HPFAR);
+	}
+
+	vcpu->arch.fault.hxfar = far;
+	vcpu->arch.fault.hpfar = hpfar;
+	return true;
+}
+
+static int __hyp_text __guest_run(struct kvm_vcpu *vcpu)
+{
+	struct kvm_cpu_context *host_ctxt;
+	struct kvm_cpu_context *guest_ctxt;
+	bool fp_enabled;
+	u64 exit_code;
+	u32 fpexc;
+
+	vcpu = kern_hyp_va(vcpu);
+	write_sysreg(vcpu, HTPIDR);
+
+	host_ctxt = kern_hyp_va(vcpu->arch.host_cpu_context);
+	guest_ctxt = &vcpu->arch.ctxt;
+
+	__sysreg_save_state(host_ctxt);
+	__banked_save_state(host_ctxt);
+
+	__activate_traps(vcpu, &fpexc);
+	__activate_vm(vcpu);
+
+	__vgic_restore_state(vcpu);
+	__timer_restore_state(vcpu);
+
+	__sysreg_restore_state(guest_ctxt);
+	__banked_restore_state(guest_ctxt);
+
+	/* Jump in the fire! */
+again:
+	exit_code = __guest_enter(vcpu, host_ctxt);
+	/* And we're baaack! */
+
+	if (exit_code == ARM_EXCEPTION_HVC && !__populate_fault_info(vcpu))
+		goto again;
+
+	fp_enabled = __vfp_enabled();
+
+	__banked_save_state(guest_ctxt);
+	__sysreg_save_state(guest_ctxt);
+	__timer_save_state(vcpu);
+	__vgic_save_state(vcpu);
+
+	__deactivate_traps(vcpu);
+	__deactivate_vm(vcpu);
+
+	__banked_restore_state(host_ctxt);
+	__sysreg_restore_state(host_ctxt);
+
+	if (fp_enabled) {
+		__vfp_save_state(&guest_ctxt->vfp);
+		__vfp_restore_state(&host_ctxt->vfp);
+	}
+
+	write_sysreg(fpexc, VFP_FPEXC);
+
+	return exit_code;
+}
+
+__alias(__guest_run) int __kvm_vcpu_run(struct kvm_vcpu *vcpu);
+
+static const char * const __hyp_panic_string[] = {
+	[ARM_EXCEPTION_RESET]      = "\nHYP panic: RST   PC:%08x CPSR:%08x",
+	[ARM_EXCEPTION_UNDEFINED]  = "\nHYP panic: UNDEF PC:%08x CPSR:%08x",
+	[ARM_EXCEPTION_SOFTWARE]   = "\nHYP panic: SVC   PC:%08x CPSR:%08x",
+	[ARM_EXCEPTION_PREF_ABORT] = "\nHYP panic: PABRT PC:%08x CPSR:%08x",
+	[ARM_EXCEPTION_DATA_ABORT] = "\nHYP panic: DABRT PC:%08x ADDR:%08x",
+	[ARM_EXCEPTION_IRQ]        = "\nHYP panic: IRQ   PC:%08x CPSR:%08x",
+	[ARM_EXCEPTION_FIQ]        = "\nHYP panic: FIQ   PC:%08x CPSR:%08x",
+	[ARM_EXCEPTION_HVC]        = "\nHYP panic: HVC   PC:%08x CPSR:%08x",
+};
+
+void __hyp_text __noreturn __hyp_panic(int cause)
+{
+	u32 elr = read_special(ELR_hyp);
+	u32 val;
+
+	if (cause == ARM_EXCEPTION_DATA_ABORT)
+		val = read_sysreg(HDFAR);
+	else
+		val = read_special(SPSR);
+
+	if (read_sysreg(VTTBR)) {
+		struct kvm_vcpu *vcpu;
+		struct kvm_cpu_context *host_ctxt;
+
+		vcpu = (struct kvm_vcpu *)read_sysreg(HTPIDR);
+		host_ctxt = kern_hyp_va(vcpu->arch.host_cpu_context);
+		__deactivate_traps(vcpu);
+		__deactivate_vm(vcpu);
+		__sysreg_restore_state(host_ctxt);
+	}
+
+	/* Call panic for real */
+	__hyp_do_panic(__hyp_panic_string[cause], elr, val);
+
+	unreachable();
+}
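
__guest_run() is exported as __kvm_vcpu_run through __alias and is reached from the host via kvm_call_hyp(); the exit code it returns is what handle_exit() (trimmed in the handle_exit.c hunk earlier) dispatches on. A hedged sketch of the call site; run_once() is hypothetical and only approximates what the arm.c run loop does:

/* Sketch: one guest entry followed by exit dispatch. */
static int run_once(struct kvm_vcpu *vcpu, struct kvm_run *run)
{
	int exit_code;

	/* World switch: save host state, run the guest, come back. */
	exit_code = kvm_call_hyp(__kvm_vcpu_run, vcpu);

	/* handle_exit() decides whether to re-enter or bail out. */
	return handle_exit(vcpu, run, exit_code);
}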

+ 70 - 0
arch/arm/kvm/hyp/tlb.c

@@ -0,0 +1,70 @@
+/*
+ * Original code:
+ * Copyright (C) 2012 - Virtual Open Systems and Columbia University
+ * Author: Christoffer Dall <c.dall@virtualopensystems.com>
+ *
+ * Mostly rewritten in C by Marc Zyngier <marc.zyngier@arm.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <asm/kvm_hyp.h>
+
+/**
+ * Flush per-VMID TLBs
+ *
+ * __kvm_tlb_flush_vmid(struct kvm *kvm);
+ *
+ * We rely on the hardware to broadcast the TLB invalidation to all CPUs
+ * inside the inner-shareable domain (which is the case for all v7
+ * implementations).  If we come across a non-IS SMP implementation, we'll
+ * have to use an IPI based mechanism. Until then, we stick to the simple
+ * hardware assisted version.
+ *
+ * As v7 does not support flushing per IPA, just nuke the whole TLB
+ * instead, ignoring the ipa value.
+ */
+static void __hyp_text __tlb_flush_vmid(struct kvm *kvm)
+{
+	dsb(ishst);
+
+	/* Switch to requested VMID */
+	kvm = kern_hyp_va(kvm);
+	write_sysreg(kvm->arch.vttbr, VTTBR);
+	isb();
+
+	write_sysreg(0, TLBIALLIS);
+	dsb(ish);
+	isb();
+
+	write_sysreg(0, VTTBR);
+}
+
+__alias(__tlb_flush_vmid) void __kvm_tlb_flush_vmid(struct kvm *kvm);
+
+static void __hyp_text __tlb_flush_vmid_ipa(struct kvm *kvm, phys_addr_t ipa)
+{
+	__tlb_flush_vmid(kvm);
+}
+
+__alias(__tlb_flush_vmid_ipa) void __kvm_tlb_flush_vmid_ipa(struct kvm *kvm,
+							    phys_addr_t ipa);
+
+static void __hyp_text __tlb_flush_vm_context(void)
+{
+	write_sysreg(0, TLBIALLNSNHIS);
+	write_sysreg(0, ICIALLUIS);
+	dsb(ish);
+}
+
+__alias(__tlb_flush_vm_context) void __kvm_flush_vm_context(void);
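
Each flush routine here is a static __hyp_text function whose externally visible __kvm_* name is produced with __alias(). A sketch of the attribute, assuming the usual compiler header definition:

/* Presumed expansion: the exported symbol becomes a GCC alias of the
 * static __hyp_text implementation (the real macro lives in
 * linux/compiler*.h). */
#define __alias(symbol)	__attribute__((alias(#symbol)))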

+ 68 - 0
arch/arm/kvm/hyp/vfp.S

@@ -0,0 +1,68 @@
+/*
+ * Copyright (C) 2012 - Virtual Open Systems and Columbia University
+ * Author: Christoffer Dall <c.dall@virtualopensystems.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <linux/linkage.h>
+#include <asm/vfpmacros.h>
+
+	.text
+	.pushsection	.hyp.text, "ax"
+
+/* void __vfp_save_state(struct vfp_hard_struct *vfp); */
+ENTRY(__vfp_save_state)
+	push	{r4, r5}
+	VFPFMRX	r1, FPEXC
+
+	@ Make sure *really* VFP is enabled so we can touch the registers.
+	orr	r5, r1, #FPEXC_EN
+	tst	r5, #FPEXC_EX		@ Check for VFP Subarchitecture
+	bic	r5, r5, #FPEXC_EX	@ FPEXC_EX disable
+	VFPFMXR	FPEXC, r5
+	isb
+
+	VFPFMRX	r2, FPSCR
+	beq	1f
+
+	@ If FPEXC_EX is 0, then FPINST/FPINST2 reads are unpredictable, so
+	@ we only need to save them if FPEXC_EX is set.
+	VFPFMRX r3, FPINST
+	tst	r5, #FPEXC_FP2V
+	VFPFMRX r4, FPINST2, ne		@ vmrsne
+1:
+	VFPFSTMIA r0, r5		@ Save VFP registers
+	stm	r0, {r1-r4}		@ Save FPEXC, FPSCR, FPINST, FPINST2
+	pop	{r4, r5}
+	bx	lr
+ENDPROC(__vfp_save_state)
+
+/* void __vfp_restore_state(struct vfp_hard_struct *vfp);
+ * Assume FPEXC_EN is on and FPEXC_EX is off */
+ENTRY(__vfp_restore_state)
+	VFPFLDMIA r0, r1		@ Load VFP registers
+	ldm	r0, {r0-r3}		@ Load FPEXC, FPSCR, FPINST, FPINST2
+
+	VFPFMXR FPSCR, r1
+	tst	r0, #FPEXC_EX		@ Check for VFP Subarchitecture
+	beq	1f
+	VFPFMXR FPINST, r2
+	tst	r0, #FPEXC_FP2V
+	VFPFMXR FPINST2, r3, ne
+1:
+	VFPFMXR FPEXC, r0		@ FPEXC	(last, in case !EN)
+	bx	lr
+ENDPROC(__vfp_restore_state)
+
+	.popsection
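
__vfp_save_state()/__vfp_restore_state() write the register bank with VFPFSTMIA/VFPFLDMIA and then the control words with stm/ldm, which assumes struct vfp_hard_struct puts the d-registers first, followed by FPEXC, FPSCR, FPINST and FPINST2. A simplified view of that layout (see asm/fpstate.h for the real definition, which carries extra conditional fields):

/* Simplified sketch of the layout the code above relies on. */
struct vfp_hard_struct {
	__u64 fpregs[32];	/* d0-d31, saved by VFPFSTMIA */
	__u32 fpexc;		/* start of the stm/ldm {r1-r4} block */
	__u32 fpscr;
	__u32 fpinst;		/* only meaningful when FPEXC_EX is set */
	__u32 fpinst2;		/* only meaningful when FPEXC_FP2V is set */
};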

+ 0 - 8
arch/arm/kvm/init.S

@@ -84,14 +84,6 @@ __do_hyp_init:
 	orr	r0, r0, r1
 	mcr	p15, 4, r0, c2, c0, 2	@ HTCR
 
-	mrc	p15, 4, r1, c2, c1, 2	@ VTCR
-	ldr	r2, =VTCR_MASK
-	bic	r1, r1, r2
-	bic	r0, r0, #(~VTCR_HTCR_SH)	@ clear non-reusable HTCR bits
-	orr	r1, r0, r1
-	orr	r1, r1, #(KVM_VTCR_SL0 | KVM_VTCR_T0SZ | KVM_VTCR_S)
-	mcr	p15, 4, r1, c2, c1, 2	@ VTCR
-
 	@ Use the same memory attributes for hyp. accesses as the kernel
 	@ (copy MAIRx to HMAIRx).
 	mrc	p15, 0, r0, c10, c2, 0

+ 3 - 477
arch/arm/kvm/interrupts.S

@@ -17,211 +17,14 @@
  */
 
 #include <linux/linkage.h>
-#include <linux/const.h>
-#include <asm/unified.h>
-#include <asm/page.h>
-#include <asm/ptrace.h>
-#include <asm/asm-offsets.h>
-#include <asm/kvm_asm.h>
-#include <asm/kvm_arm.h>
-#include <asm/vfpmacros.h>
-#include "interrupts_head.S"
 
 	.text
 
-__kvm_hyp_code_start:
-	.globl __kvm_hyp_code_start
-
-/********************************************************************
- * Flush per-VMID TLBs
- *
- * void __kvm_tlb_flush_vmid_ipa(struct kvm *kvm, phys_addr_t ipa);
- *
- * We rely on the hardware to broadcast the TLB invalidation to all CPUs
- * inside the inner-shareable domain (which is the case for all v7
- * implementations).  If we come across a non-IS SMP implementation, we'll
- * have to use an IPI based mechanism. Until then, we stick to the simple
- * hardware assisted version.
- *
- * As v7 does not support flushing per IPA, just nuke the whole TLB
- * instead, ignoring the ipa value.
- */
-ENTRY(__kvm_tlb_flush_vmid_ipa)
-	push	{r2, r3}
-
-	dsb	ishst
-	add	r0, r0, #KVM_VTTBR
-	ldrd	r2, r3, [r0]
-	mcrr	p15, 6, rr_lo_hi(r2, r3), c2	@ Write VTTBR
-	isb
-	mcr     p15, 0, r0, c8, c3, 0	@ TLBIALLIS (rt ignored)
-	dsb	ish
-	isb
-	mov	r2, #0
-	mov	r3, #0
-	mcrr	p15, 6, r2, r3, c2	@ Back to VMID #0
-	isb				@ Not necessary if followed by eret
-
-	pop	{r2, r3}
-	bx	lr
-ENDPROC(__kvm_tlb_flush_vmid_ipa)
-
-/**
- * void __kvm_tlb_flush_vmid(struct kvm *kvm) - Flush per-VMID TLBs
- *
- * Reuses __kvm_tlb_flush_vmid_ipa() for ARMv7, without passing address
- * parameter
- */
-
-ENTRY(__kvm_tlb_flush_vmid)
-	b	__kvm_tlb_flush_vmid_ipa
-ENDPROC(__kvm_tlb_flush_vmid)
-
-/********************************************************************
- * Flush TLBs and instruction caches of all CPUs inside the inner-shareable
- * domain, for all VMIDs
- *
- * void __kvm_flush_vm_context(void);
- */
-ENTRY(__kvm_flush_vm_context)
-	mov	r0, #0			@ rn parameter for c15 flushes is SBZ
-
-	/* Invalidate NS Non-Hyp TLB Inner Shareable (TLBIALLNSNHIS) */
-	mcr     p15, 4, r0, c8, c3, 4
-	/* Invalidate instruction caches Inner Shareable (ICIALLUIS) */
-	mcr     p15, 0, r0, c7, c1, 0
-	dsb	ish
-	isb				@ Not necessary if followed by eret
-
-	bx	lr
-ENDPROC(__kvm_flush_vm_context)
-
-
-/********************************************************************
- *  Hypervisor world-switch code
- *
- *
- * int __kvm_vcpu_run(struct kvm_vcpu *vcpu)
- */
-ENTRY(__kvm_vcpu_run)
-	@ Save the vcpu pointer
-	mcr	p15, 4, vcpu, c13, c0, 2	@ HTPIDR
-
-	save_host_regs
-
-	restore_vgic_state
-	restore_timer_state
-
-	@ Store hardware CP15 state and load guest state
-	read_cp15_state store_to_vcpu = 0
-	write_cp15_state read_from_vcpu = 1
-
-	@ If the host kernel has not been configured with VFPv3 support,
-	@ then it is safer if we deny guests from using it as well.
-#ifdef CONFIG_VFPv3
-	@ Set FPEXC_EN so the guest doesn't trap floating point instructions
-	VFPFMRX r2, FPEXC		@ VMRS
-	push	{r2}
-	orr	r2, r2, #FPEXC_EN
-	VFPFMXR FPEXC, r2		@ VMSR
-#endif
-
-	@ Configure Hyp-role
-	configure_hyp_role vmentry
-
-	@ Trap coprocessor CRx accesses
-	set_hstr vmentry
-	set_hcptr vmentry, (HCPTR_TTA | HCPTR_TCP(10) | HCPTR_TCP(11))
-	set_hdcr vmentry
-
-	@ Write configured ID register into MIDR alias
-	ldr	r1, [vcpu, #VCPU_MIDR]
-	mcr	p15, 4, r1, c0, c0, 0
-
-	@ Write guest view of MPIDR into VMPIDR
-	ldr	r1, [vcpu, #CP15_OFFSET(c0_MPIDR)]
-	mcr	p15, 4, r1, c0, c0, 5
-
-	@ Set up guest memory translation
-	ldr	r1, [vcpu, #VCPU_KVM]
-	add	r1, r1, #KVM_VTTBR
-	ldrd	r2, r3, [r1]
-	mcrr	p15, 6, rr_lo_hi(r2, r3), c2	@ Write VTTBR
-
-	@ We're all done, just restore the GPRs and go to the guest
-	restore_guest_regs
-	clrex				@ Clear exclusive monitor
-	eret
-
-__kvm_vcpu_return:
-	/*
-	 * return convention:
-	 * guest r0, r1, r2 saved on the stack
-	 * r0: vcpu pointer
-	 * r1: exception code
-	 */
-	save_guest_regs
-
-	@ Set VMID == 0
-	mov	r2, #0
-	mov	r3, #0
-	mcrr	p15, 6, r2, r3, c2	@ Write VTTBR
-
-	@ Don't trap coprocessor accesses for host kernel
-	set_hstr vmexit
-	set_hdcr vmexit
-	set_hcptr vmexit, (HCPTR_TTA | HCPTR_TCP(10) | HCPTR_TCP(11)), after_vfp_restore
-
-#ifdef CONFIG_VFPv3
-	@ Switch VFP/NEON hardware state to the host's
-	add	r7, vcpu, #VCPU_VFP_GUEST
-	store_vfp_state r7
-	add	r7, vcpu, #VCPU_VFP_HOST
-	ldr	r7, [r7]
-	restore_vfp_state r7
-
-after_vfp_restore:
-	@ Restore FPEXC_EN which we clobbered on entry
-	pop	{r2}
-	VFPFMXR FPEXC, r2
-#else
-after_vfp_restore:
-#endif
-
-	@ Reset Hyp-role
-	configure_hyp_role vmexit
-
-	@ Let host read hardware MIDR
-	mrc	p15, 0, r2, c0, c0, 0
-	mcr	p15, 4, r2, c0, c0, 0
-
-	@ Back to hardware MPIDR
-	mrc	p15, 0, r2, c0, c0, 5
-	mcr	p15, 4, r2, c0, c0, 5
-
-	@ Store guest CP15 state and restore host state
-	read_cp15_state store_to_vcpu = 1
-	write_cp15_state read_from_vcpu = 0
-
-	save_timer_state
-	save_vgic_state
-
-	restore_host_regs
-	clrex				@ Clear exclusive monitor
-#ifndef CONFIG_CPU_ENDIAN_BE8
-	mov	r0, r1			@ Return the return code
-	mov	r1, #0			@ Clear upper bits in return value
-#else
-	@ r1 already has return code
-	mov	r0, #0			@ Clear upper bits in return value
-#endif /* CONFIG_CPU_ENDIAN_BE8 */
-	bx	lr			@ return to IOCTL
-
 /********************************************************************
  *  Call function in Hyp mode
  *
  *
- * u64 kvm_call_hyp(void *hypfn, ...);
+ * unsigned long kvm_call_hyp(void *hypfn, ...);
  *
  * This is not really a variadic function in the classic C-way and care must
  * be taken when calling this to ensure parameters are passed in registers
@@ -232,7 +35,7 @@ after_vfp_restore:
  * passed as r0, r1, and r2 (a maximum of 3 arguments in addition to the
  * function pointer can be passed).  The function being called must be mapped
  * in Hyp mode (see init_hyp_mode in arch/arm/kvm/arm.c).  Return values are
- * passed in r0 and r1.
+ * passed in r0 (strictly 32bit).
  *
  * A function pointer with a value of 0xffffffff has a special meaning,
  * and is used to implement __hyp_get_vectors in the same way as in
@@ -246,281 +49,4 @@ after_vfp_restore:
 ENTRY(kvm_call_hyp)
 	hvc	#0
 	bx	lr
-
-/********************************************************************
- * Hypervisor exception vector and handlers
- *
- *
- * The KVM/ARM Hypervisor ABI is defined as follows:
- *
- * Entry to Hyp mode from the host kernel will happen _only_ when an HVC
- * instruction is issued since all traps are disabled when running the host
- * kernel as per the Hyp-mode initialization at boot time.
- *
- * HVC instructions cause a trap to the vector page + offset 0x14 (see hyp_hvc
- * below) when the HVC instruction is called from SVC mode (i.e. a guest or the
- * host kernel) and they cause a trap to the vector page + offset 0x8 when HVC
- * instructions are called from within Hyp-mode.
- *
- * Hyp-ABI: Calling HYP-mode functions from host (in SVC mode):
- *    Switching to Hyp mode is done through a simple HVC #0 instruction. The
- *    exception vector code will check that the HVC comes from VMID==0 and if
- *    so will push the necessary state (SPSR, lr_usr) on the Hyp stack.
- *    - r0 contains a pointer to a HYP function
- *    - r1, r2, and r3 contain arguments to the above function.
- *    - The HYP function will be called with its arguments in r0, r1 and r2.
- *    On HYP function return, we return directly to SVC.
- *
- * Note that the above is used to execute code in Hyp-mode from a host-kernel
- * point of view, and is a different concept from performing a world-switch and
- * executing guest code SVC mode (with a VMID != 0).
- */
-
-/* Handle undef, svc, pabt, or dabt by crashing with a user notice */
-.macro bad_exception exception_code, panic_str
-	push	{r0-r2}
-	mrrc	p15, 6, r0, r1, c2	@ Read VTTBR
-	lsr	r1, r1, #16
-	ands	r1, r1, #0xff
-	beq	99f
-
-	load_vcpu			@ Load VCPU pointer
-	.if \exception_code == ARM_EXCEPTION_DATA_ABORT
-	mrc	p15, 4, r2, c5, c2, 0	@ HSR
-	mrc	p15, 4, r1, c6, c0, 0	@ HDFAR
-	str	r2, [vcpu, #VCPU_HSR]
-	str	r1, [vcpu, #VCPU_HxFAR]
-	.endif
-	.if \exception_code == ARM_EXCEPTION_PREF_ABORT
-	mrc	p15, 4, r2, c5, c2, 0	@ HSR
-	mrc	p15, 4, r1, c6, c0, 2	@ HIFAR
-	str	r2, [vcpu, #VCPU_HSR]
-	str	r1, [vcpu, #VCPU_HxFAR]
-	.endif
-	mov	r1, #\exception_code
-	b	__kvm_vcpu_return
-
-	@ We were in the host already. Let's craft a panic-ing return to SVC.
-99:	mrs	r2, cpsr
-	bic	r2, r2, #MODE_MASK
-	orr	r2, r2, #SVC_MODE
-THUMB(	orr	r2, r2, #PSR_T_BIT	)
-	msr	spsr_cxsf, r2
-	mrs	r1, ELR_hyp
-	ldr	r2, =panic
-	msr	ELR_hyp, r2
-	ldr	r0, =\panic_str
-	clrex				@ Clear exclusive monitor
-	eret
-.endm
-
-	.text
-
-	.align 5
-__kvm_hyp_vector:
-	.globl __kvm_hyp_vector
-
-	@ Hyp-mode exception vector
-	W(b)	hyp_reset
-	W(b)	hyp_undef
-	W(b)	hyp_svc
-	W(b)	hyp_pabt
-	W(b)	hyp_dabt
-	W(b)	hyp_hvc
-	W(b)	hyp_irq
-	W(b)	hyp_fiq
-
-	.align
-hyp_reset:
-	b	hyp_reset
-
-	.align
-hyp_undef:
-	bad_exception ARM_EXCEPTION_UNDEFINED, und_die_str
-
-	.align
-hyp_svc:
-	bad_exception ARM_EXCEPTION_HVC, svc_die_str
-
-	.align
-hyp_pabt:
-	bad_exception ARM_EXCEPTION_PREF_ABORT, pabt_die_str
-
-	.align
-hyp_dabt:
-	bad_exception ARM_EXCEPTION_DATA_ABORT, dabt_die_str
-
-	.align
-hyp_hvc:
-	/*
-	 * Getting here is either becuase of a trap from a guest or from calling
-	 * HVC from the host kernel, which means "switch to Hyp mode".
-	 */
-	push	{r0, r1, r2}
-
-	@ Check syndrome register
-	mrc	p15, 4, r1, c5, c2, 0	@ HSR
-	lsr	r0, r1, #HSR_EC_SHIFT
-	cmp	r0, #HSR_EC_HVC
-	bne	guest_trap		@ Not HVC instr.
-
-	/*
-	 * Let's check if the HVC came from VMID 0 and allow simple
-	 * switch to Hyp mode
-	 */
-	mrrc    p15, 6, r0, r2, c2
-	lsr     r2, r2, #16
-	and     r2, r2, #0xff
-	cmp     r2, #0
-	bne	guest_trap		@ Guest called HVC
-
-	/*
-	 * Getting here means host called HVC, we shift parameters and branch
-	 * to Hyp function.
-	 */
-	pop	{r0, r1, r2}
-
-	/* Check for __hyp_get_vectors */
-	cmp	r0, #-1
-	mrceq	p15, 4, r0, c12, c0, 0	@ get HVBAR
-	beq	1f
-
-	push	{lr}
-	mrs	lr, SPSR
-	push	{lr}
-
-	mov	lr, r0
-	mov	r0, r1
-	mov	r1, r2
-	mov	r2, r3
-
-THUMB(	orr	lr, #1)
-	blx	lr			@ Call the HYP function
-
-	pop	{lr}
-	msr	SPSR_csxf, lr
-	pop	{lr}
-1:	eret
-
-guest_trap:
-	load_vcpu			@ Load VCPU pointer to r0
-	str	r1, [vcpu, #VCPU_HSR]
-
-	@ Check if we need the fault information
-	lsr	r1, r1, #HSR_EC_SHIFT
-#ifdef CONFIG_VFPv3
-	cmp	r1, #HSR_EC_CP_0_13
-	beq	switch_to_guest_vfp
-#endif
-	cmp	r1, #HSR_EC_IABT
-	mrceq	p15, 4, r2, c6, c0, 2	@ HIFAR
-	beq	2f
-	cmp	r1, #HSR_EC_DABT
-	bne	1f
-	mrc	p15, 4, r2, c6, c0, 0	@ HDFAR
-
-2:	str	r2, [vcpu, #VCPU_HxFAR]
-
-	/*
-	 * B3.13.5 Reporting exceptions taken to the Non-secure PL2 mode:
-	 *
-	 * Abort on the stage 2 translation for a memory access from a
-	 * Non-secure PL1 or PL0 mode:
-	 *
-	 * For any Access flag fault or Translation fault, and also for any
-	 * Permission fault on the stage 2 translation of a memory access
-	 * made as part of a translation table walk for a stage 1 translation,
-	 * the HPFAR holds the IPA that caused the fault. Otherwise, the HPFAR
-	 * is UNKNOWN.
-	 */
-
-	/* Check for permission fault, and S1PTW */
-	mrc	p15, 4, r1, c5, c2, 0	@ HSR
-	and	r0, r1, #HSR_FSC_TYPE
-	cmp	r0, #FSC_PERM
-	tsteq	r1, #(1 << 7)		@ S1PTW
-	mrcne	p15, 4, r2, c6, c0, 4	@ HPFAR
-	bne	3f
-
-	/* Preserve PAR */
-	mrrc	p15, 0, r0, r1, c7	@ PAR
-	push	{r0, r1}
-
-	/* Resolve IPA using the xFAR */
-	mcr	p15, 0, r2, c7, c8, 0	@ ATS1CPR
-	isb
-	mrrc	p15, 0, r0, r1, c7	@ PAR
-	tst	r0, #1
-	bne	4f			@ Failed translation
-	ubfx	r2, r0, #12, #20
-	lsl	r2, r2, #4
-	orr	r2, r2, r1, lsl #24
-
-	/* Restore PAR */
-	pop	{r0, r1}
-	mcrr	p15, 0, r0, r1, c7	@ PAR
-
-3:	load_vcpu			@ Load VCPU pointer to r0
-	str	r2, [r0, #VCPU_HPFAR]
-
-1:	mov	r1, #ARM_EXCEPTION_HVC
-	b	__kvm_vcpu_return
-
-4:	pop	{r0, r1}		@ Failed translation, return to guest
-	mcrr	p15, 0, r0, r1, c7	@ PAR
-	clrex
-	pop	{r0, r1, r2}
-	eret
-
-/*
- * If VFPv3 support is not available, then we will not switch the VFP
- * registers; however cp10 and cp11 accesses will still trap and fallback
- * to the regular coprocessor emulation code, which currently will
- * inject an undefined exception to the guest.
- */
-#ifdef CONFIG_VFPv3
-switch_to_guest_vfp:
-	push	{r3-r7}
-
-	@ NEON/VFP used.  Turn on VFP access.
-	set_hcptr vmtrap, (HCPTR_TCP(10) | HCPTR_TCP(11))
-
-	@ Switch VFP/NEON hardware state to the guest's
-	add	r7, r0, #VCPU_VFP_HOST
-	ldr	r7, [r7]
-	store_vfp_state r7
-	add	r7, r0, #VCPU_VFP_GUEST
-	restore_vfp_state r7
-
-	pop	{r3-r7}
-	pop	{r0-r2}
-	clrex
-	eret
-#endif
-
-	.align
-hyp_irq:
-	push	{r0, r1, r2}
-	mov	r1, #ARM_EXCEPTION_IRQ
-	load_vcpu			@ Load VCPU pointer to r0
-	b	__kvm_vcpu_return
-
-	.align
-hyp_fiq:
-	b	hyp_fiq
-
-	.ltorg
-
-__kvm_hyp_code_end:
-	.globl	__kvm_hyp_code_end
-
-	.section ".rodata"
-
-und_die_str:
-	.ascii	"unexpected undefined exception in Hyp mode at: %#08x\n"
-pabt_die_str:
-	.ascii	"unexpected prefetch abort in Hyp mode at: %#08x\n"
-dabt_die_str:
-	.ascii	"unexpected data abort in Hyp mode at: %#08x\n"
-svc_die_str:
-	.ascii	"unexpected HVC/SVC trap in Hyp mode at: %#08x\n"
+ENDPROC(kvm_call_hyp)

+ 0 - 648
arch/arm/kvm/interrupts_head.S

@@ -1,648 +0,0 @@
-#include <linux/irqchip/arm-gic.h>
-#include <asm/assembler.h>
-
-#define VCPU_USR_REG(_reg_nr)	(VCPU_USR_REGS + (_reg_nr * 4))
-#define VCPU_USR_SP		(VCPU_USR_REG(13))
-#define VCPU_USR_LR		(VCPU_USR_REG(14))
-#define CP15_OFFSET(_cp15_reg_idx) (VCPU_CP15 + (_cp15_reg_idx * 4))
-
-/*
- * Many of these macros need to access the VCPU structure, which is always
- * held in r0. These macros should never clobber r1, as it is used to hold the
- * exception code on the return path (except of course the macro that switches
- * all the registers before the final jump to the VM).
- */
-vcpu	.req	r0		@ vcpu pointer always in r0
-
-/* Clobbers {r2-r6} */
-.macro store_vfp_state vfp_base
-	@ The VFPFMRX and VFPFMXR macros are the VMRS and VMSR instructions
-	VFPFMRX	r2, FPEXC
-	@ Make sure VFP is enabled so we can touch the registers.
-	orr	r6, r2, #FPEXC_EN
-	VFPFMXR	FPEXC, r6
-
-	VFPFMRX	r3, FPSCR
-	tst	r2, #FPEXC_EX		@ Check for VFP Subarchitecture
-	beq	1f
-	@ If FPEXC_EX is 0, then FPINST/FPINST2 reads are upredictable, so
-	@ we only need to save them if FPEXC_EX is set.
-	VFPFMRX r4, FPINST
-	tst	r2, #FPEXC_FP2V
-	VFPFMRX r5, FPINST2, ne		@ vmrsne
-	bic	r6, r2, #FPEXC_EX	@ FPEXC_EX disable
-	VFPFMXR	FPEXC, r6
-1:
-	VFPFSTMIA \vfp_base, r6		@ Save VFP registers
-	stm	\vfp_base, {r2-r5}	@ Save FPEXC, FPSCR, FPINST, FPINST2
-.endm
-
-/* Assume FPEXC_EN is on and FPEXC_EX is off, clobbers {r2-r6} */
-.macro restore_vfp_state vfp_base
-	VFPFLDMIA \vfp_base, r6		@ Load VFP registers
-	ldm	\vfp_base, {r2-r5}	@ Load FPEXC, FPSCR, FPINST, FPINST2
-
-	VFPFMXR FPSCR, r3
-	tst	r2, #FPEXC_EX		@ Check for VFP Subarchitecture
-	beq	1f
-	VFPFMXR FPINST, r4
-	tst	r2, #FPEXC_FP2V
-	VFPFMXR FPINST2, r5, ne
-1:
-	VFPFMXR FPEXC, r2	@ FPEXC	(last, in case !EN)
-.endm
-
-/* These are simply for the macros to work - value don't have meaning */
-.equ usr, 0
-.equ svc, 1
-.equ abt, 2
-.equ und, 3
-.equ irq, 4
-.equ fiq, 5
-
-.macro push_host_regs_mode mode
-	mrs	r2, SP_\mode
-	mrs	r3, LR_\mode
-	mrs	r4, SPSR_\mode
-	push	{r2, r3, r4}
-.endm
-
-/*
- * Store all host persistent registers on the stack.
- * Clobbers all registers, in all modes, except r0 and r1.
- */
-.macro save_host_regs
-	/* Hyp regs. Only ELR_hyp (SPSR_hyp already saved) */
-	mrs	r2, ELR_hyp
-	push	{r2}
-
-	/* usr regs */
-	push	{r4-r12}	@ r0-r3 are always clobbered
-	mrs	r2, SP_usr
-	mov	r3, lr
-	push	{r2, r3}
-
-	push_host_regs_mode svc
-	push_host_regs_mode abt
-	push_host_regs_mode und
-	push_host_regs_mode irq
-
-	/* fiq regs */
-	mrs	r2, r8_fiq
-	mrs	r3, r9_fiq
-	mrs	r4, r10_fiq
-	mrs	r5, r11_fiq
-	mrs	r6, r12_fiq
-	mrs	r7, SP_fiq
-	mrs	r8, LR_fiq
-	mrs	r9, SPSR_fiq
-	push	{r2-r9}
-.endm
-
-.macro pop_host_regs_mode mode
-	pop	{r2, r3, r4}
-	msr	SP_\mode, r2
-	msr	LR_\mode, r3
-	msr	SPSR_\mode, r4
-.endm
-
-/*
- * Restore all host registers from the stack.
- * Clobbers all registers, in all modes, except r0 and r1.
- */
-.macro restore_host_regs
-	pop	{r2-r9}
-	msr	r8_fiq, r2
-	msr	r9_fiq, r3
-	msr	r10_fiq, r4
-	msr	r11_fiq, r5
-	msr	r12_fiq, r6
-	msr	SP_fiq, r7
-	msr	LR_fiq, r8
-	msr	SPSR_fiq, r9
-
-	pop_host_regs_mode irq
-	pop_host_regs_mode und
-	pop_host_regs_mode abt
-	pop_host_regs_mode svc
-
-	pop	{r2, r3}
-	msr	SP_usr, r2
-	mov	lr, r3
-	pop	{r4-r12}
-
-	pop	{r2}
-	msr	ELR_hyp, r2
-.endm
-
-/*
- * Restore SP, LR and SPSR for a given mode. offset is the offset of
- * this mode's registers from the VCPU base.
- *
- * Assumes vcpu pointer in vcpu reg
- *
- * Clobbers r1, r2, r3, r4.
- */
-.macro restore_guest_regs_mode mode, offset
-	add	r1, vcpu, \offset
-	ldm	r1, {r2, r3, r4}
-	msr	SP_\mode, r2
-	msr	LR_\mode, r3
-	msr	SPSR_\mode, r4
-.endm
-
-/*
- * Restore all guest registers from the vcpu struct.
- *
- * Assumes vcpu pointer in vcpu reg
- *
- * Clobbers *all* registers.
- */
-.macro restore_guest_regs
-	restore_guest_regs_mode svc, #VCPU_SVC_REGS
-	restore_guest_regs_mode abt, #VCPU_ABT_REGS
-	restore_guest_regs_mode und, #VCPU_UND_REGS
-	restore_guest_regs_mode irq, #VCPU_IRQ_REGS
-
-	add	r1, vcpu, #VCPU_FIQ_REGS
-	ldm	r1, {r2-r9}
-	msr	r8_fiq, r2
-	msr	r9_fiq, r3
-	msr	r10_fiq, r4
-	msr	r11_fiq, r5
-	msr	r12_fiq, r6
-	msr	SP_fiq, r7
-	msr	LR_fiq, r8
-	msr	SPSR_fiq, r9
-
-	@ Load return state
-	ldr	r2, [vcpu, #VCPU_PC]
-	ldr	r3, [vcpu, #VCPU_CPSR]
-	msr	ELR_hyp, r2
-	msr	SPSR_cxsf, r3
-
-	@ Load user registers
-	ldr	r2, [vcpu, #VCPU_USR_SP]
-	ldr	r3, [vcpu, #VCPU_USR_LR]
-	msr	SP_usr, r2
-	mov	lr, r3
-	add	vcpu, vcpu, #(VCPU_USR_REGS)
-	ldm	vcpu, {r0-r12}
-.endm
-
-/*
- * Save SP, LR and SPSR for a given mode. offset is the offset of
- * this mode's registers from the VCPU base.
- *
- * Assumes vcpu pointer in vcpu reg
- *
- * Clobbers r2, r3, r4, r5.
- */
-.macro save_guest_regs_mode mode, offset
-	add	r2, vcpu, \offset
-	mrs	r3, SP_\mode
-	mrs	r4, LR_\mode
-	mrs	r5, SPSR_\mode
-	stm	r2, {r3, r4, r5}
-.endm
-
-/*
- * Save all guest registers to the vcpu struct
- * Expects guest's r0, r1, r2 on the stack.
- *
- * Assumes vcpu pointer in vcpu reg
- *
- * Clobbers r2, r3, r4, r5.
- */
-.macro save_guest_regs
-	@ Store usr registers
-	add	r2, vcpu, #VCPU_USR_REG(3)
-	stm	r2, {r3-r12}
-	add	r2, vcpu, #VCPU_USR_REG(0)
-	pop	{r3, r4, r5}		@ r0, r1, r2
-	stm	r2, {r3, r4, r5}
-	mrs	r2, SP_usr
-	mov	r3, lr
-	str	r2, [vcpu, #VCPU_USR_SP]
-	str	r3, [vcpu, #VCPU_USR_LR]
-
-	@ Store return state
-	mrs	r2, ELR_hyp
-	mrs	r3, spsr
-	str	r2, [vcpu, #VCPU_PC]
-	str	r3, [vcpu, #VCPU_CPSR]
-
-	@ Store other guest registers
-	save_guest_regs_mode svc, #VCPU_SVC_REGS
-	save_guest_regs_mode abt, #VCPU_ABT_REGS
-	save_guest_regs_mode und, #VCPU_UND_REGS
-	save_guest_regs_mode irq, #VCPU_IRQ_REGS
-.endm
-
-/* Reads cp15 registers from hardware and stores them in memory
- * @store_to_vcpu: If 0, registers are written in-order to the stack,
- * 		   otherwise to the VCPU struct pointed to by vcpup
- *
- * Assumes vcpu pointer in vcpu reg
- *
- * Clobbers r2 - r12
- */
-.macro read_cp15_state store_to_vcpu
-	mrc	p15, 0, r2, c1, c0, 0	@ SCTLR
-	mrc	p15, 0, r3, c1, c0, 2	@ CPACR
-	mrc	p15, 0, r4, c2, c0, 2	@ TTBCR
-	mrc	p15, 0, r5, c3, c0, 0	@ DACR
-	mrrc	p15, 0, r6, r7, c2	@ TTBR 0
-	mrrc	p15, 1, r8, r9, c2	@ TTBR 1
-	mrc	p15, 0, r10, c10, c2, 0	@ PRRR
-	mrc	p15, 0, r11, c10, c2, 1	@ NMRR
-	mrc	p15, 2, r12, c0, c0, 0	@ CSSELR
-
-	.if \store_to_vcpu == 0
-	push	{r2-r12}		@ Push CP15 registers
-	.else
-	str	r2, [vcpu, #CP15_OFFSET(c1_SCTLR)]
-	str	r3, [vcpu, #CP15_OFFSET(c1_CPACR)]
-	str	r4, [vcpu, #CP15_OFFSET(c2_TTBCR)]
-	str	r5, [vcpu, #CP15_OFFSET(c3_DACR)]
-	add	r2, vcpu, #CP15_OFFSET(c2_TTBR0)
-	strd	r6, r7, [r2]
-	add	r2, vcpu, #CP15_OFFSET(c2_TTBR1)
-	strd	r8, r9, [r2]
-	str	r10, [vcpu, #CP15_OFFSET(c10_PRRR)]
-	str	r11, [vcpu, #CP15_OFFSET(c10_NMRR)]
-	str	r12, [vcpu, #CP15_OFFSET(c0_CSSELR)]
-	.endif
-
-	mrc	p15, 0, r2, c13, c0, 1	@ CID
-	mrc	p15, 0, r3, c13, c0, 2	@ TID_URW
-	mrc	p15, 0, r4, c13, c0, 3	@ TID_URO
-	mrc	p15, 0, r5, c13, c0, 4	@ TID_PRIV
-	mrc	p15, 0, r6, c5, c0, 0	@ DFSR
-	mrc	p15, 0, r7, c5, c0, 1	@ IFSR
-	mrc	p15, 0, r8, c5, c1, 0	@ ADFSR
-	mrc	p15, 0, r9, c5, c1, 1	@ AIFSR
-	mrc	p15, 0, r10, c6, c0, 0	@ DFAR
-	mrc	p15, 0, r11, c6, c0, 2	@ IFAR
-	mrc	p15, 0, r12, c12, c0, 0	@ VBAR
-
-	.if \store_to_vcpu == 0
-	push	{r2-r12}		@ Push CP15 registers
-	.else
-	str	r2, [vcpu, #CP15_OFFSET(c13_CID)]
-	str	r3, [vcpu, #CP15_OFFSET(c13_TID_URW)]
-	str	r4, [vcpu, #CP15_OFFSET(c13_TID_URO)]
-	str	r5, [vcpu, #CP15_OFFSET(c13_TID_PRIV)]
-	str	r6, [vcpu, #CP15_OFFSET(c5_DFSR)]
-	str	r7, [vcpu, #CP15_OFFSET(c5_IFSR)]
-	str	r8, [vcpu, #CP15_OFFSET(c5_ADFSR)]
-	str	r9, [vcpu, #CP15_OFFSET(c5_AIFSR)]
-	str	r10, [vcpu, #CP15_OFFSET(c6_DFAR)]
-	str	r11, [vcpu, #CP15_OFFSET(c6_IFAR)]
-	str	r12, [vcpu, #CP15_OFFSET(c12_VBAR)]
-	.endif
-
-	mrc	p15, 0, r2, c14, c1, 0	@ CNTKCTL
-	mrrc	p15, 0, r4, r5, c7	@ PAR
-	mrc	p15, 0, r6, c10, c3, 0	@ AMAIR0
-	mrc	p15, 0, r7, c10, c3, 1	@ AMAIR1
-
-	.if \store_to_vcpu == 0
-	push	{r2,r4-r7}
-	.else
-	str	r2, [vcpu, #CP15_OFFSET(c14_CNTKCTL)]
-	add	r12, vcpu, #CP15_OFFSET(c7_PAR)
-	strd	r4, r5, [r12]
-	str	r6, [vcpu, #CP15_OFFSET(c10_AMAIR0)]
-	str	r7, [vcpu, #CP15_OFFSET(c10_AMAIR1)]
-	.endif
-.endm
-
-/*
- * Reads cp15 registers from memory and writes them to hardware
- * @read_from_vcpu: If 0, registers are read in-order from the stack,
- *		    otherwise from the VCPU struct pointed to by vcpup
- *
- * Assumes vcpu pointer in vcpu reg
- */
-.macro write_cp15_state read_from_vcpu
-	.if \read_from_vcpu == 0
-	pop	{r2,r4-r7}
-	.else
-	ldr	r2, [vcpu, #CP15_OFFSET(c14_CNTKCTL)]
-	add	r12, vcpu, #CP15_OFFSET(c7_PAR)
-	ldrd	r4, r5, [r12]
-	ldr	r6, [vcpu, #CP15_OFFSET(c10_AMAIR0)]
-	ldr	r7, [vcpu, #CP15_OFFSET(c10_AMAIR1)]
-	.endif
-
-	mcr	p15, 0, r2, c14, c1, 0	@ CNTKCTL
-	mcrr	p15, 0, r4, r5, c7	@ PAR
-	mcr	p15, 0, r6, c10, c3, 0	@ AMAIR0
-	mcr	p15, 0, r7, c10, c3, 1	@ AMAIR1
-
-	.if \read_from_vcpu == 0
-	pop	{r2-r12}
-	.else
-	ldr	r2, [vcpu, #CP15_OFFSET(c13_CID)]
-	ldr	r3, [vcpu, #CP15_OFFSET(c13_TID_URW)]
-	ldr	r4, [vcpu, #CP15_OFFSET(c13_TID_URO)]
-	ldr	r5, [vcpu, #CP15_OFFSET(c13_TID_PRIV)]
-	ldr	r6, [vcpu, #CP15_OFFSET(c5_DFSR)]
-	ldr	r7, [vcpu, #CP15_OFFSET(c5_IFSR)]
-	ldr	r8, [vcpu, #CP15_OFFSET(c5_ADFSR)]
-	ldr	r9, [vcpu, #CP15_OFFSET(c5_AIFSR)]
-	ldr	r10, [vcpu, #CP15_OFFSET(c6_DFAR)]
-	ldr	r11, [vcpu, #CP15_OFFSET(c6_IFAR)]
-	ldr	r12, [vcpu, #CP15_OFFSET(c12_VBAR)]
-	.endif
-
-	mcr	p15, 0, r2, c13, c0, 1	@ CID
-	mcr	p15, 0, r3, c13, c0, 2	@ TID_URW
-	mcr	p15, 0, r4, c13, c0, 3	@ TID_URO
-	mcr	p15, 0, r5, c13, c0, 4	@ TID_PRIV
-	mcr	p15, 0, r6, c5, c0, 0	@ DFSR
-	mcr	p15, 0, r7, c5, c0, 1	@ IFSR
-	mcr	p15, 0, r8, c5, c1, 0	@ ADFSR
-	mcr	p15, 0, r9, c5, c1, 1	@ AIFSR
-	mcr	p15, 0, r10, c6, c0, 0	@ DFAR
-	mcr	p15, 0, r11, c6, c0, 2	@ IFAR
-	mcr	p15, 0, r12, c12, c0, 0	@ VBAR
-
-	.if \read_from_vcpu == 0
-	pop	{r2-r12}
-	.else
-	ldr	r2, [vcpu, #CP15_OFFSET(c1_SCTLR)]
-	ldr	r3, [vcpu, #CP15_OFFSET(c1_CPACR)]
-	ldr	r4, [vcpu, #CP15_OFFSET(c2_TTBCR)]
-	ldr	r5, [vcpu, #CP15_OFFSET(c3_DACR)]
-	add	r12, vcpu, #CP15_OFFSET(c2_TTBR0)
-	ldrd	r6, r7, [r12]
-	add	r12, vcpu, #CP15_OFFSET(c2_TTBR1)
-	ldrd	r8, r9, [r12]
-	ldr	r10, [vcpu, #CP15_OFFSET(c10_PRRR)]
-	ldr	r11, [vcpu, #CP15_OFFSET(c10_NMRR)]
-	ldr	r12, [vcpu, #CP15_OFFSET(c0_CSSELR)]
-	.endif
-
-	mcr	p15, 0, r2, c1, c0, 0	@ SCTLR
-	mcr	p15, 0, r3, c1, c0, 2	@ CPACR
-	mcr	p15, 0, r4, c2, c0, 2	@ TTBCR
-	mcr	p15, 0, r5, c3, c0, 0	@ DACR
-	mcrr	p15, 0, r6, r7, c2	@ TTBR 0
-	mcrr	p15, 1, r8, r9, c2	@ TTBR 1
-	mcr	p15, 0, r10, c10, c2, 0	@ PRRR
-	mcr	p15, 0, r11, c10, c2, 1	@ NMRR
-	mcr	p15, 2, r12, c0, c0, 0	@ CSSELR
-.endm
-
-/*
- * Save the VGIC CPU state into memory
- *
- * Assumes vcpu pointer in vcpu reg
- */
-.macro save_vgic_state
-	/* Get VGIC VCTRL base into r2 */
-	ldr	r2, [vcpu, #VCPU_KVM]
-	ldr	r2, [r2, #KVM_VGIC_VCTRL]
-	cmp	r2, #0
-	beq	2f
-
-	/* Compute the address of struct vgic_cpu */
-	add	r11, vcpu, #VCPU_VGIC_CPU
-
-	/* Save all interesting registers */
-	ldr	r4, [r2, #GICH_VMCR]
-	ldr	r5, [r2, #GICH_MISR]
-	ldr	r6, [r2, #GICH_EISR0]
-	ldr	r7, [r2, #GICH_EISR1]
-	ldr	r8, [r2, #GICH_ELRSR0]
-	ldr	r9, [r2, #GICH_ELRSR1]
-	ldr	r10, [r2, #GICH_APR]
-ARM_BE8(rev	r4, r4	)
-ARM_BE8(rev	r5, r5	)
-ARM_BE8(rev	r6, r6	)
-ARM_BE8(rev	r7, r7	)
-ARM_BE8(rev	r8, r8	)
-ARM_BE8(rev	r9, r9	)
-ARM_BE8(rev	r10, r10	)
-
-	str	r4, [r11, #VGIC_V2_CPU_VMCR]
-	str	r5, [r11, #VGIC_V2_CPU_MISR]
-#ifdef CONFIG_CPU_ENDIAN_BE8
-	str	r6, [r11, #(VGIC_V2_CPU_EISR + 4)]
-	str	r7, [r11, #VGIC_V2_CPU_EISR]
-	str	r8, [r11, #(VGIC_V2_CPU_ELRSR + 4)]
-	str	r9, [r11, #VGIC_V2_CPU_ELRSR]
-#else
-	str	r6, [r11, #VGIC_V2_CPU_EISR]
-	str	r7, [r11, #(VGIC_V2_CPU_EISR + 4)]
-	str	r8, [r11, #VGIC_V2_CPU_ELRSR]
-	str	r9, [r11, #(VGIC_V2_CPU_ELRSR + 4)]
-#endif
-	str	r10, [r11, #VGIC_V2_CPU_APR]
-
-	/* Clear GICH_HCR */
-	mov	r5, #0
-	str	r5, [r2, #GICH_HCR]
-
-	/* Save list registers */
-	add	r2, r2, #GICH_LR0
-	add	r3, r11, #VGIC_V2_CPU_LR
-	ldr	r4, [r11, #VGIC_CPU_NR_LR]
-1:	ldr	r6, [r2], #4
-ARM_BE8(rev	r6, r6	)
-	str	r6, [r3], #4
-	subs	r4, r4, #1
-	bne	1b
-2:
-.endm
-
-/*
- * Restore the VGIC CPU state from memory
- *
- * Assumes vcpu pointer in vcpu reg
- */
-.macro restore_vgic_state
-	/* Get VGIC VCTRL base into r2 */
-	ldr	r2, [vcpu, #VCPU_KVM]
-	ldr	r2, [r2, #KVM_VGIC_VCTRL]
-	cmp	r2, #0
-	beq	2f
-
-	/* Compute the address of struct vgic_cpu */
-	add	r11, vcpu, #VCPU_VGIC_CPU
-
-	/* We only restore a minimal set of registers */
-	ldr	r3, [r11, #VGIC_V2_CPU_HCR]
-	ldr	r4, [r11, #VGIC_V2_CPU_VMCR]
-	ldr	r8, [r11, #VGIC_V2_CPU_APR]
-ARM_BE8(rev	r3, r3	)
-ARM_BE8(rev	r4, r4	)
-ARM_BE8(rev	r8, r8	)
-
-	str	r3, [r2, #GICH_HCR]
-	str	r4, [r2, #GICH_VMCR]
-	str	r8, [r2, #GICH_APR]
-
-	/* Restore list registers */
-	add	r2, r2, #GICH_LR0
-	add	r3, r11, #VGIC_V2_CPU_LR
-	ldr	r4, [r11, #VGIC_CPU_NR_LR]
-1:	ldr	r6, [r3], #4
-ARM_BE8(rev	r6, r6  )
-	str	r6, [r2], #4
-	subs	r4, r4, #1
-	bne	1b
-2:
-.endm
-
-#define CNTHCTL_PL1PCTEN	(1 << 0)
-#define CNTHCTL_PL1PCEN		(1 << 1)
-
-/*
- * Save the timer state onto the VCPU and allow physical timer/counter access
- * for the host.
- *
- * Assumes vcpu pointer in vcpu reg
- * Clobbers r2-r5
- */
-.macro save_timer_state
-	ldr	r4, [vcpu, #VCPU_KVM]
-	ldr	r2, [r4, #KVM_TIMER_ENABLED]
-	cmp	r2, #0
-	beq	1f
-
-	mrc	p15, 0, r2, c14, c3, 1	@ CNTV_CTL
-	str	r2, [vcpu, #VCPU_TIMER_CNTV_CTL]
-
-	isb
-
-	mrrc	p15, 3, rr_lo_hi(r2, r3), c14	@ CNTV_CVAL
-	ldr	r4, =VCPU_TIMER_CNTV_CVAL
-	add	r5, vcpu, r4
-	strd	r2, r3, [r5]
-
-	@ Ensure host CNTVCT == CNTPCT
-	mov	r2, #0
-	mcrr	p15, 4, r2, r2, c14	@ CNTVOFF
-
-1:
-	mov	r2, #0			@ Clear ENABLE
-	mcr	p15, 0, r2, c14, c3, 1	@ CNTV_CTL
-
-	@ Allow physical timer/counter access for the host
-	mrc	p15, 4, r2, c14, c1, 0	@ CNTHCTL
-	orr	r2, r2, #(CNTHCTL_PL1PCEN | CNTHCTL_PL1PCTEN)
-	mcr	p15, 4, r2, c14, c1, 0	@ CNTHCTL
-.endm
-
-/*
- * Load the timer state from the VCPU and deny physical timer/counter access
- * for the host.
- *
- * Assumes vcpu pointer in vcpu reg
- * Clobbers r2-r5
- */
-.macro restore_timer_state
-	@ Disallow physical timer access for the guest
-	@ Physical counter access is allowed
-	mrc	p15, 4, r2, c14, c1, 0	@ CNTHCTL
-	orr	r2, r2, #CNTHCTL_PL1PCTEN
-	bic	r2, r2, #CNTHCTL_PL1PCEN
-	mcr	p15, 4, r2, c14, c1, 0	@ CNTHCTL
-
-	ldr	r4, [vcpu, #VCPU_KVM]
-	ldr	r2, [r4, #KVM_TIMER_ENABLED]
-	cmp	r2, #0
-	beq	1f
-
-	ldr	r2, [r4, #KVM_TIMER_CNTVOFF]
-	ldr	r3, [r4, #(KVM_TIMER_CNTVOFF + 4)]
-	mcrr	p15, 4, rr_lo_hi(r2, r3), c14	@ CNTVOFF
-
-	ldr	r4, =VCPU_TIMER_CNTV_CVAL
-	add	r5, vcpu, r4
-	ldrd	r2, r3, [r5]
-	mcrr	p15, 3, rr_lo_hi(r2, r3), c14	@ CNTV_CVAL
-	isb
-
-	ldr	r2, [vcpu, #VCPU_TIMER_CNTV_CTL]
-	and	r2, r2, #3
-	mcr	p15, 0, r2, c14, c3, 1	@ CNTV_CTL
-1:
-.endm
-
-.equ vmentry,	0
-.equ vmexit,	1
-
-/* Configures the HSTR (Hyp System Trap Register) on entry/return
- * (hardware reset value is 0) */
-.macro set_hstr operation
-	mrc	p15, 4, r2, c1, c1, 3
-	ldr	r3, =HSTR_T(15)
-	.if \operation == vmentry
-	orr	r2, r2, r3		@ Trap CR{15}
-	.else
-	bic	r2, r2, r3		@ Don't trap any CRx accesses
-	.endif
-	mcr	p15, 4, r2, c1, c1, 3
-.endm
-
-/* Configures the HCPTR (Hyp Coprocessor Trap Register) on entry/return
- * (hardware reset value is 0). Keep previous value in r2.
- * An ISB is emited on vmexit/vmtrap, but executed on vmexit only if
- * VFP wasn't already enabled (always executed on vmtrap).
- * If a label is specified with vmexit, it is branched to if VFP wasn't
- * enabled.
- */
-.macro set_hcptr operation, mask, label = none
-	mrc	p15, 4, r2, c1, c1, 2
-	ldr	r3, =\mask
-	.if \operation == vmentry
-	orr	r3, r2, r3		@ Trap coproc-accesses defined in mask
-	.else
-	bic	r3, r2, r3		@ Don't trap defined coproc-accesses
-	.endif
-	mcr	p15, 4, r3, c1, c1, 2
-	.if \operation != vmentry
-	.if \operation == vmexit
-	tst	r2, #(HCPTR_TCP(10) | HCPTR_TCP(11))
-	beq	1f
-	.endif
-	isb
-	.if \label != none
-	b	\label
-	.endif
-1:
-	.endif
-.endm
-
-/* Configures the HDCR (Hyp Debug Configuration Register) on entry/return
- * (hardware reset value is 0) */
-.macro set_hdcr operation
-	mrc	p15, 4, r2, c1, c1, 1
-	ldr	r3, =(HDCR_TPM|HDCR_TPMCR)
-	.if \operation == vmentry
-	orr	r2, r2, r3		@ Trap some perfmon accesses
-	.else
-	bic	r2, r2, r3		@ Don't trap any perfmon accesses
-	.endif
-	mcr	p15, 4, r2, c1, c1, 1
-.endm
-
-/* Enable/Disable: stage-2 trans., trap interrupts, trap wfi, trap smc */
-.macro configure_hyp_role operation
-	.if \operation == vmentry
-	ldr	r2, [vcpu, #VCPU_HCR]
-	ldr	r3, [vcpu, #VCPU_IRQ_LINES]
-	orr	r2, r2, r3
-	.else
-	mov	r2, #0
-	.endif
-	mcr	p15, 4, r2, c1, c1, 0	@ HCR
-.endm
-
-.macro load_vcpu
-	mrc	p15, 4, vcpu, c13, c0, 2	@ HTPIDR
-.endm

+ 23 - 0
arch/arm/kvm/mmu.c

@@ -28,6 +28,7 @@
 #include <asm/kvm_mmio.h>
 #include <asm/kvm_asm.h>
 #include <asm/kvm_emulate.h>
+#include <asm/virt.h>
 
 #include "trace.h"
 
@@ -598,6 +599,9 @@ int create_hyp_mappings(void *from, void *to)
 	unsigned long start = KERN_TO_HYP((unsigned long)from);
 	unsigned long end = KERN_TO_HYP((unsigned long)to);
 
+	if (is_kernel_in_hyp_mode())
+		return 0;
+
 	start = start & PAGE_MASK;
 	end = PAGE_ALIGN(end);
 
@@ -630,6 +634,9 @@ int create_hyp_io_mappings(void *from, void *to, phys_addr_t phys_addr)
 	unsigned long start = KERN_TO_HYP((unsigned long)from);
 	unsigned long end = KERN_TO_HYP((unsigned long)to);
 
+	if (is_kernel_in_hyp_mode())
+		return 0;
+
 	/* Check for a valid kernel IO mapping */
 	if (!is_vmalloc_addr(from) || !is_vmalloc_addr(to - 1))
 		return -EINVAL;
@@ -1430,6 +1437,22 @@ int kvm_handle_guest_abort(struct kvm_vcpu *vcpu, struct kvm_run *run)
 			goto out_unlock;
 		}
 
+		/*
+		 * Check for a cache maintenance operation. Since we
+		 * ended-up here, we know it is outside of any memory
+		 * slot. But we can't find out if that is for a device,
+		 * or if the guest is just being stupid. The only thing
+		 * we know for sure is that this range cannot be cached.
+		 *
+		 * So let's assume that the guest is just being
+		 * cautious, and skip the instruction.
+		 */
+		if (kvm_vcpu_dabt_is_cm(vcpu)) {
+			kvm_skip_instr(vcpu, kvm_vcpu_trap_il_is32bit(vcpu));
+			ret = 1;
+			goto out_unlock;
+		}
+
 		/*
 		 * The IPA is reported as [MAX:12], so we need to
 		 * complement it with the bottom 12 bits from the

+ 1 - 1
arch/arm/kvm/reset.c

@@ -71,7 +71,7 @@ int kvm_reset_vcpu(struct kvm_vcpu *vcpu)
 	}
 
 	/* Reset core registers */
-	memcpy(&vcpu->arch.regs, reset_regs, sizeof(vcpu->arch.regs));
+	memcpy(&vcpu->arch.ctxt.gp_regs, reset_regs, sizeof(vcpu->arch.ctxt.gp_regs));
 
 	/* Reset CP15 registers */
 	kvm_reset_coprocs(vcpu);

+ 13 - 0
arch/arm64/Kconfig

@@ -750,6 +750,19 @@ config ARM64_LSE_ATOMICS
 	  not support these instructions and requires the kernel to be
 	  built with binutils >= 2.25.
 
+config ARM64_VHE
+	bool "Enable support for Virtualization Host Extensions (VHE)"
+	default y
+	help
+	  Virtualization Host Extensions (VHE) allow the kernel to run
+	  directly at EL2 (instead of EL1) on processors that support
+	  them. This leads to better performance for KVM, as it reduces
+	  the cost of the world switch.
+
+	  Selecting this option allows the VHE feature to be detected
+	  at runtime, and does not affect processors that do not
+	  implement this feature.
+
 endmenu
 
 endmenu

+ 5 - 1
arch/arm64/include/asm/cpufeature.h

@@ -30,8 +30,12 @@
 #define ARM64_HAS_LSE_ATOMICS			5
 #define ARM64_WORKAROUND_CAVIUM_23154		6
 #define ARM64_WORKAROUND_834220			7
+/* #define ARM64_HAS_NO_HW_PREFETCH		8 */
+/* #define ARM64_HAS_UAO			9 */
+/* #define ARM64_ALT_PAN_NOT_UAO		10 */
+#define ARM64_HAS_VIRT_HOST_EXTN		11
 
-#define ARM64_NCAPS				8
+#define ARM64_NCAPS				12
 
 #ifndef __ASSEMBLY__
 

+ 13 - 5
arch/arm64/include/asm/hw_breakpoint.h

@@ -18,6 +18,7 @@
 
 #include <asm/cputype.h>
 #include <asm/cpufeature.h>
+#include <asm/virt.h>
 
 #ifdef __KERNEL__
 
@@ -35,10 +36,21 @@ struct arch_hw_breakpoint {
 	struct arch_hw_breakpoint_ctrl ctrl;
 };
 
+/* Privilege Levels */
+#define AARCH64_BREAKPOINT_EL1	1
+#define AARCH64_BREAKPOINT_EL0	2
+
+#define DBG_HMC_HYP		(1 << 13)
+
 static inline u32 encode_ctrl_reg(struct arch_hw_breakpoint_ctrl ctrl)
 {
-	return (ctrl.len << 5) | (ctrl.type << 3) | (ctrl.privilege << 1) |
+	u32 val = (ctrl.len << 5) | (ctrl.type << 3) | (ctrl.privilege << 1) |
 		ctrl.enabled;
+
+	if (is_kernel_in_hyp_mode() && ctrl.privilege == AARCH64_BREAKPOINT_EL1)
+		val |= DBG_HMC_HYP;
+
+	return val;
 }
 
 static inline void decode_ctrl_reg(u32 reg,
@@ -61,10 +73,6 @@ static inline void decode_ctrl_reg(u32 reg,
 #define ARM_BREAKPOINT_STORE	2
 #define AARCH64_ESR_ACCESS_MASK	(1 << 6)
 
-/* Privilege Levels */
-#define AARCH64_BREAKPOINT_EL1	1
-#define AARCH64_BREAKPOINT_EL0	2
-
 /* Lengths */
 #define ARM_BREAKPOINT_LEN_1	0x1
 #define ARM_BREAKPOINT_LEN_2	0x3

+ 5 - 1
arch/arm64/include/asm/kvm_arm.h

@@ -23,6 +23,7 @@
 #include <asm/types.h>
 
 /* Hyp Configuration Register (HCR) bits */
+#define HCR_E2H		(UL(1) << 34)
 #define HCR_ID		(UL(1) << 33)
 #define HCR_CD		(UL(1) << 32)
 #define HCR_RW_SHIFT	31
@@ -81,7 +82,7 @@
 			 HCR_AMO | HCR_SWIO | HCR_TIDCP | HCR_RW)
 #define HCR_VIRT_EXCP_MASK (HCR_VA | HCR_VI | HCR_VF)
 #define HCR_INT_OVERRIDE   (HCR_FMO | HCR_IMO)
-
+#define HCR_HOST_VHE_FLAGS (HCR_RW | HCR_TGE | HCR_E2H)
 
 /* Hyp System Control Register (SCTLR_EL2) bits */
 #define SCTLR_EL2_EE	(1 << 25)
@@ -216,4 +217,7 @@
 	ECN(SOFTSTP_CUR), ECN(WATCHPT_LOW), ECN(WATCHPT_CUR), \
 	ECN(BKPT32), ECN(VECTOR32), ECN(BRK64)
 
+#define CPACR_EL1_FPEN		(3 << 20)
+#define CPACR_EL1_TTA		(1 << 28)
+
 #endif /* __ARM64_KVM_ARM_H__ */

+ 3 - 3
arch/arm64/include/asm/kvm_asm.h

@@ -35,9 +35,6 @@ extern char __kvm_hyp_init_end[];
 
 extern char __kvm_hyp_vector[];
 
-#define	__kvm_hyp_code_start	__hyp_text_start
-#define	__kvm_hyp_code_end	__hyp_text_end
-
 extern void __kvm_flush_vm_context(void);
 extern void __kvm_tlb_flush_vmid_ipa(struct kvm *kvm, phys_addr_t ipa);
 extern void __kvm_tlb_flush_vmid(struct kvm *kvm);
@@ -45,9 +42,12 @@ extern void __kvm_tlb_flush_vmid(struct kvm *kvm);
 extern int __kvm_vcpu_run(struct kvm_vcpu *vcpu);
 
 extern u64 __vgic_v3_get_ich_vtr_el2(void);
+extern void __vgic_v3_init_lrs(void);
 
 extern u32 __kvm_get_mdcr_el2(void);
 
+extern void __init_stage2_translation(void);
+
 #endif
 
 #endif /* __ARM_KVM_ASM_H__ */

+ 8 - 0
arch/arm64/include/asm/kvm_emulate.h

@@ -29,6 +29,7 @@
 #include <asm/kvm_mmio.h>
 #include <asm/ptrace.h>
 #include <asm/cputype.h>
+#include <asm/virt.h>
 
 unsigned long *vcpu_reg32(const struct kvm_vcpu *vcpu, u8 reg_num);
 unsigned long *vcpu_spsr32(const struct kvm_vcpu *vcpu);
@@ -43,6 +44,8 @@ void kvm_inject_pabt(struct kvm_vcpu *vcpu, unsigned long addr);
 static inline void vcpu_reset_hcr(struct kvm_vcpu *vcpu)
 {
 	vcpu->arch.hcr_el2 = HCR_GUEST_FLAGS;
+	if (is_kernel_in_hyp_mode())
+		vcpu->arch.hcr_el2 |= HCR_E2H;
 	if (test_bit(KVM_ARM_VCPU_EL1_32BIT, vcpu->arch.features))
 		vcpu->arch.hcr_el2 &= ~HCR_RW;
 }
@@ -189,6 +192,11 @@ static inline bool kvm_vcpu_dabt_iss1tw(const struct kvm_vcpu *vcpu)
 	return !!(kvm_vcpu_get_hsr(vcpu) & ESR_ELx_S1PTW);
 }
 
+static inline bool kvm_vcpu_dabt_is_cm(const struct kvm_vcpu *vcpu)
+{
+	return !!(kvm_vcpu_get_hsr(vcpu) & ESR_ELx_CM);
+}
+
 static inline int kvm_vcpu_dabt_get_as(const struct kvm_vcpu *vcpu)
 {
 	return 1 << ((kvm_vcpu_get_hsr(vcpu) & ESR_ELx_SAS) >> ESR_ELx_SAS_SHIFT);

+ 33 - 1
arch/arm64/include/asm/kvm_host.h

@@ -25,7 +25,9 @@
 #include <linux/types.h>
 #include <linux/kvm_types.h>
 #include <asm/kvm.h>
+#include <asm/kvm_asm.h>
 #include <asm/kvm_mmio.h>
+#include <asm/kvm_perf_event.h>
 
 #define __KVM_HAVE_ARCH_INTC_INITIALIZED
 
@@ -36,10 +38,11 @@
 
 #include <kvm/arm_vgic.h>
 #include <kvm/arm_arch_timer.h>
+#include <kvm/arm_pmu.h>
 
 #define KVM_MAX_VCPUS VGIC_V3_MAX_CPUS
 
-#define KVM_VCPU_MAX_FEATURES 3
+#define KVM_VCPU_MAX_FEATURES 4
 
 int __attribute_const__ kvm_target_cpu(void);
 int kvm_reset_vcpu(struct kvm_vcpu *vcpu);
@@ -114,6 +117,21 @@ enum vcpu_sysreg {
 	MDSCR_EL1,	/* Monitor Debug System Control Register */
 	MDCCINT_EL1,	/* Monitor Debug Comms Channel Interrupt Enable Reg */
 
+	/* Performance Monitors Registers */
+	PMCR_EL0,	/* Control Register */
+	PMSELR_EL0,	/* Event Counter Selection Register */
+	PMEVCNTR0_EL0,	/* Event Counter Register (0-30) */
+	PMEVCNTR30_EL0 = PMEVCNTR0_EL0 + 30,
+	PMCCNTR_EL0,	/* Cycle Counter Register */
+	PMEVTYPER0_EL0,	/* Event Type Register (0-30) */
+	PMEVTYPER30_EL0 = PMEVTYPER0_EL0 + 30,
+	PMCCFILTR_EL0,	/* Cycle Count Filter Register */
+	PMCNTENSET_EL0,	/* Count Enable Set Register */
+	PMINTENSET_EL1,	/* Interrupt Enable Set Register */
+	PMOVSSET_EL0,	/* Overflow Flag Status Set Register */
+	PMSWINC_EL0,	/* Software Increment Register */
+	PMUSERENR_EL0,	/* User Enable Register */
+
 	/* 32bit specific registers. Keep them at the end of the range */
 	DACR32_EL2,	/* Domain Access Control Register */
 	IFSR32_EL2,	/* Instruction Fault Status Register */
@@ -211,6 +229,7 @@ struct kvm_vcpu_arch {
 	/* VGIC state */
 	struct vgic_cpu vgic_cpu;
 	struct arch_timer_cpu timer_cpu;
+	struct kvm_pmu pmu;
 
 	/*
 	 * Anything that is not used directly from assembly code goes
@@ -342,5 +361,18 @@ void kvm_arm_init_debug(void);
 void kvm_arm_setup_debug(struct kvm_vcpu *vcpu);
 void kvm_arm_clear_debug(struct kvm_vcpu *vcpu);
 void kvm_arm_reset_debug_ptr(struct kvm_vcpu *vcpu);
+int kvm_arm_vcpu_arch_set_attr(struct kvm_vcpu *vcpu,
+			       struct kvm_device_attr *attr);
+int kvm_arm_vcpu_arch_get_attr(struct kvm_vcpu *vcpu,
+			       struct kvm_device_attr *attr);
+int kvm_arm_vcpu_arch_has_attr(struct kvm_vcpu *vcpu,
+			       struct kvm_device_attr *attr);
+
+#define kvm_call_hyp(f, ...) __kvm_call_hyp(kvm_ksym_ref(f), ##__VA_ARGS__)
+
+static inline void __cpu_init_stage2(void)
+{
+	kvm_call_hyp(__init_stage2_translation);
+}
 
 #endif /* __ARM64_KVM_HOST_H__ */

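The per-counter PMU state added above is laid out as two contiguous ranges in the vcpu's sys_regs array (PMEVCNTR0_EL0..PMEVCNTR30_EL0 and PMEVTYPER0_EL0..PMEVTYPER30_EL0), so counter n is reached by simple offsetting. A minimal sketch of such an accessor (the helper name is invented here; the real accessors live in the PMU emulation code):

	/* Illustrative only: fetch the saved value of event counter n (0..30). */
	static u64 pmu_read_evcntr(struct kvm_vcpu *vcpu, unsigned int n)
	{
		return vcpu->arch.ctxt.sys_regs[PMEVCNTR0_EL0 + n];
	}
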
+ 181 - 0
arch/arm64/include/asm/kvm_hyp.h

@@ -0,0 +1,181 @@
+/*
+ * Copyright (C) 2015 - ARM Ltd
+ * Author: Marc Zyngier <marc.zyngier@arm.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef __ARM64_KVM_HYP_H__
+#define __ARM64_KVM_HYP_H__
+
+#include <linux/compiler.h>
+#include <linux/kvm_host.h>
+#include <asm/kvm_mmu.h>
+#include <asm/kvm_perf_event.h>
+#include <asm/sysreg.h>
+
+#define __hyp_text __section(.hyp.text) notrace
+
+static inline unsigned long __kern_hyp_va(unsigned long v)
+{
+	asm volatile(ALTERNATIVE("and %0, %0, %1",
+				 "nop",
+				 ARM64_HAS_VIRT_HOST_EXTN)
+		     : "+r" (v) : "i" (HYP_PAGE_OFFSET_MASK));
+	return v;
+}
+
+#define kern_hyp_va(v) (typeof(v))(__kern_hyp_va((unsigned long)(v)))
+
+static inline unsigned long __hyp_kern_va(unsigned long v)
+{
+	u64 offset = PAGE_OFFSET - HYP_PAGE_OFFSET;
+	asm volatile(ALTERNATIVE("add %0, %0, %1",
+				 "nop",
+				 ARM64_HAS_VIRT_HOST_EXTN)
+		     : "+r" (v) : "r" (offset));
+	return v;
+}
+
+#define hyp_kern_va(v) (typeof(v))(__hyp_kern_va((unsigned long)(v)))
+
+#define read_sysreg_elx(r,nvh,vh)					\
+	({								\
+		u64 reg;						\
+		asm volatile(ALTERNATIVE("mrs %0, " __stringify(r##nvh),\
+					 "mrs_s %0, " __stringify(r##vh),\
+					 ARM64_HAS_VIRT_HOST_EXTN)	\
+			     : "=r" (reg));				\
+		reg;							\
+	})
+
+#define write_sysreg_elx(v,r,nvh,vh)					\
+	do {								\
+		u64 __val = (u64)(v);					\
+		asm volatile(ALTERNATIVE("msr " __stringify(r##nvh) ", %x0",\
+					 "msr_s " __stringify(r##vh) ", %x0",\
+					 ARM64_HAS_VIRT_HOST_EXTN)	\
+					 : : "rZ" (__val));		\
+	} while (0)
+
+/*
+ * Unified accessors for registers that have a different encoding
+ * between VHE and non-VHE. They must be specified without their "ELx"
+ * encoding.
+ */
+#define read_sysreg_el2(r)						\
+	({								\
+		u64 reg;						\
+		asm volatile(ALTERNATIVE("mrs %0, " __stringify(r##_EL2),\
+					 "mrs %0, " __stringify(r##_EL1),\
+					 ARM64_HAS_VIRT_HOST_EXTN)	\
+			     : "=r" (reg));				\
+		reg;							\
+	})
+
+#define write_sysreg_el2(v,r)						\
+	do {								\
+		u64 __val = (u64)(v);					\
+		asm volatile(ALTERNATIVE("msr " __stringify(r##_EL2) ", %x0",\
+					 "msr " __stringify(r##_EL1) ", %x0",\
+					 ARM64_HAS_VIRT_HOST_EXTN)	\
+					 : : "rZ" (__val));		\
+	} while (0)
+
+#define read_sysreg_el0(r)	read_sysreg_elx(r, _EL0, _EL02)
+#define write_sysreg_el0(v,r)	write_sysreg_elx(v, r, _EL0, _EL02)
+#define read_sysreg_el1(r)	read_sysreg_elx(r, _EL1, _EL12)
+#define write_sysreg_el1(v,r)	write_sysreg_elx(v, r, _EL1, _EL12)
+
+/* The VHE specific system registers and their encoding */
+#define sctlr_EL12              sys_reg(3, 5, 1, 0, 0)
+#define cpacr_EL12              sys_reg(3, 5, 1, 0, 2)
+#define ttbr0_EL12              sys_reg(3, 5, 2, 0, 0)
+#define ttbr1_EL12              sys_reg(3, 5, 2, 0, 1)
+#define tcr_EL12                sys_reg(3, 5, 2, 0, 2)
+#define afsr0_EL12              sys_reg(3, 5, 5, 1, 0)
+#define afsr1_EL12              sys_reg(3, 5, 5, 1, 1)
+#define esr_EL12                sys_reg(3, 5, 5, 2, 0)
+#define far_EL12                sys_reg(3, 5, 6, 0, 0)
+#define mair_EL12               sys_reg(3, 5, 10, 2, 0)
+#define amair_EL12              sys_reg(3, 5, 10, 3, 0)
+#define vbar_EL12               sys_reg(3, 5, 12, 0, 0)
+#define contextidr_EL12         sys_reg(3, 5, 13, 0, 1)
+#define cntkctl_EL12            sys_reg(3, 5, 14, 1, 0)
+#define cntp_tval_EL02          sys_reg(3, 5, 14, 2, 0)
+#define cntp_ctl_EL02           sys_reg(3, 5, 14, 2, 1)
+#define cntp_cval_EL02          sys_reg(3, 5, 14, 2, 2)
+#define cntv_tval_EL02          sys_reg(3, 5, 14, 3, 0)
+#define cntv_ctl_EL02           sys_reg(3, 5, 14, 3, 1)
+#define cntv_cval_EL02          sys_reg(3, 5, 14, 3, 2)
+#define spsr_EL12               sys_reg(3, 5, 4, 0, 0)
+#define elr_EL12                sys_reg(3, 5, 4, 0, 1)
+
+/**
+ * hyp_alternate_select - Generates patchable code sequences that are
+ * used to switch between two implementations of a function, depending
+ * on the availability of a feature.
+ *
+ * @fname: a symbol name that will be defined as a function returning a
+ * function pointer whose type will match @orig and @alt
+ * @orig: A pointer to the default function, as returned by @fname when
+ * @cond doesn't hold
+ * @alt: A pointer to the alternate function, as returned by @fname
+ * when @cond holds
+ * @cond: a CPU feature (as described in asm/cpufeature.h)
+ */
+#define hyp_alternate_select(fname, orig, alt, cond)			\
+typeof(orig) * __hyp_text fname(void)					\
+{									\
+	typeof(alt) *val = orig;					\
+	asm volatile(ALTERNATIVE("nop		\n",			\
+				 "mov	%0, %1	\n",			\
+				 cond)					\
+		     : "+r" (val) : "r" (alt));				\
+	return val;							\
+}
+
+void __vgic_v2_save_state(struct kvm_vcpu *vcpu);
+void __vgic_v2_restore_state(struct kvm_vcpu *vcpu);
+
+void __vgic_v3_save_state(struct kvm_vcpu *vcpu);
+void __vgic_v3_restore_state(struct kvm_vcpu *vcpu);
+
+void __timer_save_state(struct kvm_vcpu *vcpu);
+void __timer_restore_state(struct kvm_vcpu *vcpu);
+
+void __sysreg_save_host_state(struct kvm_cpu_context *ctxt);
+void __sysreg_restore_host_state(struct kvm_cpu_context *ctxt);
+void __sysreg_save_guest_state(struct kvm_cpu_context *ctxt);
+void __sysreg_restore_guest_state(struct kvm_cpu_context *ctxt);
+void __sysreg32_save_state(struct kvm_vcpu *vcpu);
+void __sysreg32_restore_state(struct kvm_vcpu *vcpu);
+
+void __debug_save_state(struct kvm_vcpu *vcpu,
+			struct kvm_guest_debug_arch *dbg,
+			struct kvm_cpu_context *ctxt);
+void __debug_restore_state(struct kvm_vcpu *vcpu,
+			   struct kvm_guest_debug_arch *dbg,
+			   struct kvm_cpu_context *ctxt);
+void __debug_cond_save_host_state(struct kvm_vcpu *vcpu);
+void __debug_cond_restore_host_state(struct kvm_vcpu *vcpu);
+
+void __fpsimd_save_state(struct user_fpsimd_state *fp_regs);
+void __fpsimd_restore_state(struct user_fpsimd_state *fp_regs);
+bool __fpsimd_enabled(void);
+
+u64 __guest_enter(struct kvm_vcpu *vcpu, struct kvm_cpu_context *host_ctxt);
+void __noreturn __hyp_do_panic(unsigned long, ...);
+
+#endif /* __ARM64_KVM_HYP_H__ */
+

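hyp_alternate_select() is the C-side counterpart of the assembly alternatives: it emits a function that returns one of two function pointers, patched at boot according to a CPU capability, and callers invoke the result twice (once to pick the variant, once to run it). A small usage sketch under invented names; switch.c and sysreg-sr.c below use exactly this pattern:

	static bool __hyp_text __probe_nvhe(void) { return false; }
	static bool __hyp_text __probe_vhe(void)  { return true; }

	static hyp_alternate_select(__probe, __probe_nvhe, __probe_vhe,
				    ARM64_HAS_VIRT_HOST_EXTN);

	static bool __hyp_text probe_is_vhe(void)
	{
		/* First call returns the patched function pointer, second call runs it. */
		return __probe()();
	}
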
+ 11 - 1
arch/arm64/include/asm/kvm_mmu.h

@@ -23,13 +23,16 @@
 #include <asm/cpufeature.h>
 
 /*
- * As we only have the TTBR0_EL2 register, we cannot express
+ * As ARMv8.0 only has the TTBR0_EL2 register, we cannot express
  * "negative" addresses. This makes it impossible to directly share
  * mappings with the kernel.
  *
  * Instead, give the HYP mode its own VA region at a fixed offset from
  * the kernel by just masking the top bits (which are all ones for a
  * kernel address).
+ *
+ * ARMv8.1 (using VHE) does have a TTBR1_EL2, and doesn't use these
+ * macros (the entire kernel runs at EL2).
  */
 #define HYP_PAGE_OFFSET_SHIFT	VA_BITS
 #define HYP_PAGE_OFFSET_MASK	((UL(1) << HYP_PAGE_OFFSET_SHIFT) - 1)
@@ -56,12 +59,19 @@
 
 #ifdef __ASSEMBLY__
 
+#include <asm/alternative.h>
+#include <asm/cpufeature.h>
+
 /*
  * Convert a kernel VA into a HYP VA.
  * reg: VA to be converted.
  */
 .macro kern_hyp_va	reg
+alternative_if_not ARM64_HAS_VIRT_HOST_EXTN	
 	and	\reg, \reg, #HYP_PAGE_OFFSET_MASK
+alternative_else
+	nop
+alternative_endif
 .endm
 
 #else

+ 68 - 0
arch/arm64/include/asm/kvm_perf_event.h

@@ -0,0 +1,68 @@
+/*
+ * Copyright (C) 2012 ARM Ltd.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef __ASM_KVM_PERF_EVENT_H
+#define __ASM_KVM_PERF_EVENT_H
+
+#define	ARMV8_PMU_MAX_COUNTERS	32
+#define	ARMV8_PMU_COUNTER_MASK	(ARMV8_PMU_MAX_COUNTERS - 1)
+
+/*
+ * Per-CPU PMCR: config reg
+ */
+#define ARMV8_PMU_PMCR_E	(1 << 0) /* Enable all counters */
+#define ARMV8_PMU_PMCR_P	(1 << 1) /* Reset all counters */
+#define ARMV8_PMU_PMCR_C	(1 << 2) /* Cycle counter reset */
+#define ARMV8_PMU_PMCR_D	(1 << 3) /* CCNT counts every 64th cpu cycle */
+#define ARMV8_PMU_PMCR_X	(1 << 4) /* Export to ETM */
+#define ARMV8_PMU_PMCR_DP	(1 << 5) /* Disable CCNT if non-invasive debug*/
+/* Determines which bit of PMCCNTR_EL0 generates an overflow */
+#define ARMV8_PMU_PMCR_LC	(1 << 6)
+#define	ARMV8_PMU_PMCR_N_SHIFT	11	 /* Number of counters supported */
+#define	ARMV8_PMU_PMCR_N_MASK	0x1f
+#define	ARMV8_PMU_PMCR_MASK	0x7f	 /* Mask for writable bits */
+
+/*
+ * PMOVSR: counters overflow flag status reg
+ */
+#define	ARMV8_PMU_OVSR_MASK		0xffffffff	/* Mask for writable bits */
+#define	ARMV8_PMU_OVERFLOWED_MASK	ARMV8_PMU_OVSR_MASK
+
+/*
+ * PMXEVTYPER: Event selection reg
+ */
+#define	ARMV8_PMU_EVTYPE_MASK	0xc80003ff	/* Mask for writable bits */
+#define	ARMV8_PMU_EVTYPE_EVENT	0x3ff		/* Mask for EVENT bits */
+
+#define ARMV8_PMU_EVTYPE_EVENT_SW_INCR	0	/* Software increment event */
+
+/*
+ * Event filters for PMUv3
+ */
+#define	ARMV8_PMU_EXCLUDE_EL1	(1 << 31)
+#define	ARMV8_PMU_EXCLUDE_EL0	(1 << 30)
+#define	ARMV8_PMU_INCLUDE_EL2	(1 << 27)
+
+/*
+ * PMUSERENR: user enable reg
+ */
+#define ARMV8_PMU_USERENR_MASK	0xf		/* Mask for writable bits */
+#define ARMV8_PMU_USERENR_EN	(1 << 0) /* PMU regs can be accessed at EL0 */
+#define ARMV8_PMU_USERENR_SW	(1 << 1) /* PMSWINC can be written at EL0 */
+#define ARMV8_PMU_USERENR_CR	(1 << 2) /* Cycle counter can be read at EL0 */
+#define ARMV8_PMU_USERENR_ER	(1 << 3) /* Event counter can be read at EL0 */
+
+#endif

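As an example of using these masks, the number of implemented event counters is advertised in PMCR_EL0.N and can be extracted as follows (a small illustrative helper, not part of the patch):

	static inline u32 pmcr_num_counters(u64 pmcr)
	{
		/* PMCR_EL0.N sits at bits [15:11]; 0 means only the cycle counter exists. */
		return (pmcr >> ARMV8_PMU_PMCR_N_SHIFT) & ARMV8_PMU_PMCR_N_MASK;
	}
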
+ 10 - 0
arch/arm64/include/asm/virt.h

@@ -23,6 +23,8 @@
 
 #ifndef __ASSEMBLY__
 
+#include <asm/ptrace.h>
+
 /*
  * __boot_cpu_mode records what mode CPUs were booted in.
  * A correctly-implemented bootloader must start all CPUs in the same mode:
@@ -50,6 +52,14 @@ static inline bool is_hyp_mode_mismatched(void)
 	return __boot_cpu_mode[0] != __boot_cpu_mode[1];
 }
 
+static inline bool is_kernel_in_hyp_mode(void)
+{
+	u64 el;
+
+	asm("mrs %0, CurrentEL" : "=r" (el));
+	return el == CurrentEL_EL2;
+}
+
 /* The section containing the hypervisor text */
 extern char __hyp_text_start[];
 extern char __hyp_text_end[];

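is_kernel_in_hyp_mode() keys off CurrentEL, which reports the exception level in bits [3:2]: the read yields 0x4 at EL1 and CurrentEL_EL2 (0x8) when the kernel runs at EL2. Callers in this series use it to skip work that only a split EL1/EL2 setup needs, as in the create_hyp_mappings() early-return earlier in this diff; for illustration:

	if (is_kernel_in_hyp_mode())
		return 0;	/* VHE: the kernel already runs at EL2, no HYP mapping needed */
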
+ 6 - 0
arch/arm64/include/uapi/asm/kvm.h

@@ -94,6 +94,7 @@ struct kvm_regs {
 #define KVM_ARM_VCPU_POWER_OFF		0 /* CPU is started in OFF state */
 #define KVM_ARM_VCPU_EL1_32BIT		1 /* CPU running a 32bit VM */
 #define KVM_ARM_VCPU_PSCI_0_2		2 /* CPU uses PSCI v0.2 */
+#define KVM_ARM_VCPU_PMU_V3		3 /* Support guest PMUv3 */
 
 struct kvm_vcpu_init {
 	__u32 target;
@@ -204,6 +205,11 @@ struct kvm_arch_memory_slot {
 #define KVM_DEV_ARM_VGIC_GRP_CTRL	4
 #define   KVM_DEV_ARM_VGIC_CTRL_INIT	0
 
+/* Device Control API on vcpu fd */
+#define KVM_ARM_VCPU_PMU_V3_CTRL	0
+#define   KVM_ARM_VCPU_PMU_V3_IRQ	0
+#define   KVM_ARM_VCPU_PMU_V3_INIT	1
+
 /* KVM_IRQ_LINE irq field index values */
 #define KVM_ARM_IRQ_TYPE_SHIFT		24
 #define KVM_ARM_IRQ_TYPE_MASK		0xff

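These constants plug into the existing device-control API, which this series also wires up on the vcpu fd: userspace targets group KVM_ARM_VCPU_PMU_V3_CTRL with one of the two attributes. A hedged userspace sketch, assuming a vcpu_fd whose vcpu was created with the KVM_ARM_VCPU_PMU_V3 feature bit and a PPI number of 23 (both example values; error handling omitted):

	#include <linux/kvm.h>
	#include <sys/ioctl.h>

	int irq = 23;				/* example PPI for the virtual PMU */
	struct kvm_device_attr attr = {
		.group = KVM_ARM_VCPU_PMU_V3_CTRL,
		.attr  = KVM_ARM_VCPU_PMU_V3_IRQ,
		.addr  = (__u64)(unsigned long)&irq,
	};

	ioctl(vcpu_fd, KVM_SET_DEVICE_ATTR, &attr);	/* wire up the overflow interrupt */

	attr.attr = KVM_ARM_VCPU_PMU_V3_INIT;
	attr.addr = 0;
	ioctl(vcpu_fd, KVM_SET_DEVICE_ATTR, &attr);	/* finalize PMU init for this vcpu */
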
+ 0 - 3
arch/arm64/kernel/asm-offsets.c

@@ -110,9 +110,6 @@ int main(void)
   DEFINE(CPU_USER_PT_REGS,	offsetof(struct kvm_regs, regs));
   DEFINE(CPU_FP_REGS,		offsetof(struct kvm_regs, fp_regs));
   DEFINE(VCPU_FPEXC32_EL2,	offsetof(struct kvm_vcpu, arch.ctxt.sys_regs[FPEXC32_EL2]));
-  DEFINE(VCPU_ESR_EL2,		offsetof(struct kvm_vcpu, arch.fault.esr_el2));
-  DEFINE(VCPU_FAR_EL2,		offsetof(struct kvm_vcpu, arch.fault.far_el2));
-  DEFINE(VCPU_HPFAR_EL2,	offsetof(struct kvm_vcpu, arch.fault.hpfar_el2));
   DEFINE(VCPU_HOST_CONTEXT,	offsetof(struct kvm_vcpu, arch.host_cpu_context));
 #endif
 #ifdef CONFIG_CPU_PM

+ 11 - 0
arch/arm64/kernel/cpufeature.c

@@ -26,6 +26,7 @@
 #include <asm/cpu_ops.h>
 #include <asm/processor.h>
 #include <asm/sysreg.h>
+#include <asm/virt.h>
 
 unsigned long elf_hwcap __read_mostly;
 EXPORT_SYMBOL_GPL(elf_hwcap);
@@ -621,6 +622,11 @@ static bool has_useable_gicv3_cpuif(const struct arm64_cpu_capabilities *entry)
 	return has_sre;
 }
 
+static bool runs_at_el2(const struct arm64_cpu_capabilities *entry)
+{
+	return is_kernel_in_hyp_mode();
+}
+
 static const struct arm64_cpu_capabilities arm64_features[] = {
 	{
 		.desc = "GIC system register CPU interface",
@@ -651,6 +657,11 @@ static const struct arm64_cpu_capabilities arm64_features[] = {
 		.min_field_value = 2,
 	},
 #endif /* CONFIG_AS_LSE && CONFIG_ARM64_LSE_ATOMICS */
+	{
+		.desc = "Virtualization Host Extensions",
+		.capability = ARM64_HAS_VIRT_HOST_EXTN,
+		.matches = runs_at_el2,
+	},
 	{},
 };
 

+ 27 - 1
arch/arm64/kernel/head.S

@@ -30,6 +30,7 @@
 #include <asm/cache.h>
 #include <asm/cputype.h>
 #include <asm/kernel-pgtable.h>
+#include <asm/kvm_arm.h>
 #include <asm/memory.h>
 #include <asm/pgtable-hwdef.h>
 #include <asm/pgtable.h>
@@ -464,9 +465,27 @@ CPU_LE(	bic	x0, x0, #(3 << 24)	)	// Clear the EE and E0E bits for EL1
 	isb
 	ret
 
+2:
+#ifdef CONFIG_ARM64_VHE
+	/*
+	 * Check for VHE being present. For the rest of the EL2 setup,
+	 * x2 being non-zero indicates that we do have VHE, and that the
+	 * kernel is intended to run at EL2.
+	 */
+	mrs	x2, id_aa64mmfr1_el1
+	ubfx	x2, x2, #8, #4
+#else
+	mov	x2, xzr
+#endif
+
 	/* Hyp configuration. */
-2:	mov	x0, #(1 << 31)			// 64-bit EL1
+	mov	x0, #HCR_RW			// 64-bit EL1
+	cbz	x2, set_hcr
+	orr	x0, x0, #HCR_TGE		// Enable Host Extensions
+	orr	x0, x0, #HCR_E2H
+set_hcr:
 	msr	hcr_el2, x0
+	isb
 
 	/* Generic timers. */
 	mrs	x0, cnthctl_el2
@@ -526,6 +545,13 @@ CPU_LE(	movk	x0, #0x30d0, lsl #16	)	// Clear EE and E0E on LE systems
 	/* Stage-2 translation */
 	msr	vttbr_el2, xzr
 
+	cbz	x2, install_el2_stub
+
+	mov	w20, #BOOT_CPU_MODE_EL2		// This CPU booted in EL2
+	isb
+	ret
+
+install_el2_stub:
 	/* Hypervisor stub */
 	adrp	x0, __hyp_stub_vectors
 	add	x0, x0, #:lo12:__hyp_stub_vectors

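The ubfx above extracts ID_AA64MMFR1_EL1.VH (bits [11:8]); a non-zero value means the CPU implements VHE, so the boot code keeps HCR_EL2.E2H and TGE set and the kernel stays at EL2. The same check expressed in C, purely for illustration:

	u64 mmfr1;

	asm("mrs %0, id_aa64mmfr1_el1" : "=r" (mmfr1));
	bool has_vhe = ((mmfr1 >> 8) & 0xf) != 0;	/* ID_AA64MMFR1_EL1.VH */
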
+ 5 - 1
arch/arm64/kernel/perf_event.c

@@ -20,6 +20,7 @@
  */
 
 #include <asm/irq_regs.h>
+#include <asm/virt.h>
 
 #include <linux/of.h>
 #include <linux/perf/arm_pmu.h>
@@ -691,9 +692,12 @@ static int armv8pmu_set_event_filter(struct hw_perf_event *event,
 
 	if (attr->exclude_idle)
 		return -EPERM;
+	if (is_kernel_in_hyp_mode() &&
+	    attr->exclude_kernel != attr->exclude_hv)
+		return -EINVAL;
 	if (attr->exclude_user)
 		config_base |= ARMV8_EXCLUDE_EL0;
-	if (attr->exclude_kernel)
+	if (!is_kernel_in_hyp_mode() && attr->exclude_kernel)
 		config_base |= ARMV8_EXCLUDE_EL1;
 	if (!attr->exclude_hv)
 		config_base |= ARMV8_INCLUDE_EL2;

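Because a VHE host kernel runs at EL2, "kernel" and "hypervisor" filtering can no longer be chosen independently, which is what the new exclude_kernel != exclude_hv check enforces. From the perf userspace side the two bits must agree, e.g. (illustrative attribute setup only):

	struct perf_event_attr attr = {
		.type           = PERF_TYPE_HARDWARE,
		.config         = PERF_COUNT_HW_CPU_CYCLES,
		.exclude_kernel = 1,	/* on a VHE host these two must match, */
		.exclude_hv     = 1,	/* otherwise the event is rejected with -EINVAL */
	};
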
+ 7 - 0
arch/arm64/kvm/Kconfig

@@ -36,6 +36,7 @@ config KVM
 	select HAVE_KVM_EVENTFD
 	select HAVE_KVM_IRQFD
 	select KVM_ARM_VGIC_V3
+	select KVM_ARM_PMU if HW_PERF_EVENTS
 	---help---
 	  Support hosting virtualized guest machines.
 	  We don't support KVM with 16K page tables yet, due to the multiple
@@ -48,6 +49,12 @@ config KVM_ARM_HOST
 	---help---
 	  Provides host support for ARM processors.
 
+config KVM_ARM_PMU
+	bool
+	---help---
+	  Adds support for a virtual Performance Monitoring Unit (PMU) in
+	  virtual machines.
+
 source drivers/vhost/Kconfig
 
 endif # VIRTUALIZATION

+ 1 - 0
arch/arm64/kvm/Makefile

@@ -26,3 +26,4 @@ kvm-$(CONFIG_KVM_ARM_HOST) += $(KVM)/arm/vgic-v2-emul.o
 kvm-$(CONFIG_KVM_ARM_HOST) += $(KVM)/arm/vgic-v3.o
 kvm-$(CONFIG_KVM_ARM_HOST) += $(KVM)/arm/vgic-v3-emul.o
 kvm-$(CONFIG_KVM_ARM_HOST) += $(KVM)/arm/arch_timer.o
+kvm-$(CONFIG_KVM_ARM_PMU) += $(KVM)/arm/pmu.o

+ 51 - 0
arch/arm64/kvm/guest.c

@@ -380,3 +380,54 @@ int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu,
 	}
 	return 0;
 }
+
+int kvm_arm_vcpu_arch_set_attr(struct kvm_vcpu *vcpu,
+			       struct kvm_device_attr *attr)
+{
+	int ret;
+
+	switch (attr->group) {
+	case KVM_ARM_VCPU_PMU_V3_CTRL:
+		ret = kvm_arm_pmu_v3_set_attr(vcpu, attr);
+		break;
+	default:
+		ret = -ENXIO;
+		break;
+	}
+
+	return ret;
+}
+
+int kvm_arm_vcpu_arch_get_attr(struct kvm_vcpu *vcpu,
+			       struct kvm_device_attr *attr)
+{
+	int ret;
+
+	switch (attr->group) {
+	case KVM_ARM_VCPU_PMU_V3_CTRL:
+		ret = kvm_arm_pmu_v3_get_attr(vcpu, attr);
+		break;
+	default:
+		ret = -ENXIO;
+		break;
+	}
+
+	return ret;
+}
+
+int kvm_arm_vcpu_arch_has_attr(struct kvm_vcpu *vcpu,
+			       struct kvm_device_attr *attr)
+{
+	int ret;
+
+	switch (attr->group) {
+	case KVM_ARM_VCPU_PMU_V3_CTRL:
+		ret = kvm_arm_pmu_v3_has_attr(vcpu, attr);
+		break;
+	default:
+		ret = -ENXIO;
+		break;
+	}
+
+	return ret;
+}

+ 1 - 14
arch/arm64/kvm/hyp-init.S

@@ -87,26 +87,13 @@ __do_hyp_init:
 #endif
 	/*
 	 * Read the PARange bits from ID_AA64MMFR0_EL1 and set the PS bits in
-	 * TCR_EL2 and VTCR_EL2.
+	 * TCR_EL2.
 	 */
 	mrs	x5, ID_AA64MMFR0_EL1
 	bfi	x4, x5, #16, #3
 
 	msr	tcr_el2, x4
 
-	ldr	x4, =VTCR_EL2_FLAGS
-	bfi	x4, x5, #16, #3
-	/*
-	 * Read the VMIDBits bits from ID_AA64MMFR1_EL1 and set the VS bit in
-	 * VTCR_EL2.
-	 */
-	mrs	x5, ID_AA64MMFR1_EL1
-	ubfx	x5, x5, #5, #1
-	lsl	x5, x5, #VTCR_EL2_VS
-	orr	x4, x4, x5
-
-	msr	vtcr_el2, x4
-
 	mrs	x4, mair_el1
 	msr	mair_el2, x4
 	isb

+ 7 - 0
arch/arm64/kvm/hyp.S

@@ -17,7 +17,9 @@
 
 #include <linux/linkage.h>
 
+#include <asm/alternative.h>
 #include <asm/assembler.h>
+#include <asm/cpufeature.h>
 
 /*
  * u64 kvm_call_hyp(void *hypfn, ...);
@@ -38,6 +40,11 @@
  * arch/arm64/kernel/hyp_stub.S.
  */
 ENTRY(kvm_call_hyp)
+alternative_if_not ARM64_HAS_VIRT_HOST_EXTN	
 	hvc	#0
 	ret
+alternative_else
+	b	__vhe_hyp_call
+	nop
+alternative_endif
 ENDPROC(kvm_call_hyp)

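On VHE there is no EL1-to-EL2 transition to make, so the "hypercall" degenerates into a plain branch to __vhe_hyp_call, while non-VHE still issues HVC #0 and lands in the el1_sync handler. A conceptual C model of what kvm_call_hyp(fn, a, b, c) amounts to (sketch only; the trap helper name is invented):

	if (is_kernel_in_hyp_mode())
		ret = fn(a, b, c);			/* VHE: direct call, already at EL2 */
	else
		ret = trap_to_el2_via_hvc(fn, a, b, c);	/* non-VHE: enter via the EL2 vectors */
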
+ 6 - 2
arch/arm64/kvm/hyp/Makefile

@@ -2,9 +2,12 @@
 # Makefile for Kernel-based Virtual Machine module, HYP part
 #
 
-obj-$(CONFIG_KVM_ARM_HOST) += vgic-v2-sr.o
+KVM=../../../../virt/kvm
+
+obj-$(CONFIG_KVM_ARM_HOST) += $(KVM)/arm/hyp/vgic-v2-sr.o
+obj-$(CONFIG_KVM_ARM_HOST) += $(KVM)/arm/hyp/timer-sr.o
+
 obj-$(CONFIG_KVM_ARM_HOST) += vgic-v3-sr.o
-obj-$(CONFIG_KVM_ARM_HOST) += timer-sr.o
 obj-$(CONFIG_KVM_ARM_HOST) += sysreg-sr.o
 obj-$(CONFIG_KVM_ARM_HOST) += debug-sr.o
 obj-$(CONFIG_KVM_ARM_HOST) += entry.o
@@ -12,3 +15,4 @@ obj-$(CONFIG_KVM_ARM_HOST) += switch.o
 obj-$(CONFIG_KVM_ARM_HOST) += fpsimd.o
 obj-$(CONFIG_KVM_ARM_HOST) += tlb.o
 obj-$(CONFIG_KVM_ARM_HOST) += hyp-entry.o
+obj-$(CONFIG_KVM_ARM_HOST) += s2-setup.o

+ 1 - 3
arch/arm64/kvm/hyp/debug-sr.c

@@ -19,9 +19,7 @@
 #include <linux/kvm_host.h>
 
 #include <asm/kvm_asm.h>
-#include <asm/kvm_mmu.h>
-
-#include "hyp.h"
+#include <asm/kvm_hyp.h>
 
 #define read_debug(r,n)		read_sysreg(r##n##_el1)
 #define write_debug(v,r,n)	write_sysreg(v, r##n##_el1)

+ 6 - 0
arch/arm64/kvm/hyp/entry.S

@@ -130,9 +130,15 @@ ENDPROC(__guest_exit)
 ENTRY(__fpsimd_guest_restore)
 	stp	x4, lr, [sp, #-16]!
 
+alternative_if_not ARM64_HAS_VIRT_HOST_EXTN
 	mrs	x2, cptr_el2
 	bic	x2, x2, #CPTR_EL2_TFP
 	msr	cptr_el2, x2
+alternative_else
+	mrs	x2, cpacr_el1
+	orr	x2, x2, #CPACR_EL1_FPEN
+	msr	cpacr_el1, x2
+alternative_endif
 	isb
 
 	mrs	x3, tpidr_el2

+ 36 - 73
arch/arm64/kvm/hyp/hyp-entry.S

@@ -19,7 +19,6 @@
 
 #include <asm/alternative.h>
 #include <asm/assembler.h>
-#include <asm/asm-offsets.h>
 #include <asm/cpufeature.h>
 #include <asm/kvm_arm.h>
 #include <asm/kvm_asm.h>
@@ -38,10 +37,42 @@
 	ldp	x0, x1, [sp], #16
 .endm
 
+.macro do_el2_call
+	/*
+	 * Shuffle the parameters before calling the function
+	 * pointed to in x0. Assumes parameters in x[1,2,3].
+	 */
+	sub	sp, sp, #16
+	str	lr, [sp]
+	mov	lr, x0
+	mov	x0, x1
+	mov	x1, x2
+	mov	x2, x3
+	blr	lr
+	ldr	lr, [sp]
+	add	sp, sp, #16
+.endm
+
+ENTRY(__vhe_hyp_call)
+	do_el2_call
+	/*
+	 * We used to rely on having an exception return to get
+	 * an implicit isb. In the E2H case, we don't have it anymore.
+	 * Rather than changing all the leaf functions, just do it here
+	 * before returning to the rest of the kernel.
+	 */
+	isb
+	ret
+ENDPROC(__vhe_hyp_call)
+	
 el1_sync:				// Guest trapped into EL2
 	save_x0_to_x3
 
+alternative_if_not ARM64_HAS_VIRT_HOST_EXTN
 	mrs	x1, esr_el2
+alternative_else
+	mrs	x1, esr_el1
+alternative_endif
 	lsr	x2, x1, #ESR_ELx_EC_SHIFT
 
 	cmp	x2, #ESR_ELx_EC_HVC64
@@ -58,19 +89,13 @@ el1_sync:				// Guest trapped into EL2
 	mrs	x0, vbar_el2
 	b	2f
 
-1:	stp	lr, xzr, [sp, #-16]!
-
+1:
 	/*
-	 * Compute the function address in EL2, and shuffle the parameters.
+	 * Perform the EL2 call
 	 */
 	kern_hyp_va	x0
-	mov	lr, x0
-	mov	x0, x1
-	mov	x1, x2
-	mov	x2, x3
-	blr	lr
+	do_el2_call
 
-	ldp	lr, xzr, [sp], #16
 2:	eret
 
 el1_trap:
@@ -83,72 +108,10 @@ el1_trap:
 	cmp	x2, #ESR_ELx_EC_FP_ASIMD
 	b.eq	__fpsimd_guest_restore
 
-	cmp	x2, #ESR_ELx_EC_DABT_LOW
-	mov	x0, #ESR_ELx_EC_IABT_LOW
-	ccmp	x2, x0, #4, ne
-	b.ne	1f		// Not an abort we care about
-
-	/* This is an abort. Check for permission fault */
-alternative_if_not ARM64_WORKAROUND_834220
-	and	x2, x1, #ESR_ELx_FSC_TYPE
-	cmp	x2, #FSC_PERM
-	b.ne	1f		// Not a permission fault
-alternative_else
-	nop			// Use the permission fault path to
-	nop			// check for a valid S1 translation,
-	nop			// regardless of the ESR value.
-alternative_endif
-
-	/*
-	 * Check for Stage-1 page table walk, which is guaranteed
-	 * to give a valid HPFAR_EL2.
-	 */
-	tbnz	x1, #7, 1f	// S1PTW is set
-
-	/* Preserve PAR_EL1 */
-	mrs	x3, par_el1
-	stp	x3, xzr, [sp, #-16]!
-
-	/*
-	 * Permission fault, HPFAR_EL2 is invalid.
-	 * Resolve the IPA the hard way using the guest VA.
-	 * Stage-1 translation already validated the memory access rights.
-	 * As such, we can use the EL1 translation regime, and don't have
-	 * to distinguish between EL0 and EL1 access.
-	 */
-	mrs	x2, far_el2
-	at	s1e1r, x2
-	isb
-
-	/* Read result */
-	mrs	x3, par_el1
-	ldp	x0, xzr, [sp], #16	// Restore PAR_EL1 from the stack
-	msr	par_el1, x0
-	tbnz	x3, #0, 3f		// Bail out if we failed the translation
-	ubfx	x3, x3, #12, #36	// Extract IPA
-	lsl	x3, x3, #4		// and present it like HPFAR
-	b	2f
-
-1:	mrs	x3, hpfar_el2
-	mrs	x2, far_el2
-
-2:	mrs	x0, tpidr_el2
-	str	w1, [x0, #VCPU_ESR_EL2]
-	str	x2, [x0, #VCPU_FAR_EL2]
-	str	x3, [x0, #VCPU_HPFAR_EL2]
-
+	mrs	x0, tpidr_el2
 	mov	x1, #ARM_EXCEPTION_TRAP
 	b	__guest_exit
 
-	/*
-	 * Translation failed. Just return to the guest and
-	 * let it fault again. Another CPU is probably playing
-	 * behind our back.
-	 */
-3:	restore_x0_to_x3
-
-	eret
-
 el1_irq:
 	save_x0_to_x3
 	mrs	x0, tpidr_el2

+ 0 - 90
arch/arm64/kvm/hyp/hyp.h

@@ -1,90 +0,0 @@
-/*
- * Copyright (C) 2015 - ARM Ltd
- * Author: Marc Zyngier <marc.zyngier@arm.com>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program.  If not, see <http://www.gnu.org/licenses/>.
- */
-
-#ifndef __ARM64_KVM_HYP_H__
-#define __ARM64_KVM_HYP_H__
-
-#include <linux/compiler.h>
-#include <linux/kvm_host.h>
-#include <asm/kvm_mmu.h>
-#include <asm/sysreg.h>
-
-#define __hyp_text __section(.hyp.text) notrace
-
-#define kern_hyp_va(v) (typeof(v))((unsigned long)(v) & HYP_PAGE_OFFSET_MASK)
-#define hyp_kern_va(v) (typeof(v))((unsigned long)(v) - HYP_PAGE_OFFSET \
-						      + PAGE_OFFSET)
-
-/**
- * hyp_alternate_select - Generates patchable code sequences that are
- * used to switch between two implementations of a function, depending
- * on the availability of a feature.
- *
- * @fname: a symbol name that will be defined as a function returning a
- * function pointer whose type will match @orig and @alt
- * @orig: A pointer to the default function, as returned by @fname when
- * @cond doesn't hold
- * @alt: A pointer to the alternate function, as returned by @fname
- * when @cond holds
- * @cond: a CPU feature (as described in asm/cpufeature.h)
- */
-#define hyp_alternate_select(fname, orig, alt, cond)			\
-typeof(orig) * __hyp_text fname(void)					\
-{									\
-	typeof(alt) *val = orig;					\
-	asm volatile(ALTERNATIVE("nop		\n",			\
-				 "mov	%0, %1	\n",			\
-				 cond)					\
-		     : "+r" (val) : "r" (alt));				\
-	return val;							\
-}
-
-void __vgic_v2_save_state(struct kvm_vcpu *vcpu);
-void __vgic_v2_restore_state(struct kvm_vcpu *vcpu);
-
-void __vgic_v3_save_state(struct kvm_vcpu *vcpu);
-void __vgic_v3_restore_state(struct kvm_vcpu *vcpu);
-
-void __timer_save_state(struct kvm_vcpu *vcpu);
-void __timer_restore_state(struct kvm_vcpu *vcpu);
-
-void __sysreg_save_state(struct kvm_cpu_context *ctxt);
-void __sysreg_restore_state(struct kvm_cpu_context *ctxt);
-void __sysreg32_save_state(struct kvm_vcpu *vcpu);
-void __sysreg32_restore_state(struct kvm_vcpu *vcpu);
-
-void __debug_save_state(struct kvm_vcpu *vcpu,
-			struct kvm_guest_debug_arch *dbg,
-			struct kvm_cpu_context *ctxt);
-void __debug_restore_state(struct kvm_vcpu *vcpu,
-			   struct kvm_guest_debug_arch *dbg,
-			   struct kvm_cpu_context *ctxt);
-void __debug_cond_save_host_state(struct kvm_vcpu *vcpu);
-void __debug_cond_restore_host_state(struct kvm_vcpu *vcpu);
-
-void __fpsimd_save_state(struct user_fpsimd_state *fp_regs);
-void __fpsimd_restore_state(struct user_fpsimd_state *fp_regs);
-static inline bool __fpsimd_enabled(void)
-{
-	return !(read_sysreg(cptr_el2) & CPTR_EL2_TFP);
-}
-
-u64 __guest_enter(struct kvm_vcpu *vcpu, struct kvm_cpu_context *host_ctxt);
-void __noreturn __hyp_do_panic(unsigned long, ...);
-
-#endif /* __ARM64_KVM_HYP_H__ */
-

+ 43 - 0
arch/arm64/kvm/hyp/s2-setup.c

@@ -0,0 +1,43 @@
+/*
+ * Copyright (C) 2016 - ARM Ltd
+ * Author: Marc Zyngier <marc.zyngier@arm.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <linux/types.h>
+#include <asm/kvm_arm.h>
+#include <asm/kvm_asm.h>
+#include <asm/kvm_hyp.h>
+
+void __hyp_text __init_stage2_translation(void)
+{
+	u64 val = VTCR_EL2_FLAGS;
+	u64 tmp;
+
+	/*
+	 * Read the PARange bits from ID_AA64MMFR0_EL1 and set the PS
+	 * bits in VTCR_EL2. Amusingly, the PARange is 4 bits, while
+	 * PS is only 3. Fortunately, bit 19 is RES0 in VTCR_EL2...
+	 */
+	val |= (read_sysreg(id_aa64mmfr0_el1) & 7) << 16;
+
+	/*
+	 * Read the VMIDBits bits from ID_AA64MMFR1_EL1 and set the VS
+	 * bit in VTCR_EL2.
+	 */
+	tmp = (read_sysreg(id_aa64mmfr1_el1) >> 4) & 0xf;
+	val |= (tmp == 2) ? VTCR_EL2_VS : 0;
+
+	write_sysreg(val, vtcr_el2);
+}

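As a worked example of the two field transfers (values picked for illustration): a CPU reporting ID_AA64MMFR0_EL1.PARange = 0b0101 (48-bit PA) and ID_AA64MMFR1_EL1.VMIDBits = 2 (16-bit VMIDs) ends up doing, in effect:

	val |= 5 << 16;		/* PARange = 5 -> VTCR_EL2.PS = 0b101 in bits [18:16] */
	val |= VTCR_EL2_VS;	/* VMIDBits = 2 -> 16-bit VMIDs supported, set the VS bit */
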
+ 186 - 20
arch/arm64/kvm/hyp/switch.c

@@ -15,7 +15,53 @@
  * along with this program.  If not, see <http://www.gnu.org/licenses/>.
  */
 
-#include "hyp.h"
+#include <linux/types.h>
+#include <asm/kvm_asm.h>
+#include <asm/kvm_hyp.h>
+
+static bool __hyp_text __fpsimd_enabled_nvhe(void)
+{
+	return !(read_sysreg(cptr_el2) & CPTR_EL2_TFP);
+}
+
+static bool __hyp_text __fpsimd_enabled_vhe(void)
+{
+	return !!(read_sysreg(cpacr_el1) & CPACR_EL1_FPEN);
+}
+
+static hyp_alternate_select(__fpsimd_is_enabled,
+			    __fpsimd_enabled_nvhe, __fpsimd_enabled_vhe,
+			    ARM64_HAS_VIRT_HOST_EXTN);
+
+bool __hyp_text __fpsimd_enabled(void)
+{
+	return __fpsimd_is_enabled()();
+}
+
+static void __hyp_text __activate_traps_vhe(void)
+{
+	u64 val;
+
+	val = read_sysreg(cpacr_el1);
+	val |= CPACR_EL1_TTA;
+	val &= ~CPACR_EL1_FPEN;
+	write_sysreg(val, cpacr_el1);
+
+	write_sysreg(__kvm_hyp_vector, vbar_el1);
+}
+
+static void __hyp_text __activate_traps_nvhe(void)
+{
+	u64 val;
+
+	val = CPTR_EL2_DEFAULT;
+	val |= CPTR_EL2_TTA | CPTR_EL2_TFP;
+	write_sysreg(val, cptr_el2);
+}
+
+static hyp_alternate_select(__activate_traps_arch,
+			    __activate_traps_nvhe, __activate_traps_vhe,
+			    ARM64_HAS_VIRT_HOST_EXTN);
 
 static void __hyp_text __activate_traps(struct kvm_vcpu *vcpu)
 {
@@ -36,20 +82,37 @@ static void __hyp_text __activate_traps(struct kvm_vcpu *vcpu)
 	write_sysreg(val, hcr_el2);
 	/* Trap on AArch32 cp15 c15 accesses (EL1 or EL0) */
 	write_sysreg(1 << 15, hstr_el2);
+	/* Make sure we trap PMU access from EL0 to EL2 */
+	write_sysreg(ARMV8_PMU_USERENR_MASK, pmuserenr_el0);
+	write_sysreg(vcpu->arch.mdcr_el2, mdcr_el2);
+	__activate_traps_arch()();
+}
 
-	val = CPTR_EL2_DEFAULT;
-	val |= CPTR_EL2_TTA | CPTR_EL2_TFP;
-	write_sysreg(val, cptr_el2);
+static void __hyp_text __deactivate_traps_vhe(void)
+{
+	extern char vectors[];	/* kernel exception vectors */
 
-	write_sysreg(vcpu->arch.mdcr_el2, mdcr_el2);
+	write_sysreg(HCR_HOST_VHE_FLAGS, hcr_el2);
+	write_sysreg(CPACR_EL1_FPEN, cpacr_el1);
+	write_sysreg(vectors, vbar_el1);
 }
 
-static void __hyp_text __deactivate_traps(struct kvm_vcpu *vcpu)
+static void __hyp_text __deactivate_traps_nvhe(void)
 {
 	write_sysreg(HCR_RW, hcr_el2);
+	write_sysreg(CPTR_EL2_DEFAULT, cptr_el2);
+}
+
+static hyp_alternate_select(__deactivate_traps_arch,
+			    __deactivate_traps_nvhe, __deactivate_traps_vhe,
+			    ARM64_HAS_VIRT_HOST_EXTN);
+
+static void __hyp_text __deactivate_traps(struct kvm_vcpu *vcpu)
+{
+	__deactivate_traps_arch()();
 	write_sysreg(0, hstr_el2);
 	write_sysreg(read_sysreg(mdcr_el2) & MDCR_EL2_HPMN_MASK, mdcr_el2);
-	write_sysreg(CPTR_EL2_DEFAULT, cptr_el2);
+	write_sysreg(0, pmuserenr_el0);
 }
 
 static void __hyp_text __activate_vm(struct kvm_vcpu *vcpu)
@@ -89,6 +152,86 @@ static void __hyp_text __vgic_restore_state(struct kvm_vcpu *vcpu)
 	__vgic_call_restore_state()(vcpu);
 }
 
+static bool __hyp_text __true_value(void)
+{
+	return true;
+}
+
+static bool __hyp_text __false_value(void)
+{
+	return false;
+}
+
+static hyp_alternate_select(__check_arm_834220,
+			    __false_value, __true_value,
+			    ARM64_WORKAROUND_834220);
+
+static bool __hyp_text __translate_far_to_hpfar(u64 far, u64 *hpfar)
+{
+	u64 par, tmp;
+
+	/*
+	 * Resolve the IPA the hard way using the guest VA.
+	 *
+	 * Stage-1 translation already validated the memory access
+	 * rights. As such, we can use the EL1 translation regime, and
+	 * don't have to distinguish between EL0 and EL1 access.
+	 *
+	 * We do need to save/restore PAR_EL1 though, as we haven't
+	 * saved the guest context yet, and we may return early...
+	 */
+	par = read_sysreg(par_el1);
+	asm volatile("at s1e1r, %0" : : "r" (far));
+	isb();
+
+	tmp = read_sysreg(par_el1);
+	write_sysreg(par, par_el1);
+
+	if (unlikely(tmp & 1))
+		return false; /* Translation failed, back to guest */
+
+	/* Convert PAR to HPFAR format */
+	*hpfar = ((tmp >> 12) & ((1UL << 36) - 1)) << 4;
+	return true;
+}
+
+static bool __hyp_text __populate_fault_info(struct kvm_vcpu *vcpu)
+{
+	u64 esr = read_sysreg_el2(esr);
+	u8 ec = esr >> ESR_ELx_EC_SHIFT;
+	u64 hpfar, far;
+
+	vcpu->arch.fault.esr_el2 = esr;
+
+	if (ec != ESR_ELx_EC_DABT_LOW && ec != ESR_ELx_EC_IABT_LOW)
+		return true;
+
+	far = read_sysreg_el2(far);
+
+	/*
+	 * The HPFAR can be invalid if the stage 2 fault did not
+	 * happen during a stage 1 page table walk (the ESR_EL2.S1PTW
+	 * bit is clear) and one of the two following cases are true:
+	 *   1. The fault was due to a permission fault
+	 *   2. The processor carries errata 834220
+	 *
+	 * Therefore, for all non S1PTW faults where we either have a
+	 * permission fault or the errata workaround is enabled, we
+	 * resolve the IPA using the AT instruction.
+	 */
+	if (!(esr & ESR_ELx_S1PTW) &&
+	    (__check_arm_834220()() || (esr & ESR_ELx_FSC_TYPE) == FSC_PERM)) {
+		if (!__translate_far_to_hpfar(far, &hpfar))
+			return false;
+	} else {
+		hpfar = read_sysreg(hpfar_el2);
+	}
+
+	vcpu->arch.fault.far_el2 = far;
+	vcpu->arch.fault.hpfar_el2 = hpfar;
+	return true;
+}
+
 static int __hyp_text __guest_run(struct kvm_vcpu *vcpu)
 {
 	struct kvm_cpu_context *host_ctxt;
@@ -102,7 +245,7 @@ static int __hyp_text __guest_run(struct kvm_vcpu *vcpu)
 	host_ctxt = kern_hyp_va(vcpu->arch.host_cpu_context);
 	guest_ctxt = &vcpu->arch.ctxt;
 
-	__sysreg_save_state(host_ctxt);
+	__sysreg_save_host_state(host_ctxt);
 	__debug_cond_save_host_state(vcpu);
 
 	__activate_traps(vcpu);
@@ -116,16 +259,20 @@ static int __hyp_text __guest_run(struct kvm_vcpu *vcpu)
 	 * to Cortex-A57 erratum #852523.
 	 */
 	__sysreg32_restore_state(vcpu);
-	__sysreg_restore_state(guest_ctxt);
+	__sysreg_restore_guest_state(guest_ctxt);
 	__debug_restore_state(vcpu, kern_hyp_va(vcpu->arch.debug_ptr), guest_ctxt);
 
 	/* Jump in the fire! */
+again:
 	exit_code = __guest_enter(vcpu, host_ctxt);
 	/* And we're baaack! */
 
+	if (exit_code == ARM_EXCEPTION_TRAP && !__populate_fault_info(vcpu))
+		goto again;
+
 	fp_enabled = __fpsimd_enabled();
 
-	__sysreg_save_state(guest_ctxt);
+	__sysreg_save_guest_state(guest_ctxt);
 	__sysreg32_save_state(vcpu);
 	__timer_save_state(vcpu);
 	__vgic_save_state(vcpu);
@@ -133,7 +280,7 @@ static int __hyp_text __guest_run(struct kvm_vcpu *vcpu)
 	__deactivate_traps(vcpu);
 	__deactivate_vm(vcpu);
 
-	__sysreg_restore_state(host_ctxt);
+	__sysreg_restore_host_state(host_ctxt);
 
 	if (fp_enabled) {
 		__fpsimd_save_state(&guest_ctxt->gp_regs.fp_regs);
@@ -150,11 +297,34 @@ __alias(__guest_run) int __kvm_vcpu_run(struct kvm_vcpu *vcpu);
 
 static const char __hyp_panic_string[] = "HYP panic:\nPS:%08llx PC:%016llx ESR:%08llx\nFAR:%016llx HPFAR:%016llx PAR:%016llx\nVCPU:%p\n";
 
-void __hyp_text __noreturn __hyp_panic(void)
+static void __hyp_text __hyp_call_panic_nvhe(u64 spsr, u64 elr, u64 par)
 {
 	unsigned long str_va = (unsigned long)__hyp_panic_string;
-	u64 spsr = read_sysreg(spsr_el2);
-	u64 elr = read_sysreg(elr_el2);
+
+	__hyp_do_panic(hyp_kern_va(str_va),
+		       spsr,  elr,
+		       read_sysreg(esr_el2),   read_sysreg_el2(far),
+		       read_sysreg(hpfar_el2), par,
+		       (void *)read_sysreg(tpidr_el2));
+}
+
+static void __hyp_text __hyp_call_panic_vhe(u64 spsr, u64 elr, u64 par)
+{
+	panic(__hyp_panic_string,
+	      spsr,  elr,
+	      read_sysreg_el2(esr),   read_sysreg_el2(far),
+	      read_sysreg(hpfar_el2), par,
+	      (void *)read_sysreg(tpidr_el2));
+}
+
+static hyp_alternate_select(__hyp_call_panic,
+			    __hyp_call_panic_nvhe, __hyp_call_panic_vhe,
+			    ARM64_HAS_VIRT_HOST_EXTN);
+
+void __hyp_text __noreturn __hyp_panic(void)
+{
+	u64 spsr = read_sysreg_el2(spsr);
+	u64 elr = read_sysreg_el2(elr);
 	u64 par = read_sysreg(par_el1);
 
 	if (read_sysreg(vttbr_el2)) {
@@ -165,15 +335,11 @@ void __hyp_text __noreturn __hyp_panic(void)
 		host_ctxt = kern_hyp_va(vcpu->arch.host_cpu_context);
 		__deactivate_traps(vcpu);
 		__deactivate_vm(vcpu);
-		__sysreg_restore_state(host_ctxt);
+		__sysreg_restore_host_state(host_ctxt);
 	}
 
 	/* Call panic for real */
-	__hyp_do_panic(hyp_kern_va(str_va),
-		       spsr,  elr,
-		       read_sysreg(esr_el2),   read_sysreg(far_el2),
-		       read_sysreg(hpfar_el2), par,
-		       (void *)read_sysreg(tpidr_el2));
+	__hyp_call_panic()(spsr, elr, par);
 
 	unreachable();
 }

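To make the shifting in __translate_far_to_hpfar() concrete: a successful AT S1E1R leaves the output address in PAR_EL1[47:12], and HPFAR_EL2 wants IPA[47:12] in its FIPA field at bits [39:4]. A worked example with an illustrative value:

	u64 par   = 0x0000001234567000;				/* PA = 0x1234567000 */
	u64 hpfar = ((par >> 12) & ((1UL << 36) - 1)) << 4;	/* = 0x12345670 */
	/* The abort handler later reconstructs the IPA as (hpfar >> 4) << 12,
	 * i.e. 0x1234567000, and fills in the low 12 bits from FAR_EL2. */
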
+ 98 - 51
arch/arm64/kvm/hyp/sysreg-sr.c

@@ -19,75 +19,122 @@
 #include <linux/kvm_host.h>
 
 #include <asm/kvm_asm.h>
-#include <asm/kvm_mmu.h>
+#include <asm/kvm_hyp.h>
 
-#include "hyp.h"
+/* Yes, this does nothing, on purpose */
+static void __hyp_text __sysreg_do_nothing(struct kvm_cpu_context *ctxt) { }
 
-/* ctxt is already in the HYP VA space */
-void __hyp_text __sysreg_save_state(struct kvm_cpu_context *ctxt)
+/*
+ * Non-VHE: Both host and guest must save everything.
+ *
+ * VHE: Host must save tpidr*_el[01], actlr_el1, sp0, pc, pstate, and
+ * guest must save everything.
+ */
+
+static void __hyp_text __sysreg_save_common_state(struct kvm_cpu_context *ctxt)
 {
-	ctxt->sys_regs[MPIDR_EL1]	= read_sysreg(vmpidr_el2);
-	ctxt->sys_regs[CSSELR_EL1]	= read_sysreg(csselr_el1);
-	ctxt->sys_regs[SCTLR_EL1]	= read_sysreg(sctlr_el1);
 	ctxt->sys_regs[ACTLR_EL1]	= read_sysreg(actlr_el1);
-	ctxt->sys_regs[CPACR_EL1]	= read_sysreg(cpacr_el1);
-	ctxt->sys_regs[TTBR0_EL1]	= read_sysreg(ttbr0_el1);
-	ctxt->sys_regs[TTBR1_EL1]	= read_sysreg(ttbr1_el1);
-	ctxt->sys_regs[TCR_EL1]		= read_sysreg(tcr_el1);
-	ctxt->sys_regs[ESR_EL1]		= read_sysreg(esr_el1);
-	ctxt->sys_regs[AFSR0_EL1]	= read_sysreg(afsr0_el1);
-	ctxt->sys_regs[AFSR1_EL1]	= read_sysreg(afsr1_el1);
-	ctxt->sys_regs[FAR_EL1]		= read_sysreg(far_el1);
-	ctxt->sys_regs[MAIR_EL1]	= read_sysreg(mair_el1);
-	ctxt->sys_regs[VBAR_EL1]	= read_sysreg(vbar_el1);
-	ctxt->sys_regs[CONTEXTIDR_EL1]	= read_sysreg(contextidr_el1);
 	ctxt->sys_regs[TPIDR_EL0]	= read_sysreg(tpidr_el0);
 	ctxt->sys_regs[TPIDRRO_EL0]	= read_sysreg(tpidrro_el0);
 	ctxt->sys_regs[TPIDR_EL1]	= read_sysreg(tpidr_el1);
-	ctxt->sys_regs[AMAIR_EL1]	= read_sysreg(amair_el1);
-	ctxt->sys_regs[CNTKCTL_EL1]	= read_sysreg(cntkctl_el1);
+	ctxt->gp_regs.regs.sp		= read_sysreg(sp_el0);
+	ctxt->gp_regs.regs.pc		= read_sysreg_el2(elr);
+	ctxt->gp_regs.regs.pstate	= read_sysreg_el2(spsr);
+}
+
+static void __hyp_text __sysreg_save_state(struct kvm_cpu_context *ctxt)
+{
+	ctxt->sys_regs[MPIDR_EL1]	= read_sysreg(vmpidr_el2);
+	ctxt->sys_regs[CSSELR_EL1]	= read_sysreg(csselr_el1);
+	ctxt->sys_regs[SCTLR_EL1]	= read_sysreg_el1(sctlr);
+	ctxt->sys_regs[CPACR_EL1]	= read_sysreg_el1(cpacr);
+	ctxt->sys_regs[TTBR0_EL1]	= read_sysreg_el1(ttbr0);
+	ctxt->sys_regs[TTBR1_EL1]	= read_sysreg_el1(ttbr1);
+	ctxt->sys_regs[TCR_EL1]		= read_sysreg_el1(tcr);
+	ctxt->sys_regs[ESR_EL1]		= read_sysreg_el1(esr);
+	ctxt->sys_regs[AFSR0_EL1]	= read_sysreg_el1(afsr0);
+	ctxt->sys_regs[AFSR1_EL1]	= read_sysreg_el1(afsr1);
+	ctxt->sys_regs[FAR_EL1]		= read_sysreg_el1(far);
+	ctxt->sys_regs[MAIR_EL1]	= read_sysreg_el1(mair);
+	ctxt->sys_regs[VBAR_EL1]	= read_sysreg_el1(vbar);
+	ctxt->sys_regs[CONTEXTIDR_EL1]	= read_sysreg_el1(contextidr);
+	ctxt->sys_regs[AMAIR_EL1]	= read_sysreg_el1(amair);
+	ctxt->sys_regs[CNTKCTL_EL1]	= read_sysreg_el1(cntkctl);
 	ctxt->sys_regs[PAR_EL1]		= read_sysreg(par_el1);
 	ctxt->sys_regs[MDSCR_EL1]	= read_sysreg(mdscr_el1);
 
-	ctxt->gp_regs.regs.sp		= read_sysreg(sp_el0);
-	ctxt->gp_regs.regs.pc		= read_sysreg(elr_el2);
-	ctxt->gp_regs.regs.pstate	= read_sysreg(spsr_el2);
 	ctxt->gp_regs.sp_el1		= read_sysreg(sp_el1);
-	ctxt->gp_regs.elr_el1		= read_sysreg(elr_el1);
-	ctxt->gp_regs.spsr[KVM_SPSR_EL1]= read_sysreg(spsr_el1);
+	ctxt->gp_regs.elr_el1		= read_sysreg_el1(elr);
+	ctxt->gp_regs.spsr[KVM_SPSR_EL1]= read_sysreg_el1(spsr);
+}
+
+static hyp_alternate_select(__sysreg_call_save_host_state,
+			    __sysreg_save_state, __sysreg_do_nothing,
+			    ARM64_HAS_VIRT_HOST_EXTN);
+
+void __hyp_text __sysreg_save_host_state(struct kvm_cpu_context *ctxt)
+{
+	__sysreg_call_save_host_state()(ctxt);
+	__sysreg_save_common_state(ctxt);
+}
+
+void __hyp_text __sysreg_save_guest_state(struct kvm_cpu_context *ctxt)
+{
+	__sysreg_save_state(ctxt);
+	__sysreg_save_common_state(ctxt);
 }
 
-void __hyp_text __sysreg_restore_state(struct kvm_cpu_context *ctxt)
+static void __hyp_text __sysreg_restore_common_state(struct kvm_cpu_context *ctxt)
 {
-	write_sysreg(ctxt->sys_regs[MPIDR_EL1],	  vmpidr_el2);
-	write_sysreg(ctxt->sys_regs[CSSELR_EL1],  csselr_el1);
-	write_sysreg(ctxt->sys_regs[SCTLR_EL1],	  sctlr_el1);
 	write_sysreg(ctxt->sys_regs[ACTLR_EL1],	  actlr_el1);
-	write_sysreg(ctxt->sys_regs[CPACR_EL1],	  cpacr_el1);
-	write_sysreg(ctxt->sys_regs[TTBR0_EL1],	  ttbr0_el1);
-	write_sysreg(ctxt->sys_regs[TTBR1_EL1],	  ttbr1_el1);
-	write_sysreg(ctxt->sys_regs[TCR_EL1],	  tcr_el1);
-	write_sysreg(ctxt->sys_regs[ESR_EL1],	  esr_el1);
-	write_sysreg(ctxt->sys_regs[AFSR0_EL1],	  afsr0_el1);
-	write_sysreg(ctxt->sys_regs[AFSR1_EL1],	  afsr1_el1);
-	write_sysreg(ctxt->sys_regs[FAR_EL1],	  far_el1);
-	write_sysreg(ctxt->sys_regs[MAIR_EL1],	  mair_el1);
-	write_sysreg(ctxt->sys_regs[VBAR_EL1],	  vbar_el1);
-	write_sysreg(ctxt->sys_regs[CONTEXTIDR_EL1], contextidr_el1);
 	write_sysreg(ctxt->sys_regs[TPIDR_EL0],	  tpidr_el0);
 	write_sysreg(ctxt->sys_regs[TPIDRRO_EL0], tpidrro_el0);
 	write_sysreg(ctxt->sys_regs[TPIDR_EL1],	  tpidr_el1);
-	write_sysreg(ctxt->sys_regs[AMAIR_EL1],	  amair_el1);
-	write_sysreg(ctxt->sys_regs[CNTKCTL_EL1], cntkctl_el1);
-	write_sysreg(ctxt->sys_regs[PAR_EL1],	  par_el1);
-	write_sysreg(ctxt->sys_regs[MDSCR_EL1],	  mdscr_el1);
-
-	write_sysreg(ctxt->gp_regs.regs.sp,	sp_el0);
-	write_sysreg(ctxt->gp_regs.regs.pc,	elr_el2);
-	write_sysreg(ctxt->gp_regs.regs.pstate,	spsr_el2);
-	write_sysreg(ctxt->gp_regs.sp_el1,	sp_el1);
-	write_sysreg(ctxt->gp_regs.elr_el1,	elr_el1);
-	write_sysreg(ctxt->gp_regs.spsr[KVM_SPSR_EL1], spsr_el1);
+	write_sysreg(ctxt->gp_regs.regs.sp,	  sp_el0);
+	write_sysreg_el2(ctxt->gp_regs.regs.pc,	  elr);
+	write_sysreg_el2(ctxt->gp_regs.regs.pstate, spsr);
+}
+
+static void __hyp_text __sysreg_restore_state(struct kvm_cpu_context *ctxt)
+{
+	write_sysreg(ctxt->sys_regs[MPIDR_EL1],		vmpidr_el2);
+	write_sysreg(ctxt->sys_regs[CSSELR_EL1],	csselr_el1);
+	write_sysreg_el1(ctxt->sys_regs[SCTLR_EL1],	sctlr);
+	write_sysreg_el1(ctxt->sys_regs[CPACR_EL1],	cpacr);
+	write_sysreg_el1(ctxt->sys_regs[TTBR0_EL1],	ttbr0);
+	write_sysreg_el1(ctxt->sys_regs[TTBR1_EL1],	ttbr1);
+	write_sysreg_el1(ctxt->sys_regs[TCR_EL1],	tcr);
+	write_sysreg_el1(ctxt->sys_regs[ESR_EL1],	esr);
+	write_sysreg_el1(ctxt->sys_regs[AFSR0_EL1],	afsr0);
+	write_sysreg_el1(ctxt->sys_regs[AFSR1_EL1],	afsr1);
+	write_sysreg_el1(ctxt->sys_regs[FAR_EL1],	far);
+	write_sysreg_el1(ctxt->sys_regs[MAIR_EL1],	mair);
+	write_sysreg_el1(ctxt->sys_regs[VBAR_EL1],	vbar);
+	write_sysreg_el1(ctxt->sys_regs[CONTEXTIDR_EL1],contextidr);
+	write_sysreg_el1(ctxt->sys_regs[AMAIR_EL1],	amair);
+	write_sysreg_el1(ctxt->sys_regs[CNTKCTL_EL1], 	cntkctl);
+	write_sysreg(ctxt->sys_regs[PAR_EL1],		par_el1);
+	write_sysreg(ctxt->sys_regs[MDSCR_EL1],		mdscr_el1);
+
+	write_sysreg(ctxt->gp_regs.sp_el1,		sp_el1);
+	write_sysreg_el1(ctxt->gp_regs.elr_el1,		elr);
+	write_sysreg_el1(ctxt->gp_regs.spsr[KVM_SPSR_EL1],spsr);
+}
+
+static hyp_alternate_select(__sysreg_call_restore_host_state,
+			    __sysreg_restore_state, __sysreg_do_nothing,
+			    ARM64_HAS_VIRT_HOST_EXTN);
+
+void __hyp_text __sysreg_restore_host_state(struct kvm_cpu_context *ctxt)
+{
+	__sysreg_call_restore_host_state()(ctxt);
+	__sysreg_restore_common_state(ctxt);
+}
+
+void __hyp_text __sysreg_restore_guest_state(struct kvm_cpu_context *ctxt)
+{
+	__sysreg_restore_state(ctxt);
+	__sysreg_restore_common_state(ctxt);
 }
 
 void __hyp_text __sysreg32_save_state(struct kvm_vcpu *vcpu)
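The split above (common state always, EL1 state only when it is live) hinges on hyp_alternate_select(), a KVM-internal helper added elsewhere in this series that patches in one of two callees based on a CPU capability; here a VHE host skips the EL1-register half for its own context while guests always get the full save/restore. A rough standalone model of the selection pattern, with a plain flag standing in for ARM64_HAS_VIRT_HOST_EXTN and no alternatives patching (names below are illustrative, not the kernel's):

/*
 * Toy model: "common" state is always saved, the EL1 block is skipped for
 * the host when the CPU has VHE. The flag check stands in for the
 * alternatives-based dispatch done by hyp_alternate_select().
 */
#include <stdbool.h>
#include <stdio.h>

struct cpu_context { int common_saved, el1_saved; };

static void save_common(struct cpu_context *c) { c->common_saved = 1; }
static void save_el1(struct cpu_context *c)    { c->el1_saved = 1; }
static void do_nothing(struct cpu_context *c)  { (void)c; }

static bool has_vhe;	/* stands in for ARM64_HAS_VIRT_HOST_EXTN */

static void save_host_state(struct cpu_context *c)
{
	(has_vhe ? do_nothing : save_el1)(c);	/* selected once per capability */
	save_common(c);
}

static void save_guest_state(struct cpu_context *c)
{
	save_el1(c);		/* guest EL1 state is always live */
	save_common(c);
}

int main(void)
{
	struct cpu_context host = {0}, guest = {0};

	has_vhe = true;
	save_host_state(&host);
	save_guest_state(&guest);
	printf("host: common=%d el1=%d, guest: common=%d el1=%d\n",
	       host.common_saved, host.el1_saved,
	       guest.common_saved, guest.el1_saved);
	return 0;
}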

+ 1 - 1
arch/arm64/kvm/hyp/tlb.c

@@ -15,7 +15,7 @@
  * along with this program.  If not, see <http://www.gnu.org/licenses/>.
  */
 
-#include "hyp.h"
+#include <asm/kvm_hyp.h>
 
 static void __hyp_text __tlb_flush_vmid_ipa(struct kvm *kvm, phys_addr_t ipa)
 {

+ 0 - 84
arch/arm64/kvm/hyp/vgic-v2-sr.c

@@ -1,84 +0,0 @@
-/*
- * Copyright (C) 2012-2015 - ARM Ltd
- * Author: Marc Zyngier <marc.zyngier@arm.com>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program.  If not, see <http://www.gnu.org/licenses/>.
- */
-
-#include <linux/compiler.h>
-#include <linux/irqchip/arm-gic.h>
-#include <linux/kvm_host.h>
-
-#include <asm/kvm_mmu.h>
-
-#include "hyp.h"
-
-/* vcpu is already in the HYP VA space */
-void __hyp_text __vgic_v2_save_state(struct kvm_vcpu *vcpu)
-{
-	struct kvm *kvm = kern_hyp_va(vcpu->kvm);
-	struct vgic_v2_cpu_if *cpu_if = &vcpu->arch.vgic_cpu.vgic_v2;
-	struct vgic_dist *vgic = &kvm->arch.vgic;
-	void __iomem *base = kern_hyp_va(vgic->vctrl_base);
-	u32 eisr0, eisr1, elrsr0, elrsr1;
-	int i, nr_lr;
-
-	if (!base)
-		return;
-
-	nr_lr = vcpu->arch.vgic_cpu.nr_lr;
-	cpu_if->vgic_vmcr = readl_relaxed(base + GICH_VMCR);
-	cpu_if->vgic_misr = readl_relaxed(base + GICH_MISR);
-	eisr0  = readl_relaxed(base + GICH_EISR0);
-	elrsr0 = readl_relaxed(base + GICH_ELRSR0);
-	if (unlikely(nr_lr > 32)) {
-		eisr1  = readl_relaxed(base + GICH_EISR1);
-		elrsr1 = readl_relaxed(base + GICH_ELRSR1);
-	} else {
-		eisr1 = elrsr1 = 0;
-	}
-#ifdef CONFIG_CPU_BIG_ENDIAN
-	cpu_if->vgic_eisr  = ((u64)eisr0 << 32) | eisr1;
-	cpu_if->vgic_elrsr = ((u64)elrsr0 << 32) | elrsr1;
-#else
-	cpu_if->vgic_eisr  = ((u64)eisr1 << 32) | eisr0;
-	cpu_if->vgic_elrsr = ((u64)elrsr1 << 32) | elrsr0;
-#endif
-	cpu_if->vgic_apr    = readl_relaxed(base + GICH_APR);
-
-	writel_relaxed(0, base + GICH_HCR);
-
-	for (i = 0; i < nr_lr; i++)
-		cpu_if->vgic_lr[i] = readl_relaxed(base + GICH_LR0 + (i * 4));
-}
-
-/* vcpu is already in the HYP VA space */
-void __hyp_text __vgic_v2_restore_state(struct kvm_vcpu *vcpu)
-{
-	struct kvm *kvm = kern_hyp_va(vcpu->kvm);
-	struct vgic_v2_cpu_if *cpu_if = &vcpu->arch.vgic_cpu.vgic_v2;
-	struct vgic_dist *vgic = &kvm->arch.vgic;
-	void __iomem *base = kern_hyp_va(vgic->vctrl_base);
-	int i, nr_lr;
-
-	if (!base)
-		return;
-
-	writel_relaxed(cpu_if->vgic_hcr, base + GICH_HCR);
-	writel_relaxed(cpu_if->vgic_vmcr, base + GICH_VMCR);
-	writel_relaxed(cpu_if->vgic_apr, base + GICH_APR);
-
-	nr_lr = vcpu->arch.vgic_cpu.nr_lr;
-	for (i = 0; i < nr_lr; i++)
-		writel_relaxed(cpu_if->vgic_lr[i], base + GICH_LR0 + (i * 4));
-}

+ 225 - 116
arch/arm64/kvm/hyp/vgic-v3-sr.c

@@ -19,9 +19,7 @@
 #include <linux/irqchip/arm-gic-v3.h>
 #include <linux/kvm_host.h>
 
-#include <asm/kvm_mmu.h>
-
-#include "hyp.h"
+#include <asm/kvm_hyp.h>
 
 #define vtr_to_max_lr_idx(v)		((v) & 0xf)
 #define vtr_to_nr_pri_bits(v)		(((u32)(v) >> 29) + 1)
@@ -39,12 +37,133 @@
 		asm volatile("msr_s " __stringify(r) ", %0" : : "r" (__val));\
 	} while (0)
 
-/* vcpu is already in the HYP VA space */
+static u64 __hyp_text __gic_v3_get_lr(unsigned int lr)
+{
+	switch (lr & 0xf) {
+	case 0:
+		return read_gicreg(ICH_LR0_EL2);
+	case 1:
+		return read_gicreg(ICH_LR1_EL2);
+	case 2:
+		return read_gicreg(ICH_LR2_EL2);
+	case 3:
+		return read_gicreg(ICH_LR3_EL2);
+	case 4:
+		return read_gicreg(ICH_LR4_EL2);
+	case 5:
+		return read_gicreg(ICH_LR5_EL2);
+	case 6:
+		return read_gicreg(ICH_LR6_EL2);
+	case 7:
+		return read_gicreg(ICH_LR7_EL2);
+	case 8:
+		return read_gicreg(ICH_LR8_EL2);
+	case 9:
+		return read_gicreg(ICH_LR9_EL2);
+	case 10:
+		return read_gicreg(ICH_LR10_EL2);
+	case 11:
+		return read_gicreg(ICH_LR11_EL2);
+	case 12:
+		return read_gicreg(ICH_LR12_EL2);
+	case 13:
+		return read_gicreg(ICH_LR13_EL2);
+	case 14:
+		return read_gicreg(ICH_LR14_EL2);
+	case 15:
+		return read_gicreg(ICH_LR15_EL2);
+	}
+
+	unreachable();
+}
+
+static void __hyp_text __gic_v3_set_lr(u64 val, int lr)
+{
+	switch (lr & 0xf) {
+	case 0:
+		write_gicreg(val, ICH_LR0_EL2);
+		break;
+	case 1:
+		write_gicreg(val, ICH_LR1_EL2);
+		break;
+	case 2:
+		write_gicreg(val, ICH_LR2_EL2);
+		break;
+	case 3:
+		write_gicreg(val, ICH_LR3_EL2);
+		break;
+	case 4:
+		write_gicreg(val, ICH_LR4_EL2);
+		break;
+	case 5:
+		write_gicreg(val, ICH_LR5_EL2);
+		break;
+	case 6:
+		write_gicreg(val, ICH_LR6_EL2);
+		break;
+	case 7:
+		write_gicreg(val, ICH_LR7_EL2);
+		break;
+	case 8:
+		write_gicreg(val, ICH_LR8_EL2);
+		break;
+	case 9:
+		write_gicreg(val, ICH_LR9_EL2);
+		break;
+	case 10:
+		write_gicreg(val, ICH_LR10_EL2);
+		break;
+	case 11:
+		write_gicreg(val, ICH_LR11_EL2);
+		break;
+	case 12:
+		write_gicreg(val, ICH_LR12_EL2);
+		break;
+	case 13:
+		write_gicreg(val, ICH_LR13_EL2);
+		break;
+	case 14:
+		write_gicreg(val, ICH_LR14_EL2);
+		break;
+	case 15:
+		write_gicreg(val, ICH_LR15_EL2);
+		break;
+	}
+}
+
+static void __hyp_text save_maint_int_state(struct kvm_vcpu *vcpu, int nr_lr)
+{
+	struct vgic_v3_cpu_if *cpu_if = &vcpu->arch.vgic_cpu.vgic_v3;
+	int i;
+	bool expect_mi;
+
+	expect_mi = !!(cpu_if->vgic_hcr & ICH_HCR_UIE);
+
+	for (i = 0; i < nr_lr; i++) {
+		if (!(vcpu->arch.vgic_cpu.live_lrs & (1UL << i)))
+			continue;
+
+		expect_mi |= (!(cpu_if->vgic_lr[i] & ICH_LR_HW) &&
+			      (cpu_if->vgic_lr[i] & ICH_LR_EOI));
+	}
+
+	if (expect_mi) {
+		cpu_if->vgic_misr  = read_gicreg(ICH_MISR_EL2);
+
+		if (cpu_if->vgic_misr & ICH_MISR_EOI)
+			cpu_if->vgic_eisr = read_gicreg(ICH_EISR_EL2);
+		else
+			cpu_if->vgic_eisr = 0;
+	} else {
+		cpu_if->vgic_misr = 0;
+		cpu_if->vgic_eisr = 0;
+	}
+}
+
 void __hyp_text __vgic_v3_save_state(struct kvm_vcpu *vcpu)
 {
 	struct vgic_v3_cpu_if *cpu_if = &vcpu->arch.vgic_cpu.vgic_v3;
 	u64 val;
-	u32 max_lr_idx, nr_pri_bits;
 
 	/*
 	 * Make sure stores to the GIC via the memory mapped interface
@@ -53,68 +172,66 @@ void __hyp_text __vgic_v3_save_state(struct kvm_vcpu *vcpu)
 	dsb(st);
 
 	cpu_if->vgic_vmcr  = read_gicreg(ICH_VMCR_EL2);
-	cpu_if->vgic_misr  = read_gicreg(ICH_MISR_EL2);
-	cpu_if->vgic_eisr  = read_gicreg(ICH_EISR_EL2);
-	cpu_if->vgic_elrsr = read_gicreg(ICH_ELSR_EL2);
 
-	write_gicreg(0, ICH_HCR_EL2);
-	val = read_gicreg(ICH_VTR_EL2);
-	max_lr_idx = vtr_to_max_lr_idx(val);
-	nr_pri_bits = vtr_to_nr_pri_bits(val);
+	if (vcpu->arch.vgic_cpu.live_lrs) {
+		int i;
+		u32 max_lr_idx, nr_pri_bits;
 
-	switch (max_lr_idx) {
-	case 15:
-		cpu_if->vgic_lr[VGIC_V3_LR_INDEX(15)] = read_gicreg(ICH_LR15_EL2);
-	case 14:
-		cpu_if->vgic_lr[VGIC_V3_LR_INDEX(14)] = read_gicreg(ICH_LR14_EL2);
-	case 13:
-		cpu_if->vgic_lr[VGIC_V3_LR_INDEX(13)] = read_gicreg(ICH_LR13_EL2);
-	case 12:
-		cpu_if->vgic_lr[VGIC_V3_LR_INDEX(12)] = read_gicreg(ICH_LR12_EL2);
-	case 11:
-		cpu_if->vgic_lr[VGIC_V3_LR_INDEX(11)] = read_gicreg(ICH_LR11_EL2);
-	case 10:
-		cpu_if->vgic_lr[VGIC_V3_LR_INDEX(10)] = read_gicreg(ICH_LR10_EL2);
-	case 9:
-		cpu_if->vgic_lr[VGIC_V3_LR_INDEX(9)] = read_gicreg(ICH_LR9_EL2);
-	case 8:
-		cpu_if->vgic_lr[VGIC_V3_LR_INDEX(8)] = read_gicreg(ICH_LR8_EL2);
-	case 7:
-		cpu_if->vgic_lr[VGIC_V3_LR_INDEX(7)] = read_gicreg(ICH_LR7_EL2);
-	case 6:
-		cpu_if->vgic_lr[VGIC_V3_LR_INDEX(6)] = read_gicreg(ICH_LR6_EL2);
-	case 5:
-		cpu_if->vgic_lr[VGIC_V3_LR_INDEX(5)] = read_gicreg(ICH_LR5_EL2);
-	case 4:
-		cpu_if->vgic_lr[VGIC_V3_LR_INDEX(4)] = read_gicreg(ICH_LR4_EL2);
-	case 3:
-		cpu_if->vgic_lr[VGIC_V3_LR_INDEX(3)] = read_gicreg(ICH_LR3_EL2);
-	case 2:
-		cpu_if->vgic_lr[VGIC_V3_LR_INDEX(2)] = read_gicreg(ICH_LR2_EL2);
-	case 1:
-		cpu_if->vgic_lr[VGIC_V3_LR_INDEX(1)] = read_gicreg(ICH_LR1_EL2);
-	case 0:
-		cpu_if->vgic_lr[VGIC_V3_LR_INDEX(0)] = read_gicreg(ICH_LR0_EL2);
-	}
+		cpu_if->vgic_elrsr = read_gicreg(ICH_ELSR_EL2);
 
-	switch (nr_pri_bits) {
-	case 7:
-		cpu_if->vgic_ap0r[3] = read_gicreg(ICH_AP0R3_EL2);
-		cpu_if->vgic_ap0r[2] = read_gicreg(ICH_AP0R2_EL2);
-	case 6:
-		cpu_if->vgic_ap0r[1] = read_gicreg(ICH_AP0R1_EL2);
-	default:
-		cpu_if->vgic_ap0r[0] = read_gicreg(ICH_AP0R0_EL2);
-	}
+		write_gicreg(0, ICH_HCR_EL2);
+		val = read_gicreg(ICH_VTR_EL2);
+		max_lr_idx = vtr_to_max_lr_idx(val);
+		nr_pri_bits = vtr_to_nr_pri_bits(val);
 
-	switch (nr_pri_bits) {
-	case 7:
-		cpu_if->vgic_ap1r[3] = read_gicreg(ICH_AP1R3_EL2);
-		cpu_if->vgic_ap1r[2] = read_gicreg(ICH_AP1R2_EL2);
-	case 6:
-		cpu_if->vgic_ap1r[1] = read_gicreg(ICH_AP1R1_EL2);
-	default:
-		cpu_if->vgic_ap1r[0] = read_gicreg(ICH_AP1R0_EL2);
+		save_maint_int_state(vcpu, max_lr_idx + 1);
+
+		for (i = 0; i <= max_lr_idx; i++) {
+			if (!(vcpu->arch.vgic_cpu.live_lrs & (1UL << i)))
+				continue;
+
+			if (cpu_if->vgic_elrsr & (1 << i)) {
+				cpu_if->vgic_lr[i] &= ~ICH_LR_STATE;
+				continue;
+			}
+
+			cpu_if->vgic_lr[i] = __gic_v3_get_lr(i);
+			__gic_v3_set_lr(0, i);
+		}
+
+		switch (nr_pri_bits) {
+		case 7:
+			cpu_if->vgic_ap0r[3] = read_gicreg(ICH_AP0R3_EL2);
+			cpu_if->vgic_ap0r[2] = read_gicreg(ICH_AP0R2_EL2);
+		case 6:
+			cpu_if->vgic_ap0r[1] = read_gicreg(ICH_AP0R1_EL2);
+		default:
+			cpu_if->vgic_ap0r[0] = read_gicreg(ICH_AP0R0_EL2);
+		}
+
+		switch (nr_pri_bits) {
+		case 7:
+			cpu_if->vgic_ap1r[3] = read_gicreg(ICH_AP1R3_EL2);
+			cpu_if->vgic_ap1r[2] = read_gicreg(ICH_AP1R2_EL2);
+		case 6:
+			cpu_if->vgic_ap1r[1] = read_gicreg(ICH_AP1R1_EL2);
+		default:
+			cpu_if->vgic_ap1r[0] = read_gicreg(ICH_AP1R0_EL2);
+		}
+
+		vcpu->arch.vgic_cpu.live_lrs = 0;
+	} else {
+		cpu_if->vgic_misr  = 0;
+		cpu_if->vgic_eisr  = 0;
+		cpu_if->vgic_elrsr = 0xffff;
+		cpu_if->vgic_ap0r[0] = 0;
+		cpu_if->vgic_ap0r[1] = 0;
+		cpu_if->vgic_ap0r[2] = 0;
+		cpu_if->vgic_ap0r[3] = 0;
+		cpu_if->vgic_ap1r[0] = 0;
+		cpu_if->vgic_ap1r[1] = 0;
+		cpu_if->vgic_ap1r[2] = 0;
+		cpu_if->vgic_ap1r[3] = 0;
 	}
 
 	val = read_gicreg(ICC_SRE_EL2);
@@ -128,6 +245,8 @@ void __hyp_text __vgic_v3_restore_state(struct kvm_vcpu *vcpu)
 	struct vgic_v3_cpu_if *cpu_if = &vcpu->arch.vgic_cpu.vgic_v3;
 	u64 val;
 	u32 max_lr_idx, nr_pri_bits;
+	u16 live_lrs = 0;
+	int i;
 
 	/*
 	 * VFIQEn is RES1 if ICC_SRE_EL1.SRE is 1. This causes a
@@ -140,66 +259,46 @@ void __hyp_text __vgic_v3_restore_state(struct kvm_vcpu *vcpu)
 	write_gicreg(cpu_if->vgic_sre, ICC_SRE_EL1);
 	isb();
 
-	write_gicreg(cpu_if->vgic_hcr, ICH_HCR_EL2);
-	write_gicreg(cpu_if->vgic_vmcr, ICH_VMCR_EL2);
-
 	val = read_gicreg(ICH_VTR_EL2);
 	max_lr_idx = vtr_to_max_lr_idx(val);
 	nr_pri_bits = vtr_to_nr_pri_bits(val);
 
-	switch (nr_pri_bits) {
-	case 7:
-		 write_gicreg(cpu_if->vgic_ap0r[3], ICH_AP0R3_EL2);
-		 write_gicreg(cpu_if->vgic_ap0r[2], ICH_AP0R2_EL2);
-	case 6:
-		 write_gicreg(cpu_if->vgic_ap0r[1], ICH_AP0R1_EL2);
-	default:
-		 write_gicreg(cpu_if->vgic_ap0r[0], ICH_AP0R0_EL2);
+	for (i = 0; i <= max_lr_idx; i++) {
+		if (cpu_if->vgic_lr[i] & ICH_LR_STATE)
+			live_lrs |= (1 << i);
 	}
 
-	switch (nr_pri_bits) {
-	case 7:
-		 write_gicreg(cpu_if->vgic_ap1r[3], ICH_AP1R3_EL2);
-		 write_gicreg(cpu_if->vgic_ap1r[2], ICH_AP1R2_EL2);
-	case 6:
-		 write_gicreg(cpu_if->vgic_ap1r[1], ICH_AP1R1_EL2);
-	default:
-		 write_gicreg(cpu_if->vgic_ap1r[0], ICH_AP1R0_EL2);
-	}
+	write_gicreg(cpu_if->vgic_vmcr, ICH_VMCR_EL2);
 
-	switch (max_lr_idx) {
-	case 15:
-		write_gicreg(cpu_if->vgic_lr[VGIC_V3_LR_INDEX(15)], ICH_LR15_EL2);
-	case 14:
-		write_gicreg(cpu_if->vgic_lr[VGIC_V3_LR_INDEX(14)], ICH_LR14_EL2);
-	case 13:
-		write_gicreg(cpu_if->vgic_lr[VGIC_V3_LR_INDEX(13)], ICH_LR13_EL2);
-	case 12:
-		write_gicreg(cpu_if->vgic_lr[VGIC_V3_LR_INDEX(12)], ICH_LR12_EL2);
-	case 11:
-		write_gicreg(cpu_if->vgic_lr[VGIC_V3_LR_INDEX(11)], ICH_LR11_EL2);
-	case 10:
-		write_gicreg(cpu_if->vgic_lr[VGIC_V3_LR_INDEX(10)], ICH_LR10_EL2);
-	case 9:
-		write_gicreg(cpu_if->vgic_lr[VGIC_V3_LR_INDEX(9)], ICH_LR9_EL2);
-	case 8:
-		write_gicreg(cpu_if->vgic_lr[VGIC_V3_LR_INDEX(8)], ICH_LR8_EL2);
-	case 7:
-		write_gicreg(cpu_if->vgic_lr[VGIC_V3_LR_INDEX(7)], ICH_LR7_EL2);
-	case 6:
-		write_gicreg(cpu_if->vgic_lr[VGIC_V3_LR_INDEX(6)], ICH_LR6_EL2);
-	case 5:
-		write_gicreg(cpu_if->vgic_lr[VGIC_V3_LR_INDEX(5)], ICH_LR5_EL2);
-	case 4:
-		write_gicreg(cpu_if->vgic_lr[VGIC_V3_LR_INDEX(4)], ICH_LR4_EL2);
-	case 3:
-		write_gicreg(cpu_if->vgic_lr[VGIC_V3_LR_INDEX(3)], ICH_LR3_EL2);
-	case 2:
-		write_gicreg(cpu_if->vgic_lr[VGIC_V3_LR_INDEX(2)], ICH_LR2_EL2);
-	case 1:
-		write_gicreg(cpu_if->vgic_lr[VGIC_V3_LR_INDEX(1)], ICH_LR1_EL2);
-	case 0:
-		write_gicreg(cpu_if->vgic_lr[VGIC_V3_LR_INDEX(0)], ICH_LR0_EL2);
+	if (live_lrs) {
+		write_gicreg(cpu_if->vgic_hcr, ICH_HCR_EL2);
+
+		switch (nr_pri_bits) {
+		case 7:
+			write_gicreg(cpu_if->vgic_ap0r[3], ICH_AP0R3_EL2);
+			write_gicreg(cpu_if->vgic_ap0r[2], ICH_AP0R2_EL2);
+		case 6:
+			write_gicreg(cpu_if->vgic_ap0r[1], ICH_AP0R1_EL2);
+		default:
+			write_gicreg(cpu_if->vgic_ap0r[0], ICH_AP0R0_EL2);
+		}
+
+		switch (nr_pri_bits) {
+		case 7:
+			write_gicreg(cpu_if->vgic_ap1r[3], ICH_AP1R3_EL2);
+			write_gicreg(cpu_if->vgic_ap1r[2], ICH_AP1R2_EL2);
+		case 6:
+			write_gicreg(cpu_if->vgic_ap1r[1], ICH_AP1R1_EL2);
+		default:
+			write_gicreg(cpu_if->vgic_ap1r[0], ICH_AP1R0_EL2);
+		}
+
+		for (i = 0; i <= max_lr_idx; i++) {
+			if (!(live_lrs & (1 << i)))
+				continue;
+
+			__gic_v3_set_lr(cpu_if->vgic_lr[i], i);
+		}
 	}
 
 	/*
@@ -209,6 +308,7 @@ void __hyp_text __vgic_v3_restore_state(struct kvm_vcpu *vcpu)
 	 */
 	isb();
 	dsb(sy);
+	vcpu->arch.vgic_cpu.live_lrs = live_lrs;
 
 	/*
 	 * Prevent the guest from touching the GIC system registers if
@@ -220,6 +320,15 @@ void __hyp_text __vgic_v3_restore_state(struct kvm_vcpu *vcpu)
 	}
 }
 
+void __hyp_text __vgic_v3_init_lrs(void)
+{
+	int max_lr_idx = vtr_to_max_lr_idx(read_gicreg(ICH_VTR_EL2));
+	int i;
+
+	for (i = 0; i <= max_lr_idx; i++)
+		__gic_v3_set_lr(0, i);
+}
+
 static u64 __hyp_text __vgic_v3_read_ich_vtr_el2(void)
 {
 	return read_gicreg(ICH_VTR_EL2);
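The rework above stops unconditionally saving and restoring all sixteen list registers: on entry only LRs whose cached value carries state are written to hardware (and recorded in live_lrs), and on exit only those are read back, with the ELRSR shortcut clearing the cached state for LRs the hardware already retired. A toy, self-contained model of that bookkeeping (LR_STATE and the array are stand-ins for the GICv3 LR state bits and ICH_LRn_EL2, not the kernel's definitions):

/*
 * Toy model of the live_lrs tracking: only list registers that actually
 * carry interrupt state are written on entry and read back on exit.
 */
#include <stdint.h>
#include <stdio.h>

#define NR_LR		16
#define LR_STATE	(3ULL << 62)	/* pending/active bits, as in ICH_LR */

static uint64_t hw_lr[NR_LR];		/* stand-in for ICH_LRn_EL2 */

static uint16_t restore_lrs(const uint64_t *cached)
{
	uint16_t live = 0;
	int i;

	for (i = 0; i < NR_LR; i++) {
		if (cached[i] & LR_STATE) {
			hw_lr[i] = cached[i];
			live |= 1 << i;
		}
	}
	return live;
}

static void save_lrs(uint64_t *cached, uint16_t live, uint16_t elrsr)
{
	int i;

	for (i = 0; i < NR_LR; i++) {
		if (!(live & (1 << i)))
			continue;		/* never loaded, nothing to do */
		if (elrsr & (1 << i)) {
			cached[i] &= ~LR_STATE;	/* hardware says it is empty */
			continue;
		}
		cached[i] = hw_lr[i];
		hw_lr[i] = 0;
	}
}

int main(void)
{
	uint64_t lrs[NR_LR] = { [0] = LR_STATE | 27, [3] = LR_STATE | 42 };
	uint16_t live = restore_lrs(lrs);

	save_lrs(lrs, live, 1 << 0);	/* pretend the hardware retired LR0 */
	printf("live=%#x lr0=%#llx lr3=%#llx\n", live,
	       (unsigned long long)lrs[0], (unsigned long long)lrs[3]);
	return 0;
}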

+ 7 - 0
arch/arm64/kvm/reset.c

@@ -77,7 +77,11 @@ int kvm_arch_dev_ioctl_check_extension(long ext)
 	case KVM_CAP_GUEST_DEBUG_HW_WPS:
 		r = get_num_wrps();
 		break;
+	case KVM_CAP_ARM_PMU_V3:
+		r = kvm_arm_support_pmu_v3();
+		break;
 	case KVM_CAP_SET_GUEST_DEBUG:
+	case KVM_CAP_VCPU_ATTRIBUTES:
 		r = 1;
 		break;
 	default:
@@ -120,6 +124,9 @@ int kvm_reset_vcpu(struct kvm_vcpu *vcpu)
 	/* Reset system registers */
 	kvm_reset_sys_regs(vcpu);
 
+	/* Reset PMU */
+	kvm_pmu_vcpu_reset(vcpu);
+
 	/* Reset timer */
 	return kvm_timer_vcpu_reset(vcpu, cpu_vtimer_irq);
 }
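kvm_arch_dev_ioctl_check_extension() is what backs KVM_CHECK_EXTENSION for these arm64-specific capabilities, so userspace can probe for the new PMU and vcpu-attribute support before relying on it. A minimal probe, assuming a <linux/kvm.h> new enough to define the two constants added by this series:

#include <fcntl.h>
#include <stdio.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>

int main(void)
{
	int kvm = open("/dev/kvm", O_RDWR | O_CLOEXEC);

	if (kvm < 0) {
		perror("/dev/kvm");
		return 1;
	}
	/* KVM_CHECK_EXTENSION returns a positive value when supported */
	printf("PMUv3 for guests: %s\n",
	       ioctl(kvm, KVM_CHECK_EXTENSION, KVM_CAP_ARM_PMU_V3) > 0 ?
	       "supported" : "not supported");
	printf("vcpu attributes:  %s\n",
	       ioctl(kvm, KVM_CHECK_EXTENSION, KVM_CAP_VCPU_ATTRIBUTES) > 0 ?
	       "supported" : "not supported");
	return 0;
}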

+ 562 - 47
arch/arm64/kvm/sys_regs.c

@@ -20,6 +20,7 @@
  * along with this program.  If not, see <http://www.gnu.org/licenses/>.
  */
 
+#include <linux/bsearch.h>
 #include <linux/kvm_host.h>
 #include <linux/mm.h>
 #include <linux/uaccess.h>
@@ -34,6 +35,7 @@
 #include <asm/kvm_emulate.h>
 #include <asm/kvm_host.h>
 #include <asm/kvm_mmu.h>
+#include <asm/perf_event.h>
 
 #include <trace/events/kvm.h>
 
@@ -439,6 +441,344 @@ static void reset_mpidr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r)
 	vcpu_sys_reg(vcpu, MPIDR_EL1) = (1ULL << 31) | mpidr;
 }
 
+static void reset_pmcr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r)
+{
+	u64 pmcr, val;
+
+	asm volatile("mrs %0, pmcr_el0\n" : "=r" (pmcr));
+	/* Writable bits of PMCR_EL0 (ARMV8_PMU_PMCR_MASK) are reset to UNKNOWN,
+	 * except PMCR.E, which resets to zero.
+	 */
+	val = ((pmcr & ~ARMV8_PMU_PMCR_MASK)
+	       | (ARMV8_PMU_PMCR_MASK & 0xdecafbad)) & (~ARMV8_PMU_PMCR_E);
+	vcpu_sys_reg(vcpu, PMCR_EL0) = val;
+}
+
+static bool pmu_access_el0_disabled(struct kvm_vcpu *vcpu)
+{
+	u64 reg = vcpu_sys_reg(vcpu, PMUSERENR_EL0);
+
+	return !((reg & ARMV8_PMU_USERENR_EN) || vcpu_mode_priv(vcpu));
+}
+
+static bool pmu_write_swinc_el0_disabled(struct kvm_vcpu *vcpu)
+{
+	u64 reg = vcpu_sys_reg(vcpu, PMUSERENR_EL0);
+
+	return !((reg & (ARMV8_PMU_USERENR_SW | ARMV8_PMU_USERENR_EN))
+		 || vcpu_mode_priv(vcpu));
+}
+
+static bool pmu_access_cycle_counter_el0_disabled(struct kvm_vcpu *vcpu)
+{
+	u64 reg = vcpu_sys_reg(vcpu, PMUSERENR_EL0);
+
+	return !((reg & (ARMV8_PMU_USERENR_CR | ARMV8_PMU_USERENR_EN))
+		 || vcpu_mode_priv(vcpu));
+}
+
+static bool pmu_access_event_counter_el0_disabled(struct kvm_vcpu *vcpu)
+{
+	u64 reg = vcpu_sys_reg(vcpu, PMUSERENR_EL0);
+
+	return !((reg & (ARMV8_PMU_USERENR_ER | ARMV8_PMU_USERENR_EN))
+		 || vcpu_mode_priv(vcpu));
+}
+
+static bool access_pmcr(struct kvm_vcpu *vcpu, struct sys_reg_params *p,
+			const struct sys_reg_desc *r)
+{
+	u64 val;
+
+	if (!kvm_arm_pmu_v3_ready(vcpu))
+		return trap_raz_wi(vcpu, p, r);
+
+	if (pmu_access_el0_disabled(vcpu))
+		return false;
+
+	if (p->is_write) {
+		/* Only update writeable bits of PMCR */
+		val = vcpu_sys_reg(vcpu, PMCR_EL0);
+		val &= ~ARMV8_PMU_PMCR_MASK;
+		val |= p->regval & ARMV8_PMU_PMCR_MASK;
+		vcpu_sys_reg(vcpu, PMCR_EL0) = val;
+		kvm_pmu_handle_pmcr(vcpu, val);
+	} else {
+		/* PMCR.P & PMCR.C are RAZ */
+		val = vcpu_sys_reg(vcpu, PMCR_EL0)
+		      & ~(ARMV8_PMU_PMCR_P | ARMV8_PMU_PMCR_C);
+		p->regval = val;
+	}
+
+	return true;
+}
+
+static bool access_pmselr(struct kvm_vcpu *vcpu, struct sys_reg_params *p,
+			  const struct sys_reg_desc *r)
+{
+	if (!kvm_arm_pmu_v3_ready(vcpu))
+		return trap_raz_wi(vcpu, p, r);
+
+	if (pmu_access_event_counter_el0_disabled(vcpu))
+		return false;
+
+	if (p->is_write)
+		vcpu_sys_reg(vcpu, PMSELR_EL0) = p->regval;
+	else
+		/* return PMSELR.SEL field */
+		p->regval = vcpu_sys_reg(vcpu, PMSELR_EL0)
+			    & ARMV8_PMU_COUNTER_MASK;
+
+	return true;
+}
+
+static bool access_pmceid(struct kvm_vcpu *vcpu, struct sys_reg_params *p,
+			  const struct sys_reg_desc *r)
+{
+	u64 pmceid;
+
+	if (!kvm_arm_pmu_v3_ready(vcpu))
+		return trap_raz_wi(vcpu, p, r);
+
+	BUG_ON(p->is_write);
+
+	if (pmu_access_el0_disabled(vcpu))
+		return false;
+
+	if (!(p->Op2 & 1))
+		asm volatile("mrs %0, pmceid0_el0\n" : "=r" (pmceid));
+	else
+		asm volatile("mrs %0, pmceid1_el0\n" : "=r" (pmceid));
+
+	p->regval = pmceid;
+
+	return true;
+}
+
+static bool pmu_counter_idx_valid(struct kvm_vcpu *vcpu, u64 idx)
+{
+	u64 pmcr, val;
+
+	pmcr = vcpu_sys_reg(vcpu, PMCR_EL0);
+	val = (pmcr >> ARMV8_PMU_PMCR_N_SHIFT) & ARMV8_PMU_PMCR_N_MASK;
+	if (idx >= val && idx != ARMV8_PMU_CYCLE_IDX)
+		return false;
+
+	return true;
+}
+
+static bool access_pmu_evcntr(struct kvm_vcpu *vcpu,
+			      struct sys_reg_params *p,
+			      const struct sys_reg_desc *r)
+{
+	u64 idx;
+
+	if (!kvm_arm_pmu_v3_ready(vcpu))
+		return trap_raz_wi(vcpu, p, r);
+
+	if (r->CRn == 9 && r->CRm == 13) {
+		if (r->Op2 == 2) {
+			/* PMXEVCNTR_EL0 */
+			if (pmu_access_event_counter_el0_disabled(vcpu))
+				return false;
+
+			idx = vcpu_sys_reg(vcpu, PMSELR_EL0)
+			      & ARMV8_PMU_COUNTER_MASK;
+		} else if (r->Op2 == 0) {
+			/* PMCCNTR_EL0 */
+			if (pmu_access_cycle_counter_el0_disabled(vcpu))
+				return false;
+
+			idx = ARMV8_PMU_CYCLE_IDX;
+		} else {
+			BUG();
+		}
+	} else if (r->CRn == 14 && (r->CRm & 12) == 8) {
+		/* PMEVCNTRn_EL0 */
+		if (pmu_access_event_counter_el0_disabled(vcpu))
+			return false;
+
+		idx = ((r->CRm & 3) << 3) | (r->Op2 & 7);
+	} else {
+		BUG();
+	}
+
+	if (!pmu_counter_idx_valid(vcpu, idx))
+		return false;
+
+	if (p->is_write) {
+		if (pmu_access_el0_disabled(vcpu))
+			return false;
+
+		kvm_pmu_set_counter_value(vcpu, idx, p->regval);
+	} else {
+		p->regval = kvm_pmu_get_counter_value(vcpu, idx);
+	}
+
+	return true;
+}
+
+static bool access_pmu_evtyper(struct kvm_vcpu *vcpu, struct sys_reg_params *p,
+			       const struct sys_reg_desc *r)
+{
+	u64 idx, reg;
+
+	if (!kvm_arm_pmu_v3_ready(vcpu))
+		return trap_raz_wi(vcpu, p, r);
+
+	if (pmu_access_el0_disabled(vcpu))
+		return false;
+
+	if (r->CRn == 9 && r->CRm == 13 && r->Op2 == 1) {
+		/* PMXEVTYPER_EL0 */
+		idx = vcpu_sys_reg(vcpu, PMSELR_EL0) & ARMV8_PMU_COUNTER_MASK;
+		reg = PMEVTYPER0_EL0 + idx;
+	} else if (r->CRn == 14 && (r->CRm & 12) == 12) {
+		idx = ((r->CRm & 3) << 3) | (r->Op2 & 7);
+		if (idx == ARMV8_PMU_CYCLE_IDX)
+			reg = PMCCFILTR_EL0;
+		else
+			/* PMEVTYPERn_EL0 */
+			reg = PMEVTYPER0_EL0 + idx;
+	} else {
+		BUG();
+	}
+
+	if (!pmu_counter_idx_valid(vcpu, idx))
+		return false;
+
+	if (p->is_write) {
+		kvm_pmu_set_counter_event_type(vcpu, p->regval, idx);
+		vcpu_sys_reg(vcpu, reg) = p->regval & ARMV8_PMU_EVTYPE_MASK;
+	} else {
+		p->regval = vcpu_sys_reg(vcpu, reg) & ARMV8_PMU_EVTYPE_MASK;
+	}
+
+	return true;
+}
+
+static bool access_pmcnten(struct kvm_vcpu *vcpu, struct sys_reg_params *p,
+			   const struct sys_reg_desc *r)
+{
+	u64 val, mask;
+
+	if (!kvm_arm_pmu_v3_ready(vcpu))
+		return trap_raz_wi(vcpu, p, r);
+
+	if (pmu_access_el0_disabled(vcpu))
+		return false;
+
+	mask = kvm_pmu_valid_counter_mask(vcpu);
+	if (p->is_write) {
+		val = p->regval & mask;
+		if (r->Op2 & 0x1) {
+			/* accessing PMCNTENSET_EL0 */
+			vcpu_sys_reg(vcpu, PMCNTENSET_EL0) |= val;
+			kvm_pmu_enable_counter(vcpu, val);
+		} else {
+			/* accessing PMCNTENCLR_EL0 */
+			vcpu_sys_reg(vcpu, PMCNTENSET_EL0) &= ~val;
+			kvm_pmu_disable_counter(vcpu, val);
+		}
+	} else {
+		p->regval = vcpu_sys_reg(vcpu, PMCNTENSET_EL0) & mask;
+	}
+
+	return true;
+}
+
+static bool access_pminten(struct kvm_vcpu *vcpu, struct sys_reg_params *p,
+			   const struct sys_reg_desc *r)
+{
+	u64 mask = kvm_pmu_valid_counter_mask(vcpu);
+
+	if (!kvm_arm_pmu_v3_ready(vcpu))
+		return trap_raz_wi(vcpu, p, r);
+
+	if (!vcpu_mode_priv(vcpu))
+		return false;
+
+	if (p->is_write) {
+		u64 val = p->regval & mask;
+
+		if (r->Op2 & 0x1)
+			/* accessing PMINTENSET_EL1 */
+			vcpu_sys_reg(vcpu, PMINTENSET_EL1) |= val;
+		else
+			/* accessing PMINTENCLR_EL1 */
+			vcpu_sys_reg(vcpu, PMINTENSET_EL1) &= ~val;
+	} else {
+		p->regval = vcpu_sys_reg(vcpu, PMINTENSET_EL1) & mask;
+	}
+
+	return true;
+}
+
+static bool access_pmovs(struct kvm_vcpu *vcpu, struct sys_reg_params *p,
+			 const struct sys_reg_desc *r)
+{
+	u64 mask = kvm_pmu_valid_counter_mask(vcpu);
+
+	if (!kvm_arm_pmu_v3_ready(vcpu))
+		return trap_raz_wi(vcpu, p, r);
+
+	if (pmu_access_el0_disabled(vcpu))
+		return false;
+
+	if (p->is_write) {
+		if (r->CRm & 0x2)
+			/* accessing PMOVSSET_EL0 */
+			kvm_pmu_overflow_set(vcpu, p->regval & mask);
+		else
+			/* accessing PMOVSCLR_EL0 */
+			vcpu_sys_reg(vcpu, PMOVSSET_EL0) &= ~(p->regval & mask);
+	} else {
+		p->regval = vcpu_sys_reg(vcpu, PMOVSSET_EL0) & mask;
+	}
+
+	return true;
+}
+
+static bool access_pmswinc(struct kvm_vcpu *vcpu, struct sys_reg_params *p,
+			   const struct sys_reg_desc *r)
+{
+	u64 mask;
+
+	if (!kvm_arm_pmu_v3_ready(vcpu))
+		return trap_raz_wi(vcpu, p, r);
+
+	if (pmu_write_swinc_el0_disabled(vcpu))
+		return false;
+
+	if (p->is_write) {
+		mask = kvm_pmu_valid_counter_mask(vcpu);
+		kvm_pmu_software_increment(vcpu, p->regval & mask);
+		return true;
+	}
+
+	return false;
+}
+
+static bool access_pmuserenr(struct kvm_vcpu *vcpu, struct sys_reg_params *p,
+			     const struct sys_reg_desc *r)
+{
+	if (!kvm_arm_pmu_v3_ready(vcpu))
+		return trap_raz_wi(vcpu, p, r);
+
+	if (p->is_write) {
+		if (!vcpu_mode_priv(vcpu))
+			return false;
+
+		vcpu_sys_reg(vcpu, PMUSERENR_EL0) = p->regval
+						    & ARMV8_PMU_USERENR_MASK;
+	} else {
+		p->regval = vcpu_sys_reg(vcpu, PMUSERENR_EL0)
+			    & ARMV8_PMU_USERENR_MASK;
+	}
+
+	return true;
+}
+
 /* Silly macro to expand the DBG{BCR,BVR,WVR,WCR}n_EL1 registers in one go */
 #define DBG_BCR_BVR_WCR_WVR_EL1(n)					\
 	/* DBGBVRn_EL1 */						\
@@ -454,6 +794,20 @@ static void reset_mpidr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r)
 	{ Op0(0b10), Op1(0b000), CRn(0b0000), CRm((n)), Op2(0b111),	\
 	  trap_wcr, reset_wcr, n, 0,  get_wcr, set_wcr }
 
+/* Macro to expand the PMEVCNTRn_EL0 register */
+#define PMU_PMEVCNTR_EL0(n)						\
+	/* PMEVCNTRn_EL0 */						\
+	{ Op0(0b11), Op1(0b011), CRn(0b1110),				\
+	  CRm((0b1000 | (((n) >> 3) & 0x3))), Op2(((n) & 0x7)),		\
+	  access_pmu_evcntr, reset_unknown, (PMEVCNTR0_EL0 + n), }
+
+/* Macro to expand the PMEVTYPERn_EL0 register */
+#define PMU_PMEVTYPER_EL0(n)						\
+	/* PMEVTYPERn_EL0 */						\
+	{ Op0(0b11), Op1(0b011), CRn(0b1110),				\
+	  CRm((0b1100 | (((n) >> 3) & 0x3))), Op2(((n) & 0x7)),		\
+	  access_pmu_evtyper, reset_unknown, (PMEVTYPER0_EL0 + n), }
+
 /*
  * Architected system registers.
  * Important: Must be sorted ascending by Op0, Op1, CRn, CRm, Op2
@@ -583,10 +937,10 @@ static const struct sys_reg_desc sys_reg_descs[] = {
 
 	/* PMINTENSET_EL1 */
 	{ Op0(0b11), Op1(0b000), CRn(0b1001), CRm(0b1110), Op2(0b001),
-	  trap_raz_wi },
+	  access_pminten, reset_unknown, PMINTENSET_EL1 },
 	/* PMINTENCLR_EL1 */
 	{ Op0(0b11), Op1(0b000), CRn(0b1001), CRm(0b1110), Op2(0b010),
-	  trap_raz_wi },
+	  access_pminten, NULL, PMINTENSET_EL1 },
 
 	/* MAIR_EL1 */
 	{ Op0(0b11), Op1(0b000), CRn(0b1010), CRm(0b0010), Op2(0b000),
@@ -623,43 +977,46 @@ static const struct sys_reg_desc sys_reg_descs[] = {
 
 	/* PMCR_EL0 */
 	{ Op0(0b11), Op1(0b011), CRn(0b1001), CRm(0b1100), Op2(0b000),
-	  trap_raz_wi },
+	  access_pmcr, reset_pmcr, },
 	/* PMCNTENSET_EL0 */
 	{ Op0(0b11), Op1(0b011), CRn(0b1001), CRm(0b1100), Op2(0b001),
-	  trap_raz_wi },
+	  access_pmcnten, reset_unknown, PMCNTENSET_EL0 },
 	/* PMCNTENCLR_EL0 */
 	{ Op0(0b11), Op1(0b011), CRn(0b1001), CRm(0b1100), Op2(0b010),
-	  trap_raz_wi },
+	  access_pmcnten, NULL, PMCNTENSET_EL0 },
 	/* PMOVSCLR_EL0 */
 	{ Op0(0b11), Op1(0b011), CRn(0b1001), CRm(0b1100), Op2(0b011),
-	  trap_raz_wi },
+	  access_pmovs, NULL, PMOVSSET_EL0 },
 	/* PMSWINC_EL0 */
 	{ Op0(0b11), Op1(0b011), CRn(0b1001), CRm(0b1100), Op2(0b100),
-	  trap_raz_wi },
+	  access_pmswinc, reset_unknown, PMSWINC_EL0 },
 	/* PMSELR_EL0 */
 	{ Op0(0b11), Op1(0b011), CRn(0b1001), CRm(0b1100), Op2(0b101),
-	  trap_raz_wi },
+	  access_pmselr, reset_unknown, PMSELR_EL0 },
 	/* PMCEID0_EL0 */
 	{ Op0(0b11), Op1(0b011), CRn(0b1001), CRm(0b1100), Op2(0b110),
-	  trap_raz_wi },
+	  access_pmceid },
 	/* PMCEID1_EL0 */
 	{ Op0(0b11), Op1(0b011), CRn(0b1001), CRm(0b1100), Op2(0b111),
-	  trap_raz_wi },
+	  access_pmceid },
 	/* PMCCNTR_EL0 */
 	{ Op0(0b11), Op1(0b011), CRn(0b1001), CRm(0b1101), Op2(0b000),
-	  trap_raz_wi },
+	  access_pmu_evcntr, reset_unknown, PMCCNTR_EL0 },
 	/* PMXEVTYPER_EL0 */
 	{ Op0(0b11), Op1(0b011), CRn(0b1001), CRm(0b1101), Op2(0b001),
-	  trap_raz_wi },
+	  access_pmu_evtyper },
 	/* PMXEVCNTR_EL0 */
 	{ Op0(0b11), Op1(0b011), CRn(0b1001), CRm(0b1101), Op2(0b010),
-	  trap_raz_wi },
-	/* PMUSERENR_EL0 */
+	  access_pmu_evcntr },
+	/* PMUSERENR_EL0
+	 * This register resets as unknown in 64bit mode while it resets as zero
+	 * in 32bit mode. Here we choose to reset it as zero for consistency.
+	 */
 	{ Op0(0b11), Op1(0b011), CRn(0b1001), CRm(0b1110), Op2(0b000),
-	  trap_raz_wi },
+	  access_pmuserenr, reset_val, PMUSERENR_EL0, 0 },
 	/* PMOVSSET_EL0 */
 	{ Op0(0b11), Op1(0b011), CRn(0b1001), CRm(0b1110), Op2(0b011),
-	  trap_raz_wi },
+	  access_pmovs, reset_unknown, PMOVSSET_EL0 },
 
 	/* TPIDR_EL0 */
 	{ Op0(0b11), Op1(0b011), CRn(0b1101), CRm(0b0000), Op2(0b010),
@@ -668,6 +1025,77 @@ static const struct sys_reg_desc sys_reg_descs[] = {
 	{ Op0(0b11), Op1(0b011), CRn(0b1101), CRm(0b0000), Op2(0b011),
 	  NULL, reset_unknown, TPIDRRO_EL0 },
 
+	/* PMEVCNTRn_EL0 */
+	PMU_PMEVCNTR_EL0(0),
+	PMU_PMEVCNTR_EL0(1),
+	PMU_PMEVCNTR_EL0(2),
+	PMU_PMEVCNTR_EL0(3),
+	PMU_PMEVCNTR_EL0(4),
+	PMU_PMEVCNTR_EL0(5),
+	PMU_PMEVCNTR_EL0(6),
+	PMU_PMEVCNTR_EL0(7),
+	PMU_PMEVCNTR_EL0(8),
+	PMU_PMEVCNTR_EL0(9),
+	PMU_PMEVCNTR_EL0(10),
+	PMU_PMEVCNTR_EL0(11),
+	PMU_PMEVCNTR_EL0(12),
+	PMU_PMEVCNTR_EL0(13),
+	PMU_PMEVCNTR_EL0(14),
+	PMU_PMEVCNTR_EL0(15),
+	PMU_PMEVCNTR_EL0(16),
+	PMU_PMEVCNTR_EL0(17),
+	PMU_PMEVCNTR_EL0(18),
+	PMU_PMEVCNTR_EL0(19),
+	PMU_PMEVCNTR_EL0(20),
+	PMU_PMEVCNTR_EL0(21),
+	PMU_PMEVCNTR_EL0(22),
+	PMU_PMEVCNTR_EL0(23),
+	PMU_PMEVCNTR_EL0(24),
+	PMU_PMEVCNTR_EL0(25),
+	PMU_PMEVCNTR_EL0(26),
+	PMU_PMEVCNTR_EL0(27),
+	PMU_PMEVCNTR_EL0(28),
+	PMU_PMEVCNTR_EL0(29),
+	PMU_PMEVCNTR_EL0(30),
+	/* PMEVTYPERn_EL0 */
+	PMU_PMEVTYPER_EL0(0),
+	PMU_PMEVTYPER_EL0(1),
+	PMU_PMEVTYPER_EL0(2),
+	PMU_PMEVTYPER_EL0(3),
+	PMU_PMEVTYPER_EL0(4),
+	PMU_PMEVTYPER_EL0(5),
+	PMU_PMEVTYPER_EL0(6),
+	PMU_PMEVTYPER_EL0(7),
+	PMU_PMEVTYPER_EL0(8),
+	PMU_PMEVTYPER_EL0(9),
+	PMU_PMEVTYPER_EL0(10),
+	PMU_PMEVTYPER_EL0(11),
+	PMU_PMEVTYPER_EL0(12),
+	PMU_PMEVTYPER_EL0(13),
+	PMU_PMEVTYPER_EL0(14),
+	PMU_PMEVTYPER_EL0(15),
+	PMU_PMEVTYPER_EL0(16),
+	PMU_PMEVTYPER_EL0(17),
+	PMU_PMEVTYPER_EL0(18),
+	PMU_PMEVTYPER_EL0(19),
+	PMU_PMEVTYPER_EL0(20),
+	PMU_PMEVTYPER_EL0(21),
+	PMU_PMEVTYPER_EL0(22),
+	PMU_PMEVTYPER_EL0(23),
+	PMU_PMEVTYPER_EL0(24),
+	PMU_PMEVTYPER_EL0(25),
+	PMU_PMEVTYPER_EL0(26),
+	PMU_PMEVTYPER_EL0(27),
+	PMU_PMEVTYPER_EL0(28),
+	PMU_PMEVTYPER_EL0(29),
+	PMU_PMEVTYPER_EL0(30),
+	/* PMCCFILTR_EL0
+	 * This register resets as unknown in 64bit mode while it resets as zero
+	 * in 32bit mode. Here we choose to reset it as zero for consistency.
+	 */
+	{ Op0(0b11), Op1(0b011), CRn(0b1110), CRm(0b1111), Op2(0b111),
+	  access_pmu_evtyper, reset_val, PMCCFILTR_EL0, 0 },
+
 	/* DACR32_EL2 */
 	{ Op0(0b11), Op1(0b100), CRn(0b0011), CRm(0b0000), Op2(0b000),
 	  NULL, reset_unknown, DACR32_EL2 },
@@ -857,6 +1285,20 @@ static const struct sys_reg_desc cp14_64_regs[] = {
 	{ Op1( 0), CRm( 2), .access = trap_raz_wi },
 };
 
+/* Macro to expand the PMEVCNTRn register */
+#define PMU_PMEVCNTR(n)							\
+	/* PMEVCNTRn */							\
+	{ Op1(0), CRn(0b1110),						\
+	  CRm((0b1000 | (((n) >> 3) & 0x3))), Op2(((n) & 0x7)),		\
+	  access_pmu_evcntr }
+
+/* Macro to expand the PMEVTYPERn register */
+#define PMU_PMEVTYPER(n)						\
+	/* PMEVTYPERn */						\
+	{ Op1(0), CRn(0b1110),						\
+	  CRm((0b1100 | (((n) >> 3) & 0x3))), Op2(((n) & 0x7)),		\
+	  access_pmu_evtyper }
+
 /*
  * Trapped cp15 registers. TTBR0/TTBR1 get a double encoding,
  * depending on the way they are accessed (as a 32bit or a 64bit
@@ -885,19 +1327,21 @@ static const struct sys_reg_desc cp15_regs[] = {
 	{ Op1( 0), CRn( 7), CRm(14), Op2( 2), access_dcsw },
 
 	/* PMU */
-	{ Op1( 0), CRn( 9), CRm(12), Op2( 0), trap_raz_wi },
-	{ Op1( 0), CRn( 9), CRm(12), Op2( 1), trap_raz_wi },
-	{ Op1( 0), CRn( 9), CRm(12), Op2( 2), trap_raz_wi },
-	{ Op1( 0), CRn( 9), CRm(12), Op2( 3), trap_raz_wi },
-	{ Op1( 0), CRn( 9), CRm(12), Op2( 5), trap_raz_wi },
-	{ Op1( 0), CRn( 9), CRm(12), Op2( 6), trap_raz_wi },
-	{ Op1( 0), CRn( 9), CRm(12), Op2( 7), trap_raz_wi },
-	{ Op1( 0), CRn( 9), CRm(13), Op2( 0), trap_raz_wi },
-	{ Op1( 0), CRn( 9), CRm(13), Op2( 1), trap_raz_wi },
-	{ Op1( 0), CRn( 9), CRm(13), Op2( 2), trap_raz_wi },
-	{ Op1( 0), CRn( 9), CRm(14), Op2( 0), trap_raz_wi },
-	{ Op1( 0), CRn( 9), CRm(14), Op2( 1), trap_raz_wi },
-	{ Op1( 0), CRn( 9), CRm(14), Op2( 2), trap_raz_wi },
+	{ Op1( 0), CRn( 9), CRm(12), Op2( 0), access_pmcr },
+	{ Op1( 0), CRn( 9), CRm(12), Op2( 1), access_pmcnten },
+	{ Op1( 0), CRn( 9), CRm(12), Op2( 2), access_pmcnten },
+	{ Op1( 0), CRn( 9), CRm(12), Op2( 3), access_pmovs },
+	{ Op1( 0), CRn( 9), CRm(12), Op2( 4), access_pmswinc },
+	{ Op1( 0), CRn( 9), CRm(12), Op2( 5), access_pmselr },
+	{ Op1( 0), CRn( 9), CRm(12), Op2( 6), access_pmceid },
+	{ Op1( 0), CRn( 9), CRm(12), Op2( 7), access_pmceid },
+	{ Op1( 0), CRn( 9), CRm(13), Op2( 0), access_pmu_evcntr },
+	{ Op1( 0), CRn( 9), CRm(13), Op2( 1), access_pmu_evtyper },
+	{ Op1( 0), CRn( 9), CRm(13), Op2( 2), access_pmu_evcntr },
+	{ Op1( 0), CRn( 9), CRm(14), Op2( 0), access_pmuserenr },
+	{ Op1( 0), CRn( 9), CRm(14), Op2( 1), access_pminten },
+	{ Op1( 0), CRn( 9), CRm(14), Op2( 2), access_pminten },
+	{ Op1( 0), CRn( 9), CRm(14), Op2( 3), access_pmovs },
 
 	{ Op1( 0), CRn(10), CRm( 2), Op2( 0), access_vm_reg, NULL, c10_PRRR },
 	{ Op1( 0), CRn(10), CRm( 2), Op2( 1), access_vm_reg, NULL, c10_NMRR },
@@ -908,10 +1352,78 @@ static const struct sys_reg_desc cp15_regs[] = {
 	{ Op1( 0), CRn(12), CRm(12), Op2( 5), trap_raz_wi },
 
 	{ Op1( 0), CRn(13), CRm( 0), Op2( 1), access_vm_reg, NULL, c13_CID },
+
+	/* PMEVCNTRn */
+	PMU_PMEVCNTR(0),
+	PMU_PMEVCNTR(1),
+	PMU_PMEVCNTR(2),
+	PMU_PMEVCNTR(3),
+	PMU_PMEVCNTR(4),
+	PMU_PMEVCNTR(5),
+	PMU_PMEVCNTR(6),
+	PMU_PMEVCNTR(7),
+	PMU_PMEVCNTR(8),
+	PMU_PMEVCNTR(9),
+	PMU_PMEVCNTR(10),
+	PMU_PMEVCNTR(11),
+	PMU_PMEVCNTR(12),
+	PMU_PMEVCNTR(13),
+	PMU_PMEVCNTR(14),
+	PMU_PMEVCNTR(15),
+	PMU_PMEVCNTR(16),
+	PMU_PMEVCNTR(17),
+	PMU_PMEVCNTR(18),
+	PMU_PMEVCNTR(19),
+	PMU_PMEVCNTR(20),
+	PMU_PMEVCNTR(21),
+	PMU_PMEVCNTR(22),
+	PMU_PMEVCNTR(23),
+	PMU_PMEVCNTR(24),
+	PMU_PMEVCNTR(25),
+	PMU_PMEVCNTR(26),
+	PMU_PMEVCNTR(27),
+	PMU_PMEVCNTR(28),
+	PMU_PMEVCNTR(29),
+	PMU_PMEVCNTR(30),
+	/* PMEVTYPERn */
+	PMU_PMEVTYPER(0),
+	PMU_PMEVTYPER(1),
+	PMU_PMEVTYPER(2),
+	PMU_PMEVTYPER(3),
+	PMU_PMEVTYPER(4),
+	PMU_PMEVTYPER(5),
+	PMU_PMEVTYPER(6),
+	PMU_PMEVTYPER(7),
+	PMU_PMEVTYPER(8),
+	PMU_PMEVTYPER(9),
+	PMU_PMEVTYPER(10),
+	PMU_PMEVTYPER(11),
+	PMU_PMEVTYPER(12),
+	PMU_PMEVTYPER(13),
+	PMU_PMEVTYPER(14),
+	PMU_PMEVTYPER(15),
+	PMU_PMEVTYPER(16),
+	PMU_PMEVTYPER(17),
+	PMU_PMEVTYPER(18),
+	PMU_PMEVTYPER(19),
+	PMU_PMEVTYPER(20),
+	PMU_PMEVTYPER(21),
+	PMU_PMEVTYPER(22),
+	PMU_PMEVTYPER(23),
+	PMU_PMEVTYPER(24),
+	PMU_PMEVTYPER(25),
+	PMU_PMEVTYPER(26),
+	PMU_PMEVTYPER(27),
+	PMU_PMEVTYPER(28),
+	PMU_PMEVTYPER(29),
+	PMU_PMEVTYPER(30),
+	/* PMCCFILTR */
+	{ Op1(0), CRn(14), CRm(15), Op2(7), access_pmu_evtyper },
 };
 
 static const struct sys_reg_desc cp15_64_regs[] = {
 	{ Op1( 0), CRn( 0), CRm( 2), Op2( 0), access_vm_reg, NULL, c2_TTBR0 },
+	{ Op1( 0), CRn( 0), CRm( 9), Op2( 0), access_pmu_evcntr },
 	{ Op1( 0), CRn( 0), CRm(12), Op2( 0), access_gic_sgi },
 	{ Op1( 1), CRn( 0), CRm( 2), Op2( 0), access_vm_reg, NULL, c2_TTBR1 },
 };
@@ -942,29 +1454,32 @@ static const struct sys_reg_desc *get_target_table(unsigned target,
 	}
 }
 
+#define reg_to_match_value(x)						\
+	({								\
+		unsigned long val;					\
+		val  = (x)->Op0 << 14;					\
+		val |= (x)->Op1 << 11;					\
+		val |= (x)->CRn << 7;					\
+		val |= (x)->CRm << 3;					\
+		val |= (x)->Op2;					\
+		val;							\
+	 })
+
+static int match_sys_reg(const void *key, const void *elt)
+{
+	const unsigned long pval = (unsigned long)key;
+	const struct sys_reg_desc *r = elt;
+
+	return pval - reg_to_match_value(r);
+}
+
 static const struct sys_reg_desc *find_reg(const struct sys_reg_params *params,
 					 const struct sys_reg_desc table[],
 					 unsigned int num)
 {
-	unsigned int i;
-
-	for (i = 0; i < num; i++) {
-		const struct sys_reg_desc *r = &table[i];
-
-		if (params->Op0 != r->Op0)
-			continue;
-		if (params->Op1 != r->Op1)
-			continue;
-		if (params->CRn != r->CRn)
-			continue;
-		if (params->CRm != r->CRm)
-			continue;
-		if (params->Op2 != r->Op2)
-			continue;
+	unsigned long pval = reg_to_match_value(params);
 
-		return r;
-	}
-	return NULL;
+	return bsearch((void *)pval, table, num, sizeof(table[0]), match_sys_reg);
 }
 
 int kvm_handle_cp14_load_store(struct kvm_vcpu *vcpu, struct kvm_run *run)
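find_reg() now packs the five encoding fields into a single integer and binary-searches the descriptor table, which is why the table comment insists on ascending (Op0, Op1, CRn, CRm, Op2) order. A self-contained sketch of the same lookup over a toy table follows; it uses an explicit three-way comparison instead of the subtraction in match_sys_reg(), purely to keep the sketch obviously overflow-safe, and the three entries are illustrative:

#include <stdio.h>
#include <stdlib.h>

struct desc { unsigned Op0, Op1, CRn, CRm, Op2; const char *name; };

/* Same field packing as reg_to_match_value() */
static unsigned long key_of(const struct desc *d)
{
	return ((unsigned long)d->Op0 << 14) | (d->Op1 << 11) |
	       (d->CRn << 7) | (d->CRm << 3) | d->Op2;
}

static int match(const void *key, const void *elt)
{
	unsigned long pval = (unsigned long)key;
	unsigned long rval = key_of(elt);

	return pval < rval ? -1 : (pval > rval ? 1 : 0);
}

/* Must be sorted ascending by the packed key, like sys_reg_descs[] */
static const struct desc table[] = {
	{ 3, 0, 1,  0, 0, "SCTLR_EL1" },
	{ 3, 0, 2,  0, 2, "TCR_EL1"   },
	{ 3, 3, 9, 12, 0, "PMCR_EL0"  },
};

int main(void)
{
	struct desc want = { 3, 3, 9, 12, 0, NULL };
	const struct desc *r = bsearch((void *)key_of(&want), table,
				       sizeof(table) / sizeof(table[0]),
				       sizeof(table[0]), match);

	printf("%s\n", r ? r->name : "not found");
	return 0;
}

The payoff is O(log n) lookup instead of a five-field linear scan, which matters now that the table has grown by sixty-odd PMU entries.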

+ 0 - 2
arch/powerpc/include/asm/kvm_book3s_64.h

@@ -33,8 +33,6 @@ static inline void svcpu_put(struct kvmppc_book3s_shadow_vcpu *svcpu)
 }
 #endif
 
-#define SPAPR_TCE_SHIFT		12
-
 #ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
 #define KVM_DEFAULT_HPT_ORDER	24	/* 16MB HPT by default */
 #endif

+ 4 - 1
arch/powerpc/include/asm/kvm_host.h

@@ -182,7 +182,10 @@ struct kvmppc_spapr_tce_table {
 	struct list_head list;
 	struct kvm *kvm;
 	u64 liobn;
-	u32 window_size;
+	struct rcu_head rcu;
+	u32 page_shift;
+	u64 offset;		/* in pages */
+	u64 size;		/* window size in pages */
 	struct page *pages[0];
 };
 

+ 50 - 1
arch/powerpc/include/asm/kvm_ppc.h

@@ -165,9 +165,25 @@ extern void kvmppc_map_vrma(struct kvm_vcpu *vcpu,
 extern int kvmppc_pseries_do_hcall(struct kvm_vcpu *vcpu);
 
 extern long kvm_vm_ioctl_create_spapr_tce(struct kvm *kvm,
-				struct kvm_create_spapr_tce *args);
+				struct kvm_create_spapr_tce_64 *args);
+extern struct kvmppc_spapr_tce_table *kvmppc_find_table(
+		struct kvm_vcpu *vcpu, unsigned long liobn);
+extern long kvmppc_ioba_validate(struct kvmppc_spapr_tce_table *stt,
+		unsigned long ioba, unsigned long npages);
+extern long kvmppc_tce_validate(struct kvmppc_spapr_tce_table *tt,
+		unsigned long tce);
+extern long kvmppc_gpa_to_ua(struct kvm *kvm, unsigned long gpa,
+		unsigned long *ua, unsigned long **prmap);
+extern void kvmppc_tce_put(struct kvmppc_spapr_tce_table *tt,
+		unsigned long idx, unsigned long tce);
 extern long kvmppc_h_put_tce(struct kvm_vcpu *vcpu, unsigned long liobn,
 			     unsigned long ioba, unsigned long tce);
+extern long kvmppc_h_put_tce_indirect(struct kvm_vcpu *vcpu,
+		unsigned long liobn, unsigned long ioba,
+		unsigned long tce_list, unsigned long npages);
+extern long kvmppc_h_stuff_tce(struct kvm_vcpu *vcpu,
+		unsigned long liobn, unsigned long ioba,
+		unsigned long tce_value, unsigned long npages);
 extern long kvmppc_h_get_tce(struct kvm_vcpu *vcpu, unsigned long liobn,
 			     unsigned long ioba);
 extern struct page *kvm_alloc_hpt(unsigned long nr_pages);
@@ -437,6 +453,8 @@ static inline int kvmppc_xics_enabled(struct kvm_vcpu *vcpu)
 {
 	return vcpu->arch.irq_type == KVMPPC_IRQ_XICS;
 }
+extern void kvmppc_alloc_host_rm_ops(void);
+extern void kvmppc_free_host_rm_ops(void);
 extern void kvmppc_xics_free_icp(struct kvm_vcpu *vcpu);
 extern int kvmppc_xics_create_icp(struct kvm_vcpu *vcpu, unsigned long server);
 extern int kvm_vm_ioctl_xics_irq(struct kvm *kvm, struct kvm_irq_level *args);
@@ -445,7 +463,11 @@ extern u64 kvmppc_xics_get_icp(struct kvm_vcpu *vcpu);
 extern int kvmppc_xics_set_icp(struct kvm_vcpu *vcpu, u64 icpval);
 extern int kvmppc_xics_connect_vcpu(struct kvm_device *dev,
 			struct kvm_vcpu *vcpu, u32 cpu);
+extern void kvmppc_xics_ipi_action(void);
+extern int h_ipi_redirect;
 #else
+static inline void kvmppc_alloc_host_rm_ops(void) {};
+static inline void kvmppc_free_host_rm_ops(void) {};
 static inline int kvmppc_xics_enabled(struct kvm_vcpu *vcpu)
 	{ return 0; }
 static inline void kvmppc_xics_free_icp(struct kvm_vcpu *vcpu) { }
@@ -459,6 +481,33 @@ static inline int kvmppc_xics_hcall(struct kvm_vcpu *vcpu, u32 cmd)
 	{ return 0; }
 #endif
 
+/*
+ * Host-side operations we want to set up while running in real
+ * mode in the guest operating on the xics.
+ * Currently only VCPU wakeup is supported.
+ */
+
+union kvmppc_rm_state {
+	unsigned long raw;
+	struct {
+		u32 in_host;
+		u32 rm_action;
+	};
+};
+
+struct kvmppc_host_rm_core {
+	union kvmppc_rm_state rm_state;
+	void *rm_data;
+	char pad[112];
+};
+
+struct kvmppc_host_rm_ops {
+	struct kvmppc_host_rm_core	*rm_core;
+	void		(*vcpu_kick)(struct kvm_vcpu *vcpu);
+};
+
+extern struct kvmppc_host_rm_ops *kvmppc_host_rm_ops_hv;
+
 static inline unsigned long kvmppc_get_epr(struct kvm_vcpu *vcpu)
 {
 #ifdef CONFIG_KVM_BOOKE_HV
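The 112-byte pad makes each kvmppc_host_rm_core slot 8 + 8 + 112 = 128 bytes on 64-bit builds, presumably so that every core's slot sits on its own 128-byte cache line and concurrent rm_state updates do not bounce a shared line. A quick compile-time check of that arithmetic with stand-in types (not the kernel structures):

#include <assert.h>

union rm_state {
	unsigned long raw;
	struct { unsigned int in_host, rm_action; } s;
};

struct host_rm_core {
	union rm_state	rm_state;	/* 8 bytes on LP64 */
	void		*rm_data;	/* 8 bytes */
	char		pad[112];	/* fill the rest of the line */
};

static_assert(sizeof(struct host_rm_core) == 128,
	      "one slot per 128-byte cache line");

int main(void) { return 0; }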

+ 3 - 0
arch/powerpc/include/asm/pgtable.h

@@ -78,6 +78,9 @@ static inline pte_t *find_linux_pte_or_hugepte(pgd_t *pgdir, unsigned long ea,
 	}
 	return __find_linux_pte_or_hugepte(pgdir, ea, is_thp, shift);
 }
+
+unsigned long vmalloc_to_phys(void *vmalloc_addr);
+
 #endif /* __ASSEMBLY__ */
 
 #endif /* _ASM_POWERPC_PGTABLE_H */

+ 4 - 0
arch/powerpc/include/asm/smp.h

@@ -114,6 +114,9 @@ extern int cpu_to_core_id(int cpu);
 #define PPC_MSG_TICK_BROADCAST	2
 #define PPC_MSG_DEBUGGER_BREAK  3
 
+/* This is only used by the powernv kernel */
+#define PPC_MSG_RM_HOST_ACTION	4
+
 /* for irq controllers that have dedicated ipis per message (4) */
 extern int smp_request_message_ipi(int virq, int message);
 extern const char *smp_ipi_name[];
@@ -121,6 +124,7 @@ extern const char *smp_ipi_name[];
 /* for irq controllers with only a single ipi */
 extern void smp_muxed_ipi_set_data(int cpu, unsigned long data);
 extern void smp_muxed_ipi_message_pass(int cpu, int msg);
+extern void smp_muxed_ipi_set_message(int cpu, int msg);
 extern irqreturn_t smp_ipi_demux(void);
 
 void smp_init_pSeries(void);

+ 1 - 0
arch/powerpc/include/asm/xics.h

@@ -30,6 +30,7 @@
 #ifdef CONFIG_PPC_ICP_NATIVE
 extern int icp_native_init(void);
 extern void icp_native_flush_interrupt(void);
+extern void icp_native_cause_ipi_rm(int cpu);
 #else
 static inline int icp_native_init(void) { return -ENODEV; }
 #endif

+ 9 - 0
arch/powerpc/include/uapi/asm/kvm.h

@@ -333,6 +333,15 @@ struct kvm_create_spapr_tce {
 	__u32 window_size;
 };
 
+/* for KVM_CAP_SPAPR_TCE_64 */
+struct kvm_create_spapr_tce_64 {
+	__u64 liobn;
+	__u32 page_shift;
+	__u32 flags;
+	__u64 offset;	/* in pages */
+	__u64 size;	/* in pages */
+};
+
 /* for KVM_ALLOCATE_RMA */
 struct kvm_allocate_rma {
 	__u64 rma_size;
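The new structure is consumed by the 64-bit TCE ioctl that accompanies KVM_CAP_SPAPR_TCE_64; unlike the old window_size interface it can describe a window that starts at a non-zero bus offset and spans more than 4GB. A userspace sketch, assuming a <linux/kvm.h> that defines KVM_CREATE_SPAPR_TCE_64 and using made-up example values for the window geometry:

#include <stdio.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>

/* vm_fd is an existing KVM VM file descriptor; error handling trimmed */
int create_ddw(int vm_fd, unsigned long long liobn)
{
	struct kvm_create_spapr_tce_64 args = {
		.liobn		= liobn,
		.page_shift	= 16,			/* 64K IOMMU pages */
		.flags		= 0,
		.offset		= 1ULL << (59 - 16),	/* window at bus address 1<<59, in pages */
		.size		= 1ULL << (32 - 16),	/* 4GB window, in pages */
	};
	int tablefd = ioctl(vm_fd, KVM_CREATE_SPAPR_TCE_64, &args);

	if (tablefd < 0)
		perror("KVM_CREATE_SPAPR_TCE_64");
	return tablefd;	/* fd backing the TCE table, or -1 */
}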

+ 23 - 5
arch/powerpc/kernel/smp.c

@@ -206,7 +206,7 @@ int smp_request_message_ipi(int virq, int msg)
 
 #ifdef CONFIG_PPC_SMP_MUXED_IPI
 struct cpu_messages {
-	int messages;			/* current messages */
+	long messages;			/* current messages */
 	unsigned long data;		/* data for cause ipi */
 };
 static DEFINE_PER_CPU_SHARED_ALIGNED(struct cpu_messages, ipi_message);
@@ -218,7 +218,7 @@ void smp_muxed_ipi_set_data(int cpu, unsigned long data)
 	info->data = data;
 }
 
-void smp_muxed_ipi_message_pass(int cpu, int msg)
+void smp_muxed_ipi_set_message(int cpu, int msg)
 {
 	struct cpu_messages *info = &per_cpu(ipi_message, cpu);
 	char *message = (char *)&info->messages;
@@ -228,6 +228,13 @@ void smp_muxed_ipi_message_pass(int cpu, int msg)
 	 */
 	smp_mb();
 	message[msg] = 1;
+}
+
+void smp_muxed_ipi_message_pass(int cpu, int msg)
+{
+	struct cpu_messages *info = &per_cpu(ipi_message, cpu);
+
+	smp_muxed_ipi_set_message(cpu, msg);
 	/*
 	 * cause_ipi functions are required to include a full barrier
 	 * before doing whatever causes the IPI.
@@ -236,20 +243,31 @@ void smp_muxed_ipi_message_pass(int cpu, int msg)
 }
 
 #ifdef __BIG_ENDIAN__
-#define IPI_MESSAGE(A) (1 << (24 - 8 * (A)))
+#define IPI_MESSAGE(A) (1uL << ((BITS_PER_LONG - 8) - 8 * (A)))
 #else
-#define IPI_MESSAGE(A) (1 << (8 * (A)))
+#define IPI_MESSAGE(A) (1uL << (8 * (A)))
 #endif
 
 irqreturn_t smp_ipi_demux(void)
 {
 	struct cpu_messages *info = this_cpu_ptr(&ipi_message);
-	unsigned int all;
+	unsigned long all;
 
 	mb();	/* order any irq clear */
 
 	do {
 		all = xchg(&info->messages, 0);
+#if defined(CONFIG_KVM_XICS) && defined(CONFIG_KVM_BOOK3S_HV_POSSIBLE)
+		/*
+		 * Must check for PPC_MSG_RM_HOST_ACTION messages
+		 * before PPC_MSG_CALL_FUNCTION messages because when
+		 * a VM is destroyed, we call kick_all_cpus_sync()
+		 * to ensure that any pending PPC_MSG_RM_HOST_ACTION
+		 * messages have completed before we free any VCPUs.
+		 */
+		if (all & IPI_MESSAGE(PPC_MSG_RM_HOST_ACTION))
+			kvmppc_xics_ipi_action();
+#endif
 		if (all & IPI_MESSAGE(PPC_MSG_CALL_FUNCTION))
 			generic_smp_call_function_interrupt();
 		if (all & IPI_MESSAGE(PPC_MSG_RESCHEDULE))
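The widening of cpu_messages.messages from int to long is what makes room for the fifth message: each message number owns one byte of that word (smp_muxed_ipi_set_message() stores a 1 through a char pointer), and IPI_MESSAGE() is reworked so the bit it tests matches that byte for both endiannesses. A small host-side check of the correspondence:

#include <stdio.h>

#define BITS_PER_LONG	(8 * (int)sizeof(long))

#if defined(__BIG_ENDIAN__) || \
	(defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
#define IPI_MESSAGE(A)	(1uL << ((BITS_PER_LONG - 8) - 8 * (A)))
#else
#define IPI_MESSAGE(A)	(1uL << (8 * (A)))
#endif

int main(void)
{
	int msg;

	for (msg = 0; msg <= 4; msg++) {	/* message 4 needs the long */
		unsigned long word = 0;
		char *bytes = (char *)&word;

		bytes[msg] = 1;		/* what smp_muxed_ipi_set_message() does */
		printf("msg %d: byte-store %#18lx  IPI_MESSAGE %#18lx  %s\n",
		       msg, word, IPI_MESSAGE(msg),
		       word == IPI_MESSAGE(msg) ? "match" : "MISMATCH");
	}
	return 0;
}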

+ 1 - 1
arch/powerpc/kvm/Makefile

@@ -8,7 +8,7 @@ ccflags-y := -Ivirt/kvm -Iarch/powerpc/kvm
 KVM := ../../../virt/kvm
 
 common-objs-y = $(KVM)/kvm_main.o $(KVM)/coalesced_mmio.o \
-		$(KVM)/eventfd.o
+		$(KVM)/eventfd.o $(KVM)/vfio.o
 
 CFLAGS_e500_mmu.o := -I.
 CFLAGS_e500_mmu_host.o := -I.

+ 1 - 1
arch/powerpc/kvm/book3s.c

@@ -807,7 +807,7 @@ int kvmppc_core_init_vm(struct kvm *kvm)
 {
 
 #ifdef CONFIG_PPC64
-	INIT_LIST_HEAD(&kvm->arch.spapr_tce_tables);
+	INIT_LIST_HEAD_RCU(&kvm->arch.spapr_tce_tables);
 	INIT_LIST_HEAD(&kvm->arch.rtas_tokens);
 #endif
 

+ 136 - 20
arch/powerpc/kvm/book3s_64_vio.c

@@ -14,6 +14,7 @@
  *
  * Copyright 2010 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com>
  * Copyright 2011 David Gibson, IBM Corporation <dwg@au1.ibm.com>
+ * Copyright 2016 Alexey Kardashevskiy, IBM Corporation <aik@au1.ibm.com>
  */
 
 #include <linux/types.h>
@@ -36,28 +37,69 @@
 #include <asm/ppc-opcode.h>
 #include <asm/kvm_host.h>
 #include <asm/udbg.h>
+#include <asm/iommu.h>
+#include <asm/tce.h>
 
-#define TCES_PER_PAGE	(PAGE_SIZE / sizeof(u64))
+static unsigned long kvmppc_tce_pages(unsigned long iommu_pages)
+{
+	return ALIGN(iommu_pages * sizeof(u64), PAGE_SIZE) / PAGE_SIZE;
+}
 
-static long kvmppc_stt_npages(unsigned long window_size)
+static unsigned long kvmppc_stt_pages(unsigned long tce_pages)
 {
-	return ALIGN((window_size >> SPAPR_TCE_SHIFT)
-		     * sizeof(u64), PAGE_SIZE) / PAGE_SIZE;
+	unsigned long stt_bytes = sizeof(struct kvmppc_spapr_tce_table) +
+			(tce_pages * sizeof(struct page *));
+
+	return tce_pages + ALIGN(stt_bytes, PAGE_SIZE) / PAGE_SIZE;
 }
 
-static void release_spapr_tce_table(struct kvmppc_spapr_tce_table *stt)
+static long kvmppc_account_memlimit(unsigned long stt_pages, bool inc)
 {
-	struct kvm *kvm = stt->kvm;
-	int i;
+	long ret = 0;
 
-	mutex_lock(&kvm->lock);
-	list_del(&stt->list);
-	for (i = 0; i < kvmppc_stt_npages(stt->window_size); i++)
+	if (!current || !current->mm)
+		return ret; /* process exited */
+
+	down_write(&current->mm->mmap_sem);
+
+	if (inc) {
+		unsigned long locked, lock_limit;
+
+		locked = current->mm->locked_vm + stt_pages;
+		lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
+		if (locked > lock_limit && !capable(CAP_IPC_LOCK))
+			ret = -ENOMEM;
+		else
+			current->mm->locked_vm += stt_pages;
+	} else {
+		if (WARN_ON_ONCE(stt_pages > current->mm->locked_vm))
+			stt_pages = current->mm->locked_vm;
+
+		current->mm->locked_vm -= stt_pages;
+	}
+
+	pr_debug("[%d] RLIMIT_MEMLOCK KVM %c%ld %ld/%ld%s\n", current->pid,
+			inc ? '+' : '-',
+			stt_pages << PAGE_SHIFT,
+			current->mm->locked_vm << PAGE_SHIFT,
+			rlimit(RLIMIT_MEMLOCK),
+			ret ? " - exceeded" : "");
+
+	up_write(&current->mm->mmap_sem);
+
+	return ret;
+}
+
+static void release_spapr_tce_table(struct rcu_head *head)
+{
+	struct kvmppc_spapr_tce_table *stt = container_of(head,
+			struct kvmppc_spapr_tce_table, rcu);
+	unsigned long i, npages = kvmppc_tce_pages(stt->size);
+
+	for (i = 0; i < npages; i++)
 		__free_page(stt->pages[i]);
-	kfree(stt);
-	mutex_unlock(&kvm->lock);
 
-	kvm_put_kvm(kvm);
+	kfree(stt);
 }
 
 static int kvm_spapr_tce_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
@@ -65,7 +107,7 @@ static int kvm_spapr_tce_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
 	struct kvmppc_spapr_tce_table *stt = vma->vm_file->private_data;
 	struct page *page;
 
-	if (vmf->pgoff >= kvmppc_stt_npages(stt->window_size))
+	if (vmf->pgoff >= kvmppc_tce_pages(stt->size))
 		return VM_FAULT_SIGBUS;
 
 	page = stt->pages[vmf->pgoff];
@@ -88,7 +130,14 @@ static int kvm_spapr_tce_release(struct inode *inode, struct file *filp)
 {
 	struct kvmppc_spapr_tce_table *stt = filp->private_data;
 
-	release_spapr_tce_table(stt);
+	list_del_rcu(&stt->list);
+
+	kvm_put_kvm(stt->kvm);
+
+	kvmppc_account_memlimit(
+		kvmppc_stt_pages(kvmppc_tce_pages(stt->size)), false);
+	call_rcu(&stt->rcu, release_spapr_tce_table);
+
 	return 0;
 }
 
@@ -98,20 +147,29 @@ static const struct file_operations kvm_spapr_tce_fops = {
 };
 
 long kvm_vm_ioctl_create_spapr_tce(struct kvm *kvm,
-				   struct kvm_create_spapr_tce *args)
+				   struct kvm_create_spapr_tce_64 *args)
 {
 	struct kvmppc_spapr_tce_table *stt = NULL;
-	long npages;
+	unsigned long npages, size;
 	int ret = -ENOMEM;
 	int i;
 
+	if (!args->size)
+		return -EINVAL;
+
 	/* Check this LIOBN hasn't been previously allocated */
 	list_for_each_entry(stt, &kvm->arch.spapr_tce_tables, list) {
 		if (stt->liobn == args->liobn)
 			return -EBUSY;
 	}
 
-	npages = kvmppc_stt_npages(args->window_size);
+	size = args->size;
+	npages = kvmppc_tce_pages(size);
+	ret = kvmppc_account_memlimit(kvmppc_stt_pages(npages), true);
+	if (ret) {
+		stt = NULL;
+		goto fail;
+	}
 
 	stt = kzalloc(sizeof(*stt) + npages * sizeof(struct page *),
 		      GFP_KERNEL);
@@ -119,7 +177,9 @@ long kvm_vm_ioctl_create_spapr_tce(struct kvm *kvm,
 		goto fail;
 
 	stt->liobn = args->liobn;
-	stt->window_size = args->window_size;
+	stt->page_shift = args->page_shift;
+	stt->offset = args->offset;
+	stt->size = size;
 	stt->kvm = kvm;
 
 	for (i = 0; i < npages; i++) {
@@ -131,7 +191,7 @@ long kvm_vm_ioctl_create_spapr_tce(struct kvm *kvm,
 	kvm_get_kvm(kvm);
 
 	mutex_lock(&kvm->lock);
-	list_add(&stt->list, &kvm->arch.spapr_tce_tables);
+	list_add_rcu(&stt->list, &kvm->arch.spapr_tce_tables);
 
 	mutex_unlock(&kvm->lock);
 
@@ -148,3 +208,59 @@ fail:
 	}
 	return ret;
 }
+
+long kvmppc_h_put_tce_indirect(struct kvm_vcpu *vcpu,
+		unsigned long liobn, unsigned long ioba,
+		unsigned long tce_list, unsigned long npages)
+{
+	struct kvmppc_spapr_tce_table *stt;
+	long i, ret = H_SUCCESS, idx;
+	unsigned long entry, ua = 0;
+	u64 __user *tces, tce;
+
+	stt = kvmppc_find_table(vcpu, liobn);
+	if (!stt)
+		return H_TOO_HARD;
+
+	entry = ioba >> stt->page_shift;
+	/*
+	 * The SPAPR spec says that the maximum size of the list is 512 TCEs,
+	 * so the whole table fits in a single 4K page.
+	 */
+	if (npages > 512)
+		return H_PARAMETER;
+
+	if (tce_list & (SZ_4K - 1))
+		return H_PARAMETER;
+
+	ret = kvmppc_ioba_validate(stt, ioba, npages);
+	if (ret != H_SUCCESS)
+		return ret;
+
+	idx = srcu_read_lock(&vcpu->kvm->srcu);
+	if (kvmppc_gpa_to_ua(vcpu->kvm, tce_list, &ua, NULL)) {
+		ret = H_TOO_HARD;
+		goto unlock_exit;
+	}
+	tces = (u64 __user *) ua;
+
+	for (i = 0; i < npages; ++i) {
+		if (get_user(tce, tces + i)) {
+			ret = H_TOO_HARD;
+			goto unlock_exit;
+		}
+		tce = be64_to_cpu(tce);
+
+		ret = kvmppc_tce_validate(stt, tce);
+		if (ret != H_SUCCESS)
+			goto unlock_exit;
+
+		kvmppc_tce_put(stt, entry + i, tce);
+	}
+
+unlock_exit:
+	srcu_read_unlock(&vcpu->kvm->srcu, idx);
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(kvmppc_h_put_tce_indirect);

+ 288 - 42
arch/powerpc/kvm/book3s_64_vio_hv.c

@@ -14,6 +14,7 @@
  *
  * Copyright 2010 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com>
  * Copyright 2011 David Gibson, IBM Corporation <dwg@au1.ibm.com>
+ * Copyright 2016 Alexey Kardashevskiy, IBM Corporation <aik@au1.ibm.com>
  */
 
 #include <linux/types.h>
@@ -30,76 +31,321 @@
 #include <asm/kvm_ppc.h>
 #include <asm/kvm_book3s.h>
 #include <asm/mmu-hash64.h>
+#include <asm/mmu_context.h>
 #include <asm/hvcall.h>
 #include <asm/synch.h>
 #include <asm/ppc-opcode.h>
 #include <asm/kvm_host.h>
 #include <asm/udbg.h>
+#include <asm/iommu.h>
+#include <asm/tce.h>
 
 #define TCES_PER_PAGE	(PAGE_SIZE / sizeof(u64))
 
-/* WARNING: This will be called in real-mode on HV KVM and virtual
+/*
+ * Finds a TCE table descriptor by LIOBN.
+ *
+ * WARNING: This will be called in real or virtual mode on HV KVM and virtual
  *          mode on PR KVM
  */
-long kvmppc_h_put_tce(struct kvm_vcpu *vcpu, unsigned long liobn,
-		      unsigned long ioba, unsigned long tce)
+struct kvmppc_spapr_tce_table *kvmppc_find_table(struct kvm_vcpu *vcpu,
+		unsigned long liobn)
 {
 	struct kvm *kvm = vcpu->kvm;
 	struct kvmppc_spapr_tce_table *stt;
 
+	list_for_each_entry_lockless(stt, &kvm->arch.spapr_tce_tables, list)
+		if (stt->liobn == liobn)
+			return stt;
+
+	return NULL;
+}
+EXPORT_SYMBOL_GPL(kvmppc_find_table);
+
+/*
+ * Validates IO address.
+ *
+ * WARNING: This will be called in real-mode on HV KVM and virtual
+ *          mode on PR KVM
+ */
+long kvmppc_ioba_validate(struct kvmppc_spapr_tce_table *stt,
+		unsigned long ioba, unsigned long npages)
+{
+	unsigned long mask = (1ULL << stt->page_shift) - 1;
+	unsigned long idx = ioba >> stt->page_shift;
+
+	if ((ioba & mask) || (idx < stt->offset) ||
+			(idx - stt->offset + npages > stt->size) ||
+			(idx + npages < idx))
+		return H_PARAMETER;
+
+	return H_SUCCESS;
+}
+EXPORT_SYMBOL_GPL(kvmppc_ioba_validate);
+
+/*
+ * Validates TCE address.
+ * At the moment flags and page mask are validated.
+ * As the host kernel does not access those addresses (it just puts them
+ * into the table and user space is supposed to process them), we can skip
+ * checking other things (such as TCE is a guest RAM address or the page
+ * was actually allocated).
+ *
+ * WARNING: This will be called in real-mode on HV KVM and virtual
+ *          mode on PR KVM
+ */
+long kvmppc_tce_validate(struct kvmppc_spapr_tce_table *stt, unsigned long tce)
+{
+	unsigned long page_mask = ~((1ULL << stt->page_shift) - 1);
+	unsigned long mask = ~(page_mask | TCE_PCI_WRITE | TCE_PCI_READ);
+
+	if (tce & mask)
+		return H_PARAMETER;
+
+	return H_SUCCESS;
+}
+EXPORT_SYMBOL_GPL(kvmppc_tce_validate);
+
+/* Note on the use of page_address() in real mode,
+ *
+ * It is safe to use page_address() in real mode on ppc64 because
+ * page_address() is always defined as lowmem_page_address()
+ * which returns __va(PFN_PHYS(page_to_pfn(page))) which is arithmetic
+ * operation and does not access page struct.
+ *
+ * Theoretically page_address() could be defined differently,
+ * but either WANT_PAGE_VIRTUAL or HASHED_PAGE_VIRTUAL
+ * would have to be enabled.
+ * WANT_PAGE_VIRTUAL is never enabled on ppc32/ppc64,
+ * HASHED_PAGE_VIRTUAL could be enabled for ppc32 only and only
+ * if CONFIG_HIGHMEM is defined. As CONFIG_SPARSEMEM_VMEMMAP
+ * is not expected to be enabled on ppc32, page_address()
+ * is safe for ppc32 as well.
+ *
+ * WARNING: This will be called in real-mode on HV KVM and virtual
+ *          mode on PR KVM
+ */
+static u64 *kvmppc_page_address(struct page *page)
+{
+#if defined(HASHED_PAGE_VIRTUAL) || defined(WANT_PAGE_VIRTUAL)
+#error TODO: fix to avoid page_address() here
+#endif
+	return (u64 *) page_address(page);
+}
+
+/*
+ * Handles TCE requests for emulated devices.
+ * Puts guest TCE values to the table and expects user space to convert them.
+ * Called in both real and virtual modes.
+ * Cannot fail so kvmppc_tce_validate must be called before it.
+ *
+ * WARNING: This will be called in real-mode on HV KVM and virtual
+ *          mode on PR KVM
+ */
+void kvmppc_tce_put(struct kvmppc_spapr_tce_table *stt,
+		unsigned long idx, unsigned long tce)
+{
+	struct page *page;
+	u64 *tbl;
+
+	idx -= stt->offset;
+	page = stt->pages[idx / TCES_PER_PAGE];
+	tbl = kvmppc_page_address(page);
+
+	tbl[idx % TCES_PER_PAGE] = tce;
+}
+EXPORT_SYMBOL_GPL(kvmppc_tce_put);
+
+long kvmppc_gpa_to_ua(struct kvm *kvm, unsigned long gpa,
+		unsigned long *ua, unsigned long **prmap)
+{
+	unsigned long gfn = gpa >> PAGE_SHIFT;
+	struct kvm_memory_slot *memslot;
+
+	memslot = search_memslots(kvm_memslots(kvm), gfn);
+	if (!memslot)
+		return -EINVAL;
+
+	*ua = __gfn_to_hva_memslot(memslot, gfn) |
+		(gpa & ~(PAGE_MASK | TCE_PCI_READ | TCE_PCI_WRITE));
+
+#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
+	if (prmap)
+		*prmap = &memslot->arch.rmap[gfn - memslot->base_gfn];
+#endif
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(kvmppc_gpa_to_ua);
+
+#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
+long kvmppc_h_put_tce(struct kvm_vcpu *vcpu, unsigned long liobn,
+		      unsigned long ioba, unsigned long tce)
+{
+	struct kvmppc_spapr_tce_table *stt = kvmppc_find_table(vcpu, liobn);
+	long ret;
+
 	/* udbg_printf("H_PUT_TCE(): liobn=0x%lx ioba=0x%lx, tce=0x%lx\n", */
 	/* 	    liobn, ioba, tce); */
 
-	list_for_each_entry(stt, &kvm->arch.spapr_tce_tables, list) {
-		if (stt->liobn == liobn) {
-			unsigned long idx = ioba >> SPAPR_TCE_SHIFT;
-			struct page *page;
-			u64 *tbl;
-
-			/* udbg_printf("H_PUT_TCE: liobn 0x%lx => stt=%p  window_size=0x%x\n", */
-			/* 	    liobn, stt, stt->window_size); */
-			if (ioba >= stt->window_size)
-				return H_PARAMETER;
-
-			page = stt->pages[idx / TCES_PER_PAGE];
-			tbl = (u64 *)page_address(page);
-
-			/* FIXME: Need to validate the TCE itself */
-			/* udbg_printf("tce @ %p\n", &tbl[idx % TCES_PER_PAGE]); */
-			tbl[idx % TCES_PER_PAGE] = tce;
-			return H_SUCCESS;
-		}
-	}
+	if (!stt)
+		return H_TOO_HARD;
+
+	ret = kvmppc_ioba_validate(stt, ioba, 1);
+	if (ret != H_SUCCESS)
+		return ret;
 
-	/* Didn't find the liobn, punt it to userspace */
-	return H_TOO_HARD;
+	ret = kvmppc_tce_validate(stt, tce);
+	if (ret != H_SUCCESS)
+		return ret;
+
+	kvmppc_tce_put(stt, ioba >> stt->page_shift, tce);
+
+	return H_SUCCESS;
 }
 EXPORT_SYMBOL_GPL(kvmppc_h_put_tce);
 
-long kvmppc_h_get_tce(struct kvm_vcpu *vcpu, unsigned long liobn,
-		      unsigned long ioba)
+static long kvmppc_rm_ua_to_hpa(struct kvm_vcpu *vcpu,
+		unsigned long ua, unsigned long *phpa)
+{
+	pte_t *ptep, pte;
+	unsigned shift = 0;
+
+	ptep = __find_linux_pte_or_hugepte(vcpu->arch.pgdir, ua, NULL, &shift);
+	if (!ptep || !pte_present(*ptep))
+		return -ENXIO;
+	pte = *ptep;
+
+	if (!shift)
+		shift = PAGE_SHIFT;
+
+	/* Avoid handling anything potentially complicated in realmode */
+	if (shift > PAGE_SHIFT)
+		return -EAGAIN;
+
+	if (!pte_young(pte))
+		return -EAGAIN;
+
+	*phpa = (pte_pfn(pte) << PAGE_SHIFT) | (ua & ((1ULL << shift) - 1)) |
+			(ua & ~PAGE_MASK);
+
+	return 0;
+}
+
+long kvmppc_rm_h_put_tce_indirect(struct kvm_vcpu *vcpu,
+		unsigned long liobn, unsigned long ioba,
+		unsigned long tce_list,	unsigned long npages)
 {
-	struct kvm *kvm = vcpu->kvm;
 	struct kvmppc_spapr_tce_table *stt;
+	long i, ret = H_SUCCESS;
+	unsigned long tces, entry, ua = 0;
+	unsigned long *rmap = NULL;
+
+	stt = kvmppc_find_table(vcpu, liobn);
+	if (!stt)
+		return H_TOO_HARD;
+
+	entry = ioba >> stt->page_shift;
+	/*
+	 * The spec says that the maximum size of the list is 512 TCEs,
+	 * so the whole table (512 * 8 bytes) fits in a single 4K page.
+	 */
+	if (npages > 512)
+		return H_PARAMETER;
+
+	if (tce_list & (SZ_4K - 1))
+		return H_PARAMETER;
+
+	ret = kvmppc_ioba_validate(stt, ioba, npages);
+	if (ret != H_SUCCESS)
+		return ret;
 
-	list_for_each_entry(stt, &kvm->arch.spapr_tce_tables, list) {
-		if (stt->liobn == liobn) {
-			unsigned long idx = ioba >> SPAPR_TCE_SHIFT;
-			struct page *page;
-			u64 *tbl;
+	if (kvmppc_gpa_to_ua(vcpu->kvm, tce_list, &ua, &rmap))
+		return H_TOO_HARD;
 
-			if (ioba >= stt->window_size)
-				return H_PARAMETER;
+	rmap = (void *) vmalloc_to_phys(rmap);
 
-			page = stt->pages[idx / TCES_PER_PAGE];
-			tbl = (u64 *)page_address(page);
+	/*
+	 * Synchronize with the MMU notifier callbacks in
+	 * book3s_64_mmu_hv.c (kvm_unmap_hva_hv etc.).
+	 * While we have the rmap lock, code running on other CPUs
+	 * cannot finish unmapping the host real page that backs
+	 * this guest real page, so we are OK to access the host
+	 * real page.
+	 */
+	lock_rmap(rmap);
+	if (kvmppc_rm_ua_to_hpa(vcpu, ua, &tces)) {
+		ret = H_TOO_HARD;
+		goto unlock_exit;
+	}
+
+	for (i = 0; i < npages; ++i) {
+		unsigned long tce = be64_to_cpu(((u64 *)tces)[i]);
+
+		ret = kvmppc_tce_validate(stt, tce);
+		if (ret != H_SUCCESS)
+			goto unlock_exit;
 
-			vcpu->arch.gpr[4] = tbl[idx % TCES_PER_PAGE];
-			return H_SUCCESS;
-		}
+		kvmppc_tce_put(stt, entry + i, tce);
 	}
 
-	/* Didn't find the liobn, punt it to userspace */
-	return H_TOO_HARD;
+unlock_exit:
+	unlock_rmap(rmap);
+
+	return ret;
+}
+
+long kvmppc_h_stuff_tce(struct kvm_vcpu *vcpu,
+		unsigned long liobn, unsigned long ioba,
+		unsigned long tce_value, unsigned long npages)
+{
+	struct kvmppc_spapr_tce_table *stt;
+	long i, ret;
+
+	stt = kvmppc_find_table(vcpu, liobn);
+	if (!stt)
+		return H_TOO_HARD;
+
+	ret = kvmppc_ioba_validate(stt, ioba, npages);
+	if (ret != H_SUCCESS)
+		return ret;
+
+	/* Check permission bits only, to allow userspace to poison TCEs for debug */
+	if (tce_value & (TCE_PCI_WRITE | TCE_PCI_READ))
+		return H_PARAMETER;
+
+	for (i = 0; i < npages; ++i, ioba += (1ULL << stt->page_shift))
+		kvmppc_tce_put(stt, ioba >> stt->page_shift, tce_value);
+
+	return H_SUCCESS;
+}
+EXPORT_SYMBOL_GPL(kvmppc_h_stuff_tce);
+
+long kvmppc_h_get_tce(struct kvm_vcpu *vcpu, unsigned long liobn,
+		      unsigned long ioba)
+{
+	struct kvmppc_spapr_tce_table *stt = kvmppc_find_table(vcpu, liobn);
+	long ret;
+	unsigned long idx;
+	struct page *page;
+	u64 *tbl;
+
+	if (!stt)
+		return H_TOO_HARD;
+
+	ret = kvmppc_ioba_validate(stt, ioba, 1);
+	if (ret != H_SUCCESS)
+		return ret;
+
+	idx = (ioba >> stt->page_shift) - stt->offset;
+	page = stt->pages[idx / TCES_PER_PAGE];
+	tbl = (u64 *)page_address(page);
+
+	vcpu->arch.gpr[4] = tbl[idx % TCES_PER_PAGE];
+
+	return H_SUCCESS;
 }
 EXPORT_SYMBOL_GPL(kvmppc_h_get_tce);
+
+#endif /* CONFIG_KVM_BOOK3S_HV_POSSIBLE */

+ 191 - 1
arch/powerpc/kvm/book3s_hv.c

@@ -81,6 +81,17 @@ static int target_smt_mode;
 module_param(target_smt_mode, int, S_IRUGO | S_IWUSR);
 MODULE_PARM_DESC(target_smt_mode, "Target threads per core (0 = max)");
 
+#ifdef CONFIG_KVM_XICS
+static struct kernel_param_ops module_param_ops = {
+	.set = param_set_int,
+	.get = param_get_int,
+};
+
+module_param_cb(h_ipi_redirect, &module_param_ops, &h_ipi_redirect,
+							S_IRUGO | S_IWUSR);
+MODULE_PARM_DESC(h_ipi_redirect, "Redirect H_IPI wakeup to a free host core");
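+/*
+ * The parameter is registered read-write (S_IRUGO | S_IWUSR), so it can
+ * presumably be toggled at runtime via sysfs, e.g.
+ * /sys/module/kvm_hv/parameters/h_ipi_redirect (module name assumed).
+ */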
+#endif
+
 static void kvmppc_end_cede(struct kvm_vcpu *vcpu);
 static int kvmppc_hv_setup_htab_rma(struct kvm_vcpu *vcpu);
 
@@ -768,7 +779,31 @@ int kvmppc_pseries_do_hcall(struct kvm_vcpu *vcpu)
 		if (kvmppc_xics_enabled(vcpu)) {
 			ret = kvmppc_xics_hcall(vcpu, req);
 			break;
-		} /* fallthrough */
+		}
+		return RESUME_HOST;
+	case H_PUT_TCE:
+		ret = kvmppc_h_put_tce(vcpu, kvmppc_get_gpr(vcpu, 4),
+						kvmppc_get_gpr(vcpu, 5),
+						kvmppc_get_gpr(vcpu, 6));
+		if (ret == H_TOO_HARD)
+			return RESUME_HOST;
+		break;
+	case H_PUT_TCE_INDIRECT:
+		ret = kvmppc_h_put_tce_indirect(vcpu, kvmppc_get_gpr(vcpu, 4),
+						kvmppc_get_gpr(vcpu, 5),
+						kvmppc_get_gpr(vcpu, 6),
+						kvmppc_get_gpr(vcpu, 7));
+		if (ret == H_TOO_HARD)
+			return RESUME_HOST;
+		break;
+	case H_STUFF_TCE:
+		ret = kvmppc_h_stuff_tce(vcpu, kvmppc_get_gpr(vcpu, 4),
+						kvmppc_get_gpr(vcpu, 5),
+						kvmppc_get_gpr(vcpu, 6),
+						kvmppc_get_gpr(vcpu, 7));
+		if (ret == H_TOO_HARD)
+			return RESUME_HOST;
+		break;
 	default:
 		return RESUME_HOST;
 	}
@@ -2278,6 +2313,46 @@ static void post_guest_process(struct kvmppc_vcore *vc, bool is_master)
 	spin_unlock(&vc->lock);
 }
 
+/*
+ * Clear core from the list of active host cores as we are about to
+ * enter the guest. Only do this if it is the primary thread of the
+ * core (not merely of a subcore) that is entering the guest.
+ */
+static inline void kvmppc_clear_host_core(int cpu)
+{
+	int core;
+
+	if (!kvmppc_host_rm_ops_hv || cpu_thread_in_core(cpu))
+		return;
+	/*
+	 * The memory barrier can be omitted here as we will do a smp_wmb()
+	 * later in kvmppc_start_thread, and the state only needs to be
+	 * visible to other CPUs once we enter the guest.
+	 */
+	core = cpu >> threads_shift;
+	kvmppc_host_rm_ops_hv->rm_core[core].rm_state.in_host = 0;
+}
+
+/*
+ * Advertise this core as an active host core since we exited the guest
+ * Only need to do this if it is the primary thread of the core that is
+ * exiting.
+ */
+static inline void kvmppc_set_host_core(int cpu)
+{
+	int core;
+
+	if (!kvmppc_host_rm_ops_hv || cpu_thread_in_core(cpu))
+		return;
+
+	/*
+	 * Memory barrier can be omitted here because we do a spin_unlock
+	 * immediately after this which provides the memory barrier.
+	 */
+	core = cpu >> threads_shift;
+	kvmppc_host_rm_ops_hv->rm_core[core].rm_state.in_host = 1;
+}
+
 /*
  * Run a set of guest threads on a physical core.
  * Called with vc->lock held.
@@ -2390,6 +2465,8 @@ static noinline void kvmppc_run_core(struct kvmppc_vcore *vc)
 		}
 	}
 
+	kvmppc_clear_host_core(pcpu);
+
 	/* Start all the threads */
 	active = 0;
 	for (sub = 0; sub < core_info.n_subcores; ++sub) {
@@ -2486,6 +2563,8 @@ static noinline void kvmppc_run_core(struct kvmppc_vcore *vc)
 			kvmppc_ipi_thread(pcpu + i);
 	}
 
+	kvmppc_set_host_core(pcpu);
+
 	spin_unlock(&vc->lock);
 
 	/* make sure updates to secondary vcpu structs are visible now */
@@ -2983,6 +3062,114 @@ static int kvmppc_hv_setup_htab_rma(struct kvm_vcpu *vcpu)
 	goto out_srcu;
 }
 
+#ifdef CONFIG_KVM_XICS
+static int kvmppc_cpu_notify(struct notifier_block *self, unsigned long action,
+			void *hcpu)
+{
+	unsigned long cpu = (long)hcpu;
+
+	switch (action) {
+	case CPU_UP_PREPARE:
+	case CPU_UP_PREPARE_FROZEN:
+		kvmppc_set_host_core(cpu);
+		break;
+
+#ifdef CONFIG_HOTPLUG_CPU
+	case CPU_DEAD:
+	case CPU_DEAD_FROZEN:
+	case CPU_UP_CANCELED:
+	case CPU_UP_CANCELED_FROZEN:
+		kvmppc_clear_host_core(cpu);
+		break;
+#endif
+	default:
+		break;
+	}
+
+	return NOTIFY_OK;
+}
+
+static struct notifier_block kvmppc_cpu_notifier = {
+	    .notifier_call = kvmppc_cpu_notify,
+};
+
+/*
+ * Allocate a per-core structure for managing state about which cores are
+ * running in the host versus the guest and for exchanging data between
+ * real-mode KVM and CPUs running in the host.
+ * This is only done for the first VM.
+ * The allocated structure stays even if all VMs have stopped.
+ * It is only freed when the kvm-hv module is unloaded.
+ * It's OK for this routine to fail; we just don't support host
+ * core operations like redirecting H_IPI wakeups.
+ */
+void kvmppc_alloc_host_rm_ops(void)
+{
+	struct kvmppc_host_rm_ops *ops;
+	unsigned long l_ops;
+	int cpu, core;
+	int size;
+
+	/* Not the first time here ? */
+	if (kvmppc_host_rm_ops_hv != NULL)
+		return;
+
+	ops = kzalloc(sizeof(struct kvmppc_host_rm_ops), GFP_KERNEL);
+	if (!ops)
+		return;
+
+	size = cpu_nr_cores() * sizeof(struct kvmppc_host_rm_core);
+	ops->rm_core = kzalloc(size, GFP_KERNEL);
+
+	if (!ops->rm_core) {
+		kfree(ops);
+		return;
+	}
+
+	get_online_cpus();
+
+	for (cpu = 0; cpu < nr_cpu_ids; cpu += threads_per_core) {
+		if (!cpu_online(cpu))
+			continue;
+
+		core = cpu >> threads_shift;
+		ops->rm_core[core].rm_state.in_host = 1;
+	}
+
+	ops->vcpu_kick = kvmppc_fast_vcpu_kick_hv;
+
+	/*
+	 * Make the contents of the kvmppc_host_rm_ops structure visible
+	 * to other CPUs before we assign it to the global variable.
+	 * Do an atomic assignment (no locks used here), but if someone
+	 * beats us to it, just free our copy and return.
+	 */
+	smp_wmb();
+	l_ops = (unsigned long) ops;
+
+	if (cmpxchg64((unsigned long *)&kvmppc_host_rm_ops_hv, 0, l_ops)) {
+		put_online_cpus();
+		kfree(ops->rm_core);
+		kfree(ops);
+		return;
+	}
+
+	register_cpu_notifier(&kvmppc_cpu_notifier);
+
+	put_online_cpus();
+}
+
+void kvmppc_free_host_rm_ops(void)
+{
+	if (kvmppc_host_rm_ops_hv) {
+		unregister_cpu_notifier(&kvmppc_cpu_notifier);
+		kfree(kvmppc_host_rm_ops_hv->rm_core);
+		kfree(kvmppc_host_rm_ops_hv);
+		kvmppc_host_rm_ops_hv = NULL;
+	}
+}
+#endif
+
 static int kvmppc_core_init_vm_hv(struct kvm *kvm)
 {
 	unsigned long lpcr, lpid;
@@ -2995,6 +3182,8 @@ static int kvmppc_core_init_vm_hv(struct kvm *kvm)
 		return -ENOMEM;
 	kvm->arch.lpid = lpid;
 
+	kvmppc_alloc_host_rm_ops();
+
 	/*
 	 * Since we don't flush the TLB when tearing down a VM,
 	 * and this lpid might have previously been used,
@@ -3228,6 +3417,7 @@ static int kvmppc_book3s_init_hv(void)
 
 static void kvmppc_book3s_exit_hv(void)
 {
+	kvmppc_free_host_rm_ops();
 	kvmppc_hv_ops = NULL;
 }
 

+ 3 - 0
arch/powerpc/kvm/book3s_hv_builtin.c

@@ -283,3 +283,6 @@ void kvmhv_commence_exit(int trap)
 			kvmhv_interrupt_vcore(vc, ee);
 	}
 }
+
+struct kvmppc_host_rm_ops *kvmppc_host_rm_ops_hv;
+EXPORT_SYMBOL_GPL(kvmppc_host_rm_ops_hv);

+ 128 - 3
arch/powerpc/kvm/book3s_hv_rm_xics.c

@@ -17,12 +17,16 @@
 #include <asm/xics.h>
 #include <asm/debug.h>
 #include <asm/synch.h>
+#include <asm/cputhreads.h>
 #include <asm/ppc-opcode.h>
 
 #include "book3s_xics.h"
 
 #define DEBUG_PASSUP
 
+int h_ipi_redirect = 1;
+EXPORT_SYMBOL(h_ipi_redirect);
+
 static void icp_rm_deliver_irq(struct kvmppc_xics *xics, struct kvmppc_icp *icp,
 			    u32 new_irq);
 
@@ -50,11 +54,84 @@ static void ics_rm_check_resend(struct kvmppc_xics *xics,
 
 /* -- ICP routines -- */
 
+#ifdef CONFIG_SMP
+static inline void icp_send_hcore_msg(int hcore, struct kvm_vcpu *vcpu)
+{
+	int hcpu;
+
+	hcpu = hcore << threads_shift;
+	kvmppc_host_rm_ops_hv->rm_core[hcore].rm_data = vcpu;
+	smp_muxed_ipi_set_message(hcpu, PPC_MSG_RM_HOST_ACTION);
+	icp_native_cause_ipi_rm(hcpu);
+}
+#else
+static inline void icp_send_hcore_msg(int hcore, struct kvm_vcpu *vcpu) { }
+#endif
+
+/*
+ * We start the search from our current CPU Id in the core map
+ * and go in a circle until we get back to our ID looking for a
+ * core that is running in host context and that hasn't already
+ * been targeted for another rm_host_ops.
+ *
+ * In the future, we could consider using a fairer algorithm (one
+ * that distributes the IPIs better)
+ *
+ * Returns -1 if no suitable core could be found in the host.
+ * Otherwise, returns the core ID that has been reserved for this action.
+ */
+static inline int grab_next_hostcore(int start,
+		struct kvmppc_host_rm_core *rm_core, int max, int action)
+{
+	bool success;
+	int core;
+	union kvmppc_rm_state old, new;
+
+	for (core = start + 1; core < max; core++)  {
+		old = new = READ_ONCE(rm_core[core].rm_state);
+
+		if (!old.in_host || old.rm_action)
+			continue;
+
+		/* Try to grab this host core if not taken already. */
+		new.rm_action = action;
+
+		success = cmpxchg64(&rm_core[core].rm_state.raw,
+						old.raw, new.raw) == old.raw;
+		if (success) {
+			/*
+			 * Make sure that the store to the rm_action is made
+			 * visible before we return to caller (and the
+			 * subsequent store to rm_data) to synchronize with
+			 * the IPI handler.
+			 */
+			smp_wmb();
+			return core;
+		}
+	}
+
+	return -1;
+}
+
+static inline int find_available_hostcore(int action)
+{
+	int core;
+	int my_core = smp_processor_id() >> threads_shift;
+	struct kvmppc_host_rm_core *rm_core = kvmppc_host_rm_ops_hv->rm_core;
+
+	core = grab_next_hostcore(my_core, rm_core, cpu_nr_cores(), action);
+	if (core == -1)
+		core = grab_next_hostcore(core, rm_core, my_core, action);
+
+	return core;
+}
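+
+/*
+ * Worked example of the two-pass search above: with cpu_nr_cores() == 8 and
+ * the caller running on core 5, the first grab_next_hostcore() call scans
+ * cores 6 and 7; if neither of those is available, the second call wraps
+ * around and scans cores 0 to 4. The caller's own core is never considered.
+ */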
+
 static void icp_rm_set_vcpu_irq(struct kvm_vcpu *vcpu,
 				struct kvm_vcpu *this_vcpu)
 {
 	struct kvmppc_icp *this_icp = this_vcpu->arch.icp;
 	int cpu;
+	int hcore;
 
 	/* Mark the target VCPU as having an interrupt pending */
 	vcpu->stat.queue_intr++;
@@ -66,11 +143,22 @@ static void icp_rm_set_vcpu_irq(struct kvm_vcpu *vcpu,
 		return;
 	}
 
-	/* Check if the core is loaded, if not, too hard */
+	/*
+	 * Check if the core is loaded,
+	 * if not, find an available host core to post to wake the VCPU,
+	 * if we can't find one, set up state to eventually return too hard.
+	 */
 	cpu = vcpu->arch.thread_cpu;
 	if (cpu < 0 || cpu >= nr_cpu_ids) {
-		this_icp->rm_action |= XICS_RM_KICK_VCPU;
-		this_icp->rm_kick_target = vcpu;
+		hcore = -1;
+		if (kvmppc_host_rm_ops_hv && h_ipi_redirect)
+			hcore = find_available_hostcore(XICS_RM_KICK_VCPU);
+		if (hcore != -1) {
+			icp_send_hcore_msg(hcore, vcpu);
+		} else {
+			this_icp->rm_action |= XICS_RM_KICK_VCPU;
+			this_icp->rm_kick_target = vcpu;
+		}
 		return;
 	}
 
@@ -623,3 +711,40 @@ int kvmppc_rm_h_eoi(struct kvm_vcpu *vcpu, unsigned long xirr)
  bail:
 	return check_too_hard(xics, icp);
 }
+
+/*  --- Non-real mode XICS-related built-in routines ---  */
+
+/*
+ * Host operations poked by real-mode KVM
+ */
+static void rm_host_ipi_action(int action, void *data)
+{
+	switch (action) {
+	case XICS_RM_KICK_VCPU:
+		kvmppc_host_rm_ops_hv->vcpu_kick(data);
+		break;
+	default:
+		WARN(1, "Unexpected rm_action=%d data=%p\n", action, data);
+		break;
+	}
+
+}
+
+void kvmppc_xics_ipi_action(void)
+{
+	int core;
+	unsigned int cpu = smp_processor_id();
+	struct kvmppc_host_rm_core *rm_corep;
+
+	core = cpu >> threads_shift;
+	rm_corep = &kvmppc_host_rm_ops_hv->rm_core[core];
+
+	if (rm_corep->rm_data) {
+		rm_host_ipi_action(rm_corep->rm_state.rm_action,
+							rm_corep->rm_data);
+		/* Order these stores against the real mode KVM */
+		rm_corep->rm_data = NULL;
+		smp_wmb();
+		rm_corep->rm_state.rm_action = 0;
+	}
+}

+ 2 - 2
arch/powerpc/kvm/book3s_hv_rmhandlers.S

@@ -2020,8 +2020,8 @@ hcall_real_table:
 	.long	0		/* 0x12c */
 	.long	0		/* 0x130 */
 	.long	DOTSYM(kvmppc_h_set_xdabr) - hcall_real_table
-	.long	0		/* 0x138 */
-	.long	0		/* 0x13c */
+	.long	DOTSYM(kvmppc_h_stuff_tce) - hcall_real_table
+	.long	DOTSYM(kvmppc_rm_h_put_tce_indirect) - hcall_real_table
 	.long	0		/* 0x140 */
 	.long	0		/* 0x144 */
 	.long	0		/* 0x148 */

+ 35 - 0
arch/powerpc/kvm/book3s_pr_papr.c

@@ -280,6 +280,37 @@ static int kvmppc_h_pr_logical_ci_store(struct kvm_vcpu *vcpu)
 	return EMULATE_DONE;
 }
 
+static int kvmppc_h_pr_put_tce_indirect(struct kvm_vcpu *vcpu)
+{
+	unsigned long liobn = kvmppc_get_gpr(vcpu, 4);
+	unsigned long ioba = kvmppc_get_gpr(vcpu, 5);
+	unsigned long tce = kvmppc_get_gpr(vcpu, 6);
+	unsigned long npages = kvmppc_get_gpr(vcpu, 7);
+	long rc;
+
+	rc = kvmppc_h_put_tce_indirect(vcpu, liobn, ioba,
+			tce, npages);
+	if (rc == H_TOO_HARD)
+		return EMULATE_FAIL;
+	kvmppc_set_gpr(vcpu, 3, rc);
+	return EMULATE_DONE;
+}
+
+static int kvmppc_h_pr_stuff_tce(struct kvm_vcpu *vcpu)
+{
+	unsigned long liobn = kvmppc_get_gpr(vcpu, 4);
+	unsigned long ioba = kvmppc_get_gpr(vcpu, 5);
+	unsigned long tce_value = kvmppc_get_gpr(vcpu, 6);
+	unsigned long npages = kvmppc_get_gpr(vcpu, 7);
+	long rc;
+
+	rc = kvmppc_h_stuff_tce(vcpu, liobn, ioba, tce_value, npages);
+	if (rc == H_TOO_HARD)
+		return EMULATE_FAIL;
+	kvmppc_set_gpr(vcpu, 3, rc);
+	return EMULATE_DONE;
+}
+
 static int kvmppc_h_pr_xics_hcall(struct kvm_vcpu *vcpu, u32 cmd)
 {
 	long rc = kvmppc_xics_hcall(vcpu, cmd);
@@ -306,6 +337,10 @@ int kvmppc_h_pr(struct kvm_vcpu *vcpu, unsigned long cmd)
 		return kvmppc_h_pr_bulk_remove(vcpu);
 	case H_PUT_TCE:
 		return kvmppc_h_pr_put_tce(vcpu);
+	case H_PUT_TCE_INDIRECT:
+		return kvmppc_h_pr_put_tce_indirect(vcpu);
+	case H_STUFF_TCE:
+		return kvmppc_h_pr_stuff_tce(vcpu);
 	case H_CEDE:
 		kvmppc_set_msr_fast(vcpu, kvmppc_get_msr(vcpu) | MSR_EE);
 		kvm_vcpu_block(vcpu);

+ 37 - 1
arch/powerpc/kvm/powerpc.c

@@ -33,6 +33,7 @@
 #include <asm/tlbflush.h>
 #include <asm/cputhreads.h>
 #include <asm/irqflags.h>
+#include <asm/iommu.h>
 #include "timing.h"
 #include "irq.h"
 #include "../mm/mmu_decl.h"
@@ -437,6 +438,16 @@ void kvm_arch_destroy_vm(struct kvm *kvm)
 	unsigned int i;
 	struct kvm_vcpu *vcpu;
 
+#ifdef CONFIG_KVM_XICS
+	/*
+	 * We call kick_all_cpus_sync() to ensure that all
+	 * CPUs have executed any pending IPIs before we
+	 * continue and free VCPUs structures below.
+	 * continue and free the VCPU structures below.
+	if (is_kvmppc_hv_enabled(kvm))
+		kick_all_cpus_sync();
+#endif
+
 	kvm_for_each_vcpu(i, vcpu, kvm)
 		kvm_arch_vcpu_free(vcpu);
 
@@ -509,6 +520,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
 
 #ifdef CONFIG_PPC_BOOK3S_64
 	case KVM_CAP_SPAPR_TCE:
+	case KVM_CAP_SPAPR_TCE_64:
 	case KVM_CAP_PPC_ALLOC_HTAB:
 	case KVM_CAP_PPC_RTAS:
 	case KVM_CAP_PPC_FIXUP_HCALL:
@@ -569,6 +581,9 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
 	case KVM_CAP_PPC_GET_SMMU_INFO:
 		r = 1;
 		break;
+	case KVM_CAP_SPAPR_MULTITCE:
+		r = 1;
+		break;
 #endif
 	default:
 		r = 0;
@@ -1331,13 +1346,34 @@ long kvm_arch_vm_ioctl(struct file *filp,
 		break;
 	}
 #ifdef CONFIG_PPC_BOOK3S_64
+	case KVM_CREATE_SPAPR_TCE_64: {
+		struct kvm_create_spapr_tce_64 create_tce_64;
+
+		r = -EFAULT;
+		if (copy_from_user(&create_tce_64, argp, sizeof(create_tce_64)))
+			goto out;
+		if (create_tce_64.flags) {
+			r = -EINVAL;
+			goto out;
+		}
+		r = kvm_vm_ioctl_create_spapr_tce(kvm, &create_tce_64);
+		goto out;
+	}
 	case KVM_CREATE_SPAPR_TCE: {
 		struct kvm_create_spapr_tce create_tce;
+		struct kvm_create_spapr_tce_64 create_tce_64;
 
 		r = -EFAULT;
 		if (copy_from_user(&create_tce, argp, sizeof(create_tce)))
 			goto out;
-		r = kvm_vm_ioctl_create_spapr_tce(kvm, &create_tce);
+
+		create_tce_64.liobn = create_tce.liobn;
+		create_tce_64.page_shift = IOMMU_PAGE_SHIFT_4K;
+		create_tce_64.offset = 0;
+		create_tce_64.size = create_tce.window_size >>
+				IOMMU_PAGE_SHIFT_4K;
+		create_tce_64.flags = 0;
+		r = kvm_vm_ioctl_create_spapr_tce(kvm, &create_tce_64);
 		goto out;
 	}
 	case KVM_PPC_GET_SMMU_INFO: {

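The conversion above also shows how the new ioctl is meant to be driven from user space. A minimal sketch, assuming the uapi definitions added by this series; the helper name and the 256MB/4K window size are illustrative, mirroring what the legacy KVM_CREATE_SPAPR_TCE path sets up:

#include <sys/ioctl.h>
#include <linux/kvm.h>

/* Create a 64-bit DMA window equivalent to the legacy 256MB/4K one. */
static int create_tce64_window(int vm_fd, unsigned long liobn)
{
	struct kvm_create_spapr_tce_64 args = {
		.liobn      = liobn,
		.page_shift = 12,			/* 4K IOMMU pages */
		.offset     = 0,			/* window starts at DMA address 0 */
		.size       = 0x10000000 >> 12,		/* 256MB worth of TCEs */
		.flags      = 0,			/* must be zero, see the check above */
	};

	/* On success KVM returns a new file descriptor for the TCE table. */
	return ioctl(vm_fd, KVM_CREATE_SPAPR_TCE_64, &args);
}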
+ 8 - 0
arch/powerpc/mm/pgtable.c

@@ -243,3 +243,11 @@ void assert_pte_locked(struct mm_struct *mm, unsigned long addr)
 }
 #endif /* CONFIG_DEBUG_VM */
 
+unsigned long vmalloc_to_phys(void *va)
+{
+	unsigned long pfn = vmalloc_to_pfn(va);
+
+	BUG_ON(!pfn);
+	return __pa(pfn_to_kaddr(pfn)) + offset_in_page(va);
+}
+EXPORT_SYMBOL_GPL(vmalloc_to_phys);

+ 0 - 8
arch/powerpc/perf/hv-24x7.c

@@ -493,14 +493,6 @@ static size_t event_to_attr_ct(struct hv_24x7_event_data *event)
 	}
 }
 
-static unsigned long vmalloc_to_phys(void *v)
-{
-	struct page *p = vmalloc_to_page(v);
-
-	BUG_ON(!p);
-	return page_to_phys(p) + offset_in_page(v);
-}
-
 /* */
 struct event_uniq {
 	struct rb_node node;

+ 21 - 0
arch/powerpc/sysdev/xics/icp-native.c

@@ -159,6 +159,27 @@ static void icp_native_cause_ipi(int cpu, unsigned long data)
 	icp_native_set_qirr(cpu, IPI_PRIORITY);
 }
 
+#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
+void icp_native_cause_ipi_rm(int cpu)
+{
+	/*
+	 * Currently not used to send IPIs to another CPU
+	 * on the same core. Only caller is KVM real mode.
+	 * on the same core. The only caller is KVM real mode.
+	 * The physical address of the XICS must have been
+	 * saved previously in kvm_hstate in the paca.
+	unsigned long xics_phys;
+
+	/*
+	 * Just like the cause_ipi functions, it is required to
+	 * include a full barrier (out8 includes a sync) before
+	 * causing the IPI.
+	 */
+	xics_phys = paca[cpu].kvm_hstate.xics_phys;
+	out_rm8((u8 *)(xics_phys + XICS_MFRR), IPI_PRIORITY);
+}
+#endif
+
 /*
  * Called when an interrupt is received on an off-line CPU to
  * clear the interrupt, so that the CPU can go back to nap mode.

+ 26 - 15
arch/s390/include/asm/kvm_host.h

@@ -20,6 +20,7 @@
 #include <linux/kvm_types.h>
 #include <linux/kvm_host.h>
 #include <linux/kvm.h>
+#include <linux/seqlock.h>
 #include <asm/debug.h>
 #include <asm/cpu.h>
 #include <asm/fpu/api.h>
@@ -229,17 +230,11 @@ struct kvm_s390_itdb {
 	__u8	data[256];
 } __packed;
 
-struct kvm_s390_vregs {
-	__vector128 vrs[32];
-	__u8	reserved200[512];	/* for future vector expansion */
-} __packed;
-
 struct sie_page {
 	struct kvm_s390_sie_block sie_block;
 	__u8 reserved200[1024];		/* 0x0200 */
 	struct kvm_s390_itdb itdb;	/* 0x0600 */
-	__u8 reserved700[1280];		/* 0x0700 */
-	struct kvm_s390_vregs vregs;	/* 0x0c00 */
+	__u8 reserved700[2304];		/* 0x0700 */
 } __packed;
 
 struct kvm_vcpu_stat {
@@ -558,6 +553,15 @@ struct kvm_vcpu_arch {
 	unsigned long pfault_token;
 	unsigned long pfault_select;
 	unsigned long pfault_compare;
+	bool cputm_enabled;
+	/*
+	 * The seqcount protects updates to cputm_start and sie_block.cputm,
+	 * this way we can have non-blocking reads with consistent values.
+	 * Only the owning VCPU thread (vcpu->cpu) is allowed to change these
+	 * values and to start/stop/enable/disable cpu timer accounting.
+	 */
+	seqcount_t cputm_seqcount;
+	__u64 cputm_start;
 };
 
 struct kvm_vm_stat {
@@ -596,15 +600,11 @@ struct s390_io_adapter {
 #define S390_ARCH_FAC_MASK_SIZE_U64 \
 	(S390_ARCH_FAC_MASK_SIZE_BYTE / sizeof(u64))
 
-struct kvm_s390_fac {
-	/* facility list requested by guest */
-	__u64 list[S390_ARCH_FAC_LIST_SIZE_U64];
-	/* facility mask supported by kvm & hosting machine */
-	__u64 mask[S390_ARCH_FAC_LIST_SIZE_U64];
-};
-
 struct kvm_s390_cpu_model {
-	struct kvm_s390_fac *fac;
+	/* facility mask supported by kvm & hosting machine */
+	__u64 fac_mask[S390_ARCH_FAC_LIST_SIZE_U64];
+	/* facility list requested by guest (in dma page) */
+	__u64 *fac_list;
 	struct cpuid cpu_id;
 	unsigned short ibc;
 };
@@ -623,6 +623,16 @@ struct kvm_s390_crypto_cb {
 	__u8    reserved80[128];                /* 0x0080 */
 };
 
+/*
+ * sie_page2 has to be allocated as DMA because fac_list and crycb need
+ * 31bit addresses in the sie control block.
+ */
+struct sie_page2 {
+	__u64 fac_list[S390_ARCH_FAC_LIST_SIZE_U64];	/* 0x0000 */
+	struct kvm_s390_crypto_cb crycb;		/* 0x0800 */
+	u8 reserved900[0x1000 - 0x900];			/* 0x0900 */
+} __packed;
+
 struct kvm_arch{
 	void *sca;
 	int use_esca;
@@ -643,6 +653,7 @@ struct kvm_arch{
 	int ipte_lock_count;
 	struct mutex ipte_mutex;
 	spinlock_t start_stop_lock;
+	struct sie_page2 *sie_page2;
 	struct kvm_s390_cpu_model model;
 	struct kvm_s390_crypto crypto;
 	u64 epoch;

+ 6 - 2
arch/s390/include/uapi/asm/kvm.h

@@ -154,6 +154,7 @@ struct kvm_guest_debug_arch {
 #define KVM_SYNC_PFAULT (1UL << 5)
 #define KVM_SYNC_VRS    (1UL << 6)
 #define KVM_SYNC_RICCB  (1UL << 7)
+#define KVM_SYNC_FPRS   (1UL << 8)
 /* definition of registers in kvm_run */
 struct kvm_sync_regs {
 	__u64 prefix;	/* prefix register */
@@ -168,9 +169,12 @@ struct kvm_sync_regs {
 	__u64 pft;	/* pfault token [PFAULT] */
 	__u64 pfs;	/* pfault select [PFAULT] */
 	__u64 pfc;	/* pfault compare [PFAULT] */
-	__u64 vrs[32][2];	/* vector registers */
+	union {
+		__u64 vrs[32][2];	/* vector registers (KVM_SYNC_VRS) */
+		__u64 fprs[16];		/* fp registers (KVM_SYNC_FPRS) */
+	};
 	__u8  reserved[512];	/* for future vector expansion */
-	__u32 fpc;	/* only valid with vector registers */
+	__u32 fpc;		/* valid on KVM_SYNC_VRS or KVM_SYNC_FPRS */
 	__u8 padding[52];	/* riccb needs to be 64byte aligned */
 	__u8 riccb[64];		/* runtime instrumentation controls block */
 };

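A minimal user-space sketch of how the new union is meant to be consumed after KVM_RUN, assuming the updated uapi header; the helper name is illustrative. When only KVM_SYNC_VRS is offered, floating point register i overlays the leftmost doubleword of vector register i:

#include <string.h>
#include <linux/kvm.h>

/* Copy the guest's 16 floating point registers out of the sync area. */
static void get_guest_fprs(struct kvm_run *run, __u64 fprs[16])
{
	int i;

	if (run->kvm_valid_regs & KVM_SYNC_FPRS) {
		/* no vector facility: fprs are synced directly */
		memcpy(fprs, run->s.regs.fprs, sizeof(run->s.regs.fprs));
	} else if (run->kvm_valid_regs & KVM_SYNC_VRS) {
		/* fp register i is the leftmost 64 bits of vector register i */
		for (i = 0; i < 16; i++)
			fprs[i] = run->s.regs.vrs[i][0];
	}
}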
+ 1 - 0
arch/s390/include/uapi/asm/sie.h

@@ -7,6 +7,7 @@
 	{ 0x9c, "DIAG (0x9c) time slice end directed" },	\
 	{ 0x204, "DIAG (0x204) logical-cpu utilization" },	\
 	{ 0x258, "DIAG (0x258) page-reference services" },	\
+	{ 0x288, "DIAG (0x288) watchdog functions" },		\
 	{ 0x308, "DIAG (0x308) ipl functions" },		\
 	{ 0x500, "DIAG (0x500) KVM virtio functions" },		\
 	{ 0x501, "DIAG (0x501) KVM breakpoint" }

+ 30 - 27
arch/s390/kvm/gaccess.c

@@ -373,7 +373,7 @@ void ipte_unlock(struct kvm_vcpu *vcpu)
 }
 
 static int ar_translation(struct kvm_vcpu *vcpu, union asce *asce, ar_t ar,
-			  int write)
+			  enum gacc_mode mode)
 {
 	union alet alet;
 	struct ale ale;
@@ -454,7 +454,7 @@ static int ar_translation(struct kvm_vcpu *vcpu, union asce *asce, ar_t ar,
 		}
 	}
 
-	if (ale.fo == 1 && write)
+	if (ale.fo == 1 && mode == GACC_STORE)
 		return PGM_PROTECTION;
 
 	asce->val = aste.asce;
@@ -477,25 +477,28 @@ enum {
 };
 
 static int get_vcpu_asce(struct kvm_vcpu *vcpu, union asce *asce,
-			 ar_t ar, int write)
+			 ar_t ar, enum gacc_mode mode)
 {
 	int rc;
-	psw_t *psw = &vcpu->arch.sie_block->gpsw;
+	struct psw_bits psw = psw_bits(vcpu->arch.sie_block->gpsw);
 	struct kvm_s390_pgm_info *pgm = &vcpu->arch.pgm;
 	struct trans_exc_code_bits *tec_bits;
 
 	memset(pgm, 0, sizeof(*pgm));
 	tec_bits = (struct trans_exc_code_bits *)&pgm->trans_exc_code;
-	tec_bits->fsi = write ? FSI_STORE : FSI_FETCH;
-	tec_bits->as = psw_bits(*psw).as;
+	tec_bits->fsi = mode == GACC_STORE ? FSI_STORE : FSI_FETCH;
+	tec_bits->as = psw.as;
 
-	if (!psw_bits(*psw).t) {
+	if (!psw.t) {
 		asce->val = 0;
 		asce->r = 1;
 		return 0;
 	}
 
-	switch (psw_bits(vcpu->arch.sie_block->gpsw).as) {
+	if (mode == GACC_IFETCH)
+		psw.as = psw.as == PSW_AS_HOME ? PSW_AS_HOME : PSW_AS_PRIMARY;
+
+	switch (psw.as) {
 	case PSW_AS_PRIMARY:
 		asce->val = vcpu->arch.sie_block->gcr[1];
 		return 0;
@@ -506,7 +509,7 @@ static int get_vcpu_asce(struct kvm_vcpu *vcpu, union asce *asce,
 		asce->val = vcpu->arch.sie_block->gcr[13];
 		return 0;
 	case PSW_AS_ACCREG:
-		rc = ar_translation(vcpu, asce, ar, write);
+		rc = ar_translation(vcpu, asce, ar, mode);
 		switch (rc) {
 		case PGM_ALEN_TRANSLATION:
 		case PGM_ALE_SEQUENCE:
@@ -538,7 +541,7 @@ static int deref_table(struct kvm *kvm, unsigned long gpa, unsigned long *val)
  * @gva: guest virtual address
  * @gpa: points to where guest physical (absolute) address should be stored
  * @asce: effective asce
- * @write: indicates if access is a write access
+ * @mode: indicates the access mode to be used
  *
  * Translate a guest virtual address into a guest absolute address by means
  * of dynamic address translation as specified by the architecture.
@@ -554,7 +557,7 @@ static int deref_table(struct kvm *kvm, unsigned long gpa, unsigned long *val)
  */
 static unsigned long guest_translate(struct kvm_vcpu *vcpu, unsigned long gva,
 				     unsigned long *gpa, const union asce asce,
-				     int write)
+				     enum gacc_mode mode)
 {
 	union vaddress vaddr = {.addr = gva};
 	union raddress raddr = {.addr = gva};
@@ -699,7 +702,7 @@ static unsigned long guest_translate(struct kvm_vcpu *vcpu, unsigned long gva,
 real_address:
 	raddr.addr = kvm_s390_real_to_abs(vcpu, raddr.addr);
 absolute_address:
-	if (write && dat_protection)
+	if (mode == GACC_STORE && dat_protection)
 		return PGM_PROTECTION;
 	if (kvm_is_error_gpa(vcpu->kvm, raddr.addr))
 		return PGM_ADDRESSING;
@@ -728,7 +731,7 @@ static int low_address_protection_enabled(struct kvm_vcpu *vcpu,
 
 static int guest_page_range(struct kvm_vcpu *vcpu, unsigned long ga,
 			    unsigned long *pages, unsigned long nr_pages,
-			    const union asce asce, int write)
+			    const union asce asce, enum gacc_mode mode)
 {
 	struct kvm_s390_pgm_info *pgm = &vcpu->arch.pgm;
 	psw_t *psw = &vcpu->arch.sie_block->gpsw;
@@ -740,13 +743,13 @@ static int guest_page_range(struct kvm_vcpu *vcpu, unsigned long ga,
 	while (nr_pages) {
 		ga = kvm_s390_logical_to_effective(vcpu, ga);
 		tec_bits->addr = ga >> PAGE_SHIFT;
-		if (write && lap_enabled && is_low_address(ga)) {
+		if (mode == GACC_STORE && lap_enabled && is_low_address(ga)) {
 			pgm->code = PGM_PROTECTION;
 			return pgm->code;
 		}
 		ga &= PAGE_MASK;
 		if (psw_bits(*psw).t) {
-			rc = guest_translate(vcpu, ga, pages, asce, write);
+			rc = guest_translate(vcpu, ga, pages, asce, mode);
 			if (rc < 0)
 				return rc;
 			if (rc == PGM_PROTECTION)
@@ -768,7 +771,7 @@ static int guest_page_range(struct kvm_vcpu *vcpu, unsigned long ga,
 }
 
 int access_guest(struct kvm_vcpu *vcpu, unsigned long ga, ar_t ar, void *data,
-		 unsigned long len, int write)
+		 unsigned long len, enum gacc_mode mode)
 {
 	psw_t *psw = &vcpu->arch.sie_block->gpsw;
 	unsigned long _len, nr_pages, gpa, idx;
@@ -780,7 +783,7 @@ int access_guest(struct kvm_vcpu *vcpu, unsigned long ga, ar_t ar, void *data,
 
 	if (!len)
 		return 0;
-	rc = get_vcpu_asce(vcpu, &asce, ar, write);
+	rc = get_vcpu_asce(vcpu, &asce, ar, mode);
 	if (rc)
 		return rc;
 	nr_pages = (((ga & ~PAGE_MASK) + len - 1) >> PAGE_SHIFT) + 1;
@@ -792,11 +795,11 @@ int access_guest(struct kvm_vcpu *vcpu, unsigned long ga, ar_t ar, void *data,
 	need_ipte_lock = psw_bits(*psw).t && !asce.r;
 	if (need_ipte_lock)
 		ipte_lock(vcpu);
-	rc = guest_page_range(vcpu, ga, pages, nr_pages, asce, write);
+	rc = guest_page_range(vcpu, ga, pages, nr_pages, asce, mode);
 	for (idx = 0; idx < nr_pages && !rc; idx++) {
 		gpa = *(pages + idx) + (ga & ~PAGE_MASK);
 		_len = min(PAGE_SIZE - (gpa & ~PAGE_MASK), len);
-		if (write)
+		if (mode == GACC_STORE)
 			rc = kvm_write_guest(vcpu->kvm, gpa, data, _len);
 		else
 			rc = kvm_read_guest(vcpu->kvm, gpa, data, _len);
@@ -812,7 +815,7 @@ int access_guest(struct kvm_vcpu *vcpu, unsigned long ga, ar_t ar, void *data,
 }
 
 int access_guest_real(struct kvm_vcpu *vcpu, unsigned long gra,
-		      void *data, unsigned long len, int write)
+		      void *data, unsigned long len, enum gacc_mode mode)
 {
 	unsigned long _len, gpa;
 	int rc = 0;
@@ -820,7 +823,7 @@ int access_guest_real(struct kvm_vcpu *vcpu, unsigned long gra,
 	while (len && !rc) {
 		gpa = kvm_s390_real_to_abs(vcpu, gra);
 		_len = min(PAGE_SIZE - (gpa & ~PAGE_MASK), len);
-		if (write)
+		if (mode)
 			rc = write_guest_abs(vcpu, gpa, data, _len);
 		else
 			rc = read_guest_abs(vcpu, gpa, data, _len);
@@ -841,7 +844,7 @@ int access_guest_real(struct kvm_vcpu *vcpu, unsigned long gra,
  * has to take care of this.
  */
 int guest_translate_address(struct kvm_vcpu *vcpu, unsigned long gva, ar_t ar,
-			    unsigned long *gpa, int write)
+			    unsigned long *gpa, enum gacc_mode mode)
 {
 	struct kvm_s390_pgm_info *pgm = &vcpu->arch.pgm;
 	psw_t *psw = &vcpu->arch.sie_block->gpsw;
@@ -851,19 +854,19 @@ int guest_translate_address(struct kvm_vcpu *vcpu, unsigned long gva, ar_t ar,
 
 	gva = kvm_s390_logical_to_effective(vcpu, gva);
 	tec = (struct trans_exc_code_bits *)&pgm->trans_exc_code;
-	rc = get_vcpu_asce(vcpu, &asce, ar, write);
+	rc = get_vcpu_asce(vcpu, &asce, ar, mode);
 	tec->addr = gva >> PAGE_SHIFT;
 	if (rc)
 		return rc;
 	if (is_low_address(gva) && low_address_protection_enabled(vcpu, asce)) {
-		if (write) {
+		if (mode == GACC_STORE) {
 			rc = pgm->code = PGM_PROTECTION;
 			return rc;
 		}
 	}
 
 	if (psw_bits(*psw).t && !asce.r) {	/* Use DAT? */
-		rc = guest_translate(vcpu, gva, gpa, asce, write);
+		rc = guest_translate(vcpu, gva, gpa, asce, mode);
 		if (rc > 0) {
 			if (rc == PGM_PROTECTION)
 				tec->b61 = 1;
@@ -883,7 +886,7 @@ int guest_translate_address(struct kvm_vcpu *vcpu, unsigned long gva, ar_t ar,
  * check_gva_range - test a range of guest virtual addresses for accessibility
  */
 int check_gva_range(struct kvm_vcpu *vcpu, unsigned long gva, ar_t ar,
-		    unsigned long length, int is_write)
+		    unsigned long length, enum gacc_mode mode)
 {
 	unsigned long gpa;
 	unsigned long currlen;
@@ -892,7 +895,7 @@ int check_gva_range(struct kvm_vcpu *vcpu, unsigned long gva, ar_t ar,
 	ipte_lock(vcpu);
 	while (length > 0 && !rc) {
 		currlen = min(length, PAGE_SIZE - (gva % PAGE_SIZE));
-		rc = guest_translate_address(vcpu, gva, ar, &gpa, is_write);
+		rc = guest_translate_address(vcpu, gva, ar, &gpa, mode);
 		gva += currlen;
 		length -= currlen;
 	}

+ 32 - 6
arch/s390/kvm/gaccess.h

@@ -155,16 +155,22 @@ int read_guest_lc(struct kvm_vcpu *vcpu, unsigned long gra, void *data,
 	return kvm_read_guest(vcpu->kvm, gpa, data, len);
 }
 
+enum gacc_mode {
+	GACC_FETCH,
+	GACC_STORE,
+	GACC_IFETCH,
+};
+
 int guest_translate_address(struct kvm_vcpu *vcpu, unsigned long gva,
-			    ar_t ar, unsigned long *gpa, int write);
+			    ar_t ar, unsigned long *gpa, enum gacc_mode mode);
 int check_gva_range(struct kvm_vcpu *vcpu, unsigned long gva, ar_t ar,
-		    unsigned long length, int is_write);
+		    unsigned long length, enum gacc_mode mode);
 
 int access_guest(struct kvm_vcpu *vcpu, unsigned long ga, ar_t ar, void *data,
-		 unsigned long len, int write);
+		 unsigned long len, enum gacc_mode mode);
 
 int access_guest_real(struct kvm_vcpu *vcpu, unsigned long gra,
-		      void *data, unsigned long len, int write);
+		      void *data, unsigned long len, enum gacc_mode mode);
 
 /**
  * write_guest - copy data from kernel space to guest space
@@ -215,7 +221,7 @@ static inline __must_check
 int write_guest(struct kvm_vcpu *vcpu, unsigned long ga, ar_t ar, void *data,
 		unsigned long len)
 {
-	return access_guest(vcpu, ga, ar, data, len, 1);
+	return access_guest(vcpu, ga, ar, data, len, GACC_STORE);
 }
 
 /**
@@ -235,7 +241,27 @@ static inline __must_check
 int read_guest(struct kvm_vcpu *vcpu, unsigned long ga, ar_t ar, void *data,
 	       unsigned long len)
 {
-	return access_guest(vcpu, ga, ar, data, len, 0);
+	return access_guest(vcpu, ga, ar, data, len, GACC_FETCH);
+}
+
+/**
+ * read_guest_instr - copy instruction data from guest space to kernel space
+ * @vcpu: virtual cpu
+ * @data: destination address in kernel space
+ * @len: number of bytes to copy
+ *
+ * Copy @len bytes from the current psw address (guest space) to @data (kernel
+ * space).
+ *
+ * The behaviour of read_guest_instr is identical to read_guest, except that
+ * instruction data is read from primary space when the CPU is in
+ * secondary-space or access-register mode (home-space mode is unaffected).
+ */
+static inline __must_check
+int read_guest_instr(struct kvm_vcpu *vcpu, void *data, unsigned long len)
+{
+	return access_guest(vcpu, vcpu->arch.sie_block->gpsw.addr, 0, data, len,
+			    GACC_IFETCH);
 }
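+
+/*
+ * Example use in an intercept handler (a sketch; s390 instructions are at
+ * most 6 bytes long, and the error path mirrors how other handlers in this
+ * series use kvm_s390_inject_prog_cond):
+ *
+ *	u8 insn[6];
+ *	int rc;
+ *
+ *	rc = read_guest_instr(vcpu, insn, sizeof(insn));
+ *	if (rc)
+ *		return kvm_s390_inject_prog_cond(vcpu, rc);
+ */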
 
 /**

+ 47 - 31
arch/s390/kvm/intercept.c

@@ -38,17 +38,32 @@ static const intercept_handler_t instruction_handlers[256] = {
 	[0xeb] = kvm_s390_handle_eb,
 };
 
-void kvm_s390_rewind_psw(struct kvm_vcpu *vcpu, int ilc)
+u8 kvm_s390_get_ilen(struct kvm_vcpu *vcpu)
 {
 	struct kvm_s390_sie_block *sie_block = vcpu->arch.sie_block;
+	u8 ilen = 0;
 
-	/* Use the length of the EXECUTE instruction if necessary */
-	if (sie_block->icptstatus & 1) {
-		ilc = (sie_block->icptstatus >> 4) & 0x6;
-		if (!ilc)
-			ilc = 4;
+	switch (vcpu->arch.sie_block->icptcode) {
+	case ICPT_INST:
+	case ICPT_INSTPROGI:
+	case ICPT_OPEREXC:
+	case ICPT_PARTEXEC:
+	case ICPT_IOINST:
+		/* instruction only stored for these icptcodes */
+		ilen = insn_length(vcpu->arch.sie_block->ipa >> 8);
+		/* Use the length of the EXECUTE instruction if necessary */
+		if (sie_block->icptstatus & 1) {
+			ilen = (sie_block->icptstatus >> 4) & 0x6;
+			if (!ilen)
+				ilen = 4;
+		}
+		break;
+	case ICPT_PROGI:
+		/* bit 1+2 of pgmilc are the ilc, so we directly get ilen */
+		ilen = vcpu->arch.sie_block->pgmilc & 0x6;
+		break;
 	}
-	sie_block->gpsw.addr = __rewind_psw(sie_block->gpsw, ilc);
+	return ilen;
 }
 
 static int handle_noop(struct kvm_vcpu *vcpu)
@@ -121,11 +136,13 @@ static int handle_instruction(struct kvm_vcpu *vcpu)
 	return -EOPNOTSUPP;
 }
 
-static void __extract_prog_irq(struct kvm_vcpu *vcpu,
-			       struct kvm_s390_pgm_info *pgm_info)
+static int inject_prog_on_prog_intercept(struct kvm_vcpu *vcpu)
 {
-	memset(pgm_info, 0, sizeof(struct kvm_s390_pgm_info));
-	pgm_info->code = vcpu->arch.sie_block->iprcc;
+	struct kvm_s390_pgm_info pgm_info = {
+		.code = vcpu->arch.sie_block->iprcc,
+		/* the PSW has already been rewound */
+		.flags = KVM_S390_PGM_FLAGS_NO_REWIND,
+	};
 
 	switch (vcpu->arch.sie_block->iprcc & ~PGM_PER) {
 	case PGM_AFX_TRANSLATION:
@@ -138,7 +155,7 @@ static void __extract_prog_irq(struct kvm_vcpu *vcpu,
 	case PGM_PRIMARY_AUTHORITY:
 	case PGM_SECONDARY_AUTHORITY:
 	case PGM_SPACE_SWITCH:
-		pgm_info->trans_exc_code = vcpu->arch.sie_block->tecmc;
+		pgm_info.trans_exc_code = vcpu->arch.sie_block->tecmc;
 		break;
 	case PGM_ALEN_TRANSLATION:
 	case PGM_ALE_SEQUENCE:
@@ -146,7 +163,7 @@ static void __extract_prog_irq(struct kvm_vcpu *vcpu,
 	case PGM_ASTE_SEQUENCE:
 	case PGM_ASTE_VALIDITY:
 	case PGM_EXTENDED_AUTHORITY:
-		pgm_info->exc_access_id = vcpu->arch.sie_block->eai;
+		pgm_info.exc_access_id = vcpu->arch.sie_block->eai;
 		break;
 	case PGM_ASCE_TYPE:
 	case PGM_PAGE_TRANSLATION:
@@ -154,32 +171,33 @@ static void __extract_prog_irq(struct kvm_vcpu *vcpu,
 	case PGM_REGION_SECOND_TRANS:
 	case PGM_REGION_THIRD_TRANS:
 	case PGM_SEGMENT_TRANSLATION:
-		pgm_info->trans_exc_code = vcpu->arch.sie_block->tecmc;
-		pgm_info->exc_access_id  = vcpu->arch.sie_block->eai;
-		pgm_info->op_access_id  = vcpu->arch.sie_block->oai;
+		pgm_info.trans_exc_code = vcpu->arch.sie_block->tecmc;
+		pgm_info.exc_access_id  = vcpu->arch.sie_block->eai;
+		pgm_info.op_access_id  = vcpu->arch.sie_block->oai;
 		break;
 	case PGM_MONITOR:
-		pgm_info->mon_class_nr = vcpu->arch.sie_block->mcn;
-		pgm_info->mon_code = vcpu->arch.sie_block->tecmc;
+		pgm_info.mon_class_nr = vcpu->arch.sie_block->mcn;
+		pgm_info.mon_code = vcpu->arch.sie_block->tecmc;
 		break;
 	case PGM_VECTOR_PROCESSING:
 	case PGM_DATA:
-		pgm_info->data_exc_code = vcpu->arch.sie_block->dxc;
+		pgm_info.data_exc_code = vcpu->arch.sie_block->dxc;
 		break;
 	case PGM_PROTECTION:
-		pgm_info->trans_exc_code = vcpu->arch.sie_block->tecmc;
-		pgm_info->exc_access_id  = vcpu->arch.sie_block->eai;
+		pgm_info.trans_exc_code = vcpu->arch.sie_block->tecmc;
+		pgm_info.exc_access_id  = vcpu->arch.sie_block->eai;
 		break;
 	default:
 		break;
 	}
 
 	if (vcpu->arch.sie_block->iprcc & PGM_PER) {
-		pgm_info->per_code = vcpu->arch.sie_block->perc;
-		pgm_info->per_atmid = vcpu->arch.sie_block->peratmid;
-		pgm_info->per_address = vcpu->arch.sie_block->peraddr;
-		pgm_info->per_access_id = vcpu->arch.sie_block->peraid;
+		pgm_info.per_code = vcpu->arch.sie_block->perc;
+		pgm_info.per_atmid = vcpu->arch.sie_block->peratmid;
+		pgm_info.per_address = vcpu->arch.sie_block->peraddr;
+		pgm_info.per_access_id = vcpu->arch.sie_block->peraid;
 	}
+	return kvm_s390_inject_prog_irq(vcpu, &pgm_info);
 }
 
 /*
@@ -208,7 +226,6 @@ static int handle_itdb(struct kvm_vcpu *vcpu)
 
 static int handle_prog(struct kvm_vcpu *vcpu)
 {
-	struct kvm_s390_pgm_info pgm_info;
 	psw_t psw;
 	int rc;
 
@@ -234,8 +251,7 @@ static int handle_prog(struct kvm_vcpu *vcpu)
 	if (rc)
 		return rc;
 
-	__extract_prog_irq(vcpu, &pgm_info);
-	return kvm_s390_inject_prog_irq(vcpu, &pgm_info);
+	return inject_prog_on_prog_intercept(vcpu);
 }
 
 /**
@@ -302,7 +318,7 @@ static int handle_mvpg_pei(struct kvm_vcpu *vcpu)
 
 	/* Make sure that the source is paged-in */
 	rc = guest_translate_address(vcpu, vcpu->run->s.regs.gprs[reg2],
-				     reg2, &srcaddr, 0);
+				     reg2, &srcaddr, GACC_FETCH);
 	if (rc)
 		return kvm_s390_inject_prog_cond(vcpu, rc);
 	rc = kvm_arch_fault_in_page(vcpu, srcaddr, 0);
@@ -311,14 +327,14 @@ static int handle_mvpg_pei(struct kvm_vcpu *vcpu)
 
 	/* Make sure that the destination is paged-in */
 	rc = guest_translate_address(vcpu, vcpu->run->s.regs.gprs[reg1],
-				     reg1, &dstaddr, 1);
+				     reg1, &dstaddr, GACC_STORE);
 	if (rc)
 		return kvm_s390_inject_prog_cond(vcpu, rc);
 	rc = kvm_arch_fault_in_page(vcpu, dstaddr, 1);
 	if (rc != 0)
 		return rc;
 
-	kvm_s390_rewind_psw(vcpu, 4);
+	kvm_s390_retry_instr(vcpu);
 
 	return 0;
 }

+ 55 - 38
arch/s390/kvm/interrupt.c

@@ -182,8 +182,9 @@ static int cpu_timer_interrupts_enabled(struct kvm_vcpu *vcpu)
 
 static int cpu_timer_irq_pending(struct kvm_vcpu *vcpu)
 {
-	return (vcpu->arch.sie_block->cputm >> 63) &&
-	       cpu_timer_interrupts_enabled(vcpu);
+	if (!cpu_timer_interrupts_enabled(vcpu))
+		return 0;
+	return kvm_s390_get_cpu_timer(vcpu) >> 63;
 }
 
 static inline int is_ioirq(unsigned long irq_type)
@@ -335,23 +336,6 @@ static void set_intercept_indicators(struct kvm_vcpu *vcpu)
 	set_intercept_indicators_stop(vcpu);
 }
 
-static u16 get_ilc(struct kvm_vcpu *vcpu)
-{
-	switch (vcpu->arch.sie_block->icptcode) {
-	case ICPT_INST:
-	case ICPT_INSTPROGI:
-	case ICPT_OPEREXC:
-	case ICPT_PARTEXEC:
-	case ICPT_IOINST:
-		/* last instruction only stored for these icptcodes */
-		return insn_length(vcpu->arch.sie_block->ipa >> 8);
-	case ICPT_PROGI:
-		return vcpu->arch.sie_block->pgmilc;
-	default:
-		return 0;
-	}
-}
-
 static int __must_check __deliver_cpu_timer(struct kvm_vcpu *vcpu)
 {
 	struct kvm_s390_local_interrupt *li = &vcpu->arch.local_int;
@@ -588,7 +572,7 @@ static int __must_check __deliver_prog(struct kvm_vcpu *vcpu)
 	struct kvm_s390_local_interrupt *li = &vcpu->arch.local_int;
 	struct kvm_s390_pgm_info pgm_info;
 	int rc = 0, nullifying = false;
-	u16 ilc = get_ilc(vcpu);
+	u16 ilen;
 
 	spin_lock(&li->lock);
 	pgm_info = li->irq.pgm;
@@ -596,8 +580,9 @@ static int __must_check __deliver_prog(struct kvm_vcpu *vcpu)
 	memset(&li->irq.pgm, 0, sizeof(pgm_info));
 	spin_unlock(&li->lock);
 
-	VCPU_EVENT(vcpu, 3, "deliver: program irq code 0x%x, ilc:%d",
-		   pgm_info.code, ilc);
+	ilen = pgm_info.flags & KVM_S390_PGM_FLAGS_ILC_MASK;
+	VCPU_EVENT(vcpu, 3, "deliver: program irq code 0x%x, ilen:%d",
+		   pgm_info.code, ilen);
 	vcpu->stat.deliver_program_int++;
 	trace_kvm_s390_deliver_interrupt(vcpu->vcpu_id, KVM_S390_PROGRAM_INT,
 					 pgm_info.code, 0);
@@ -681,10 +666,11 @@ static int __must_check __deliver_prog(struct kvm_vcpu *vcpu)
 				   (u8 *) __LC_PER_ACCESS_ID);
 	}
 
-	if (nullifying && vcpu->arch.sie_block->icptcode == ICPT_INST)
-		kvm_s390_rewind_psw(vcpu, ilc);
+	if (nullifying && !(pgm_info.flags & KVM_S390_PGM_FLAGS_NO_REWIND))
+		kvm_s390_rewind_psw(vcpu, ilen);
 
-	rc |= put_guest_lc(vcpu, ilc, (u16 *) __LC_PGM_ILC);
+	/* bit 1+2 of the target are the ilc, so we can directly use ilen */
+	rc |= put_guest_lc(vcpu, ilen, (u16 *) __LC_PGM_ILC);
 	rc |= put_guest_lc(vcpu, vcpu->arch.sie_block->gbea,
 				 (u64 *) __LC_LAST_BREAK);
 	rc |= put_guest_lc(vcpu, pgm_info.code,
@@ -923,9 +909,35 @@ int kvm_cpu_has_pending_timer(struct kvm_vcpu *vcpu)
 	return ckc_irq_pending(vcpu) || cpu_timer_irq_pending(vcpu);
 }
 
+static u64 __calculate_sltime(struct kvm_vcpu *vcpu)
+{
+	u64 now, cputm, sltime = 0;
+
+	if (ckc_interrupts_enabled(vcpu)) {
+		now = kvm_s390_get_tod_clock_fast(vcpu->kvm);
+		sltime = tod_to_ns(vcpu->arch.sie_block->ckc - now);
+		/* already expired or overflow? */
+		if (!sltime || vcpu->arch.sie_block->ckc <= now)
+			return 0;
+		if (cpu_timer_interrupts_enabled(vcpu)) {
+			cputm = kvm_s390_get_cpu_timer(vcpu);
+			/* already expired? */
+			if (cputm >> 63)
+				return 0;
+			return min(sltime, tod_to_ns(cputm));
+		}
+	} else if (cpu_timer_interrupts_enabled(vcpu)) {
+		sltime = kvm_s390_get_cpu_timer(vcpu);
+		/* already expired? */
+		if (sltime >> 63)
+			return 0;
+	}
+	return sltime;
+}
+
 int kvm_s390_handle_wait(struct kvm_vcpu *vcpu)
 {
-	u64 now, sltime;
+	u64 sltime;
 
 	vcpu->stat.exit_wait_state++;
 
@@ -938,22 +950,20 @@ int kvm_s390_handle_wait(struct kvm_vcpu *vcpu)
 		return -EOPNOTSUPP; /* disabled wait */
 	}
 
-	if (!ckc_interrupts_enabled(vcpu)) {
+	if (!ckc_interrupts_enabled(vcpu) &&
+	    !cpu_timer_interrupts_enabled(vcpu)) {
 		VCPU_EVENT(vcpu, 3, "%s", "enabled wait w/o timer");
 		__set_cpu_idle(vcpu);
 		goto no_timer;
 	}
 
-	now = kvm_s390_get_tod_clock_fast(vcpu->kvm);
-	sltime = tod_to_ns(vcpu->arch.sie_block->ckc - now);
-
-	/* underflow */
-	if (vcpu->arch.sie_block->ckc < now)
+	sltime = __calculate_sltime(vcpu);
+	if (!sltime)
 		return 0;
 
 	__set_cpu_idle(vcpu);
 	hrtimer_start(&vcpu->arch.ckc_timer, ktime_set (0, sltime) , HRTIMER_MODE_REL);
-	VCPU_EVENT(vcpu, 4, "enabled wait via clock comparator: %llu ns", sltime);
+	VCPU_EVENT(vcpu, 4, "enabled wait: %llu ns", sltime);
 no_timer:
 	srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx);
 	kvm_vcpu_block(vcpu);
@@ -980,18 +990,16 @@ void kvm_s390_vcpu_wakeup(struct kvm_vcpu *vcpu)
 enum hrtimer_restart kvm_s390_idle_wakeup(struct hrtimer *timer)
 {
 	struct kvm_vcpu *vcpu;
-	u64 now, sltime;
+	u64 sltime;
 
 	vcpu = container_of(timer, struct kvm_vcpu, arch.ckc_timer);
-	now = kvm_s390_get_tod_clock_fast(vcpu->kvm);
-	sltime = tod_to_ns(vcpu->arch.sie_block->ckc - now);
+	sltime = __calculate_sltime(vcpu);
 
 	/*
 	 * If the monotonic clock runs faster than the tod clock we might be
 	 * woken up too early and have to go back to sleep to avoid deadlocks.
 	 */
-	if (vcpu->arch.sie_block->ckc > now &&
-	    hrtimer_forward_now(timer, ns_to_ktime(sltime)))
+	if (sltime && hrtimer_forward_now(timer, ns_to_ktime(sltime)))
 		return HRTIMER_RESTART;
 	kvm_s390_vcpu_wakeup(vcpu);
 	return HRTIMER_NORESTART;
@@ -1059,8 +1067,16 @@ static int __inject_prog(struct kvm_vcpu *vcpu, struct kvm_s390_irq *irq)
 	trace_kvm_s390_inject_vcpu(vcpu->vcpu_id, KVM_S390_PROGRAM_INT,
 				   irq->u.pgm.code, 0);
 
+	if (!(irq->u.pgm.flags & KVM_S390_PGM_FLAGS_ILC_VALID)) {
+		/* auto detection if no valid ILC was given */
+		irq->u.pgm.flags &= ~KVM_S390_PGM_FLAGS_ILC_MASK;
+		irq->u.pgm.flags |= kvm_s390_get_ilen(vcpu);
+		irq->u.pgm.flags |= KVM_S390_PGM_FLAGS_ILC_VALID;
+	}
+
 	if (irq->u.pgm.code == PGM_PER) {
 		li->irq.pgm.code |= PGM_PER;
+		li->irq.pgm.flags = irq->u.pgm.flags;
 		/* only modify PER related information */
 		li->irq.pgm.per_address = irq->u.pgm.per_address;
 		li->irq.pgm.per_code = irq->u.pgm.per_code;
@@ -1069,6 +1085,7 @@ static int __inject_prog(struct kvm_vcpu *vcpu, struct kvm_s390_irq *irq)
 	} else if (!(irq->u.pgm.code & PGM_PER)) {
 		li->irq.pgm.code = (li->irq.pgm.code & PGM_PER) |
 				   irq->u.pgm.code;
+		li->irq.pgm.flags = irq->u.pgm.flags;
 		/* only modify non-PER information */
 		li->irq.pgm.trans_exc_code = irq->u.pgm.trans_exc_code;
 		li->irq.pgm.mon_code = irq->u.pgm.mon_code;

+ 172 - 63
arch/s390/kvm/kvm-s390.c

@@ -158,6 +158,8 @@ static int kvm_clock_sync(struct notifier_block *notifier, unsigned long val,
 		kvm->arch.epoch -= *delta;
 		kvm_for_each_vcpu(i, vcpu, kvm) {
 			vcpu->arch.sie_block->epoch -= *delta;
+			if (vcpu->arch.cputm_enabled)
+				vcpu->arch.cputm_start += *delta;
 		}
 	}
 	return NOTIFY_OK;
@@ -274,7 +276,6 @@ static void kvm_s390_sync_dirty_log(struct kvm *kvm,
 	unsigned long address;
 	struct gmap *gmap = kvm->arch.gmap;
 
-	down_read(&gmap->mm->mmap_sem);
 	/* Loop over all guest pages */
 	last_gfn = memslot->base_gfn + memslot->npages;
 	for (cur_gfn = memslot->base_gfn; cur_gfn <= last_gfn; cur_gfn++) {
@@ -282,8 +283,10 @@ static void kvm_s390_sync_dirty_log(struct kvm *kvm,
 
 		if (gmap_test_and_clear_dirty(address, gmap))
 			mark_page_dirty(kvm, cur_gfn);
+		if (fatal_signal_pending(current))
+			return;
+		cond_resched();
 	}
-	up_read(&gmap->mm->mmap_sem);
 }
 
 /* Section: vm related */
@@ -352,8 +355,8 @@ static int kvm_vm_ioctl_enable_cap(struct kvm *kvm, struct kvm_enable_cap *cap)
 		if (atomic_read(&kvm->online_vcpus)) {
 			r = -EBUSY;
 		} else if (MACHINE_HAS_VX) {
-			set_kvm_facility(kvm->arch.model.fac->mask, 129);
-			set_kvm_facility(kvm->arch.model.fac->list, 129);
+			set_kvm_facility(kvm->arch.model.fac_mask, 129);
+			set_kvm_facility(kvm->arch.model.fac_list, 129);
 			r = 0;
 		} else
 			r = -EINVAL;
@@ -367,8 +370,8 @@ static int kvm_vm_ioctl_enable_cap(struct kvm *kvm, struct kvm_enable_cap *cap)
 		if (atomic_read(&kvm->online_vcpus)) {
 			r = -EBUSY;
 		} else if (test_facility(64)) {
-			set_kvm_facility(kvm->arch.model.fac->mask, 64);
-			set_kvm_facility(kvm->arch.model.fac->list, 64);
+			set_kvm_facility(kvm->arch.model.fac_mask, 64);
+			set_kvm_facility(kvm->arch.model.fac_list, 64);
 			r = 0;
 		}
 		mutex_unlock(&kvm->lock);
@@ -651,7 +654,7 @@ static int kvm_s390_set_processor(struct kvm *kvm, struct kvm_device_attr *attr)
 		memcpy(&kvm->arch.model.cpu_id, &proc->cpuid,
 		       sizeof(struct cpuid));
 		kvm->arch.model.ibc = proc->ibc;
-		memcpy(kvm->arch.model.fac->list, proc->fac_list,
+		memcpy(kvm->arch.model.fac_list, proc->fac_list,
 		       S390_ARCH_FAC_LIST_SIZE_BYTE);
 	} else
 		ret = -EFAULT;
@@ -685,7 +688,8 @@ static int kvm_s390_get_processor(struct kvm *kvm, struct kvm_device_attr *attr)
 	}
 	memcpy(&proc->cpuid, &kvm->arch.model.cpu_id, sizeof(struct cpuid));
 	proc->ibc = kvm->arch.model.ibc;
-	memcpy(&proc->fac_list, kvm->arch.model.fac->list, S390_ARCH_FAC_LIST_SIZE_BYTE);
+	memcpy(&proc->fac_list, kvm->arch.model.fac_list,
+	       S390_ARCH_FAC_LIST_SIZE_BYTE);
 	if (copy_to_user((void __user *)attr->addr, proc, sizeof(*proc)))
 		ret = -EFAULT;
 	kfree(proc);
@@ -705,7 +709,7 @@ static int kvm_s390_get_machine(struct kvm *kvm, struct kvm_device_attr *attr)
 	}
 	get_cpu_id((struct cpuid *) &mach->cpuid);
 	mach->ibc = sclp.ibc;
-	memcpy(&mach->fac_mask, kvm->arch.model.fac->mask,
+	memcpy(&mach->fac_mask, kvm->arch.model.fac_mask,
 	       S390_ARCH_FAC_LIST_SIZE_BYTE);
 	memcpy((unsigned long *)&mach->fac_list, S390_lowcore.stfle_fac_list,
 	       S390_ARCH_FAC_LIST_SIZE_BYTE);
@@ -1082,16 +1086,12 @@ static void kvm_s390_get_cpu_id(struct cpuid *cpu_id)
 	cpu_id->version = 0xff;
 }
 
-static int kvm_s390_crypto_init(struct kvm *kvm)
+static void kvm_s390_crypto_init(struct kvm *kvm)
 {
 	if (!test_kvm_facility(kvm, 76))
-		return 0;
-
-	kvm->arch.crypto.crycb = kzalloc(sizeof(*kvm->arch.crypto.crycb),
-					 GFP_KERNEL | GFP_DMA);
-	if (!kvm->arch.crypto.crycb)
-		return -ENOMEM;
+		return;
 
+	kvm->arch.crypto.crycb = &kvm->arch.sie_page2->crycb;
 	kvm_s390_set_crycb_format(kvm);
 
 	/* Enable AES/DEA protected key functions by default */
@@ -1101,8 +1101,6 @@ static int kvm_s390_crypto_init(struct kvm *kvm)
 			 sizeof(kvm->arch.crypto.crycb->aes_wrapping_key_mask));
 	get_random_bytes(kvm->arch.crypto.crycb->dea_wrapping_key_mask,
 			 sizeof(kvm->arch.crypto.crycb->dea_wrapping_key_mask));
-
-	return 0;
 }
 
 static void sca_dispose(struct kvm *kvm)
@@ -1156,37 +1154,30 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
 	if (!kvm->arch.dbf)
 		goto out_err;
 
-	/*
-	 * The architectural maximum amount of facilities is 16 kbit. To store
-	 * this amount, 2 kbyte of memory is required. Thus we need a full
-	 * page to hold the guest facility list (arch.model.fac->list) and the
-	 * facility mask (arch.model.fac->mask). Its address size has to be
-	 * 31 bits and word aligned.
-	 */
-	kvm->arch.model.fac =
-		(struct kvm_s390_fac *) get_zeroed_page(GFP_KERNEL | GFP_DMA);
-	if (!kvm->arch.model.fac)
+	kvm->arch.sie_page2 =
+	     (struct sie_page2 *) get_zeroed_page(GFP_KERNEL | GFP_DMA);
+	if (!kvm->arch.sie_page2)
 		goto out_err;
 
 	/* Populate the facility mask initially. */
-	memcpy(kvm->arch.model.fac->mask, S390_lowcore.stfle_fac_list,
+	memcpy(kvm->arch.model.fac_mask, S390_lowcore.stfle_fac_list,
 	       S390_ARCH_FAC_LIST_SIZE_BYTE);
 	for (i = 0; i < S390_ARCH_FAC_LIST_SIZE_U64; i++) {
 		if (i < kvm_s390_fac_list_mask_size())
-			kvm->arch.model.fac->mask[i] &= kvm_s390_fac_list_mask[i];
+			kvm->arch.model.fac_mask[i] &= kvm_s390_fac_list_mask[i];
 		else
-			kvm->arch.model.fac->mask[i] = 0UL;
+			kvm->arch.model.fac_mask[i] = 0UL;
 	}
 
 	/* Populate the facility list initially. */
-	memcpy(kvm->arch.model.fac->list, kvm->arch.model.fac->mask,
+	kvm->arch.model.fac_list = kvm->arch.sie_page2->fac_list;
+	memcpy(kvm->arch.model.fac_list, kvm->arch.model.fac_mask,
 	       S390_ARCH_FAC_LIST_SIZE_BYTE);
 
 	kvm_s390_get_cpu_id(&kvm->arch.model.cpu_id);
 	kvm->arch.model.ibc = sclp.ibc & 0x0fff;
 
-	if (kvm_s390_crypto_init(kvm) < 0)
-		goto out_err;
+	kvm_s390_crypto_init(kvm);
 
 	spin_lock_init(&kvm->arch.float_int.lock);
 	for (i = 0; i < FIRQ_LIST_COUNT; i++)
@@ -1222,8 +1213,7 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
 
 	return 0;
 out_err:
-	kfree(kvm->arch.crypto.crycb);
-	free_page((unsigned long)kvm->arch.model.fac);
+	free_page((unsigned long)kvm->arch.sie_page2);
 	debug_unregister(kvm->arch.dbf);
 	sca_dispose(kvm);
 	KVM_EVENT(3, "creation of vm failed: %d", rc);
@@ -1269,10 +1259,9 @@ static void kvm_free_vcpus(struct kvm *kvm)
 void kvm_arch_destroy_vm(struct kvm *kvm)
 {
 	kvm_free_vcpus(kvm);
-	free_page((unsigned long)kvm->arch.model.fac);
 	sca_dispose(kvm);
 	debug_unregister(kvm->arch.dbf);
-	kfree(kvm->arch.crypto.crycb);
+	free_page((unsigned long)kvm->arch.sie_page2);
 	if (!kvm_is_ucontrol(kvm))
 		gmap_free(kvm->arch.gmap);
 	kvm_s390_destroy_adapters(kvm);
@@ -1414,8 +1403,13 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu)
 				    KVM_SYNC_PFAULT;
 	if (test_kvm_facility(vcpu->kvm, 64))
 		vcpu->run->kvm_valid_regs |= KVM_SYNC_RICCB;
-	if (test_kvm_facility(vcpu->kvm, 129))
+	/* fprs can be synchronized via vrs, even if the guest has no vx. With
+	 * MACHINE_HAS_VX, (load|store)_fpu_regs() will work with vrs format.
+	 */
+	if (MACHINE_HAS_VX)
 		vcpu->run->kvm_valid_regs |= KVM_SYNC_VRS;
+	else
+		vcpu->run->kvm_valid_regs |= KVM_SYNC_FPRS;
 
 	if (kvm_is_ucontrol(vcpu->kvm))
 		return __kvm_ucontrol_vcpu_init(vcpu);
@@ -1423,6 +1417,93 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu)
 	return 0;
 }
 
+/* needs disabled preemption to protect from TOD sync and vcpu_load/put */
+static void __start_cpu_timer_accounting(struct kvm_vcpu *vcpu)
+{
+	WARN_ON_ONCE(vcpu->arch.cputm_start != 0);
+	raw_write_seqcount_begin(&vcpu->arch.cputm_seqcount);
+	vcpu->arch.cputm_start = get_tod_clock_fast();
+	raw_write_seqcount_end(&vcpu->arch.cputm_seqcount);
+}
+
+/* needs disabled preemption to protect from TOD sync and vcpu_load/put */
+static void __stop_cpu_timer_accounting(struct kvm_vcpu *vcpu)
+{
+	WARN_ON_ONCE(vcpu->arch.cputm_start == 0);
+	raw_write_seqcount_begin(&vcpu->arch.cputm_seqcount);
+	vcpu->arch.sie_block->cputm -= get_tod_clock_fast() - vcpu->arch.cputm_start;
+	vcpu->arch.cputm_start = 0;
+	raw_write_seqcount_end(&vcpu->arch.cputm_seqcount);
+}
+
+/* needs disabled preemption to protect from TOD sync and vcpu_load/put */
+static void __enable_cpu_timer_accounting(struct kvm_vcpu *vcpu)
+{
+	WARN_ON_ONCE(vcpu->arch.cputm_enabled);
+	vcpu->arch.cputm_enabled = true;
+	__start_cpu_timer_accounting(vcpu);
+}
+
+/* needs disabled preemption to protect from TOD sync and vcpu_load/put */
+static void __disable_cpu_timer_accounting(struct kvm_vcpu *vcpu)
+{
+	WARN_ON_ONCE(!vcpu->arch.cputm_enabled);
+	__stop_cpu_timer_accounting(vcpu);
+	vcpu->arch.cputm_enabled = false;
+}
+
+static void enable_cpu_timer_accounting(struct kvm_vcpu *vcpu)
+{
+	preempt_disable(); /* protect from TOD sync and vcpu_load/put */
+	__enable_cpu_timer_accounting(vcpu);
+	preempt_enable();
+}
+
+static void disable_cpu_timer_accounting(struct kvm_vcpu *vcpu)
+{
+	preempt_disable(); /* protect from TOD sync and vcpu_load/put */
+	__disable_cpu_timer_accounting(vcpu);
+	preempt_enable();
+}
+
+/* set the cpu timer - may only be called from the VCPU thread itself */
+void kvm_s390_set_cpu_timer(struct kvm_vcpu *vcpu, __u64 cputm)
+{
+	preempt_disable(); /* protect from TOD sync and vcpu_load/put */
+	raw_write_seqcount_begin(&vcpu->arch.cputm_seqcount);
+	if (vcpu->arch.cputm_enabled)
+		vcpu->arch.cputm_start = get_tod_clock_fast();
+	vcpu->arch.sie_block->cputm = cputm;
+	raw_write_seqcount_end(&vcpu->arch.cputm_seqcount);
+	preempt_enable();
+}
+
+/* update and get the cpu timer - can also be called from other VCPU threads */
+__u64 kvm_s390_get_cpu_timer(struct kvm_vcpu *vcpu)
+{
+	unsigned int seq;
+	__u64 value;
+
+	if (unlikely(!vcpu->arch.cputm_enabled))
+		return vcpu->arch.sie_block->cputm;
+
+	preempt_disable(); /* protect from TOD sync and vcpu_load/put */
+	do {
+		seq = raw_read_seqcount(&vcpu->arch.cputm_seqcount);
+		/*
+		 * If the writer would ever execute a read in the critical
+		 * section, e.g. in irq context, we have a deadlock.
+		 */
+		WARN_ON_ONCE((seq & 1) && smp_processor_id() == vcpu->cpu);
+		value = vcpu->arch.sie_block->cputm;
+		/* if cputm_start is 0, accounting is being started/stopped */
+		if (likely(vcpu->arch.cputm_start))
+			value -= get_tod_clock_fast() - vcpu->arch.cputm_start;
+	} while (read_seqcount_retry(&vcpu->arch.cputm_seqcount, seq & ~1));
+	preempt_enable();
+	return value;
+}
+
 void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
 {
 	/* Save host register state */
@@ -1430,10 +1511,10 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
 	vcpu->arch.host_fpregs.fpc = current->thread.fpu.fpc;
 	vcpu->arch.host_fpregs.regs = current->thread.fpu.regs;
 
-	/* Depending on MACHINE_HAS_VX, data stored to vrs either
-	 * has vector register or floating point register format.
-	 */
-	current->thread.fpu.regs = vcpu->run->s.regs.vrs;
+	if (MACHINE_HAS_VX)
+		current->thread.fpu.regs = vcpu->run->s.regs.vrs;
+	else
+		current->thread.fpu.regs = vcpu->run->s.regs.fprs;
 	current->thread.fpu.fpc = vcpu->run->s.regs.fpc;
 	if (test_fp_ctl(current->thread.fpu.fpc))
 		/* User space provided an invalid FPC, let's clear it */
@@ -1443,10 +1524,16 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
 	restore_access_regs(vcpu->run->s.regs.acrs);
 	gmap_enable(vcpu->arch.gmap);
 	atomic_or(CPUSTAT_RUNNING, &vcpu->arch.sie_block->cpuflags);
+	if (vcpu->arch.cputm_enabled && !is_vcpu_idle(vcpu))
+		__start_cpu_timer_accounting(vcpu);
+	vcpu->cpu = cpu;
 }
 
 void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
 {
+	vcpu->cpu = -1;
+	if (vcpu->arch.cputm_enabled && !is_vcpu_idle(vcpu))
+		__stop_cpu_timer_accounting(vcpu);
 	atomic_andnot(CPUSTAT_RUNNING, &vcpu->arch.sie_block->cpuflags);
 	gmap_disable(vcpu->arch.gmap);
 
@@ -1468,7 +1555,7 @@ static void kvm_s390_vcpu_initial_reset(struct kvm_vcpu *vcpu)
 	vcpu->arch.sie_block->gpsw.mask = 0UL;
 	vcpu->arch.sie_block->gpsw.addr = 0UL;
 	kvm_s390_set_prefix(vcpu, 0);
-	vcpu->arch.sie_block->cputm     = 0UL;
+	kvm_s390_set_cpu_timer(vcpu, 0);
 	vcpu->arch.sie_block->ckc       = 0UL;
 	vcpu->arch.sie_block->todpr     = 0;
 	memset(vcpu->arch.sie_block->gcr, 0, 16 * sizeof(__u64));
@@ -1538,7 +1625,8 @@ static void kvm_s390_vcpu_setup_model(struct kvm_vcpu *vcpu)
 
 	vcpu->arch.cpu_id = model->cpu_id;
 	vcpu->arch.sie_block->ibc = model->ibc;
-	vcpu->arch.sie_block->fac = (int) (long) model->fac->list;
+	if (test_kvm_facility(vcpu->kvm, 7))
+		vcpu->arch.sie_block->fac = (u32)(u64) model->fac_list;
 }
 
 int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu)
@@ -1616,6 +1704,7 @@ struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm,
 	vcpu->arch.local_int.float_int = &kvm->arch.float_int;
 	vcpu->arch.local_int.wq = &vcpu->wq;
 	vcpu->arch.local_int.cpuflags = &vcpu->arch.sie_block->cpuflags;
+	seqcount_init(&vcpu->arch.cputm_seqcount);
 
 	rc = kvm_vcpu_init(vcpu, kvm, id);
 	if (rc)
@@ -1715,7 +1804,7 @@ static int kvm_arch_vcpu_ioctl_get_one_reg(struct kvm_vcpu *vcpu,
 			     (u64 __user *)reg->addr);
 		break;
 	case KVM_REG_S390_CPU_TIMER:
-		r = put_user(vcpu->arch.sie_block->cputm,
+		r = put_user(kvm_s390_get_cpu_timer(vcpu),
 			     (u64 __user *)reg->addr);
 		break;
 	case KVM_REG_S390_CLOCK_COMP:
@@ -1753,6 +1842,7 @@ static int kvm_arch_vcpu_ioctl_set_one_reg(struct kvm_vcpu *vcpu,
 					   struct kvm_one_reg *reg)
 {
 	int r = -EINVAL;
+	__u64 val;
 
 	switch (reg->id) {
 	case KVM_REG_S390_TODPR:
@@ -1764,8 +1854,9 @@ static int kvm_arch_vcpu_ioctl_set_one_reg(struct kvm_vcpu *vcpu,
 			     (u64 __user *)reg->addr);
 		break;
 	case KVM_REG_S390_CPU_TIMER:
-		r = get_user(vcpu->arch.sie_block->cputm,
-			     (u64 __user *)reg->addr);
+		r = get_user(val, (u64 __user *)reg->addr);
+		if (!r)
+			kvm_s390_set_cpu_timer(vcpu, val);
 		break;
 	case KVM_REG_S390_CLOCK_COMP:
 		r = get_user(vcpu->arch.sie_block->ckc,
@@ -2158,8 +2249,10 @@ static int vcpu_pre_run(struct kvm_vcpu *vcpu)
 
 static int vcpu_post_run_fault_in_sie(struct kvm_vcpu *vcpu)
 {
-	psw_t *psw = &vcpu->arch.sie_block->gpsw;
-	u8 opcode;
+	struct kvm_s390_pgm_info pgm_info = {
+		.code = PGM_ADDRESSING,
+	};
+	u8 opcode, ilen;
 	int rc;
 
 	VCPU_EVENT(vcpu, 3, "%s", "fault in sie instruction");
@@ -2173,12 +2266,21 @@ static int vcpu_post_run_fault_in_sie(struct kvm_vcpu *vcpu)
 	 * to look up the current opcode to get the length of the instruction
 	 * to be able to forward the PSW.
 	 */
-	rc = read_guest(vcpu, psw->addr, 0, &opcode, 1);
-	if (rc)
-		return kvm_s390_inject_prog_cond(vcpu, rc);
-	psw->addr = __rewind_psw(*psw, -insn_length(opcode));
-
-	return kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING);
+	rc = read_guest_instr(vcpu, &opcode, 1);
+	ilen = insn_length(opcode);
+	if (rc < 0) {
+		return rc;
+	} else if (rc) {
+		/* Instruction-Fetching Exceptions - we can't detect the ilen.
+		 * Forward by an arbitrary ilc; injection will take care of
+		 * nullification if necessary.
+		 */
+		pgm_info = vcpu->arch.pgm;
+		ilen = 4;
+	}
+	pgm_info.flags = ilen | KVM_S390_PGM_FLAGS_ILC_VALID;
+	kvm_s390_forward_psw(vcpu, ilen);
+	return kvm_s390_inject_prog_irq(vcpu, &pgm_info);
 }
 
 static int vcpu_post_run(struct kvm_vcpu *vcpu, int exit_reason)
@@ -2244,10 +2346,12 @@ static int __vcpu_run(struct kvm_vcpu *vcpu)
 		 */
 		local_irq_disable();
 		__kvm_guest_enter();
+		__disable_cpu_timer_accounting(vcpu);
 		local_irq_enable();
 		exit_reason = sie64a(vcpu->arch.sie_block,
 				     vcpu->run->s.regs.gprs);
 		local_irq_disable();
+		__enable_cpu_timer_accounting(vcpu);
 		__kvm_guest_exit();
 		local_irq_enable();
 		vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu);
@@ -2271,7 +2375,7 @@ static void sync_regs(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
 		kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
 	}
 	if (kvm_run->kvm_dirty_regs & KVM_SYNC_ARCH0) {
-		vcpu->arch.sie_block->cputm = kvm_run->s.regs.cputm;
+		kvm_s390_set_cpu_timer(vcpu, kvm_run->s.regs.cputm);
 		vcpu->arch.sie_block->ckc = kvm_run->s.regs.ckc;
 		vcpu->arch.sie_block->todpr = kvm_run->s.regs.todpr;
 		vcpu->arch.sie_block->pp = kvm_run->s.regs.pp;
@@ -2293,7 +2397,7 @@ static void store_regs(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
 	kvm_run->psw_addr = vcpu->arch.sie_block->gpsw.addr;
 	kvm_run->s.regs.prefix = kvm_s390_get_prefix(vcpu);
 	memcpy(&kvm_run->s.regs.crs, &vcpu->arch.sie_block->gcr, 128);
-	kvm_run->s.regs.cputm = vcpu->arch.sie_block->cputm;
+	kvm_run->s.regs.cputm = kvm_s390_get_cpu_timer(vcpu);
 	kvm_run->s.regs.ckc = vcpu->arch.sie_block->ckc;
 	kvm_run->s.regs.todpr = vcpu->arch.sie_block->todpr;
 	kvm_run->s.regs.pp = vcpu->arch.sie_block->pp;
@@ -2325,6 +2429,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
 	}
 
 	sync_regs(vcpu, kvm_run);
+	enable_cpu_timer_accounting(vcpu);
 
 	might_fault();
 	rc = __vcpu_run(vcpu);
@@ -2344,6 +2449,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
 		rc = 0;
 	}
 
+	disable_cpu_timer_accounting(vcpu);
 	store_regs(vcpu, kvm_run);
 
 	if (vcpu->sigset_active)
@@ -2364,7 +2470,7 @@ int kvm_s390_store_status_unloaded(struct kvm_vcpu *vcpu, unsigned long gpa)
 	unsigned char archmode = 1;
 	freg_t fprs[NUM_FPRS];
 	unsigned int px;
-	u64 clkcomp;
+	u64 clkcomp, cputm;
 	int rc;
 
 	px = kvm_s390_get_prefix(vcpu);
@@ -2386,7 +2492,7 @@ int kvm_s390_store_status_unloaded(struct kvm_vcpu *vcpu, unsigned long gpa)
 				     fprs, 128);
 	} else {
 		rc = write_guest_abs(vcpu, gpa + __LC_FPREGS_SAVE_AREA,
-				     vcpu->run->s.regs.vrs, 128);
+				     vcpu->run->s.regs.fprs, 128);
 	}
 	rc |= write_guest_abs(vcpu, gpa + __LC_GPREGS_SAVE_AREA,
 			      vcpu->run->s.regs.gprs, 128);
@@ -2398,8 +2504,9 @@ int kvm_s390_store_status_unloaded(struct kvm_vcpu *vcpu, unsigned long gpa)
 			      &vcpu->run->s.regs.fpc, 4);
 	rc |= write_guest_abs(vcpu, gpa + __LC_TOD_PROGREG_SAVE_AREA,
 			      &vcpu->arch.sie_block->todpr, 4);
+	cputm = kvm_s390_get_cpu_timer(vcpu);
 	rc |= write_guest_abs(vcpu, gpa + __LC_CPU_TIMER_SAVE_AREA,
-			      &vcpu->arch.sie_block->cputm, 8);
+			      &cputm, 8);
 	clkcomp = vcpu->arch.sie_block->ckc >> 8;
 	rc |= write_guest_abs(vcpu, gpa + __LC_CLOCK_COMP_SAVE_AREA,
 			      &clkcomp, 8);
@@ -2605,7 +2712,8 @@ static long kvm_s390_guest_mem_op(struct kvm_vcpu *vcpu,
 	switch (mop->op) {
 	case KVM_S390_MEMOP_LOGICAL_READ:
 		if (mop->flags & KVM_S390_MEMOP_F_CHECK_ONLY) {
-			r = check_gva_range(vcpu, mop->gaddr, mop->ar, mop->size, false);
+			r = check_gva_range(vcpu, mop->gaddr, mop->ar,
+					    mop->size, GACC_FETCH);
 			break;
 		}
 		r = read_guest(vcpu, mop->gaddr, mop->ar, tmpbuf, mop->size);
@@ -2616,7 +2724,8 @@ static long kvm_s390_guest_mem_op(struct kvm_vcpu *vcpu,
 		break;
 	case KVM_S390_MEMOP_LOGICAL_WRITE:
 		if (mop->flags & KVM_S390_MEMOP_F_CHECK_ONLY) {
-			r = check_gva_range(vcpu, mop->gaddr, mop->ar, mop->size, true);
+			r = check_gva_range(vcpu, mop->gaddr, mop->ar,
+					    mop->size, GACC_STORE);
 			break;
 		}
 		if (copy_from_user(tmpbuf, uaddr, mop->size)) {
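
The CPU-timer hunks above replace direct reads of sie_block->cputm with kvm_s390_get_cpu_timer(), which uses a seqcount so that other threads can observe a consistent (cputm, cputm_start) pair while the VCPU thread keeps updating it. Below is a minimal userspace analogue of that retry pattern in C11 atomics, written only to illustrate the logic; it is not the kernel's seqcount_t API, and now_ns() merely stands in for get_tod_clock_fast().

#include <stdatomic.h>
#include <stdint.h>
#include <time.h>

/* Illustration only: a single writer keeps (base, start) consistent; readers
 * on any thread compute base - (now - start) and retry if they raced with an
 * update. */
struct cpu_timer {
	atomic_uint      seq;    /* odd while an update is in progress */
	_Atomic uint64_t base;   /* analogue of sie_block->cputm */
	_Atomic uint64_t start;  /* analogue of cputm_start; 0 = stopped */
};

static uint64_t now_ns(void)     /* stand-in for get_tod_clock_fast() */
{
	struct timespec ts;

	clock_gettime(CLOCK_MONOTONIC, &ts);
	return (uint64_t)ts.tv_sec * 1000000000ULL + (uint64_t)ts.tv_nsec;
}

/* Writer side, like kvm_s390_set_cpu_timer(): only one writer at a time. */
static void cpu_timer_set(struct cpu_timer *t, uint64_t value)
{
	unsigned int seq = atomic_load_explicit(&t->seq, memory_order_relaxed);

	atomic_store_explicit(&t->seq, seq + 1, memory_order_relaxed);  /* odd */
	atomic_thread_fence(memory_order_release);
	if (atomic_load_explicit(&t->start, memory_order_relaxed))
		atomic_store_explicit(&t->start, now_ns(), memory_order_relaxed);
	atomic_store_explicit(&t->base, value, memory_order_relaxed);
	atomic_store_explicit(&t->seq, seq + 2, memory_order_release);  /* even */
}

/* Reader side, like kvm_s390_get_cpu_timer(): retry on a torn snapshot. */
static uint64_t cpu_timer_get(struct cpu_timer *t)
{
	unsigned int s1, s2;
	uint64_t base, start;

	do {
		s1 = atomic_load_explicit(&t->seq, memory_order_acquire);
		base  = atomic_load_explicit(&t->base, memory_order_relaxed);
		start = atomic_load_explicit(&t->start, memory_order_relaxed);
		atomic_thread_fence(memory_order_acquire);
		s2 = atomic_load_explicit(&t->seq, memory_order_relaxed);
	} while ((s1 & 1) || s1 != s2);

	return start ? base - (now_ns() - start) : base;
}

One difference worth noting: the kernel reader masks out the low sequence bit and relies on a WARN to catch the writer re-entering the read path on its own CPU, whereas the plain pattern above simply spins until the sequence is even and unchanged.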

+ 25 - 3
arch/s390/kvm/kvm-s390.h

@@ -19,6 +19,7 @@
 #include <linux/kvm.h>
 #include <linux/kvm_host.h>
 #include <asm/facility.h>
+#include <asm/processor.h>
 
 typedef int (*intercept_handler_t)(struct kvm_vcpu *vcpu);
 
@@ -53,6 +54,11 @@ static inline int is_vcpu_stopped(struct kvm_vcpu *vcpu)
 	return atomic_read(&vcpu->arch.sie_block->cpuflags) & CPUSTAT_STOPPED;
 }
 
+static inline int is_vcpu_idle(struct kvm_vcpu *vcpu)
+{
+	return atomic_read(&vcpu->arch.sie_block->cpuflags) & CPUSTAT_WAIT;
+}
+
 static inline int kvm_is_ucontrol(struct kvm *kvm)
 {
 #ifdef CONFIG_KVM_S390_UCONTROL
@@ -154,8 +160,8 @@ static inline void kvm_s390_set_psw_cc(struct kvm_vcpu *vcpu, unsigned long cc)
 /* test availability of facility in a kvm instance */
 static inline int test_kvm_facility(struct kvm *kvm, unsigned long nr)
 {
-	return __test_facility(nr, kvm->arch.model.fac->mask) &&
-		__test_facility(nr, kvm->arch.model.fac->list);
+	return __test_facility(nr, kvm->arch.model.fac_mask) &&
+		__test_facility(nr, kvm->arch.model.fac_list);
 }
 
 static inline int set_kvm_facility(u64 *fac_list, unsigned long nr)
@@ -212,8 +218,22 @@ int kvm_s390_reinject_io_int(struct kvm *kvm,
 int kvm_s390_mask_adapter(struct kvm *kvm, unsigned int id, bool masked);
 
 /* implemented in intercept.c */
-void kvm_s390_rewind_psw(struct kvm_vcpu *vcpu, int ilc);
+u8 kvm_s390_get_ilen(struct kvm_vcpu *vcpu);
 int kvm_handle_sie_intercept(struct kvm_vcpu *vcpu);
+static inline void kvm_s390_rewind_psw(struct kvm_vcpu *vcpu, int ilen)
+{
+	struct kvm_s390_sie_block *sie_block = vcpu->arch.sie_block;
+
+	sie_block->gpsw.addr = __rewind_psw(sie_block->gpsw, ilen);
+}
+static inline void kvm_s390_forward_psw(struct kvm_vcpu *vcpu, int ilen)
+{
+	kvm_s390_rewind_psw(vcpu, -ilen);
+}
+static inline void kvm_s390_retry_instr(struct kvm_vcpu *vcpu)
+{
+	kvm_s390_rewind_psw(vcpu, kvm_s390_get_ilen(vcpu));
+}
 
 /* implemented in priv.c */
 int is_valid_psw(psw_t *psw);
@@ -248,6 +268,8 @@ int kvm_s390_vcpu_setup_cmma(struct kvm_vcpu *vcpu);
 void kvm_s390_vcpu_unsetup_cmma(struct kvm_vcpu *vcpu);
 unsigned long kvm_s390_fac_list_mask_size(void);
 extern unsigned long kvm_s390_fac_list_mask[];
+void kvm_s390_set_cpu_timer(struct kvm_vcpu *vcpu, __u64 cputm);
+__u64 kvm_s390_get_cpu_timer(struct kvm_vcpu *vcpu);
 
 /* implemented in diag.c */
 int kvm_s390_handle_diag(struct kvm_vcpu *vcpu);
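
The header now exports kvm_s390_set_cpu_timer()/kvm_s390_get_cpu_timer(), which back the KVM_REG_S390_CPU_TIMER accessors changed earlier in kvm-s390.c. From userspace the timer is still reached through the generic KVM_GET_ONE_REG/KVM_SET_ONE_REG ioctls; a minimal sketch follows, assuming an already created VCPU file descriptor vcpu_fd and an s390 build where asm/kvm.h defines the register id.

#include <linux/kvm.h>
#include <sys/ioctl.h>
#include <stdint.h>
#include <stdio.h>

/* Read the guest CPU timer of one VCPU and write the same value back.
 * vcpu_fd is assumed to come from KVM_CREATE_VCPU elsewhere. */
static int save_restore_cpu_timer(int vcpu_fd)
{
	uint64_t cputm;
	struct kvm_one_reg reg = {
		.id   = KVM_REG_S390_CPU_TIMER,
		.addr = (uint64_t)(uintptr_t)&cputm,
	};

	if (ioctl(vcpu_fd, KVM_GET_ONE_REG, &reg) < 0)
		return -1;      /* now serviced by kvm_s390_get_cpu_timer() */

	printf("guest cpu timer: %llu\n", (unsigned long long)cputm);

	/* Setting the register goes through kvm_s390_set_cpu_timer(), which
	 * restarts accounting relative to the current TOD clock. */
	if (ioctl(vcpu_fd, KVM_SET_ONE_REG, &reg) < 0)
		return -1;
	return 0;
}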

+ 8 - 7
arch/s390/kvm/priv.c

@@ -173,7 +173,7 @@ static int handle_skey(struct kvm_vcpu *vcpu)
 	if (vcpu->arch.sie_block->gpsw.mask & PSW_MASK_PSTATE)
 		return kvm_s390_inject_program_int(vcpu, PGM_PRIVILEGED_OP);
 
-	kvm_s390_rewind_psw(vcpu, 4);
+	kvm_s390_retry_instr(vcpu);
 	VCPU_EVENT(vcpu, 4, "%s", "retrying storage key operation");
 	return 0;
 }
@@ -184,7 +184,7 @@ static int handle_ipte_interlock(struct kvm_vcpu *vcpu)
 	if (psw_bits(vcpu->arch.sie_block->gpsw).p)
 		return kvm_s390_inject_program_int(vcpu, PGM_PRIVILEGED_OP);
 	wait_event(vcpu->kvm->arch.ipte_wq, !ipte_lock_held(vcpu));
-	kvm_s390_rewind_psw(vcpu, 4);
+	kvm_s390_retry_instr(vcpu);
 	VCPU_EVENT(vcpu, 4, "%s", "retrying ipte interlock operation");
 	return 0;
 }
@@ -354,7 +354,7 @@ static int handle_stfl(struct kvm_vcpu *vcpu)
 	 * We need to shift the lower 32 facility bits (bit 0-31) from a u64
 	 * into a u32 memory representation. They will remain bits 0-31.
 	 */
-	fac = *vcpu->kvm->arch.model.fac->list >> 32;
+	fac = *vcpu->kvm->arch.model.fac_list >> 32;
 	rc = write_guest_lc(vcpu, offsetof(struct lowcore, stfl_fac_list),
 			    &fac, sizeof(fac));
 	if (rc)
@@ -759,8 +759,8 @@ static int handle_essa(struct kvm_vcpu *vcpu)
 	if (((vcpu->arch.sie_block->ipb & 0xf0000000) >> 28) > 6)
 		return kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION);
 
-	/* Rewind PSW to repeat the ESSA instruction */
-	kvm_s390_rewind_psw(vcpu, 4);
+	/* Retry the ESSA instruction */
+	kvm_s390_retry_instr(vcpu);
 	vcpu->arch.sie_block->cbrlo &= PAGE_MASK;	/* reset nceo */
 	cbrlo = phys_to_virt(vcpu->arch.sie_block->cbrlo);
 	down_read(&gmap->mm->mmap_sem);
@@ -981,11 +981,12 @@ static int handle_tprot(struct kvm_vcpu *vcpu)
 		return -EOPNOTSUPP;
 	if (vcpu->arch.sie_block->gpsw.mask & PSW_MASK_DAT)
 		ipte_lock(vcpu);
-	ret = guest_translate_address(vcpu, address1, ar, &gpa, 1);
+	ret = guest_translate_address(vcpu, address1, ar, &gpa, GACC_STORE);
 	if (ret == PGM_PROTECTION) {
 		/* Write protected? Try again with read-only... */
 		cc = 1;
-		ret = guest_translate_address(vcpu, address1, ar, &gpa, 0);
+		ret = guest_translate_address(vcpu, address1, ar, &gpa,
+					      GACC_FETCH);
 	}
 	if (ret) {
 		if (ret == PGM_ADDRESSING || ret == PGM_TRANSLATION_SPEC) {
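
The priv.c changes above retry an instruction with kvm_s390_retry_instr() instead of a hard-coded kvm_s390_rewind_psw(vcpu, 4), so the rewind distance now comes from the intercepted instruction itself. On s390 the length is encoded in the two most significant bits of the first opcode byte (00 = 2 bytes, 01/10 = 4 bytes, 11 = 6 bytes). The sketch below shows that decoding and the rewind/forward relationship; the helper names are illustrative, and the kernel's insn_length()/__rewind_psw() additionally handle the PSW addressing-mode wrap, which is omitted here.

#include <stdint.h>

/* Decode the instruction length from the first opcode byte. */
static unsigned int ilen_from_opcode(uint8_t opcode)
{
	switch (opcode >> 6) {
	case 0:				/* 00 */
		return 2;
	case 3:				/* 11 */
		return 6;
	default:			/* 01 and 10 */
		return 4;
	}
}

/* Rewinding by ilen re-executes the current instruction; forwarding is just
 * rewinding by -ilen, which is how kvm_s390_forward_psw() is defined.
 * A negative ilen moves the address forward via modular arithmetic. */
static uint64_t psw_rewind(uint64_t addr, int ilen)
{
	return addr - (uint64_t)ilen;	/* 64-bit mode only; no 24/31-bit wrap */
}

static uint64_t psw_retry_addr(uint64_t addr, uint8_t opcode)
{
	return psw_rewind(addr, (int)ilen_from_opcode(opcode));
}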

+ 20 - 11
arch/x86/include/asm/kvm_host.h

@@ -32,6 +32,7 @@
 #include <asm/mtrr.h>
 #include <asm/msr-index.h>
 #include <asm/asm.h>
+#include <asm/kvm_page_track.h>
 
 #define KVM_MAX_VCPUS 255
 #define KVM_SOFT_MAX_VCPUS 160
@@ -214,6 +215,14 @@ struct kvm_mmu_memory_cache {
 	void *objects[KVM_NR_MEM_OBJS];
 };
 
+/*
+ * The pages used as guest page tables on the soft MMU are tracked by
+ * kvm_memory_slot.arch.gfn_track, which is 16 bits, so the role bits used
+ * by an indirect shadow page cannot be more than 15 bits.
+ *
+ * Currently, we use 14 bits: @level, @cr4_pae, @quadrant, @access,
+ * @nxe, @cr0_wp, @smep_andnot_wp and @smap_andnot_wp.
+ */
 union kvm_mmu_page_role {
 	unsigned word;
 	struct {
@@ -276,7 +285,7 @@ struct kvm_mmu_page {
 #endif
 
 	/* Number of writes since the last time traversal visited this page.  */
-	int write_flooding_count;
+	atomic_t write_flooding_count;
 };
 
 struct kvm_pio_request {
@@ -338,12 +347,8 @@ struct kvm_mmu {
 
 	struct rsvd_bits_validate guest_rsvd_check;
 
-	/*
-	 * Bitmap: bit set = last pte in walk
-	 * index[0:1]: level (zero-based)
-	 * index[2]: pte.ps
-	 */
-	u8 last_pte_bitmap;
+	/* Can have large pages at levels 2..last_nonleaf_level-1. */
+	u8 last_nonleaf_level;
 
 	bool nx;
 
@@ -498,7 +503,6 @@ struct kvm_vcpu_arch {
 	struct kvm_mmu_memory_cache mmu_page_header_cache;
 
 	struct fpu guest_fpu;
-	bool eager_fpu;
 	u64 xcr0;
 	u64 guest_supported_xcr0;
 	u32 guest_xstate_size;
@@ -644,12 +648,13 @@ struct kvm_vcpu_arch {
 };
 
 struct kvm_lpage_info {
-	int write_count;
+	int disallow_lpage;
 };
 
 struct kvm_arch_memory_slot {
 	struct kvm_rmap_head *rmap[KVM_NR_PAGE_SIZES];
 	struct kvm_lpage_info *lpage_info[KVM_NR_PAGE_SIZES - 1];
+	unsigned short *gfn_track[KVM_PAGE_TRACK_MAX];
 };
 
 /*
@@ -694,6 +699,8 @@ struct kvm_arch {
 	 */
 	struct list_head active_mmu_pages;
 	struct list_head zapped_obsolete_pages;
+	struct kvm_page_track_notifier_node mmu_sp_tracker;
+	struct kvm_page_track_notifier_head track_notifier_head;
 
 	struct list_head assigned_dev_head;
 	struct iommu_domain *iommu_domain;
@@ -754,6 +761,8 @@ struct kvm_arch {
 
 	bool irqchip_split;
 	u8 nr_reserved_ioapic_pins;
+
+	bool disabled_lapic_found;
 };
 
 struct kvm_vm_stat {
@@ -988,6 +997,8 @@ void kvm_mmu_module_exit(void);
 void kvm_mmu_destroy(struct kvm_vcpu *vcpu);
 int kvm_mmu_create(struct kvm_vcpu *vcpu);
 void kvm_mmu_setup(struct kvm_vcpu *vcpu);
+void kvm_mmu_init_vm(struct kvm *kvm);
+void kvm_mmu_uninit_vm(struct kvm *kvm);
 void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask,
 		u64 dirty_mask, u64 nx_mask, u64 x_mask);
 
@@ -1127,8 +1138,6 @@ void kvm_pic_clear_all(struct kvm_pic *pic, int irq_source_id);
 
 void kvm_inject_nmi(struct kvm_vcpu *vcpu);
 
-void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
-		       const u8 *new, int bytes);
 int kvm_mmu_unprotect_page(struct kvm *kvm, gfn_t gfn);
 int kvm_mmu_unprotect_page_virt(struct kvm_vcpu *vcpu, gva_t gva);
 void __kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu);
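
The comment added to union kvm_mmu_page_role ties the 15-bit role limit to the new per-gfn tracking array: kvm_arch_memory_slot now carries one 16-bit counter per guest frame and tracking mode, and each distinct shadow-page role that uses a frame as a page table accounts one reference. A rough illustration of that bookkeeping is sketched below; it is not the actual kvm_page_track API, only the counting idea.

#include <assert.h>
#include <stdint.h>

/* Illustration only: one 16-bit write-track counter per guest frame number
 * in a memory slot.  A frame stays write-protected while its counter is
 * non-zero; since at most 2^15 distinct indirect roles can pin a frame,
 * a 16-bit counter cannot overflow. */
struct slot_track {
	uint16_t *gfn_track;	/* one counter per gfn in the slot */
	uint64_t  base_gfn;
	uint64_t  npages;
};

static void track_add(struct slot_track *s, uint64_t gfn)
{
	uint64_t idx = gfn - s->base_gfn;

	assert(idx < s->npages);
	assert(s->gfn_track[idx] < UINT16_MAX);
	s->gfn_track[idx]++;		/* first tracker write-protects the frame */
}

static void track_remove(struct slot_track *s, uint64_t gfn)
{
	uint64_t idx = gfn - s->base_gfn;

	assert(idx < s->npages && s->gfn_track[idx] > 0);
	s->gfn_track[idx]--;		/* last tracker allows write access again */
}

static int track_is_active(const struct slot_track *s, uint64_t gfn)
{
	return s->gfn_track[gfn - s->base_gfn] != 0;
}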

Too many files were changed in this diff, so some files are not shown.