
Merge tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm

Pull more kvm updates from Paolo Bonzini:
 "ARM:
   - Full debug support for arm64
   - Active state switching for timer interrupts
   - Lazy FP/SIMD save/restore for arm64
   - Generic ARMv8 target

  PPC:
   - Book3S: A few bug fixes
   - Book3S: Allow micro-threading on POWER8

  x86:
   - Compiler warnings

  Generic:
   - Adaptive polling for guest halt"

* tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm: (49 commits)
  kvm: irqchip: fix memory leak
  kvm: move new trace event outside #ifdef CONFIG_KVM_ASYNC_PF
  KVM: trace kvm_halt_poll_ns grow/shrink
  KVM: dynamic halt-polling
  KVM: make halt_poll_ns per-vCPU
  Silence compiler warning in arch/x86/kvm/emulate.c
  kvm: compile process_smi_save_seg_64() only for x86_64
  KVM: x86: avoid uninitialized variable warning
  KVM: PPC: Book3S: Fix typo in top comment about locking
  KVM: PPC: Book3S: Fix size of the PSPB register
  KVM: PPC: Book3S HV: Exit on H_DOORBELL if HOST_IPI is set
  KVM: PPC: Book3S HV: Fix race in starting secondary threads
  KVM: PPC: Book3S: correct width in XER handling
  KVM: PPC: Book3S HV: Fix preempted vcore stolen time calculation
  KVM: PPC: Book3S HV: Fix preempted vcore list locking
  KVM: PPC: Book3S HV: Implement H_CLEAR_REF and H_CLEAR_MOD
  KVM: PPC: Book3S HV: Fix bug in dirty page tracking
  KVM: PPC: Book3S HV: Fix race in reading change bit when removing HPTE
  KVM: PPC: Book3S HV: Implement dynamic micro-threading on POWER8
  KVM: PPC: Book3S HV: Make use of unused threads when running guests
  ...
Linus Torvalds, 10 years ago
Commit: 519f526d39
62 changed files with 2,653 additions and 700 deletions
  1. Documentation/virtual/kvm/api.txt (+11, -4)
  2. arch/arm/include/asm/kvm_host.h (+5, -0)
  3. arch/arm/kvm/arm.c (+25, -11)
  4. arch/arm/kvm/guest.c (+6, -0)
  5. arch/arm/kvm/interrupts.S (+8, -6)
  6. arch/arm/kvm/reset.c (+1, -3)
  7. arch/arm64/include/asm/hw_breakpoint.h (+14, -0)
  8. arch/arm64/include/asm/kvm_arm.h (+4, -1)
  9. arch/arm64/include/asm/kvm_asm.h (+10, -16)
  10. arch/arm64/include/asm/kvm_host.h (+36, -6)
  11. arch/arm64/include/uapi/asm/kvm.h (+35, -2)
  12. arch/arm64/kernel/asm-offsets.c (+7, -2)
  13. arch/arm64/kernel/hw_breakpoint.c (+0, -12)
  14. arch/arm64/kvm/Makefile (+1, -1)
  15. arch/arm64/kvm/debug.c (+217, -0)
  16. arch/arm64/kvm/guest.c (+42, -1)
  17. arch/arm64/kvm/handle_exit.c (+44, -0)
  18. arch/arm64/kvm/hyp.S (+218, -399)
  19. arch/arm64/kvm/reset.c (+17, -3)
  20. arch/arm64/kvm/sys_regs.c (+269, -22)
  21. arch/arm64/kvm/sys_regs.h (+6, -0)
  22. arch/arm64/kvm/sys_regs_generic_v8.c (+2, -0)
  23. arch/arm64/kvm/trace.h (+123, -0)
  24. arch/powerpc/include/asm/kvm_book3s.h (+3, -2)
  25. arch/powerpc/include/asm/kvm_book3s_asm.h (+21, -1)
  26. arch/powerpc/include/asm/kvm_booke.h (+2, -2)
  27. arch/powerpc/include/asm/kvm_host.h (+20, -6)
  28. arch/powerpc/include/asm/ppc-opcode.h (+1, -1)
  29. arch/powerpc/kernel/asm-offsets.c (+9, -0)
  30. arch/powerpc/kvm/Kconfig (+4, -4)
  31. arch/powerpc/kvm/book3s.c (+2, -1)
  32. arch/powerpc/kvm/book3s_32_mmu_host.c (+1, -0)
  33. arch/powerpc/kvm/book3s_64_mmu_host.c (+1, -0)
  34. arch/powerpc/kvm/book3s_64_mmu_hv.c (+7, -1)
  35. arch/powerpc/kvm/book3s_emulate.c (+1, -0)
  36. arch/powerpc/kvm/book3s_hv.c (+582, -82)
  37. arch/powerpc/kvm/book3s_hv_builtin.c (+28, -4)
  38. arch/powerpc/kvm/book3s_hv_rm_mmu.c (+148, -13)
  39. arch/powerpc/kvm/book3s_hv_rm_xics.c (+1, -3)
  40. arch/powerpc/kvm/book3s_hv_rmhandlers.S (+117, -20)
  41. arch/powerpc/kvm/book3s_paired_singles.c (+1, -1)
  42. arch/powerpc/kvm/book3s_segment.S (+2, -2)
  43. arch/powerpc/kvm/book3s_xics.c (+1, -1)
  44. arch/powerpc/kvm/booke.c (+1, -0)
  45. arch/powerpc/kvm/e500_mmu.c (+1, -1)
  46. arch/powerpc/kvm/powerpc.c (+1, -1)
  47. arch/x86/kvm/emulate.c (+1, -1)
  48. arch/x86/kvm/mmu.c (+4, -3)
  49. arch/x86/kvm/x86.c (+2, -0)
  50. include/kvm/arm_arch_timer.h (+5, -2)
  51. include/kvm/arm_vgic.h (+36, -3)
  52. include/linux/irqchip/arm-gic-v3.h (+3, -0)
  53. include/linux/irqchip/arm-gic.h (+2, -1)
  54. include/linux/kvm_host.h (+1, -0)
  55. include/trace/events/kvm.h (+30, -0)
  56. include/uapi/linux/kvm.h (+5, -0)
  57. virt/kvm/arm/arch_timer.c (+22, -7)
  58. virt/kvm/arm/vgic-v2.c (+15, -1)
  59. virt/kvm/arm/vgic-v3.c (+18, -3)
  60. virt/kvm/arm/vgic.c (+389, -38)
  61. virt/kvm/irqchip.c (+6, -2)
  62. virt/kvm/kvm_main.c (+58, -4)

+ 11 - 4
Documentation/virtual/kvm/api.txt

@@ -2671,7 +2671,7 @@ handled.
 4.87 KVM_SET_GUEST_DEBUG
 
 Capability: KVM_CAP_SET_GUEST_DEBUG
-Architectures: x86, s390, ppc
+Architectures: x86, s390, ppc, arm64
 Type: vcpu ioctl
 Parameters: struct kvm_guest_debug (in)
 Returns: 0 on success; -1 on error
@@ -2693,8 +2693,8 @@ when running. Common control bits are:
 The top 16 bits of the control field are architecture specific control
 flags which can include the following:
 
-  - KVM_GUESTDBG_USE_SW_BP:     using software breakpoints [x86]
-  - KVM_GUESTDBG_USE_HW_BP:     using hardware breakpoints [x86, s390]
+  - KVM_GUESTDBG_USE_SW_BP:     using software breakpoints [x86, arm64]
+  - KVM_GUESTDBG_USE_HW_BP:     using hardware breakpoints [x86, s390, arm64]
   - KVM_GUESTDBG_INJECT_DB:     inject DB type exception [x86]
   - KVM_GUESTDBG_INJECT_BP:     inject BP type exception [x86]
   - KVM_GUESTDBG_EXIT_PENDING:  trigger an immediate guest exit [s390]
@@ -2709,6 +2709,11 @@ updated to the correct (supplied) values.
 The second part of the structure is architecture specific and
 typically contains a set of debug registers.
 
+For arm64 the number of debug registers is implementation defined and
+can be determined by querying the KVM_CAP_GUEST_DEBUG_HW_BPS and
+KVM_CAP_GUEST_DEBUG_HW_WPS capabilities which return a positive number
+indicating the number of supported registers.
+
 When debug events exit the main run loop with the reason
 KVM_EXIT_DEBUG with the kvm_debug_exit_arch part of the kvm_run
 structure containing architecture specific debug information.
@@ -3111,11 +3116,13 @@ data_offset describes where the data is located (KVM_EXIT_IO_OUT) or
 where kvm expects application code to place the data for the next
 KVM_RUN invocation (KVM_EXIT_IO_IN).  Data format is a packed array.
 
+		/* KVM_EXIT_DEBUG */
 		struct {
 			struct kvm_debug_exit_arch arch;
 		} debug;
 
-Unused.
+If the exit_reason is KVM_EXIT_DEBUG, then a vcpu is processing a debug event
+for which architecture specific information is returned.
 
 		/* KVM_EXIT_MMIO */
 		struct {

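Note: a minimal userspace sketch of the capability query described above, using the generic KVM_CHECK_EXTENSION ioctl on /dev/kvm; the open/error handling is pared down and the printed output is illustrative only.

	#include <fcntl.h>
	#include <stdio.h>
	#include <sys/ioctl.h>
	#include <linux/kvm.h>

	int main(void)
	{
		int kvm = open("/dev/kvm", O_RDWR);

		/* Each capability returns the number of supported registers */
		int bps = ioctl(kvm, KVM_CHECK_EXTENSION, KVM_CAP_GUEST_DEBUG_HW_BPS);
		int wps = ioctl(kvm, KVM_CHECK_EXTENSION, KVM_CAP_GUEST_DEBUG_HW_WPS);

		printf("hw breakpoints: %d, hw watchpoints: %d\n", bps, wps);
		return 0;
	}
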
+ 5 - 0
arch/arm/include/asm/kvm_host.h

@@ -231,4 +231,9 @@ static inline void kvm_arch_sync_events(struct kvm *kvm) {}
 static inline void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu) {}
 static inline void kvm_arch_sched_in(struct kvm_vcpu *vcpu, int cpu) {}
 
+static inline void kvm_arm_init_debug(void) {}
+static inline void kvm_arm_setup_debug(struct kvm_vcpu *vcpu) {}
+static inline void kvm_arm_clear_debug(struct kvm_vcpu *vcpu) {}
+static inline void kvm_arm_reset_debug_ptr(struct kvm_vcpu *vcpu) {}
+
 #endif /* __ARM_KVM_HOST_H__ */

+ 25 - 11
arch/arm/kvm/arm.c

@@ -125,6 +125,7 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
 	if (ret)
 		goto out_free_stage2_pgd;
 
+	kvm_vgic_early_init(kvm);
 	kvm_timer_init(kvm);
 
 	/* Mark the initial VMID generation invalid */
@@ -249,6 +250,7 @@ out:
 
 
 void kvm_arch_vcpu_postcreate(struct kvm_vcpu *vcpu)
 {
+	kvm_vgic_vcpu_early_init(vcpu);
 }
 
 void kvm_arch_vcpu_free(struct kvm_vcpu *vcpu)
@@ -278,6 +280,8 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu)
 	/* Set up the timer */
 	kvm_timer_vcpu_init(vcpu);
 
+	kvm_arm_reset_debug_ptr(vcpu);
+
 	return 0;
 }
 
@@ -301,13 +305,6 @@ void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
 	kvm_arm_set_running_vcpu(NULL);
 }
 
-int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu,
-					struct kvm_guest_debug *dbg)
-{
-	return -EINVAL;
-}
-
-
 int kvm_arch_vcpu_ioctl_get_mpstate(struct kvm_vcpu *vcpu,
 				    struct kvm_mp_state *mp_state)
 {
@@ -528,10 +525,20 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run)
 		if (vcpu->arch.pause)
 			vcpu_pause(vcpu);
 
-		kvm_vgic_flush_hwstate(vcpu);
+		/*
+		 * Disarming the background timer must be done in a
+		 * preemptible context, as this call may sleep.
+		 */
 		kvm_timer_flush_hwstate(vcpu);
 
+		/*
+		 * Preparing the interrupts to be injected also
+		 * involves poking the GIC, which must be done in a
+		 * non-preemptible context.
+		 */
 		preempt_disable();
+		kvm_vgic_flush_hwstate(vcpu);
+
 		local_irq_disable();
 
 		/*
@@ -544,12 +551,14 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run)
 
 
 		if (ret <= 0 || need_new_vmid_gen(vcpu->kvm)) {
 			local_irq_enable();
+			kvm_vgic_sync_hwstate(vcpu);
 			preempt_enable();
 			kvm_timer_sync_hwstate(vcpu);
-			kvm_vgic_sync_hwstate(vcpu);
 			continue;
 		}
 
+		kvm_arm_setup_debug(vcpu);
+
 		/**************************************************************
 		 * Enter the guest
 		 */
@@ -564,6 +573,8 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run)
 		 * Back from guest
 		 *************************************************************/
 
+		kvm_arm_clear_debug(vcpu);
+
 		/*
 		 * We may have taken a host interrupt in HYP mode (ie
 		 * while executing the guest). This interrupt is still
@@ -586,11 +597,12 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run)
 		 */
 		kvm_guest_exit();
 		trace_kvm_exit(kvm_vcpu_trap_get_class(vcpu), *vcpu_pc(vcpu));
-		preempt_enable();
 
+		kvm_vgic_sync_hwstate(vcpu);
+
+		preempt_enable();
 
 		kvm_timer_sync_hwstate(vcpu);
-		kvm_vgic_sync_hwstate(vcpu);
 
 		ret = handle_exit(vcpu, run, ret);
 	}
@@ -921,6 +933,8 @@ static void cpu_init_hyp_mode(void *dummy)
 	vector_ptr = (unsigned long)__kvm_hyp_vector;
 
 	__cpu_init_hyp_mode(boot_pgd_ptr, pgd_ptr, hyp_stack_ptr, vector_ptr);
+
+	kvm_arm_init_debug();
 }
 
 static int hyp_init_cpu_notify(struct notifier_block *self,

+ 6 - 0
arch/arm/kvm/guest.c

@@ -290,3 +290,9 @@ int kvm_arch_vcpu_ioctl_translate(struct kvm_vcpu *vcpu,
 {
 	return -EINVAL;
 }
+
+int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu,
+					struct kvm_guest_debug *dbg)
+{
+	return -EINVAL;
+}

+ 8 - 6
arch/arm/kvm/interrupts.S

@@ -361,10 +361,6 @@ hyp_hvc:
 	@ Check syndrome register
 	mrc	p15, 4, r1, c5, c2, 0	@ HSR
 	lsr	r0, r1, #HSR_EC_SHIFT
-#ifdef CONFIG_VFPv3
-	cmp	r0, #HSR_EC_CP_0_13
-	beq	switch_to_guest_vfp
-#endif
 	cmp	r0, #HSR_EC_HVC
 	bne	guest_trap		@ Not HVC instr.
 
@@ -378,7 +374,10 @@ hyp_hvc:
 	cmp     r2, #0
 	bne	guest_trap		@ Guest called HVC
 
-host_switch_to_hyp:
+	/*
+	 * Getting here means host called HVC, we shift parameters and branch
+	 * to Hyp function.
+	 */
 	pop	{r0, r1, r2}
 
 	/* Check for __hyp_get_vectors */
@@ -409,6 +408,10 @@ guest_trap:
 
 
 	@ Check if we need the fault information
 	lsr	r1, r1, #HSR_EC_SHIFT
+#ifdef CONFIG_VFPv3
+	cmp	r1, #HSR_EC_CP_0_13
+	beq	switch_to_guest_vfp
+#endif
 	cmp	r1, #HSR_EC_IABT
 	mrceq	p15, 4, r2, c6, c0, 2	@ HIFAR
 	beq	2f
@@ -477,7 +480,6 @@ guest_trap:
  */
 #ifdef CONFIG_VFPv3
 switch_to_guest_vfp:
-	load_vcpu			@ Load VCPU pointer to r0
 	push	{r3-r7}
 
 	@ NEON/VFP used.  Turn on VFP access.

+ 1 - 3
arch/arm/kvm/reset.c

@@ -77,7 +77,5 @@ int kvm_reset_vcpu(struct kvm_vcpu *vcpu)
 	kvm_reset_coprocs(vcpu);
 
 	/* Reset arch_timer context */
-	kvm_timer_vcpu_reset(vcpu, cpu_vtimer_irq);
-
-	return 0;
+	return kvm_timer_vcpu_reset(vcpu, cpu_vtimer_irq);
 }

+ 14 - 0
arch/arm64/include/asm/hw_breakpoint.h

@@ -16,6 +16,8 @@
 #ifndef __ASM_HW_BREAKPOINT_H
 #define __ASM_HW_BREAKPOINT_H
 
+#include <asm/cputype.h>
+
 #ifdef __KERNEL__
 
 struct arch_hw_breakpoint_ctrl {
@@ -132,5 +134,17 @@ static inline void ptrace_hw_copy_thread(struct task_struct *task)
 
 
 extern struct pmu perf_ops_bp;
 
+/* Determine number of BRP registers available. */
+static inline int get_num_brps(void)
+{
+	return ((read_cpuid(ID_AA64DFR0_EL1) >> 12) & 0xf) + 1;
+}
+
+/* Determine number of WRP registers available. */
+static inline int get_num_wrps(void)
+{
+	return ((read_cpuid(ID_AA64DFR0_EL1) >> 20) & 0xf) + 1;
+}
+
 #endif	/* __KERNEL__ */
 #endif	/* __ASM_BREAKPOINT_H */

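Note: the two helpers above decode ID_AA64DFR0_EL1, whose BRPs field (bits [15:12]) and WRPs field (bits [23:20]) hold one less than the number of implemented breakpoint/watchpoint register pairs. A standalone sketch of the same arithmetic, using an assumed sample register value:

	#include <stdio.h>
	#include <stdint.h>

	int main(void)
	{
		uint64_t dfr0 = 0x0000000000305106ULL; /* hypothetical ID_AA64DFR0_EL1 */
		int brps = ((dfr0 >> 12) & 0xf) + 1;   /* field 0x5 -> 6 breakpoints */
		int wrps = ((dfr0 >> 20) & 0xf) + 1;   /* field 0x3 -> 4 watchpoints */

		printf("BRPs=%d WRPs=%d\n", brps, wrps);
		return 0;
	}
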
+ 4 - 1
arch/arm64/include/asm/kvm_arm.h

@@ -171,10 +171,13 @@
 #define HSTR_EL2_TTEE	(1 << 16)
 #define HSTR_EL2_T(x)	(1 << x)
 
+/* Hyp Coprocessor Trap Register Shifts */
+#define CPTR_EL2_TFP_SHIFT 10
+
 /* Hyp Coprocessor Trap Register */
 #define CPTR_EL2_TCPAC	(1 << 31)
 #define CPTR_EL2_TTA	(1 << 20)
-#define CPTR_EL2_TFP	(1 << 10)
+#define CPTR_EL2_TFP	(1 << CPTR_EL2_TFP_SHIFT)
 
 /* Hyp Debug Configuration Register bits */
 #define MDCR_EL2_TDRA		(1 << 11)

+ 10 - 16
arch/arm64/include/asm/kvm_asm.h

@@ -46,24 +46,16 @@
 #define	CNTKCTL_EL1	20	/* Timer Control Register (EL1) */
 #define	PAR_EL1		21	/* Physical Address Register */
 #define MDSCR_EL1	22	/* Monitor Debug System Control Register */
-#define DBGBCR0_EL1	23	/* Debug Breakpoint Control Registers (0-15) */
-#define DBGBCR15_EL1	38
-#define DBGBVR0_EL1	39	/* Debug Breakpoint Value Registers (0-15) */
-#define DBGBVR15_EL1	54
-#define DBGWCR0_EL1	55	/* Debug Watchpoint Control Registers (0-15) */
-#define DBGWCR15_EL1	70
-#define DBGWVR0_EL1	71	/* Debug Watchpoint Value Registers (0-15) */
-#define DBGWVR15_EL1	86
-#define MDCCINT_EL1	87	/* Monitor Debug Comms Channel Interrupt Enable Reg */
+#define MDCCINT_EL1	23	/* Monitor Debug Comms Channel Interrupt Enable Reg */
 
 /* 32bit specific registers. Keep them at the end of the range */
-#define	DACR32_EL2	88	/* Domain Access Control Register */
-#define	IFSR32_EL2	89	/* Instruction Fault Status Register */
-#define	FPEXC32_EL2	90	/* Floating-Point Exception Control Register */
-#define	DBGVCR32_EL2	91	/* Debug Vector Catch Register */
-#define	TEECR32_EL1	92	/* ThumbEE Configuration Register */
-#define	TEEHBR32_EL1	93	/* ThumbEE Handler Base Register */
-#define	NR_SYS_REGS	94
+#define	DACR32_EL2	24	/* Domain Access Control Register */
+#define	IFSR32_EL2	25	/* Instruction Fault Status Register */
+#define	FPEXC32_EL2	26	/* Floating-Point Exception Control Register */
+#define	DBGVCR32_EL2	27	/* Debug Vector Catch Register */
+#define	TEECR32_EL1	28	/* ThumbEE Configuration Register */
+#define	TEEHBR32_EL1	29	/* ThumbEE Handler Base Register */
+#define	NR_SYS_REGS	30
 
 /* 32bit mapping */
 #define c0_MPIDR	(MPIDR_EL1 * 2)	/* MultiProcessor ID Register */
@@ -132,6 +124,8 @@ extern int __kvm_vcpu_run(struct kvm_vcpu *vcpu);
 
 
 extern u64 __vgic_v3_get_ich_vtr_el2(void);
 
+extern u32 __kvm_get_mdcr_el2(void);
+
 #endif
 
 #endif /* __ARM_KVM_ASM_H__ */

+ 36 - 6
arch/arm64/include/asm/kvm_host.h

@@ -103,15 +103,34 @@ struct kvm_vcpu_arch {
 
 
 	/* HYP configuration */
 	u64 hcr_el2;
+	u32 mdcr_el2;
 
 	/* Exception Information */
 	struct kvm_vcpu_fault_info fault;
 
-	/* Debug state */
+	/* Guest debug state */
 	u64 debug_flags;
 
+	/*
+	 * We maintain more than a single set of debug registers to support
+	 * debugging the guest from the host and to maintain separate host and
+	 * guest state during world switches. vcpu_debug_state are the debug
+	 * registers of the vcpu as the guest sees them.  host_debug_state are
+	 * the host registers which are saved and restored during
+	 * world switches. external_debug_state contains the debug
+	 * values we want to debug the guest. This is set via the
+	 * KVM_SET_GUEST_DEBUG ioctl.
+	 *
+	 * debug_ptr points to the set of debug registers that should be loaded
+	 * onto the hardware when running the guest.
+	 */
+	struct kvm_guest_debug_arch *debug_ptr;
+	struct kvm_guest_debug_arch vcpu_debug_state;
+	struct kvm_guest_debug_arch external_debug_state;
+
 	/* Pointer to host CPU context */
 	kvm_cpu_context_t *host_cpu_context;
+	struct kvm_guest_debug_arch host_debug_state;
 
 	/* VGIC state */
 	struct vgic_cpu vgic_cpu;
@@ -122,6 +141,17 @@ struct kvm_vcpu_arch {
 	 * here.
 	 */
 
+	/*
+	 * Guest registers we preserve during guest debugging.
+	 *
+	 * These shadow registers are updated by the kvm_handle_sys_reg
+	 * trap handler if the guest accesses or updates them while we
+	 * are using guest debug.
+	 */
+	struct {
+		u32	mdscr_el1;
+	} guest_debug_preserved;
+
 	/* Don't run the guest */
 	bool pause;
 
@@ -216,15 +246,15 @@ static inline void __cpu_init_hyp_mode(phys_addr_t boot_pgd_ptr,
 		     hyp_stack_ptr, vector_ptr);
 }
 
-struct vgic_sr_vectors {
-	void	*save_vgic;
-	void	*restore_vgic;
-};
-
 static inline void kvm_arch_hardware_disable(void) {}
 static inline void kvm_arch_hardware_unsetup(void) {}
 static inline void kvm_arch_sync_events(struct kvm *kvm) {}
 static inline void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu) {}
 static inline void kvm_arch_sched_in(struct kvm_vcpu *vcpu, int cpu) {}
 
+void kvm_arm_init_debug(void);
+void kvm_arm_setup_debug(struct kvm_vcpu *vcpu);
+void kvm_arm_clear_debug(struct kvm_vcpu *vcpu);
+void kvm_arm_reset_debug_ptr(struct kvm_vcpu *vcpu);
+
 #endif /* __ARM64_KVM_HOST_H__ */

+ 35 - 2
arch/arm64/include/uapi/asm/kvm.h

@@ -53,14 +53,20 @@ struct kvm_regs {
 	struct user_fpsimd_state fp_regs;
 };
 
-/* Supported Processor Types */
+/*
+ * Supported CPU Targets - Adding a new target type is not recommended,
+ * unless there are some special registers not supported by the
+ * genericv8 syreg table.
+ */
 #define KVM_ARM_TARGET_AEM_V8		0
 #define KVM_ARM_TARGET_FOUNDATION_V8	1
 #define KVM_ARM_TARGET_CORTEX_A57	2
 #define KVM_ARM_TARGET_XGENE_POTENZA	3
 #define KVM_ARM_TARGET_CORTEX_A53	4
+/* Generic ARM v8 target */
+#define KVM_ARM_TARGET_GENERIC_V8	5
 
-#define KVM_ARM_NUM_TARGETS		5
+#define KVM_ARM_NUM_TARGETS		6
 
 /* KVM_ARM_SET_DEVICE_ADDR ioctl id encoding */
 #define KVM_ARM_DEVICE_TYPE_SHIFT	0
@@ -100,12 +106,39 @@ struct kvm_sregs {
 struct kvm_fpu {
 };
 
+/*
+ * See v8 ARM ARM D7.3: Debug Registers
+ *
+ * The architectural limit is 16 debug registers of each type although
+ * in practice there are usually less (see ID_AA64DFR0_EL1).
+ *
+ * Although the control registers are architecturally defined as 32
+ * bits wide we use a 64 bit structure here to keep parity with
+ * KVM_GET/SET_ONE_REG behaviour which treats all system registers as
+ * 64 bit values. It also allows for the possibility of the
+ * architecture expanding the control registers without having to
+ * change the userspace ABI.
+ */
+#define KVM_ARM_MAX_DBG_REGS 16
 struct kvm_guest_debug_arch {
+	__u64 dbg_bcr[KVM_ARM_MAX_DBG_REGS];
+	__u64 dbg_bvr[KVM_ARM_MAX_DBG_REGS];
+	__u64 dbg_wcr[KVM_ARM_MAX_DBG_REGS];
+	__u64 dbg_wvr[KVM_ARM_MAX_DBG_REGS];
 };
 
 struct kvm_debug_exit_arch {
+	__u32 hsr;
+	__u64 far;	/* used for watchpoints */
 };
 
+/*
+ * Architecture specific defines for kvm_guest_debug->control
+ */
+
+#define KVM_GUESTDBG_USE_SW_BP		(1 << 16)
+#define KVM_GUESTDBG_USE_HW		(1 << 17)
+
 struct kvm_sync_regs {
 };

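Note: a rough sketch of populating the new structure for one hardware breakpoint and handing it to KVM_SET_GUEST_DEBUG. The dbg_bcr encoding (enable bit, privilege and byte-address-select fields) follows the ARM ARM and is simplified here; vcpu_fd and the breakpoint address are assumptions.

	#include <string.h>
	#include <sys/ioctl.h>
	#include <linux/kvm.h>

	void set_hw_breakpoint(int vcpu_fd, __u64 addr)
	{
		struct kvm_guest_debug dbg;

		memset(&dbg, 0, sizeof(dbg));
		dbg.control = KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_HW;
		dbg.arch.dbg_bvr[0] = addr;	/* breakpoint value (address) */
		/* E=1, PMC=0b11 (EL1&EL0), BAS=0b1111: simplified control word */
		dbg.arch.dbg_bcr[0] = (0xf << 5) | (0x3 << 1) | 0x1;
		ioctl(vcpu_fd, KVM_SET_GUEST_DEBUG, &dbg);
	}
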
+ 7 - 2
arch/arm64/kernel/asm-offsets.c

@@ -116,17 +116,22 @@ int main(void)
   DEFINE(VCPU_FAR_EL2,		offsetof(struct kvm_vcpu, arch.fault.far_el2));
   DEFINE(VCPU_HPFAR_EL2,	offsetof(struct kvm_vcpu, arch.fault.hpfar_el2));
   DEFINE(VCPU_DEBUG_FLAGS,	offsetof(struct kvm_vcpu, arch.debug_flags));
+  DEFINE(VCPU_DEBUG_PTR,	offsetof(struct kvm_vcpu, arch.debug_ptr));
+  DEFINE(DEBUG_BCR, 		offsetof(struct kvm_guest_debug_arch, dbg_bcr));
+  DEFINE(DEBUG_BVR, 		offsetof(struct kvm_guest_debug_arch, dbg_bvr));
+  DEFINE(DEBUG_WCR, 		offsetof(struct kvm_guest_debug_arch, dbg_wcr));
+  DEFINE(DEBUG_WVR, 		offsetof(struct kvm_guest_debug_arch, dbg_wvr));
   DEFINE(VCPU_HCR_EL2,		offsetof(struct kvm_vcpu, arch.hcr_el2));
+  DEFINE(VCPU_MDCR_EL2,	offsetof(struct kvm_vcpu, arch.mdcr_el2));
   DEFINE(VCPU_IRQ_LINES,	offsetof(struct kvm_vcpu, arch.irq_lines));
   DEFINE(VCPU_HOST_CONTEXT,	offsetof(struct kvm_vcpu, arch.host_cpu_context));
+  DEFINE(VCPU_HOST_DEBUG_STATE, offsetof(struct kvm_vcpu, arch.host_debug_state));
   DEFINE(VCPU_TIMER_CNTV_CTL,	offsetof(struct kvm_vcpu, arch.timer_cpu.cntv_ctl));
   DEFINE(VCPU_TIMER_CNTV_CVAL,	offsetof(struct kvm_vcpu, arch.timer_cpu.cntv_cval));
   DEFINE(KVM_TIMER_CNTVOFF,	offsetof(struct kvm, arch.timer.cntvoff));
   DEFINE(KVM_TIMER_ENABLED,	offsetof(struct kvm, arch.timer.enabled));
   DEFINE(VCPU_KVM,		offsetof(struct kvm_vcpu, kvm));
   DEFINE(VCPU_VGIC_CPU,		offsetof(struct kvm_vcpu, arch.vgic_cpu));
-  DEFINE(VGIC_SAVE_FN,		offsetof(struct vgic_sr_vectors, save_vgic));
-  DEFINE(VGIC_RESTORE_FN,	offsetof(struct vgic_sr_vectors, restore_vgic));
   DEFINE(VGIC_V2_CPU_HCR,	offsetof(struct vgic_cpu, vgic_v2.vgic_hcr));
   DEFINE(VGIC_V2_CPU_VMCR,	offsetof(struct vgic_cpu, vgic_v2.vgic_vmcr));
   DEFINE(VGIC_V2_CPU_MISR,	offsetof(struct vgic_cpu, vgic_v2.vgic_misr));

+ 0 - 12
arch/arm64/kernel/hw_breakpoint.c

@@ -48,18 +48,6 @@ static DEFINE_PER_CPU(int, stepping_kernel_bp);
 static int core_num_brps;
 static int core_num_wrps;
 
-/* Determine number of BRP registers available. */
-static int get_num_brps(void)
-{
-	return ((read_cpuid(ID_AA64DFR0_EL1) >> 12) & 0xf) + 1;
-}
-
-/* Determine number of WRP registers available. */
-static int get_num_wrps(void)
-{
-	return ((read_cpuid(ID_AA64DFR0_EL1) >> 20) & 0xf) + 1;
-}
-
 int hw_breakpoint_slots(int type)
 {
 	/*

+ 1 - 1
arch/arm64/kvm/Makefile

@@ -17,7 +17,7 @@ kvm-$(CONFIG_KVM_ARM_HOST) += $(ARM)/psci.o $(ARM)/perf.o
 
 
 kvm-$(CONFIG_KVM_ARM_HOST) += emulate.o inject_fault.o regmap.o
 kvm-$(CONFIG_KVM_ARM_HOST) += hyp.o hyp-init.o handle_exit.o
-kvm-$(CONFIG_KVM_ARM_HOST) += guest.o reset.o sys_regs.o sys_regs_generic_v8.o
+kvm-$(CONFIG_KVM_ARM_HOST) += guest.o debug.o reset.o sys_regs.o sys_regs_generic_v8.o
 
 kvm-$(CONFIG_KVM_ARM_HOST) += $(KVM)/arm/vgic.o
 kvm-$(CONFIG_KVM_ARM_HOST) += $(KVM)/arm/vgic-v2.o

+ 217 - 0
arch/arm64/kvm/debug.c

@@ -0,0 +1,217 @@
+/*
+ * Debug and Guest Debug support
+ *
+ * Copyright (C) 2015 - Linaro Ltd
+ * Author: Alex Bennée <alex.bennee@linaro.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <linux/kvm_host.h>
+#include <linux/hw_breakpoint.h>
+
+#include <asm/debug-monitors.h>
+#include <asm/kvm_asm.h>
+#include <asm/kvm_arm.h>
+#include <asm/kvm_emulate.h>
+
+#include "trace.h"
+
+/* These are the bits of MDSCR_EL1 we may manipulate */
+#define MDSCR_EL1_DEBUG_MASK	(DBG_MDSCR_SS | \
+				DBG_MDSCR_KDE | \
+				DBG_MDSCR_MDE)
+
+static DEFINE_PER_CPU(u32, mdcr_el2);
+
+/**
+ * save/restore_guest_debug_regs
+ *
+ * For some debug operations we need to tweak some guest registers. As
+ * a result we need to save the state of those registers before we
+ * make those modifications.
+ *
+ * Guest access to MDSCR_EL1 is trapped by the hypervisor and handled
+ * after we have restored the preserved value to the main context.
+ */
+static void save_guest_debug_regs(struct kvm_vcpu *vcpu)
+{
+	vcpu->arch.guest_debug_preserved.mdscr_el1 = vcpu_sys_reg(vcpu, MDSCR_EL1);
+
+	trace_kvm_arm_set_dreg32("Saved MDSCR_EL1",
+				vcpu->arch.guest_debug_preserved.mdscr_el1);
+}
+
+static void restore_guest_debug_regs(struct kvm_vcpu *vcpu)
+{
+	vcpu_sys_reg(vcpu, MDSCR_EL1) = vcpu->arch.guest_debug_preserved.mdscr_el1;
+
+	trace_kvm_arm_set_dreg32("Restored MDSCR_EL1",
+				vcpu_sys_reg(vcpu, MDSCR_EL1));
+}
+
+/**
+ * kvm_arm_init_debug - grab what we need for debug
+ *
+ * Currently the sole task of this function is to retrieve the initial
+ * value of mdcr_el2 so we can preserve MDCR_EL2.HPMN which has
+ * presumably been set-up by some knowledgeable bootcode.
+ *
+ * It is called once per-cpu during CPU hyp initialisation.
+ */
+
+void kvm_arm_init_debug(void)
+{
+	__this_cpu_write(mdcr_el2, kvm_call_hyp(__kvm_get_mdcr_el2));
+}
+
+/**
+ * kvm_arm_reset_debug_ptr - reset the debug ptr to point to the vcpu state
+ */
+
+void kvm_arm_reset_debug_ptr(struct kvm_vcpu *vcpu)
+{
+	vcpu->arch.debug_ptr = &vcpu->arch.vcpu_debug_state;
+}
+
+/**
+ * kvm_arm_setup_debug - set up debug related stuff
+ *
+ * @vcpu:	the vcpu pointer
+ *
+ * This is called before each entry into the hypervisor to setup any
+ * debug related registers. Currently this just ensures we will trap
+ * access to:
+ *  - Performance monitors (MDCR_EL2_TPM/MDCR_EL2_TPMCR)
+ *  - Debug ROM Address (MDCR_EL2_TDRA)
+ *  - OS related registers (MDCR_EL2_TDOSA)
+ *
+ * Additionally, KVM only traps guest accesses to the debug registers if
+ * the guest is not actively using them (see the KVM_ARM64_DEBUG_DIRTY
+ * flag on vcpu->arch.debug_flags).  Since the guest must not interfere
+ * with the hardware state when debugging the guest, we must ensure that
+ * trapping is enabled whenever we are debugging the guest using the
+ * debug registers.
+ */
+
+void kvm_arm_setup_debug(struct kvm_vcpu *vcpu)
+{
+	bool trap_debug = !(vcpu->arch.debug_flags & KVM_ARM64_DEBUG_DIRTY);
+
+	trace_kvm_arm_setup_debug(vcpu, vcpu->guest_debug);
+
+	vcpu->arch.mdcr_el2 = __this_cpu_read(mdcr_el2) & MDCR_EL2_HPMN_MASK;
+	vcpu->arch.mdcr_el2 |= (MDCR_EL2_TPM |
+				MDCR_EL2_TPMCR |
+				MDCR_EL2_TDRA |
+				MDCR_EL2_TDOSA);
+
+	/* Is Guest debugging in effect? */
+	if (vcpu->guest_debug) {
+		/* Route all software debug exceptions to EL2 */
+		vcpu->arch.mdcr_el2 |= MDCR_EL2_TDE;
+
+		/* Save guest debug state */
+		save_guest_debug_regs(vcpu);
+
+		/*
+		 * Single Step (ARM ARM D2.12.3 The software step state
+		 * machine)
+		 *
+		 * If we are doing Single Step we need to manipulate
+		 * the guest's MDSCR_EL1.SS and PSTATE.SS. Once the
+		 * step has occurred the hypervisor will trap the
+		 * debug exception and we return to userspace.
+		 *
+		 * If the guest attempts to single step its userspace
+		 * we would have to deal with a trapped exception
+		 * while in the guest kernel. Because this would be
+		 * hard to unwind we suppress the guest's ability to
+		 * do so by masking MDSCR_EL1.SS.
+		 *
+		 * This confuses guest debuggers which use
+		 * single-step behind the scenes but everything
+		 * returns to normal once the host is no longer
+		 * debugging the system.
+		 */
+		if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP) {
+			*vcpu_cpsr(vcpu) |=  DBG_SPSR_SS;
+			vcpu_sys_reg(vcpu, MDSCR_EL1) |= DBG_MDSCR_SS;
+		} else {
+			vcpu_sys_reg(vcpu, MDSCR_EL1) &= ~DBG_MDSCR_SS;
+		}
+
+		trace_kvm_arm_set_dreg32("SPSR_EL2", *vcpu_cpsr(vcpu));
+
+		/*
+		 * HW Breakpoints and watchpoints
+		 *
+		 * We simply switch the debug_ptr to point to our new
+		 * external_debug_state which has been populated by the
+		 * debug ioctl. The existing KVM_ARM64_DEBUG_DIRTY
+		 * mechanism ensures the registers are updated on the
+		 * world switch.
+		 */
+		if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW) {
+			/* Enable breakpoints/watchpoints */
+			vcpu_sys_reg(vcpu, MDSCR_EL1) |= DBG_MDSCR_MDE;
+
+			vcpu->arch.debug_ptr = &vcpu->arch.external_debug_state;
+			vcpu->arch.debug_flags |= KVM_ARM64_DEBUG_DIRTY;
+			trap_debug = true;
+
+			trace_kvm_arm_set_regset("BKPTS", get_num_brps(),
+						&vcpu->arch.debug_ptr->dbg_bcr[0],
+						&vcpu->arch.debug_ptr->dbg_bvr[0]);
+
+			trace_kvm_arm_set_regset("WAPTS", get_num_wrps(),
+						&vcpu->arch.debug_ptr->dbg_wcr[0],
+						&vcpu->arch.debug_ptr->dbg_wvr[0]);
+		}
+	}
+
+	BUG_ON(!vcpu->guest_debug &&
+		vcpu->arch.debug_ptr != &vcpu->arch.vcpu_debug_state);
+
+	/* Trap debug register access */
+	if (trap_debug)
+		vcpu->arch.mdcr_el2 |= MDCR_EL2_TDA;
+
+	trace_kvm_arm_set_dreg32("MDCR_EL2", vcpu->arch.mdcr_el2);
+	trace_kvm_arm_set_dreg32("MDSCR_EL1", vcpu_sys_reg(vcpu, MDSCR_EL1));
+}
+
+void kvm_arm_clear_debug(struct kvm_vcpu *vcpu)
+{
+	trace_kvm_arm_clear_debug(vcpu->guest_debug);
+
+	if (vcpu->guest_debug) {
+		restore_guest_debug_regs(vcpu);
+
+		/*
+		 * If we were using HW debug we need to restore the
+		 * debug_ptr to the guest debug state.
+		 */
+		if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW) {
+			kvm_arm_reset_debug_ptr(vcpu);
+
+			trace_kvm_arm_set_regset("BKPTS", get_num_brps(),
+						&vcpu->arch.debug_ptr->dbg_bcr[0],
+						&vcpu->arch.debug_ptr->dbg_bvr[0]);
+
+			trace_kvm_arm_set_regset("WAPTS", get_num_wrps(),
+						&vcpu->arch.debug_ptr->dbg_wcr[0],
+						&vcpu->arch.debug_ptr->dbg_wvr[0]);
+		}
+	}
+}

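Note: a minimal sketch of the userspace side of the single-step flow described above - enable KVM_GUESTDBG_SINGLESTEP, run, and expect KVM_EXIT_DEBUG after one guest instruction. vcpu_fd, run (the mmap'ed kvm_run structure) and the exit handler are assumptions; error handling is elided.

	#include <string.h>
	#include <sys/ioctl.h>
	#include <linux/kvm.h>

	extern void handle_debug_exit(struct kvm_run *run);	/* hypothetical */

	void single_step(int vcpu_fd, struct kvm_run *run)
	{
		struct kvm_guest_debug dbg;

		memset(&dbg, 0, sizeof(dbg));
		dbg.control = KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_SINGLESTEP;
		ioctl(vcpu_fd, KVM_SET_GUEST_DEBUG, &dbg);

		ioctl(vcpu_fd, KVM_RUN, 0);
		/* One instruction has stepped; the hypervisor trapped the
		 * debug exception and exited back with KVM_EXIT_DEBUG. */
		if (run->exit_reason == KVM_EXIT_DEBUG)
			handle_debug_exit(run);
	}
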
+ 42 - 1
arch/arm64/kvm/guest.c

@@ -32,6 +32,8 @@
 #include <asm/kvm_emulate.h>
 #include <asm/kvm_coproc.h>
 
+#include "trace.h"
+
 struct kvm_stats_debugfs_item debugfs_entries[] = {
 	{ NULL }
 };
@@ -293,7 +295,8 @@ int __attribute_const__ kvm_target_cpu(void)
 		break;
 	};
 
-	return -EINVAL;
+	/* Return a default generic target */
+	return KVM_ARM_TARGET_GENERIC_V8;
 }
 
 int kvm_vcpu_preferred_target(struct kvm_vcpu_init *init)
@@ -331,3 +334,41 @@ int kvm_arch_vcpu_ioctl_translate(struct kvm_vcpu *vcpu,
 {
 	return -EINVAL;
 }
+
+#define KVM_GUESTDBG_VALID_MASK (KVM_GUESTDBG_ENABLE |    \
+			    KVM_GUESTDBG_USE_SW_BP | \
+			    KVM_GUESTDBG_USE_HW | \
+			    KVM_GUESTDBG_SINGLESTEP)
+
+/**
+ * kvm_arch_vcpu_ioctl_set_guest_debug - set up guest debugging
+ * @vcpu:	the vcpu pointer
+ * @dbg:	the ioctl data buffer
+ *
+ * This sets up and enables the VM for guest debugging. Userspace
+ * passes in a control flag to enable different debug types and
+ * potentially other architecture specific information in the rest of
+ * the structure.
+ */
+int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu,
+					struct kvm_guest_debug *dbg)
+{
+	trace_kvm_set_guest_debug(vcpu, dbg->control);
+
+	if (dbg->control & ~KVM_GUESTDBG_VALID_MASK)
+		return -EINVAL;
+
+	if (dbg->control & KVM_GUESTDBG_ENABLE) {
+		vcpu->guest_debug = dbg->control;
+
+		/* Hardware assisted Break and Watch points */
+		if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW) {
+			vcpu->arch.external_debug_state = dbg->arch;
+		}
+
+	} else {
+		/* If not enabled clear all flags */
+		vcpu->guest_debug = 0;
+	}
+	return 0;
+}

+ 44 - 0
arch/arm64/kvm/handle_exit.c

@@ -82,6 +82,45 @@ static int kvm_handle_wfx(struct kvm_vcpu *vcpu, struct kvm_run *run)
 	return 1;
 }
 
+/**
+ * kvm_handle_guest_debug - handle a debug exception instruction
+ *
+ * @vcpu:	the vcpu pointer
+ * @run:	access to the kvm_run structure for results
+ *
+ * We route all debug exceptions through the same handler. If both the
+ * guest and host are using the same debug facilities it will be up to
+ * userspace to re-inject the correct exception for guest delivery.
+ *
+ * @return: 0 (while setting run->exit_reason), -1 for error
+ */
+static int kvm_handle_guest_debug(struct kvm_vcpu *vcpu, struct kvm_run *run)
+{
+	u32 hsr = kvm_vcpu_get_hsr(vcpu);
+	int ret = 0;
+
+	run->exit_reason = KVM_EXIT_DEBUG;
+	run->debug.arch.hsr = hsr;
+
+	switch (hsr >> ESR_ELx_EC_SHIFT) {
+	case ESR_ELx_EC_WATCHPT_LOW:
+		run->debug.arch.far = vcpu->arch.fault.far_el2;
+		/* fall through */
+	case ESR_ELx_EC_SOFTSTP_LOW:
+	case ESR_ELx_EC_BREAKPT_LOW:
+	case ESR_ELx_EC_BKPT32:
+	case ESR_ELx_EC_BRK64:
+		break;
+	default:
+		kvm_err("%s: un-handled case hsr: %#08x\n",
+			__func__, (unsigned int) hsr);
+		ret = -1;
+		break;
+	}
+
+	return ret;
+}
+
 static exit_handle_fn arm_exit_handlers[] = {
 	[ESR_ELx_EC_WFx]	= kvm_handle_wfx,
 	[ESR_ELx_EC_CP15_32]	= kvm_handle_cp15_32,
@@ -96,6 +135,11 @@ static exit_handle_fn arm_exit_handlers[] = {
 	[ESR_ELx_EC_SYS64]	= kvm_handle_sys_reg,
 	[ESR_ELx_EC_IABT_LOW]	= kvm_handle_guest_abort,
 	[ESR_ELx_EC_DABT_LOW]	= kvm_handle_guest_abort,
+	[ESR_ELx_EC_SOFTSTP_LOW]= kvm_handle_guest_debug,
+	[ESR_ELx_EC_WATCHPT_LOW]= kvm_handle_guest_debug,
+	[ESR_ELx_EC_BREAKPT_LOW]= kvm_handle_guest_debug,
+	[ESR_ELx_EC_BKPT32]	= kvm_handle_guest_debug,
+	[ESR_ELx_EC_BRK64]	= kvm_handle_guest_debug,
 };
 };
 
 static exit_handle_fn kvm_get_exit_handler(struct kvm_vcpu *vcpu)
+ 218 - 399
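Note: a sketch of what a userspace consumer of these exits might do with the kvm_debug_exit_arch fields set by kvm_handle_guest_debug above. The decode mirrors the kernel-side switch; the EC value and shift follow the ARMv8 ESR_ELx encoding, the struct layout is the arm64 one from this series, and the function name is hypothetical.

	#include <stdio.h>
	#include <linux/kvm.h>

	void handle_debug_exit(struct kvm_run *run)
	{
		__u32 hsr = run->debug.arch.hsr;

		/* The exception class lives in the top bits of HSR/ESR */
		switch (hsr >> 26) {
		case 0x34:	/* ESR_ELx_EC_WATCHPT_LOW */
			printf("watchpoint hit, far=0x%llx\n",
			       (unsigned long long)run->debug.arch.far);
			break;
		default:	/* soft step, breakpoint, BKPT32, BRK64, ... */
			printf("debug exit, hsr=%#x\n", hsr);
			break;
		}
	}
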
arch/arm64/kvm/hyp.S

@@ -230,199 +230,52 @@
 	stp	x24, x25, [x3, #160]
 	stp	x24, x25, [x3, #160]
 .endm
 
-	// x2: base address for cpu context
-	// x3: tmp register
-
-	mrs	x26, id_aa64dfr0_el1
-	ubfx	x24, x26, #12, #4	// Extract BRPs
-	ubfx	x25, x26, #20, #4	// Extract WRPs
-	mov	w26, #15
-	sub	w24, w26, w24		// How many BPs to skip
-	sub	w25, w26, w25		// How many WPs to skip
-
-	add	x3, x2, #CPU_SYSREG_OFFSET(DBGBCR0_EL1)
-
-	adr	x26, 1f
-	add	x26, x26, x24, lsl #2
-	br	x26
-1:
-	mrs	x20, dbgbcr15_el1
-	mrs	x19, dbgbcr14_el1
-	mrs	x18, dbgbcr13_el1
-	mrs	x17, dbgbcr12_el1
-	mrs	x16, dbgbcr11_el1
-	mrs	x15, dbgbcr10_el1
-	mrs	x14, dbgbcr9_el1
-	mrs	x13, dbgbcr8_el1
-	mrs	x12, dbgbcr7_el1
-	mrs	x11, dbgbcr6_el1
-	mrs	x10, dbgbcr5_el1
-	mrs	x9, dbgbcr4_el1
-	mrs	x8, dbgbcr3_el1
-	mrs	x7, dbgbcr2_el1
-	mrs	x6, dbgbcr1_el1
-	mrs	x5, dbgbcr0_el1
-
-	adr	x26, 1f
-	add	x26, x26, x24, lsl #2
-	br	x26
-
-1:
-	str	x20, [x3, #(15 * 8)]
-	str	x19, [x3, #(14 * 8)]
-	str	x18, [x3, #(13 * 8)]
-	str	x17, [x3, #(12 * 8)]
-	str	x16, [x3, #(11 * 8)]
-	str	x15, [x3, #(10 * 8)]
-	str	x14, [x3, #(9 * 8)]
-	str	x13, [x3, #(8 * 8)]
-	str	x12, [x3, #(7 * 8)]
-	str	x11, [x3, #(6 * 8)]
-	str	x10, [x3, #(5 * 8)]
-	str	x9, [x3, #(4 * 8)]
-	str	x8, [x3, #(3 * 8)]
-	str	x7, [x3, #(2 * 8)]
-	str	x6, [x3, #(1 * 8)]
-	str	x5, [x3, #(0 * 8)]
-
-	add	x3, x2, #CPU_SYSREG_OFFSET(DBGBVR0_EL1)
-
-	adr	x26, 1f
-	add	x26, x26, x24, lsl #2
-	br	x26
+.macro save_debug type
+	// x4: pointer to register set
+	// x5: number of registers to skip
+	// x6..x22 trashed
+
+	adr	x22, 1f
+	add	x22, x22, x5, lsl #2
+	br	x22
 1:
-	mrs	x20, dbgbvr15_el1
-	mrs	x19, dbgbvr14_el1
-	mrs	x18, dbgbvr13_el1
-	mrs	x17, dbgbvr12_el1
-	mrs	x16, dbgbvr11_el1
-	mrs	x15, dbgbvr10_el1
-	mrs	x14, dbgbvr9_el1
-	mrs	x13, dbgbvr8_el1
-	mrs	x12, dbgbvr7_el1
-	mrs	x11, dbgbvr6_el1
-	mrs	x10, dbgbvr5_el1
-	mrs	x9, dbgbvr4_el1
-	mrs	x8, dbgbvr3_el1
-	mrs	x7, dbgbvr2_el1
-	mrs	x6, dbgbvr1_el1
-	mrs	x5, dbgbvr0_el1
-
-	adr	x26, 1f
-	add	x26, x26, x24, lsl #2
-	br	x26
-
-1:
-	str	x20, [x3, #(15 * 8)]
-	str	x19, [x3, #(14 * 8)]
-	str	x18, [x3, #(13 * 8)]
-	str	x17, [x3, #(12 * 8)]
-	str	x16, [x3, #(11 * 8)]
-	str	x15, [x3, #(10 * 8)]
-	str	x14, [x3, #(9 * 8)]
-	str	x13, [x3, #(8 * 8)]
-	str	x12, [x3, #(7 * 8)]
-	str	x11, [x3, #(6 * 8)]
-	str	x10, [x3, #(5 * 8)]
-	str	x9, [x3, #(4 * 8)]
-	str	x8, [x3, #(3 * 8)]
-	str	x7, [x3, #(2 * 8)]
-	str	x6, [x3, #(1 * 8)]
-	str	x5, [x3, #(0 * 8)]
-
-	add	x3, x2, #CPU_SYSREG_OFFSET(DBGWCR0_EL1)
-
-	adr	x26, 1f
-	add	x26, x26, x25, lsl #2
-	br	x26
-1:
-	mrs	x20, dbgwcr15_el1
-	mrs	x19, dbgwcr14_el1
-	mrs	x18, dbgwcr13_el1
-	mrs	x17, dbgwcr12_el1
-	mrs	x16, dbgwcr11_el1
-	mrs	x15, dbgwcr10_el1
-	mrs	x14, dbgwcr9_el1
-	mrs	x13, dbgwcr8_el1
-	mrs	x12, dbgwcr7_el1
-	mrs	x11, dbgwcr6_el1
-	mrs	x10, dbgwcr5_el1
-	mrs	x9, dbgwcr4_el1
-	mrs	x8, dbgwcr3_el1
-	mrs	x7, dbgwcr2_el1
-	mrs	x6, dbgwcr1_el1
-	mrs	x5, dbgwcr0_el1
-
-	adr	x26, 1f
-	add	x26, x26, x25, lsl #2
-	br	x26
-
-1:
-	str	x20, [x3, #(15 * 8)]
-	str	x19, [x3, #(14 * 8)]
-	str	x18, [x3, #(13 * 8)]
-	str	x17, [x3, #(12 * 8)]
-	str	x16, [x3, #(11 * 8)]
-	str	x15, [x3, #(10 * 8)]
-	str	x14, [x3, #(9 * 8)]
-	str	x13, [x3, #(8 * 8)]
-	str	x12, [x3, #(7 * 8)]
-	str	x11, [x3, #(6 * 8)]
-	str	x10, [x3, #(5 * 8)]
-	str	x9, [x3, #(4 * 8)]
-	str	x8, [x3, #(3 * 8)]
-	str	x7, [x3, #(2 * 8)]
-	str	x6, [x3, #(1 * 8)]
-	str	x5, [x3, #(0 * 8)]
-
-	add	x3, x2, #CPU_SYSREG_OFFSET(DBGWVR0_EL1)
-
-	adr	x26, 1f
-	add	x26, x26, x25, lsl #2
-	br	x26
-1:
-	mrs	x20, dbgwvr15_el1
-	mrs	x19, dbgwvr14_el1
-	mrs	x18, dbgwvr13_el1
-	mrs	x17, dbgwvr12_el1
-	mrs	x16, dbgwvr11_el1
-	mrs	x15, dbgwvr10_el1
-	mrs	x14, dbgwvr9_el1
-	mrs	x13, dbgwvr8_el1
-	mrs	x12, dbgwvr7_el1
-	mrs	x11, dbgwvr6_el1
-	mrs	x10, dbgwvr5_el1
-	mrs	x9, dbgwvr4_el1
-	mrs	x8, dbgwvr3_el1
-	mrs	x7, dbgwvr2_el1
-	mrs	x6, dbgwvr1_el1
-	mrs	x5, dbgwvr0_el1
-
-	adr	x26, 1f
-	add	x26, x26, x25, lsl #2
-	br	x26
-
+	mrs	x21, \type\()15_el1
+	mrs	x20, \type\()14_el1
+	mrs	x19, \type\()13_el1
+	mrs	x18, \type\()12_el1
+	mrs	x17, \type\()11_el1
+	mrs	x16, \type\()10_el1
+	mrs	x15, \type\()9_el1
+	mrs	x14, \type\()8_el1
+	mrs	x13, \type\()7_el1
+	mrs	x12, \type\()6_el1
+	mrs	x11, \type\()5_el1
+	mrs	x10, \type\()4_el1
+	mrs	x9, \type\()3_el1
+	mrs	x8, \type\()2_el1
+	mrs	x7, \type\()1_el1
+	mrs	x6, \type\()0_el1
+
+	adr	x22, 1f
+	add	x22, x22, x5, lsl #2
+	br	x22
 1:
-	str	x20, [x3, #(15 * 8)]
-	str	x19, [x3, #(14 * 8)]
-	str	x18, [x3, #(13 * 8)]
-	str	x17, [x3, #(12 * 8)]
-	str	x16, [x3, #(11 * 8)]
-	str	x15, [x3, #(10 * 8)]
-	str	x14, [x3, #(9 * 8)]
-	str	x13, [x3, #(8 * 8)]
-	str	x12, [x3, #(7 * 8)]
-	str	x11, [x3, #(6 * 8)]
-	str	x10, [x3, #(5 * 8)]
-	str	x9, [x3, #(4 * 8)]
-	str	x8, [x3, #(3 * 8)]
-	str	x7, [x3, #(2 * 8)]
-	str	x6, [x3, #(1 * 8)]
-	str	x5, [x3, #(0 * 8)]
-
-	mrs	x21, mdccint_el1
-	str	x21, [x2, #CPU_SYSREG_OFFSET(MDCCINT_EL1)]
+	str	x21, [x4, #(15 * 8)]
+	str	x20, [x4, #(14 * 8)]
+	str	x19, [x4, #(13 * 8)]
+	str	x18, [x4, #(12 * 8)]
+	str	x17, [x4, #(11 * 8)]
+	str	x16, [x4, #(10 * 8)]
+	str	x15, [x4, #(9 * 8)]
+	str	x14, [x4, #(8 * 8)]
+	str	x13, [x4, #(7 * 8)]
+	str	x12, [x4, #(6 * 8)]
+	str	x11, [x4, #(5 * 8)]
+	str	x10, [x4, #(4 * 8)]
+	str	x9, [x4, #(3 * 8)]
+	str	x8, [x4, #(2 * 8)]
+	str	x7, [x4, #(1 * 8)]
+	str	x6, [x4, #(0 * 8)]
 .endm
 
 .macro restore_sysregs
@@ -467,195 +320,52 @@
 	msr	mdscr_el1,	x25
 .endm
 
-.macro restore_debug
-	// x2: base address for cpu context
-	// x3: tmp register
-
-	mrs	x26, id_aa64dfr0_el1
-	ubfx	x24, x26, #12, #4	// Extract BRPs
-	ubfx	x25, x26, #20, #4	// Extract WRPs
-	mov	w26, #15
-	sub	w24, w26, w24		// How many BPs to skip
-	sub	w25, w26, w25		// How many WPs to skip
-
-	add	x3, x2, #CPU_SYSREG_OFFSET(DBGBCR0_EL1)
+.macro restore_debug type
+	// x4: pointer to register set
+	// x5: number of registers to skip
+	// x6..x22 trashed
 
-	adr	x26, 1f
-	add	x26, x26, x24, lsl #2
-	br	x26
-1:
-	ldr	x20, [x3, #(15 * 8)]
-	ldr	x19, [x3, #(14 * 8)]
-	ldr	x18, [x3, #(13 * 8)]
-	ldr	x17, [x3, #(12 * 8)]
-	ldr	x16, [x3, #(11 * 8)]
-	ldr	x15, [x3, #(10 * 8)]
-	ldr	x14, [x3, #(9 * 8)]
-	ldr	x13, [x3, #(8 * 8)]
-	ldr	x12, [x3, #(7 * 8)]
-	ldr	x11, [x3, #(6 * 8)]
-	ldr	x10, [x3, #(5 * 8)]
-	ldr	x9, [x3, #(4 * 8)]
-	ldr	x8, [x3, #(3 * 8)]
-	ldr	x7, [x3, #(2 * 8)]
-	ldr	x6, [x3, #(1 * 8)]
-	ldr	x5, [x3, #(0 * 8)]
-
-	adr	x26, 1f
-	add	x26, x26, x24, lsl #2
-	br	x26
+	adr	x22, 1f
+	add	x22, x22, x5, lsl #2
+	br	x22
 1:
-	msr	dbgbcr15_el1, x20
-	msr	dbgbcr14_el1, x19
-	msr	dbgbcr13_el1, x18
-	msr	dbgbcr12_el1, x17
-	msr	dbgbcr11_el1, x16
-	msr	dbgbcr10_el1, x15
-	msr	dbgbcr9_el1, x14
-	msr	dbgbcr8_el1, x13
-	msr	dbgbcr7_el1, x12
-	msr	dbgbcr6_el1, x11
-	msr	dbgbcr5_el1, x10
-	msr	dbgbcr4_el1, x9
-	msr	dbgbcr3_el1, x8
-	msr	dbgbcr2_el1, x7
-	msr	dbgbcr1_el1, x6
-	msr	dbgbcr0_el1, x5
-
-	add	x3, x2, #CPU_SYSREG_OFFSET(DBGBVR0_EL1)
-
-	adr	x26, 1f
-	add	x26, x26, x24, lsl #2
-	br	x26
+	ldr	x21, [x4, #(15 * 8)]
+	ldr	x20, [x4, #(14 * 8)]
+	ldr	x19, [x4, #(13 * 8)]
+	ldr	x18, [x4, #(12 * 8)]
+	ldr	x17, [x4, #(11 * 8)]
+	ldr	x16, [x4, #(10 * 8)]
+	ldr	x15, [x4, #(9 * 8)]
+	ldr	x14, [x4, #(8 * 8)]
+	ldr	x13, [x4, #(7 * 8)]
+	ldr	x12, [x4, #(6 * 8)]
+	ldr	x11, [x4, #(5 * 8)]
+	ldr	x10, [x4, #(4 * 8)]
+	ldr	x9, [x4, #(3 * 8)]
+	ldr	x8, [x4, #(2 * 8)]
+	ldr	x7, [x4, #(1 * 8)]
+	ldr	x6, [x4, #(0 * 8)]
+
+	adr	x22, 1f
+	add	x22, x22, x5, lsl #2
+	br	x22
 1:
-	ldr	x20, [x3, #(15 * 8)]
-	ldr	x19, [x3, #(14 * 8)]
-	ldr	x18, [x3, #(13 * 8)]
-	ldr	x17, [x3, #(12 * 8)]
-	ldr	x16, [x3, #(11 * 8)]
-	ldr	x15, [x3, #(10 * 8)]
-	ldr	x14, [x3, #(9 * 8)]
-	ldr	x13, [x3, #(8 * 8)]
-	ldr	x12, [x3, #(7 * 8)]
-	ldr	x11, [x3, #(6 * 8)]
-	ldr	x10, [x3, #(5 * 8)]
-	ldr	x9, [x3, #(4 * 8)]
-	ldr	x8, [x3, #(3 * 8)]
-	ldr	x7, [x3, #(2 * 8)]
-	ldr	x6, [x3, #(1 * 8)]
-	ldr	x5, [x3, #(0 * 8)]
-
-	adr	x26, 1f
-	add	x26, x26, x24, lsl #2
-	br	x26
-1:
-	msr	dbgbvr15_el1, x20
-	msr	dbgbvr14_el1, x19
-	msr	dbgbvr13_el1, x18
-	msr	dbgbvr12_el1, x17
-	msr	dbgbvr11_el1, x16
-	msr	dbgbvr10_el1, x15
-	msr	dbgbvr9_el1, x14
-	msr	dbgbvr8_el1, x13
-	msr	dbgbvr7_el1, x12
-	msr	dbgbvr6_el1, x11
-	msr	dbgbvr5_el1, x10
-	msr	dbgbvr4_el1, x9
-	msr	dbgbvr3_el1, x8
-	msr	dbgbvr2_el1, x7
-	msr	dbgbvr1_el1, x6
-	msr	dbgbvr0_el1, x5
-
-	add	x3, x2, #CPU_SYSREG_OFFSET(DBGWCR0_EL1)
-
-	adr	x26, 1f
-	add	x26, x26, x25, lsl #2
-	br	x26
-1:
-	ldr	x20, [x3, #(15 * 8)]
-	ldr	x19, [x3, #(14 * 8)]
-	ldr	x18, [x3, #(13 * 8)]
-	ldr	x17, [x3, #(12 * 8)]
-	ldr	x16, [x3, #(11 * 8)]
-	ldr	x15, [x3, #(10 * 8)]
-	ldr	x14, [x3, #(9 * 8)]
-	ldr	x13, [x3, #(8 * 8)]
-	ldr	x12, [x3, #(7 * 8)]
-	ldr	x11, [x3, #(6 * 8)]
-	ldr	x10, [x3, #(5 * 8)]
-	ldr	x9, [x3, #(4 * 8)]
-	ldr	x8, [x3, #(3 * 8)]
-	ldr	x7, [x3, #(2 * 8)]
-	ldr	x6, [x3, #(1 * 8)]
-	ldr	x5, [x3, #(0 * 8)]
-
-	adr	x26, 1f
-	add	x26, x26, x25, lsl #2
-	br	x26
-1:
-	msr	dbgwcr15_el1, x20
-	msr	dbgwcr14_el1, x19
-	msr	dbgwcr13_el1, x18
-	msr	dbgwcr12_el1, x17
-	msr	dbgwcr11_el1, x16
-	msr	dbgwcr10_el1, x15
-	msr	dbgwcr9_el1, x14
-	msr	dbgwcr8_el1, x13
-	msr	dbgwcr7_el1, x12
-	msr	dbgwcr6_el1, x11
-	msr	dbgwcr5_el1, x10
-	msr	dbgwcr4_el1, x9
-	msr	dbgwcr3_el1, x8
-	msr	dbgwcr2_el1, x7
-	msr	dbgwcr1_el1, x6
-	msr	dbgwcr0_el1, x5
-
-	add	x3, x2, #CPU_SYSREG_OFFSET(DBGWVR0_EL1)
-
-	adr	x26, 1f
-	add	x26, x26, x25, lsl #2
-	br	x26
-1:
-	ldr	x20, [x3, #(15 * 8)]
-	ldr	x19, [x3, #(14 * 8)]
-	ldr	x18, [x3, #(13 * 8)]
-	ldr	x17, [x3, #(12 * 8)]
-	ldr	x16, [x3, #(11 * 8)]
-	ldr	x15, [x3, #(10 * 8)]
-	ldr	x14, [x3, #(9 * 8)]
-	ldr	x13, [x3, #(8 * 8)]
-	ldr	x12, [x3, #(7 * 8)]
-	ldr	x11, [x3, #(6 * 8)]
-	ldr	x10, [x3, #(5 * 8)]
-	ldr	x9, [x3, #(4 * 8)]
-	ldr	x8, [x3, #(3 * 8)]
-	ldr	x7, [x3, #(2 * 8)]
-	ldr	x6, [x3, #(1 * 8)]
-	ldr	x5, [x3, #(0 * 8)]
-
-	adr	x26, 1f
-	add	x26, x26, x25, lsl #2
-	br	x26
-1:
-	msr	dbgwvr15_el1, x20
-	msr	dbgwvr14_el1, x19
-	msr	dbgwvr13_el1, x18
-	msr	dbgwvr12_el1, x17
-	msr	dbgwvr11_el1, x16
-	msr	dbgwvr10_el1, x15
-	msr	dbgwvr9_el1, x14
-	msr	dbgwvr8_el1, x13
-	msr	dbgwvr7_el1, x12
-	msr	dbgwvr6_el1, x11
-	msr	dbgwvr5_el1, x10
-	msr	dbgwvr4_el1, x9
-	msr	dbgwvr3_el1, x8
-	msr	dbgwvr2_el1, x7
-	msr	dbgwvr1_el1, x6
-	msr	dbgwvr0_el1, x5
-
-	ldr	x21, [x2, #CPU_SYSREG_OFFSET(MDCCINT_EL1)]
-	msr	mdccint_el1, x21
+	msr	\type\()15_el1, x21
+	msr	\type\()14_el1, x20
+	msr	\type\()13_el1, x19
+	msr	\type\()12_el1, x18
+	msr	\type\()11_el1, x17
+	msr	\type\()10_el1, x16
+	msr	\type\()9_el1, x15
+	msr	\type\()8_el1, x14
+	msr	\type\()7_el1, x13
+	msr	\type\()6_el1, x12
+	msr	\type\()5_el1, x11
+	msr	\type\()4_el1, x10
+	msr	\type\()3_el1, x9
+	msr	\type\()2_el1, x8
+	msr	\type\()1_el1, x7
+	msr	\type\()0_el1, x6
 .endm
 
 .macro skip_32bit_state tmp, target
@@ -675,6 +385,14 @@
 	tbz	\tmp, #KVM_ARM64_DEBUG_DIRTY_SHIFT, \target
 .endm
 
+/*
+ * Branch to target if CPTR_EL2.TFP bit is set (VFP/SIMD trapping enabled)
+ */
+.macro skip_fpsimd_state tmp, target
+	mrs	\tmp, cptr_el2
+	tbnz	\tmp, #CPTR_EL2_TFP_SHIFT, \target
+.endm
+
 .macro compute_debug_state target
 	// Compute debug state: If any of KDE, MDE or KVM_ARM64_DEBUG_DIRTY
 	// is set, we do a full save/restore cycle and disable trapping.
@@ -713,10 +431,12 @@
 	add	x3, x2, #CPU_SYSREG_OFFSET(DACR32_EL2)
 	mrs	x4, dacr32_el2
 	mrs	x5, ifsr32_el2
-	mrs	x6, fpexc32_el2
 	stp	x4, x5, [x3]
-	str	x6, [x3, #16]
 
+	skip_fpsimd_state x8, 3f
+	mrs	x6, fpexc32_el2
+	str	x6, [x3, #16]
+3:
 	skip_debug_state x8, 2f
 	mrs	x7, dbgvcr32_el2
 	str	x7, [x3, #24]
@@ -743,10 +463,8 @@
 
 
 	add	x3, x2, #CPU_SYSREG_OFFSET(DACR32_EL2)
 	ldp	x4, x5, [x3]
-	ldr	x6, [x3, #16]
 	msr	dacr32_el2, x4
 	msr	ifsr32_el2, x5
-	msr	fpexc32_el2, x6
 
 	skip_debug_state x8, 2f
 	ldr	x7, [x3, #24]
@@ -763,31 +481,35 @@
 
 
 .macro activate_traps
 	ldr     x2, [x0, #VCPU_HCR_EL2]
+
+	/*
+	 * We are about to set CPTR_EL2.TFP to trap all floating point
+	 * register accesses to EL2, however, the ARM ARM clearly states that
+	 * traps are only taken to EL2 if the operation would not otherwise
+	 * trap to EL1.  Therefore, always make sure that for 32-bit guests,
+	 * we set FPEXC.EN to prevent traps to EL1, when setting the TFP bit.
+	 */
+	tbnz	x2, #HCR_RW_SHIFT, 99f // open code skip_32bit_state
+	mov	x3, #(1 << 30)
+	msr	fpexc32_el2, x3
+	isb
+99:
 	msr     hcr_el2, x2
 	mov	x2, #CPTR_EL2_TTA
+	orr     x2, x2, #CPTR_EL2_TFP
 	msr	cptr_el2, x2
 
 	mov	x2, #(1 << 15)	// Trap CP15 Cr=15
 	msr	hstr_el2, x2
 
-	mrs	x2, mdcr_el2
-	and	x2, x2, #MDCR_EL2_HPMN_MASK
-	orr	x2, x2, #(MDCR_EL2_TPM | MDCR_EL2_TPMCR)
-	orr	x2, x2, #(MDCR_EL2_TDRA | MDCR_EL2_TDOSA)
-
-	// Check for KVM_ARM64_DEBUG_DIRTY, and set debug to trap
-	// if not dirty.
-	ldr	x3, [x0, #VCPU_DEBUG_FLAGS]
-	tbnz	x3, #KVM_ARM64_DEBUG_DIRTY_SHIFT, 1f
-	orr	x2, x2,  #MDCR_EL2_TDA
-1:
+	// Monitor Debug Config - see kvm_arm_setup_debug()
+	ldr	x2, [x0, #VCPU_MDCR_EL2]
 	msr	mdcr_el2, x2
 .endm
 
 .macro deactivate_traps
 	mov	x2, #HCR_RW
 	msr	hcr_el2, x2
-	msr	cptr_el2, xzr
 	msr	hstr_el2, xzr
 
 	mrs	x2, mdcr_el2
@@ -900,21 +622,101 @@ __restore_sysregs:
 	restore_sysregs
 	ret
 
+/* Save debug state */
 __save_debug:
-	save_debug
+	// x2: ptr to CPU context
+	// x3: ptr to debug reg struct
+	// x4/x5/x6-22/x24-26: trashed
+
+	mrs	x26, id_aa64dfr0_el1
+	ubfx	x24, x26, #12, #4	// Extract BRPs
+	ubfx	x25, x26, #20, #4	// Extract WRPs
+	mov	w26, #15
+	sub	w24, w26, w24		// How many BPs to skip
+	sub	w25, w26, w25		// How many WPs to skip
+
+	mov	x5, x24
+	add	x4, x3, #DEBUG_BCR
+	save_debug dbgbcr
+	add	x4, x3, #DEBUG_BVR
+	save_debug dbgbvr
+
+	mov	x5, x25
+	add	x4, x3, #DEBUG_WCR
+	save_debug dbgwcr
+	add	x4, x3, #DEBUG_WVR
+	save_debug dbgwvr
+
+	mrs	x21, mdccint_el1
+	str	x21, [x2, #CPU_SYSREG_OFFSET(MDCCINT_EL1)]
 	ret
 
+/* Restore debug state */
 __restore_debug:
-	restore_debug
+	// x2: ptr to CPU context
+	// x3: ptr to debug reg struct
+	// x4/x5/x6-22/x24-26: trashed
+
+	mrs	x26, id_aa64dfr0_el1
+	ubfx	x24, x26, #12, #4	// Extract BRPs
+	ubfx	x25, x26, #20, #4	// Extract WRPs
+	mov	w26, #15
+	sub	w24, w26, w24		// How many BPs to skip
+	sub	w25, w26, w25		// How many WPs to skip
+
+	mov	x5, x24
+	add	x4, x3, #DEBUG_BCR
+	restore_debug dbgbcr
+	add	x4, x3, #DEBUG_BVR
+	restore_debug dbgbvr
+
+	mov	x5, x25
+	add	x4, x3, #DEBUG_WCR
+	restore_debug dbgwcr
+	add	x4, x3, #DEBUG_WVR
+	restore_debug dbgwvr
+
+	ldr	x21, [x2, #CPU_SYSREG_OFFSET(MDCCINT_EL1)]
+	msr	mdccint_el1, x21
+
 	ret
 
 __save_fpsimd:
+	skip_fpsimd_state x3, 1f
 	save_fpsimd
-	ret
+1:	ret
 
 __restore_fpsimd:
+	skip_fpsimd_state x3, 1f
 	restore_fpsimd
-	ret
+1:	ret
+
+switch_to_guest_fpsimd:
+	push	x4, lr
+
+	mrs	x2, cptr_el2
+	bic	x2, x2, #CPTR_EL2_TFP
+	msr	cptr_el2, x2
+	isb
+
+	mrs	x0, tpidr_el2
+
+	ldr	x2, [x0, #VCPU_HOST_CONTEXT]
+	kern_hyp_va x2
+	bl __save_fpsimd
+
+	add	x2, x0, #VCPU_CONTEXT
+	bl __restore_fpsimd
+
+	skip_32bit_state x3, 1f
+	ldr	x4, [x2, #CPU_SYSREG_OFFSET(FPEXC32_EL2)]
+	msr	fpexc32_el2, x4
+1:
+	pop	x4, lr
+	pop	x2, x3
+	pop	x0, x1
+
+	eret
 
 
 /*
 /*
  * u64 __kvm_vcpu_run(struct kvm_vcpu *vcpu);
  * u64 __kvm_vcpu_run(struct kvm_vcpu *vcpu);
@@ -936,10 +738,10 @@ ENTRY(__kvm_vcpu_run)
 	kern_hyp_va x2
 
 	save_host_regs
-	bl __save_fpsimd
 	bl __save_sysregs
 
 	compute_debug_state 1f
+	add	x3, x0, #VCPU_HOST_DEBUG_STATE
 	bl	__save_debug
 1:
 	activate_traps
@@ -952,9 +754,10 @@ ENTRY(__kvm_vcpu_run)
 	add	x2, x0, #VCPU_CONTEXT
 
 	bl __restore_sysregs
-	bl __restore_fpsimd
 
 	skip_debug_state x3, 1f
+	ldr	x3, [x0, #VCPU_DEBUG_PTR]
+	kern_hyp_va x3
 	bl	__restore_debug
 1:
 	restore_guest_32bit_state
@@ -975,6 +778,8 @@ __kvm_vcpu_return:
 	bl __save_sysregs
 
 	skip_debug_state x3, 1f
+	ldr	x3, [x0, #VCPU_DEBUG_PTR]
+	kern_hyp_va x3
 	bl	__save_debug
 1:
 	save_guest_32bit_state
@@ -991,12 +796,15 @@ __kvm_vcpu_return:
 
 
 	bl __restore_sysregs
 	bl __restore_fpsimd
+	/* Clear FPSIMD and Trace trapping */
+	msr     cptr_el2, xzr
 
 	skip_debug_state x3, 1f
 	// Clear the dirty flag for the next run, as all the state has
 	// already been saved. Note that we nuke the whole 64bit word.
 	// If we ever add more flags, we'll have to be more careful...
 	str	xzr, [x0, #VCPU_DEBUG_FLAGS]
+	add	x3, x0, #VCPU_HOST_DEBUG_STATE
 	bl	__restore_debug
 1:
 	restore_host_regs
@@ -1199,6 +1007,11 @@ el1_trap:
 	 * x1: ESR
 	 * x2: ESR_EC
 	 */
+
+	/* Guest accessed VFP/SIMD registers, save host, restore Guest */
+	cmp	x2, #ESR_ELx_EC_FP_ASIMD
+	b.eq	switch_to_guest_fpsimd
+
 	cmp	x2, #ESR_ELx_EC_DABT_LOW
 	mov	x0, #ESR_ELx_EC_IABT_LOW
 	ccmp	x2, x0, #4, ne
@@ -1293,4 +1106,10 @@ ENTRY(__kvm_hyp_vector)
 	ventry	el1_error_invalid		// Error 32-bit EL1
ENDPROC(__kvm_hyp_vector)
 
+
+ENTRY(__kvm_get_mdcr_el2)
+	mrs	x0, mdcr_el2
+	ret
+ENDPROC(__kvm_get_mdcr_el2)
+
 	.popsection

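The hyp.S rework above is the heart of the lazy FP/SIMD switch: guest entry leaves the CPTR_EL2.TFP trap armed, and only the first guest VFP/SIMD access faults into switch_to_guest_fpsimd, which saves the host registers and installs the guest's before resuming with eret. A rough C sketch of that control flow (helper names here are illustrative, not the real hyp symbols):

	/* Sketch only: lazy FP/SIMD switch on first guest use. */
	static void handle_fp_asimd_trap(struct kvm_vcpu *vcpu)
	{
		disable_fpsimd_trap();			/* clear CPTR_EL2.TFP */
		save_fpsimd(host_context(vcpu));	/* save host state once */
		load_fpsimd(guest_context(vcpu));	/* install guest state */
		/* On exit, __kvm_vcpu_return restores the host copy; if the
		 * trap never fired, skip_fpsimd_state makes that a no-op. */
	}

If the guest never touches FP/SIMD during a run, neither save nor restore happens, which is the point of the change.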
+ 17 - 3
arch/arm64/kvm/reset.c

@@ -22,6 +22,7 @@
 #include <linux/errno.h>
 #include <linux/kvm_host.h>
 #include <linux/kvm.h>
+#include <linux/hw_breakpoint.h>
 
 #include <kvm/arm_arch_timer.h>
 
@@ -56,6 +57,12 @@ static bool cpu_has_32bit_el1(void)
 	return !!(pfr0 & 0x20);
 }
 
+/**
+ * kvm_arch_dev_ioctl_check_extension
+ *
+ * We currently assume that the number of HW registers is uniform
+ * across all CPUs (see cpuinfo_sanity_check).
+ */
 int kvm_arch_dev_ioctl_check_extension(long ext)
 {
 	int r;
@@ -64,6 +71,15 @@ int kvm_arch_dev_ioctl_check_extension(long ext)
 	case KVM_CAP_ARM_EL1_32BIT:
 		r = cpu_has_32bit_el1();
 		break;
+	case KVM_CAP_GUEST_DEBUG_HW_BPS:
+		r = get_num_brps();
+		break;
+	case KVM_CAP_GUEST_DEBUG_HW_WPS:
+		r = get_num_wrps();
+		break;
+	case KVM_CAP_SET_GUEST_DEBUG:
+		r = 1;
+		break;
 	default:
 		r = 0;
 	}
@@ -105,7 +121,5 @@ int kvm_reset_vcpu(struct kvm_vcpu *vcpu)
 	kvm_reset_sys_regs(vcpu);
 
 	/* Reset timer */
-	kvm_timer_vcpu_reset(vcpu, cpu_vtimer_irq);
-
-	return 0;
+	return kvm_timer_vcpu_reset(vcpu, cpu_vtimer_irq);
 }

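The three new capability cases are queried from userspace through the standard KVM_CHECK_EXTENSION ioctl; for the two debug-register capabilities the return value is the resource count rather than a plain boolean. A minimal probe, error handling elided:

	#include <fcntl.h>
	#include <stdio.h>
	#include <sys/ioctl.h>
	#include <linux/kvm.h>

	int main(void)
	{
		int kvm = open("/dev/kvm", O_RDWR);
		/* these resolve to get_num_brps()/get_num_wrps() above */
		int bps = ioctl(kvm, KVM_CHECK_EXTENSION, KVM_CAP_GUEST_DEBUG_HW_BPS);
		int wps = ioctl(kvm, KVM_CHECK_EXTENSION, KVM_CAP_GUEST_DEBUG_HW_WPS);
		printf("hw breakpoints: %d, hw watchpoints: %d\n", bps, wps);
		return 0;
	}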
+ 269 - 22
arch/arm64/kvm/sys_regs.c

@@ -38,6 +38,8 @@
 
 #include "sys_regs.h"
 
+#include "trace.h"
+
 /*
  * All of this file is extremly similar to the ARM coproc.c, but the
  * types are different. My gut feeling is that it should be pretty
@@ -208,9 +210,217 @@ static bool trap_debug_regs(struct kvm_vcpu *vcpu,
 		*vcpu_reg(vcpu, p->Rt) = vcpu_sys_reg(vcpu, r->reg);
 	}
 
+	trace_trap_reg(__func__, r->reg, p->is_write, *vcpu_reg(vcpu, p->Rt));
+
+	return true;
+}
+
+/*
+ * reg_to_dbg/dbg_to_reg
+ *
+ * A 32 bit write to a debug register leave top bits alone
+ * A 32 bit read from a debug register only returns the bottom bits
+ *
+ * All writes will set the KVM_ARM64_DEBUG_DIRTY flag to ensure the
+ * hyp.S code switches between host and guest values in future.
+ */
+static inline void reg_to_dbg(struct kvm_vcpu *vcpu,
+			      const struct sys_reg_params *p,
+			      u64 *dbg_reg)
+{
+	u64 val = *vcpu_reg(vcpu, p->Rt);
+
+	if (p->is_32bit) {
+		val &= 0xffffffffUL;
+		val |= ((*dbg_reg >> 32) << 32);
+	}
+
+	*dbg_reg = val;
+	vcpu->arch.debug_flags |= KVM_ARM64_DEBUG_DIRTY;
+}
+
+static inline void dbg_to_reg(struct kvm_vcpu *vcpu,
+			      const struct sys_reg_params *p,
+			      u64 *dbg_reg)
+{
+	u64 val = *dbg_reg;
+
+	if (p->is_32bit)
+		val &= 0xffffffffUL;
+
+	*vcpu_reg(vcpu, p->Rt) = val;
+}
+
+static inline bool trap_bvr(struct kvm_vcpu *vcpu,
+			    const struct sys_reg_params *p,
+			    const struct sys_reg_desc *rd)
+{
+	u64 *dbg_reg = &vcpu->arch.vcpu_debug_state.dbg_bvr[rd->reg];
+
+	if (p->is_write)
+		reg_to_dbg(vcpu, p, dbg_reg);
+	else
+		dbg_to_reg(vcpu, p, dbg_reg);
+
+	trace_trap_reg(__func__, rd->reg, p->is_write, *dbg_reg);
+
+	return true;
+}
+
+static int set_bvr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd,
+		const struct kvm_one_reg *reg, void __user *uaddr)
+{
+	__u64 *r = &vcpu->arch.vcpu_debug_state.dbg_bvr[rd->reg];
+
+	if (copy_from_user(uaddr, r, KVM_REG_SIZE(reg->id)) != 0)
+		return -EFAULT;
+	return 0;
+}
+
+static int get_bvr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd,
+	const struct kvm_one_reg *reg, void __user *uaddr)
+{
+	__u64 *r = &vcpu->arch.vcpu_debug_state.dbg_bvr[rd->reg];
+
+	if (copy_to_user(uaddr, r, KVM_REG_SIZE(reg->id)) != 0)
+		return -EFAULT;
+	return 0;
+}
+
+static inline void reset_bvr(struct kvm_vcpu *vcpu,
+			     const struct sys_reg_desc *rd)
+{
+	vcpu->arch.vcpu_debug_state.dbg_bvr[rd->reg] = rd->val;
+}
+
+static inline bool trap_bcr(struct kvm_vcpu *vcpu,
+			    const struct sys_reg_params *p,
+			    const struct sys_reg_desc *rd)
+{
+	u64 *dbg_reg = &vcpu->arch.vcpu_debug_state.dbg_bcr[rd->reg];
+
+	if (p->is_write)
+		reg_to_dbg(vcpu, p, dbg_reg);
+	else
+		dbg_to_reg(vcpu, p, dbg_reg);
+
+	trace_trap_reg(__func__, rd->reg, p->is_write, *dbg_reg);
+
+	return true;
+}
+
+static int set_bcr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd,
+		const struct kvm_one_reg *reg, void __user *uaddr)
+{
+	__u64 *r = &vcpu->arch.vcpu_debug_state.dbg_bcr[rd->reg];
+
+	if (copy_from_user(uaddr, r, KVM_REG_SIZE(reg->id)) != 0)
+		return -EFAULT;
+
+	return 0;
+}
+
+static int get_bcr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd,
+	const struct kvm_one_reg *reg, void __user *uaddr)
+{
+	__u64 *r = &vcpu->arch.vcpu_debug_state.dbg_bcr[rd->reg];
+
+	if (copy_to_user(uaddr, r, KVM_REG_SIZE(reg->id)) != 0)
+		return -EFAULT;
+	return 0;
+}
+
+static inline void reset_bcr(struct kvm_vcpu *vcpu,
+			     const struct sys_reg_desc *rd)
+{
+	vcpu->arch.vcpu_debug_state.dbg_bcr[rd->reg] = rd->val;
+}
+
+static inline bool trap_wvr(struct kvm_vcpu *vcpu,
+			    const struct sys_reg_params *p,
+			    const struct sys_reg_desc *rd)
+{
+	u64 *dbg_reg = &vcpu->arch.vcpu_debug_state.dbg_wvr[rd->reg];
+
+	if (p->is_write)
+		reg_to_dbg(vcpu, p, dbg_reg);
+	else
+		dbg_to_reg(vcpu, p, dbg_reg);
+
+	trace_trap_reg(__func__, rd->reg, p->is_write,
+		vcpu->arch.vcpu_debug_state.dbg_wvr[rd->reg]);
+
 	return true;
 }
 
+static int set_wvr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd,
+		const struct kvm_one_reg *reg, void __user *uaddr)
+{
+	__u64 *r = &vcpu->arch.vcpu_debug_state.dbg_wvr[rd->reg];
+
+	if (copy_from_user(uaddr, r, KVM_REG_SIZE(reg->id)) != 0)
+		return -EFAULT;
+	return 0;
+}
+
+static int get_wvr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd,
+	const struct kvm_one_reg *reg, void __user *uaddr)
+{
+	__u64 *r = &vcpu->arch.vcpu_debug_state.dbg_wvr[rd->reg];
+
+	if (copy_to_user(uaddr, r, KVM_REG_SIZE(reg->id)) != 0)
+		return -EFAULT;
+	return 0;
+}
+
+static inline void reset_wvr(struct kvm_vcpu *vcpu,
+			     const struct sys_reg_desc *rd)
+{
+	vcpu->arch.vcpu_debug_state.dbg_wvr[rd->reg] = rd->val;
+}
+
+static inline bool trap_wcr(struct kvm_vcpu *vcpu,
+			    const struct sys_reg_params *p,
+			    const struct sys_reg_desc *rd)
+{
+	u64 *dbg_reg = &vcpu->arch.vcpu_debug_state.dbg_wcr[rd->reg];
+
+	if (p->is_write)
+		reg_to_dbg(vcpu, p, dbg_reg);
+	else
+		dbg_to_reg(vcpu, p, dbg_reg);
+
+	trace_trap_reg(__func__, rd->reg, p->is_write, *dbg_reg);
+
+	return true;
+}
+
+static int set_wcr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd,
+		const struct kvm_one_reg *reg, void __user *uaddr)
+{
+	__u64 *r = &vcpu->arch.vcpu_debug_state.dbg_wcr[rd->reg];
+
+	if (copy_from_user(uaddr, r, KVM_REG_SIZE(reg->id)) != 0)
+		return -EFAULT;
+	return 0;
+}
+
+static int get_wcr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd,
+	const struct kvm_one_reg *reg, void __user *uaddr)
+{
+	__u64 *r = &vcpu->arch.vcpu_debug_state.dbg_wcr[rd->reg];
+
+	if (copy_to_user(uaddr, r, KVM_REG_SIZE(reg->id)) != 0)
+		return -EFAULT;
+	return 0;
+}
+
+static inline void reset_wcr(struct kvm_vcpu *vcpu,
+			     const struct sys_reg_desc *rd)
+{
+	vcpu->arch.vcpu_debug_state.dbg_wcr[rd->reg] = rd->val;
+}
+
 static void reset_amair_el1(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r)
 {
 	u64 amair;
@@ -240,16 +450,16 @@ static void reset_mpidr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r)
 #define DBG_BCR_BVR_WCR_WVR_EL1(n)					\
 	/* DBGBVRn_EL1 */						\
 	{ Op0(0b10), Op1(0b000), CRn(0b0000), CRm((n)), Op2(0b100),	\
-	  trap_debug_regs, reset_val, (DBGBVR0_EL1 + (n)), 0 },		\
+	  trap_bvr, reset_bvr, n, 0, get_bvr, set_bvr },		\
 	/* DBGBCRn_EL1 */						\
 	{ Op0(0b10), Op1(0b000), CRn(0b0000), CRm((n)), Op2(0b101),	\
-	  trap_debug_regs, reset_val, (DBGBCR0_EL1 + (n)), 0 },		\
+	  trap_bcr, reset_bcr, n, 0, get_bcr, set_bcr },		\
 	/* DBGWVRn_EL1 */						\
 	{ Op0(0b10), Op1(0b000), CRn(0b0000), CRm((n)), Op2(0b110),	\
-	  trap_debug_regs, reset_val, (DBGWVR0_EL1 + (n)), 0 },		\
+	  trap_wvr, reset_wvr, n, 0,  get_wvr, set_wvr },		\
 	/* DBGWCRn_EL1 */						\
 	{ Op0(0b10), Op1(0b000), CRn(0b0000), CRm((n)), Op2(0b111),	\
-	  trap_debug_regs, reset_val, (DBGWCR0_EL1 + (n)), 0 }
+	  trap_wcr, reset_wcr, n, 0,  get_wcr, set_wcr }
 
 /*
  * Architected system registers.
@@ -516,28 +726,57 @@ static bool trap_debug32(struct kvm_vcpu *vcpu,
 	return true;
 }
 
-#define DBG_BCR_BVR_WCR_WVR(n)					\
-	/* DBGBVRn */						\
-	{ Op1( 0), CRn( 0), CRm((n)), Op2( 4), trap_debug32,	\
-	  NULL, (cp14_DBGBVR0 + (n) * 2) },			\
-	/* DBGBCRn */						\
-	{ Op1( 0), CRn( 0), CRm((n)), Op2( 5), trap_debug32,	\
-	  NULL, (cp14_DBGBCR0 + (n) * 2) },			\
-	/* DBGWVRn */						\
-	{ Op1( 0), CRn( 0), CRm((n)), Op2( 6), trap_debug32,	\
-	  NULL, (cp14_DBGWVR0 + (n) * 2) },			\
-	/* DBGWCRn */						\
-	{ Op1( 0), CRn( 0), CRm((n)), Op2( 7), trap_debug32,	\
-	  NULL, (cp14_DBGWCR0 + (n) * 2) }
-
-#define DBGBXVR(n)						\
-	{ Op1( 0), CRn( 1), CRm((n)), Op2( 1), trap_debug32,	\
-	  NULL, cp14_DBGBXVR0 + n * 2 }
+/* AArch32 debug register mappings
+ *
+ * AArch32 DBGBVRn is mapped to DBGBVRn_EL1[31:0]
+ * AArch32 DBGBXVRn is mapped to DBGBVRn_EL1[63:32]
+ *
+ * All control registers and watchpoint value registers are mapped to
+ * the lower 32 bits of their AArch64 equivalents. We share the trap
+ * handlers with the above AArch64 code which checks what mode the
+ * system is in.
+ */
+
+static inline bool trap_xvr(struct kvm_vcpu *vcpu,
+			    const struct sys_reg_params *p,
+			    const struct sys_reg_desc *rd)
+{
+	u64 *dbg_reg = &vcpu->arch.vcpu_debug_state.dbg_bvr[rd->reg];
+
+	if (p->is_write) {
+		u64 val = *dbg_reg;
+
+		val &= 0xffffffffUL;
+		val |= *vcpu_reg(vcpu, p->Rt) << 32;
+		*dbg_reg = val;
+
+		vcpu->arch.debug_flags |= KVM_ARM64_DEBUG_DIRTY;
+	} else {
+		*vcpu_reg(vcpu, p->Rt) = *dbg_reg >> 32;
+	}
+
+	trace_trap_reg(__func__, rd->reg, p->is_write, *dbg_reg);
+
+	return true;
+}
+
+#define DBG_BCR_BVR_WCR_WVR(n)						\
+	/* DBGBVRn */							\
+	{ Op1( 0), CRn( 0), CRm((n)), Op2( 4), trap_bvr, NULL, n }, 	\
+	/* DBGBCRn */							\
+	{ Op1( 0), CRn( 0), CRm((n)), Op2( 5), trap_bcr, NULL, n },	\
+	/* DBGWVRn */							\
+	{ Op1( 0), CRn( 0), CRm((n)), Op2( 6), trap_wvr, NULL, n },	\
+	/* DBGWCRn */							\
+	{ Op1( 0), CRn( 0), CRm((n)), Op2( 7), trap_wcr, NULL, n }
+
+#define DBGBXVR(n)							\
+	{ Op1( 0), CRn( 1), CRm((n)), Op2( 1), trap_xvr, NULL, n }
 
 /*
  * Trapped cp14 registers. We generally ignore most of the external
  * debug, on the principle that they don't really make sense to a
- * guest. Revisit this one day, whould this principle change.
+ * guest. Revisit this one day, would this principle change.
  */
 static const struct sys_reg_desc cp14_regs[] = {
 	/* DBGIDR */
@@ -999,6 +1238,8 @@ int kvm_handle_sys_reg(struct kvm_vcpu *vcpu, struct kvm_run *run)
 	struct sys_reg_params params;
 	unsigned long esr = kvm_vcpu_get_hsr(vcpu);
 
+	trace_kvm_handle_sys_reg(esr);
+
 	params.is_aarch32 = false;
 	params.is_32bit = false;
 	params.Op0 = (esr >> 20) & 3;
@@ -1303,6 +1544,9 @@ int kvm_arm_sys_reg_get_reg(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg
 	if (!r)
 		return get_invariant_sys_reg(reg->id, uaddr);
 
+	if (r->get_user)
+		return (r->get_user)(vcpu, r, reg, uaddr);
+
 	return reg_to_user(uaddr, &vcpu_sys_reg(vcpu, r->reg), reg->id);
 }
 
@@ -1321,6 +1565,9 @@ int kvm_arm_sys_reg_set_reg(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg
 	if (!r)
 		return set_invariant_sys_reg(reg->id, uaddr);
 
+	if (r->set_user)
+		return (r->set_user)(vcpu, r, reg, uaddr);
+
 	return reg_from_user(&vcpu_sys_reg(vcpu, r->reg), uaddr, reg->id);
 }
 

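The reg_to_dbg/dbg_to_reg pair above pins down the AArch32 view of the 64-bit break/watchpoint registers: a 32-bit write replaces only the low word (the high word is reached separately through DBGBXVRn and trap_xvr), and a 32-bit read returns only the low word. A worked example with illustrative values:

	u64 dbg = 0xdeadbeef00000000ULL;   /* current DBGBVRn_EL1 contents */
	u64 val = 0x12345678;              /* 32-bit guest write */

	val &= 0xffffffffUL;               /* reg_to_dbg, is_32bit case */
	val |= ((dbg >> 32) << 32);        /* top word preserved */
	dbg = val;                         /* dbg == 0xdeadbeef12345678 */

	u32 rd = dbg & 0xffffffffUL;       /* dbg_to_reg read: 0x12345678 */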
+ 6 - 0
arch/arm64/kvm/sys_regs.h

@@ -55,6 +55,12 @@ struct sys_reg_desc {
 
 	/* Value (usually reset value) */
 	u64 val;
+
+	/* Custom get/set_user functions, fallback to generic if NULL */
+	int (*get_user)(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd,
+			const struct kvm_one_reg *reg, void __user *uaddr);
+	int (*set_user)(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd,
+			const struct kvm_one_reg *reg, void __user *uaddr);
 };
 };
 
 static inline void print_sys_reg_instr(const struct sys_reg_params *p)

+ 2 - 0
arch/arm64/kvm/sys_regs_generic_v8.c

@@ -94,6 +94,8 @@ static int __init sys_reg_genericv8_init(void)
 					  &genericv8_target_table);
 	kvm_register_target_sys_reg_table(KVM_ARM_TARGET_XGENE_POTENZA,
 					  &genericv8_target_table);
+	kvm_register_target_sys_reg_table(KVM_ARM_TARGET_GENERIC_V8,
+					  &genericv8_target_table);
 
 	return 0;
 }

+ 123 - 0
arch/arm64/kvm/trace.h

@@ -44,6 +44,129 @@ TRACE_EVENT(kvm_hvc_arm64,
 		  __entry->vcpu_pc, __entry->r0, __entry->imm)
 );
 
+TRACE_EVENT(kvm_arm_setup_debug,
+	TP_PROTO(struct kvm_vcpu *vcpu, __u32 guest_debug),
+	TP_ARGS(vcpu, guest_debug),
+
+	TP_STRUCT__entry(
+		__field(struct kvm_vcpu *, vcpu)
+		__field(__u32, guest_debug)
+	),
+
+	TP_fast_assign(
+		__entry->vcpu = vcpu;
+		__entry->guest_debug = guest_debug;
+	),
+
+	TP_printk("vcpu: %p, flags: 0x%08x", __entry->vcpu, __entry->guest_debug)
+);
+
+TRACE_EVENT(kvm_arm_clear_debug,
+	TP_PROTO(__u32 guest_debug),
+	TP_ARGS(guest_debug),
+
+	TP_STRUCT__entry(
+		__field(__u32, guest_debug)
+	),
+
+	TP_fast_assign(
+		__entry->guest_debug = guest_debug;
+	),
+
+	TP_printk("flags: 0x%08x", __entry->guest_debug)
+);
+
+TRACE_EVENT(kvm_arm_set_dreg32,
+	TP_PROTO(const char *name, __u32 value),
+	TP_ARGS(name, value),
+
+	TP_STRUCT__entry(
+		__field(const char *, name)
+		__field(__u32, value)
+	),
+
+	TP_fast_assign(
+		__entry->name = name;
+		__entry->value = value;
+	),
+
+	TP_printk("%s: 0x%08x", __entry->name, __entry->value)
+);
+
+TRACE_EVENT(kvm_arm_set_regset,
+	TP_PROTO(const char *type, int len, __u64 *control, __u64 *value),
+	TP_ARGS(type, len, control, value),
+	TP_STRUCT__entry(
+		__field(const char *, name)
+		__field(int, len)
+		__array(u64, ctrls, 16)
+		__array(u64, values, 16)
+	),
+	TP_fast_assign(
+		__entry->name = type;
+		__entry->len = len;
+		memcpy(__entry->ctrls, control, len << 3);
+		memcpy(__entry->values, value, len << 3);
+	),
+	TP_printk("%d %s CTRL:%s VALUE:%s", __entry->len, __entry->name,
+		__print_array(__entry->ctrls, __entry->len, sizeof(__u64)),
+		__print_array(__entry->values, __entry->len, sizeof(__u64)))
+);
+
+TRACE_EVENT(trap_reg,
+	TP_PROTO(const char *fn, int reg, bool is_write, u64 write_value),
+	TP_ARGS(fn, reg, is_write, write_value),
+
+	TP_STRUCT__entry(
+		__field(const char *, fn)
+		__field(int, reg)
+		__field(bool, is_write)
+		__field(u64, write_value)
+	),
+
+	TP_fast_assign(
+		__entry->fn = fn;
+		__entry->reg = reg;
+		__entry->is_write = is_write;
+		__entry->write_value = write_value;
+	),
+
+	TP_printk("%s %s reg %d (0x%08llx)", __entry->fn,  __entry->is_write?"write to":"read from", __entry->reg, __entry->write_value)
+);
+
+TRACE_EVENT(kvm_handle_sys_reg,
+	TP_PROTO(unsigned long hsr),
+	TP_ARGS(hsr),
+
+	TP_STRUCT__entry(
+		__field(unsigned long,	hsr)
+	),
+
+	TP_fast_assign(
+		__entry->hsr = hsr;
+	),
+
+	TP_printk("HSR 0x%08lx", __entry->hsr)
+);
+
+TRACE_EVENT(kvm_set_guest_debug,
+	TP_PROTO(struct kvm_vcpu *vcpu, __u32 guest_debug),
+	TP_ARGS(vcpu, guest_debug),
+
+	TP_STRUCT__entry(
+		__field(struct kvm_vcpu *, vcpu)
+		__field(__u32, guest_debug)
+	),
+
+	TP_fast_assign(
+		__entry->vcpu = vcpu;
+		__entry->guest_debug = guest_debug;
+	),
+
+	TP_printk("vcpu: %p, flags: 0x%08x", __entry->vcpu, __entry->guest_debug)
+);
+
+
 #endif /* _TRACE_ARM64_KVM_H */
 
 #undef TRACE_INCLUDE_PATH

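One small detail in kvm_arm_set_regset above: the memcpy length is len << 3 because control and value point at u64 arrays, so each copy moves len * sizeof(u64) bytes (bounded by the 16-entry __array fields):

	int len = 6;                 /* e.g. six breakpoint pairs */
	size_t bytes = len << 3;     /* 6 * sizeof(u64) == 48 bytes */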
+ 3 - 2
arch/powerpc/include/asm/kvm_book3s.h

@@ -158,6 +158,7 @@ extern pfn_t kvmppc_gpa_to_pfn(struct kvm_vcpu *vcpu, gpa_t gpa, bool writing,
 			bool *writable);
 extern void kvmppc_add_revmap_chain(struct kvm *kvm, struct revmap_entry *rev,
 			unsigned long *rmap, long pte_index, int realmode);
+extern void kvmppc_update_rmap_change(unsigned long *rmap, unsigned long psize);
 extern void kvmppc_invalidate_hpte(struct kvm *kvm, __be64 *hptep,
 			unsigned long pte_index);
 void kvmppc_clear_ref_hpte(struct kvm *kvm, __be64 *hptep,
@@ -225,12 +226,12 @@ static inline u32 kvmppc_get_cr(struct kvm_vcpu *vcpu)
 	return vcpu->arch.cr;
 }
 
-static inline void kvmppc_set_xer(struct kvm_vcpu *vcpu, u32 val)
+static inline void kvmppc_set_xer(struct kvm_vcpu *vcpu, ulong val)
 {
 	vcpu->arch.xer = val;
 }
 
-static inline u32 kvmppc_get_xer(struct kvm_vcpu *vcpu)
+static inline ulong kvmppc_get_xer(struct kvm_vcpu *vcpu)
 {
 	return vcpu->arch.xer;
 }

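Widening kvmppc_set_xer()/kvmppc_get_xer() from u32 to ulong matters because XER is a 64-bit SPR on 64-bit implementations; round-tripping it through a 32-bit accessor silently drops the upper word. A one-line illustration of the truncation being removed (value is illustrative):

	ulong xer = 0x100000000UL | 0x20000000UL;  /* a bit above 31, plus CA */
	u32 old = (u32)xer;                        /* 0x20000000: upper bit lost */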
+ 21 - 1
arch/powerpc/include/asm/kvm_book3s_asm.h

@@ -25,6 +25,12 @@
 #define XICS_MFRR		0xc
 #define XICS_IPI		2	/* interrupt source # for IPIs */
 
+/* Maximum number of threads per physical core */
+#define MAX_SMT_THREADS		8
+
+/* Maximum number of subcores per physical core */
+#define MAX_SUBCORES		4
+
 #ifdef __ASSEMBLY__
 
 #ifdef CONFIG_KVM_BOOK3S_HANDLER
@@ -65,6 +71,19 @@ kvmppc_resume_\intno:
 
 #else  /*__ASSEMBLY__ */
 
+struct kvmppc_vcore;
+
+/* Struct used for coordinating micro-threading (split-core) mode changes */
+struct kvm_split_mode {
+	unsigned long	rpr;
+	unsigned long	pmmar;
+	unsigned long	ldbar;
+	u8		subcore_size;
+	u8		do_nap;
+	u8		napped[MAX_SMT_THREADS];
+	struct kvmppc_vcore *master_vcs[MAX_SUBCORES];
+};
+
 /*
  * This struct goes in the PACA on 64-bit processors.  It is used
  * to store host state that needs to be saved when we enter a guest
@@ -100,6 +119,7 @@ struct kvmppc_host_state {
 	u64 host_spurr;
 	u64 host_dscr;
 	u64 dec_expires;
+	struct kvm_split_mode *kvm_split_mode;
 #endif
 #ifdef CONFIG_PPC_BOOK3S_64
 	u64 cfar;
@@ -112,7 +132,7 @@ struct kvmppc_book3s_shadow_vcpu {
 	bool in_use;
 	ulong gpr[14];
 	u32 cr;
-	u32 xer;
+	ulong xer;
 	ulong ctr;
 	ulong lr;
 	ulong pc;

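kvm_split_mode is the scratch area through which the primary thread drives a dynamic split: it publishes the per-subcore SPR values (rpr, pmmar, ldbar) and the subcore size, and collects the napped[] handshakes from the secondary threads. The geometry is fixed by the POWER8 core: MAX_SMT_THREADS (8) hardware threads split 2- or 4-way, so:

	int split = 2;                               /* or 4 */
	int subcore_size = MAX_SMT_THREADS / split;  /* 4 (or 2) threads each */
	/* book3s_hv.c starts subcore n at HW thread {0, 4, 2, 6}[n] */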
+ 2 - 2
arch/powerpc/include/asm/kvm_booke.h

@@ -54,12 +54,12 @@ static inline u32 kvmppc_get_cr(struct kvm_vcpu *vcpu)
 	return vcpu->arch.cr;
 }
 
-static inline void kvmppc_set_xer(struct kvm_vcpu *vcpu, u32 val)
+static inline void kvmppc_set_xer(struct kvm_vcpu *vcpu, ulong val)
 {
 	vcpu->arch.xer = val;
 }
 
-static inline u32 kvmppc_get_xer(struct kvm_vcpu *vcpu)
+static inline ulong kvmppc_get_xer(struct kvm_vcpu *vcpu)
 {
 	return vcpu->arch.xer;
 }

+ 20 - 6
arch/powerpc/include/asm/kvm_host.h

@@ -205,8 +205,10 @@ struct revmap_entry {
  */
 #define KVMPPC_RMAP_LOCK_BIT	63
 #define KVMPPC_RMAP_RC_SHIFT	32
+#define KVMPPC_RMAP_CHG_SHIFT	48
 #define KVMPPC_RMAP_REFERENCED	(HPTE_R_R << KVMPPC_RMAP_RC_SHIFT)
 #define KVMPPC_RMAP_CHANGED	(HPTE_R_C << KVMPPC_RMAP_RC_SHIFT)
+#define KVMPPC_RMAP_CHG_ORDER	(0x3ful << KVMPPC_RMAP_CHG_SHIFT)
 #define KVMPPC_RMAP_PRESENT	0x100000000ul
 #define KVMPPC_RMAP_INDEX	0xfffffffful
 
@@ -278,7 +280,9 @@ struct kvmppc_vcore {
 	u16 last_cpu;
 	u8 vcore_state;
 	u8 in_guest;
+	struct kvmppc_vcore *master_vcore;
 	struct list_head runnable_threads;
+	struct list_head preempt_list;
 	spinlock_t lock;
 	wait_queue_head_t wq;
 	spinlock_t stoltb_lock;	/* protects stolen_tb and preempt_tb */
@@ -300,12 +304,21 @@ struct kvmppc_vcore {
 #define VCORE_EXIT_MAP(vc)	((vc)->entry_exit_map >> 8)
 #define VCORE_IS_EXITING(vc)	(VCORE_EXIT_MAP(vc) != 0)
 
-/* Values for vcore_state */
+/* This bit is used when a vcore exit is triggered from outside the vcore */
+#define VCORE_EXIT_REQ		0x10000
+
+/*
+ * Values for vcore_state.
+ * Note that these are arranged such that lower values
+ * (< VCORE_SLEEPING) don't require stolen time accounting
+ * on load/unload, and higher values do.
+ */
 #define VCORE_INACTIVE	0
-#define VCORE_SLEEPING	1
-#define VCORE_PREEMPT	2
-#define VCORE_RUNNING	3
-#define VCORE_EXITING	4
+#define VCORE_PREEMPT	1
+#define VCORE_PIGGYBACK	2
+#define VCORE_SLEEPING	3
+#define VCORE_RUNNING	4
+#define VCORE_EXITING	5
 
 /*
  * Struct used to manage memory for a virtual processor area
@@ -473,7 +486,7 @@ struct kvm_vcpu_arch {
 	ulong ciabr;
 	ulong cfar;
 	ulong ppr;
-	ulong pspb;
+	u32 pspb;
 	ulong fscr;
 	ulong shadow_fscr;
 	ulong ebbhr;
@@ -619,6 +632,7 @@ struct kvm_vcpu_arch {
 	int trap;
 	int state;
 	int ptid;
+	int thread_cpu;
 	bool timer_running;
 	wait_queue_head_t cpu_run;
 

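Note that the VCORE_* renumbering is deliberate, as the new comment says: every state that needs stolen-time accounting on vcpu load/unload now sorts at or above VCORE_SLEEPING, so the old '!= VCORE_INACTIVE' tests in book3s_hv.c collapse into one range check:

	/* states >= VCORE_SLEEPING account stolen time on load/unload */
	if (vc->runner == vcpu && vc->vcore_state >= VCORE_SLEEPING)
		kvmppc_core_end_stolen(vc);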
+ 1 - 1
arch/powerpc/include/asm/ppc-opcode.h

@@ -287,7 +287,7 @@
 
 /* POWER8 Micro Partition Prefetch (MPP) parameters */
 /* Address mask is common for LOGMPP instruction and MPPR SPR */
-#define PPC_MPPE_ADDRESS_MASK 0xffffffffc000
+#define PPC_MPPE_ADDRESS_MASK 0xffffffffc000ULL
 
 /* Bits 60 and 61 of MPP SPR should be set to one of the following */
 /* Aborting the fetch is indeed setting 00 in the table size bits */

+ 9 - 0
arch/powerpc/kernel/asm-offsets.c

@@ -511,6 +511,8 @@ int main(void)
 	DEFINE(VCPU_VPA, offsetof(struct kvm_vcpu, arch.vpa.pinned_addr));
 	DEFINE(VCPU_VPA_DIRTY, offsetof(struct kvm_vcpu, arch.vpa.dirty));
 	DEFINE(VCPU_HEIR, offsetof(struct kvm_vcpu, arch.emul_inst));
+	DEFINE(VCPU_CPU, offsetof(struct kvm_vcpu, cpu));
+	DEFINE(VCPU_THREAD_CPU, offsetof(struct kvm_vcpu, arch.thread_cpu));
 #endif
 #ifdef CONFIG_PPC_BOOK3S
 	DEFINE(VCPU_VCPUID, offsetof(struct kvm_vcpu, vcpu_id));
@@ -673,7 +675,14 @@ int main(void)
 	HSTATE_FIELD(HSTATE_DSCR, host_dscr);
 	HSTATE_FIELD(HSTATE_DABR, dabr);
 	HSTATE_FIELD(HSTATE_DECEXP, dec_expires);
+	HSTATE_FIELD(HSTATE_SPLIT_MODE, kvm_split_mode);
 	DEFINE(IPI_PRIORITY, IPI_PRIORITY);
+	DEFINE(KVM_SPLIT_RPR, offsetof(struct kvm_split_mode, rpr));
+	DEFINE(KVM_SPLIT_PMMAR, offsetof(struct kvm_split_mode, pmmar));
+	DEFINE(KVM_SPLIT_LDBAR, offsetof(struct kvm_split_mode, ldbar));
+	DEFINE(KVM_SPLIT_SIZE, offsetof(struct kvm_split_mode, subcore_size));
+	DEFINE(KVM_SPLIT_DO_NAP, offsetof(struct kvm_split_mode, do_nap));
+	DEFINE(KVM_SPLIT_NAPPED, offsetof(struct kvm_split_mode, napped));
 #endif /* CONFIG_KVM_BOOK3S_HV_POSSIBLE */
 
 #ifdef CONFIG_PPC_BOOK3S_64

+ 4 - 4
arch/powerpc/kvm/Kconfig

@@ -74,14 +74,14 @@ config KVM_BOOK3S_64
 	  If unsure, say N.
 
 config KVM_BOOK3S_64_HV
-	tristate "KVM support for POWER7 and PPC970 using hypervisor mode in host"
+	tristate "KVM for POWER7 and later using hypervisor mode in host"
 	depends on KVM_BOOK3S_64 && PPC_POWERNV
 	select KVM_BOOK3S_HV_POSSIBLE
 	select MMU_NOTIFIER
 	select CMA
 	---help---
 	  Support running unmodified book3s_64 guest kernels in
-	  virtual machines on POWER7 and PPC970 processors that have
+	  virtual machines on POWER7 and newer processors that have
 	  hypervisor mode available to the host.
 
 	  If you say Y here, KVM will use the hardware virtualization
@@ -89,8 +89,8 @@ config KVM_BOOK3S_64_HV
 	  guest operating systems will run at full hardware speed
 	  using supervisor and user modes.  However, this also means
 	  that KVM is not usable under PowerVM (pHyp), is only usable
-	  on POWER7 (or later) processors and PPC970-family processors,
-	  and cannot emulate a different processor from the host processor.
+	  on POWER7 or later processors, and cannot emulate a
+	  different processor from the host processor.
 
 	  If unsure, say N.
 

+ 2 - 1
arch/powerpc/kvm/book3s.c

@@ -240,7 +240,8 @@ void kvmppc_core_queue_inst_storage(struct kvm_vcpu *vcpu, ulong flags)
 	kvmppc_book3s_queue_irqprio(vcpu, BOOK3S_INTERRUPT_INST_STORAGE);
 }
 
-int kvmppc_book3s_irqprio_deliver(struct kvm_vcpu *vcpu, unsigned int priority)
+static int kvmppc_book3s_irqprio_deliver(struct kvm_vcpu *vcpu,
+					 unsigned int priority)
 {
 	int deliver = 1;
 	int vec = 0;

+ 1 - 0
arch/powerpc/kvm/book3s_32_mmu_host.c

@@ -26,6 +26,7 @@
 #include <asm/machdep.h>
 #include <asm/mmu_context.h>
 #include <asm/hw_irq.h>
+#include "book3s.h"
 
 /* #define DEBUG_MMU */
 /* #define DEBUG_SR */

+ 1 - 0
arch/powerpc/kvm/book3s_64_mmu_host.c

@@ -28,6 +28,7 @@
 #include <asm/mmu_context.h>
 #include <asm/hw_irq.h>
 #include "trace_pr.h"
+#include "book3s.h"
 
 #define PTE_SIZE 12
 

+ 7 - 1
arch/powerpc/kvm/book3s_64_mmu_hv.c

@@ -761,6 +761,8 @@ static int kvm_unmap_rmapp(struct kvm *kvm, unsigned long *rmapp,
 			/* Harvest R and C */
 			rcbits = be64_to_cpu(hptep[1]) & (HPTE_R_R | HPTE_R_C);
 			*rmapp |= rcbits << KVMPPC_RMAP_RC_SHIFT;
+			if (rcbits & HPTE_R_C)
+				kvmppc_update_rmap_change(rmapp, psize);
 			if (rcbits & ~rev[i].guest_rpte) {
 				rev[i].guest_rpte = ptel | rcbits;
 				note_hpte_modification(kvm, &rev[i]);
@@ -927,8 +929,12 @@ static int kvm_test_clear_dirty_npages(struct kvm *kvm, unsigned long *rmapp)
  retry:
 	lock_rmap(rmapp);
 	if (*rmapp & KVMPPC_RMAP_CHANGED) {
-		*rmapp &= ~KVMPPC_RMAP_CHANGED;
+		long change_order = (*rmapp & KVMPPC_RMAP_CHG_ORDER)
+			>> KVMPPC_RMAP_CHG_SHIFT;
+		*rmapp &= ~(KVMPPC_RMAP_CHANGED | KVMPPC_RMAP_CHG_ORDER);
 		npages_dirty = 1;
+		if (change_order > PAGE_SHIFT)
+			npages_dirty = 1ul << (change_order - PAGE_SHIFT);
 	}
 	if (!(*rmapp & KVMPPC_RMAP_PRESENT)) {
 		unlock_rmap(rmapp);

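The CHG_ORDER bits let a single rmap entry remember the page-size order of the mapping that dirtied it, so kvm_test_clear_dirty_npages() can report every base page backing a large page. For example, with 4k base pages (PAGE_SHIFT == 12) and a dirty 16MB page:

	long change_order = 24;              /* log2(16MB), read from the rmap */
	unsigned long npages_dirty = 1;
	if (change_order > PAGE_SHIFT)
		npages_dirty = 1ul << (change_order - PAGE_SHIFT);
	/* npages_dirty == 4096: the whole 16MB range is reported dirty */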
+ 1 - 0
arch/powerpc/kvm/book3s_emulate.c

@@ -23,6 +23,7 @@
 #include <asm/reg.h>
 #include <asm/switch_to.h>
 #include <asm/time.h>
+#include "book3s.h"
 
 #define OP_19_XOP_RFID		18
 #define OP_19_XOP_RFI		50

+ 582 - 82
arch/powerpc/kvm/book3s_hv.c

@@ -81,6 +81,12 @@ static DECLARE_BITMAP(default_enabled_hcalls, MAX_HCALL_OPCODE/4 + 1);
 #define MPP_BUFFER_ORDER	3
 #endif
 
+static int dynamic_mt_modes = 6;
+module_param(dynamic_mt_modes, int, S_IRUGO | S_IWUSR);
+MODULE_PARM_DESC(dynamic_mt_modes, "Set of allowed dynamic micro-threading modes: 0 (= none), 2, 4, or 6 (= 2 or 4)");
+static int target_smt_mode;
+module_param(target_smt_mode, int, S_IRUGO | S_IWUSR);
+MODULE_PARM_DESC(target_smt_mode, "Target threads per core (0 = max)");
 
 static void kvmppc_end_cede(struct kvm_vcpu *vcpu);
 static int kvmppc_hv_setup_htab_rma(struct kvm_vcpu *vcpu);
@@ -114,7 +120,7 @@ static bool kvmppc_ipi_thread(int cpu)
 
 static void kvmppc_fast_vcpu_kick_hv(struct kvm_vcpu *vcpu)
 {
-	int cpu = vcpu->cpu;
+	int cpu;
 	wait_queue_head_t *wqp;
 
 	wqp = kvm_arch_vcpu_wq(vcpu);
@@ -123,10 +129,11 @@ static void kvmppc_fast_vcpu_kick_hv(struct kvm_vcpu *vcpu)
 		++vcpu->stat.halt_wakeup;
 	}
 
-	if (kvmppc_ipi_thread(cpu + vcpu->arch.ptid))
+	if (kvmppc_ipi_thread(vcpu->arch.thread_cpu))
 		return;
 
 	/* CPU points to the first thread of the core */
+	cpu = vcpu->cpu;
 	if (cpu >= 0 && cpu < nr_cpu_ids && cpu_online(cpu))
 		smp_send_reschedule(cpu);
 }
@@ -164,6 +171,27 @@ static void kvmppc_fast_vcpu_kick_hv(struct kvm_vcpu *vcpu)
  * they should never fail.)
  */
 
+static void kvmppc_core_start_stolen(struct kvmppc_vcore *vc)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(&vc->stoltb_lock, flags);
+	vc->preempt_tb = mftb();
+	spin_unlock_irqrestore(&vc->stoltb_lock, flags);
+}
+
+static void kvmppc_core_end_stolen(struct kvmppc_vcore *vc)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(&vc->stoltb_lock, flags);
+	if (vc->preempt_tb != TB_NIL) {
+		vc->stolen_tb += mftb() - vc->preempt_tb;
+		vc->preempt_tb = TB_NIL;
+	}
+	spin_unlock_irqrestore(&vc->stoltb_lock, flags);
+}
+
 static void kvmppc_core_vcpu_load_hv(struct kvm_vcpu *vcpu, int cpu)
 {
 	struct kvmppc_vcore *vc = vcpu->arch.vcore;
@@ -175,14 +203,9 @@ static void kvmppc_core_vcpu_load_hv(struct kvm_vcpu *vcpu, int cpu)
 	 * vcpu, and once it is set to this vcpu, only this task
 	 * ever sets it to NULL.
 	 */
-	if (vc->runner == vcpu && vc->vcore_state != VCORE_INACTIVE) {
-		spin_lock_irqsave(&vc->stoltb_lock, flags);
-		if (vc->preempt_tb != TB_NIL) {
-			vc->stolen_tb += mftb() - vc->preempt_tb;
-			vc->preempt_tb = TB_NIL;
-		}
-		spin_unlock_irqrestore(&vc->stoltb_lock, flags);
-	}
+	if (vc->runner == vcpu && vc->vcore_state >= VCORE_SLEEPING)
+		kvmppc_core_end_stolen(vc);
+
 	spin_lock_irqsave(&vcpu->arch.tbacct_lock, flags);
 	if (vcpu->arch.state == KVMPPC_VCPU_BUSY_IN_HOST &&
 	    vcpu->arch.busy_preempt != TB_NIL) {
@@ -197,11 +220,9 @@ static void kvmppc_core_vcpu_put_hv(struct kvm_vcpu *vcpu)
 	struct kvmppc_vcore *vc = vcpu->arch.vcore;
 	unsigned long flags;
 
-	if (vc->runner == vcpu && vc->vcore_state != VCORE_INACTIVE) {
-		spin_lock_irqsave(&vc->stoltb_lock, flags);
-		vc->preempt_tb = mftb();
-		spin_unlock_irqrestore(&vc->stoltb_lock, flags);
-	}
+	if (vc->runner == vcpu && vc->vcore_state >= VCORE_SLEEPING)
+		kvmppc_core_start_stolen(vc);
+
 	spin_lock_irqsave(&vcpu->arch.tbacct_lock, flags);
 	if (vcpu->arch.state == KVMPPC_VCPU_BUSY_IN_HOST)
 		vcpu->arch.busy_preempt = mftb();
@@ -214,12 +235,12 @@ static void kvmppc_set_msr_hv(struct kvm_vcpu *vcpu, u64 msr)
 	kvmppc_end_cede(vcpu);
 }
 
-void kvmppc_set_pvr_hv(struct kvm_vcpu *vcpu, u32 pvr)
+static void kvmppc_set_pvr_hv(struct kvm_vcpu *vcpu, u32 pvr)
 {
 	vcpu->arch.pvr = pvr;
 }
 
-int kvmppc_set_arch_compat(struct kvm_vcpu *vcpu, u32 arch_compat)
+static int kvmppc_set_arch_compat(struct kvm_vcpu *vcpu, u32 arch_compat)
 {
 	unsigned long pcr = 0;
 	struct kvmppc_vcore *vc = vcpu->arch.vcore;
@@ -259,7 +280,7 @@ int kvmppc_set_arch_compat(struct kvm_vcpu *vcpu, u32 arch_compat)
 	return 0;
 }
 
-void kvmppc_dump_regs(struct kvm_vcpu *vcpu)
+static void kvmppc_dump_regs(struct kvm_vcpu *vcpu)
 {
 	int r;
 
@@ -292,7 +313,7 @@ void kvmppc_dump_regs(struct kvm_vcpu *vcpu)
 	       vcpu->arch.last_inst);
 }
 
-struct kvm_vcpu *kvmppc_find_vcpu(struct kvm *kvm, int id)
+static struct kvm_vcpu *kvmppc_find_vcpu(struct kvm *kvm, int id)
 {
 	int r;
 	struct kvm_vcpu *v, *ret = NULL;
@@ -641,7 +662,8 @@ static int kvm_arch_vcpu_yield_to(struct kvm_vcpu *target)
 
 	spin_lock(&vcore->lock);
 	if (target->arch.state == KVMPPC_VCPU_RUNNABLE &&
-	    vcore->vcore_state != VCORE_INACTIVE)
+	    vcore->vcore_state != VCORE_INACTIVE &&
+	    vcore->runner)
 		target = vcore->runner;
 	spin_unlock(&vcore->lock);
 
@@ -1431,6 +1453,7 @@ static struct kvmppc_vcore *kvmppc_vcore_create(struct kvm *kvm, int core)
 	vcore->lpcr = kvm->arch.lpcr;
 	vcore->first_vcpuid = core * threads_per_subcore;
 	vcore->kvm = kvm;
+	INIT_LIST_HEAD(&vcore->preempt_list);
 
 	vcore->mpp_buffer_is_valid = false;
 
@@ -1655,6 +1678,7 @@ static struct kvm_vcpu *kvmppc_core_vcpu_create_hv(struct kvm *kvm,
 	spin_unlock(&vcore->lock);
 	vcpu->arch.vcore = vcore;
 	vcpu->arch.ptid = vcpu->vcpu_id - vcore->first_vcpuid;
+	vcpu->arch.thread_cpu = -1;
 
 	vcpu->arch.cpu_type = KVM_CPU_3S_64;
 	kvmppc_sanity_check(vcpu);
@@ -1749,6 +1773,7 @@ static int kvmppc_grab_hwthread(int cpu)
 
 	/* Ensure the thread won't go into the kernel if it wakes */
 	tpaca->kvm_hstate.kvm_vcpu = NULL;
+	tpaca->kvm_hstate.kvm_vcore = NULL;
 	tpaca->kvm_hstate.napping = 0;
 	smp_wmb();
 	tpaca->kvm_hstate.hwthread_req = 1;
@@ -1780,26 +1805,32 @@ static void kvmppc_release_hwthread(int cpu)
 	tpaca = &paca[cpu];
 	tpaca->kvm_hstate.hwthread_req = 0;
 	tpaca->kvm_hstate.kvm_vcpu = NULL;
+	tpaca->kvm_hstate.kvm_vcore = NULL;
+	tpaca->kvm_hstate.kvm_split_mode = NULL;
 }
 
-static void kvmppc_start_thread(struct kvm_vcpu *vcpu)
+static void kvmppc_start_thread(struct kvm_vcpu *vcpu, struct kvmppc_vcore *vc)
 {
 	int cpu;
 	struct paca_struct *tpaca;
-	struct kvmppc_vcore *vc = vcpu->arch.vcore;
+	struct kvmppc_vcore *mvc = vc->master_vcore;
 
-	if (vcpu->arch.timer_running) {
-		hrtimer_try_to_cancel(&vcpu->arch.dec_timer);
-		vcpu->arch.timer_running = 0;
+	cpu = vc->pcpu;
+	if (vcpu) {
+		if (vcpu->arch.timer_running) {
+			hrtimer_try_to_cancel(&vcpu->arch.dec_timer);
+			vcpu->arch.timer_running = 0;
+		}
+		cpu += vcpu->arch.ptid;
+		vcpu->cpu = mvc->pcpu;
+		vcpu->arch.thread_cpu = cpu;
 	}
-	cpu = vc->pcpu + vcpu->arch.ptid;
 	tpaca = &paca[cpu];
-	tpaca->kvm_hstate.kvm_vcore = vc;
-	tpaca->kvm_hstate.ptid = vcpu->arch.ptid;
-	vcpu->cpu = vc->pcpu;
-	/* Order stores to hstate.kvm_vcore etc. before store to kvm_vcpu */
-	smp_wmb();
 	tpaca->kvm_hstate.kvm_vcpu = vcpu;
+	tpaca->kvm_hstate.ptid = cpu - mvc->pcpu;
+	/* Order stores to hstate.kvm_vcpu etc. before store to kvm_vcore */
+	smp_wmb();
+	tpaca->kvm_hstate.kvm_vcore = mvc;
 	if (cpu != smp_processor_id())
 		kvmppc_ipi_thread(cpu);
 }
@@ -1812,12 +1843,12 @@ static void kvmppc_wait_for_nap(void)
 	for (loops = 0; loops < 1000000; ++loops) {
 		/*
 		 * Check if all threads are finished.
-		 * We set the vcpu pointer when starting a thread
+		 * We set the vcore pointer when starting a thread
 		 * and the thread clears it when finished, so we look
-		 * for any threads that still have a non-NULL vcpu ptr.
+		 * for any threads that still have a non-NULL vcore ptr.
 		 */
 		for (i = 1; i < threads_per_subcore; ++i)
-			if (paca[cpu + i].kvm_hstate.kvm_vcpu)
+			if (paca[cpu + i].kvm_hstate.kvm_vcore)
 				break;
 		if (i == threads_per_subcore) {
 			HMT_medium();
@@ -1827,7 +1858,7 @@ static void kvmppc_wait_for_nap(void)
 	}
 	HMT_medium();
 	for (i = 1; i < threads_per_subcore; ++i)
-		if (paca[cpu + i].kvm_hstate.kvm_vcpu)
+		if (paca[cpu + i].kvm_hstate.kvm_vcore)
 			pr_err("KVM: CPU %d seems to be stuck\n", cpu + i);
 }
 
@@ -1890,6 +1921,278 @@ static void kvmppc_start_restoring_l2_cache(const struct kvmppc_vcore *vc)
 	mtspr(SPRN_MPPR, mpp_addr | PPC_MPPR_FETCH_WHOLE_TABLE);
 }
 
+/*
+ * A list of virtual cores for each physical CPU.
+ * These are vcores that could run but their runner VCPU tasks are
+ * (or may be) preempted.
+ */
+struct preempted_vcore_list {
+	struct list_head	list;
+	spinlock_t		lock;
+};
+
+static DEFINE_PER_CPU(struct preempted_vcore_list, preempted_vcores);
+
+static void init_vcore_lists(void)
+{
+	int cpu;
+
+	for_each_possible_cpu(cpu) {
+		struct preempted_vcore_list *lp = &per_cpu(preempted_vcores, cpu);
+		spin_lock_init(&lp->lock);
+		INIT_LIST_HEAD(&lp->list);
+	}
+}
+
+static void kvmppc_vcore_preempt(struct kvmppc_vcore *vc)
+{
+	struct preempted_vcore_list *lp = this_cpu_ptr(&preempted_vcores);
+
+	vc->vcore_state = VCORE_PREEMPT;
+	vc->pcpu = smp_processor_id();
+	if (vc->num_threads < threads_per_subcore) {
+		spin_lock(&lp->lock);
+		list_add_tail(&vc->preempt_list, &lp->list);
+		spin_unlock(&lp->lock);
+	}
+
+	/* Start accumulating stolen time */
+	kvmppc_core_start_stolen(vc);
+}
+
+static void kvmppc_vcore_end_preempt(struct kvmppc_vcore *vc)
+{
+	struct preempted_vcore_list *lp;
+
+	kvmppc_core_end_stolen(vc);
+	if (!list_empty(&vc->preempt_list)) {
+		lp = &per_cpu(preempted_vcores, vc->pcpu);
+		spin_lock(&lp->lock);
+		list_del_init(&vc->preempt_list);
+		spin_unlock(&lp->lock);
+	}
+	vc->vcore_state = VCORE_INACTIVE;
+}
+
+/*
+ * This stores information about the virtual cores currently
+ * assigned to a physical core.
+ */
+struct core_info {
+	int		n_subcores;
+	int		max_subcore_threads;
+	int		total_threads;
+	int		subcore_threads[MAX_SUBCORES];
+	struct kvm	*subcore_vm[MAX_SUBCORES];
+	struct list_head vcs[MAX_SUBCORES];
+};
+
+/*
+ * This mapping means subcores 0 and 1 can use threads 0-3 and 4-7
+ * respectively in 2-way micro-threading (split-core) mode.
+ */
+static int subcore_thread_map[MAX_SUBCORES] = { 0, 4, 2, 6 };
+
+static void init_core_info(struct core_info *cip, struct kvmppc_vcore *vc)
+{
+	int sub;
+
+	memset(cip, 0, sizeof(*cip));
+	cip->n_subcores = 1;
+	cip->max_subcore_threads = vc->num_threads;
+	cip->total_threads = vc->num_threads;
+	cip->subcore_threads[0] = vc->num_threads;
+	cip->subcore_vm[0] = vc->kvm;
+	for (sub = 0; sub < MAX_SUBCORES; ++sub)
+		INIT_LIST_HEAD(&cip->vcs[sub]);
+	list_add_tail(&vc->preempt_list, &cip->vcs[0]);
+}
+
+static bool subcore_config_ok(int n_subcores, int n_threads)
+{
+	/* Can only dynamically split if unsplit to begin with */
+	if (n_subcores > 1 && threads_per_subcore < MAX_SMT_THREADS)
+		return false;
+	if (n_subcores > MAX_SUBCORES)
+		return false;
+	if (n_subcores > 1) {
+		if (!(dynamic_mt_modes & 2))
+			n_subcores = 4;
+		if (n_subcores > 2 && !(dynamic_mt_modes & 4))
+			return false;
+	}
+
+	return n_subcores * roundup_pow_of_two(n_threads) <= MAX_SMT_THREADS;
+}
+
+static void init_master_vcore(struct kvmppc_vcore *vc)
+{
+	vc->master_vcore = vc;
+	vc->entry_exit_map = 0;
+	vc->in_guest = 0;
+	vc->napping_threads = 0;
+	vc->conferring_threads = 0;
+}
+
+/*
+ * See if the existing subcores can be split into 3 (or fewer) subcores
+ * of at most two threads each, so we can fit in another vcore.  This
+ * assumes there are at most two subcores and at most 6 threads in total.
+ */
+static bool can_split_piggybacked_subcores(struct core_info *cip)
+{
+	int sub, new_sub;
+	int large_sub = -1;
+	int thr;
+	int n_subcores = cip->n_subcores;
+	struct kvmppc_vcore *vc, *vcnext;
+	struct kvmppc_vcore *master_vc = NULL;
+
+	for (sub = 0; sub < cip->n_subcores; ++sub) {
+		if (cip->subcore_threads[sub] <= 2)
+			continue;
+		if (large_sub >= 0)
+			return false;
+		large_sub = sub;
+		vc = list_first_entry(&cip->vcs[sub], struct kvmppc_vcore,
+				      preempt_list);
+		if (vc->num_threads > 2)
+			return false;
+		n_subcores += (cip->subcore_threads[sub] - 1) >> 1;
+	}
+	if (n_subcores > 3 || large_sub < 0)
+		return false;
+
+	/*
+	 * Seems feasible, so go through and move vcores to new subcores.
+	 * Note that when we have two or more vcores in one subcore,
+	 * all those vcores must have only one thread each.
+	 */
+	new_sub = cip->n_subcores;
+	thr = 0;
+	sub = large_sub;
+	list_for_each_entry_safe(vc, vcnext, &cip->vcs[sub], preempt_list) {
+		if (thr >= 2) {
+			list_del(&vc->preempt_list);
+			list_add_tail(&vc->preempt_list, &cip->vcs[new_sub]);
+			/* vc->num_threads must be 1 */
+			if (++cip->subcore_threads[new_sub] == 1) {
+				cip->subcore_vm[new_sub] = vc->kvm;
+				init_master_vcore(vc);
+				master_vc = vc;
+				++cip->n_subcores;
+			} else {
+				vc->master_vcore = master_vc;
+				++new_sub;
+			}
+		}
+		thr += vc->num_threads;
+	}
+	cip->subcore_threads[large_sub] = 2;
+	cip->max_subcore_threads = 2;
+
+	return true;
+}
+
+static bool can_dynamic_split(struct kvmppc_vcore *vc, struct core_info *cip)
+{
+	int n_threads = vc->num_threads;
+	int sub;
+
+	if (!cpu_has_feature(CPU_FTR_ARCH_207S))
+		return false;
+
+	if (n_threads < cip->max_subcore_threads)
+		n_threads = cip->max_subcore_threads;
+	if (subcore_config_ok(cip->n_subcores + 1, n_threads)) {
+		cip->max_subcore_threads = n_threads;
+	} else if (cip->n_subcores <= 2 && cip->total_threads <= 6 &&
+		   vc->num_threads <= 2) {
+		/*
+		 * We may be able to fit another subcore in by
+		 * splitting an existing subcore with 3 or 4
+		 * threads into two 2-thread subcores, or one
+		 * with 5 or 6 threads into three subcores.
+		 * We can only do this if those subcores have
+		 * piggybacked virtual cores.
+		 */
+		if (!can_split_piggybacked_subcores(cip))
+			return false;
+	} else {
+		return false;
+	}
+
+	sub = cip->n_subcores;
+	++cip->n_subcores;
+	cip->total_threads += vc->num_threads;
+	cip->subcore_threads[sub] = vc->num_threads;
+	cip->subcore_vm[sub] = vc->kvm;
+	init_master_vcore(vc);
+	list_del(&vc->preempt_list);
+	list_add_tail(&vc->preempt_list, &cip->vcs[sub]);
+
+	return true;
+}
+
+static bool can_piggyback_subcore(struct kvmppc_vcore *pvc,
+				  struct core_info *cip, int sub)
+{
+	struct kvmppc_vcore *vc;
+	int n_thr;
+
+	vc = list_first_entry(&cip->vcs[sub], struct kvmppc_vcore,
+			      preempt_list);
+
+	/* require same VM and same per-core reg values */
+	if (pvc->kvm != vc->kvm ||
+	    pvc->tb_offset != vc->tb_offset ||
+	    pvc->pcr != vc->pcr ||
+	    pvc->lpcr != vc->lpcr)
+		return false;
+
+	/* P8 guest with > 1 thread per core would see wrong TIR value */
+	if (cpu_has_feature(CPU_FTR_ARCH_207S) &&
+	    (vc->num_threads > 1 || pvc->num_threads > 1))
+		return false;
+
+	n_thr = cip->subcore_threads[sub] + pvc->num_threads;
+	if (n_thr > cip->max_subcore_threads) {
+		if (!subcore_config_ok(cip->n_subcores, n_thr))
+			return false;
+		cip->max_subcore_threads = n_thr;
+	}
+
+	cip->total_threads += pvc->num_threads;
+	cip->subcore_threads[sub] = n_thr;
+	pvc->master_vcore = vc;
+	list_del(&pvc->preempt_list);
+	list_add_tail(&pvc->preempt_list, &cip->vcs[sub]);
+
+	return true;
+}
+
+/*
+ * Work out whether it is possible to piggyback the execution of
+ * vcore *pvc onto the execution of the other vcores described in *cip.
+ */
+static bool can_piggyback(struct kvmppc_vcore *pvc, struct core_info *cip,
+			  int target_threads)
+{
+	int sub;
+
+	if (cip->total_threads + pvc->num_threads > target_threads)
+		return false;
+	for (sub = 0; sub < cip->n_subcores; ++sub)
+		if (cip->subcore_threads[sub] &&
+		    can_piggyback_subcore(pvc, cip, sub))
+			return true;
+
+	if (can_dynamic_split(pvc, cip))
+		return true;
+
+	return false;
+}
+
 static void prepare_threads(struct kvmppc_vcore *vc)
 {
 	struct kvm_vcpu *vcpu, *vnext;
@@ -1909,12 +2212,45 @@ static void prepare_threads(struct kvmppc_vcore *vc)
 	}
 }
 
-static void post_guest_process(struct kvmppc_vcore *vc)
+static void collect_piggybacks(struct core_info *cip, int target_threads)
+{
+	struct preempted_vcore_list *lp = this_cpu_ptr(&preempted_vcores);
+	struct kvmppc_vcore *pvc, *vcnext;
+
+	spin_lock(&lp->lock);
+	list_for_each_entry_safe(pvc, vcnext, &lp->list, preempt_list) {
+		if (!spin_trylock(&pvc->lock))
+			continue;
+		prepare_threads(pvc);
+		if (!pvc->n_runnable) {
+			list_del_init(&pvc->preempt_list);
+			if (pvc->runner == NULL) {
+				pvc->vcore_state = VCORE_INACTIVE;
+				kvmppc_core_end_stolen(pvc);
+			}
+			spin_unlock(&pvc->lock);
+			continue;
+		}
+		if (!can_piggyback(pvc, cip, target_threads)) {
+			spin_unlock(&pvc->lock);
+			continue;
+		}
+		kvmppc_core_end_stolen(pvc);
+		pvc->vcore_state = VCORE_PIGGYBACK;
+		if (cip->total_threads >= target_threads)
+			break;
+	}
+	spin_unlock(&lp->lock);
+}
+
+static void post_guest_process(struct kvmppc_vcore *vc, bool is_master)
 {
+	int still_running = 0;
 	u64 now;
 	long ret;
 	struct kvm_vcpu *vcpu, *vnext;
 
+	spin_lock(&vc->lock);
 	now = get_tb();
 	list_for_each_entry_safe(vcpu, vnext, &vc->runnable_threads,
 				 arch.run_list) {
@@ -1933,17 +2269,36 @@ static void post_guest_process(struct kvmppc_vcore *vc)
 		vcpu->arch.ret = ret;
 		vcpu->arch.trap = 0;
 
-		if (vcpu->arch.ceded) {
-			if (!is_kvmppc_resume_guest(ret))
-				kvmppc_end_cede(vcpu);
-			else
+		if (is_kvmppc_resume_guest(vcpu->arch.ret)) {
+			if (vcpu->arch.pending_exceptions)
+				kvmppc_core_prepare_to_enter(vcpu);
+			if (vcpu->arch.ceded)
 				kvmppc_set_timer(vcpu);
-		}
-		if (!is_kvmppc_resume_guest(vcpu->arch.ret)) {
+			else
+				++still_running;
+		} else {
 			kvmppc_remove_runnable(vc, vcpu);
 			wake_up(&vcpu->arch.cpu_run);
 		}
 	}
+	list_del_init(&vc->preempt_list);
+	if (!is_master) {
+		if (still_running > 0) {
+			kvmppc_vcore_preempt(vc);
+		} else if (vc->runner) {
+			vc->vcore_state = VCORE_PREEMPT;
+			kvmppc_core_start_stolen(vc);
+		} else {
+			vc->vcore_state = VCORE_INACTIVE;
+		}
+		if (vc->n_runnable > 0 && vc->runner == NULL) {
+			/* make sure there's a candidate runner awake */
+			vcpu = list_first_entry(&vc->runnable_threads,
+						struct kvm_vcpu, arch.run_list);
+			wake_up(&vcpu->arch.cpu_run);
+		}
+	}
+	spin_unlock(&vc->lock);
 }
 
 /*
@@ -1955,6 +2310,15 @@ static noinline void kvmppc_run_core(struct kvmppc_vcore *vc)
 	struct kvm_vcpu *vcpu, *vnext;
 	int i;
 	int srcu_idx;
+	struct core_info core_info;
+	struct kvmppc_vcore *pvc, *vcnext;
+	struct kvm_split_mode split_info, *sip;
+	int split, subcore_size, active;
+	int sub;
+	bool thr0_done;
+	unsigned long cmd_bit, stat_bit;
+	int pcpu, thr;
+	int target_threads;
 
 	/*
 	 * Remove from the list any threads that have a signal pending
@@ -1969,11 +2333,8 @@ static noinline void kvmppc_run_core(struct kvmppc_vcore *vc)
 	/*
 	 * Initialize *vc.
 	 */
-	vc->entry_exit_map = 0;
+	init_master_vcore(vc);
 	vc->preempt_tb = TB_NIL;
-	vc->in_guest = 0;
-	vc->napping_threads = 0;
-	vc->conferring_threads = 0;
 
 	/*
 	 * Make sure we are running on primary threads, and that secondary
@@ -1991,24 +2352,120 @@ static noinline void kvmppc_run_core(struct kvmppc_vcore *vc)
 		goto out;
 	}
 
+	/*
+	 * See if we could run any other vcores on the physical core
+	 * along with this one.
+	 */
+	init_core_info(&core_info, vc);
+	pcpu = smp_processor_id();
+	target_threads = threads_per_subcore;
+	if (target_smt_mode && target_smt_mode < target_threads)
+		target_threads = target_smt_mode;
+	if (vc->num_threads < target_threads)
+		collect_piggybacks(&core_info, target_threads);
+
+	/* Decide on micro-threading (split-core) mode */
+	subcore_size = threads_per_subcore;
+	cmd_bit = stat_bit = 0;
+	split = core_info.n_subcores;
+	sip = NULL;
+	if (split > 1) {
+		/* threads_per_subcore must be MAX_SMT_THREADS (8) here */
+		if (split == 2 && (dynamic_mt_modes & 2)) {
+			cmd_bit = HID0_POWER8_1TO2LPAR;
+			stat_bit = HID0_POWER8_2LPARMODE;
+		} else {
+			split = 4;
+			cmd_bit = HID0_POWER8_1TO4LPAR;
+			stat_bit = HID0_POWER8_4LPARMODE;
+		}
+		subcore_size = MAX_SMT_THREADS / split;
+		sip = &split_info;
+		memset(&split_info, 0, sizeof(split_info));
+		split_info.rpr = mfspr(SPRN_RPR);
+		split_info.pmmar = mfspr(SPRN_PMMAR);
+		split_info.ldbar = mfspr(SPRN_LDBAR);
+		split_info.subcore_size = subcore_size;
+		for (sub = 0; sub < core_info.n_subcores; ++sub)
+			split_info.master_vcs[sub] =
+				list_first_entry(&core_info.vcs[sub],
+					struct kvmppc_vcore, preempt_list);
+		/* order writes to split_info before kvm_split_mode pointer */
+		smp_wmb();
+	}
+	pcpu = smp_processor_id();
+	for (thr = 0; thr < threads_per_subcore; ++thr)
+		paca[pcpu + thr].kvm_hstate.kvm_split_mode = sip;
+
+	/* Initiate micro-threading (split-core) if required */
+	if (cmd_bit) {
+		unsigned long hid0 = mfspr(SPRN_HID0);
+
+		hid0 |= cmd_bit | HID0_POWER8_DYNLPARDIS;
+		mb();
+		mtspr(SPRN_HID0, hid0);
+		isync();
+		for (;;) {
+			hid0 = mfspr(SPRN_HID0);
+			if (hid0 & stat_bit)
+				break;
+			cpu_relax();
+		}
+	}
 
-	vc->pcpu = smp_processor_id();
-	list_for_each_entry(vcpu, &vc->runnable_threads, arch.run_list) {
-		kvmppc_start_thread(vcpu);
-		kvmppc_create_dtl_entry(vcpu, vc);
-		trace_kvm_guest_enter(vcpu);
+	/* Start all the threads */
+	active = 0;
+	for (sub = 0; sub < core_info.n_subcores; ++sub) {
+		thr = subcore_thread_map[sub];
+		thr0_done = false;
+		active |= 1 << thr;
+		list_for_each_entry(pvc, &core_info.vcs[sub], preempt_list) {
+			pvc->pcpu = pcpu + thr;
+			list_for_each_entry(vcpu, &pvc->runnable_threads,
+					    arch.run_list) {
+				kvmppc_start_thread(vcpu, pvc);
+				kvmppc_create_dtl_entry(vcpu, pvc);
+				trace_kvm_guest_enter(vcpu);
+				if (!vcpu->arch.ptid)
+					thr0_done = true;
+				active |= 1 << (thr + vcpu->arch.ptid);
+			}
+			/*
+			 * We need to start the first thread of each subcore
+			 * even if it doesn't have a vcpu.
+			 */
+			if (pvc->master_vcore == pvc && !thr0_done)
+				kvmppc_start_thread(NULL, pvc);
+			thr += pvc->num_threads;
+		}
 	}
 	}
 
 
-	/* Set this explicitly in case thread 0 doesn't have a vcpu */
-	get_paca()->kvm_hstate.kvm_vcore = vc;
-	get_paca()->kvm_hstate.ptid = 0;
+	/*
+	 * Ensure that split_info.do_nap is set after setting
+	 * the vcore pointer in the PACA of the secondaries.
+	 */
+	smp_mb();
+	if (cmd_bit)
+		split_info.do_nap = 1;	/* ask secondaries to nap when done */
+
+	/*
+	 * When doing micro-threading, poke the inactive threads as well.
+	 * This gets them to the nap instruction after kvm_do_nap,
+	 * which reduces the time taken to unsplit later.
+	 */
+	if (split > 1)
+		for (thr = 1; thr < threads_per_subcore; ++thr)
+			if (!(active & (1 << thr)))
+				kvmppc_ipi_thread(pcpu + thr);
 
 
 	vc->vcore_state = VCORE_RUNNING;
 	vc->vcore_state = VCORE_RUNNING;
 	preempt_disable();
 	preempt_disable();
 
 
 	trace_kvmppc_run_core(vc, 0);
 	trace_kvmppc_run_core(vc, 0);
 
 
-	spin_unlock(&vc->lock);
+	for (sub = 0; sub < core_info.n_subcores; ++sub)
+		list_for_each_entry(pvc, &core_info.vcs[sub], preempt_list)
+			spin_unlock(&pvc->lock);
 
 
 	kvm_guest_enter();
 	kvm_guest_enter();
 
 
@@ -2019,32 +2476,58 @@ static noinline void kvmppc_run_core(struct kvmppc_vcore *vc)
 
 
 	__kvmppc_vcore_entry();
 	__kvmppc_vcore_entry();
 
 
-	spin_lock(&vc->lock);
-
 	if (vc->mpp_buffer)
 	if (vc->mpp_buffer)
 		kvmppc_start_saving_l2_cache(vc);
 		kvmppc_start_saving_l2_cache(vc);
 
 
-	/* disable sending of IPIs on virtual external irqs */
-	list_for_each_entry(vcpu, &vc->runnable_threads, arch.run_list)
-		vcpu->cpu = -1;
-	/* wait for secondary threads to finish writing their state to memory */
-	kvmppc_wait_for_nap();
-	for (i = 0; i < threads_per_subcore; ++i)
-		kvmppc_release_hwthread(vc->pcpu + i);
+	srcu_read_unlock(&vc->kvm->srcu, srcu_idx);
+
+	spin_lock(&vc->lock);
 	/* prevent other vcpu threads from doing kvmppc_start_thread() now */
 	/* prevent other vcpu threads from doing kvmppc_start_thread() now */
 	vc->vcore_state = VCORE_EXITING;
 	vc->vcore_state = VCORE_EXITING;
-	spin_unlock(&vc->lock);
 
 
-	srcu_read_unlock(&vc->kvm->srcu, srcu_idx);
+	/* wait for secondary threads to finish writing their state to memory */
+	kvmppc_wait_for_nap();
+
+	/* Return to whole-core mode if we split the core earlier */
+	if (split > 1) {
+		unsigned long hid0 = mfspr(SPRN_HID0);
+		unsigned long loops = 0;
+
+		hid0 &= ~HID0_POWER8_DYNLPARDIS;
+		stat_bit = HID0_POWER8_2LPARMODE | HID0_POWER8_4LPARMODE;
+		mb();
+		mtspr(SPRN_HID0, hid0);
+		isync();
+		for (;;) {
+			hid0 = mfspr(SPRN_HID0);
+			if (!(hid0 & stat_bit))
+				break;
+			cpu_relax();
+			++loops;
+		}
+		split_info.do_nap = 0;
+	}
+
+	/* Let secondaries go back to the offline loop */
+	for (i = 0; i < threads_per_subcore; ++i) {
+		kvmppc_release_hwthread(pcpu + i);
+		if (sip && sip->napped[i])
+			kvmppc_ipi_thread(pcpu + i);
+	}
+
+	spin_unlock(&vc->lock);
 
 
 	/* make sure updates to secondary vcpu structs are visible now */
 	/* make sure updates to secondary vcpu structs are visible now */
 	smp_mb();
 	smp_mb();
 	kvm_guest_exit();
 	kvm_guest_exit();
 
 
-	preempt_enable();
+	for (sub = 0; sub < core_info.n_subcores; ++sub)
+		list_for_each_entry_safe(pvc, vcnext, &core_info.vcs[sub],
+					 preempt_list)
+			post_guest_process(pvc, pvc == vc);
 
 
 	spin_lock(&vc->lock);
 	spin_lock(&vc->lock);
-	post_guest_process(vc);
+	preempt_enable();
 
 
  out:
  out:
 	vc->vcore_state = VCORE_INACTIVE;
 	vc->vcore_state = VCORE_INACTIVE;
@@ -2055,13 +2538,17 @@ static noinline void kvmppc_run_core(struct kvmppc_vcore *vc)
  * Wait for some other vcpu thread to execute us, and
  * Wait for some other vcpu thread to execute us, and
  * wake us up when we need to handle something in the host.
  * wake us up when we need to handle something in the host.
  */
  */
-static void kvmppc_wait_for_exec(struct kvm_vcpu *vcpu, int wait_state)
+static void kvmppc_wait_for_exec(struct kvmppc_vcore *vc,
+				 struct kvm_vcpu *vcpu, int wait_state)
 {
 {
 	DEFINE_WAIT(wait);
 	DEFINE_WAIT(wait);
 
 
 	prepare_to_wait(&vcpu->arch.cpu_run, &wait, wait_state);
 	prepare_to_wait(&vcpu->arch.cpu_run, &wait, wait_state);
-	if (vcpu->arch.state == KVMPPC_VCPU_RUNNABLE)
+	if (vcpu->arch.state == KVMPPC_VCPU_RUNNABLE) {
+		spin_unlock(&vc->lock);
 		schedule();
 		schedule();
+		spin_lock(&vc->lock);
+	}
 	finish_wait(&vcpu->arch.cpu_run, &wait);
 	finish_wait(&vcpu->arch.cpu_run, &wait);
 }
 }
 
 
@@ -2137,9 +2624,21 @@ static int kvmppc_run_vcpu(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
 	 * this thread straight away and have it join in.
 	 * this thread straight away and have it join in.
 	 */
 	 */
 	if (!signal_pending(current)) {
 	if (!signal_pending(current)) {
-		if (vc->vcore_state == VCORE_RUNNING && !VCORE_IS_EXITING(vc)) {
+		if (vc->vcore_state == VCORE_PIGGYBACK) {
+			struct kvmppc_vcore *mvc = vc->master_vcore;
+			if (spin_trylock(&mvc->lock)) {
+				if (mvc->vcore_state == VCORE_RUNNING &&
+				    !VCORE_IS_EXITING(mvc)) {
+					kvmppc_create_dtl_entry(vcpu, vc);
+					kvmppc_start_thread(vcpu, vc);
+					trace_kvm_guest_enter(vcpu);
+				}
+				spin_unlock(&mvc->lock);
+			}
+		} else if (vc->vcore_state == VCORE_RUNNING &&
+			   !VCORE_IS_EXITING(vc)) {
 			kvmppc_create_dtl_entry(vcpu, vc);
 			kvmppc_create_dtl_entry(vcpu, vc);
-			kvmppc_start_thread(vcpu);
+			kvmppc_start_thread(vcpu, vc);
 			trace_kvm_guest_enter(vcpu);
 			trace_kvm_guest_enter(vcpu);
 		} else if (vc->vcore_state == VCORE_SLEEPING) {
 		} else if (vc->vcore_state == VCORE_SLEEPING) {
 			wake_up(&vc->wq);
 			wake_up(&vc->wq);
@@ -2149,10 +2648,11 @@ static int kvmppc_run_vcpu(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
 
 
 	while (vcpu->arch.state == KVMPPC_VCPU_RUNNABLE &&
 	while (vcpu->arch.state == KVMPPC_VCPU_RUNNABLE &&
 	       !signal_pending(current)) {
 	       !signal_pending(current)) {
+		if (vc->vcore_state == VCORE_PREEMPT && vc->runner == NULL)
+			kvmppc_vcore_end_preempt(vc);
+
 		if (vc->vcore_state != VCORE_INACTIVE) {
 		if (vc->vcore_state != VCORE_INACTIVE) {
-			spin_unlock(&vc->lock);
-			kvmppc_wait_for_exec(vcpu, TASK_INTERRUPTIBLE);
-			spin_lock(&vc->lock);
+			kvmppc_wait_for_exec(vc, vcpu, TASK_INTERRUPTIBLE);
 			continue;
 			continue;
 		}
 		}
 		list_for_each_entry_safe(v, vn, &vc->runnable_threads,
 		list_for_each_entry_safe(v, vn, &vc->runnable_threads,
@@ -2179,10 +2679,11 @@ static int kvmppc_run_vcpu(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
 		if (n_ceded == vc->n_runnable) {
 		if (n_ceded == vc->n_runnable) {
 			kvmppc_vcore_blocked(vc);
 			kvmppc_vcore_blocked(vc);
 		} else if (need_resched()) {
 		} else if (need_resched()) {
-			vc->vcore_state = VCORE_PREEMPT;
+			kvmppc_vcore_preempt(vc);
 			/* Let something else run */
 			/* Let something else run */
 			cond_resched_lock(&vc->lock);
 			cond_resched_lock(&vc->lock);
-			vc->vcore_state = VCORE_INACTIVE;
+			if (vc->vcore_state == VCORE_PREEMPT)
+				kvmppc_vcore_end_preempt(vc);
 		} else {
 		} else {
 			kvmppc_run_core(vc);
 			kvmppc_run_core(vc);
 		}
 		}
@@ -2191,11 +2692,8 @@ static int kvmppc_run_vcpu(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
 
 
 	while (vcpu->arch.state == KVMPPC_VCPU_RUNNABLE &&
 	while (vcpu->arch.state == KVMPPC_VCPU_RUNNABLE &&
 	       (vc->vcore_state == VCORE_RUNNING ||
 	       (vc->vcore_state == VCORE_RUNNING ||
-		vc->vcore_state == VCORE_EXITING)) {
-		spin_unlock(&vc->lock);
-		kvmppc_wait_for_exec(vcpu, TASK_UNINTERRUPTIBLE);
-		spin_lock(&vc->lock);
-	}
+		vc->vcore_state == VCORE_EXITING))
+		kvmppc_wait_for_exec(vc, vcpu, TASK_UNINTERRUPTIBLE);
 
 
 	if (vcpu->arch.state == KVMPPC_VCPU_RUNNABLE) {
 	if (vcpu->arch.state == KVMPPC_VCPU_RUNNABLE) {
 		kvmppc_remove_runnable(vc, vcpu);
 		kvmppc_remove_runnable(vc, vcpu);
@@ -2755,6 +3253,8 @@ static int kvmppc_book3s_init_hv(void)
 
 
 	init_default_hcalls();
 	init_default_hcalls();
 
 
+	init_vcore_lists();
+
 	r = kvmppc_mmu_hv_init();
 	r = kvmppc_mmu_hv_init();
 	return r;
 	return r;
 }
 }

+ 28 - 4
arch/powerpc/kvm/book3s_hv_builtin.c

@@ -110,14 +110,15 @@ void __init kvm_cma_reserve(void)
 long int kvmppc_rm_h_confer(struct kvm_vcpu *vcpu, int target,
 long int kvmppc_rm_h_confer(struct kvm_vcpu *vcpu, int target,
 			    unsigned int yield_count)
 			    unsigned int yield_count)
 {
 {
-	struct kvmppc_vcore *vc = vcpu->arch.vcore;
+	struct kvmppc_vcore *vc = local_paca->kvm_hstate.kvm_vcore;
+	int ptid = local_paca->kvm_hstate.ptid;
 	int threads_running;
 	int threads_running;
 	int threads_ceded;
 	int threads_ceded;
 	int threads_conferring;
 	int threads_conferring;
 	u64 stop = get_tb() + 10 * tb_ticks_per_usec;
 	u64 stop = get_tb() + 10 * tb_ticks_per_usec;
 	int rv = H_SUCCESS; /* => don't yield */
 	int rv = H_SUCCESS; /* => don't yield */
 
 
-	set_bit(vcpu->arch.ptid, &vc->conferring_threads);
+	set_bit(ptid, &vc->conferring_threads);
 	while ((get_tb() < stop) && !VCORE_IS_EXITING(vc)) {
 	while ((get_tb() < stop) && !VCORE_IS_EXITING(vc)) {
 		threads_running = VCORE_ENTRY_MAP(vc);
 		threads_running = VCORE_ENTRY_MAP(vc);
 		threads_ceded = vc->napping_threads;
 		threads_ceded = vc->napping_threads;
@@ -127,7 +128,7 @@ long int kvmppc_rm_h_confer(struct kvm_vcpu *vcpu, int target,
 			break;
 			break;
 		}
 		}
 	}
 	}
-	clear_bit(vcpu->arch.ptid, &vc->conferring_threads);
+	clear_bit(ptid, &vc->conferring_threads);
 	return rv;
 	return rv;
 }
 }
 
 
@@ -238,7 +239,8 @@ void kvmhv_commence_exit(int trap)
 {
 {
 	struct kvmppc_vcore *vc = local_paca->kvm_hstate.kvm_vcore;
 	struct kvmppc_vcore *vc = local_paca->kvm_hstate.kvm_vcore;
 	int ptid = local_paca->kvm_hstate.ptid;
 	int ptid = local_paca->kvm_hstate.ptid;
-	int me, ee;
+	struct kvm_split_mode *sip = local_paca->kvm_hstate.kvm_split_mode;
+	int me, ee, i;
 
 
 	/* Set our bit in the threads-exiting-guest map in the 0xff00
 	/* Set our bit in the threads-exiting-guest map in the 0xff00
 	   bits of vcore->entry_exit_map */
 	   bits of vcore->entry_exit_map */
@@ -258,4 +260,26 @@ void kvmhv_commence_exit(int trap)
 	 */
 	 */
 	if (trap != BOOK3S_INTERRUPT_HV_DECREMENTER)
 	if (trap != BOOK3S_INTERRUPT_HV_DECREMENTER)
 		kvmhv_interrupt_vcore(vc, ee & ~(1 << ptid));
 		kvmhv_interrupt_vcore(vc, ee & ~(1 << ptid));
+
+	/*
+	 * If we are doing dynamic micro-threading, interrupt the other
+	 * subcores to pull them out of their guests too.
+	 */
+	if (!sip)
+		return;
+
+	for (i = 0; i < MAX_SUBCORES; ++i) {
+		vc = sip->master_vcs[i];
+		if (!vc)
+			break;
+		do {
+			ee = vc->entry_exit_map;
+			/* Already asked to exit? */
+			if ((ee >> 8) != 0)
+				break;
+		} while (cmpxchg(&vc->entry_exit_map, ee,
+				 ee | VCORE_EXIT_REQ) != ee);
+		if ((ee >> 8) == 0)
+			kvmhv_interrupt_vcore(vc, ee);
+	}
 }
 }

+ 148 - 13
arch/powerpc/kvm/book3s_hv_rm_mmu.c

@@ -12,6 +12,7 @@
 #include <linux/kvm_host.h>
 #include <linux/kvm_host.h>
 #include <linux/hugetlb.h>
 #include <linux/hugetlb.h>
 #include <linux/module.h>
 #include <linux/module.h>
+#include <linux/log2.h>
 
 
 #include <asm/tlbflush.h>
 #include <asm/tlbflush.h>
 #include <asm/kvm_ppc.h>
 #include <asm/kvm_ppc.h>
@@ -97,25 +98,52 @@ void kvmppc_add_revmap_chain(struct kvm *kvm, struct revmap_entry *rev,
 }
 }
 EXPORT_SYMBOL_GPL(kvmppc_add_revmap_chain);
 EXPORT_SYMBOL_GPL(kvmppc_add_revmap_chain);
 
 
+/* Update the changed page order field of an rmap entry */
+void kvmppc_update_rmap_change(unsigned long *rmap, unsigned long psize)
+{
+	unsigned long order;
+
+	if (!psize)
+		return;
+	order = ilog2(psize);
+	order <<= KVMPPC_RMAP_CHG_SHIFT;
+	if (order > (*rmap & KVMPPC_RMAP_CHG_ORDER))
+		*rmap = (*rmap & ~KVMPPC_RMAP_CHG_ORDER) | order;
+}
+EXPORT_SYMBOL_GPL(kvmppc_update_rmap_change);
+
+/* Returns a pointer to the revmap entry for the page mapped by a HPTE */
+static unsigned long *revmap_for_hpte(struct kvm *kvm, unsigned long hpte_v,
+				      unsigned long hpte_gr)
+{
+	struct kvm_memory_slot *memslot;
+	unsigned long *rmap;
+	unsigned long gfn;
+
+	gfn = hpte_rpn(hpte_gr, hpte_page_size(hpte_v, hpte_gr));
+	memslot = __gfn_to_memslot(kvm_memslots_raw(kvm), gfn);
+	if (!memslot)
+		return NULL;
+
+	rmap = real_vmalloc_addr(&memslot->arch.rmap[gfn - memslot->base_gfn]);
+	return rmap;
+}
+
 /* Remove this HPTE from the chain for a real page */
 /* Remove this HPTE from the chain for a real page */
 static void remove_revmap_chain(struct kvm *kvm, long pte_index,
 static void remove_revmap_chain(struct kvm *kvm, long pte_index,
 				struct revmap_entry *rev,
 				struct revmap_entry *rev,
 				unsigned long hpte_v, unsigned long hpte_r)
 				unsigned long hpte_v, unsigned long hpte_r)
 {
 {
 	struct revmap_entry *next, *prev;
 	struct revmap_entry *next, *prev;
-	unsigned long gfn, ptel, head;
-	struct kvm_memory_slot *memslot;
+	unsigned long ptel, head;
 	unsigned long *rmap;
 	unsigned long *rmap;
 	unsigned long rcbits;
 	unsigned long rcbits;
 
 
 	rcbits = hpte_r & (HPTE_R_R | HPTE_R_C);
 	rcbits = hpte_r & (HPTE_R_R | HPTE_R_C);
 	ptel = rev->guest_rpte |= rcbits;
 	ptel = rev->guest_rpte |= rcbits;
-	gfn = hpte_rpn(ptel, hpte_page_size(hpte_v, ptel));
-	memslot = __gfn_to_memslot(kvm_memslots_raw(kvm), gfn);
-	if (!memslot)
+	rmap = revmap_for_hpte(kvm, hpte_v, ptel);
+	if (!rmap)
 		return;
 		return;
-
-	rmap = real_vmalloc_addr(&memslot->arch.rmap[gfn - memslot->base_gfn]);
 	lock_rmap(rmap);
 	lock_rmap(rmap);
 
 
 	head = *rmap & KVMPPC_RMAP_INDEX;
 	head = *rmap & KVMPPC_RMAP_INDEX;
@@ -131,6 +159,8 @@ static void remove_revmap_chain(struct kvm *kvm, long pte_index,
 			*rmap = (*rmap & ~KVMPPC_RMAP_INDEX) | head;
 			*rmap = (*rmap & ~KVMPPC_RMAP_INDEX) | head;
 	}
 	}
 	*rmap |= rcbits << KVMPPC_RMAP_RC_SHIFT;
 	*rmap |= rcbits << KVMPPC_RMAP_RC_SHIFT;
+	if (rcbits & HPTE_R_C)
+		kvmppc_update_rmap_change(rmap, hpte_page_size(hpte_v, hpte_r));
 	unlock_rmap(rmap);
 	unlock_rmap(rmap);
 }
 }
 
 
@@ -421,14 +451,20 @@ long kvmppc_do_h_remove(struct kvm *kvm, unsigned long flags,
 	rev = real_vmalloc_addr(&kvm->arch.revmap[pte_index]);
 	rev = real_vmalloc_addr(&kvm->arch.revmap[pte_index]);
 	v = pte & ~HPTE_V_HVLOCK;
 	v = pte & ~HPTE_V_HVLOCK;
 	if (v & HPTE_V_VALID) {
 	if (v & HPTE_V_VALID) {
-		u64 pte1;
-
-		pte1 = be64_to_cpu(hpte[1]);
 		hpte[0] &= ~cpu_to_be64(HPTE_V_VALID);
 		hpte[0] &= ~cpu_to_be64(HPTE_V_VALID);
-		rb = compute_tlbie_rb(v, pte1, pte_index);
+		rb = compute_tlbie_rb(v, be64_to_cpu(hpte[1]), pte_index);
 		do_tlbies(kvm, &rb, 1, global_invalidates(kvm, flags), true);
 		do_tlbies(kvm, &rb, 1, global_invalidates(kvm, flags), true);
-		/* Read PTE low word after tlbie to get final R/C values */
-		remove_revmap_chain(kvm, pte_index, rev, v, pte1);
+		/*
+		 * The reference (R) and change (C) bits in a HPT
+		 * entry can be set by hardware at any time up until
+		 * the HPTE is invalidated and the TLB invalidation
+		 * sequence has completed.  This means that when
+		 * removing a HPTE, we need to re-read the HPTE after
+		 * the invalidation sequence has completed in order to
+		 * obtain reliable values of R and C.
+		 */
+		remove_revmap_chain(kvm, pte_index, rev, v,
+				    be64_to_cpu(hpte[1]));
 	}
 	}
 	r = rev->guest_rpte & ~HPTE_GR_RESERVED;
 	r = rev->guest_rpte & ~HPTE_GR_RESERVED;
 	note_hpte_modification(kvm, rev);
 	note_hpte_modification(kvm, rev);
@@ -655,6 +691,105 @@ long kvmppc_h_read(struct kvm_vcpu *vcpu, unsigned long flags,
 	return H_SUCCESS;
 	return H_SUCCESS;
 }
 }
 
 
+long kvmppc_h_clear_ref(struct kvm_vcpu *vcpu, unsigned long flags,
+			unsigned long pte_index)
+{
+	struct kvm *kvm = vcpu->kvm;
+	__be64 *hpte;
+	unsigned long v, r, gr;
+	struct revmap_entry *rev;
+	unsigned long *rmap;
+	long ret = H_NOT_FOUND;
+
+	if (pte_index >= kvm->arch.hpt_npte)
+		return H_PARAMETER;
+
+	rev = real_vmalloc_addr(&kvm->arch.revmap[pte_index]);
+	hpte = (__be64 *)(kvm->arch.hpt_virt + (pte_index << 4));
+	while (!try_lock_hpte(hpte, HPTE_V_HVLOCK))
+		cpu_relax();
+	v = be64_to_cpu(hpte[0]);
+	r = be64_to_cpu(hpte[1]);
+	if (!(v & (HPTE_V_VALID | HPTE_V_ABSENT)))
+		goto out;
+
+	gr = rev->guest_rpte;
+	if (rev->guest_rpte & HPTE_R_R) {
+		rev->guest_rpte &= ~HPTE_R_R;
+		note_hpte_modification(kvm, rev);
+	}
+	if (v & HPTE_V_VALID) {
+		gr |= r & (HPTE_R_R | HPTE_R_C);
+		if (r & HPTE_R_R) {
+			kvmppc_clear_ref_hpte(kvm, hpte, pte_index);
+			rmap = revmap_for_hpte(kvm, v, gr);
+			if (rmap) {
+				lock_rmap(rmap);
+				*rmap |= KVMPPC_RMAP_REFERENCED;
+				unlock_rmap(rmap);
+			}
+		}
+	}
+	vcpu->arch.gpr[4] = gr;
+	ret = H_SUCCESS;
+ out:
+	unlock_hpte(hpte, v & ~HPTE_V_HVLOCK);
+	return ret;
+}
+
+long kvmppc_h_clear_mod(struct kvm_vcpu *vcpu, unsigned long flags,
+			unsigned long pte_index)
+{
+	struct kvm *kvm = vcpu->kvm;
+	__be64 *hpte;
+	unsigned long v, r, gr;
+	struct revmap_entry *rev;
+	unsigned long *rmap;
+	long ret = H_NOT_FOUND;
+
+	if (pte_index >= kvm->arch.hpt_npte)
+		return H_PARAMETER;
+
+	rev = real_vmalloc_addr(&kvm->arch.revmap[pte_index]);
+	hpte = (__be64 *)(kvm->arch.hpt_virt + (pte_index << 4));
+	while (!try_lock_hpte(hpte, HPTE_V_HVLOCK))
+		cpu_relax();
+	v = be64_to_cpu(hpte[0]);
+	r = be64_to_cpu(hpte[1]);
+	if (!(v & (HPTE_V_VALID | HPTE_V_ABSENT)))
+		goto out;
+
+	gr = rev->guest_rpte;
+	if (gr & HPTE_R_C) {
+		rev->guest_rpte &= ~HPTE_R_C;
+		note_hpte_modification(kvm, rev);
+	}
+	if (v & HPTE_V_VALID) {
+		/* need to make it temporarily absent so C is stable */
+		hpte[0] |= cpu_to_be64(HPTE_V_ABSENT);
+		kvmppc_invalidate_hpte(kvm, hpte, pte_index);
+		r = be64_to_cpu(hpte[1]);
+		gr |= r & (HPTE_R_R | HPTE_R_C);
+		if (r & HPTE_R_C) {
+			unsigned long psize = hpte_page_size(v, r);
+			hpte[1] = cpu_to_be64(r & ~HPTE_R_C);
+			eieio();
+			rmap = revmap_for_hpte(kvm, v, gr);
+			if (rmap) {
+				lock_rmap(rmap);
+				*rmap |= KVMPPC_RMAP_CHANGED;
+				kvmppc_update_rmap_change(rmap, psize);
+				unlock_rmap(rmap);
+			}
+		}
+	}
+	vcpu->arch.gpr[4] = gr;
+	ret = H_SUCCESS;
+ out:
+	unlock_hpte(hpte, v & ~HPTE_V_HVLOCK);
+	return ret;
+}
+
 void kvmppc_invalidate_hpte(struct kvm *kvm, __be64 *hptep,
 void kvmppc_invalidate_hpte(struct kvm *kvm, __be64 *hptep,
 			unsigned long pte_index)
 			unsigned long pte_index)
 {
 {

+ 1 - 3
arch/powerpc/kvm/book3s_hv_rm_xics.c

@@ -67,14 +67,12 @@ static void icp_rm_set_vcpu_irq(struct kvm_vcpu *vcpu,
 	}
 	}
 
 
 	/* Check if the core is loaded, if not, too hard */
 	/* Check if the core is loaded, if not, too hard */
-	cpu = vcpu->cpu;
+	cpu = vcpu->arch.thread_cpu;
 	if (cpu < 0 || cpu >= nr_cpu_ids) {
 	if (cpu < 0 || cpu >= nr_cpu_ids) {
 		this_icp->rm_action |= XICS_RM_KICK_VCPU;
 		this_icp->rm_action |= XICS_RM_KICK_VCPU;
 		this_icp->rm_kick_target = vcpu;
 		this_icp->rm_kick_target = vcpu;
 		return;
 		return;
 	}
 	}
-	/* In SMT cpu will always point to thread 0, we adjust it */
-	cpu += vcpu->arch.ptid;
 
 
 	smp_mb();
 	smp_mb();
 	kvmhv_rm_send_ipi(cpu);
 	kvmhv_rm_send_ipi(cpu);

+ 117 - 20
arch/powerpc/kvm/book3s_hv_rmhandlers.S

@@ -128,6 +128,10 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
 	subf	r4, r4, r3
 	subf	r4, r4, r3
 	mtspr	SPRN_DEC, r4
 	mtspr	SPRN_DEC, r4
 
 
+	/* hwthread_req may have got set by cede or no vcpu, so clear it */
+	li	r0, 0
+	stb	r0, HSTATE_HWTHREAD_REQ(r13)
+
 	/*
 	/*
 	 * For external and machine check interrupts, we need
 	 * For external and machine check interrupts, we need
 	 * to call the Linux handler to process the interrupt.
 	 * to call the Linux handler to process the interrupt.
@@ -215,7 +219,6 @@ kvm_novcpu_wakeup:
 	ld	r5, HSTATE_KVM_VCORE(r13)
 	ld	r5, HSTATE_KVM_VCORE(r13)
 	li	r0, 0
 	li	r0, 0
 	stb	r0, HSTATE_NAPPING(r13)
 	stb	r0, HSTATE_NAPPING(r13)
-	stb	r0, HSTATE_HWTHREAD_REQ(r13)
 
 
 	/* check the wake reason */
 	/* check the wake reason */
 	bl	kvmppc_check_wake_reason
 	bl	kvmppc_check_wake_reason
@@ -315,10 +318,10 @@ kvm_start_guest:
 	cmpdi	r3, 0
 	cmpdi	r3, 0
 	bge	kvm_no_guest
 	bge	kvm_no_guest
 
 
-	/* get vcpu pointer, NULL if we have no vcpu to run */
-	ld	r4,HSTATE_KVM_VCPU(r13)
-	cmpdi	r4,0
-	/* if we have no vcpu to run, go back to sleep */
+	/* get vcore pointer, NULL if we have nothing to run */
+	ld	r5,HSTATE_KVM_VCORE(r13)
+	cmpdi	r5,0
+	/* if we have no vcore to run, go back to sleep */
 	beq	kvm_no_guest
 	beq	kvm_no_guest
 
 
 kvm_secondary_got_guest:
 kvm_secondary_got_guest:
@@ -327,21 +330,42 @@ kvm_secondary_got_guest:
 	ld	r6, PACA_DSCR_DEFAULT(r13)
 	ld	r6, PACA_DSCR_DEFAULT(r13)
 	std	r6, HSTATE_DSCR(r13)
 	std	r6, HSTATE_DSCR(r13)
 
 
-	/* Order load of vcore, ptid etc. after load of vcpu */
+	/* On thread 0 of a subcore, set HDEC to max */
+	lbz	r4, HSTATE_PTID(r13)
+	cmpwi	r4, 0
+	bne	63f
+	lis	r6, 0x7fff
+	ori	r6, r6, 0xffff
+	mtspr	SPRN_HDEC, r6
+	/* and set per-LPAR registers, if doing dynamic micro-threading */
+	ld	r6, HSTATE_SPLIT_MODE(r13)
+	cmpdi	r6, 0
+	beq	63f
+	ld	r0, KVM_SPLIT_RPR(r6)
+	mtspr	SPRN_RPR, r0
+	ld	r0, KVM_SPLIT_PMMAR(r6)
+	mtspr	SPRN_PMMAR, r0
+	ld	r0, KVM_SPLIT_LDBAR(r6)
+	mtspr	SPRN_LDBAR, r0
+	isync
+63:
+	/* Order load of vcpu after load of vcore */
 	lwsync
 	lwsync
+	ld	r4, HSTATE_KVM_VCPU(r13)
 	bl	kvmppc_hv_entry
 	bl	kvmppc_hv_entry
 
 
 	/* Back from the guest, go back to nap */
 	/* Back from the guest, go back to nap */
-	/* Clear our vcpu pointer so we don't come back in early */
+	/* Clear our vcpu and vcore pointers so we don't come back in early */
 	li	r0, 0
 	li	r0, 0
+	std	r0, HSTATE_KVM_VCPU(r13)
 	/*
 	/*
-	 * Once we clear HSTATE_KVM_VCPU(r13), the code in
+	 * Once we clear HSTATE_KVM_VCORE(r13), the code in
 	 * kvmppc_run_core() is going to assume that all our vcpu
 	 * kvmppc_run_core() is going to assume that all our vcpu
 	 * state is visible in memory.  This lwsync makes sure
 	 * state is visible in memory.  This lwsync makes sure
 	 * that that is true.
 	 * that that is true.
 	 */
 	 */
 	lwsync
 	lwsync
-	std	r0, HSTATE_KVM_VCPU(r13)
+	std	r0, HSTATE_KVM_VCORE(r13)
 
 
 /*
 /*
  * At this point we have finished executing in the guest.
  * At this point we have finished executing in the guest.
@@ -374,16 +398,71 @@ kvm_no_guest:
 	b	power7_wakeup_loss
 	b	power7_wakeup_loss
 
 
 53:	HMT_LOW
 53:	HMT_LOW
-	ld	r4, HSTATE_KVM_VCPU(r13)
-	cmpdi	r4, 0
+	ld	r5, HSTATE_KVM_VCORE(r13)
+	cmpdi	r5, 0
+	bne	60f
+	ld	r3, HSTATE_SPLIT_MODE(r13)
+	cmpdi	r3, 0
+	beq	kvm_no_guest
+	lbz	r0, KVM_SPLIT_DO_NAP(r3)
+	cmpwi	r0, 0
 	beq	kvm_no_guest
 	beq	kvm_no_guest
 	HMT_MEDIUM
 	HMT_MEDIUM
+	b	kvm_unsplit_nap
+60:	HMT_MEDIUM
 	b	kvm_secondary_got_guest
 	b	kvm_secondary_got_guest
 
 
 54:	li	r0, KVM_HWTHREAD_IN_KVM
 54:	li	r0, KVM_HWTHREAD_IN_KVM
 	stb	r0, HSTATE_HWTHREAD_STATE(r13)
 	stb	r0, HSTATE_HWTHREAD_STATE(r13)
 	b	kvm_no_guest
 	b	kvm_no_guest
 
 
+/*
+ * Here the primary thread is trying to return the core to
+ * whole-core mode, so we need to nap.
+ */
+kvm_unsplit_nap:
+	/*
+	 * Ensure that secondary doesn't nap when it has
+	 * its vcore pointer set.
+	 */
+	sync		/* matches smp_mb() before setting split_info.do_nap */
+	ld	r0, HSTATE_KVM_VCORE(r13)
+	cmpdi	r0, 0
+	bne	kvm_no_guest
+	/* clear any pending message */
+BEGIN_FTR_SECTION
+	lis	r6, (PPC_DBELL_SERVER << (63-36))@h
+	PPC_MSGCLR(6)
+END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
+	/* Set kvm_split_mode.napped[tid] = 1 */
+	ld	r3, HSTATE_SPLIT_MODE(r13)
+	li	r0, 1
+	lhz	r4, PACAPACAINDEX(r13)
+	clrldi	r4, r4, 61	/* micro-threading => P8 => 8 threads/core */
+	addi	r4, r4, KVM_SPLIT_NAPPED
+	stbx	r0, r3, r4
+	/* Check the do_nap flag again after setting napped[] */
+	sync
+	lbz	r0, KVM_SPLIT_DO_NAP(r3)
+	cmpwi	r0, 0
+	beq	57f
+	li	r3, (LPCR_PECEDH | LPCR_PECE0) >> 4
+	mfspr	r4, SPRN_LPCR
+	rlwimi	r4, r3, 4, (LPCR_PECEDP | LPCR_PECEDH | LPCR_PECE0 | LPCR_PECE1)
+	mtspr	SPRN_LPCR, r4
+	isync
+	std	r0, HSTATE_SCRATCH0(r13)
+	ptesync
+	ld	r0, HSTATE_SCRATCH0(r13)
+1:	cmpd	r0, r0
+	bne	1b
+	nap
+	b	.
+
+57:	li	r0, 0
+	stbx	r0, r3, r4
+	b	kvm_no_guest
+
 /******************************************************************************
 /******************************************************************************
  *                                                                            *
  *                                                                            *
  *                               Entry code                                   *
  *                               Entry code                                   *
@@ -854,7 +933,10 @@ END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_207S)
 	cmpwi	r0, 0
 	cmpwi	r0, 0
 	bne	21f
 	bne	21f
 	HMT_LOW
 	HMT_LOW
-20:	lbz	r0, VCORE_IN_GUEST(r5)
+20:	lwz	r3, VCORE_ENTRY_EXIT(r5)
+	cmpwi	r3, 0x100
+	bge	no_switch_exit
+	lbz	r0, VCORE_IN_GUEST(r5)
 	cmpwi	r0, 0
 	cmpwi	r0, 0
 	beq	20b
 	beq	20b
 	HMT_MEDIUM
 	HMT_MEDIUM
@@ -870,7 +952,7 @@ END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_207S)
 	blt	hdec_soon
 	blt	hdec_soon
 
 
 	ld	r6, VCPU_CTR(r4)
 	ld	r6, VCPU_CTR(r4)
-	lwz	r7, VCPU_XER(r4)
+	ld	r7, VCPU_XER(r4)
 
 
 	mtctr	r6
 	mtctr	r6
 	mtxer	r7
 	mtxer	r7
@@ -985,9 +1067,13 @@ secondary_too_late:
 #endif
 #endif
 11:	b	kvmhv_switch_to_host
 11:	b	kvmhv_switch_to_host
 
 
+no_switch_exit:
+	HMT_MEDIUM
+	li	r12, 0
+	b	12f
 hdec_soon:
 hdec_soon:
 	li	r12, BOOK3S_INTERRUPT_HV_DECREMENTER
 	li	r12, BOOK3S_INTERRUPT_HV_DECREMENTER
-	stw	r12, VCPU_TRAP(r4)
+12:	stw	r12, VCPU_TRAP(r4)
 	mr	r9, r4
 	mr	r9, r4
 #ifdef CONFIG_KVM_BOOK3S_HV_EXIT_TIMING
 #ifdef CONFIG_KVM_BOOK3S_HV_EXIT_TIMING
 	addi	r3, r4, VCPU_TB_RMEXIT
 	addi	r3, r4, VCPU_TB_RMEXIT
@@ -1103,7 +1189,7 @@ END_FTR_SECTION_IFSET(CPU_FTR_HAS_PPR)
 	mfctr	r3
 	mfctr	r3
 	mfxer	r4
 	mfxer	r4
 	std	r3, VCPU_CTR(r9)
 	std	r3, VCPU_CTR(r9)
-	stw	r4, VCPU_XER(r9)
+	std	r4, VCPU_XER(r9)
 
 
 	/* If this is a page table miss then see if it's theirs or ours */
 	/* If this is a page table miss then see if it's theirs or ours */
 	cmpwi	r12, BOOK3S_INTERRUPT_H_DATA_STORAGE
 	cmpwi	r12, BOOK3S_INTERRUPT_H_DATA_STORAGE
@@ -1127,6 +1213,7 @@ END_FTR_SECTION_IFSET(CPU_FTR_HAS_PPR)
 	cmpwi	r12, BOOK3S_INTERRUPT_H_DOORBELL
 	cmpwi	r12, BOOK3S_INTERRUPT_H_DOORBELL
 	bne	3f
 	bne	3f
 	lbz	r0, HSTATE_HOST_IPI(r13)
 	lbz	r0, HSTATE_HOST_IPI(r13)
+	cmpwi	r0, 0
 	beq	4f
 	beq	4f
 	b	guest_exit_cont
 	b	guest_exit_cont
 3:
 3:
@@ -1176,6 +1263,11 @@ mc_cont:
 	ld	r9, HSTATE_KVM_VCPU(r13)
 	ld	r9, HSTATE_KVM_VCPU(r13)
 	lwz	r12, VCPU_TRAP(r9)
 	lwz	r12, VCPU_TRAP(r9)
 
 
+	/* Stop others sending VCPU interrupts to this physical CPU */
+	li	r0, -1
+	stw	r0, VCPU_CPU(r9)
+	stw	r0, VCPU_THREAD_CPU(r9)
+
 	/* Save guest CTRL register, set runlatch to 1 */
 	/* Save guest CTRL register, set runlatch to 1 */
 	mfspr	r6,SPRN_CTRLF
 	mfspr	r6,SPRN_CTRLF
 	stw	r6,VCPU_CTRL(r9)
 	stw	r6,VCPU_CTRL(r9)
@@ -1540,12 +1632,17 @@ kvmhv_switch_to_host:
 
 
 	/* Primary thread waits for all the secondaries to exit guest */
 	/* Primary thread waits for all the secondaries to exit guest */
 15:	lwz	r3,VCORE_ENTRY_EXIT(r5)
 15:	lwz	r3,VCORE_ENTRY_EXIT(r5)
-	srwi	r0,r3,8
+	rlwinm	r0,r3,32-8,0xff
 	clrldi	r3,r3,56
 	clrldi	r3,r3,56
 	cmpw	r3,r0
 	cmpw	r3,r0
 	bne	15b
 	bne	15b
 	isync
 	isync
 
 
+	/* Did we actually switch to the guest at all? */
+	lbz	r6, VCORE_IN_GUEST(r5)
+	cmpwi	r6, 0
+	beq	19f
+
 	/* Primary thread switches back to host partition */
 	/* Primary thread switches back to host partition */
 	ld	r6,KVM_HOST_SDR1(r4)
 	ld	r6,KVM_HOST_SDR1(r4)
 	lwz	r7,KVM_HOST_LPID(r4)
 	lwz	r7,KVM_HOST_LPID(r4)
@@ -1589,7 +1686,7 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
 18:
 18:
 	/* Signal secondary CPUs to continue */
 	/* Signal secondary CPUs to continue */
 	stb	r0,VCORE_IN_GUEST(r5)
 	stb	r0,VCORE_IN_GUEST(r5)
-	lis	r8,0x7fff		/* MAX_INT@h */
+19:	lis	r8,0x7fff		/* MAX_INT@h */
 	mtspr	SPRN_HDEC,r8
 	mtspr	SPRN_HDEC,r8
 
 
 16:	ld	r8,KVM_HOST_LPCR(r4)
 16:	ld	r8,KVM_HOST_LPCR(r4)
@@ -1675,7 +1772,7 @@ kvmppc_hdsi:
 	bl	kvmppc_msr_interrupt
 	bl	kvmppc_msr_interrupt
 fast_interrupt_c_return:
 fast_interrupt_c_return:
 6:	ld	r7, VCPU_CTR(r9)
 6:	ld	r7, VCPU_CTR(r9)
-	lwz	r8, VCPU_XER(r9)
+	ld	r8, VCPU_XER(r9)
 	mtctr	r7
 	mtctr	r7
 	mtxer	r8
 	mtxer	r8
 	mr	r4, r9
 	mr	r4, r9
@@ -1816,8 +1913,8 @@ hcall_real_table:
 	.long	DOTSYM(kvmppc_h_remove) - hcall_real_table
 	.long	DOTSYM(kvmppc_h_remove) - hcall_real_table
 	.long	DOTSYM(kvmppc_h_enter) - hcall_real_table
 	.long	DOTSYM(kvmppc_h_enter) - hcall_real_table
 	.long	DOTSYM(kvmppc_h_read) - hcall_real_table
 	.long	DOTSYM(kvmppc_h_read) - hcall_real_table
-	.long	0		/* 0x10 - H_CLEAR_MOD */
-	.long	0		/* 0x14 - H_CLEAR_REF */
+	.long	DOTSYM(kvmppc_h_clear_mod) - hcall_real_table
+	.long	DOTSYM(kvmppc_h_clear_ref) - hcall_real_table
 	.long	DOTSYM(kvmppc_h_protect) - hcall_real_table
 	.long	DOTSYM(kvmppc_h_protect) - hcall_real_table
 	.long	DOTSYM(kvmppc_h_get_tce) - hcall_real_table
 	.long	DOTSYM(kvmppc_h_get_tce) - hcall_real_table
 	.long	DOTSYM(kvmppc_h_put_tce) - hcall_real_table
 	.long	DOTSYM(kvmppc_h_put_tce) - hcall_real_table

+ 1 - 1
arch/powerpc/kvm/book3s_paired_singles.c

@@ -352,7 +352,7 @@ static inline u32 inst_get_field(u32 inst, int msb, int lsb)
 	return kvmppc_get_field(inst, msb + 32, lsb + 32);
 	return kvmppc_get_field(inst, msb + 32, lsb + 32);
 }
 }
 
 
-bool kvmppc_inst_is_paired_single(struct kvm_vcpu *vcpu, u32 inst)
+static bool kvmppc_inst_is_paired_single(struct kvm_vcpu *vcpu, u32 inst)
 {
 {
 	if (!(vcpu->arch.hflags & BOOK3S_HFLAG_PAIRED_SINGLE))
 	if (!(vcpu->arch.hflags & BOOK3S_HFLAG_PAIRED_SINGLE))
 		return false;
 		return false;

+ 2 - 2
arch/powerpc/kvm/book3s_segment.S

@@ -123,7 +123,7 @@ no_dcbz32_on:
 	PPC_LL	r8, SVCPU_CTR(r3)
 	PPC_LL	r8, SVCPU_CTR(r3)
 	PPC_LL	r9, SVCPU_LR(r3)
 	PPC_LL	r9, SVCPU_LR(r3)
 	lwz	r10, SVCPU_CR(r3)
 	lwz	r10, SVCPU_CR(r3)
-	lwz	r11, SVCPU_XER(r3)
+	PPC_LL	r11, SVCPU_XER(r3)
 
 
 	mtctr	r8
 	mtctr	r8
 	mtlr	r9
 	mtlr	r9
@@ -237,7 +237,7 @@ END_FTR_SECTION_IFSET(CPU_FTR_HVMODE)
 	mfctr	r8
 	mfctr	r8
 	mflr	r9
 	mflr	r9
 
 
-	stw	r5, SVCPU_XER(r13)
+	PPC_STL	r5, SVCPU_XER(r13)
 	PPC_STL	r6, SVCPU_FAULT_DAR(r13)
 	PPC_STL	r6, SVCPU_FAULT_DAR(r13)
 	stw	r7, SVCPU_FAULT_DSISR(r13)
 	stw	r7, SVCPU_FAULT_DSISR(r13)
 	PPC_STL	r8, SVCPU_CTR(r13)
 	PPC_STL	r8, SVCPU_CTR(r13)

+ 1 - 1
arch/powerpc/kvm/book3s_xics.c

@@ -41,7 +41,7 @@
  * =======
  * =======
  *
  *
  * Each ICS has a spin lock protecting the information about the IRQ
  * Each ICS has a spin lock protecting the information about the IRQ
- * sources and avoiding simultaneous deliveries if the same interrupt.
+ * sources and avoiding simultaneous deliveries of the same interrupt.
  *
  *
  * ICP operations are done via a single compare & swap transaction
  * ICP operations are done via a single compare & swap transaction
  * (most ICP state fits in the union kvmppc_icp_state)
  * (most ICP state fits in the union kvmppc_icp_state)

+ 1 - 0
arch/powerpc/kvm/booke.c

@@ -933,6 +933,7 @@ static void kvmppc_restart_interrupt(struct kvm_vcpu *vcpu,
 #endif
 #endif
 		break;
 		break;
 	case BOOKE_INTERRUPT_CRITICAL:
 	case BOOKE_INTERRUPT_CRITICAL:
+		kvmppc_fill_pt_regs(&regs);
 		unknown_exception(&regs);
 		unknown_exception(&regs);
 		break;
 		break;
 	case BOOKE_INTERRUPT_DEBUG:
 	case BOOKE_INTERRUPT_DEBUG:

+ 1 - 1
arch/powerpc/kvm/e500_mmu.c

@@ -377,7 +377,7 @@ int kvmppc_e500_emul_tlbsx(struct kvm_vcpu *vcpu, gva_t ea)
 			| MAS0_NV(vcpu_e500->gtlb_nv[tlbsel]);
 			| MAS0_NV(vcpu_e500->gtlb_nv[tlbsel]);
 		vcpu->arch.shared->mas1 =
 		vcpu->arch.shared->mas1 =
 			  (vcpu->arch.shared->mas6 & MAS6_SPID0)
 			  (vcpu->arch.shared->mas6 & MAS6_SPID0)
-			| (vcpu->arch.shared->mas6 & (MAS6_SAS ? MAS1_TS : 0))
+			| ((vcpu->arch.shared->mas6 & MAS6_SAS) ? MAS1_TS : 0)
 			| (vcpu->arch.shared->mas4 & MAS4_TSIZED(~0));
 			| (vcpu->arch.shared->mas4 & MAS4_TSIZED(~0));
 		vcpu->arch.shared->mas2 &= MAS2_EPN;
 		vcpu->arch.shared->mas2 &= MAS2_EPN;
 		vcpu->arch.shared->mas2 |= vcpu->arch.shared->mas4 &
 		vcpu->arch.shared->mas2 |= vcpu->arch.shared->mas4 &

+ 1 - 1
arch/powerpc/kvm/powerpc.c

@@ -660,7 +660,7 @@ int kvm_cpu_has_pending_timer(struct kvm_vcpu *vcpu)
 	return kvmppc_core_pending_dec(vcpu);
 	return kvmppc_core_pending_dec(vcpu);
 }
 }
 
 
-enum hrtimer_restart kvmppc_decrementer_wakeup(struct hrtimer *timer)
+static enum hrtimer_restart kvmppc_decrementer_wakeup(struct hrtimer *timer)
 {
 {
 	struct kvm_vcpu *vcpu;
 	struct kvm_vcpu *vcpu;
 
 

+ 1 - 1
arch/x86/kvm/emulate.c

@@ -650,6 +650,7 @@ static __always_inline int __linearize(struct x86_emulate_ctxt *ctxt,
 	u16 sel;
 	u16 sel;
 
 
 	la = seg_base(ctxt, addr.seg) + addr.ea;
 	la = seg_base(ctxt, addr.seg) + addr.ea;
+	*linear = la;
 	*max_size = 0;
 	*max_size = 0;
 	switch (mode) {
 	switch (mode) {
 	case X86EMUL_MODE_PROT64:
 	case X86EMUL_MODE_PROT64:
@@ -693,7 +694,6 @@ static __always_inline int __linearize(struct x86_emulate_ctxt *ctxt,
 	}
 	}
 	if (insn_aligned(ctxt, size) && ((la & (size - 1)) != 0))
 	if (insn_aligned(ctxt, size) && ((la & (size - 1)) != 0))
 		return emulate_gp(ctxt, 0);
 		return emulate_gp(ctxt, 0);
-	*linear = la;
 	return X86EMUL_CONTINUE;
 	return X86EMUL_CONTINUE;
 bad:
 bad:
 	if (addr.seg == VCPU_SREG_SS)
 	if (addr.seg == VCPU_SREG_SS)

+ 4 - 3
arch/x86/kvm/mmu.c

@@ -3309,13 +3309,14 @@ walk_shadow_page_get_mmio_spte(struct kvm_vcpu *vcpu, u64 addr, u64 *sptep)
 
 
 	walk_shadow_page_lockless_begin(vcpu);
 	walk_shadow_page_lockless_begin(vcpu);
 
 
-	for (shadow_walk_init(&iterator, vcpu, addr), root = iterator.level;
+	for (shadow_walk_init(&iterator, vcpu, addr),
+		 leaf = root = iterator.level;
 	     shadow_walk_okay(&iterator);
 	     shadow_walk_okay(&iterator);
 	     __shadow_walk_next(&iterator, spte)) {
 	     __shadow_walk_next(&iterator, spte)) {
-		leaf = iterator.level;
 		spte = mmu_spte_get_lockless(iterator.sptep);
 		spte = mmu_spte_get_lockless(iterator.sptep);
 
 
 		sptes[leaf - 1] = spte;
 		sptes[leaf - 1] = spte;
+		leaf--;
 
 
 		if (!is_shadow_present_pte(spte))
 		if (!is_shadow_present_pte(spte))
 			break;
 			break;
@@ -3329,7 +3330,7 @@ walk_shadow_page_get_mmio_spte(struct kvm_vcpu *vcpu, u64 addr, u64 *sptep)
 	if (reserved) {
 	if (reserved) {
 		pr_err("%s: detect reserved bits on spte, addr 0x%llx, dump hierarchy:\n",
 		pr_err("%s: detect reserved bits on spte, addr 0x%llx, dump hierarchy:\n",
 		       __func__, addr);
 		       __func__, addr);
-		while (root >= leaf) {
+		while (root > leaf) {
 			pr_err("------ spte 0x%llx level %d.\n",
 			pr_err("------ spte 0x%llx level %d.\n",
 			       sptes[root - 1], root);
 			       sptes[root - 1], root);
 			root--;
 			root--;

+ 2 - 0
arch/x86/kvm/x86.c

@@ -5943,6 +5943,7 @@ static void process_smi_save_seg_32(struct kvm_vcpu *vcpu, char *buf, int n)
 	put_smstate(u32, buf, offset, process_smi_get_segment_flags(&seg));
 	put_smstate(u32, buf, offset, process_smi_get_segment_flags(&seg));
 }
 }
 
 
+#ifdef CONFIG_X86_64
 static void process_smi_save_seg_64(struct kvm_vcpu *vcpu, char *buf, int n)
 static void process_smi_save_seg_64(struct kvm_vcpu *vcpu, char *buf, int n)
 {
 {
 	struct kvm_segment seg;
 	struct kvm_segment seg;
@@ -5958,6 +5959,7 @@ static void process_smi_save_seg_64(struct kvm_vcpu *vcpu, char *buf, int n)
 	put_smstate(u32, buf, offset + 4, seg.limit);
 	put_smstate(u32, buf, offset + 4, seg.limit);
 	put_smstate(u64, buf, offset + 8, seg.base);
 	put_smstate(u64, buf, offset + 8, seg.base);
 }
 }
+#endif
 
 
 static void process_smi_save_state_32(struct kvm_vcpu *vcpu, char *buf)
 static void process_smi_save_state_32(struct kvm_vcpu *vcpu, char *buf)
 {
 {

+ 5 - 2
include/kvm/arm_arch_timer.h

@@ -52,13 +52,16 @@ struct arch_timer_cpu {
 
 
 	/* Timer IRQ */
 	/* Timer IRQ */
 	const struct kvm_irq_level	*irq;
 	const struct kvm_irq_level	*irq;
+
+	/* VGIC mapping */
+	struct irq_phys_map		*map;
 };
 };
 
 
 int kvm_timer_hyp_init(void);
 int kvm_timer_hyp_init(void);
 void kvm_timer_enable(struct kvm *kvm);
 void kvm_timer_enable(struct kvm *kvm);
 void kvm_timer_init(struct kvm *kvm);
 void kvm_timer_init(struct kvm *kvm);
-void kvm_timer_vcpu_reset(struct kvm_vcpu *vcpu,
-			  const struct kvm_irq_level *irq);
+int kvm_timer_vcpu_reset(struct kvm_vcpu *vcpu,
+			 const struct kvm_irq_level *irq);
 void kvm_timer_vcpu_init(struct kvm_vcpu *vcpu);
 void kvm_timer_vcpu_init(struct kvm_vcpu *vcpu);
 void kvm_timer_flush_hwstate(struct kvm_vcpu *vcpu);
 void kvm_timer_flush_hwstate(struct kvm_vcpu *vcpu);
 void kvm_timer_sync_hwstate(struct kvm_vcpu *vcpu);
 void kvm_timer_sync_hwstate(struct kvm_vcpu *vcpu);

+ 36 - 3
include/kvm/arm_vgic.h

@@ -95,11 +95,15 @@ enum vgic_type {
 #define LR_STATE_ACTIVE		(1 << 1)
 #define LR_STATE_ACTIVE		(1 << 1)
 #define LR_STATE_MASK		(3 << 0)
 #define LR_STATE_MASK		(3 << 0)
 #define LR_EOI_INT		(1 << 2)
 #define LR_EOI_INT		(1 << 2)
+#define LR_HW			(1 << 3)
 
 
 struct vgic_lr {
 struct vgic_lr {
-	u16	irq;
-	u8	source;
-	u8	state;
+	unsigned irq:10;
+	union {
+		unsigned hwirq:10;
+		unsigned source:3;
+	};
+	unsigned state:4;
 };
 };
 
 
 struct vgic_vmcr {
 struct vgic_vmcr {
@@ -155,6 +159,19 @@ struct vgic_io_device {
 	struct kvm_io_device dev;
 	struct kvm_io_device dev;
 };
 };
 
 
+struct irq_phys_map {
+	u32			virt_irq;
+	u32			phys_irq;
+	u32			irq;
+	bool			active;
+};
+
+struct irq_phys_map_entry {
+	struct list_head	entry;
+	struct rcu_head		rcu;
+	struct irq_phys_map	map;
+};
+
 struct vgic_dist {
 struct vgic_dist {
 	spinlock_t		lock;
 	spinlock_t		lock;
 	bool			in_kernel;
 	bool			in_kernel;
@@ -252,6 +269,10 @@ struct vgic_dist {
 	struct vgic_vm_ops	vm_ops;
 	struct vgic_vm_ops	vm_ops;
 	struct vgic_io_device	dist_iodev;
 	struct vgic_io_device	dist_iodev;
 	struct vgic_io_device	*redist_iodevs;
 	struct vgic_io_device	*redist_iodevs;
+
+	/* Virtual irq to hwirq mapping */
+	spinlock_t		irq_phys_map_lock;
+	struct list_head	irq_phys_map_list;
 };
 };
 
 
 struct vgic_v2_cpu_if {
 struct vgic_v2_cpu_if {
@@ -303,6 +324,9 @@ struct vgic_cpu {
 		struct vgic_v2_cpu_if	vgic_v2;
 		struct vgic_v2_cpu_if	vgic_v2;
 		struct vgic_v3_cpu_if	vgic_v3;
 		struct vgic_v3_cpu_if	vgic_v3;
 	};
 	};
+
+	/* Protected by the distributor's irq_phys_map_lock */
+	struct list_head	irq_phys_map_list;
 };
 };
 
 
 #define LR_EMPTY	0xff
 #define LR_EMPTY	0xff
@@ -317,16 +341,25 @@ int kvm_vgic_addr(struct kvm *kvm, unsigned long type, u64 *addr, bool write);
 int kvm_vgic_hyp_init(void);
 int kvm_vgic_hyp_init(void);
 int kvm_vgic_map_resources(struct kvm *kvm);
 int kvm_vgic_map_resources(struct kvm *kvm);
 int kvm_vgic_get_max_vcpus(void);
 int kvm_vgic_get_max_vcpus(void);
+void kvm_vgic_early_init(struct kvm *kvm);
 int kvm_vgic_create(struct kvm *kvm, u32 type);
 int kvm_vgic_create(struct kvm *kvm, u32 type);
 void kvm_vgic_destroy(struct kvm *kvm);
 void kvm_vgic_destroy(struct kvm *kvm);
+void kvm_vgic_vcpu_early_init(struct kvm_vcpu *vcpu);
 void kvm_vgic_vcpu_destroy(struct kvm_vcpu *vcpu);
 void kvm_vgic_vcpu_destroy(struct kvm_vcpu *vcpu);
 void kvm_vgic_flush_hwstate(struct kvm_vcpu *vcpu);
 void kvm_vgic_flush_hwstate(struct kvm_vcpu *vcpu);
 void kvm_vgic_sync_hwstate(struct kvm_vcpu *vcpu);
 void kvm_vgic_sync_hwstate(struct kvm_vcpu *vcpu);
 int kvm_vgic_inject_irq(struct kvm *kvm, int cpuid, unsigned int irq_num,
 int kvm_vgic_inject_irq(struct kvm *kvm, int cpuid, unsigned int irq_num,
 			bool level);
 			bool level);
+int kvm_vgic_inject_mapped_irq(struct kvm *kvm, int cpuid,
+			       struct irq_phys_map *map, bool level);
 void vgic_v3_dispatch_sgi(struct kvm_vcpu *vcpu, u64 reg);
 void vgic_v3_dispatch_sgi(struct kvm_vcpu *vcpu, u64 reg);
 int kvm_vgic_vcpu_pending_irq(struct kvm_vcpu *vcpu);
 int kvm_vgic_vcpu_pending_irq(struct kvm_vcpu *vcpu);
 int kvm_vgic_vcpu_active_irq(struct kvm_vcpu *vcpu);
 int kvm_vgic_vcpu_active_irq(struct kvm_vcpu *vcpu);
+struct irq_phys_map *kvm_vgic_map_phys_irq(struct kvm_vcpu *vcpu,
+					   int virt_irq, int irq);
+int kvm_vgic_unmap_phys_irq(struct kvm_vcpu *vcpu, struct irq_phys_map *map);
+bool kvm_vgic_get_phys_irq_active(struct irq_phys_map *map);
+void kvm_vgic_set_phys_irq_active(struct irq_phys_map *map, bool active);
 
 
 #define irqchip_in_kernel(k)	(!!((k)->arch.vgic.in_kernel))
 #define irqchip_in_kernel(k)	(!!((k)->arch.vgic.in_kernel))
 #define vgic_initialized(k)	(!!((k)->arch.vgic.nr_cpus))
 #define vgic_initialized(k)	(!!((k)->arch.vgic.nr_cpus))

+ 3 - 0
include/linux/irqchip/arm-gic-v3.h

@@ -270,9 +270,12 @@
 
 
 #define ICH_LR_EOI			(1UL << 41)
 #define ICH_LR_EOI			(1UL << 41)
 #define ICH_LR_GROUP			(1UL << 60)
 #define ICH_LR_GROUP			(1UL << 60)
+#define ICH_LR_HW			(1UL << 61)
 #define ICH_LR_STATE			(3UL << 62)
 #define ICH_LR_STATE			(3UL << 62)
 #define ICH_LR_PENDING_BIT		(1UL << 62)
 #define ICH_LR_PENDING_BIT		(1UL << 62)
 #define ICH_LR_ACTIVE_BIT		(1UL << 63)
 #define ICH_LR_ACTIVE_BIT		(1UL << 63)
+#define ICH_LR_PHYS_ID_SHIFT		32
+#define ICH_LR_PHYS_ID_MASK		(0x3ffUL << ICH_LR_PHYS_ID_SHIFT)
 
 
 #define ICH_MISR_EOI			(1 << 0)
 #define ICH_MISR_EOI			(1 << 0)
 #define ICH_MISR_U			(1 << 1)
 #define ICH_MISR_U			(1 << 1)

+ 2 - 1
include/linux/irqchip/arm-gic.h

@@ -75,11 +75,12 @@
 
 
 #define GICH_LR_VIRTUALID		(0x3ff << 0)
 #define GICH_LR_VIRTUALID		(0x3ff << 0)
 #define GICH_LR_PHYSID_CPUID_SHIFT	(10)
 #define GICH_LR_PHYSID_CPUID_SHIFT	(10)
-#define GICH_LR_PHYSID_CPUID		(7 << GICH_LR_PHYSID_CPUID_SHIFT)
+#define GICH_LR_PHYSID_CPUID		(0x3ff << GICH_LR_PHYSID_CPUID_SHIFT)
 #define GICH_LR_STATE			(3 << 28)
 #define GICH_LR_STATE			(3 << 28)
 #define GICH_LR_PENDING_BIT		(1 << 28)
 #define GICH_LR_PENDING_BIT		(1 << 28)
 #define GICH_LR_ACTIVE_BIT		(1 << 29)
 #define GICH_LR_ACTIVE_BIT		(1 << 29)
 #define GICH_LR_EOI			(1 << 19)
 #define GICH_LR_EOI			(1 << 19)
+#define GICH_LR_HW			(1 << 31)
 
 
 #define GICH_VMCR_CTRL_SHIFT		0
 #define GICH_VMCR_CTRL_SHIFT		0
 #define GICH_VMCR_CTRL_MASK		(0x21f << GICH_VMCR_CTRL_SHIFT)
 #define GICH_VMCR_CTRL_MASK		(0x21f << GICH_VMCR_CTRL_SHIFT)

+ 1 - 0
include/linux/kvm_host.h

@@ -242,6 +242,7 @@ struct kvm_vcpu {
 	int sigset_active;
 	int sigset_active;
 	sigset_t sigset;
 	sigset_t sigset;
 	struct kvm_vcpu_stat stat;
 	struct kvm_vcpu_stat stat;
+	unsigned int halt_poll_ns;
 
 
 #ifdef CONFIG_HAS_IOMEM
 #ifdef CONFIG_HAS_IOMEM
 	int mmio_needed;
 	int mmio_needed;

+ 30 - 0
include/trace/events/kvm.h

@@ -358,6 +358,36 @@ TRACE_EVENT(
 
 
 #endif
 #endif
 
 
+TRACE_EVENT(kvm_halt_poll_ns,
+	TP_PROTO(bool grow, unsigned int vcpu_id, int new, int old),
+	TP_ARGS(grow, vcpu_id, new, old),
+
+	TP_STRUCT__entry(
+		__field(bool, grow)
+		__field(unsigned int, vcpu_id)
+		__field(int, new)
+		__field(int, old)
+	),
+
+	TP_fast_assign(
+		__entry->grow           = grow;
+		__entry->vcpu_id        = vcpu_id;
+		__entry->new            = new;
+		__entry->old            = old;
+	),
+
+	TP_printk("vcpu %u: halt_poll_ns %d (%s %d)",
+			__entry->vcpu_id,
+			__entry->new,
+			__entry->grow ? "grow" : "shrink",
+			__entry->old)
+);
+
+#define trace_kvm_halt_poll_ns_grow(vcpu_id, new, old) \
+	trace_kvm_halt_poll_ns(true, vcpu_id, new, old)
+#define trace_kvm_halt_poll_ns_shrink(vcpu_id, new, old) \
+	trace_kvm_halt_poll_ns(false, vcpu_id, new, old)
+
 #endif /* _TRACE_KVM_MAIN_H */
 #endif /* _TRACE_KVM_MAIN_H */
 
 
 /* This part must be outside protection */
 /* This part must be outside protection */

+ 5 - 0
include/uapi/linux/kvm.h

@@ -237,6 +237,7 @@ struct kvm_run {
 			__u32 count;
 			__u32 count;
 			__u64 data_offset; /* relative to kvm_run start */
 			__u64 data_offset; /* relative to kvm_run start */
 		} io;
 		} io;
+		/* KVM_EXIT_DEBUG */
 		struct {
 		struct {
 			struct kvm_debug_exit_arch arch;
 			struct kvm_debug_exit_arch arch;
 		} debug;
 		} debug;
@@ -285,6 +286,7 @@ struct kvm_run {
 			__u32 data;
 			__u32 data;
 			__u8  is_write;
 			__u8  is_write;
 		} dcr;
 		} dcr;
+		/* KVM_EXIT_INTERNAL_ERROR */
 		struct {
 		struct {
 			__u32 suberror;
 			__u32 suberror;
 			/* Available with KVM_CAP_INTERNAL_ERROR_DATA: */
 			/* Available with KVM_CAP_INTERNAL_ERROR_DATA: */
@@ -295,6 +297,7 @@ struct kvm_run {
 		struct {
 		struct {
 			__u64 gprs[32];
 			__u64 gprs[32];
 		} osi;
 		} osi;
+		/* KVM_EXIT_PAPR_HCALL */
 		struct {
 		struct {
 			__u64 nr;
 			__u64 nr;
 			__u64 ret;
 			__u64 ret;
@@ -819,6 +822,8 @@ struct kvm_ppc_smmu_info {
 #define KVM_CAP_DISABLE_QUIRKS 116
 #define KVM_CAP_DISABLE_QUIRKS 116
 #define KVM_CAP_X86_SMM 117
 #define KVM_CAP_X86_SMM 117
 #define KVM_CAP_MULTI_ADDRESS_SPACE 118
 #define KVM_CAP_MULTI_ADDRESS_SPACE 118
+#define KVM_CAP_GUEST_DEBUG_HW_BPS 119
+#define KVM_CAP_GUEST_DEBUG_HW_WPS 120
 
 
 #ifdef KVM_CAP_IRQ_ROUTING
 #ifdef KVM_CAP_IRQ_ROUTING
 
 

+ 22 - 7
virt/kvm/arm/arch_timer.c

@@ -64,10 +64,10 @@ static void kvm_timer_inject_irq(struct kvm_vcpu *vcpu)
 	int ret;
 	int ret;
 	struct arch_timer_cpu *timer = &vcpu->arch.timer_cpu;
 	struct arch_timer_cpu *timer = &vcpu->arch.timer_cpu;
 
 
-	timer->cntv_ctl |= ARCH_TIMER_CTRL_IT_MASK;
-	ret = kvm_vgic_inject_irq(vcpu->kvm, vcpu->vcpu_id,
-				  timer->irq->irq,
-				  timer->irq->level);
+	kvm_vgic_set_phys_irq_active(timer->map, true);
+	ret = kvm_vgic_inject_mapped_irq(vcpu->kvm, vcpu->vcpu_id,
+					 timer->map,
+					 timer->irq->level);
 	WARN_ON(ret);
 	WARN_ON(ret);
 }
 }
 
 
@@ -117,7 +117,8 @@ bool kvm_timer_should_fire(struct kvm_vcpu *vcpu)
 	cycle_t cval, now;
 	cycle_t cval, now;
 
 
 	if ((timer->cntv_ctl & ARCH_TIMER_CTRL_IT_MASK) ||
 	if ((timer->cntv_ctl & ARCH_TIMER_CTRL_IT_MASK) ||
-		!(timer->cntv_ctl & ARCH_TIMER_CTRL_ENABLE))
+	    !(timer->cntv_ctl & ARCH_TIMER_CTRL_ENABLE) ||
+	    kvm_vgic_get_phys_irq_active(timer->map))
 		return false;
 		return false;
 
 
 	cval = timer->cntv_cval;
 	cval = timer->cntv_cval;
@@ -184,10 +185,11 @@ void kvm_timer_sync_hwstate(struct kvm_vcpu *vcpu)
 	timer_arm(timer, ns);
 	timer_arm(timer, ns);
 }
 }
 
 
-void kvm_timer_vcpu_reset(struct kvm_vcpu *vcpu,
-			  const struct kvm_irq_level *irq)
+int kvm_timer_vcpu_reset(struct kvm_vcpu *vcpu,
+			 const struct kvm_irq_level *irq)
 {
 {
 	struct arch_timer_cpu *timer = &vcpu->arch.timer_cpu;
 	struct arch_timer_cpu *timer = &vcpu->arch.timer_cpu;
+	struct irq_phys_map *map;
 
 
 	/*
 	/*
 	 * The vcpu timer irq number cannot be determined in
 	 * The vcpu timer irq number cannot be determined in
@@ -196,6 +198,17 @@ void kvm_timer_vcpu_reset(struct kvm_vcpu *vcpu,
 	 * vcpu timer irq number when the vcpu is reset.
 	 * vcpu timer irq number when the vcpu is reset.
 	 */
 	 */
 	timer->irq = irq;
 	timer->irq = irq;
+
+	/*
+	 * Tell the VGIC that the virtual interrupt is tied to a
+	 * physical interrupt. We do that once per VCPU.
+	 */
+	map = kvm_vgic_map_phys_irq(vcpu, irq->irq, host_vtimer_irq);
+	if (WARN_ON(IS_ERR(map)))
+		return PTR_ERR(map);
+
+	timer->map = map;
+	return 0;
 }
 }
 
 
 void kvm_timer_vcpu_init(struct kvm_vcpu *vcpu)
 void kvm_timer_vcpu_init(struct kvm_vcpu *vcpu)
@@ -335,6 +348,8 @@ void kvm_timer_vcpu_terminate(struct kvm_vcpu *vcpu)
 	struct arch_timer_cpu *timer = &vcpu->arch.timer_cpu;
 	struct arch_timer_cpu *timer = &vcpu->arch.timer_cpu;
 
 
 	timer_disarm(timer);
 	timer_disarm(timer);
+	if (timer->map)
+		kvm_vgic_unmap_phys_irq(vcpu, timer->map);
 }
 }
 
 
 void kvm_timer_enable(struct kvm *kvm)
 void kvm_timer_enable(struct kvm *kvm)

+ 15 - 1
virt/kvm/arm/vgic-v2.c

@@ -48,6 +48,10 @@ static struct vgic_lr vgic_v2_get_lr(const struct kvm_vcpu *vcpu, int lr)
 		lr_desc.state |= LR_STATE_ACTIVE;
 		lr_desc.state |= LR_STATE_ACTIVE;
 	if (val & GICH_LR_EOI)
 	if (val & GICH_LR_EOI)
 		lr_desc.state |= LR_EOI_INT;
 		lr_desc.state |= LR_EOI_INT;
+	if (val & GICH_LR_HW) {
+		lr_desc.state |= LR_HW;
+		lr_desc.hwirq = (val & GICH_LR_PHYSID_CPUID) >> GICH_LR_PHYSID_CPUID_SHIFT;
+	}
 
 
 	return lr_desc;
 	return lr_desc;
 }
 }
@@ -55,7 +59,9 @@ static struct vgic_lr vgic_v2_get_lr(const struct kvm_vcpu *vcpu, int lr)
 static void vgic_v2_set_lr(struct kvm_vcpu *vcpu, int lr,
 static void vgic_v2_set_lr(struct kvm_vcpu *vcpu, int lr,
 			   struct vgic_lr lr_desc)
 			   struct vgic_lr lr_desc)
 {
 {
-	u32 lr_val = (lr_desc.source << GICH_LR_PHYSID_CPUID_SHIFT) | lr_desc.irq;
+	u32 lr_val;
+
+	lr_val = lr_desc.irq;
 
 
 	if (lr_desc.state & LR_STATE_PENDING)
 	if (lr_desc.state & LR_STATE_PENDING)
 		lr_val |= GICH_LR_PENDING_BIT;
 		lr_val |= GICH_LR_PENDING_BIT;
@@ -64,6 +70,14 @@ static void vgic_v2_set_lr(struct kvm_vcpu *vcpu, int lr,
 	if (lr_desc.state & LR_EOI_INT)
 	if (lr_desc.state & LR_EOI_INT)
 		lr_val |= GICH_LR_EOI;
 		lr_val |= GICH_LR_EOI;
 
 
+	if (lr_desc.state & LR_HW) {
+		lr_val |= GICH_LR_HW;
+		lr_val |= (u32)lr_desc.hwirq << GICH_LR_PHYSID_CPUID_SHIFT;
+	}
+
+	if (lr_desc.irq < VGIC_NR_SGIS)
+		lr_val |= (lr_desc.source << GICH_LR_PHYSID_CPUID_SHIFT);
+
 	vcpu->arch.vgic_cpu.vgic_v2.vgic_lr[lr] = lr_val;
 	vcpu->arch.vgic_cpu.vgic_v2.vgic_lr[lr] = lr_val;
 }
 }
 
 

+ 18 - 3
virt/kvm/arm/vgic-v3.c

@@ -67,6 +67,10 @@ static struct vgic_lr vgic_v3_get_lr(const struct kvm_vcpu *vcpu, int lr)
 		lr_desc.state |= LR_STATE_ACTIVE;
 		lr_desc.state |= LR_STATE_ACTIVE;
 	if (val & ICH_LR_EOI)
 	if (val & ICH_LR_EOI)
 		lr_desc.state |= LR_EOI_INT;
 		lr_desc.state |= LR_EOI_INT;
+	if (val & ICH_LR_HW) {
+		lr_desc.state |= LR_HW;
+		lr_desc.hwirq = (val >> ICH_LR_PHYS_ID_SHIFT) & GENMASK(9, 0);
+	}
 
 
 	return lr_desc;
 	return lr_desc;
 }
 }
@@ -84,10 +88,17 @@ static void vgic_v3_set_lr(struct kvm_vcpu *vcpu, int lr,
 	 * Eventually we want to make this configurable, so we may revisit
 	 * Eventually we want to make this configurable, so we may revisit
 	 * this in the future.
 	 * this in the future.
 	 */
 	 */
-	if (vcpu->kvm->arch.vgic.vgic_model == KVM_DEV_TYPE_ARM_VGIC_V3)
+	switch (vcpu->kvm->arch.vgic.vgic_model) {
+	case KVM_DEV_TYPE_ARM_VGIC_V3:
 		lr_val |= ICH_LR_GROUP;
 		lr_val |= ICH_LR_GROUP;
-	else
-		lr_val |= (u32)lr_desc.source << GICH_LR_PHYSID_CPUID_SHIFT;
+		break;
+	case  KVM_DEV_TYPE_ARM_VGIC_V2:
+		if (lr_desc.irq < VGIC_NR_SGIS)
+			lr_val |= (u32)lr_desc.source << GICH_LR_PHYSID_CPUID_SHIFT;
+		break;
+	default:
+		BUG();
+	}
 
 
 	if (lr_desc.state & LR_STATE_PENDING)
 	if (lr_desc.state & LR_STATE_PENDING)
 		lr_val |= ICH_LR_PENDING_BIT;
 		lr_val |= ICH_LR_PENDING_BIT;
@@ -95,6 +106,10 @@ static void vgic_v3_set_lr(struct kvm_vcpu *vcpu, int lr,
 		lr_val |= ICH_LR_ACTIVE_BIT;
 		lr_val |= ICH_LR_ACTIVE_BIT;
 	if (lr_desc.state & LR_EOI_INT)
 	if (lr_desc.state & LR_EOI_INT)
 		lr_val |= ICH_LR_EOI;
 		lr_val |= ICH_LR_EOI;
+	if (lr_desc.state & LR_HW) {
+		lr_val |= ICH_LR_HW;
+		lr_val |= ((u64)lr_desc.hwirq) << ICH_LR_PHYS_ID_SHIFT;
+	}
 
 
 	vcpu->arch.vgic_cpu.vgic_v3.vgic_lr[LR_INDEX(lr)] = lr_val;
 	vcpu->arch.vgic_cpu.vgic_v3.vgic_lr[LR_INDEX(lr)] = lr_val;
 }
 }

+ 389 - 38
virt/kvm/arm/vgic.c

@@ -24,6 +24,7 @@
 #include <linux/of.h>
 #include <linux/of.h>
 #include <linux/of_address.h>
 #include <linux/of_address.h>
 #include <linux/of_irq.h>
 #include <linux/of_irq.h>
+#include <linux/rculist.h>
 #include <linux/uaccess.h>
 #include <linux/uaccess.h>
 
 
 #include <asm/kvm_emulate.h>
 #include <asm/kvm_emulate.h>
@@ -74,6 +75,28 @@
  *   cause the interrupt to become inactive in such a situation.
  *   cause the interrupt to become inactive in such a situation.
  *   Conversely, writes to GICD_ICPENDRn do not cause the interrupt to become
  *   Conversely, writes to GICD_ICPENDRn do not cause the interrupt to become
  *   inactive as long as the external input line is held high.
  *   inactive as long as the external input line is held high.
+ *
+ *
+ * Initialization rules: there are multiple stages to the vgic
+ * initialization, both for the distributor and the CPU interfaces.
+ *
+ * Distributor:
+ *
+ * - kvm_vgic_early_init(): initialization of static data that doesn't
+ *   depend on any sizing information or emulation type. No allocation
+ *   is allowed there.
+ *
+ * - vgic_init(): allocation and initialization of the generic data
+ *   structures that depend on sizing information (number of CPUs,
+ *   number of interrupts). Also initializes the vcpu specific data
+ *   structures. Can be executed lazily for GICv2.
+ *   [to be renamed to kvm_vgic_init??]
+ *
+ * CPU Interface:
+ *
+ * - kvm_vgic_cpu_early_init(): initialization of static data that
+ *   doesn't depend on any sizing information or emulation type. No
+ *   allocation is allowed there.
  */
  */
 
 
 #include "vgic.h"
 #include "vgic.h"
@@ -82,6 +105,8 @@ static void vgic_retire_disabled_irqs(struct kvm_vcpu *vcpu);
 static void vgic_retire_lr(int lr_nr, int irq, struct kvm_vcpu *vcpu);
 static void vgic_retire_lr(int lr_nr, int irq, struct kvm_vcpu *vcpu);
 static struct vgic_lr vgic_get_lr(const struct kvm_vcpu *vcpu, int lr);
 static struct vgic_lr vgic_get_lr(const struct kvm_vcpu *vcpu, int lr);
 static void vgic_set_lr(struct kvm_vcpu *vcpu, int lr, struct vgic_lr lr_desc);
 static void vgic_set_lr(struct kvm_vcpu *vcpu, int lr, struct vgic_lr lr_desc);
+static struct irq_phys_map *vgic_irq_map_search(struct kvm_vcpu *vcpu,
+						int virt_irq);
 
 
 static const struct vgic_ops *vgic_ops;
 static const struct vgic_ops *vgic_ops;
 static const struct vgic_params *vgic;
 static const struct vgic_params *vgic;
@@ -375,7 +400,7 @@ void vgic_cpu_irq_clear(struct kvm_vcpu *vcpu, int irq)
 
 
 static bool vgic_can_sample_irq(struct kvm_vcpu *vcpu, int irq)
 static bool vgic_can_sample_irq(struct kvm_vcpu *vcpu, int irq)
 {
 {
-	return vgic_irq_is_edge(vcpu, irq) || !vgic_irq_is_queued(vcpu, irq);
+	return !vgic_irq_is_queued(vcpu, irq);
 }
 }
 
 
 /**
 /**
@@ -1115,6 +1140,39 @@ static void vgic_queue_irq_to_lr(struct kvm_vcpu *vcpu, int irq,
 	if (!vgic_irq_is_edge(vcpu, irq))
 	if (!vgic_irq_is_edge(vcpu, irq))
 		vlr.state |= LR_EOI_INT;
 		vlr.state |= LR_EOI_INT;
 
 
+	if (vlr.irq >= VGIC_NR_SGIS) {
+		struct irq_phys_map *map;
+		map = vgic_irq_map_search(vcpu, irq);
+
+		/*
+		 * If we have a mapping, and the virtual interrupt is
+		 * being injected, then we must set the state to
+		 * active in the physical world. Otherwise the
+		 * physical interrupt will fire and the guest will
+		 * exit before processing the virtual interrupt.
+		 */
+		if (map) {
+			int ret;
+
+			BUG_ON(!map->active);
+			vlr.hwirq = map->phys_irq;
+			vlr.state |= LR_HW;
+			vlr.state &= ~LR_EOI_INT;
+
+			ret = irq_set_irqchip_state(map->irq,
+						    IRQCHIP_STATE_ACTIVE,
+						    true);
+			WARN_ON(ret);
+
+			/*
+			 * Make sure we're not going to sample this
+			 * again, as a HW-backed interrupt cannot be
+			 * in the PENDING_ACTIVE stage.
+			 */
+			vgic_irq_set_queued(vcpu, irq);
+		}
+	}
+
 	vgic_set_lr(vcpu, lr_nr, vlr);
 	vgic_set_lr(vcpu, lr_nr, vlr);
 	vgic_sync_lr_elrsr(vcpu, lr_nr, vlr);
 	vgic_sync_lr_elrsr(vcpu, lr_nr, vlr);
 }
 }
@@ -1339,6 +1397,39 @@ static bool vgic_process_maintenance(struct kvm_vcpu *vcpu)
 	return level_pending;
 	return level_pending;
 }
 }
 
 
+/*
+ * Save the physical active state, and reset it to inactive.
+ *
+ * Return 1 if HW interrupt went from active to inactive, and 0 otherwise.
+ */
+static int vgic_sync_hwirq(struct kvm_vcpu *vcpu, struct vgic_lr vlr)
+{
+	struct irq_phys_map *map;
+	int ret;
+
+	if (!(vlr.state & LR_HW))
+		return 0;
+
+	map = vgic_irq_map_search(vcpu, vlr.irq);
+	BUG_ON(!map || !map->active);
+
+	ret = irq_get_irqchip_state(map->irq,
+				    IRQCHIP_STATE_ACTIVE,
+				    &map->active);
+
+	WARN_ON(ret);
+
+	if (map->active) {
+		ret = irq_set_irqchip_state(map->irq,
+					    IRQCHIP_STATE_ACTIVE,
+					    false);
+		WARN_ON(ret);
+		return 0;
+	}
+
+	return 1;
+}
+
 /* Sync back the VGIC state after a guest run */
 /* Sync back the VGIC state after a guest run */
 static void __kvm_vgic_sync_hwstate(struct kvm_vcpu *vcpu)
 static void __kvm_vgic_sync_hwstate(struct kvm_vcpu *vcpu)
 {
 {
@@ -1353,14 +1444,31 @@ static void __kvm_vgic_sync_hwstate(struct kvm_vcpu *vcpu)
 	elrsr = vgic_get_elrsr(vcpu);
 	elrsr = vgic_get_elrsr(vcpu);
 	elrsr_ptr = u64_to_bitmask(&elrsr);
 	elrsr_ptr = u64_to_bitmask(&elrsr);
 
 
-	/* Clear mappings for empty LRs */
-	for_each_set_bit(lr, elrsr_ptr, vgic->nr_lr) {
+	/* Deal with HW interrupts, and clear mappings for empty LRs */
+	for (lr = 0; lr < vgic->nr_lr; lr++) {
 		struct vgic_lr vlr;
 		struct vgic_lr vlr;
 
 
-		if (!test_and_clear_bit(lr, vgic_cpu->lr_used))
+		if (!test_bit(lr, vgic_cpu->lr_used))
 			continue;
 			continue;
 
 
 		vlr = vgic_get_lr(vcpu, lr);
 		vlr = vgic_get_lr(vcpu, lr);
+		if (vgic_sync_hwirq(vcpu, vlr)) {
+			/*
+			 * So this is a HW interrupt that the guest
+			 * EOI-ed. Clean the LR state and allow the
+			 * interrupt to be sampled again.
+			 */
+			vlr.state = 0;
+			vlr.hwirq = 0;
+			vgic_set_lr(vcpu, lr, vlr);
+			vgic_irq_clear_queued(vcpu, vlr.irq);
+			set_bit(lr, elrsr_ptr);
+		}
+
+		if (!test_bit(lr, elrsr_ptr))
+			continue;
+
+		clear_bit(lr, vgic_cpu->lr_used);
 
 		BUG_ON(vlr.irq >= dist->nr_irqs);
 		vgic_cpu->vgic_irq_lr_map[vlr.irq] = LR_EMPTY;
@@ -1447,7 +1555,8 @@ static int vgic_validate_injection(struct kvm_vcpu *vcpu, int irq, int level)
 }
 
 static int vgic_update_irq_pending(struct kvm *kvm, int cpuid,
-				  unsigned int irq_num, bool level)
+				   struct irq_phys_map *map,
+				   unsigned int irq_num, bool level)
 {
 	struct vgic_dist *dist = &kvm->arch.vgic;
 	struct kvm_vcpu *vcpu;
@@ -1455,6 +1564,9 @@ static int vgic_update_irq_pending(struct kvm *kvm, int cpuid,
 	int enabled;
 	bool ret = true, can_inject = true;
 
+	if (irq_num >= min(kvm->arch.vgic.nr_irqs, 1020))
+		return -EINVAL;
+
 	spin_lock(&dist->lock);
 
 	vcpu = kvm_get_vcpu(kvm, cpuid);
@@ -1517,18 +1629,46 @@ static int vgic_update_irq_pending(struct kvm *kvm, int cpuid,
 out:
 	spin_unlock(&dist->lock);
 
-	return ret ? cpuid : -EINVAL;
+	if (ret) {
+		/* kick the specified vcpu */
+		kvm_vcpu_kick(kvm_get_vcpu(kvm, cpuid));
+	}
+
+	return 0;
+}
+
+static int vgic_lazy_init(struct kvm *kvm)
+{
+	int ret = 0;
+
+	if (unlikely(!vgic_initialized(kvm))) {
+		/*
+		 * We only provide the automatic initialization of the VGIC
+		 * for the legacy case of a GICv2. Any other type must
+		 * be explicitly initialized once setup with the respective
+		 * KVM device call.
+		 */
+		if (kvm->arch.vgic.vgic_model != KVM_DEV_TYPE_ARM_VGIC_V2)
+			return -EBUSY;
+
+		mutex_lock(&kvm->lock);
+		ret = vgic_init(kvm);
+		mutex_unlock(&kvm->lock);
+	}
+
+	return ret;
 }
 
 /**
  * kvm_vgic_inject_irq - Inject an IRQ from a device to the vgic
  * @kvm:     The VM structure pointer
  * @cpuid:   The CPU for PPIs
- * @irq_num: The IRQ number that is assigned to the device
+ * @irq_num: The IRQ number that is assigned to the device. This IRQ
+ *           must not be mapped to a HW interrupt.
  * @level:   Edge-triggered:  true:  to trigger the interrupt
  *			      false: to ignore the call
- *	     Level-sensitive  true:  activates an interrupt
- *			      false: deactivates an interrupt
+ *	     Level-sensitive  true:  raise the input signal
+ *			      false: lower the input signal
  *
  * The GIC is not concerned with devices being active-LOW or active-HIGH for
  * level-sensitive interrupts.  You can think of the level parameter as 1
@@ -1537,39 +1677,44 @@ out:
 int kvm_vgic_inject_irq(struct kvm *kvm, int cpuid, unsigned int irq_num,
 			bool level)
 {
-	int ret = 0;
-	int vcpu_id;
-
-	if (unlikely(!vgic_initialized(kvm))) {
-		/*
-		 * We only provide the automatic initialization of the VGIC
-		 * for the legacy case of a GICv2. Any other type must
-		 * be explicitly initialized once setup with the respective
-		 * KVM device call.
-		 */
-		if (kvm->arch.vgic.vgic_model != KVM_DEV_TYPE_ARM_VGIC_V2) {
-			ret = -EBUSY;
-			goto out;
-		}
-		mutex_lock(&kvm->lock);
-		ret = vgic_init(kvm);
-		mutex_unlock(&kvm->lock);
+	struct irq_phys_map *map;
+	int ret;
 
-		if (ret)
-			goto out;
-	}
+	ret = vgic_lazy_init(kvm);
+	if (ret)
+		return ret;
 
-	if (irq_num >= min(kvm->arch.vgic.nr_irqs, 1020))
+	map = vgic_irq_map_search(kvm_get_vcpu(kvm, cpuid), irq_num);
+	if (map)
 		return -EINVAL;
 
-	vcpu_id = vgic_update_irq_pending(kvm, cpuid, irq_num, level);
-	if (vcpu_id >= 0) {
-		/* kick the specified vcpu */
-		kvm_vcpu_kick(kvm_get_vcpu(kvm, vcpu_id));
-	}
+	return vgic_update_irq_pending(kvm, cpuid, NULL, irq_num, level);
+}
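As a usage illustration only, not part of this patch, a device model asserting a level-triggered SPI would raise and later lower the input line. The IRQ number and the cpuid of 0 are assumptions:

/*
 * Sketch only: "dev_irq" is a hypothetical level-triggered SPI. For
 * SPIs the cpuid argument is not meaningful, so 0 is passed.
 */
static void example_assert_and_clear(struct kvm *kvm, unsigned int dev_irq)
{
	kvm_vgic_inject_irq(kvm, 0, dev_irq, true);	/* raise the line */
	/* ... the guest services the device ... */
	kvm_vgic_inject_irq(kvm, 0, dev_irq, false);	/* lower the line */
}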
 
-out:
-	return ret;
+/**
+ * kvm_vgic_inject_mapped_irq - Inject a physically mapped IRQ to the vgic
+ * @kvm:     The VM structure pointer
+ * @cpuid:   The CPU for PPIs
+ * @map:     Pointer to an irq_phys_map structure describing the mapping
+ * @level:   Edge-triggered:  true:  to trigger the interrupt
+ *			      false: to ignore the call
+ *	     Level-sensitive  true:  raise the input signal
+ *			      false: lower the input signal
+ *
+ * The GIC is not concerned with devices being active-LOW or active-HIGH for
+ * level-sensitive interrupts.  You can think of the level parameter as 1
+ * being HIGH and 0 being LOW and all devices being active-HIGH.
+ */
+int kvm_vgic_inject_mapped_irq(struct kvm *kvm, int cpuid,
+			       struct irq_phys_map *map, bool level)
+{
+	int ret;
+
+	ret = vgic_lazy_init(kvm);
+	if (ret)
+		return ret;
+
+	return vgic_update_irq_pending(kvm, cpuid, map, map->virt_irq, level);
 }
 
 static irqreturn_t vgic_maintenance_handler(int irq, void *data)
@@ -1583,6 +1728,188 @@ static irqreturn_t vgic_maintenance_handler(int irq, void *data)
 	return IRQ_HANDLED;
 }
 
+static struct list_head *vgic_get_irq_phys_map_list(struct kvm_vcpu *vcpu,
+						    int virt_irq)
+{
+	if (virt_irq < VGIC_NR_PRIVATE_IRQS)
+		return &vcpu->arch.vgic_cpu.irq_phys_map_list;
+	else
+		return &vcpu->kvm->arch.vgic.irq_phys_map_list;
+}
+
+/**
+ * kvm_vgic_map_phys_irq - map a virtual IRQ to a physical IRQ
+ * @vcpu: The VCPU pointer
+ * @virt_irq: The virtual irq number
+ * @irq: The Linux IRQ number
+ *
+ * Establish a mapping between a guest visible irq (@virt_irq) and a
+ * Linux irq (@irq). On injection, @virt_irq will be associated with
+ * the physical interrupt represented by @irq. This mapping can be
+ * established multiple times as long as the parameters are the same.
+ *
+ * Returns a valid pointer on success, and an error pointer otherwise
+ */
+struct irq_phys_map *kvm_vgic_map_phys_irq(struct kvm_vcpu *vcpu,
+					   int virt_irq, int irq)
+{
+	struct vgic_dist *dist = &vcpu->kvm->arch.vgic;
+	struct list_head *root = vgic_get_irq_phys_map_list(vcpu, virt_irq);
+	struct irq_phys_map *map;
+	struct irq_phys_map_entry *entry;
+	struct irq_desc *desc;
+	struct irq_data *data;
+	int phys_irq;
+
+	desc = irq_to_desc(irq);
+	if (!desc) {
+		kvm_err("%s: no interrupt descriptor\n", __func__);
+		return ERR_PTR(-EINVAL);
+	}
+
+	data = irq_desc_get_irq_data(desc);
+	while (data->parent_data)
+		data = data->parent_data;
+
+	phys_irq = data->hwirq;
+
+	/* Create a new mapping */
+	entry = kzalloc(sizeof(*entry), GFP_KERNEL);
+	if (!entry)
+		return ERR_PTR(-ENOMEM);
+
+	spin_lock(&dist->irq_phys_map_lock);
+
+	/* Try to match an existing mapping */
+	map = vgic_irq_map_search(vcpu, virt_irq);
+	if (map) {
+		/* Make sure this mapping matches */
+		if (map->phys_irq != phys_irq	||
+		    map->irq      != irq)
+			map = ERR_PTR(-EINVAL);
+
+		/* Found an existing, valid mapping */
+		goto out;
+	}
+
+	map           = &entry->map;
+	map->virt_irq = virt_irq;
+	map->phys_irq = phys_irq;
+	map->irq      = irq;
+
+	list_add_tail_rcu(&entry->entry, root);
+
+out:
+	spin_unlock(&dist->irq_phys_map_lock);
+	/*
+	 * If we've found a hit in the existing list, free the
+	 * useless entry.
+	 */
+	if (IS_ERR(map) || map != &entry->map)
+		kfree(entry);
+	return map;
+}
+
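For illustration, a caller such as the arch timer code could combine this with kvm_vgic_inject_mapped_irq() roughly as follows. GUEST_TIMER_PPI and host_irq are assumed values, not taken from this patch:

#define GUEST_TIMER_PPI	27	/* hypothetical virtual PPI number */

/*
 * Sketch only: map the guest-visible PPI to the host Linux IRQ once,
 * then inject through the mapping so the LR is created with LR_HW set.
 */
static int example_map_and_fire(struct kvm_vcpu *vcpu, int host_irq)
{
	struct irq_phys_map *map;

	map = kvm_vgic_map_phys_irq(vcpu, GUEST_TIMER_PPI, host_irq);
	if (IS_ERR(map))
		return PTR_ERR(map);

	return kvm_vgic_inject_mapped_irq(vcpu->kvm, vcpu->vcpu_id,
					  map, true);
}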
+static struct irq_phys_map *vgic_irq_map_search(struct kvm_vcpu *vcpu,
+						int virt_irq)
+{
+	struct list_head *root = vgic_get_irq_phys_map_list(vcpu, virt_irq);
+	struct irq_phys_map_entry *entry;
+	struct irq_phys_map *map;
+
+	rcu_read_lock();
+
+	list_for_each_entry_rcu(entry, root, entry) {
+		map = &entry->map;
+		if (map->virt_irq == virt_irq) {
+			rcu_read_unlock();
+			return map;
+		}
+	}
+
+	rcu_read_unlock();
+
+	return NULL;
+}
+
+static void vgic_free_phys_irq_map_rcu(struct rcu_head *rcu)
+{
+	struct irq_phys_map_entry *entry;
+
+	entry = container_of(rcu, struct irq_phys_map_entry, rcu);
+	kfree(entry);
+}
+
+/**
+ * kvm_vgic_get_phys_irq_active - Return the active state of a mapped IRQ
+ *
+ * Return the logical active state of a mapped interrupt. This doesn't
+ * necessarily reflect the current HW state.
+ */
+bool kvm_vgic_get_phys_irq_active(struct irq_phys_map *map)
+{
+	BUG_ON(!map);
+	return map->active;
+}
+
+/**
+ * kvm_vgic_set_phys_irq_active - Set the active state of a mapped IRQ
+ *
+ * Set the logical active state of a mapped interrupt. This doesn't
+ * immediately affect the HW state.
+ */
+void kvm_vgic_set_phys_irq_active(struct irq_phys_map *map, bool active)
+{
+	BUG_ON(!map);
+	map->active = active;
+}
+
+/**
+ * kvm_vgic_unmap_phys_irq - Remove a virtual to physical IRQ mapping
+ * @vcpu: The VCPU pointer
+ * @map: The pointer to a mapping obtained through kvm_vgic_map_phys_irq
+ *
+ * Remove an existing mapping between virtual and physical interrupts.
+ */
+int kvm_vgic_unmap_phys_irq(struct kvm_vcpu *vcpu, struct irq_phys_map *map)
+{
+	struct vgic_dist *dist = &vcpu->kvm->arch.vgic;
+	struct irq_phys_map_entry *entry;
+	struct list_head *root;
+
+	if (!map)
+		return -EINVAL;
+
+	root = vgic_get_irq_phys_map_list(vcpu, map->virt_irq);
+
+	spin_lock(&dist->irq_phys_map_lock);
+
+	list_for_each_entry(entry, root, entry) {
+		if (&entry->map == map) {
+			list_del_rcu(&entry->entry);
+			call_rcu(&entry->rcu, vgic_free_phys_irq_map_rcu);
+			break;
+		}
+	}
+
+	spin_unlock(&dist->irq_phys_map_lock);
+
+	return 0;
+}
+
+static void vgic_destroy_irq_phys_map(struct kvm *kvm, struct list_head *root)
+{
+	struct vgic_dist *dist = &kvm->arch.vgic;
+	struct irq_phys_map_entry *entry;
+
+	spin_lock(&dist->irq_phys_map_lock);
+
+	list_for_each_entry(entry, root, entry) {
+		list_del_rcu(&entry->entry);
+		call_rcu(&entry->rcu, vgic_free_phys_irq_map_rcu);
+	}
+
+	spin_unlock(&dist->irq_phys_map_lock);
+}
+
 void kvm_vgic_vcpu_destroy(struct kvm_vcpu *vcpu)
 {
 	struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu;
@@ -1591,6 +1918,7 @@ void kvm_vgic_vcpu_destroy(struct kvm_vcpu *vcpu)
 	kfree(vgic_cpu->active_shared);
 	kfree(vgic_cpu->pend_act_shared);
 	kfree(vgic_cpu->vgic_irq_lr_map);
+	vgic_destroy_irq_phys_map(vcpu->kvm, &vgic_cpu->irq_phys_map_list);
 	vgic_cpu->pending_shared = NULL;
 	vgic_cpu->active_shared = NULL;
 	vgic_cpu->pend_act_shared = NULL;
@@ -1627,6 +1955,17 @@ static int vgic_vcpu_init_maps(struct kvm_vcpu *vcpu, int nr_irqs)
 	return 0;
 }
 
+/**
+ * kvm_vgic_vcpu_early_init - Earliest possible per-vcpu vgic init stage
+ *
+ * No memory allocation should be performed here, only static init.
+ */
+void kvm_vgic_vcpu_early_init(struct kvm_vcpu *vcpu)
+{
+	struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu;
+
+	INIT_LIST_HEAD(&vgic_cpu->irq_phys_map_list);
+}
+
 /**
  * kvm_vgic_get_max_vcpus - Get the maximum number of VCPUs allowed by HW
  *
@@ -1664,6 +2003,7 @@ void kvm_vgic_destroy(struct kvm *kvm)
 	kfree(dist->irq_spi_target);
 	kfree(dist->irq_pending_on_cpu);
 	kfree(dist->irq_active_on_cpu);
+	vgic_destroy_irq_phys_map(kvm, &dist->irq_phys_map_list);
 	dist->irq_sgi_sources = NULL;
 	dist->irq_spi_cpu = NULL;
 	dist->irq_spi_target = NULL;
@@ -1787,6 +2127,18 @@ static int init_vgic_model(struct kvm *kvm, int type)
 	return 0;
 }
 
+/**
+ * kvm_vgic_early_init - Earliest possible vgic initialization stage
+ *
+ * No memory allocation should be performed here, only static init.
+ */
+void kvm_vgic_early_init(struct kvm *kvm)
+{
+	spin_lock_init(&kvm->arch.vgic.lock);
+	spin_lock_init(&kvm->arch.vgic.irq_phys_map_lock);
+	INIT_LIST_HEAD(&kvm->arch.vgic.irq_phys_map_list);
+}
+
 int kvm_vgic_create(struct kvm *kvm, u32 type)
 {
 	int i, vcpu_lock_idx = -1, ret;
@@ -1832,7 +2184,6 @@ int kvm_vgic_create(struct kvm *kvm, u32 type)
 	if (ret)
 		goto out_unlock;
 
-	spin_lock_init(&kvm->arch.vgic.lock);
 	kvm->arch.vgic.in_kernel = true;
 	kvm->arch.vgic.vgic_model = type;
 	kvm->arch.vgic.vctrl_base = vgic->vctrl_base;

+ 6 - 2
virt/kvm/irqchip.c

@@ -213,11 +213,15 @@ int kvm_set_irq_routing(struct kvm *kvm,
 			goto out;
 
 		r = -EINVAL;
-		if (ue->flags)
+		if (ue->flags) {
+			kfree(e);
 			goto out;
+		}
 		r = setup_routing_entry(new, e, ue);
-		if (r)
+		if (r) {
+			kfree(e);
 			goto out;
+		}
 		++ue;
 	}
 

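The two hunks above fix a memory leak: setup_routing_entry() only takes ownership of "e" on success, so both early exits from the loop must free it. Reduced to its shape, a sketch of the loop body with the error-code bookkeeping elided:

	e = kzalloc(sizeof(*e), GFP_KERNEL);
	if (!e)
		goto out;
	if (ue->flags || setup_routing_entry(new, e, ue)) {
		kfree(e);	/* previously leaked on these paths */
		goto out;
	}
	++ue;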
+ 58 - 4
virt/kvm/kvm_main.c

@@ -66,9 +66,18 @@
 MODULE_AUTHOR("Qumranet");
 MODULE_LICENSE("GPL");
 
-static unsigned int halt_poll_ns;
+/* halt polling only reduces halt latency by 5-7 us, 500us is enough */
+static unsigned int halt_poll_ns = 500000;
 module_param(halt_poll_ns, uint, S_IRUGO | S_IWUSR);
 
+/* Default doubles per-vcpu halt_poll_ns. */
+static unsigned int halt_poll_ns_grow = 2;
+module_param(halt_poll_ns_grow, int, S_IRUGO);
+
+/* Default resets per-vcpu halt_poll_ns. */
+static unsigned int halt_poll_ns_shrink;
+module_param(halt_poll_ns_shrink, int, S_IRUGO);
+
 /*
  * Ordering of locks:
  *
@@ -217,6 +226,7 @@ int kvm_vcpu_init(struct kvm_vcpu *vcpu, struct kvm *kvm, unsigned id)
 	vcpu->kvm = kvm;
 	vcpu->vcpu_id = id;
 	vcpu->pid = NULL;
+	vcpu->halt_poll_ns = 0;
 	init_waitqueue_head(&vcpu->wq);
 	kvm_async_pf_vcpu_init(vcpu);
 
@@ -1906,6 +1916,35 @@ void kvm_vcpu_mark_page_dirty(struct kvm_vcpu *vcpu, gfn_t gfn)
 }
 EXPORT_SYMBOL_GPL(kvm_vcpu_mark_page_dirty);
 
+static void grow_halt_poll_ns(struct kvm_vcpu *vcpu)
+{
+	int old, val;
+
+	old = val = vcpu->halt_poll_ns;
+	/* 10us base */
+	if (val == 0 && halt_poll_ns_grow)
+		val = 10000;
+	else
+		val *= halt_poll_ns_grow;
+
+	vcpu->halt_poll_ns = val;
+	trace_kvm_halt_poll_ns_grow(vcpu->vcpu_id, val, old);
+}
+
+static void shrink_halt_poll_ns(struct kvm_vcpu *vcpu)
+{
+	int old, val;
+
+	old = val = vcpu->halt_poll_ns;
+	if (halt_poll_ns_shrink == 0)
+		val = 0;
+	else
+		val /= halt_poll_ns_shrink;
+
+	vcpu->halt_poll_ns = val;
+	trace_kvm_halt_poll_ns_shrink(vcpu->vcpu_id, val, old);
+}
+
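With the defaults above (halt_poll_ns_grow = 2, halt_poll_ns_shrink = 0), the arithmetic works out as follows; a worked example, not additional kernel code:

/*
 * grow_halt_poll_ns():   0 -> 10000 -> 20000 -> 40000 -> ... (ns)
 *                        a zero window starts at the 10us base and
 *                        doubles on each subsequent grow
 * shrink_halt_poll_ns(): any value -> 0 with the default shrink of 0;
 *                        a shrink of 2 would halve the window instead
 */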
 static int kvm_vcpu_check_block(struct kvm_vcpu *vcpu)
 {
 	if (kvm_arch_vcpu_runnable(vcpu)) {
@@ -1928,10 +1967,11 @@ void kvm_vcpu_block(struct kvm_vcpu *vcpu)
 	ktime_t start, cur;
 	DEFINE_WAIT(wait);
 	bool waited = false;
+	u64 block_ns;
 
 	start = cur = ktime_get();
-	if (halt_poll_ns) {
-		ktime_t stop = ktime_add_ns(ktime_get(), halt_poll_ns);
+	if (vcpu->halt_poll_ns) {
+		ktime_t stop = ktime_add_ns(ktime_get(), vcpu->halt_poll_ns);
 
 		do {
 			/*
@@ -1960,7 +2000,21 @@ void kvm_vcpu_block(struct kvm_vcpu *vcpu)
 	cur = ktime_get();
 
 out:
-	trace_kvm_vcpu_wakeup(ktime_to_ns(cur) - ktime_to_ns(start), waited);
+	block_ns = ktime_to_ns(cur) - ktime_to_ns(start);
+
+	if (halt_poll_ns) {
+		if (block_ns <= vcpu->halt_poll_ns)
+			;
+		/* we had a long block, shrink polling */
+		else if (vcpu->halt_poll_ns && block_ns > halt_poll_ns)
+			shrink_halt_poll_ns(vcpu);
+		/* we had a short halt and our poll time is too small */
+		else if (vcpu->halt_poll_ns < halt_poll_ns &&
+			block_ns < halt_poll_ns)
+			grow_halt_poll_ns(vcpu);
+	}
+
+	trace_kvm_vcpu_wakeup(block_ns, waited);
 }
 EXPORT_SYMBOL_GPL(kvm_vcpu_block);
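A worked example of the tuning rule above, assuming the default halt_poll_ns of 500000 ns and a current per-vcpu window of 10000 ns:

/*
 * block_ns =   8000: within the window, polling paid off, keep 10000
 * block_ns = 100000: woke after polling but still under 500000, the
 *                    window was too small, grow it to 20000
 * block_ns = 600000: longer than halt_poll_ns, polling was wasted
 *                    effort, shrink (reset to 0 by default)
 */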