Merge tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm

Pull KVM updates from Paolo Bonzini:
 "First batch of KVM changes for 4.4.

  s390:
     A bunch of fixes and optimizations for interrupt and time handling.

  PPC:
     Mostly bug fixes.

  ARM:
     No big features, but many small fixes and prerequisites including:

      - a number of fixes for the arch-timer

      - introducing proper level-triggered semantics for the arch-timers

      - a series of patches to synchronously halt a guest (prerequisite
        for IRQ forwarding)

      - some tracepoint improvements

      - a tweak for the EL2 panic handlers

      - some more VGIC cleanups getting rid of redundant state

  x86:
     Quite a few changes:

      - support for VT-d posted interrupts (i.e. PCI devices can inject
        interrupts directly into vCPUs).  This introduces a new
        component (in virt/lib/) that connects VFIO and KVM together.
        The same infrastructure will be used for ARM interrupt
        forwarding as well.

      - more Hyper-V features, though the main one, the Hyper-V synthetic
        interrupt controller, will have to wait for 4.5.  These will let
        KVM expose Hyper-V devices.

      - nested virtualization now supports VPID (same as PCID but for
        vCPUs) which makes it quite a bit faster

      - for future hardware that supports NVDIMM, there is support for
        clflushopt, clwb, pcommit

      - support for "split irqchip", i.e.  LAPIC in kernel +
        IOAPIC/PIC/PIT in userspace, which reduces the attack surface of
        the hypervisor

      - obligatory smattering of SMM fixes

      - on the guest side, stable scheduler clock support was rewritten
        to not require help from the hypervisor"

* tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm: (123 commits)
  KVM: VMX: Fix commit which broke PML
  KVM: x86: obey KVM_X86_QUIRK_CD_NW_CLEARED in kvm_set_cr0()
  KVM: x86: allow RSM from 64-bit mode
  KVM: VMX: fix SMEP and SMAP without EPT
  KVM: x86: move kvm_set_irq_inatomic to legacy device assignment
  KVM: device assignment: remove pointless #ifdefs
  KVM: x86: merge kvm_arch_set_irq with kvm_set_msi_inatomic
  KVM: x86: zero apic_arb_prio on reset
  drivers/hv: share Hyper-V SynIC constants with userspace
  KVM: x86: handle SMBASE as physical address in RSM
  KVM: x86: add read_phys to x86_emulate_ops
  KVM: x86: removing unused variable
  KVM: don't pointlessly leave KVM_COMPAT=y in non-KVM configs
  KVM: arm/arm64: Merge vgic_set_lr() and vgic_sync_lr_elrsr()
  KVM: arm/arm64: Clean up vgic_retire_lr() and surroundings
  KVM: arm/arm64: Optimize away redundant LR tracking
  KVM: s390: use simple switch statement as multiplexer
  KVM: s390: drop useless newline in debugging data
  KVM: s390: SCA must not cross page boundaries
  KVM: arm: Do not indent the arguments of DECLARE_BITMAP
  ...
Linus Torvalds, 9 years ago
commit 933425fb00
89 changed files with 2956 additions and 1029 deletions
   1. Documentation/kernel-parameters.txt (+1 -0)
   2. Documentation/virtual/kvm/api.txt (+47 -5)
   3. Documentation/virtual/kvm/arm/vgic-mapped-irqs.txt (+187 -0)
   4. Documentation/virtual/kvm/devices/arm-vgic.txt (+10 -8)
   5. Documentation/virtual/kvm/locking.txt (+12 -0)
   6. MAINTAINERS (+7 -0)
   7. Makefile (+6 -4)
   8. arch/arm/include/asm/kvm_arm.h (+20 -0)
   9. arch/arm/include/asm/kvm_host.h (+4 -1)
  10. arch/arm/kvm/Kconfig (+2 -0)
  11. arch/arm/kvm/arm.c (+60 -16)
  12. arch/arm/kvm/psci.c (+5 -5)
  13. arch/arm/kvm/trace.h (+7 -3)
  14. arch/arm64/include/asm/kvm_arm.h (+16 -0)
  15. arch/arm64/include/asm/kvm_host.h (+4 -1)
  16. arch/arm64/kvm/Kconfig (+2 -0)
  17. arch/arm64/kvm/hyp.S (+8 -0)
  18. arch/mips/include/asm/kvm_host.h (+2 -0)
  19. arch/powerpc/include/asm/disassemble.h (+5 -0)
  20. arch/powerpc/include/asm/kvm_host.h (+2 -0)
  21. arch/powerpc/include/asm/reg_booke.h (+6 -0)
  22. arch/powerpc/kvm/book3s_64_mmu_hv.c (+2 -1)
  23. arch/powerpc/kvm/book3s_hv_rm_mmu.c (+2 -0)
  24. arch/powerpc/kvm/book3s_hv_rmhandlers.S (+22 -7)
  25. arch/powerpc/kvm/e500.c (+2 -1)
  26. arch/powerpc/kvm/e500_emulate.c (+19 -0)
  27. arch/powerpc/kvm/e500_mmu_host.c (+2 -2)
  28. arch/powerpc/kvm/powerpc.c (+3 -0)
  29. arch/s390/include/asm/kvm_host.h (+2 -0)
  30. arch/s390/kvm/intercept.c (+21 -21)
  31. arch/s390/kvm/interrupt.c (+43 -73)
  32. arch/s390/kvm/kvm-s390.c (+27 -31)
  33. arch/s390/kvm/kvm-s390.h (+31 -4)
  34. arch/s390/kvm/priv.c (+3 -16)
  35. arch/x86/include/asm/irq_remapping.h (+5 -5)
  36. arch/x86/include/asm/kvm_emulate.h (+10 -0)
  37. arch/x86/include/asm/kvm_host.h (+36 -2)
  38. arch/x86/include/asm/vmx.h (+2 -1)
  39. arch/x86/include/uapi/asm/hyperv.h (+18 -0)
  40. arch/x86/include/uapi/asm/vmx.h (+3 -1)
  41. arch/x86/kernel/kvmclock.c (+35 -11)
  42. arch/x86/kvm/Kconfig (+2 -0)
  43. arch/x86/kvm/assigned-dev.c (+37 -25)
  44. arch/x86/kvm/cpuid.c (+1 -1)
  45. arch/x86/kvm/cpuid.h (+37 -0)
  46. arch/x86/kvm/emulate.c (+27 -8)
  47. arch/x86/kvm/hyperv.c (+29 -2)
  48. arch/x86/kvm/i8254.c (+3 -1)
  49. arch/x86/kvm/ioapic.c (+6 -23)
  50. arch/x86/kvm/ioapic.h (+8 -7)
  51. arch/x86/kvm/irq.c (+30 -10)
  52. arch/x86/kvm/irq.h (+26 -1)
  53. arch/x86/kvm/irq_comm.c (+88 -41)
  54. arch/x86/kvm/lapic.c (+104 -23)
  55. arch/x86/kvm/lapic.h (+4 -3)
  56. arch/x86/kvm/mmu.c (+53 -38)
  57. arch/x86/kvm/paging_tmpl.h (+9 -10)
  58. arch/x86/kvm/svm.c (+19 -24)
  59. arch/x86/kvm/trace.h (+51 -0)
  60. arch/x86/kvm/vmx.c (+605 -145)
  61. arch/x86/kvm/x86.c (+197 -59)
  62. drivers/hv/hyperv_vmbus.h (+0 -5)
  63. drivers/iommu/irq_remapping.c (+8 -4)
  64. drivers/vfio/Kconfig (+1 -0)
  65. drivers/vfio/pci/Kconfig (+1 -0)
  66. drivers/vfio/pci/vfio_pci_intrs.c (+9 -0)
  67. drivers/vfio/pci/vfio_pci_private.h (+2 -0)
  68. include/kvm/arm_arch_timer.h (+3 -1)
  69. include/kvm/arm_vgic.h (+3 -13)
  70. include/linux/hyperv.h (+1 -0)
  71. include/linux/irqbypass.h (+90 -0)
  72. include/linux/kvm_host.h (+40 -2)
  73. include/linux/kvm_irqfd.h (+71 -0)
  74. include/uapi/linux/kvm.h (+7 -0)
  75. kernel/sched/cputime.c (+2 -0)
  76. virt/Makefile (+1 -0)
  77. virt/kvm/Kconfig (+4 -1)
  78. virt/kvm/arm/arch_timer.c (+117 -56)
  79. virt/kvm/arm/trace.h (+63 -0)
  80. virt/kvm/arm/vgic-v2.c (+1 -5)
  81. virt/kvm/arm/vgic-v3.c (+1 -5)
  82. virt/kvm/arm/vgic.c (+119 -189)
  83. virt/kvm/async_pf.c (+4 -0)
  84. virt/kvm/eventfd.c (+97 -93)
  85. virt/kvm/irqchip.c (+5 -13)
  86. virt/kvm/kvm_main.c (+9 -2)
  87. virt/lib/Kconfig (+2 -0)
  88. virt/lib/Makefile (+1 -0)
  89. virt/lib/irqbypass.c (+257 -0)

+ 1 - 0
Documentation/kernel-parameters.txt

@@ -1585,6 +1585,7 @@ bytes respectively. Such letter suffixes can also be entirely omitted.
 			nosid	disable Source ID checking
 			no_x2apic_optout
 				BIOS x2APIC opt-out request will be ignored
+			nopost	disable Interrupt Posting
 
 	iomem=		Disable strict checking of access to MMIO memory
 		strict	regions from userspace.

+ 47 - 5
Documentation/virtual/kvm/api.txt

@@ -401,10 +401,9 @@ Capability: basic
 Architectures: x86, ppc, mips
 Type: vcpu ioctl
 Parameters: struct kvm_interrupt (in)
-Returns: 0 on success, -1 on error
+Returns: 0 on success, negative on failure.
 
-Queues a hardware interrupt vector to be injected.  This is only
-useful if in-kernel local APIC or equivalent is not used.
+Queues a hardware interrupt vector to be injected.
 
 /* for KVM_INTERRUPT */
 struct kvm_interrupt {
@@ -414,7 +413,14 @@ struct kvm_interrupt {
 
 X86:
 
-Note 'irq' is an interrupt vector, not an interrupt pin or line.
+Returns: 0 on success,
+	 -EEXIST if an interrupt is already enqueued
+	 -EINVAL the the irq number is invalid
+	 -ENXIO if the PIC is in the kernel
+	 -EFAULT if the pointer is invalid
+
+Note 'irq' is an interrupt vector, not an interrupt pin or line. This
+ioctl is useful if the in-kernel PIC is not used.
 
 PPC:
 
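A minimal userspace sketch of the x86 KVM_INTERRUPT usage documented above; vcpu_fd is assumed to be a vCPU file descriptor from KVM_CREATE_VCPU, and the vector value is only an example:

    #include <sys/ioctl.h>
    #include <linux/kvm.h>

    /* Queue hardware interrupt vector 0x20 on a vCPU.  Only meaningful when
     * the in-kernel PIC is not used, as the note above explains. */
    static int inject_vector(int vcpu_fd)
    {
            struct kvm_interrupt irq = { .irq = 0x20 };

            return ioctl(vcpu_fd, KVM_INTERRUPT, &irq);  /* < 0 e.g. on -EEXIST */
    }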
@@ -1598,7 +1604,7 @@ provided event instead of triggering an exit.
 struct kvm_ioeventfd {
 	__u64 datamatch;
 	__u64 addr;        /* legal pio/mmio address */
-	__u32 len;         /* 1, 2, 4, or 8 bytes    */
+	__u32 len;         /* 0, 1, 2, 4, or 8 bytes    */
 	__s32 fd;
 	__u32 flags;
 	__u8  pad[36];
@@ -1621,6 +1627,10 @@ to the registered address is equal to datamatch in struct kvm_ioeventfd.
 For virtio-ccw devices, addr contains the subchannel id and datamatch the
 virtqueue index.
 
+With KVM_CAP_IOEVENTFD_ANY_LENGTH, a zero length ioeventfd is allowed, and
+the kernel will ignore the length of guest write and may get a faster vmexit.
+The speedup may only apply to specific architectures, but the ioeventfd will
+work anyway.
 
 4.60 KVM_DIRTY_TLB
 
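A short sketch of the zero-length ioeventfd described above, assuming the host kernel advertises KVM_CAP_IOEVENTFD_ANY_LENGTH; vm_fd and the doorbell address are illustrative:

    #include <stdint.h>
    #include <sys/eventfd.h>
    #include <sys/ioctl.h>
    #include <linux/kvm.h>

    static int add_any_length_ioeventfd(int vm_fd, uint64_t doorbell_addr)
    {
            int efd = eventfd(0, EFD_NONBLOCK);
            struct kvm_ioeventfd ioev = {
                    .addr = doorbell_addr,
                    .len  = 0,      /* 0 = ignore the width of the guest write */
                    .fd   = efd,
            };

            if (ioctl(vm_fd, KVM_IOEVENTFD, &ioev) < 0)
                    return -1;
            return efd;     /* becomes readable on every guest write to the address */
    }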
@@ -3309,6 +3319,18 @@ Valid values for 'type' are:
    to ignore the request, or to gather VM memory core dump and/or
    reset/shutdown of the VM.
 
+		/* KVM_EXIT_IOAPIC_EOI */
+		struct {
+			__u8 vector;
+		} eoi;
+
+Indicates that the VCPU's in-kernel local APIC received an EOI for a
+level-triggered IOAPIC interrupt.  This exit only triggers when the
+IOAPIC is implemented in userspace (i.e. KVM_CAP_SPLIT_IRQCHIP is enabled);
+the userspace IOAPIC should process the EOI and retrigger the interrupt if
+it is still asserted.  Vector is the LAPIC interrupt vector for which the
+EOI was received.
+
 		/* Fix the size of the union. */
 		char padding[256];
 	};
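A hedged sketch of how a userspace IOAPIC model might consume the KVM_EXIT_IOAPIC_EOI exit above after KVM_RUN returns; ioapic_handle_eoi() is a hypothetical helper, not part of the KVM API:

    #include <linux/kvm.h>

    void ioapic_handle_eoi(unsigned char vector);  /* hypothetical userspace IOAPIC hook */

    static void handle_exit(struct kvm_run *run)   /* run = mmap()ed vcpu run area */
    {
            switch (run->exit_reason) {
            case KVM_EXIT_IOAPIC_EOI:
                    /* Clear Remote IRR for the pin routed to this vector and
                     * re-inject it if the line is still asserted. */
                    ioapic_handle_eoi(run->eoi.vector);
                    break;
            default:
                    break;
            }
    }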
@@ -3627,6 +3649,26 @@ struct {
 
 KVM handlers should exit to userspace with rc = -EREMOTE.
 
+7.5 KVM_CAP_SPLIT_IRQCHIP
+
+Architectures: x86
+Parameters: args[0] - number of routes reserved for userspace IOAPICs
+Returns: 0 on success, -1 on error
+
+Create a local apic for each processor in the kernel. This can be used
+instead of KVM_CREATE_IRQCHIP if the userspace VMM wishes to emulate the
+IOAPIC and PIC (and also the PIT, even though this has to be enabled
+separately).
+
+This capability also enables in kernel routing of interrupt requests;
+when KVM_CAP_SPLIT_IRQCHIP only routes of KVM_IRQ_ROUTING_MSI type are
+used in the IRQ routing table.  The first args[0] MSI routes are reserved
+for the IOAPIC pins.  Whenever the LAPIC receives an EOI for these routes,
+a KVM_EXIT_IOAPIC_EOI vmexit will be reported to userspace.
+
+Fails if VCPU has already been created, or if the irqchip is already in the
+kernel (i.e. KVM_CREATE_IRQCHIP has already been called).
+
 
 8. Other capabilities.
 ----------------------
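A minimal sketch of enabling the capability described in 7.5 above; the value 24 simply mirrors the usual number of IOAPIC pins and is an assumption of this example, not a requirement of the API:

    #include <sys/ioctl.h>
    #include <linux/kvm.h>

    static int enable_split_irqchip(int vm_fd)
    {
            struct kvm_enable_cap cap = {
                    .cap = KVM_CAP_SPLIT_IRQCHIP,
                    .args[0] = 24,  /* MSI routes reserved for userspace IOAPIC pins */
            };

            /* Must run before any vCPU exists and without KVM_CREATE_IRQCHIP. */
            return ioctl(vm_fd, KVM_ENABLE_CAP, &cap);
    }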

+ 187 - 0
Documentation/virtual/kvm/arm/vgic-mapped-irqs.txt

@@ -0,0 +1,187 @@
+KVM/ARM VGIC Forwarded Physical Interrupts
+==========================================
+
+The KVM/ARM code implements software support for the ARM Generic
+Interrupt Controller's (GIC's) hardware support for virtualization by
+allowing software to inject virtual interrupts to a VM, which the guest
+OS sees as regular interrupts.  The code is famously known as the VGIC.
+
+Some of these virtual interrupts, however, correspond to physical
+interrupts from real physical devices.  One example could be the
+architected timer, which itself supports virtualization, and therefore
+lets a guest OS program the hardware device directly to raise an
+interrupt at some point in time.  When such an interrupt is raised, the
+host OS initially handles the interrupt and must somehow signal this
+event as a virtual interrupt to the guest.  Another example could be a
+passthrough device, where the physical interrupts are initially handled
+by the host, but the device driver for the device lives in the guest OS
+and KVM must therefore somehow inject a virtual interrupt on behalf of
+the physical one to the guest OS.
+
+These virtual interrupts corresponding to a physical interrupt on the
+host are called forwarded physical interrupts, but are also sometimes
+referred to as 'virtualized physical interrupts' and 'mapped interrupts'.
+
+Forwarded physical interrupts are handled slightly differently compared
+to virtual interrupts generated purely by a software emulated device.
+
+
+The HW bit
+----------
+Virtual interrupts are signalled to the guest by programming the List
+Registers (LRs) on the GIC before running a VCPU.  The LR is programmed
+with the virtual IRQ number and the state of the interrupt (Pending,
+Active, or Pending+Active).  When the guest ACKs and EOIs a virtual
+interrupt, the LR state moves from Pending to Active, and finally to
+inactive.
+
+The LRs include an extra bit, called the HW bit.  When this bit is set,
+KVM must also program an additional field in the LR, the physical IRQ
+number, to link the virtual with the physical IRQ.
+
+When the HW bit is set, KVM must EITHER set the Pending OR the Active
+bit, never both at the same time.
+
+Setting the HW bit causes the hardware to deactivate the physical
+interrupt on the physical distributor when the guest deactivates the
+corresponding virtual interrupt.
+
+
+Forwarded Physical Interrupts Life Cycle
+----------------------------------------
+
+The state of forwarded physical interrupts is managed in the following way:
+
+  - The physical interrupt is acked by the host, and becomes active on
+    the physical distributor (*).
+  - KVM sets the LR.Pending bit, because this is the only way the GICV
+    interface is going to present it to the guest.
+  - LR.Pending will stay set as long as the guest has not acked the interrupt.
+  - LR.Pending transitions to LR.Active on the guest read of the IAR, as
+    expected.
+  - On guest EOI, the *physical distributor* active bit gets cleared,
+    but the LR.Active is left untouched (set).
+  - KVM clears the LR on VM exits when the physical distributor
+    active state has been cleared.
+
+(*): The host handling is slightly more complicated.  For some forwarded
+interrupts (shared), KVM directly sets the active state on the physical
+distributor before entering the guest, because the interrupt is never actually
+handled on the host (see details on the timer as an example below).  For other
+forwarded interrupts (non-shared) the host does not deactivate the interrupt
+when the host ISR completes, but leaves the interrupt active until the guest
+deactivates it.  Leaving the interrupt active is allowed, because Linux
+configures the physical GIC with EOIMode=1, which causes EOI operations to
+perform a priority drop allowing the GIC to receive other interrupts of the
+default priority.
+
+
+Forwarded Edge and Level Triggered PPIs and SPIs
+------------------------------------------------
+Forwarded physical interrupts injected should always be active on the
+physical distributor when injected to a guest.
+
+Level-triggered interrupts will keep the interrupt line to the GIC
+asserted, typically until the guest programs the device to deassert the
+line.  This means that the interrupt will remain pending on the physical
+distributor until the guest has reprogrammed the device.  Since we
+always run the VM with interrupts enabled on the CPU, a pending
+interrupt will exit the guest as soon as we switch into the guest,
+preventing the guest from ever making progress as the process repeats
+over and over.  Therefore, the active state on the physical distributor
+must be set when entering the guest, preventing the GIC from forwarding
+the pending interrupt to the CPU.  As soon as the guest deactivates the
+interrupt, the physical line is sampled by the hardware again and the host
+takes a new interrupt if and only if the physical line is still asserted.
+
+Edge-triggered interrupts do not exhibit the same problem with
+preventing guest execution that level-triggered interrupts do.  One
+option is to not use HW bit at all, and inject edge-triggered interrupts
+from a physical device as pure virtual interrupts.  But that would
+potentially slow down handling of the interrupt in the guest, because a
+physical interrupt occurring in the middle of the guest ISR would
+preempt the guest for the host to handle the interrupt.  Additionally,
+if you configure the system to handle interrupts on a separate physical
+core from that running your VCPU, you still have to interrupt the VCPU
+to queue the pending state onto the LR, even though the guest won't use
+this information until the guest ISR completes.  Therefore, the HW
+bit should always be set for forwarded edge-triggered interrupts.  With
+the HW bit set, the virtual interrupt is injected and additional
+physical interrupts occurring before the guest deactivates the interrupt
+simply mark the state on the physical distributor as Pending+Active.  As
+soon as the guest deactivates the interrupt, the host takes another
+interrupt if and only if there was a physical interrupt between injecting
+the forwarded interrupt to the guest and the guest deactivating the
+interrupt.
+
+Consequently, whenever we schedule a VCPU with one or more LRs with the
+HW bit set, the interrupt must also be active on the physical
+distributor.
+
+
+Forwarded LPIs
+--------------
+LPIs, introduced in GICv3, are always edge-triggered and do not have an
+active state.  They become pending when a device signal them, and as
+soon as they are acked by the CPU, they are inactive again.
+
+It therefore doesn't make sense, and is not supported, to set the HW bit
+for physical LPIs that are forwarded to a VM as virtual interrupts,
+typically virtual SPIs.
+
+For LPIs, there is no other choice than to preempt the VCPU thread if
+necessary, and queue the pending state onto the LR.
+
+
+Putting It Together: The Architected Timer
+------------------------------------------
+The architected timer is a device that signals interrupts with level
+triggered semantics.  The timer hardware is directly accessed by VCPUs
+which program the timer to fire at some point in time.  Each VCPU on a
+system programs the timer to fire at different times, and therefore the
+hardware is multiplexed between multiple VCPUs.  This is implemented by
+context-switching the timer state along with each VCPU thread.
+
+However, this means that a scenario like the following is entirely
+possible, and in fact, typical:
+
+1.  KVM runs the VCPU
+2.  The guest programs the time to fire in T+100
+3.  The guest is idle and calls WFI (wait-for-interrupts)
+4.  The hardware traps to the host
+5.  KVM stores the timer state to memory and disables the hardware timer
+6.  KVM schedules a soft timer to fire in T+(100 - time since step 2)
+7.  KVM puts the VCPU thread to sleep (on a waitqueue)
+8.  The soft timer fires, waking up the VCPU thread
+9.  KVM reprograms the timer hardware with the VCPU's values
+10. KVM marks the timer interrupt as active on the physical distributor
+11. KVM injects a forwarded physical interrupt to the guest
+12. KVM runs the VCPU
+
+Notice that KVM injects a forwarded physical interrupt in step 11 without
+the corresponding interrupt having actually fired on the host.  That is
+exactly why we mark the timer interrupt as active in step 10, because
+the active state on the physical distributor is part of the state
+belonging to the timer hardware, which is context-switched along with
+the VCPU thread.
+
+If the guest does not idle because it is busy, the flow looks like this
+instead:
+
+1.  KVM runs the VCPU
+2.  The guest programs the time to fire in T+100
+4.  At T+100 the timer fires and a physical IRQ causes the VM to exit
+    (note that this initially only traps to EL2 and does not run the host ISR
+    until KVM has returned to the host).
+5.  With interrupts still disabled on the CPU coming back from the guest, KVM
+    stores the virtual timer state to memory and disables the virtual hw timer.
+6.  KVM looks at the timer state (in memory) and injects a forwarded physical
+    interrupt because it concludes the timer has expired.
+7.  KVM marks the timer interrupt as active on the physical distributor
+7.  KVM enables the timer, enables interrupts, and runs the VCPU
+
+Notice that again the forwarded physical interrupt is injected to the
+guest without having actually been handled on the host.  In this case it
+is because the physical interrupt is never actually seen by the host because the
+timer is disabled upon guest return, and the virtual forwarded interrupt is
+injected on the KVM guest entry path.
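For orientation, a rough sketch of the GICv2 list-register encoding the document above keeps referring to.  The field layout comes from the GICv2 architecture specification (GICH_LR), not from this patch, so treat it purely as an illustration:

    #include <stdint.h>

    #define GICH_LR_VIRTUAL_ID(v)   ((uint32_t)(v) & 0x3ff)          /* bits  9:0  */
    #define GICH_LR_PHYSICAL_ID(p)  (((uint32_t)(p) & 0x3ff) << 10)  /* bits 19:10 */
    #define GICH_LR_STATE_PENDING   (1u << 28)
    #define GICH_LR_STATE_ACTIVE    (1u << 29)
    #define GICH_LR_HW              (1u << 31)

    /* A forwarded physical interrupt: HW=1 links virtual IRQ 27 (the virtual
     * timer PPI) to physical IRQ 27, and exactly one of Pending/Active is set,
     * as the rules above require. */
    uint32_t lr = GICH_LR_HW | GICH_LR_STATE_PENDING |
                  GICH_LR_PHYSICAL_ID(27) | GICH_LR_VIRTUAL_ID(27);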

+ 10 - 8
Documentation/virtual/kvm/devices/arm-vgic.txt

@@ -44,28 +44,29 @@ Groups:
   Attributes:
     The attr field of kvm_device_attr encodes two values:
     bits:     | 63   ....  40 | 39 ..  32  |  31   ....    0 |
-    values:   |    reserved   |   cpu id   |      offset     |
+    values:   |    reserved   | vcpu_index |      offset     |
 
     All distributor regs are (rw, 32-bit)
 
     The offset is relative to the "Distributor base address" as defined in the
     GICv2 specs.  Getting or setting such a register has the same effect as
-    reading or writing the register on the actual hardware from the cpu
-    specified with cpu id field.  Note that most distributor fields are not
-    banked, but return the same value regardless of the cpu id used to access
-    the register.
+    reading or writing the register on the actual hardware from the cpu whose
+    index is specified with the vcpu_index field.  Note that most distributor
+    fields are not banked, but return the same value regardless of the
+    vcpu_index used to access the register.
   Limitations:
     - Priorities are not implemented, and registers are RAZ/WI
     - Currently only implemented for KVM_DEV_TYPE_ARM_VGIC_V2.
   Errors:
-    -ENODEV: Getting or setting this register is not yet supported
+    -ENXIO: Getting or setting this register is not yet supported
     -EBUSY: One or more VCPUs are running
+    -EINVAL: Invalid vcpu_index supplied
 
   KVM_DEV_ARM_VGIC_GRP_CPU_REGS
   Attributes:
     The attr field of kvm_device_attr encodes two values:
     bits:     | 63   ....  40 | 39 ..  32  |  31   ....    0 |
-    values:   |    reserved   |   cpu id   |      offset     |
+    values:   |    reserved   | vcpu_index |      offset     |
 
     All CPU interface regs are (rw, 32-bit)
 
@@ -91,8 +92,9 @@ Groups:
     - Priorities are not implemented, and registers are RAZ/WI
     - Currently only implemented for KVM_DEV_TYPE_ARM_VGIC_V2.
   Errors:
-    -ENODEV: Getting or setting this register is not yet supported
+    -ENXIO: Getting or setting this register is not yet supported
     -EBUSY: One or more VCPUs are running
+    -EINVAL: Invalid vcpu_index supplied
 
   KVM_DEV_ARM_VGIC_GRP_NR_IRQS
   Attributes:
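A brief sketch of composing the DIST_REGS/CPU_REGS attr encoding shown above from userspace on an ARM host; vgic_fd is assumed to come from KVM_CREATE_DEVICE with KVM_DEV_TYPE_ARM_VGIC_V2, and the register offset is an example:

    #include <stdint.h>
    #include <sys/ioctl.h>
    #include <linux/kvm.h>

    static int vgic_read_dist_reg(int vgic_fd, uint32_t vcpu_index,
                                  uint32_t offset, uint32_t *val)
    {
            struct kvm_device_attr attr = {
                    .group = KVM_DEV_ARM_VGIC_GRP_DIST_REGS,
                    /* bits 39..32 = vcpu_index, bits 31..0 = register offset */
                    .attr  = ((uint64_t)vcpu_index << 32) | offset,
                    .addr  = (uint64_t)(uintptr_t)val,
            };

            return ioctl(vgic_fd, KVM_GET_DEVICE_ATTR, &attr);
    }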

+ 12 - 0
Documentation/virtual/kvm/locking.txt

@@ -166,3 +166,15 @@ Comment:	The srcu read lock must be held while accessing memslots (e.g.
 		MMIO/PIO address->device structure mapping (kvm->buses).
 		The srcu index can be stored in kvm_vcpu->srcu_idx per vcpu
 		if it is needed by multiple functions.
+
+Name:		blocked_vcpu_on_cpu_lock
+Type:		spinlock_t
+Arch:		x86
+Protects:	blocked_vcpu_on_cpu
+Comment:	This is a per-CPU lock and it is used for VT-d posted-interrupts.
+		When VT-d posted-interrupts is supported and the VM has assigned
+		devices, we put the blocked vCPU on the list blocked_vcpu_on_cpu
+		protected by blocked_vcpu_on_cpu_lock, when VT-d hardware issues
+		wakeup notification event since external interrupts from the
+		assigned devices happens, we will find the vCPU on the list to
+		wakeup.

+ 7 - 0
MAINTAINERS

@@ -11348,6 +11348,13 @@ L:	netdev@vger.kernel.org
 S:	Maintained
 F:	drivers/net/ethernet/via/via-velocity.*
 
+VIRT LIB
+M:	Alex Williamson <alex.williamson@redhat.com>
+M:	Paolo Bonzini <pbonzini@redhat.com>
+L:	kvm@vger.kernel.org
+S:	Supported
+F:	virt/lib/
+
 VIVID VIRTUAL VIDEO DRIVER
 M:	Hans Verkuil <hverkuil@xs4all.nl>
 L:	linux-media@vger.kernel.org

+ 6 - 4
Makefile

@@ -550,6 +550,7 @@ drivers-y	:= drivers/ sound/ firmware/
 net-y		:= net/
 libs-y		:= lib/
 core-y		:= usr/
+virt-y		:= virt/
 endif # KBUILD_EXTMOD
 
 ifeq ($(dot-config),1)
@@ -882,10 +883,10 @@ core-y		+= kernel/ certs/ mm/ fs/ ipc/ security/ crypto/ block/
 
 vmlinux-dirs	:= $(patsubst %/,%,$(filter %/, $(init-y) $(init-m) \
 		     $(core-y) $(core-m) $(drivers-y) $(drivers-m) \
-		     $(net-y) $(net-m) $(libs-y) $(libs-m)))
+		     $(net-y) $(net-m) $(libs-y) $(libs-m) $(virt-y)))
 
 vmlinux-alldirs	:= $(sort $(vmlinux-dirs) $(patsubst %/,%,$(filter %/, \
-		     $(init-) $(core-) $(drivers-) $(net-) $(libs-))))
+		     $(init-) $(core-) $(drivers-) $(net-) $(libs-) $(virt-))))
 
 init-y		:= $(patsubst %/, %/built-in.o, $(init-y))
 core-y		:= $(patsubst %/, %/built-in.o, $(core-y))
@@ -894,14 +895,15 @@ net-y		:= $(patsubst %/, %/built-in.o, $(net-y))
 libs-y1		:= $(patsubst %/, %/lib.a, $(libs-y))
 libs-y2		:= $(patsubst %/, %/built-in.o, $(libs-y))
 libs-y		:= $(libs-y1) $(libs-y2)
+virt-y		:= $(patsubst %/, %/built-in.o, $(virt-y))
 
 # Externally visible symbols (used by link-vmlinux.sh)
 export KBUILD_VMLINUX_INIT := $(head-y) $(init-y)
-export KBUILD_VMLINUX_MAIN := $(core-y) $(libs-y) $(drivers-y) $(net-y)
+export KBUILD_VMLINUX_MAIN := $(core-y) $(libs-y) $(drivers-y) $(net-y) $(virt-y)
 export KBUILD_LDS          := arch/$(SRCARCH)/kernel/vmlinux.lds
 export LDFLAGS_vmlinux
 # used by scripts/pacmage/Makefile
-export KBUILD_ALLDIRS := $(sort $(filter-out arch/%,$(vmlinux-alldirs)) arch Documentation include samples scripts tools virt)
+export KBUILD_ALLDIRS := $(sort $(filter-out arch/%,$(vmlinux-alldirs)) arch Documentation include samples scripts tools)
 
 vmlinux-deps := $(KBUILD_LDS) $(KBUILD_VMLINUX_INIT) $(KBUILD_VMLINUX_MAIN)
 

+ 20 - 0
arch/arm/include/asm/kvm_arm.h

@@ -218,4 +218,24 @@
 #define HSR_DABT_CM		(1U << 8)
 #define HSR_DABT_EA		(1U << 9)
 
+#define kvm_arm_exception_type	\
+	{0, "RESET" }, 		\
+	{1, "UNDEFINED" },	\
+	{2, "SOFTWARE" },	\
+	{3, "PREF_ABORT" },	\
+	{4, "DATA_ABORT" },	\
+	{5, "IRQ" },		\
+	{6, "FIQ" },		\
+	{7, "HVC" }
+
+#define HSRECN(x) { HSR_EC_##x, #x }
+
+#define kvm_arm_exception_class \
+	HSRECN(UNKNOWN), HSRECN(WFI), HSRECN(CP15_32), HSRECN(CP15_64), \
+	HSRECN(CP14_MR), HSRECN(CP14_LS), HSRECN(CP_0_13), HSRECN(CP10_ID), \
+	HSRECN(JAZELLE), HSRECN(BXJ), HSRECN(CP14_64), HSRECN(SVC_HYP), \
+	HSRECN(HVC), HSRECN(SMC), HSRECN(IABT), HSRECN(IABT_HYP), \
+	HSRECN(DABT), HSRECN(DABT_HYP)
+
+
 #endif /* __ARM_KVM_ARM_H__ */

+ 4 - 1
arch/arm/include/asm/kvm_host.h

@@ -126,7 +126,10 @@ struct kvm_vcpu_arch {
 	 * here.
 	 */
 
-	/* Don't run the guest on this vcpu */
+	/* vcpu power-off state */
+	bool power_off;
+
+	 /* Don't run the guest (internal implementation need) */
 	bool pause;
 
 	/* IO related fields */

+ 2 - 0
arch/arm/kvm/Kconfig

@@ -46,4 +46,6 @@ config KVM_ARM_HOST
 	---help---
 	  Provides host support for ARM processors.
 
+source drivers/vhost/Kconfig
+
 endif # VIRTUALIZATION

+ 60 - 16
arch/arm/kvm/arm.c

@@ -271,6 +271,16 @@ int kvm_cpu_has_pending_timer(struct kvm_vcpu *vcpu)
 	return kvm_timer_should_fire(vcpu);
 }
 
+void kvm_arch_vcpu_blocking(struct kvm_vcpu *vcpu)
+{
+	kvm_timer_schedule(vcpu);
+}
+
+void kvm_arch_vcpu_unblocking(struct kvm_vcpu *vcpu)
+{
+	kvm_timer_unschedule(vcpu);
+}
+
 int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu)
 {
 	/* Force users to call KVM_ARM_VCPU_INIT */
@@ -308,7 +318,7 @@ void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
 int kvm_arch_vcpu_ioctl_get_mpstate(struct kvm_vcpu *vcpu,
 				    struct kvm_mp_state *mp_state)
 {
-	if (vcpu->arch.pause)
+	if (vcpu->arch.power_off)
 		mp_state->mp_state = KVM_MP_STATE_STOPPED;
 	else
 		mp_state->mp_state = KVM_MP_STATE_RUNNABLE;
@@ -321,10 +331,10 @@ int kvm_arch_vcpu_ioctl_set_mpstate(struct kvm_vcpu *vcpu,
 {
 	switch (mp_state->mp_state) {
 	case KVM_MP_STATE_RUNNABLE:
-		vcpu->arch.pause = false;
+		vcpu->arch.power_off = false;
 		break;
 	case KVM_MP_STATE_STOPPED:
-		vcpu->arch.pause = true;
+		vcpu->arch.power_off = true;
 		break;
 	default:
 		return -EINVAL;
@@ -342,7 +352,8 @@ int kvm_arch_vcpu_ioctl_set_mpstate(struct kvm_vcpu *vcpu,
  */
 int kvm_arch_vcpu_runnable(struct kvm_vcpu *v)
 {
-	return !!v->arch.irq_lines || kvm_vgic_vcpu_pending_irq(v);
+	return ((!!v->arch.irq_lines || kvm_vgic_vcpu_pending_irq(v))
+		&& !v->arch.power_off && !v->arch.pause);
 }
 
 /* Just ensure a guest exit from a particular CPU */
@@ -468,11 +479,38 @@ bool kvm_arch_intc_initialized(struct kvm *kvm)
 	return vgic_initialized(kvm);
 }
 
-static void vcpu_pause(struct kvm_vcpu *vcpu)
+static void kvm_arm_halt_guest(struct kvm *kvm) __maybe_unused;
+static void kvm_arm_resume_guest(struct kvm *kvm) __maybe_unused;
+
+static void kvm_arm_halt_guest(struct kvm *kvm)
+{
+	int i;
+	struct kvm_vcpu *vcpu;
+
+	kvm_for_each_vcpu(i, vcpu, kvm)
+		vcpu->arch.pause = true;
+	force_vm_exit(cpu_all_mask);
+}
+
+static void kvm_arm_resume_guest(struct kvm *kvm)
+{
+	int i;
+	struct kvm_vcpu *vcpu;
+
+	kvm_for_each_vcpu(i, vcpu, kvm) {
+		wait_queue_head_t *wq = kvm_arch_vcpu_wq(vcpu);
+
+		vcpu->arch.pause = false;
+		wake_up_interruptible(wq);
+	}
+}
+
+static void vcpu_sleep(struct kvm_vcpu *vcpu)
 {
 	wait_queue_head_t *wq = kvm_arch_vcpu_wq(vcpu);
 
-	wait_event_interruptible(*wq, !vcpu->arch.pause);
+	wait_event_interruptible(*wq, ((!vcpu->arch.power_off) &&
+				       (!vcpu->arch.pause)));
 }
 
 static int kvm_vcpu_initialized(struct kvm_vcpu *vcpu)
@@ -522,8 +560,8 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run)
 
 		update_vttbr(vcpu->kvm);
 
-		if (vcpu->arch.pause)
-			vcpu_pause(vcpu);
+		if (vcpu->arch.power_off || vcpu->arch.pause)
+			vcpu_sleep(vcpu);
 
 		/*
 		 * Disarming the background timer must be done in a
@@ -549,11 +587,12 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run)
 			run->exit_reason = KVM_EXIT_INTR;
 		}
 
-		if (ret <= 0 || need_new_vmid_gen(vcpu->kvm)) {
+		if (ret <= 0 || need_new_vmid_gen(vcpu->kvm) ||
+			vcpu->arch.power_off || vcpu->arch.pause) {
 			local_irq_enable();
+			kvm_timer_sync_hwstate(vcpu);
 			kvm_vgic_sync_hwstate(vcpu);
 			preempt_enable();
-			kvm_timer_sync_hwstate(vcpu);
 			continue;
 		}
 
@@ -596,14 +635,19 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run)
 		 * guest time.
 		 */
 		kvm_guest_exit();
-		trace_kvm_exit(kvm_vcpu_trap_get_class(vcpu), *vcpu_pc(vcpu));
+		trace_kvm_exit(ret, kvm_vcpu_trap_get_class(vcpu), *vcpu_pc(vcpu));
+
+		/*
+		 * We must sync the timer state before the vgic state so that
+		 * the vgic can properly sample the updated state of the
+		 * interrupt line.
+		 */
+		kvm_timer_sync_hwstate(vcpu);
 
 		kvm_vgic_sync_hwstate(vcpu);
 
 		preempt_enable();
 
-		kvm_timer_sync_hwstate(vcpu);
-
 		ret = handle_exit(vcpu, run, ret);
 	}
 
@@ -765,12 +809,12 @@ static int kvm_arch_vcpu_ioctl_vcpu_init(struct kvm_vcpu *vcpu,
 	vcpu_reset_hcr(vcpu);
 
 	/*
-	 * Handle the "start in power-off" case by marking the VCPU as paused.
+	 * Handle the "start in power-off" case.
 	 */
 	if (test_bit(KVM_ARM_VCPU_POWER_OFF, vcpu->arch.features))
-		vcpu->arch.pause = true;
+		vcpu->arch.power_off = true;
 	else
-		vcpu->arch.pause = false;
+		vcpu->arch.power_off = false;
 
 	return 0;
 }

+ 5 - 5
arch/arm/kvm/psci.c

@@ -63,7 +63,7 @@ static unsigned long kvm_psci_vcpu_suspend(struct kvm_vcpu *vcpu)
 
 static void kvm_psci_vcpu_off(struct kvm_vcpu *vcpu)
 {
-	vcpu->arch.pause = true;
+	vcpu->arch.power_off = true;
 }
 
 static unsigned long kvm_psci_vcpu_on(struct kvm_vcpu *source_vcpu)
@@ -87,7 +87,7 @@ static unsigned long kvm_psci_vcpu_on(struct kvm_vcpu *source_vcpu)
 	 */
 	if (!vcpu)
 		return PSCI_RET_INVALID_PARAMS;
-	if (!vcpu->arch.pause) {
+	if (!vcpu->arch.power_off) {
 		if (kvm_psci_version(source_vcpu) != KVM_ARM_PSCI_0_1)
 			return PSCI_RET_ALREADY_ON;
 		else
@@ -115,7 +115,7 @@ static unsigned long kvm_psci_vcpu_on(struct kvm_vcpu *source_vcpu)
 	 * the general puspose registers are undefined upon CPU_ON.
 	 */
 	*vcpu_reg(vcpu, 0) = context_id;
-	vcpu->arch.pause = false;
+	vcpu->arch.power_off = false;
 	smp_mb();		/* Make sure the above is visible */
 
 	wq = kvm_arch_vcpu_wq(vcpu);
@@ -153,7 +153,7 @@ static unsigned long kvm_psci_vcpu_affinity_info(struct kvm_vcpu *vcpu)
 		mpidr = kvm_vcpu_get_mpidr_aff(tmp);
 		if ((mpidr & target_affinity_mask) == target_affinity) {
 			matching_cpus++;
-			if (!tmp->arch.pause)
+			if (!tmp->arch.power_off)
 				return PSCI_0_2_AFFINITY_LEVEL_ON;
 		}
 	}
@@ -179,7 +179,7 @@ static void kvm_prepare_system_event(struct kvm_vcpu *vcpu, u32 type)
 	 * re-initialized.
 	 */
 	kvm_for_each_vcpu(i, tmp, vcpu->kvm) {
-		tmp->arch.pause = true;
+		tmp->arch.power_off = true;
 		kvm_vcpu_kick(tmp);
 	}
 

+ 7 - 3
arch/arm/kvm/trace.h

@@ -25,21 +25,25 @@ TRACE_EVENT(kvm_entry,
 );
 
 TRACE_EVENT(kvm_exit,
-	TP_PROTO(unsigned int exit_reason, unsigned long vcpu_pc),
-	TP_ARGS(exit_reason, vcpu_pc),
+	TP_PROTO(int idx, unsigned int exit_reason, unsigned long vcpu_pc),
+	TP_ARGS(idx, exit_reason, vcpu_pc),
 
 	TP_STRUCT__entry(
+		__field(	int,		idx		)
 		__field(	unsigned int,	exit_reason	)
 		__field(	unsigned long,	vcpu_pc		)
 	),
 
 	TP_fast_assign(
+		__entry->idx			= idx;
 		__entry->exit_reason		= exit_reason;
 		__entry->vcpu_pc		= vcpu_pc;
 	),
 
-	TP_printk("HSR_EC: 0x%04x, PC: 0x%08lx",
+	TP_printk("%s: HSR_EC: 0x%04x (%s), PC: 0x%08lx",
+		  __print_symbolic(__entry->idx, kvm_arm_exception_type),
 		  __entry->exit_reason,
+		  __print_symbolic(__entry->exit_reason, kvm_arm_exception_class),
 		  __entry->vcpu_pc)
 );
 

+ 16 - 0
arch/arm64/include/asm/kvm_arm.h

@@ -200,4 +200,20 @@
 /* Hyp Prefetch Fault Address Register (HPFAR/HDFAR) */
 #define HPFAR_MASK	(~UL(0xf))
 
+#define kvm_arm_exception_type	\
+	{0, "IRQ" }, 		\
+	{1, "TRAP" }
+
+#define ECN(x) { ESR_ELx_EC_##x, #x }
+
+#define kvm_arm_exception_class \
+	ECN(UNKNOWN), ECN(WFx), ECN(CP15_32), ECN(CP15_64), ECN(CP14_MR), \
+	ECN(CP14_LS), ECN(FP_ASIMD), ECN(CP10_ID), ECN(CP14_64), ECN(SVC64), \
+	ECN(HVC64), ECN(SMC64), ECN(SYS64), ECN(IMP_DEF), ECN(IABT_LOW), \
+	ECN(IABT_CUR), ECN(PC_ALIGN), ECN(DABT_LOW), ECN(DABT_CUR), \
+	ECN(SP_ALIGN), ECN(FP_EXC32), ECN(FP_EXC64), ECN(SERROR), \
+	ECN(BREAKPT_LOW), ECN(BREAKPT_CUR), ECN(SOFTSTP_LOW), \
+	ECN(SOFTSTP_CUR), ECN(WATCHPT_LOW), ECN(WATCHPT_CUR), \
+	ECN(BKPT32), ECN(VECTOR32), ECN(BRK64)
+
 #endif /* __ARM64_KVM_ARM_H__ */

+ 4 - 1
arch/arm64/include/asm/kvm_host.h

@@ -149,7 +149,10 @@ struct kvm_vcpu_arch {
 		u32	mdscr_el1;
 	} guest_debug_preserved;
 
-	/* Don't run the guest */
+	/* vcpu power-off state */
+	bool power_off;
+
+	/* Don't run the guest (internal implementation need) */
 	bool pause;
 
 	/* IO related fields */

+ 2 - 0
arch/arm64/kvm/Kconfig

@@ -48,4 +48,6 @@ config KVM_ARM_HOST
 	---help---
 	  Provides host support for ARM processors.
 
+source drivers/vhost/Kconfig
+
 endif # VIRTUALIZATION

+ 8 - 0
arch/arm64/kvm/hyp.S

@@ -880,6 +880,14 @@ __kvm_hyp_panic:
 
 	bl __restore_sysregs
 
+	/*
+	 * Make sure we have a valid host stack, and don't leave junk in the
+	 * frame pointer that will give us a misleading host stack unwinding.
+	 */
+	ldr	x22, [x2, #CPU_GP_REG_OFFSET(CPU_SP_EL1)]
+	msr	sp_el1, x22
+	mov	x29, xzr
+
 1:	adr	x0, __hyp_panic_str
 	adr	x1, 2f
 	ldp	x2, x3, [x1]

+ 2 - 0
arch/mips/include/asm/kvm_host.h

@@ -847,5 +847,7 @@ static inline void kvm_arch_flush_shadow_memslot(struct kvm *kvm,
 		struct kvm_memory_slot *slot) {}
 static inline void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu) {}
 static inline void kvm_arch_sched_in(struct kvm_vcpu *vcpu, int cpu) {}
+static inline void kvm_arch_vcpu_blocking(struct kvm_vcpu *vcpu) {}
+static inline void kvm_arch_vcpu_unblocking(struct kvm_vcpu *vcpu) {}
 
 #endif /* __MIPS_KVM_HOST_H__ */

+ 5 - 0
arch/powerpc/include/asm/disassemble.h

@@ -42,6 +42,11 @@ static inline unsigned int get_dcrn(u32 inst)
 	return ((inst >> 16) & 0x1f) | ((inst >> 6) & 0x3e0);
 }
 
+static inline unsigned int get_tmrn(u32 inst)
+{
+	return ((inst >> 16) & 0x1f) | ((inst >> 6) & 0x3e0);
+}
+
 static inline unsigned int get_rt(u32 inst)
 {
 	return (inst >> 21) & 0x1f;

+ 2 - 0
arch/powerpc/include/asm/kvm_host.h

@@ -716,5 +716,7 @@ static inline void kvm_arch_memslots_updated(struct kvm *kvm, struct kvm_memslot
 static inline void kvm_arch_flush_shadow_all(struct kvm *kvm) {}
 static inline void kvm_arch_sched_in(struct kvm_vcpu *vcpu, int cpu) {}
 static inline void kvm_arch_exit(void) {}
+static inline void kvm_arch_vcpu_blocking(struct kvm_vcpu *vcpu) {}
+static inline void kvm_arch_vcpu_unblocking(struct kvm_vcpu *vcpu) {}
 
 #endif /* __POWERPC_KVM_HOST_H__ */

+ 6 - 0
arch/powerpc/include/asm/reg_booke.h

@@ -742,6 +742,12 @@
 #define MMUBE1_VBE4		0x00000002
 #define MMUBE1_VBE5		0x00000001
 
+#define TMRN_TMCFG0      16	/* Thread Management Configuration Register 0 */
+#define TMRN_TMCFG0_NPRIBITS       0x003f0000 /* Bits of thread priority */
+#define TMRN_TMCFG0_NPRIBITS_SHIFT 16
+#define TMRN_TMCFG0_NATHRD         0x00003f00 /* Number of active threads */
+#define TMRN_TMCFG0_NATHRD_SHIFT   8
+#define TMRN_TMCFG0_NTHRD          0x0000003f /* Number of threads */
 #define TMRN_IMSR0	0x120	/* Initial MSR Register 0 (e6500) */
 #define TMRN_IMSR1	0x121	/* Initial MSR Register 1 (e6500) */
 #define TMRN_INIA0	0x140	/* Next Instruction Address Register 0 */

+ 2 - 1
arch/powerpc/kvm/book3s_64_mmu_hv.c

@@ -70,7 +70,8 @@ long kvmppc_alloc_hpt(struct kvm *kvm, u32 *htab_orderp)
 	}
 
 	/* Lastly try successively smaller sizes from the page allocator */
-	while (!hpt && order > PPC_MIN_HPT_ORDER) {
+	/* Only do this if userspace didn't specify a size via ioctl */
+	while (!hpt && order > PPC_MIN_HPT_ORDER && !htab_orderp) {
 		hpt = __get_free_pages(GFP_KERNEL|__GFP_ZERO|__GFP_REPEAT|
 				       __GFP_NOWARN, order - PAGE_SHIFT);
 		if (!hpt)

+ 2 - 0
arch/powerpc/kvm/book3s_hv_rm_mmu.c

@@ -470,6 +470,8 @@ long kvmppc_do_h_remove(struct kvm *kvm, unsigned long flags,
 	note_hpte_modification(kvm, rev);
 	unlock_hpte(hpte, 0);
 
+	if (v & HPTE_V_ABSENT)
+		v = (v & ~HPTE_V_ABSENT) | HPTE_V_VALID;
 	hpret[0] = v;
 	hpret[1] = r;
 	return H_SUCCESS;

+ 22 - 7
arch/powerpc/kvm/book3s_hv_rmhandlers.S

@@ -150,6 +150,8 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
 	cmpwi	cr1, r12, BOOK3S_INTERRUPT_MACHINE_CHECK
 	cmpwi	r12, BOOK3S_INTERRUPT_EXTERNAL
 	beq	11f
+	cmpwi	r12, BOOK3S_INTERRUPT_H_DOORBELL
+	beq 	15f	/* Invoke the H_DOORBELL handler */
 	cmpwi	cr2, r12, BOOK3S_INTERRUPT_HMI
 	beq	cr2, 14f			/* HMI check */
 
@@ -174,6 +176,10 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
 	mtspr	SPRN_HSRR1, r7
 	b	hmi_exception_after_realmode
 
+15:	mtspr SPRN_HSRR0, r8
+	mtspr SPRN_HSRR1, r7
+	ba    0xe80
+
 kvmppc_primary_no_guest:
 	/* We handle this much like a ceded vcpu */
 	/* put the HDEC into the DEC, since HDEC interrupts don't wake us */
@@ -2377,7 +2383,6 @@ machine_check_realmode:
 	mr	r3, r9		/* get vcpu pointer */
 	bl	kvmppc_realmode_machine_check
 	nop
-	cmpdi	r3, 0		/* Did we handle MCE ? */
 	ld	r9, HSTATE_KVM_VCPU(r13)
 	li	r12, BOOK3S_INTERRUPT_MACHINE_CHECK
 	/*
@@ -2390,13 +2395,18 @@ machine_check_realmode:
 	 * The old code used to return to host for unhandled errors which
 	 * was causing guest to hang with soft lockups inside guest and
 	 * makes it difficult to recover guest instance.
+	 *
+	 * if we receive machine check with MSR(RI=0) then deliver it to
+	 * guest as machine check causing guest to crash.
 	 */
-	ld	r10, VCPU_PC(r9)
 	ld	r11, VCPU_MSR(r9)
+	andi.	r10, r11, MSR_RI	/* check for unrecoverable exception */
+	beq	1f			/* Deliver a machine check to guest */
+	ld	r10, VCPU_PC(r9)
+	cmpdi	r3, 0		/* Did we handle MCE ? */
 	bne	2f	/* Continue guest execution. */
 	/* If not, deliver a machine check.  SRR0/1 are already set */
-	li	r10, BOOK3S_INTERRUPT_MACHINE_CHECK
-	ld	r11, VCPU_MSR(r9)
+1:	li	r10, BOOK3S_INTERRUPT_MACHINE_CHECK
 	bl	kvmppc_msr_interrupt
 2:	b	fast_interrupt_c_return
 
@@ -2436,14 +2446,19 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
 
 	/* hypervisor doorbell */
 3:	li	r12, BOOK3S_INTERRUPT_H_DOORBELL
+
+	/*
+	 * Clear the doorbell as we will invoke the handler
+	 * explicitly in the guest exit path.
+	 */
+	lis	r6, (PPC_DBELL_SERVER << (63-36))@h
+	PPC_MSGCLR(6)
 	/* see if it's a host IPI */
 	li	r3, 1
 	lbz	r0, HSTATE_HOST_IPI(r13)
 	cmpwi	r0, 0
 	bnelr
-	/* if not, clear it and return -1 */
-	lis	r6, (PPC_DBELL_SERVER << (63-36))@h
-	PPC_MSGCLR(6)
+	/* if not, return -1 */
 	li	r3, -1
 	blr
 

+ 2 - 1
arch/powerpc/kvm/e500.c

@@ -237,7 +237,8 @@ void kvmppc_e500_tlbil_one(struct kvmppc_vcpu_e500 *vcpu_e500,
                            struct kvm_book3e_206_tlb_entry *gtlbe)
 {
 	struct vcpu_id_table *idt = vcpu_e500->idt;
-	unsigned int pr, tid, ts, pid;
+	unsigned int pr, tid, ts;
+	int pid;
 	u32 val, eaddr;
 	unsigned long flags;
 

+ 19 - 0
arch/powerpc/kvm/e500_emulate.c

@@ -15,6 +15,7 @@
 #include <asm/kvm_ppc.h>
 #include <asm/disassemble.h>
 #include <asm/dbell.h>
+#include <asm/reg_booke.h>
 
 #include "booke.h"
 #include "e500.h"
@@ -22,6 +23,7 @@
 #define XOP_DCBTLS  166
 #define XOP_MSGSND  206
 #define XOP_MSGCLR  238
+#define XOP_MFTMR   366
 #define XOP_TLBIVAX 786
 #define XOP_TLBSX   914
 #define XOP_TLBRE   946
@@ -113,6 +115,19 @@ static int kvmppc_e500_emul_dcbtls(struct kvm_vcpu *vcpu)
 	return EMULATE_DONE;
 }
 
+static int kvmppc_e500_emul_mftmr(struct kvm_vcpu *vcpu, unsigned int inst,
+				  int rt)
+{
+	/* Expose one thread per vcpu */
+	if (get_tmrn(inst) == TMRN_TMCFG0) {
+		kvmppc_set_gpr(vcpu, rt,
+			       1 | (1 << TMRN_TMCFG0_NATHRD_SHIFT));
+		return EMULATE_DONE;
+	}
+
+	return EMULATE_FAIL;
+}
+
 int kvmppc_core_emulate_op_e500(struct kvm_run *run, struct kvm_vcpu *vcpu,
 				unsigned int inst, int *advance)
 {
@@ -165,6 +180,10 @@ int kvmppc_core_emulate_op_e500(struct kvm_run *run, struct kvm_vcpu *vcpu,
 			emulated = kvmppc_e500_emul_tlbivax(vcpu, ea);
 			break;
 
+		case XOP_MFTMR:
+			emulated = kvmppc_e500_emul_mftmr(vcpu, inst, rt);
+			break;
+
 		case XOP_EHPRIV:
 			emulated = kvmppc_e500_emul_ehpriv(run, vcpu, inst,
 							   advance);

+ 2 - 2
arch/powerpc/kvm/e500_mmu_host.c

@@ -406,7 +406,7 @@ static inline int kvmppc_e500_shadow_map(struct kvmppc_vcpu_e500 *vcpu_e500,
 
 			for (; tsize > BOOK3E_PAGESZ_4K; tsize -= 2) {
 				unsigned long gfn_start, gfn_end;
-				tsize_pages = 1 << (tsize - 2);
+				tsize_pages = 1UL << (tsize - 2);
 
 				gfn_start = gfn & ~(tsize_pages - 1);
 				gfn_end = gfn_start + tsize_pages;
@@ -447,7 +447,7 @@ static inline int kvmppc_e500_shadow_map(struct kvmppc_vcpu_e500 *vcpu_e500,
 	}
 
 	if (likely(!pfnmap)) {
-		tsize_pages = 1 << (tsize + 10 - PAGE_SHIFT);
+		tsize_pages = 1UL << (tsize + 10 - PAGE_SHIFT);
 		pfn = gfn_to_pfn_memslot(slot, gfn);
 		if (is_error_noslot_pfn(pfn)) {
 			if (printk_ratelimit())

+ 3 - 0
arch/powerpc/kvm/powerpc.c

@@ -559,6 +559,9 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
 		else
 			r = num_online_cpus();
 		break;
+	case KVM_CAP_NR_MEMSLOTS:
+		r = KVM_USER_MEM_SLOTS;
+		break;
 	case KVM_CAP_MAX_VCPUS:
 		r = KVM_MAX_VCPUS;
 		break;
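For completeness, a small sketch of how userspace would probe the capability added in this hunk; kvm_fd is assumed to be the /dev/kvm file descriptor, and the fallback value is arbitrary:

    #include <sys/ioctl.h>
    #include <linux/kvm.h>

    static int query_max_memslots(int kvm_fd)
    {
            int n = ioctl(kvm_fd, KVM_CHECK_EXTENSION, KVM_CAP_NR_MEMSLOTS);

            return n > 0 ? n : 32;  /* older kernels report 0: assume a modest default */
    }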

+ 2 - 0
arch/s390/include/asm/kvm_host.h

@@ -644,5 +644,7 @@ static inline void kvm_arch_memslots_updated(struct kvm *kvm, struct kvm_memslot
 static inline void kvm_arch_flush_shadow_all(struct kvm *kvm) {}
 static inline void kvm_arch_flush_shadow_memslot(struct kvm *kvm,
 		struct kvm_memory_slot *slot) {}
+static inline void kvm_arch_vcpu_blocking(struct kvm_vcpu *vcpu) {}
+static inline void kvm_arch_vcpu_unblocking(struct kvm_vcpu *vcpu) {}
 
 #endif

+ 21 - 21
arch/s390/kvm/intercept.c

@@ -336,28 +336,28 @@ static int handle_partial_execution(struct kvm_vcpu *vcpu)
 	return -EOPNOTSUPP;
 }
 
-static const intercept_handler_t intercept_funcs[] = {
-	[0x00 >> 2] = handle_noop,
-	[0x04 >> 2] = handle_instruction,
-	[0x08 >> 2] = handle_prog,
-	[0x10 >> 2] = handle_noop,
-	[0x14 >> 2] = handle_external_interrupt,
-	[0x18 >> 2] = handle_noop,
-	[0x1C >> 2] = kvm_s390_handle_wait,
-	[0x20 >> 2] = handle_validity,
-	[0x28 >> 2] = handle_stop,
-	[0x38 >> 2] = handle_partial_execution,
-};
-
 int kvm_handle_sie_intercept(struct kvm_vcpu *vcpu)
 {
-	intercept_handler_t func;
-	u8 code = vcpu->arch.sie_block->icptcode;
-
-	if (code & 3 || (code >> 2) >= ARRAY_SIZE(intercept_funcs))
+	switch (vcpu->arch.sie_block->icptcode) {
+	case 0x00:
+	case 0x10:
+	case 0x18:
+		return handle_noop(vcpu);
+	case 0x04:
+		return handle_instruction(vcpu);
+	case 0x08:
+		return handle_prog(vcpu);
+	case 0x14:
+		return handle_external_interrupt(vcpu);
+	case 0x1c:
+		return kvm_s390_handle_wait(vcpu);
+	case 0x20:
+		return handle_validity(vcpu);
+	case 0x28:
+		return handle_stop(vcpu);
+	case 0x38:
+		return handle_partial_execution(vcpu);
+	default:
 		return -EOPNOTSUPP;
-	func = intercept_funcs[code >> 2];
-	if (func)
-		return func(vcpu);
-	return -EOPNOTSUPP;
+	}
 }

+ 43 - 73
arch/s390/kvm/interrupt.c

@@ -51,11 +51,9 @@ static int psw_mchk_disabled(struct kvm_vcpu *vcpu)
 
 
 static int psw_interrupts_disabled(struct kvm_vcpu *vcpu)
 static int psw_interrupts_disabled(struct kvm_vcpu *vcpu)
 {
 {
-	if ((vcpu->arch.sie_block->gpsw.mask & PSW_MASK_PER) ||
-	    (vcpu->arch.sie_block->gpsw.mask & PSW_MASK_IO) ||
-	    (vcpu->arch.sie_block->gpsw.mask & PSW_MASK_EXT))
-		return 0;
-	return 1;
+	return psw_extint_disabled(vcpu) &&
+	       psw_ioint_disabled(vcpu) &&
+	       psw_mchk_disabled(vcpu);
 }
 }
 
 
 static int ckc_interrupts_enabled(struct kvm_vcpu *vcpu)
 static int ckc_interrupts_enabled(struct kvm_vcpu *vcpu)
@@ -71,13 +69,8 @@ static int ckc_interrupts_enabled(struct kvm_vcpu *vcpu)
 
 
 static int ckc_irq_pending(struct kvm_vcpu *vcpu)
 static int ckc_irq_pending(struct kvm_vcpu *vcpu)
 {
 {
-	preempt_disable();
-	if (!(vcpu->arch.sie_block->ckc <
-	      get_tod_clock_fast() + vcpu->arch.sie_block->epoch)) {
-		preempt_enable();
+	if (vcpu->arch.sie_block->ckc >= kvm_s390_get_tod_clock_fast(vcpu->kvm))
 		return 0;
 		return 0;
-	}
-	preempt_enable();
 	return ckc_interrupts_enabled(vcpu);
 	return ckc_interrupts_enabled(vcpu);
 }
 }
 
 
@@ -109,14 +102,10 @@ static inline u8 int_word_to_isc(u32 int_word)
 	return (int_word & 0x38000000) >> 27;
 	return (int_word & 0x38000000) >> 27;
 }
 }
 
 
-static inline unsigned long pending_floating_irqs(struct kvm_vcpu *vcpu)
+static inline unsigned long pending_irqs(struct kvm_vcpu *vcpu)
 {
 {
-	return vcpu->kvm->arch.float_int.pending_irqs;
-}
-
-static inline unsigned long pending_local_irqs(struct kvm_vcpu *vcpu)
-{
-	return vcpu->arch.local_int.pending_irqs;
+	return vcpu->kvm->arch.float_int.pending_irqs |
+	       vcpu->arch.local_int.pending_irqs;
 }
 }
 
 
 static unsigned long disable_iscs(struct kvm_vcpu *vcpu,
 static unsigned long disable_iscs(struct kvm_vcpu *vcpu,
@@ -135,8 +124,7 @@ static unsigned long deliverable_irqs(struct kvm_vcpu *vcpu)
 {
 {
 	unsigned long active_mask;
 	unsigned long active_mask;
 
 
-	active_mask = pending_local_irqs(vcpu);
-	active_mask |= pending_floating_irqs(vcpu);
+	active_mask = pending_irqs(vcpu);
 	if (!active_mask)
 	if (!active_mask)
 		return 0;
 		return 0;
 
 
@@ -204,7 +192,7 @@ static void __set_cpuflag(struct kvm_vcpu *vcpu, u32 flag)
 
 
 static void set_intercept_indicators_io(struct kvm_vcpu *vcpu)
 static void set_intercept_indicators_io(struct kvm_vcpu *vcpu)
 {
 {
-	if (!(pending_floating_irqs(vcpu) & IRQ_PEND_IO_MASK))
+	if (!(pending_irqs(vcpu) & IRQ_PEND_IO_MASK))
 		return;
 	else if (psw_ioint_disabled(vcpu))
 		__set_cpuflag(vcpu, CPUSTAT_IO_INT);
@@ -214,7 +202,7 @@ static void set_intercept_indicators_io(struct kvm_vcpu *vcpu)
 
 
 static void set_intercept_indicators_ext(struct kvm_vcpu *vcpu)
 static void set_intercept_indicators_ext(struct kvm_vcpu *vcpu)
 {
 {
-	if (!(pending_local_irqs(vcpu) & IRQ_PEND_EXT_MASK))
+	if (!(pending_irqs(vcpu) & IRQ_PEND_EXT_MASK))
 		return;
 	if (psw_extint_disabled(vcpu))
 		__set_cpuflag(vcpu, CPUSTAT_EXT_INT);
@@ -224,7 +212,7 @@ static void set_intercept_indicators_ext(struct kvm_vcpu *vcpu)
 
 
 static void set_intercept_indicators_mchk(struct kvm_vcpu *vcpu)
 static void set_intercept_indicators_mchk(struct kvm_vcpu *vcpu)
 {
 {
-	if (!(pending_local_irqs(vcpu) & IRQ_PEND_MCHK_MASK))
+	if (!(pending_irqs(vcpu) & IRQ_PEND_MCHK_MASK))
 		return;
 	if (psw_mchk_disabled(vcpu))
 		vcpu->arch.sie_block->ictl |= ICTL_LPSW;
@@ -815,23 +803,21 @@ int kvm_s390_ext_call_pending(struct kvm_vcpu *vcpu)
 
 
 int kvm_s390_vcpu_has_irq(struct kvm_vcpu *vcpu, int exclude_stop)
 int kvm_s390_vcpu_has_irq(struct kvm_vcpu *vcpu, int exclude_stop)
 {
 {
-	int rc;
+	if (deliverable_irqs(vcpu))
+		return 1;
 
 
-	rc = !!deliverable_irqs(vcpu);
-
-	if (!rc && kvm_cpu_has_pending_timer(vcpu))
-		rc = 1;
+	if (kvm_cpu_has_pending_timer(vcpu))
+		return 1;
 
 
 	/* external call pending and deliverable */
 	/* external call pending and deliverable */
-	if (!rc && kvm_s390_ext_call_pending(vcpu) &&
+	if (kvm_s390_ext_call_pending(vcpu) &&
 	    !psw_extint_disabled(vcpu) &&
 	    (vcpu->arch.sie_block->gcr[0] & 0x2000ul))
-		rc = 1;
-
-	if (!rc && !exclude_stop && kvm_s390_is_stop_irq_pending(vcpu))
-		rc = 1;
+		return 1;
 
 
-	return rc;
+	if (!exclude_stop && kvm_s390_is_stop_irq_pending(vcpu))
+		return 1;
+	return 0;
 }
 }
 
 
 int kvm_cpu_has_pending_timer(struct kvm_vcpu *vcpu)
 int kvm_cpu_has_pending_timer(struct kvm_vcpu *vcpu)
@@ -846,7 +832,7 @@ int kvm_s390_handle_wait(struct kvm_vcpu *vcpu)
 	vcpu->stat.exit_wait_state++;
 	vcpu->stat.exit_wait_state++;
 
 
 	/* fast path */
 	/* fast path */
-	if (kvm_cpu_has_pending_timer(vcpu) || kvm_arch_vcpu_runnable(vcpu))
+	if (kvm_arch_vcpu_runnable(vcpu))
 		return 0;
 
 
 	if (psw_interrupts_disabled(vcpu)) {
 	if (psw_interrupts_disabled(vcpu)) {
@@ -860,9 +846,7 @@ int kvm_s390_handle_wait(struct kvm_vcpu *vcpu)
 		goto no_timer;
 		goto no_timer;
 	}
 	}
 
 
-	preempt_disable();
-	now = get_tod_clock_fast() + vcpu->arch.sie_block->epoch;
-	preempt_enable();
+	now = kvm_s390_get_tod_clock_fast(vcpu->kvm);
 	sltime = tod_to_ns(vcpu->arch.sie_block->ckc - now);
 
 
 	/* underflow */
 	/* underflow */
@@ -901,9 +885,7 @@ enum hrtimer_restart kvm_s390_idle_wakeup(struct hrtimer *timer)
 	u64 now, sltime;
 	u64 now, sltime;
 
 
 	vcpu = container_of(timer, struct kvm_vcpu, arch.ckc_timer);
 	vcpu = container_of(timer, struct kvm_vcpu, arch.ckc_timer);
-	preempt_disable();
-	now = get_tod_clock_fast() + vcpu->arch.sie_block->epoch;
-	preempt_enable();
+	now = kvm_s390_get_tod_clock_fast(vcpu->kvm);
 	sltime = tod_to_ns(vcpu->arch.sie_block->ckc - now);
 
 
 	/*
 	/*
@@ -981,39 +963,30 @@ static int __inject_prog(struct kvm_vcpu *vcpu, struct kvm_s390_irq *irq)
 	trace_kvm_s390_inject_vcpu(vcpu->vcpu_id, KVM_S390_PROGRAM_INT,
 	trace_kvm_s390_inject_vcpu(vcpu->vcpu_id, KVM_S390_PROGRAM_INT,
 				   irq->u.pgm.code, 0);
 				   irq->u.pgm.code, 0);
 
 
-	li->irq.pgm = irq->u.pgm;
+	if (irq->u.pgm.code == PGM_PER) {
+		li->irq.pgm.code |= PGM_PER;
+		/* only modify PER related information */
+		li->irq.pgm.per_address = irq->u.pgm.per_address;
+		li->irq.pgm.per_code = irq->u.pgm.per_code;
+		li->irq.pgm.per_atmid = irq->u.pgm.per_atmid;
+		li->irq.pgm.per_access_id = irq->u.pgm.per_access_id;
+	} else if (!(irq->u.pgm.code & PGM_PER)) {
+		li->irq.pgm.code = (li->irq.pgm.code & PGM_PER) |
+				   irq->u.pgm.code;
+		/* only modify non-PER information */
+		li->irq.pgm.trans_exc_code = irq->u.pgm.trans_exc_code;
+		li->irq.pgm.mon_code = irq->u.pgm.mon_code;
+		li->irq.pgm.data_exc_code = irq->u.pgm.data_exc_code;
+		li->irq.pgm.mon_class_nr = irq->u.pgm.mon_class_nr;
+		li->irq.pgm.exc_access_id = irq->u.pgm.exc_access_id;
+		li->irq.pgm.op_access_id = irq->u.pgm.op_access_id;
+	} else {
+		li->irq.pgm = irq->u.pgm;
+	}
 	set_bit(IRQ_PEND_PROG, &li->pending_irqs);
 	return 0;
 }
 
 
-int kvm_s390_inject_program_int(struct kvm_vcpu *vcpu, u16 code)
-{
-	struct kvm_s390_local_interrupt *li = &vcpu->arch.local_int;
-	struct kvm_s390_irq irq;
-
-	spin_lock(&li->lock);
-	irq.u.pgm.code = code;
-	__inject_prog(vcpu, &irq);
-	BUG_ON(waitqueue_active(li->wq));
-	spin_unlock(&li->lock);
-	return 0;
-}
-
-int kvm_s390_inject_prog_irq(struct kvm_vcpu *vcpu,
-			     struct kvm_s390_pgm_info *pgm_info)
-{
-	struct kvm_s390_local_interrupt *li = &vcpu->arch.local_int;
-	struct kvm_s390_irq irq;
-	int rc;
-
-	spin_lock(&li->lock);
-	irq.u.pgm = *pgm_info;
-	rc = __inject_prog(vcpu, &irq);
-	BUG_ON(waitqueue_active(li->wq));
-	spin_unlock(&li->lock);
-	return rc;
-}
-
 static int __inject_pfault_init(struct kvm_vcpu *vcpu, struct kvm_s390_irq *irq)
 static int __inject_pfault_init(struct kvm_vcpu *vcpu, struct kvm_s390_irq *irq)
 {
 {
 	struct kvm_s390_local_interrupt *li = &vcpu->arch.local_int;
 	struct kvm_s390_local_interrupt *li = &vcpu->arch.local_int;
@@ -1390,12 +1363,9 @@ static void __floating_irq_kick(struct kvm *kvm, u64 type)
 
 
 static int __inject_vm(struct kvm *kvm, struct kvm_s390_interrupt_info *inti)
 static int __inject_vm(struct kvm *kvm, struct kvm_s390_interrupt_info *inti)
 {
 {
-	struct kvm_s390_float_interrupt *fi;
 	u64 type = READ_ONCE(inti->type);
 	u64 type = READ_ONCE(inti->type);
 	int rc;
 	int rc;
 
 
-	fi = &kvm->arch.float_int;
-
 	switch (type) {
 	switch (type) {
 	case KVM_S390_MCHK:
 	case KVM_S390_MCHK:
 		rc = __inject_float_mchk(kvm, inti);
 		rc = __inject_float_mchk(kvm, inti);

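The interrupt.c changes above fold the separate floating and local pending-IRQ accessors into one pending_irqs() that ORs the two bitmasks, so every caller tests a single mask against IRQ_PEND_*_MASK. A minimal sketch of that idea follows; the mask values and struct layout are invented for the example and are not the kernel's.

#include <stdio.h>

/* Hypothetical mask layout, for illustration only. */
#define IRQ_PEND_IO_MASK   0x00ffUL
#define IRQ_PEND_EXT_MASK  0x0f00UL

struct vm_state   { unsigned long float_pending; };
struct vcpu_state { struct vm_state *vm; unsigned long local_pending; };

/* One accessor over both interrupt sources, as in the new pending_irqs(). */
static unsigned long pending_irqs(struct vcpu_state *v)
{
	return v->vm->float_pending | v->local_pending;
}

int main(void)
{
	struct vm_state vm = { .float_pending = 0x0004 };	/* a floating I/O irq */
	struct vcpu_state v = { .vm = &vm, .local_pending = 0x0100 };	/* a local ext irq */

	printf("io pending:  %s\n", (pending_irqs(&v) & IRQ_PEND_IO_MASK)  ? "yes" : "no");
	printf("ext pending: %s\n", (pending_irqs(&v) & IRQ_PEND_EXT_MASK) ? "yes" : "no");
	return 0;
}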
+ 27 - 31
arch/s390/kvm/kvm-s390.c

@@ -514,35 +514,20 @@ static int kvm_s390_set_tod_high(struct kvm *kvm, struct kvm_device_attr *attr)
 
 
 	if (gtod_high != 0)
 	if (gtod_high != 0)
 		return -EINVAL;
 		return -EINVAL;
-	VM_EVENT(kvm, 3, "SET: TOD extension: 0x%x\n", gtod_high);
+	VM_EVENT(kvm, 3, "SET: TOD extension: 0x%x", gtod_high);
 
 
 	return 0;
 	return 0;
 }
 }
 
 
 static int kvm_s390_set_tod_low(struct kvm *kvm, struct kvm_device_attr *attr)
 static int kvm_s390_set_tod_low(struct kvm *kvm, struct kvm_device_attr *attr)
 {
 {
-	struct kvm_vcpu *cur_vcpu;
-	unsigned int vcpu_idx;
-	u64 host_tod, gtod;
-	int r;
+	u64 gtod;
 
 
 	if (copy_from_user(&gtod, (void __user *)attr->addr, sizeof(gtod)))
 	if (copy_from_user(&gtod, (void __user *)attr->addr, sizeof(gtod)))
 		return -EFAULT;
 		return -EFAULT;
 
 
-	r = store_tod_clock(&host_tod);
-	if (r)
-		return r;
-
-	mutex_lock(&kvm->lock);
-	preempt_disable();
-	kvm->arch.epoch = gtod - host_tod;
-	kvm_s390_vcpu_block_all(kvm);
-	kvm_for_each_vcpu(vcpu_idx, cur_vcpu, kvm)
-		cur_vcpu->arch.sie_block->epoch = kvm->arch.epoch;
-	kvm_s390_vcpu_unblock_all(kvm);
-	preempt_enable();
-	mutex_unlock(&kvm->lock);
-	VM_EVENT(kvm, 3, "SET: TOD base: 0x%llx\n", gtod);
+	kvm_s390_set_tod_clock(kvm, gtod);
+	VM_EVENT(kvm, 3, "SET: TOD base: 0x%llx", gtod);
 	return 0;
 }
 
 
@@ -574,26 +559,19 @@ static int kvm_s390_get_tod_high(struct kvm *kvm, struct kvm_device_attr *attr)
 	if (copy_to_user((void __user *)attr->addr, &gtod_high,
 	if (copy_to_user((void __user *)attr->addr, &gtod_high,
 					 sizeof(gtod_high)))
 					 sizeof(gtod_high)))
 		return -EFAULT;
 		return -EFAULT;
-	VM_EVENT(kvm, 3, "QUERY: TOD extension: 0x%x\n", gtod_high);
+	VM_EVENT(kvm, 3, "QUERY: TOD extension: 0x%x", gtod_high);
 
 
 	return 0;
 	return 0;
 }
 }
 
 
 static int kvm_s390_get_tod_low(struct kvm *kvm, struct kvm_device_attr *attr)
 static int kvm_s390_get_tod_low(struct kvm *kvm, struct kvm_device_attr *attr)
 {
 {
-	u64 host_tod, gtod;
-	int r;
+	u64 gtod;
 
 
-	r = store_tod_clock(&host_tod);
-	if (r)
-		return r;
-
-	preempt_disable();
-	gtod = host_tod + kvm->arch.epoch;
-	preempt_enable();
+	gtod = kvm_s390_get_tod_clock_fast(kvm);
 	if (copy_to_user((void __user *)attr->addr, &gtod, sizeof(gtod)))
 		return -EFAULT;
-	VM_EVENT(kvm, 3, "QUERY: TOD base: 0x%llx\n", gtod);
+	VM_EVENT(kvm, 3, "QUERY: TOD base: 0x%llx", gtod);
 
 
 	return 0;
 	return 0;
 }
 }
@@ -1120,7 +1098,9 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
 	if (!kvm->arch.sca)
 	if (!kvm->arch.sca)
 		goto out_err;
 		goto out_err;
 	spin_lock(&kvm_lock);
 	spin_lock(&kvm_lock);
-	sca_offset = (sca_offset + 16) & 0x7f0;
+	sca_offset += 16;
+	if (sca_offset + sizeof(struct sca_block) > PAGE_SIZE)
+		sca_offset = 0;
 	kvm->arch.sca = (struct sca_block *) ((char *) kvm->arch.sca + sca_offset);
 	spin_unlock(&kvm_lock);
 	spin_unlock(&kvm_lock);
 
 
@@ -1911,6 +1891,22 @@ retry:
 	return 0;
 	return 0;
 }
 }
 
 
+void kvm_s390_set_tod_clock(struct kvm *kvm, u64 tod)
+{
+	struct kvm_vcpu *vcpu;
+	int i;
+
+	mutex_lock(&kvm->lock);
+	preempt_disable();
+	kvm->arch.epoch = tod - get_tod_clock();
+	kvm_s390_vcpu_block_all(kvm);
+	kvm_for_each_vcpu(i, vcpu, kvm)
+		vcpu->arch.sie_block->epoch = kvm->arch.epoch;
+	kvm_s390_vcpu_unblock_all(kvm);
+	preempt_enable();
+	mutex_unlock(&kvm->lock);
+}
+
 /**
 /**
  * kvm_arch_fault_in_page - fault-in guest page if necessary
  * kvm_arch_fault_in_page - fault-in guest page if necessary
  * @vcpu: The corresponding virtual cpu
  * @vcpu: The corresponding virtual cpu

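The new kvm_s390_set_tod_clock() records the difference between the requested guest TOD and the current host TOD as the per-VM epoch and copies it into every vCPU's SIE block; reads then reconstruct the guest TOD as host TOD plus epoch. The arithmetic in isolation (the host clock value is a made-up constant, not get_tod_clock()):

#include <stdio.h>
#include <stdint.h>

/* Stand-in for the host TOD clock; a fixed value keeps the example deterministic. */
static uint64_t host_tod(void) { return 1000000; }

int main(void)
{
	uint64_t wanted_guest_tod = 5000000;	/* example value */

	/* SET: epoch = requested guest TOD - current host TOD */
	uint64_t epoch = wanted_guest_tod - host_tod();

	/* GET: guest TOD = current host TOD + epoch */
	uint64_t guest_tod = host_tod() + epoch;

	printf("epoch = %llu, guest TOD = %llu\n",
	       (unsigned long long)epoch, (unsigned long long)guest_tod);
	return 0;
}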
+ 31 - 4
arch/s390/kvm/kvm-s390.h

@@ -175,6 +175,7 @@ static inline int kvm_s390_user_cpu_state_ctrl(struct kvm *kvm)
 	return kvm->arch.user_cpu_state_ctrl != 0;
 	return kvm->arch.user_cpu_state_ctrl != 0;
 }
 }
 
 
+/* implemented in interrupt.c */
 int kvm_s390_handle_wait(struct kvm_vcpu *vcpu);
 int kvm_s390_handle_wait(struct kvm_vcpu *vcpu);
 void kvm_s390_vcpu_wakeup(struct kvm_vcpu *vcpu);
 void kvm_s390_vcpu_wakeup(struct kvm_vcpu *vcpu);
 enum hrtimer_restart kvm_s390_idle_wakeup(struct hrtimer *timer);
 enum hrtimer_restart kvm_s390_idle_wakeup(struct hrtimer *timer);
@@ -185,7 +186,25 @@ int __must_check kvm_s390_inject_vm(struct kvm *kvm,
 				    struct kvm_s390_interrupt *s390int);
 				    struct kvm_s390_interrupt *s390int);
 int __must_check kvm_s390_inject_vcpu(struct kvm_vcpu *vcpu,
 int __must_check kvm_s390_inject_vcpu(struct kvm_vcpu *vcpu,
 				      struct kvm_s390_irq *irq);
 				      struct kvm_s390_irq *irq);
-int __must_check kvm_s390_inject_program_int(struct kvm_vcpu *vcpu, u16 code);
+static inline int kvm_s390_inject_prog_irq(struct kvm_vcpu *vcpu,
+					   struct kvm_s390_pgm_info *pgm_info)
+{
+	struct kvm_s390_irq irq = {
+		.type = KVM_S390_PROGRAM_INT,
+		.u.pgm = *pgm_info,
+	};
+
+	return kvm_s390_inject_vcpu(vcpu, &irq);
+}
+static inline int kvm_s390_inject_program_int(struct kvm_vcpu *vcpu, u16 code)
+{
+	struct kvm_s390_irq irq = {
+		.type = KVM_S390_PROGRAM_INT,
+		.u.pgm.code = code,
+	};
+
+	return kvm_s390_inject_vcpu(vcpu, &irq);
+}
 struct kvm_s390_interrupt_info *kvm_s390_get_io_int(struct kvm *kvm,
 struct kvm_s390_interrupt_info *kvm_s390_get_io_int(struct kvm *kvm,
 						    u64 isc_mask, u32 schid);
 						    u64 isc_mask, u32 schid);
 int kvm_s390_reinject_io_int(struct kvm *kvm,
 int kvm_s390_reinject_io_int(struct kvm *kvm,
@@ -212,6 +231,7 @@ int kvm_s390_handle_sigp(struct kvm_vcpu *vcpu);
 int kvm_s390_handle_sigp_pei(struct kvm_vcpu *vcpu);
 int kvm_s390_handle_sigp_pei(struct kvm_vcpu *vcpu);
 
 
 /* implemented in kvm-s390.c */
 /* implemented in kvm-s390.c */
+void kvm_s390_set_tod_clock(struct kvm *kvm, u64 tod);
 long kvm_arch_fault_in_page(struct kvm_vcpu *vcpu, gpa_t gpa, int writable);
 long kvm_arch_fault_in_page(struct kvm_vcpu *vcpu, gpa_t gpa, int writable);
 int kvm_s390_store_status_unloaded(struct kvm_vcpu *vcpu, unsigned long addr);
 int kvm_s390_store_status_unloaded(struct kvm_vcpu *vcpu, unsigned long addr);
 int kvm_s390_store_adtl_status_unloaded(struct kvm_vcpu *vcpu,
 int kvm_s390_store_adtl_status_unloaded(struct kvm_vcpu *vcpu,
@@ -231,9 +251,6 @@ extern unsigned long kvm_s390_fac_list_mask[];
 
 
 /* implemented in diag.c */
 /* implemented in diag.c */
 int kvm_s390_handle_diag(struct kvm_vcpu *vcpu);
 int kvm_s390_handle_diag(struct kvm_vcpu *vcpu);
-/* implemented in interrupt.c */
-int kvm_s390_inject_prog_irq(struct kvm_vcpu *vcpu,
-			     struct kvm_s390_pgm_info *pgm_info);
 
 
 static inline void kvm_s390_vcpu_block_all(struct kvm *kvm)
 static inline void kvm_s390_vcpu_block_all(struct kvm *kvm)
 {
 {
@@ -254,6 +271,16 @@ static inline void kvm_s390_vcpu_unblock_all(struct kvm *kvm)
 		kvm_s390_vcpu_unblock(vcpu);
 		kvm_s390_vcpu_unblock(vcpu);
 }
 }
 
 
+static inline u64 kvm_s390_get_tod_clock_fast(struct kvm *kvm)
+{
+	u64 rc;
+
+	preempt_disable();
+	rc = get_tod_clock_fast() + kvm->arch.epoch;
+	preempt_enable();
+	return rc;
+}
+
 /**
 /**
  * kvm_s390_inject_prog_cond - conditionally inject a program check
  * kvm_s390_inject_prog_cond - conditionally inject a program check
  * @vcpu: virtual cpu
  * @vcpu: virtual cpu

+ 3 - 16
arch/s390/kvm/priv.c

@@ -33,11 +33,9 @@
 /* Handle SCK (SET CLOCK) interception */
 /* Handle SCK (SET CLOCK) interception */
 static int handle_set_clock(struct kvm_vcpu *vcpu)
 static int handle_set_clock(struct kvm_vcpu *vcpu)
 {
 {
-	struct kvm_vcpu *cpup;
-	s64 hostclk, val;
-	int i, rc;
+	int rc;
 	ar_t ar;
 	ar_t ar;
-	u64 op2;
+	u64 op2, val;
 
 
 	if (vcpu->arch.sie_block->gpsw.mask & PSW_MASK_PSTATE)
 	if (vcpu->arch.sie_block->gpsw.mask & PSW_MASK_PSTATE)
 		return kvm_s390_inject_program_int(vcpu, PGM_PRIVILEGED_OP);
 		return kvm_s390_inject_program_int(vcpu, PGM_PRIVILEGED_OP);
@@ -49,19 +47,8 @@ static int handle_set_clock(struct kvm_vcpu *vcpu)
 	if (rc)
 	if (rc)
 		return kvm_s390_inject_prog_cond(vcpu, rc);
 		return kvm_s390_inject_prog_cond(vcpu, rc);
 
 
-	if (store_tod_clock(&hostclk)) {
-		kvm_s390_set_psw_cc(vcpu, 3);
-		return 0;
-	}
 	VCPU_EVENT(vcpu, 3, "SCK: setting guest TOD to 0x%llx", val);
-	val = (val - hostclk) & ~0x3fUL;
-
-	mutex_lock(&vcpu->kvm->lock);
-	preempt_disable();
-	kvm_for_each_vcpu(i, cpup, vcpu->kvm)
-		cpup->arch.sie_block->epoch = val;
-	preempt_enable();
-	mutex_unlock(&vcpu->kvm->lock);
+	kvm_s390_set_tod_clock(vcpu->kvm, val);
 
 
 	kvm_s390_set_psw_cc(vcpu, 0);
 	return 0;

+ 5 - 5
arch/x86/include/asm/irq_remapping.h

@@ -33,6 +33,11 @@ enum irq_remap_cap {
 	IRQ_POSTING_CAP = 0,
 	IRQ_POSTING_CAP = 0,
 };
 };
 
 
+struct vcpu_data {
+	u64 pi_desc_addr;	/* Physical address of PI Descriptor */
+	u32 vector;		/* Guest vector of the interrupt */
+};
+
 #ifdef CONFIG_IRQ_REMAP
 #ifdef CONFIG_IRQ_REMAP
 
 
 extern bool irq_remapping_cap(enum irq_remap_cap cap);
 extern bool irq_remapping_cap(enum irq_remap_cap cap);
@@ -58,11 +63,6 @@ static inline struct irq_domain *arch_get_ir_parent_domain(void)
 	return x86_vector_domain;
 	return x86_vector_domain;
 }
 }
 
 
-struct vcpu_data {
-	u64 pi_desc_addr;	/* Physical address of PI Descriptor */
-	u32 vector;		/* Guest vector of the interrupt */
-};
-
 #else  /* CONFIG_IRQ_REMAP */
 #else  /* CONFIG_IRQ_REMAP */
 
 
 static inline bool irq_remapping_cap(enum irq_remap_cap cap) { return 0; }
 static inline bool irq_remapping_cap(enum irq_remap_cap cap) { return 0; }

+ 10 - 0
arch/x86/include/asm/kvm_emulate.h

@@ -111,6 +111,16 @@ struct x86_emulate_ops {
 			unsigned int bytes,
 			unsigned int bytes,
 			struct x86_exception *fault);
 			struct x86_exception *fault);
 
 
+	/*
+	 * read_phys: Read bytes of standard (non-emulated/special) memory.
+	 *            Used for descriptor reading.
+	 *  @addr:  [IN ] Physical address from which to read.
+	 *  @val:   [OUT] Value read from memory.
+	 *  @bytes: [IN ] Number of bytes to read from memory.
+	 */
+	int (*read_phys)(struct x86_emulate_ctxt *ctxt, unsigned long addr,
+			void *val, unsigned int bytes);
+
 	/*
 	/*
 	 * write_std: Write bytes of standard (non-emulated/special) memory.
 	 * write_std: Write bytes of standard (non-emulated/special) memory.
 	 *            Used for descriptor writing.
 	 *            Used for descriptor writing.

+ 36 - 2
arch/x86/include/asm/kvm_host.h

@@ -24,6 +24,7 @@
 #include <linux/perf_event.h>
 #include <linux/perf_event.h>
 #include <linux/pvclock_gtod.h>
 #include <linux/pvclock_gtod.h>
 #include <linux/clocksource.h>
 #include <linux/clocksource.h>
+#include <linux/irqbypass.h>
 
 
 #include <asm/pvclock-abi.h>
 #include <asm/pvclock-abi.h>
 #include <asm/desc.h>
 #include <asm/desc.h>
@@ -176,6 +177,8 @@ enum {
  */
  */
 #define KVM_APIC_PV_EOI_PENDING	1
 #define KVM_APIC_PV_EOI_PENDING	1
 
 
+struct kvm_kernel_irq_routing_entry;
+
 /*
 /*
  * We don't want allocation failures within the mmu code, so we preallocate
  * We don't want allocation failures within the mmu code, so we preallocate
  * enough memory for a single page fault in a cache.
  * enough memory for a single page fault in a cache.
@@ -374,6 +377,7 @@ struct kvm_mtrr {
 /* Hyper-V per vcpu emulation context */
 /* Hyper-V per vcpu emulation context */
 struct kvm_vcpu_hv {
 struct kvm_vcpu_hv {
 	u64 hv_vapic;
 	u64 hv_vapic;
+	s64 runtime_offset;
 };
 };
 
 
 struct kvm_vcpu_arch {
 struct kvm_vcpu_arch {
@@ -396,6 +400,7 @@ struct kvm_vcpu_arch {
 	u64 efer;
 	u64 efer;
 	u64 apic_base;
 	u64 apic_base;
 	struct kvm_lapic *apic;    /* kernel irqchip context */
 	struct kvm_lapic *apic;    /* kernel irqchip context */
+	u64 eoi_exit_bitmap[4];
 	unsigned long apic_attention;
 	unsigned long apic_attention;
 	int32_t apic_arb_prio;
 	int32_t apic_arb_prio;
 	int mp_state;
 	int mp_state;
@@ -573,6 +578,9 @@ struct kvm_vcpu_arch {
 	struct {
 	struct {
 		bool pv_unhalted;
 		bool pv_unhalted;
 	} pv;
 	} pv;
+
+	int pending_ioapic_eoi;
+	int pending_external_vector;
 };
 };
 
 
 struct kvm_lpage_info {
 struct kvm_lpage_info {
@@ -683,6 +691,9 @@ struct kvm_arch {
 	u32 bsp_vcpu_id;
 	u32 bsp_vcpu_id;
 
 
 	u64 disabled_quirks;
 	u64 disabled_quirks;
+
+	bool irqchip_split;
+	u8 nr_reserved_ioapic_pins;
 };
 };
 
 
 struct kvm_vm_stat {
 struct kvm_vm_stat {
@@ -819,10 +830,10 @@ struct kvm_x86_ops {
 	void (*enable_nmi_window)(struct kvm_vcpu *vcpu);
 	void (*enable_nmi_window)(struct kvm_vcpu *vcpu);
 	void (*enable_irq_window)(struct kvm_vcpu *vcpu);
 	void (*enable_irq_window)(struct kvm_vcpu *vcpu);
 	void (*update_cr8_intercept)(struct kvm_vcpu *vcpu, int tpr, int irr);
 	void (*update_cr8_intercept)(struct kvm_vcpu *vcpu, int tpr, int irr);
-	int (*vm_has_apicv)(struct kvm *kvm);
+	int (*cpu_uses_apicv)(struct kvm_vcpu *vcpu);
 	void (*hwapic_irr_update)(struct kvm_vcpu *vcpu, int max_irr);
 	void (*hwapic_irr_update)(struct kvm_vcpu *vcpu, int max_irr);
 	void (*hwapic_isr_update)(struct kvm *kvm, int isr);
 	void (*hwapic_isr_update)(struct kvm *kvm, int isr);
-	void (*load_eoi_exitmap)(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap);
+	void (*load_eoi_exitmap)(struct kvm_vcpu *vcpu);
 	void (*set_virtual_x2apic_mode)(struct kvm_vcpu *vcpu, bool set);
 	void (*set_virtual_x2apic_mode)(struct kvm_vcpu *vcpu, bool set);
 	void (*set_apic_access_page_addr)(struct kvm_vcpu *vcpu, hpa_t hpa);
 	void (*set_apic_access_page_addr)(struct kvm_vcpu *vcpu, hpa_t hpa);
 	void (*deliver_posted_interrupt)(struct kvm_vcpu *vcpu, int vector);
 	void (*deliver_posted_interrupt)(struct kvm_vcpu *vcpu, int vector);
@@ -887,6 +898,20 @@ struct kvm_x86_ops {
 					   gfn_t offset, unsigned long mask);
 					   gfn_t offset, unsigned long mask);
 	/* pmu operations of sub-arch */
 	/* pmu operations of sub-arch */
 	const struct kvm_pmu_ops *pmu_ops;
 	const struct kvm_pmu_ops *pmu_ops;
+
+	/*
+	 * Architecture specific hooks for vCPU blocking due to
+	 * HLT instruction.
+	 * Returns for .pre_block():
+	 *    - 0 means continue to block the vCPU.
+	 *    - 1 means we cannot block the vCPU since some event
+	 *        happens during this period, such as, 'ON' bit in
+	 *        posted-interrupts descriptor is set.
+	 */
+	int (*pre_block)(struct kvm_vcpu *vcpu);
+	void (*post_block)(struct kvm_vcpu *vcpu);
+	int (*update_pi_irte)(struct kvm *kvm, unsigned int host_irq,
+			      uint32_t guest_irq, bool set);
 };
 };
 
 
 struct kvm_arch_async_pf {
 struct kvm_arch_async_pf {
@@ -1231,4 +1256,13 @@ int x86_set_memory_region(struct kvm *kvm, int id, gpa_t gpa, u32 size);
 bool kvm_vcpu_is_reset_bsp(struct kvm_vcpu *vcpu);
 bool kvm_vcpu_is_reset_bsp(struct kvm_vcpu *vcpu);
 bool kvm_vcpu_is_bsp(struct kvm_vcpu *vcpu);
 bool kvm_vcpu_is_bsp(struct kvm_vcpu *vcpu);
 
 
+bool kvm_intr_is_single_vcpu(struct kvm *kvm, struct kvm_lapic_irq *irq,
+			     struct kvm_vcpu **dest_vcpu);
+
+void kvm_set_msi_irq(struct kvm_kernel_irq_routing_entry *e,
+		     struct kvm_lapic_irq *irq);
+
+static inline void kvm_arch_vcpu_blocking(struct kvm_vcpu *vcpu) {}
+static inline void kvm_arch_vcpu_unblocking(struct kvm_vcpu *vcpu) {}
+
 #endif /* _ASM_X86_KVM_HOST_H */
 #endif /* _ASM_X86_KVM_HOST_H */

+ 2 - 1
arch/x86/include/asm/vmx.h

@@ -72,7 +72,7 @@
 #define SECONDARY_EXEC_SHADOW_VMCS              0x00004000
 #define SECONDARY_EXEC_SHADOW_VMCS              0x00004000
 #define SECONDARY_EXEC_ENABLE_PML               0x00020000
 #define SECONDARY_EXEC_ENABLE_PML               0x00020000
 #define SECONDARY_EXEC_XSAVES			0x00100000
 #define SECONDARY_EXEC_XSAVES			0x00100000
-
+#define SECONDARY_EXEC_PCOMMIT			0x00200000
 
 
 #define PIN_BASED_EXT_INTR_MASK                 0x00000001
 #define PIN_BASED_EXT_INTR_MASK                 0x00000001
 #define PIN_BASED_NMI_EXITING                   0x00000008
 #define PIN_BASED_NMI_EXITING                   0x00000008
@@ -416,6 +416,7 @@ enum vmcs_field {
 #define VMX_EPT_EXTENT_CONTEXT_BIT		(1ull << 25)
 #define VMX_EPT_EXTENT_CONTEXT_BIT		(1ull << 25)
 #define VMX_EPT_EXTENT_GLOBAL_BIT		(1ull << 26)
 #define VMX_EPT_EXTENT_GLOBAL_BIT		(1ull << 26)
 
 
+#define VMX_VPID_INVVPID_BIT                    (1ull << 0) /* (32 - 32) */
 #define VMX_VPID_EXTENT_SINGLE_CONTEXT_BIT      (1ull << 9) /* (41 - 32) */
 #define VMX_VPID_EXTENT_SINGLE_CONTEXT_BIT      (1ull << 9) /* (41 - 32) */
 #define VMX_VPID_EXTENT_GLOBAL_CONTEXT_BIT      (1ull << 10) /* (42 - 32) */
 #define VMX_VPID_EXTENT_GLOBAL_CONTEXT_BIT      (1ull << 10) /* (42 - 32) */
 
 

+ 18 - 0
arch/x86/include/uapi/asm/hyperv.h

@@ -153,6 +153,12 @@
 /* MSR used to provide vcpu index */
 /* MSR used to provide vcpu index */
 #define HV_X64_MSR_VP_INDEX			0x40000002
 #define HV_X64_MSR_VP_INDEX			0x40000002
 
 
+/* MSR used to reset the guest OS. */
+#define HV_X64_MSR_RESET			0x40000003
+
+/* MSR used to provide vcpu runtime in 100ns units */
+#define HV_X64_MSR_VP_RUNTIME			0x40000010
+
 /* MSR used to read the per-partition time reference counter */
 /* MSR used to read the per-partition time reference counter */
 #define HV_X64_MSR_TIME_REF_COUNT		0x40000020
 #define HV_X64_MSR_TIME_REF_COUNT		0x40000020
 
 
@@ -251,4 +257,16 @@ typedef struct _HV_REFERENCE_TSC_PAGE {
 	__s64 tsc_offset;
 	__s64 tsc_offset;
 } HV_REFERENCE_TSC_PAGE, *PHV_REFERENCE_TSC_PAGE;
 } HV_REFERENCE_TSC_PAGE, *PHV_REFERENCE_TSC_PAGE;
 
 
+/* Define the number of synthetic interrupt sources. */
+#define HV_SYNIC_SINT_COUNT		(16)
+/* Define the expected SynIC version. */
+#define HV_SYNIC_VERSION_1		(0x1)
+
+#define HV_SYNIC_CONTROL_ENABLE		(1ULL << 0)
+#define HV_SYNIC_SIMP_ENABLE		(1ULL << 0)
+#define HV_SYNIC_SIEFP_ENABLE		(1ULL << 0)
+#define HV_SYNIC_SINT_MASKED		(1ULL << 16)
+#define HV_SYNIC_SINT_AUTO_EOI		(1ULL << 17)
+#define HV_SYNIC_SINT_VECTOR_MASK	(0xFF)
+
 #endif
 #endif

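The SynIC constants exported above describe the layout of a SINT register: an 8-bit vector plus masked and auto-EOI flag bits. A small decoder using only those constants (the sample register value is invented):

#include <stdio.h>
#include <stdint.h>

/* Constants as added to arch/x86/include/uapi/asm/hyperv.h */
#define HV_SYNIC_SINT_MASKED		(1ULL << 16)
#define HV_SYNIC_SINT_AUTO_EOI		(1ULL << 17)
#define HV_SYNIC_SINT_VECTOR_MASK	(0xFF)

int main(void)
{
	uint64_t sint = 0x00000000000200b0ULL;	/* example: vector 0xb0, auto-EOI set */

	printf("vector   : 0x%llx\n",
	       (unsigned long long)(sint & HV_SYNIC_SINT_VECTOR_MASK));
	printf("masked   : %s\n", (sint & HV_SYNIC_SINT_MASKED)   ? "yes" : "no");
	printf("auto-EOI : %s\n", (sint & HV_SYNIC_SINT_AUTO_EOI) ? "yes" : "no");
	return 0;
}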
+ 3 - 1
arch/x86/include/uapi/asm/vmx.h

@@ -78,6 +78,7 @@
 #define EXIT_REASON_PML_FULL            62
 #define EXIT_REASON_PML_FULL            62
 #define EXIT_REASON_XSAVES              63
 #define EXIT_REASON_XSAVES              63
 #define EXIT_REASON_XRSTORS             64
 #define EXIT_REASON_XRSTORS             64
+#define EXIT_REASON_PCOMMIT             65
 
 
 #define VMX_EXIT_REASONS \
 #define VMX_EXIT_REASONS \
 	{ EXIT_REASON_EXCEPTION_NMI,         "EXCEPTION_NMI" }, \
 	{ EXIT_REASON_EXCEPTION_NMI,         "EXCEPTION_NMI" }, \
@@ -126,7 +127,8 @@
 	{ EXIT_REASON_INVVPID,               "INVVPID" }, \
 	{ EXIT_REASON_INVVPID,               "INVVPID" }, \
 	{ EXIT_REASON_INVPCID,               "INVPCID" }, \
 	{ EXIT_REASON_INVPCID,               "INVPCID" }, \
 	{ EXIT_REASON_XSAVES,                "XSAVES" }, \
 	{ EXIT_REASON_XSAVES,                "XSAVES" }, \
-	{ EXIT_REASON_XRSTORS,               "XRSTORS" }
+	{ EXIT_REASON_XRSTORS,               "XRSTORS" }, \
+	{ EXIT_REASON_PCOMMIT,               "PCOMMIT" }
 
 
 #define VMX_ABORT_SAVE_GUEST_MSR_FAIL        1
 #define VMX_ABORT_SAVE_GUEST_MSR_FAIL        1
 #define VMX_ABORT_LOAD_HOST_MSR_FAIL         4
 #define VMX_ABORT_LOAD_HOST_MSR_FAIL         4

+ 35 - 11
arch/x86/kernel/kvmclock.c

@@ -32,6 +32,7 @@
 static int kvmclock = 1;
 static int kvmclock = 1;
 static int msr_kvm_system_time = MSR_KVM_SYSTEM_TIME;
 static int msr_kvm_system_time = MSR_KVM_SYSTEM_TIME;
 static int msr_kvm_wall_clock = MSR_KVM_WALL_CLOCK;
 static int msr_kvm_wall_clock = MSR_KVM_WALL_CLOCK;
+static cycle_t kvm_sched_clock_offset;
 
 
 static int parse_no_kvmclock(char *arg)
 static int parse_no_kvmclock(char *arg)
 {
 {
@@ -92,6 +93,29 @@ static cycle_t kvm_clock_get_cycles(struct clocksource *cs)
 	return kvm_clock_read();
 	return kvm_clock_read();
 }
 }
 
 
+static cycle_t kvm_sched_clock_read(void)
+{
+	return kvm_clock_read() - kvm_sched_clock_offset;
+}
+
+static inline void kvm_sched_clock_init(bool stable)
+{
+	if (!stable) {
+		pv_time_ops.sched_clock = kvm_clock_read;
+		return;
+	}
+
+	kvm_sched_clock_offset = kvm_clock_read();
+	pv_time_ops.sched_clock = kvm_sched_clock_read;
+	set_sched_clock_stable();
+
+	printk(KERN_INFO "kvm-clock: using sched offset of %llu cycles\n",
+			kvm_sched_clock_offset);
+
+	BUILD_BUG_ON(sizeof(kvm_sched_clock_offset) >
+	         sizeof(((struct pvclock_vcpu_time_info *)NULL)->system_time));
+}
+
 /*
 /*
  * If we don't do that, there is the possibility that the guest
  * If we don't do that, there is the possibility that the guest
  * will calibrate under heavy load - thus, getting a lower lpj -
  * will calibrate under heavy load - thus, getting a lower lpj -
@@ -248,7 +272,17 @@ void __init kvmclock_init(void)
 		memblock_free(mem, size);
 		memblock_free(mem, size);
 		return;
 		return;
 	}
 	}
-	pv_time_ops.sched_clock = kvm_clock_read;
+
+	if (kvm_para_has_feature(KVM_FEATURE_CLOCKSOURCE_STABLE_BIT))
+		pvclock_set_flags(PVCLOCK_TSC_STABLE_BIT);
+
+	cpu = get_cpu();
+	vcpu_time = &hv_clock[cpu].pvti;
+	flags = pvclock_read_flags(vcpu_time);
+
+	kvm_sched_clock_init(flags & PVCLOCK_TSC_STABLE_BIT);
+	put_cpu();
+
 	x86_platform.calibrate_tsc = kvm_get_tsc_khz;
 	x86_platform.calibrate_tsc = kvm_get_tsc_khz;
 	x86_platform.get_wallclock = kvm_get_wallclock;
 	x86_platform.get_wallclock = kvm_get_wallclock;
 	x86_platform.set_wallclock = kvm_set_wallclock;
 	x86_platform.set_wallclock = kvm_set_wallclock;
@@ -265,16 +299,6 @@ void __init kvmclock_init(void)
 	kvm_get_preset_lpj();
 	kvm_get_preset_lpj();
 	clocksource_register_hz(&kvm_clock, NSEC_PER_SEC);
 	clocksource_register_hz(&kvm_clock, NSEC_PER_SEC);
 	pv_info.name = "KVM";
 	pv_info.name = "KVM";
-
-	if (kvm_para_has_feature(KVM_FEATURE_CLOCKSOURCE_STABLE_BIT))
-		pvclock_set_flags(~0);
-
-	cpu = get_cpu();
-	vcpu_time = &hv_clock[cpu].pvti;
-	flags = pvclock_read_flags(vcpu_time);
-	if (flags & PVCLOCK_COUNTS_FROM_ZERO)
-		set_sched_clock_stable();
-	put_cpu();
 }
 }
 
 
 int __init kvm_setup_vsyscall_timeinfo(void)
 int __init kvm_setup_vsyscall_timeinfo(void)

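The kvmclock change records the clock value at init and reports scheduler time relative to that snapshot, so sched_clock() starts near zero when the clocksource is stable. The same idea, stripped of the paravirt plumbing (the clock here is just a fake counter):

#include <stdio.h>
#include <stdint.h>

static uint64_t fake_clock = 123456789;	/* pretend kvm_clock_read() value at boot */

static uint64_t clock_read(void) { return fake_clock; }

static uint64_t sched_clock_offset;

/* Mirrors the shape of kvm_sched_clock_read(): time relative to the init snapshot. */
static uint64_t sched_clock_read(void) { return clock_read() - sched_clock_offset; }

int main(void)
{
	sched_clock_offset = clock_read();	/* snapshot taken once at init */

	fake_clock += 1000;			/* 1000 "cycles" later */
	printf("sched_clock = %llu\n", (unsigned long long)sched_clock_read());
	return 0;
}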
+ 2 - 0
arch/x86/kvm/Kconfig

@@ -28,6 +28,8 @@ config KVM
 	select ANON_INODES
 	select ANON_INODES
 	select HAVE_KVM_IRQCHIP
 	select HAVE_KVM_IRQCHIP
 	select HAVE_KVM_IRQFD
 	select HAVE_KVM_IRQFD
+	select IRQ_BYPASS_MANAGER
+	select HAVE_KVM_IRQ_BYPASS
 	select HAVE_KVM_IRQ_ROUTING
 	select HAVE_KVM_IRQ_ROUTING
 	select HAVE_KVM_EVENTFD
 	select HAVE_KVM_EVENTFD
 	select KVM_APIC_ARCHITECTURE
 	select KVM_APIC_ARCHITECTURE

+ 37 - 25
arch/x86/kvm/assigned-dev.c

@@ -21,6 +21,7 @@
 #include <linux/fs.h>
 #include <linux/fs.h>
 #include "irq.h"
 #include "irq.h"
 #include "assigned-dev.h"
 #include "assigned-dev.h"
+#include "trace/events/kvm.h"
 
 
 struct kvm_assigned_dev_kernel {
 struct kvm_assigned_dev_kernel {
 	struct kvm_irq_ack_notifier ack_notifier;
 	struct kvm_irq_ack_notifier ack_notifier;
@@ -131,7 +132,42 @@ static irqreturn_t kvm_assigned_dev_thread_intx(int irq, void *dev_id)
 	return IRQ_HANDLED;
 	return IRQ_HANDLED;
 }
 }
 
 
-#ifdef __KVM_HAVE_MSI
+/*
+ * Deliver an IRQ in an atomic context if we can, or return a failure,
+ * user can retry in a process context.
+ * Return value:
+ *  -EWOULDBLOCK - Can't deliver in atomic context: retry in a process context.
+ *  Other values - No need to retry.
+ */
+static int kvm_set_irq_inatomic(struct kvm *kvm, int irq_source_id, u32 irq,
+				int level)
+{
+	struct kvm_kernel_irq_routing_entry entries[KVM_NR_IRQCHIPS];
+	struct kvm_kernel_irq_routing_entry *e;
+	int ret = -EINVAL;
+	int idx;
+
+	trace_kvm_set_irq(irq, level, irq_source_id);
+
+	/*
+	 * Injection into either PIC or IOAPIC might need to scan all CPUs,
+	 * which would need to be retried from thread context;  when same GSI
+	 * is connected to both PIC and IOAPIC, we'd have to report a
+	 * partial failure here.
+	 * Since there's no easy way to do this, we only support injecting MSI
+	 * which is limited to 1:1 GSI mapping.
+	 */
+	idx = srcu_read_lock(&kvm->irq_srcu);
+	if (kvm_irq_map_gsi(kvm, entries, irq) > 0) {
+		e = &entries[0];
+		ret = kvm_arch_set_irq_inatomic(e, kvm, irq_source_id,
+						irq, level);
+	}
+	srcu_read_unlock(&kvm->irq_srcu, idx);
+	return ret;
+}
+
+
 static irqreturn_t kvm_assigned_dev_msi(int irq, void *dev_id)
 static irqreturn_t kvm_assigned_dev_msi(int irq, void *dev_id)
 {
 {
 	struct kvm_assigned_dev_kernel *assigned_dev = dev_id;
 	struct kvm_assigned_dev_kernel *assigned_dev = dev_id;
@@ -150,9 +186,7 @@ static irqreturn_t kvm_assigned_dev_thread_msi(int irq, void *dev_id)
 
 
 	return IRQ_HANDLED;
 	return IRQ_HANDLED;
 }
 }
-#endif
 
 
-#ifdef __KVM_HAVE_MSIX
 static irqreturn_t kvm_assigned_dev_msix(int irq, void *dev_id)
 static irqreturn_t kvm_assigned_dev_msix(int irq, void *dev_id)
 {
 {
 	struct kvm_assigned_dev_kernel *assigned_dev = dev_id;
 	struct kvm_assigned_dev_kernel *assigned_dev = dev_id;
@@ -183,7 +217,6 @@ static irqreturn_t kvm_assigned_dev_thread_msix(int irq, void *dev_id)
 
 
 	return IRQ_HANDLED;
 	return IRQ_HANDLED;
 }
 }
-#endif
 
 
 /* Ack the irq line for an assigned device */
 /* Ack the irq line for an assigned device */
 static void kvm_assigned_dev_ack_irq(struct kvm_irq_ack_notifier *kian)
 static void kvm_assigned_dev_ack_irq(struct kvm_irq_ack_notifier *kian)
@@ -386,7 +419,6 @@ static int assigned_device_enable_host_intx(struct kvm *kvm,
 	return 0;
 	return 0;
 }
 }
 
 
-#ifdef __KVM_HAVE_MSI
 static int assigned_device_enable_host_msi(struct kvm *kvm,
 static int assigned_device_enable_host_msi(struct kvm *kvm,
 					   struct kvm_assigned_dev_kernel *dev)
 					   struct kvm_assigned_dev_kernel *dev)
 {
 {
@@ -408,9 +440,7 @@ static int assigned_device_enable_host_msi(struct kvm *kvm,
 
 
 	return 0;
 	return 0;
 }
 }
-#endif
 
 
-#ifdef __KVM_HAVE_MSIX
 static int assigned_device_enable_host_msix(struct kvm *kvm,
 static int assigned_device_enable_host_msix(struct kvm *kvm,
 					    struct kvm_assigned_dev_kernel *dev)
 					    struct kvm_assigned_dev_kernel *dev)
 {
 {
@@ -443,8 +473,6 @@ err:
 	return r;
 	return r;
 }
 }
 
 
-#endif
-
 static int assigned_device_enable_guest_intx(struct kvm *kvm,
 static int assigned_device_enable_guest_intx(struct kvm *kvm,
 				struct kvm_assigned_dev_kernel *dev,
 				struct kvm_assigned_dev_kernel *dev,
 				struct kvm_assigned_irq *irq)
 				struct kvm_assigned_irq *irq)
@@ -454,7 +482,6 @@ static int assigned_device_enable_guest_intx(struct kvm *kvm,
 	return 0;
 	return 0;
 }
 }
 
 
-#ifdef __KVM_HAVE_MSI
 static int assigned_device_enable_guest_msi(struct kvm *kvm,
 static int assigned_device_enable_guest_msi(struct kvm *kvm,
 			struct kvm_assigned_dev_kernel *dev,
 			struct kvm_assigned_dev_kernel *dev,
 			struct kvm_assigned_irq *irq)
 			struct kvm_assigned_irq *irq)
@@ -463,9 +490,7 @@ static int assigned_device_enable_guest_msi(struct kvm *kvm,
 	dev->ack_notifier.gsi = -1;
 	dev->ack_notifier.gsi = -1;
 	return 0;
 	return 0;
 }
 }
-#endif
 
 
-#ifdef __KVM_HAVE_MSIX
 static int assigned_device_enable_guest_msix(struct kvm *kvm,
 static int assigned_device_enable_guest_msix(struct kvm *kvm,
 			struct kvm_assigned_dev_kernel *dev,
 			struct kvm_assigned_dev_kernel *dev,
 			struct kvm_assigned_irq *irq)
 			struct kvm_assigned_irq *irq)
@@ -474,7 +499,6 @@ static int assigned_device_enable_guest_msix(struct kvm *kvm,
 	dev->ack_notifier.gsi = -1;
 	dev->ack_notifier.gsi = -1;
 	return 0;
 	return 0;
 }
 }
-#endif
 
 
 static int assign_host_irq(struct kvm *kvm,
 static int assign_host_irq(struct kvm *kvm,
 			   struct kvm_assigned_dev_kernel *dev,
 			   struct kvm_assigned_dev_kernel *dev,
@@ -492,16 +516,12 @@ static int assign_host_irq(struct kvm *kvm,
 	case KVM_DEV_IRQ_HOST_INTX:
 	case KVM_DEV_IRQ_HOST_INTX:
 		r = assigned_device_enable_host_intx(kvm, dev);
 		r = assigned_device_enable_host_intx(kvm, dev);
 		break;
 		break;
-#ifdef __KVM_HAVE_MSI
 	case KVM_DEV_IRQ_HOST_MSI:
 	case KVM_DEV_IRQ_HOST_MSI:
 		r = assigned_device_enable_host_msi(kvm, dev);
 		r = assigned_device_enable_host_msi(kvm, dev);
 		break;
 		break;
-#endif
-#ifdef __KVM_HAVE_MSIX
 	case KVM_DEV_IRQ_HOST_MSIX:
 	case KVM_DEV_IRQ_HOST_MSIX:
 		r = assigned_device_enable_host_msix(kvm, dev);
 		r = assigned_device_enable_host_msix(kvm, dev);
 		break;
 		break;
-#endif
 	default:
 	default:
 		r = -EINVAL;
 		r = -EINVAL;
 	}
 	}
@@ -534,16 +554,12 @@ static int assign_guest_irq(struct kvm *kvm,
 	case KVM_DEV_IRQ_GUEST_INTX:
 	case KVM_DEV_IRQ_GUEST_INTX:
 		r = assigned_device_enable_guest_intx(kvm, dev, irq);
 		r = assigned_device_enable_guest_intx(kvm, dev, irq);
 		break;
 		break;
-#ifdef __KVM_HAVE_MSI
 	case KVM_DEV_IRQ_GUEST_MSI:
 	case KVM_DEV_IRQ_GUEST_MSI:
 		r = assigned_device_enable_guest_msi(kvm, dev, irq);
 		r = assigned_device_enable_guest_msi(kvm, dev, irq);
 		break;
 		break;
-#endif
-#ifdef __KVM_HAVE_MSIX
 	case KVM_DEV_IRQ_GUEST_MSIX:
 	case KVM_DEV_IRQ_GUEST_MSIX:
 		r = assigned_device_enable_guest_msix(kvm, dev, irq);
 		r = assigned_device_enable_guest_msix(kvm, dev, irq);
 		break;
 		break;
-#endif
 	default:
 	default:
 		r = -EINVAL;
 		r = -EINVAL;
 	}
 	}
@@ -826,7 +842,6 @@ out:
 }
 }
 
 
 
 
-#ifdef __KVM_HAVE_MSIX
 static int kvm_vm_ioctl_set_msix_nr(struct kvm *kvm,
 static int kvm_vm_ioctl_set_msix_nr(struct kvm *kvm,
 				    struct kvm_assigned_msix_nr *entry_nr)
 				    struct kvm_assigned_msix_nr *entry_nr)
 {
 {
@@ -906,7 +921,6 @@ msix_entry_out:
 
 
 	return r;
 	return r;
 }
 }
-#endif
 
 
 static int kvm_vm_ioctl_set_pci_irq_mask(struct kvm *kvm,
 static int kvm_vm_ioctl_set_pci_irq_mask(struct kvm *kvm,
 		struct kvm_assigned_pci_dev *assigned_dev)
 		struct kvm_assigned_pci_dev *assigned_dev)
@@ -1012,7 +1026,6 @@ long kvm_vm_ioctl_assigned_device(struct kvm *kvm, unsigned ioctl,
 			goto out;
 			goto out;
 		break;
 		break;
 	}
 	}
-#ifdef __KVM_HAVE_MSIX
 	case KVM_ASSIGN_SET_MSIX_NR: {
 	case KVM_ASSIGN_SET_MSIX_NR: {
 		struct kvm_assigned_msix_nr entry_nr;
 		struct kvm_assigned_msix_nr entry_nr;
 		r = -EFAULT;
 		r = -EFAULT;
@@ -1033,7 +1046,6 @@ long kvm_vm_ioctl_assigned_device(struct kvm *kvm, unsigned ioctl,
 			goto out;
 			goto out;
 		break;
 		break;
 	}
 	}
-#endif
 	case KVM_ASSIGN_SET_INTX_MASK: {
 	case KVM_ASSIGN_SET_INTX_MASK: {
 		struct kvm_assigned_pci_dev assigned_dev;
 		struct kvm_assigned_pci_dev assigned_dev;
 
 

+ 1 - 1
arch/x86/kvm/cpuid.c

@@ -348,7 +348,7 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
 		F(FSGSBASE) | F(BMI1) | F(HLE) | F(AVX2) | F(SMEP) |
 		F(FSGSBASE) | F(BMI1) | F(HLE) | F(AVX2) | F(SMEP) |
 		F(BMI2) | F(ERMS) | f_invpcid | F(RTM) | f_mpx | F(RDSEED) |
 		F(BMI2) | F(ERMS) | f_invpcid | F(RTM) | f_mpx | F(RDSEED) |
 		F(ADX) | F(SMAP) | F(AVX512F) | F(AVX512PF) | F(AVX512ER) |
 		F(ADX) | F(SMAP) | F(AVX512F) | F(AVX512PF) | F(AVX512ER) |
-		F(AVX512CD);
+		F(AVX512CD) | F(CLFLUSHOPT) | F(CLWB) | F(PCOMMIT);
 
 
 	/* cpuid 0xD.1.eax */
 	/* cpuid 0xD.1.eax */
 	const u32 kvm_supported_word10_x86_features =
 	const u32 kvm_supported_word10_x86_features =

+ 37 - 0
arch/x86/kvm/cpuid.h

@@ -133,4 +133,41 @@ static inline bool guest_cpuid_has_mpx(struct kvm_vcpu *vcpu)
 	best = kvm_find_cpuid_entry(vcpu, 7, 0);
 	best = kvm_find_cpuid_entry(vcpu, 7, 0);
 	return best && (best->ebx & bit(X86_FEATURE_MPX));
 	return best && (best->ebx & bit(X86_FEATURE_MPX));
 }
 }
+
+static inline bool guest_cpuid_has_pcommit(struct kvm_vcpu *vcpu)
+{
+	struct kvm_cpuid_entry2 *best;
+
+	best = kvm_find_cpuid_entry(vcpu, 7, 0);
+	return best && (best->ebx & bit(X86_FEATURE_PCOMMIT));
+}
+
+static inline bool guest_cpuid_has_rdtscp(struct kvm_vcpu *vcpu)
+{
+	struct kvm_cpuid_entry2 *best;
+
+	best = kvm_find_cpuid_entry(vcpu, 0x80000001, 0);
+	return best && (best->edx & bit(X86_FEATURE_RDTSCP));
+}
+
+/*
+ * NRIPS is provided through cpuidfn 0x8000000a.edx bit 3
+ */
+#define BIT_NRIPS	3
+
+static inline bool guest_cpuid_has_nrips(struct kvm_vcpu *vcpu)
+{
+	struct kvm_cpuid_entry2 *best;
+
+	best = kvm_find_cpuid_entry(vcpu, 0x8000000a, 0);
+
+	/*
+	 * NRIPS is a scattered cpuid feature, so we can't use
+	 * X86_FEATURE_NRIPS here (X86_FEATURE_NRIPS would be bit
+	 * position 8, not 3).
+	 */
+	return best && (best->edx & bit(BIT_NRIPS));
+}
+#undef BIT_NRIPS
+
 #endif
 #endif

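The guest_cpuid_has_*() helpers added above all follow one pattern: find the relevant CPUID leaf in the guest's table and test a single feature bit in one register. A sketch of that pattern without the KVM types (the guest table below is invented; RDTSCP is bit 27 of leaf 0x80000001 EDX):

#include <stdio.h>
#include <stdint.h>
#include <stddef.h>

#define BIT(x) (1u << (x))

struct cpuid_entry { uint32_t function, index, eax, ebx, ecx, edx; };

/* Tiny stand-in for kvm_find_cpuid_entry(). */
static struct cpuid_entry *find_entry(struct cpuid_entry *tbl, size_t n,
				      uint32_t fn, uint32_t idx)
{
	for (size_t i = 0; i < n; i++)
		if (tbl[i].function == fn && tbl[i].index == idx)
			return &tbl[i];
	return NULL;
}

static int has_rdtscp(struct cpuid_entry *tbl, size_t n)
{
	struct cpuid_entry *best = find_entry(tbl, n, 0x80000001, 0);

	return best && (best->edx & BIT(27));	/* RDTSCP feature bit */
}

int main(void)
{
	struct cpuid_entry guest[] = {
		{ .function = 0x80000001, .edx = BIT(27) },	/* invented leaf contents */
	};

	printf("guest has RDTSCP: %s\n",
	       has_rdtscp(guest, sizeof(guest) / sizeof(guest[0])) ? "yes" : "no");
	return 0;
}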
+ 27 - 8
arch/x86/kvm/emulate.c

@@ -2272,8 +2272,8 @@ static int emulator_has_longmode(struct x86_emulate_ctxt *ctxt)
 #define GET_SMSTATE(type, smbase, offset)				  \
 #define GET_SMSTATE(type, smbase, offset)				  \
 	({								  \
 	({								  \
 	 type __val;							  \
 	 type __val;							  \
-	 int r = ctxt->ops->read_std(ctxt, smbase + offset, &__val,       \
-				     sizeof(__val), NULL);		  \
+	 int r = ctxt->ops->read_phys(ctxt, smbase + offset, &__val,      \
+				      sizeof(__val));			  \
 	 if (r != X86EMUL_CONTINUE)					  \
 	 if (r != X86EMUL_CONTINUE)					  \
 		 return X86EMUL_UNHANDLEABLE;				  \
 		 return X86EMUL_UNHANDLEABLE;				  \
 	 __val;								  \
 	 __val;								  \
@@ -2484,17 +2484,36 @@ static int em_rsm(struct x86_emulate_ctxt *ctxt)
 
 
 	/*
 	/*
 	 * Get back to real mode, to prepare a safe state in which to load
 	 * Get back to real mode, to prepare a safe state in which to load
-	 * CR0/CR3/CR4/EFER.  Also this will ensure that addresses passed
-	 * to read_std/write_std are not virtual.
-	 *
-	 * CR4.PCIDE must be zero, because it is a 64-bit mode only feature.
+	 * CR0/CR3/CR4/EFER.  It's all a bit more complicated if the vCPU
+	 * supports long mode.
 	 */
 	 */
+	cr4 = ctxt->ops->get_cr(ctxt, 4);
+	if (emulator_has_longmode(ctxt)) {
+		struct desc_struct cs_desc;
+
+		/* Zero CR4.PCIDE before CR0.PG.  */
+		if (cr4 & X86_CR4_PCIDE) {
+			ctxt->ops->set_cr(ctxt, 4, cr4 & ~X86_CR4_PCIDE);
+			cr4 &= ~X86_CR4_PCIDE;
+		}
+
+		/* A 32-bit code segment is required to clear EFER.LMA.  */
+		memset(&cs_desc, 0, sizeof(cs_desc));
+		cs_desc.type = 0xb;
+		cs_desc.s = cs_desc.g = cs_desc.p = 1;
+		ctxt->ops->set_segment(ctxt, 0, &cs_desc, 0, VCPU_SREG_CS);
+	}
+
+	/* For the 64-bit case, this will clear EFER.LMA.  */
 	cr0 = ctxt->ops->get_cr(ctxt, 0);
 	if (cr0 & X86_CR0_PE)
 		ctxt->ops->set_cr(ctxt, 0, cr0 & ~(X86_CR0_PG | X86_CR0_PE));
-	cr4 = ctxt->ops->get_cr(ctxt, 4);
+
+	/* Now clear CR4.PAE (which must be done before clearing EFER.LME).  */
 	if (cr4 & X86_CR4_PAE)
 		ctxt->ops->set_cr(ctxt, 4, cr4 & ~X86_CR4_PAE);
+
+	/* And finally go back to 32-bit mode.  */
 	efer = 0;
 	ctxt->ops->set_msr(ctxt, MSR_EFER, efer);
 
 
@@ -4455,7 +4474,7 @@ static const struct opcode twobyte_table[256] = {
 	F(DstMem | SrcReg | Src2CL | ModRM, em_shld), N, N,
 	F(DstMem | SrcReg | Src2CL | ModRM, em_shld), N, N,
 	/* 0xA8 - 0xAF */
 	/* 0xA8 - 0xAF */
 	I(Stack | Src2GS, em_push_sreg), I(Stack | Src2GS, em_pop_sreg),
 	I(Stack | Src2GS, em_push_sreg), I(Stack | Src2GS, em_pop_sreg),
-	II(No64 | EmulateOnUD | ImplicitOps, em_rsm, rsm),
+	II(EmulateOnUD | ImplicitOps, em_rsm, rsm),
 	F(DstMem | SrcReg | ModRM | BitOp | Lock | PageTable, em_bts),
 	F(DstMem | SrcReg | ModRM | BitOp | Lock | PageTable, em_bts),
 	F(DstMem | SrcReg | Src2ImmByte | ModRM, em_shrd),
 	F(DstMem | SrcReg | Src2ImmByte | ModRM, em_shrd),
 	F(DstMem | SrcReg | Src2CL | ModRM, em_shrd),
 	F(DstMem | SrcReg | Src2CL | ModRM, em_shrd),

+ 29 - 2
arch/x86/kvm/hyperv.c

@@ -41,6 +41,7 @@ static bool kvm_hv_msr_partition_wide(u32 msr)
 	case HV_X64_MSR_TIME_REF_COUNT:
 	case HV_X64_MSR_TIME_REF_COUNT:
 	case HV_X64_MSR_CRASH_CTL:
 	case HV_X64_MSR_CRASH_CTL:
 	case HV_X64_MSR_CRASH_P0 ... HV_X64_MSR_CRASH_P4:
 	case HV_X64_MSR_CRASH_P0 ... HV_X64_MSR_CRASH_P4:
+	case HV_X64_MSR_RESET:
 		r = true;
 		r = true;
 		break;
 		break;
 	}
 	}
@@ -163,6 +164,12 @@ static int kvm_hv_set_msr_pw(struct kvm_vcpu *vcpu, u32 msr, u64 data,
 						 data);
 						 data);
 	case HV_X64_MSR_CRASH_CTL:
 	case HV_X64_MSR_CRASH_CTL:
 		return kvm_hv_msr_set_crash_ctl(vcpu, data, host);
 		return kvm_hv_msr_set_crash_ctl(vcpu, data, host);
+	case HV_X64_MSR_RESET:
+		if (data == 1) {
+			vcpu_debug(vcpu, "hyper-v reset requested\n");
+			kvm_make_request(KVM_REQ_HV_RESET, vcpu);
+		}
+		break;
 	default:
 	default:
 		vcpu_unimpl(vcpu, "Hyper-V uhandled wrmsr: 0x%x data 0x%llx\n",
 		vcpu_unimpl(vcpu, "Hyper-V uhandled wrmsr: 0x%x data 0x%llx\n",
 			    msr, data);
 			    msr, data);
@@ -171,7 +178,16 @@ static int kvm_hv_set_msr_pw(struct kvm_vcpu *vcpu, u32 msr, u64 data,
 	return 0;
 	return 0;
 }
 }
 
 
-static int kvm_hv_set_msr(struct kvm_vcpu *vcpu, u32 msr, u64 data)
+/* Calculate cpu time spent by current task in 100ns units */
+static u64 current_task_runtime_100ns(void)
+{
+	cputime_t utime, stime;
+
+	task_cputime_adjusted(current, &utime, &stime);
+	return div_u64(cputime_to_nsecs(utime + stime), 100);
+}
+
+static int kvm_hv_set_msr(struct kvm_vcpu *vcpu, u32 msr, u64 data, bool host)
 {
 {
 	struct kvm_vcpu_hv *hv = &vcpu->arch.hyperv;
 	struct kvm_vcpu_hv *hv = &vcpu->arch.hyperv;
 
 
@@ -205,6 +221,11 @@ static int kvm_hv_set_msr(struct kvm_vcpu *vcpu, u32 msr, u64 data)
 		return kvm_hv_vapic_msr_write(vcpu, APIC_ICR, data);
 		return kvm_hv_vapic_msr_write(vcpu, APIC_ICR, data);
 	case HV_X64_MSR_TPR:
 	case HV_X64_MSR_TPR:
 		return kvm_hv_vapic_msr_write(vcpu, APIC_TASKPRI, data);
 		return kvm_hv_vapic_msr_write(vcpu, APIC_TASKPRI, data);
+	case HV_X64_MSR_VP_RUNTIME:
+		if (!host)
+			return 1;
+		hv->runtime_offset = data - current_task_runtime_100ns();
+		break;
 	default:
 	default:
 		vcpu_unimpl(vcpu, "Hyper-V uhandled wrmsr: 0x%x data 0x%llx\n",
 		vcpu_unimpl(vcpu, "Hyper-V uhandled wrmsr: 0x%x data 0x%llx\n",
 			    msr, data);
 			    msr, data);
@@ -241,6 +262,9 @@ static int kvm_hv_get_msr_pw(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
 						 pdata);
 						 pdata);
 	case HV_X64_MSR_CRASH_CTL:
 	case HV_X64_MSR_CRASH_CTL:
 		return kvm_hv_msr_get_crash_ctl(vcpu, pdata);
 		return kvm_hv_msr_get_crash_ctl(vcpu, pdata);
+	case HV_X64_MSR_RESET:
+		data = 0;
+		break;
 	default:
 	default:
 		vcpu_unimpl(vcpu, "Hyper-V unhandled rdmsr: 0x%x\n", msr);
 		vcpu_unimpl(vcpu, "Hyper-V unhandled rdmsr: 0x%x\n", msr);
 		return 1;
 		return 1;
@@ -277,6 +301,9 @@ static int kvm_hv_get_msr(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
 	case HV_X64_MSR_APIC_ASSIST_PAGE:
 	case HV_X64_MSR_APIC_ASSIST_PAGE:
 		data = hv->hv_vapic;
 		data = hv->hv_vapic;
 		break;
 		break;
+	case HV_X64_MSR_VP_RUNTIME:
+		data = current_task_runtime_100ns() + hv->runtime_offset;
+		break;
 	default:
 	default:
 		vcpu_unimpl(vcpu, "Hyper-V unhandled rdmsr: 0x%x\n", msr);
 		vcpu_unimpl(vcpu, "Hyper-V unhandled rdmsr: 0x%x\n", msr);
 		return 1;
 		return 1;
@@ -295,7 +322,7 @@ int kvm_hv_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data, bool host)
 		mutex_unlock(&vcpu->kvm->lock);
 		mutex_unlock(&vcpu->kvm->lock);
 		return r;
 		return r;
 	} else
 	} else
-		return kvm_hv_set_msr(vcpu, msr, data);
+		return kvm_hv_set_msr(vcpu, msr, data, host);
 }
 }
 
 
 int kvm_hv_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
 int kvm_hv_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)

+ 3 - 1
arch/x86/kvm/i8254.c

@@ -35,6 +35,7 @@
 #include <linux/kvm_host.h>
 #include <linux/kvm_host.h>
 #include <linux/slab.h>
 #include <linux/slab.h>
 
 
+#include "ioapic.h"
 #include "irq.h"
 #include "irq.h"
 #include "i8254.h"
 #include "i8254.h"
 #include "x86.h"
 #include "x86.h"
@@ -333,7 +334,8 @@ static void create_pit_timer(struct kvm *kvm, u32 val, int is_period)
 	struct kvm_kpit_state *ps = &kvm->arch.vpit->pit_state;
 	struct kvm_kpit_state *ps = &kvm->arch.vpit->pit_state;
 	s64 interval;
 	s64 interval;
 
 
-	if (!irqchip_in_kernel(kvm) || ps->flags & KVM_PIT_FLAGS_HPET_LEGACY)
+	if (!ioapic_in_kernel(kvm) ||
+	    ps->flags & KVM_PIT_FLAGS_HPET_LEGACY)
 		return;
 		return;
 
 
 	interval = muldiv64(val, NSEC_PER_SEC, KVM_PIT_FREQ);
 	interval = muldiv64(val, NSEC_PER_SEC, KVM_PIT_FREQ);

+ 6 - 23
arch/x86/kvm/ioapic.c

@@ -233,21 +233,7 @@ static void kvm_ioapic_inject_all(struct kvm_ioapic *ioapic, unsigned long irr)
 }
 }
 
 
 
 
-static void update_handled_vectors(struct kvm_ioapic *ioapic)
-{
-	DECLARE_BITMAP(handled_vectors, 256);
-	int i;
-
-	memset(handled_vectors, 0, sizeof(handled_vectors));
-	for (i = 0; i < IOAPIC_NUM_PINS; ++i)
-		__set_bit(ioapic->redirtbl[i].fields.vector, handled_vectors);
-	memcpy(ioapic->handled_vectors, handled_vectors,
-	       sizeof(handled_vectors));
-	smp_wmb();
-}
-
-void kvm_ioapic_scan_entry(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap,
-			u32 *tmr)
+void kvm_ioapic_scan_entry(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap)
 {
 {
 	struct kvm_ioapic *ioapic = vcpu->kvm->arch.vioapic;
 	struct kvm_ioapic *ioapic = vcpu->kvm->arch.vioapic;
 	union kvm_ioapic_redirect_entry *e;
 	union kvm_ioapic_redirect_entry *e;
@@ -260,13 +246,11 @@ void kvm_ioapic_scan_entry(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap,
 		    kvm_irq_has_notifier(ioapic->kvm, KVM_IRQCHIP_IOAPIC, index) ||
 		    kvm_irq_has_notifier(ioapic->kvm, KVM_IRQCHIP_IOAPIC, index) ||
 		    index == RTC_GSI) {
 		    index == RTC_GSI) {
 			if (kvm_apic_match_dest(vcpu, NULL, 0,
 			if (kvm_apic_match_dest(vcpu, NULL, 0,
-				e->fields.dest_id, e->fields.dest_mode)) {
+			             e->fields.dest_id, e->fields.dest_mode) ||
+			    (e->fields.trig_mode == IOAPIC_EDGE_TRIG &&
+			     kvm_apic_pending_eoi(vcpu, e->fields.vector)))
 				__set_bit(e->fields.vector,
 				__set_bit(e->fields.vector,
 					(unsigned long *)eoi_exit_bitmap);
 					(unsigned long *)eoi_exit_bitmap);
-				if (e->fields.trig_mode == IOAPIC_LEVEL_TRIG)
-					__set_bit(e->fields.vector,
-						(unsigned long *)tmr);
-			}
 		}
 		}
 	}
 	}
 	spin_unlock(&ioapic->lock);
 	spin_unlock(&ioapic->lock);
@@ -315,7 +299,6 @@ static void ioapic_write_indirect(struct kvm_ioapic *ioapic, u32 val)
 			e->bits |= (u32) val;
 			e->bits |= (u32) val;
 			e->fields.remote_irr = 0;
 			e->fields.remote_irr = 0;
 		}
 		}
-		update_handled_vectors(ioapic);
 		mask_after = e->fields.mask;
 		mask_after = e->fields.mask;
 		if (mask_before != mask_after)
 		if (mask_before != mask_after)
 			kvm_fire_mask_notifiers(ioapic->kvm, KVM_IRQCHIP_IOAPIC, index, mask_after);
 			kvm_fire_mask_notifiers(ioapic->kvm, KVM_IRQCHIP_IOAPIC, index, mask_after);
@@ -599,7 +582,6 @@ static void kvm_ioapic_reset(struct kvm_ioapic *ioapic)
 	ioapic->id = 0;
 	ioapic->id = 0;
 	memset(ioapic->irq_eoi, 0x00, IOAPIC_NUM_PINS);
 	memset(ioapic->irq_eoi, 0x00, IOAPIC_NUM_PINS);
 	rtc_irq_eoi_tracking_reset(ioapic);
 	rtc_irq_eoi_tracking_reset(ioapic);
-	update_handled_vectors(ioapic);
 }
 }
 
 
 static const struct kvm_io_device_ops ioapic_mmio_ops = {
 static const struct kvm_io_device_ops ioapic_mmio_ops = {
@@ -628,8 +610,10 @@ int kvm_ioapic_init(struct kvm *kvm)
 	if (ret < 0) {
 	if (ret < 0) {
 		kvm->arch.vioapic = NULL;
 		kvm->arch.vioapic = NULL;
 		kfree(ioapic);
 		kfree(ioapic);
+		return ret;
 	}
 	}
 
 
+	kvm_vcpu_request_scan_ioapic(kvm);
 	return ret;
 	return ret;
 }
 }
 
 
@@ -666,7 +650,6 @@ int kvm_set_ioapic(struct kvm *kvm, struct kvm_ioapic_state *state)
 	memcpy(ioapic, state, sizeof(struct kvm_ioapic_state));
 	memcpy(ioapic, state, sizeof(struct kvm_ioapic_state));
 	ioapic->irr = 0;
 	ioapic->irr = 0;
 	ioapic->irr_delivered = 0;
 	ioapic->irr_delivered = 0;
-	update_handled_vectors(ioapic);
 	kvm_vcpu_request_scan_ioapic(kvm);
 	kvm_vcpu_request_scan_ioapic(kvm);
 	kvm_ioapic_inject_all(ioapic, state->irr);
 	kvm_ioapic_inject_all(ioapic, state->irr);
 	spin_unlock(&ioapic->lock);
 	spin_unlock(&ioapic->lock);

+ 8 - 7
arch/x86/kvm/ioapic.h

@@ -9,6 +9,7 @@ struct kvm;
 struct kvm_vcpu;
 struct kvm_vcpu;
 
 
 #define IOAPIC_NUM_PINS  KVM_IOAPIC_NUM_PINS
 #define IOAPIC_NUM_PINS  KVM_IOAPIC_NUM_PINS
+#define MAX_NR_RESERVED_IOAPIC_PINS KVM_MAX_IRQ_ROUTES
 #define IOAPIC_VERSION_ID 0x11	/* IOAPIC version */
 #define IOAPIC_VERSION_ID 0x11	/* IOAPIC version */
 #define IOAPIC_EDGE_TRIG  0
 #define IOAPIC_EDGE_TRIG  0
 #define IOAPIC_LEVEL_TRIG 1
 #define IOAPIC_LEVEL_TRIG 1
@@ -73,7 +74,6 @@ struct kvm_ioapic {
 	struct kvm *kvm;
 	struct kvm *kvm;
 	void (*ack_notifier)(void *opaque, int irq);
 	void (*ack_notifier)(void *opaque, int irq);
 	spinlock_t lock;
 	spinlock_t lock;
-	DECLARE_BITMAP(handled_vectors, 256);
 	struct rtc_status rtc_status;
 	struct rtc_status rtc_status;
 	struct delayed_work eoi_inject;
 	struct delayed_work eoi_inject;
 	u32 irq_eoi[IOAPIC_NUM_PINS];
 	u32 irq_eoi[IOAPIC_NUM_PINS];
@@ -98,11 +98,12 @@ static inline struct kvm_ioapic *ioapic_irqchip(struct kvm *kvm)
 	return kvm->arch.vioapic;
 	return kvm->arch.vioapic;
 }
 }
 
 
-static inline bool kvm_ioapic_handles_vector(struct kvm *kvm, int vector)
+static inline int ioapic_in_kernel(struct kvm *kvm)
 {
 {
-	struct kvm_ioapic *ioapic = kvm->arch.vioapic;
-	smp_rmb();
-	return test_bit(vector, ioapic->handled_vectors);
+	int ret;
+
+	ret = (ioapic_irqchip(kvm) != NULL);
+	return ret;
 }
 }
 
 
 void kvm_rtc_eoi_tracking_restore_one(struct kvm_vcpu *vcpu);
 void kvm_rtc_eoi_tracking_restore_one(struct kvm_vcpu *vcpu);
@@ -120,7 +121,7 @@ int kvm_irq_delivery_to_apic(struct kvm *kvm, struct kvm_lapic *src,
 		struct kvm_lapic_irq *irq, unsigned long *dest_map);
 		struct kvm_lapic_irq *irq, unsigned long *dest_map);
 int kvm_get_ioapic(struct kvm *kvm, struct kvm_ioapic_state *state);
 int kvm_get_ioapic(struct kvm *kvm, struct kvm_ioapic_state *state);
 int kvm_set_ioapic(struct kvm *kvm, struct kvm_ioapic_state *state);
 int kvm_set_ioapic(struct kvm *kvm, struct kvm_ioapic_state *state);
-void kvm_ioapic_scan_entry(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap,
-			u32 *tmr);
+void kvm_ioapic_scan_entry(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap);
+void kvm_scan_ioapic_routes(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap);
 
 
 #endif
 #endif

+ 30 - 10
arch/x86/kvm/irq.c

@@ -37,15 +37,28 @@ int kvm_cpu_has_pending_timer(struct kvm_vcpu *vcpu)
 }
 }
 EXPORT_SYMBOL(kvm_cpu_has_pending_timer);
 EXPORT_SYMBOL(kvm_cpu_has_pending_timer);
 
 
+/*
+ * check if there is a pending userspace external interrupt
+ */
+static int pending_userspace_extint(struct kvm_vcpu *v)
+{
+	return v->arch.pending_external_vector != -1;
+}
+
 /*
 /*
  * check if there is pending interrupt from
  * check if there is pending interrupt from
  * non-APIC source without intack.
  * non-APIC source without intack.
  */
  */
 static int kvm_cpu_has_extint(struct kvm_vcpu *v)
 static int kvm_cpu_has_extint(struct kvm_vcpu *v)
 {
 {
-	if (kvm_apic_accept_pic_intr(v))
-		return pic_irqchip(v->kvm)->output;	/* PIC */
-	else
+	u8 accept = kvm_apic_accept_pic_intr(v);
+
+	if (accept) {
+		if (irqchip_split(v->kvm))
+			return pending_userspace_extint(v);
+		else
+			return pic_irqchip(v->kvm)->output;
+	} else
 		return 0;
 }
 
 
@@ -57,13 +70,13 @@ static int kvm_cpu_has_extint(struct kvm_vcpu *v)
  */
  */
 int kvm_cpu_has_injectable_intr(struct kvm_vcpu *v)
 int kvm_cpu_has_injectable_intr(struct kvm_vcpu *v)
 {
 {
-	if (!irqchip_in_kernel(v->kvm))
+	if (!lapic_in_kernel(v))
 		return v->arch.interrupt.pending;
 		return v->arch.interrupt.pending;
 
 
 	if (kvm_cpu_has_extint(v))
 	if (kvm_cpu_has_extint(v))
 		return 1;
 		return 1;
 
 
-	if (kvm_apic_vid_enabled(v->kvm))
+	if (kvm_vcpu_apic_vid_enabled(v))
 		return 0;
 		return 0;
 
 
 	return kvm_apic_has_interrupt(v) != -1; /* LAPIC */
 	return kvm_apic_has_interrupt(v) != -1; /* LAPIC */
@@ -75,7 +88,7 @@ int kvm_cpu_has_injectable_intr(struct kvm_vcpu *v)
  */
  */
 int kvm_cpu_has_interrupt(struct kvm_vcpu *v)
 int kvm_cpu_has_interrupt(struct kvm_vcpu *v)
 {
 {
-	if (!irqchip_in_kernel(v->kvm))
+	if (!lapic_in_kernel(v))
 		return v->arch.interrupt.pending;
 		return v->arch.interrupt.pending;
 
 
 	if (kvm_cpu_has_extint(v))
 	if (kvm_cpu_has_extint(v))
@@ -91,9 +104,16 @@ EXPORT_SYMBOL_GPL(kvm_cpu_has_interrupt);
  */
  */
 static int kvm_cpu_get_extint(struct kvm_vcpu *v)
 static int kvm_cpu_get_extint(struct kvm_vcpu *v)
 {
 {
-	if (kvm_cpu_has_extint(v))
-		return kvm_pic_read_irq(v->kvm); /* PIC */
-	return -1;
+	if (kvm_cpu_has_extint(v)) {
+		if (irqchip_split(v->kvm)) {
+			int vector = v->arch.pending_external_vector;
+
+			v->arch.pending_external_vector = -1;
+			return vector;
+		} else
+			return kvm_pic_read_irq(v->kvm); /* PIC */
+	} else
+		return -1;
 }
 
 
 /*
 /*
@@ -103,7 +123,7 @@ int kvm_cpu_get_interrupt(struct kvm_vcpu *v)
 {
 {
 	int vector;
 	int vector;
 
 
-	if (!irqchip_in_kernel(v->kvm))
+	if (!lapic_in_kernel(v))
 		return v->arch.interrupt.nr;
 		return v->arch.interrupt.nr;
 
 
 	vector = kvm_cpu_get_extint(v);
 	vector = kvm_cpu_get_extint(v);

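With a split irqchip the PIC lives in userspace, so an ExtINT shows up as a vector parked in pending_external_vector; kvm_cpu_get_extint() hands it out once and resets the slot to -1. In miniature (the struct and vector value are illustrative only):

#include <stdio.h>

struct vcpu { int pending_external_vector; };

static int has_extint(struct vcpu *v) { return v->pending_external_vector != -1; }

/* Consume the parked vector exactly once, mirroring the split-irqchip
 * branch of kvm_cpu_get_extint(). */
static int get_extint(struct vcpu *v)
{
	if (!has_extint(v))
		return -1;

	int vector = v->pending_external_vector;
	v->pending_external_vector = -1;
	return vector;
}

int main(void)
{
	struct vcpu v = { .pending_external_vector = 0x20 };	/* example vector */

	printf("first read : %d\n", get_extint(&v));	/* 32 */
	printf("second read: %d\n", get_extint(&v));	/* -1, already consumed */
	return 0;
}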
+ 26 - 1
arch/x86/kvm/irq.h

@@ -83,13 +83,38 @@ static inline struct kvm_pic *pic_irqchip(struct kvm *kvm)
 	return kvm->arch.vpic;
 	return kvm->arch.vpic;
 }
 }
 
 
+static inline int pic_in_kernel(struct kvm *kvm)
+{
+	int ret;
+
+	ret = (pic_irqchip(kvm) != NULL);
+	return ret;
+}
+
+static inline int irqchip_split(struct kvm *kvm)
+{
+	return kvm->arch.irqchip_split;
+}
+
 static inline int irqchip_in_kernel(struct kvm *kvm)
 static inline int irqchip_in_kernel(struct kvm *kvm)
 {
 {
 	struct kvm_pic *vpic = pic_irqchip(kvm);
 	struct kvm_pic *vpic = pic_irqchip(kvm);
+	bool ret;
+
+	ret = (vpic != NULL);
+	ret |= irqchip_split(kvm);
 
 
 	/* Read vpic before kvm->irq_routing.  */
 	/* Read vpic before kvm->irq_routing.  */
 	smp_rmb();
 	smp_rmb();
-	return vpic != NULL;
+	return ret;
+}
+
+static inline int lapic_in_kernel(struct kvm_vcpu *vcpu)
+{
+	/* Same as irqchip_in_kernel(vcpu->kvm), but with less
+	 * pointer chasing and no unnecessary memory barriers.
+	 */
+	return vcpu->arch.apic != NULL;
 }
 }
 
 
 void kvm_pic_reset(struct kvm_kpic_state *s);
 void kvm_pic_reset(struct kvm_kpic_state *s);

+ 88 - 41
arch/x86/kvm/irq_comm.c

@@ -91,8 +91,8 @@ int kvm_irq_delivery_to_apic(struct kvm *kvm, struct kvm_lapic *src,
 	return r;
 	return r;
 }
 }
 
 
-static inline void kvm_set_msi_irq(struct kvm_kernel_irq_routing_entry *e,
-				   struct kvm_lapic_irq *irq)
+void kvm_set_msi_irq(struct kvm_kernel_irq_routing_entry *e,
+		     struct kvm_lapic_irq *irq)
 {
 {
 	trace_kvm_msi_set_irq(e->msi.address_lo, e->msi.data);
 	trace_kvm_msi_set_irq(e->msi.address_lo, e->msi.data);
 
 
@@ -108,6 +108,7 @@ static inline void kvm_set_msi_irq(struct kvm_kernel_irq_routing_entry *e,
 	irq->level = 1;
 	irq->level = 1;
 	irq->shorthand = 0;
 	irq->shorthand = 0;
 }
 }
+EXPORT_SYMBOL_GPL(kvm_set_msi_irq);
 
 
 int kvm_set_msi(struct kvm_kernel_irq_routing_entry *e,
 int kvm_set_msi(struct kvm_kernel_irq_routing_entry *e,
 		struct kvm *kvm, int irq_source_id, int level, bool line_status)
 		struct kvm *kvm, int irq_source_id, int level, bool line_status)
@@ -123,12 +124,16 @@ int kvm_set_msi(struct kvm_kernel_irq_routing_entry *e,
 }
 }
 
 
 
 
-static int kvm_set_msi_inatomic(struct kvm_kernel_irq_routing_entry *e,
-			 struct kvm *kvm)
+int kvm_arch_set_irq_inatomic(struct kvm_kernel_irq_routing_entry *e,
+			      struct kvm *kvm, int irq_source_id, int level,
+			      bool line_status)
 {
 {
 	struct kvm_lapic_irq irq;
 	struct kvm_lapic_irq irq;
 	int r;
 	int r;
 
 
+	if (unlikely(e->type != KVM_IRQ_ROUTING_MSI))
+		return -EWOULDBLOCK;
+
 	kvm_set_msi_irq(e, &irq);
 	kvm_set_msi_irq(e, &irq);
 
 
 	if (kvm_irq_delivery_to_apic_fast(kvm, NULL, &irq, &r, NULL))
 	if (kvm_irq_delivery_to_apic_fast(kvm, NULL, &irq, &r, NULL))
@@ -137,42 +142,6 @@ static int kvm_set_msi_inatomic(struct kvm_kernel_irq_routing_entry *e,
 		return -EWOULDBLOCK;
 		return -EWOULDBLOCK;
 }
 }
 
 
-/*
- * Deliver an IRQ in an atomic context if we can, or return a failure,
- * user can retry in a process context.
- * Return value:
- *  -EWOULDBLOCK - Can't deliver in atomic context: retry in a process context.
- *  Other values - No need to retry.
- */
-int kvm_set_irq_inatomic(struct kvm *kvm, int irq_source_id, u32 irq, int level)
-{
-	struct kvm_kernel_irq_routing_entry entries[KVM_NR_IRQCHIPS];
-	struct kvm_kernel_irq_routing_entry *e;
-	int ret = -EINVAL;
-	int idx;
-
-	trace_kvm_set_irq(irq, level, irq_source_id);
-
-	/*
-	 * Injection into either PIC or IOAPIC might need to scan all CPUs,
-	 * which would need to be retried from thread context;  when same GSI
-	 * is connected to both PIC and IOAPIC, we'd have to report a
-	 * partial failure here.
-	 * Since there's no easy way to do this, we only support injecting MSI
-	 * which is limited to 1:1 GSI mapping.
-	 */
-	idx = srcu_read_lock(&kvm->irq_srcu);
-	if (kvm_irq_map_gsi(kvm, entries, irq) > 0) {
-		e = &entries[0];
-		if (likely(e->type == KVM_IRQ_ROUTING_MSI))
-			ret = kvm_set_msi_inatomic(e, kvm);
-		else
-			ret = -EWOULDBLOCK;
-	}
-	srcu_read_unlock(&kvm->irq_srcu, idx);
-	return ret;
-}
-
 int kvm_request_irq_source_id(struct kvm *kvm)
 int kvm_request_irq_source_id(struct kvm *kvm)
 {
 {
 	unsigned long *bitmap = &kvm->arch.irq_sources_bitmap;
 	unsigned long *bitmap = &kvm->arch.irq_sources_bitmap;
@@ -208,7 +177,7 @@ void kvm_free_irq_source_id(struct kvm *kvm, int irq_source_id)
 		goto unlock;
 		goto unlock;
 	}
 	}
 	clear_bit(irq_source_id, &kvm->arch.irq_sources_bitmap);
 	clear_bit(irq_source_id, &kvm->arch.irq_sources_bitmap);
-	if (!irqchip_in_kernel(kvm))
+	if (!ioapic_in_kernel(kvm))
 		goto unlock;
 		goto unlock;
 
 
 	kvm_ioapic_clear_all(kvm->arch.vioapic, irq_source_id);
 	kvm_ioapic_clear_all(kvm->arch.vioapic, irq_source_id);
@@ -297,6 +266,33 @@ out:
 	return r;
 	return r;
 }
 }
 
 
+bool kvm_intr_is_single_vcpu(struct kvm *kvm, struct kvm_lapic_irq *irq,
+			     struct kvm_vcpu **dest_vcpu)
+{
+	int i, r = 0;
+	struct kvm_vcpu *vcpu;
+
+	if (kvm_intr_is_single_vcpu_fast(kvm, irq, dest_vcpu))
+		return true;
+
+	kvm_for_each_vcpu(i, vcpu, kvm) {
+		if (!kvm_apic_present(vcpu))
+			continue;
+
+		if (!kvm_apic_match_dest(vcpu, NULL, irq->shorthand,
+					irq->dest_id, irq->dest_mode))
+			continue;
+
+		if (++r == 2)
+			return false;
+
+		*dest_vcpu = vcpu;
+	}
+
+	return r == 1;
+}
+EXPORT_SYMBOL_GPL(kvm_intr_is_single_vcpu);
+
 #define IOAPIC_ROUTING_ENTRY(irq) \
 #define IOAPIC_ROUTING_ENTRY(irq) \
 	{ .gsi = irq, .type = KVM_IRQ_ROUTING_IRQCHIP,	\
 	{ .gsi = irq, .type = KVM_IRQ_ROUTING_IRQCHIP,	\
 	  .u.irqchip = { .irqchip = KVM_IRQCHIP_IOAPIC, .pin = (irq) } }
 	  .u.irqchip = { .irqchip = KVM_IRQCHIP_IOAPIC, .pin = (irq) } }
@@ -328,3 +324,54 @@ int kvm_setup_default_irq_routing(struct kvm *kvm)
 	return kvm_set_irq_routing(kvm, default_routing,
 	return kvm_set_irq_routing(kvm, default_routing,
 				   ARRAY_SIZE(default_routing), 0);
 				   ARRAY_SIZE(default_routing), 0);
 }
 }
+
+static const struct kvm_irq_routing_entry empty_routing[] = {};
+
+int kvm_setup_empty_irq_routing(struct kvm *kvm)
+{
+	return kvm_set_irq_routing(kvm, empty_routing, 0, 0);
+}
+
+void kvm_arch_irq_routing_update(struct kvm *kvm)
+{
+	if (ioapic_in_kernel(kvm) || !irqchip_in_kernel(kvm))
+		return;
+	kvm_make_scan_ioapic_request(kvm);
+}
+
+void kvm_scan_ioapic_routes(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap)
+{
+	struct kvm *kvm = vcpu->kvm;
+	struct kvm_kernel_irq_routing_entry *entry;
+	struct kvm_irq_routing_table *table;
+	u32 i, nr_ioapic_pins;
+	int idx;
+
+	/* kvm->irq_routing must be read after clearing
+	 * KVM_SCAN_IOAPIC. */
+	smp_mb();
+	idx = srcu_read_lock(&kvm->irq_srcu);
+	table = srcu_dereference(kvm->irq_routing, &kvm->irq_srcu);
+	nr_ioapic_pins = min_t(u32, table->nr_rt_entries,
+			       kvm->arch.nr_reserved_ioapic_pins);
+	for (i = 0; i < nr_ioapic_pins; ++i) {
+		hlist_for_each_entry(entry, &table->map[i], link) {
+			u32 dest_id, dest_mode;
+			bool level;
+
+			if (entry->type != KVM_IRQ_ROUTING_MSI)
+				continue;
+			dest_id = (entry->msi.address_lo >> 12) & 0xff;
+			dest_mode = (entry->msi.address_lo >> 2) & 0x1;
+			level = entry->msi.data & MSI_DATA_TRIGGER_LEVEL;
+			if (level && kvm_apic_match_dest(vcpu, NULL, 0,
+						dest_id, dest_mode)) {
+				u32 vector = entry->msi.data & 0xff;
+
+				__set_bit(vector,
+					  (unsigned long *) eoi_exit_bitmap);
+			}
+		}
+	}
+	srcu_read_unlock(&kvm->irq_srcu, idx);
+}
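
kvm_scan_ioapic_routes() above pulls the destination ID (address bits 19:12), destination mode (address bit 2) and vector (data bits 7:0) out of each MSI routing entry and, for level-triggered entries that target this vCPU, sets the vector in a 256-bit EOI-exit bitmap. A standalone sketch of that field extraction, assuming a plain struct in place of the KVM routing entry:

/* Standalone sketch of the MSI field extraction done in
 * kvm_scan_ioapic_routes() above.  The entry layout mirrors the x86 MSI
 * address/data format; the helpers are illustrative only. */
#include <stdint.h>
#include <stdio.h>

#define MSI_DATA_TRIGGER_LEVEL	(1u << 15)

struct toy_msi {
	uint32_t address_lo;
	uint32_t data;
};

static void set_eoi_exit_bit(uint64_t bitmap[4], unsigned int vector)
{
	bitmap[vector / 64] |= 1ull << (vector % 64);
}

int main(void)
{
	/* Level-triggered MSI to physical APIC ID 3, vector 0xb1. */
	struct toy_msi e = {
		.address_lo = 0xfee00000u | (3u << 12),
		.data       = MSI_DATA_TRIGGER_LEVEL | 0xb1,
	};
	uint64_t eoi_exit_bitmap[4] = { 0 };

	uint32_t dest_id   = (e.address_lo >> 12) & 0xff;
	uint32_t dest_mode = (e.address_lo >> 2) & 0x1;
	uint32_t vector    = e.data & 0xff;
	int level          = !!(e.data & MSI_DATA_TRIGGER_LEVEL);

	if (level)
		set_eoi_exit_bit(eoi_exit_bitmap, vector);

	printf("dest_id=%u dest_mode=%u vector=0x%x level=%d word2=0x%llx\n",
	       dest_id, dest_mode, vector, level,
	       (unsigned long long)eoi_exit_bitmap[vector / 64]);
	return 0;
}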

+ 104 - 23
arch/x86/kvm/lapic.c

@@ -209,7 +209,7 @@ out:
 	if (old)
 	if (old)
 		kfree_rcu(old, rcu);
 		kfree_rcu(old, rcu);
 
 
-	kvm_vcpu_request_scan_ioapic(kvm);
+	kvm_make_scan_ioapic_request(kvm);
 }
 }
 
 
 static inline void apic_set_spiv(struct kvm_lapic *apic, u32 val)
 static inline void apic_set_spiv(struct kvm_lapic *apic, u32 val)
@@ -348,6 +348,8 @@ void kvm_apic_update_irr(struct kvm_vcpu *vcpu, u32 *pir)
 	struct kvm_lapic *apic = vcpu->arch.apic;
 	struct kvm_lapic *apic = vcpu->arch.apic;
 
 
 	__kvm_apic_update_irr(pir, apic->regs);
 	__kvm_apic_update_irr(pir, apic->regs);
+
+	kvm_make_request(KVM_REQ_EVENT, vcpu);
 }
 }
 EXPORT_SYMBOL_GPL(kvm_apic_update_irr);
 EXPORT_SYMBOL_GPL(kvm_apic_update_irr);
 
 
@@ -390,7 +392,7 @@ static inline void apic_clear_irr(int vec, struct kvm_lapic *apic)
 
 
 	vcpu = apic->vcpu;
 	vcpu = apic->vcpu;
 
 
-	if (unlikely(kvm_apic_vid_enabled(vcpu->kvm))) {
+	if (unlikely(kvm_vcpu_apic_vid_enabled(vcpu))) {
 		/* try to update RVI */
 		/* try to update RVI */
 		apic_clear_vector(vec, apic->regs + APIC_IRR);
 		apic_clear_vector(vec, apic->regs + APIC_IRR);
 		kvm_make_request(KVM_REQ_EVENT, vcpu);
 		kvm_make_request(KVM_REQ_EVENT, vcpu);
@@ -551,15 +553,6 @@ static void pv_eoi_clr_pending(struct kvm_vcpu *vcpu)
 	__clear_bit(KVM_APIC_PV_EOI_PENDING, &vcpu->arch.apic_attention);
 	__clear_bit(KVM_APIC_PV_EOI_PENDING, &vcpu->arch.apic_attention);
 }
 }
 
 
-void kvm_apic_update_tmr(struct kvm_vcpu *vcpu, u32 *tmr)
-{
-	struct kvm_lapic *apic = vcpu->arch.apic;
-	int i;
-
-	for (i = 0; i < 8; i++)
-		apic_set_reg(apic, APIC_TMR + 0x10 * i, tmr[i]);
-}
-
 static void apic_update_ppr(struct kvm_lapic *apic)
 static void apic_update_ppr(struct kvm_lapic *apic)
 {
 {
 	u32 tpr, isrv, ppr, old_ppr;
 	u32 tpr, isrv, ppr, old_ppr;
@@ -764,6 +757,65 @@ out:
 	return ret;
 	return ret;
 }
 }
 
 
+bool kvm_intr_is_single_vcpu_fast(struct kvm *kvm, struct kvm_lapic_irq *irq,
+			struct kvm_vcpu **dest_vcpu)
+{
+	struct kvm_apic_map *map;
+	bool ret = false;
+	struct kvm_lapic *dst = NULL;
+
+	if (irq->shorthand)
+		return false;
+
+	rcu_read_lock();
+	map = rcu_dereference(kvm->arch.apic_map);
+
+	if (!map)
+		goto out;
+
+	if (irq->dest_mode == APIC_DEST_PHYSICAL) {
+		if (irq->dest_id == 0xFF)
+			goto out;
+
+		if (irq->dest_id >= ARRAY_SIZE(map->phys_map))
+			goto out;
+
+		dst = map->phys_map[irq->dest_id];
+		if (dst && kvm_apic_present(dst->vcpu))
+			*dest_vcpu = dst->vcpu;
+		else
+			goto out;
+	} else {
+		u16 cid;
+		unsigned long bitmap = 1;
+		int i, r = 0;
+
+		if (!kvm_apic_logical_map_valid(map))
+			goto out;
+
+		apic_logical_id(map, irq->dest_id, &cid, (u16 *)&bitmap);
+
+		if (cid >= ARRAY_SIZE(map->logical_map))
+			goto out;
+
+		for_each_set_bit(i, &bitmap, 16) {
+			dst = map->logical_map[cid][i];
+			if (++r == 2)
+				goto out;
+		}
+
+		if (dst && kvm_apic_present(dst->vcpu))
+			*dest_vcpu = dst->vcpu;
+		else
+			goto out;
+	}
+
+	ret = true;
+out:
+	rcu_read_unlock();
+	return ret;
+}
+
 /*
 /*
  * Add a pending IRQ into lapic.
  * Add a pending IRQ into lapic.
  * Return 1 if successfully added and 0 if discarded.
  * Return 1 if successfully added and 0 if discarded.
@@ -781,6 +833,9 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode,
 	case APIC_DM_LOWEST:
 	case APIC_DM_LOWEST:
 		vcpu->arch.apic_arb_prio++;
 		vcpu->arch.apic_arb_prio++;
 	case APIC_DM_FIXED:
 	case APIC_DM_FIXED:
+		if (unlikely(trig_mode && !level))
+			break;
+
 		/* FIXME add logic for vcpu on reset */
 		/* FIXME add logic for vcpu on reset */
 		if (unlikely(!apic_enabled(apic)))
 		if (unlikely(!apic_enabled(apic)))
 			break;
 			break;
@@ -790,6 +845,13 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode,
 		if (dest_map)
 		if (dest_map)
 			__set_bit(vcpu->vcpu_id, dest_map);
 			__set_bit(vcpu->vcpu_id, dest_map);
 
 
+		if (apic_test_vector(vector, apic->regs + APIC_TMR) != !!trig_mode) {
+			if (trig_mode)
+				apic_set_vector(vector, apic->regs + APIC_TMR);
+			else
+				apic_clear_vector(vector, apic->regs + APIC_TMR);
+		}
+
 		if (kvm_x86_ops->deliver_posted_interrupt)
 		if (kvm_x86_ops->deliver_posted_interrupt)
 			kvm_x86_ops->deliver_posted_interrupt(vcpu, vector);
 			kvm_x86_ops->deliver_posted_interrupt(vcpu, vector);
 		else {
 		else {
@@ -868,16 +930,32 @@ int kvm_apic_compare_prio(struct kvm_vcpu *vcpu1, struct kvm_vcpu *vcpu2)
 	return vcpu1->arch.apic_arb_prio - vcpu2->arch.apic_arb_prio;
 	return vcpu1->arch.apic_arb_prio - vcpu2->arch.apic_arb_prio;
 }
 }
 
 
+static bool kvm_ioapic_handles_vector(struct kvm_lapic *apic, int vector)
+{
+	return test_bit(vector, (ulong *)apic->vcpu->arch.eoi_exit_bitmap);
+}
+
 static void kvm_ioapic_send_eoi(struct kvm_lapic *apic, int vector)
 static void kvm_ioapic_send_eoi(struct kvm_lapic *apic, int vector)
 {
 {
-	if (kvm_ioapic_handles_vector(apic->vcpu->kvm, vector)) {
-		int trigger_mode;
-		if (apic_test_vector(vector, apic->regs + APIC_TMR))
-			trigger_mode = IOAPIC_LEVEL_TRIG;
-		else
-			trigger_mode = IOAPIC_EDGE_TRIG;
-		kvm_ioapic_update_eoi(apic->vcpu, vector, trigger_mode);
+	int trigger_mode;
+
+	/* Eoi the ioapic only if the ioapic doesn't own the vector. */
+	if (!kvm_ioapic_handles_vector(apic, vector))
+		return;
+
+	/* Request a KVM exit to inform the userspace IOAPIC. */
+	if (irqchip_split(apic->vcpu->kvm)) {
+		apic->vcpu->arch.pending_ioapic_eoi = vector;
+		kvm_make_request(KVM_REQ_IOAPIC_EOI_EXIT, apic->vcpu);
+		return;
 	}
 	}
+
+	if (apic_test_vector(vector, apic->regs + APIC_TMR))
+		trigger_mode = IOAPIC_LEVEL_TRIG;
+	else
+		trigger_mode = IOAPIC_EDGE_TRIG;
+
+	kvm_ioapic_update_eoi(apic->vcpu, vector, trigger_mode);
 }
 }
 
 
 static int apic_set_eoi(struct kvm_lapic *apic)
 static int apic_set_eoi(struct kvm_lapic *apic)
@@ -1615,7 +1693,7 @@ void kvm_lapic_reset(struct kvm_vcpu *vcpu, bool init_event)
 		apic_set_reg(apic, APIC_ISR + 0x10 * i, 0);
 		apic_set_reg(apic, APIC_ISR + 0x10 * i, 0);
 		apic_set_reg(apic, APIC_TMR + 0x10 * i, 0);
 		apic_set_reg(apic, APIC_TMR + 0x10 * i, 0);
 	}
 	}
-	apic->irr_pending = kvm_apic_vid_enabled(vcpu->kvm);
+	apic->irr_pending = kvm_vcpu_apic_vid_enabled(vcpu);
 	apic->isr_count = kvm_x86_ops->hwapic_isr_update ? 1 : 0;
 	apic->isr_count = kvm_x86_ops->hwapic_isr_update ? 1 : 0;
 	apic->highest_isr_cache = -1;
 	apic->highest_isr_cache = -1;
 	update_divide_count(apic);
 	update_divide_count(apic);
@@ -1838,7 +1916,10 @@ void kvm_apic_post_state_restore(struct kvm_vcpu *vcpu,
 		kvm_x86_ops->hwapic_isr_update(vcpu->kvm,
 		kvm_x86_ops->hwapic_isr_update(vcpu->kvm,
 				apic_find_highest_isr(apic));
 				apic_find_highest_isr(apic));
 	kvm_make_request(KVM_REQ_EVENT, vcpu);
 	kvm_make_request(KVM_REQ_EVENT, vcpu);
-	kvm_rtc_eoi_tracking_restore_one(vcpu);
+	if (ioapic_in_kernel(vcpu->kvm))
+		kvm_rtc_eoi_tracking_restore_one(vcpu);
+
+	vcpu->arch.apic_arb_prio = 0;
 }
 }
 
 
 void __kvm_migrate_apic_timer(struct kvm_vcpu *vcpu)
 void __kvm_migrate_apic_timer(struct kvm_vcpu *vcpu)
@@ -1922,7 +2003,7 @@ static void apic_sync_pv_eoi_to_guest(struct kvm_vcpu *vcpu,
 	    /* Cache not set: could be safe but we don't bother. */
 	    /* Cache not set: could be safe but we don't bother. */
 	    apic->highest_isr_cache == -1 ||
 	    apic->highest_isr_cache == -1 ||
 	    /* Need EOI to update ioapic. */
 	    /* Need EOI to update ioapic. */
-	    kvm_ioapic_handles_vector(vcpu->kvm, apic->highest_isr_cache)) {
+	    kvm_ioapic_handles_vector(apic, apic->highest_isr_cache)) {
 		/*
 		/*
 		 * PV EOI was disabled by apic_sync_pv_eoi_from_guest
 		 * PV EOI was disabled by apic_sync_pv_eoi_from_guest
 		 * so we need not do anything here.
 		 * so we need not do anything here.
@@ -1978,7 +2059,7 @@ int kvm_x2apic_msr_write(struct kvm_vcpu *vcpu, u32 msr, u64 data)
 	struct kvm_lapic *apic = vcpu->arch.apic;
 	struct kvm_lapic *apic = vcpu->arch.apic;
 	u32 reg = (msr - APIC_BASE_MSR) << 4;
 	u32 reg = (msr - APIC_BASE_MSR) << 4;
 
 
-	if (!irqchip_in_kernel(vcpu->kvm) || !apic_x2apic_mode(apic))
+	if (!lapic_in_kernel(vcpu) || !apic_x2apic_mode(apic))
 		return 1;
 		return 1;
 
 
 	if (reg == APIC_ICR2)
 	if (reg == APIC_ICR2)
@@ -1995,7 +2076,7 @@ int kvm_x2apic_msr_read(struct kvm_vcpu *vcpu, u32 msr, u64 *data)
 	struct kvm_lapic *apic = vcpu->arch.apic;
 	struct kvm_lapic *apic = vcpu->arch.apic;
 	u32 reg = (msr - APIC_BASE_MSR) << 4, low, high = 0;
 	u32 reg = (msr - APIC_BASE_MSR) << 4, low, high = 0;
 
 
-	if (!irqchip_in_kernel(vcpu->kvm) || !apic_x2apic_mode(apic))
+	if (!lapic_in_kernel(vcpu) || !apic_x2apic_mode(apic))
 		return 1;
 		return 1;
 
 
 	if (reg == APIC_DFR || reg == APIC_ICR2) {
 	if (reg == APIC_DFR || reg == APIC_ICR2) {
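
kvm_intr_is_single_vcpu() above falls back to a linear scan and succeeds only when exactly one vCPU matches the destination, bailing out as soon as a second match is found. The sketch below shows that "exactly one" pattern with an illustrative match predicate standing in for kvm_apic_match_dest():

/* Sketch of the "exactly one match" scan used by kvm_intr_is_single_vcpu()
 * above.  The vCPU array and the match predicate are illustrative. */
#include <stdbool.h>
#include <stdio.h>

struct toy_vcpu { int apic_id; };

static bool matches_dest(const struct toy_vcpu *v, int dest_id)
{
	return v->apic_id == dest_id;	/* stand-in for kvm_apic_match_dest() */
}

static bool intr_is_single_vcpu(struct toy_vcpu *vcpus, int n, int dest_id,
				struct toy_vcpu **dest)
{
	int matches = 0;

	for (int i = 0; i < n; i++) {
		if (!matches_dest(&vcpus[i], dest_id))
			continue;
		if (++matches == 2)
			return false;	/* more than one target: not "single" */
		*dest = &vcpus[i];
	}
	return matches == 1;
}

int main(void)
{
	struct toy_vcpu vcpus[] = { { 0 }, { 1 }, { 2 }, { 1 } };
	struct toy_vcpu *dest = NULL;

	printf("dest 2 single: %d\n", intr_is_single_vcpu(vcpus, 4, 2, &dest)); /* 1 */
	printf("dest 1 single: %d\n", intr_is_single_vcpu(vcpus, 4, 1, &dest)); /* 0 */
	printf("dest 7 single: %d\n", intr_is_single_vcpu(vcpus, 4, 7, &dest)); /* 0 */
	return 0;
}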

+ 4 - 3
arch/x86/kvm/lapic.h

@@ -57,7 +57,6 @@ void kvm_lapic_set_base(struct kvm_vcpu *vcpu, u64 value);
 u64 kvm_lapic_get_base(struct kvm_vcpu *vcpu);
 void kvm_apic_set_version(struct kvm_vcpu *vcpu);

-void kvm_apic_update_tmr(struct kvm_vcpu *vcpu, u32 *tmr);
 void __kvm_apic_update_irr(u32 *pir, void *regs);
 void kvm_apic_update_irr(struct kvm_vcpu *vcpu, u32 *pir);
 int kvm_apic_set_irq(struct kvm_vcpu *vcpu, struct kvm_lapic_irq *irq,
@@ -144,9 +143,9 @@ static inline int apic_x2apic_mode(struct kvm_lapic *apic)
 	return apic->vcpu->arch.apic_base & X2APIC_ENABLE;
 }

-static inline bool kvm_apic_vid_enabled(struct kvm *kvm)
+static inline bool kvm_vcpu_apic_vid_enabled(struct kvm_vcpu *vcpu)
 {
-	return kvm_x86_ops->vm_has_apicv(kvm);
+	return kvm_x86_ops->cpu_uses_apicv(vcpu);
 }

 static inline bool kvm_apic_has_events(struct kvm_vcpu *vcpu)
@@ -169,4 +168,6 @@ bool kvm_apic_pending_eoi(struct kvm_vcpu *vcpu, int vector);

 void wait_lapic_expire(struct kvm_vcpu *vcpu);

+bool kvm_intr_is_single_vcpu_fast(struct kvm *kvm, struct kvm_lapic_irq *irq,
+			struct kvm_vcpu **dest_vcpu);
 #endif

+ 53 - 38
arch/x86/kvm/mmu.c

@@ -818,14 +818,11 @@ static void unaccount_shadowed(struct kvm *kvm, struct kvm_mmu_page *sp)
 	kvm->arch.indirect_shadow_pages--;
 	kvm->arch.indirect_shadow_pages--;
 }
 }
 
 
-static int has_wrprotected_page(struct kvm_vcpu *vcpu,
-				gfn_t gfn,
-				int level)
+static int __has_wrprotected_page(gfn_t gfn, int level,
+				  struct kvm_memory_slot *slot)
 {
 {
-	struct kvm_memory_slot *slot;
 	struct kvm_lpage_info *linfo;
 	struct kvm_lpage_info *linfo;
 
 
-	slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
 	if (slot) {
 	if (slot) {
 		linfo = lpage_info_slot(gfn, slot, level);
 		linfo = lpage_info_slot(gfn, slot, level);
 		return linfo->write_count;
 		return linfo->write_count;
@@ -834,6 +831,14 @@ static int has_wrprotected_page(struct kvm_vcpu *vcpu,
 	return 1;
 	return 1;
 }
 }
 
 
+static int has_wrprotected_page(struct kvm_vcpu *vcpu, gfn_t gfn, int level)
+{
+	struct kvm_memory_slot *slot;
+
+	slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
+	return __has_wrprotected_page(gfn, level, slot);
+}
+
 static int host_mapping_level(struct kvm *kvm, gfn_t gfn)
 static int host_mapping_level(struct kvm *kvm, gfn_t gfn)
 {
 {
 	unsigned long page_size;
 	unsigned long page_size;
@@ -851,6 +856,17 @@ static int host_mapping_level(struct kvm *kvm, gfn_t gfn)
 	return ret;
 	return ret;
 }
 }
 
 
+static inline bool memslot_valid_for_gpte(struct kvm_memory_slot *slot,
+					  bool no_dirty_log)
+{
+	if (!slot || slot->flags & KVM_MEMSLOT_INVALID)
+		return false;
+	if (no_dirty_log && slot->dirty_bitmap)
+		return false;
+
+	return true;
+}
+
 static struct kvm_memory_slot *
 static struct kvm_memory_slot *
 gfn_to_memslot_dirty_bitmap(struct kvm_vcpu *vcpu, gfn_t gfn,
 gfn_to_memslot_dirty_bitmap(struct kvm_vcpu *vcpu, gfn_t gfn,
 			    bool no_dirty_log)
 			    bool no_dirty_log)
@@ -858,21 +874,25 @@ gfn_to_memslot_dirty_bitmap(struct kvm_vcpu *vcpu, gfn_t gfn,
 	struct kvm_memory_slot *slot;
 	struct kvm_memory_slot *slot;
 
 
 	slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
 	slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
-	if (!slot || slot->flags & KVM_MEMSLOT_INVALID ||
-	      (no_dirty_log && slot->dirty_bitmap))
+	if (!memslot_valid_for_gpte(slot, no_dirty_log))
 		slot = NULL;
 		slot = NULL;
 
 
 	return slot;
 	return slot;
 }
 }
 
 
-static bool mapping_level_dirty_bitmap(struct kvm_vcpu *vcpu, gfn_t large_gfn)
-{
-	return !gfn_to_memslot_dirty_bitmap(vcpu, large_gfn, true);
-}
-
-static int mapping_level(struct kvm_vcpu *vcpu, gfn_t large_gfn)
+static int mapping_level(struct kvm_vcpu *vcpu, gfn_t large_gfn,
+			 bool *force_pt_level)
 {
 {
 	int host_level, level, max_level;
 	int host_level, level, max_level;
+	struct kvm_memory_slot *slot;
+
+	if (unlikely(*force_pt_level))
+		return PT_PAGE_TABLE_LEVEL;
+
+	slot = kvm_vcpu_gfn_to_memslot(vcpu, large_gfn);
+	*force_pt_level = !memslot_valid_for_gpte(slot, true);
+	if (unlikely(*force_pt_level))
+		return PT_PAGE_TABLE_LEVEL;
 
 
 	host_level = host_mapping_level(vcpu->kvm, large_gfn);
 	host_level = host_mapping_level(vcpu->kvm, large_gfn);
 
 
@@ -882,7 +902,7 @@ static int mapping_level(struct kvm_vcpu *vcpu, gfn_t large_gfn)
 	max_level = min(kvm_x86_ops->get_lpage_level(), host_level);
 	max_level = min(kvm_x86_ops->get_lpage_level(), host_level);
 
 
 	for (level = PT_DIRECTORY_LEVEL; level <= max_level; ++level)
 	for (level = PT_DIRECTORY_LEVEL; level <= max_level; ++level)
-		if (has_wrprotected_page(vcpu, large_gfn, level))
+		if (__has_wrprotected_page(large_gfn, level, slot))
 			break;
 			break;
 
 
 	return level - 1;
 	return level - 1;
@@ -2962,14 +2982,13 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, u32 error_code,
 {
 {
 	int r;
 	int r;
 	int level;
 	int level;
-	int force_pt_level;
+	bool force_pt_level = false;
 	pfn_t pfn;
 	pfn_t pfn;
 	unsigned long mmu_seq;
 	unsigned long mmu_seq;
 	bool map_writable, write = error_code & PFERR_WRITE_MASK;
 	bool map_writable, write = error_code & PFERR_WRITE_MASK;
 
 
-	force_pt_level = mapping_level_dirty_bitmap(vcpu, gfn);
+	level = mapping_level(vcpu, gfn, &force_pt_level);
 	if (likely(!force_pt_level)) {
 	if (likely(!force_pt_level)) {
-		level = mapping_level(vcpu, gfn);
 		/*
 		/*
 		 * This path builds a PAE pagetable - so we can map
 		 * This path builds a PAE pagetable - so we can map
 		 * 2mb pages at maximum. Therefore check if the level
 		 * 2mb pages at maximum. Therefore check if the level
@@ -2979,8 +2998,7 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, u32 error_code,
 			level = PT_DIRECTORY_LEVEL;
 			level = PT_DIRECTORY_LEVEL;
 
 
 		gfn &= ~(KVM_PAGES_PER_HPAGE(level) - 1);
 		gfn &= ~(KVM_PAGES_PER_HPAGE(level) - 1);
-	} else
-		level = PT_PAGE_TABLE_LEVEL;
+	}
 
 
 	if (fast_page_fault(vcpu, v, level, error_code))
 	if (fast_page_fault(vcpu, v, level, error_code))
 		return 0;
 		return 0;
@@ -3427,7 +3445,7 @@ static int kvm_arch_setup_async_pf(struct kvm_vcpu *vcpu, gva_t gva, gfn_t gfn)
 
 
 static bool can_do_async_pf(struct kvm_vcpu *vcpu)
 static bool can_do_async_pf(struct kvm_vcpu *vcpu)
 {
 {
-	if (unlikely(!irqchip_in_kernel(vcpu->kvm) ||
+	if (unlikely(!lapic_in_kernel(vcpu) ||
 		     kvm_event_needs_reinjection(vcpu)))
 		     kvm_event_needs_reinjection(vcpu)))
 		return false;
 		return false;
 
 
@@ -3476,7 +3494,7 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, u32 error_code,
 	pfn_t pfn;
 	pfn_t pfn;
 	int r;
 	int r;
 	int level;
 	int level;
-	int force_pt_level;
+	bool force_pt_level;
 	gfn_t gfn = gpa >> PAGE_SHIFT;
 	gfn_t gfn = gpa >> PAGE_SHIFT;
 	unsigned long mmu_seq;
 	unsigned long mmu_seq;
 	int write = error_code & PFERR_WRITE_MASK;
 	int write = error_code & PFERR_WRITE_MASK;
@@ -3495,20 +3513,15 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, u32 error_code,
 	if (r)
 	if (r)
 		return r;
 		return r;
 
 
-	if (mapping_level_dirty_bitmap(vcpu, gfn) ||
-	    !check_hugepage_cache_consistency(vcpu, gfn, PT_DIRECTORY_LEVEL))
-		force_pt_level = 1;
-	else
-		force_pt_level = 0;
-
+	force_pt_level = !check_hugepage_cache_consistency(vcpu, gfn,
+							   PT_DIRECTORY_LEVEL);
+	level = mapping_level(vcpu, gfn, &force_pt_level);
 	if (likely(!force_pt_level)) {
 	if (likely(!force_pt_level)) {
-		level = mapping_level(vcpu, gfn);
 		if (level > PT_DIRECTORY_LEVEL &&
 		if (level > PT_DIRECTORY_LEVEL &&
 		    !check_hugepage_cache_consistency(vcpu, gfn, level))
 		    !check_hugepage_cache_consistency(vcpu, gfn, level))
 			level = PT_DIRECTORY_LEVEL;
 			level = PT_DIRECTORY_LEVEL;
 		gfn &= ~(KVM_PAGES_PER_HPAGE(level) - 1);
 		gfn &= ~(KVM_PAGES_PER_HPAGE(level) - 1);
-	} else
-		level = PT_PAGE_TABLE_LEVEL;
+	}
 
 
 	if (fast_page_fault(vcpu, gpa, level, error_code))
 	if (fast_page_fault(vcpu, gpa, level, error_code))
 		return 0;
 		return 0;
@@ -3706,7 +3719,7 @@ static void
 __reset_rsvds_bits_mask_ept(struct rsvd_bits_validate *rsvd_check,
 __reset_rsvds_bits_mask_ept(struct rsvd_bits_validate *rsvd_check,
 			    int maxphyaddr, bool execonly)
 			    int maxphyaddr, bool execonly)
 {
 {
-	int pte;
+	u64 bad_mt_xwr;
 
 
 	rsvd_check->rsvd_bits_mask[0][3] =
 	rsvd_check->rsvd_bits_mask[0][3] =
 		rsvd_bits(maxphyaddr, 51) | rsvd_bits(3, 7);
 		rsvd_bits(maxphyaddr, 51) | rsvd_bits(3, 7);
@@ -3724,14 +3737,16 @@ __reset_rsvds_bits_mask_ept(struct rsvd_bits_validate *rsvd_check,
 		rsvd_bits(maxphyaddr, 51) | rsvd_bits(12, 20);
 		rsvd_bits(maxphyaddr, 51) | rsvd_bits(12, 20);
 	rsvd_check->rsvd_bits_mask[1][0] = rsvd_check->rsvd_bits_mask[0][0];
 	rsvd_check->rsvd_bits_mask[1][0] = rsvd_check->rsvd_bits_mask[0][0];
 
 
-	for (pte = 0; pte < 64; pte++) {
-		int rwx_bits = pte & 7;
-		int mt = pte >> 3;
-		if (mt == 0x2 || mt == 0x3 || mt == 0x7 ||
-				rwx_bits == 0x2 || rwx_bits == 0x6 ||
-				(rwx_bits == 0x4 && !execonly))
-			rsvd_check->bad_mt_xwr |= (1ull << pte);
+	bad_mt_xwr = 0xFFull << (2 * 8);	/* bits 3..5 must not be 2 */
+	bad_mt_xwr |= 0xFFull << (3 * 8);	/* bits 3..5 must not be 3 */
+	bad_mt_xwr |= 0xFFull << (7 * 8);	/* bits 3..5 must not be 7 */
+	bad_mt_xwr |= REPEAT_BYTE(1ull << 2);	/* bits 0..2 must not be 010 */
+	bad_mt_xwr |= REPEAT_BYTE(1ull << 6);	/* bits 0..2 must not be 110 */
+	if (!execonly) {
+		/* bits 0..2 must not be 100 unless VMX capabilities allow it */
+		bad_mt_xwr |= REPEAT_BYTE(1ull << 4);
 	}
 	}
+	rsvd_check->bad_mt_xwr = bad_mt_xwr;
 }
 }
 
 
 static void reset_rsvds_bits_mask_ept(struct kvm_vcpu *vcpu,
 static void reset_rsvds_bits_mask_ept(struct kvm_vcpu *vcpu,
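
The rewritten __reset_rsvds_bits_mask_ept() builds the 64-bit bad_mt_xwr mask directly, one rule per byte lane or bit column, instead of looping over all 64 low-bit combinations of an EPT PTE. The user-space check below recomputes the mask both ways and verifies they agree; REPEAT_BYTE() is reproduced from the kernel's definition, the rest is a self-contained test, not kernel code.

/* Check that the closed-form bad_mt_xwr construction above matches the
 * per-PTE loop it replaces. */
#include <assert.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define REPEAT_BYTE(x)	((~0ull / 0xff) * (x))

static uint64_t bad_mt_xwr_loop(bool execonly)
{
	uint64_t mask = 0;

	for (int pte = 0; pte < 64; pte++) {
		int rwx_bits = pte & 7;
		int mt = pte >> 3;

		if (mt == 0x2 || mt == 0x3 || mt == 0x7 ||
		    rwx_bits == 0x2 || rwx_bits == 0x6 ||
		    (rwx_bits == 0x4 && !execonly))
			mask |= 1ull << pte;
	}
	return mask;
}

static uint64_t bad_mt_xwr_direct(bool execonly)
{
	uint64_t mask;

	mask  = 0xFFull << (2 * 8);		/* memory type 2 is reserved   */
	mask |= 0xFFull << (3 * 8);		/* memory type 3 is reserved   */
	mask |= 0xFFull << (7 * 8);		/* memory type 7 is reserved   */
	mask |= REPEAT_BYTE(1ull << 2);		/* XWR == 010 is reserved      */
	mask |= REPEAT_BYTE(1ull << 6);		/* XWR == 110 is reserved      */
	if (!execonly)
		mask |= REPEAT_BYTE(1ull << 4);	/* XWR == 100 needs execonly   */
	return mask;
}

int main(void)
{
	for (int execonly = 0; execonly <= 1; execonly++) {
		uint64_t a = bad_mt_xwr_loop(execonly);
		uint64_t b = bad_mt_xwr_direct(execonly);

		printf("execonly=%d loop=%016llx direct=%016llx\n", execonly,
		       (unsigned long long)a, (unsigned long long)b);
		assert(a == b);
	}
	return 0;
}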

+ 9 - 10
arch/x86/kvm/paging_tmpl.h

@@ -698,7 +698,7 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code,
 	int r;
 	pfn_t pfn;
 	int level = PT_PAGE_TABLE_LEVEL;
-	int force_pt_level;
+	bool force_pt_level = false;
 	unsigned long mmu_seq;
 	bool map_writable, is_self_change_mapping;

@@ -743,15 +743,14 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code,
 	is_self_change_mapping = FNAME(is_self_change_mapping)(vcpu,
 	      &walker, user_fault, &vcpu->arch.write_fault_to_shadow_pgtable);

-	if (walker.level >= PT_DIRECTORY_LEVEL)
-		force_pt_level = mapping_level_dirty_bitmap(vcpu, walker.gfn)
-		   || is_self_change_mapping;
-	else
-		force_pt_level = 1;
-	if (!force_pt_level) {
-		level = min(walker.level, mapping_level(vcpu, walker.gfn));
-		walker.gfn = walker.gfn & ~(KVM_PAGES_PER_HPAGE(level) - 1);
-	}
+	if (walker.level >= PT_DIRECTORY_LEVEL && !is_self_change_mapping) {
+		level = mapping_level(vcpu, walker.gfn, &force_pt_level);
+		if (likely(!force_pt_level)) {
+			level = min(walker.level, level);
+			walker.gfn = walker.gfn & ~(KVM_PAGES_PER_HPAGE(level) - 1);
+		}
+	} else
+		force_pt_level = true;

 	mmu_seq = vcpu->kvm->mmu_notifier_seq;
 	smp_rmb();

+ 19 - 24
arch/x86/kvm/svm.c

@@ -159,6 +159,9 @@ struct vcpu_svm {
 	u32 apf_reason;
 	u32 apf_reason;
 
 
 	u64  tsc_ratio;
 	u64  tsc_ratio;
+
+	/* cached guest cpuid flags for faster access */
+	bool nrips_enabled	: 1;
 };
 };
 
 
 static DEFINE_PER_CPU(u64, current_tsc_ratio);
 static DEFINE_PER_CPU(u64, current_tsc_ratio);
@@ -1086,7 +1089,7 @@ static u64 svm_compute_tsc_offset(struct kvm_vcpu *vcpu, u64 target_tsc)
 	return target_tsc - tsc;
 	return target_tsc - tsc;
 }
 }
 
 
-static void init_vmcb(struct vcpu_svm *svm, bool init_event)
+static void init_vmcb(struct vcpu_svm *svm)
 {
 {
 	struct vmcb_control_area *control = &svm->vmcb->control;
 	struct vmcb_control_area *control = &svm->vmcb->control;
 	struct vmcb_save_area *save = &svm->vmcb->save;
 	struct vmcb_save_area *save = &svm->vmcb->save;
@@ -1157,8 +1160,7 @@ static void init_vmcb(struct vcpu_svm *svm, bool init_event)
 	init_sys_seg(&save->ldtr, SEG_TYPE_LDT);
 	init_sys_seg(&save->ldtr, SEG_TYPE_LDT);
 	init_sys_seg(&save->tr, SEG_TYPE_BUSY_TSS16);
 	init_sys_seg(&save->tr, SEG_TYPE_BUSY_TSS16);
 
 
-	if (!init_event)
-		svm_set_efer(&svm->vcpu, 0);
+	svm_set_efer(&svm->vcpu, 0);
 	save->dr6 = 0xffff0ff0;
 	save->dr6 = 0xffff0ff0;
 	kvm_set_rflags(&svm->vcpu, 2);
 	kvm_set_rflags(&svm->vcpu, 2);
 	save->rip = 0x0000fff0;
 	save->rip = 0x0000fff0;
@@ -1212,7 +1214,7 @@ static void svm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
 		if (kvm_vcpu_is_reset_bsp(&svm->vcpu))
 		if (kvm_vcpu_is_reset_bsp(&svm->vcpu))
 			svm->vcpu.arch.apic_base |= MSR_IA32_APICBASE_BSP;
 			svm->vcpu.arch.apic_base |= MSR_IA32_APICBASE_BSP;
 	}
 	}
-	init_vmcb(svm, init_event);
+	init_vmcb(svm);
 
 
 	kvm_cpuid(vcpu, &eax, &dummy, &dummy, &dummy);
 	kvm_cpuid(vcpu, &eax, &dummy, &dummy, &dummy);
 	kvm_register_write(vcpu, VCPU_REGS_RDX, eax);
 	kvm_register_write(vcpu, VCPU_REGS_RDX, eax);
@@ -1268,7 +1270,7 @@ static struct kvm_vcpu *svm_create_vcpu(struct kvm *kvm, unsigned int id)
 	clear_page(svm->vmcb);
 	clear_page(svm->vmcb);
 	svm->vmcb_pa = page_to_pfn(page) << PAGE_SHIFT;
 	svm->vmcb_pa = page_to_pfn(page) << PAGE_SHIFT;
 	svm->asid_generation = 0;
 	svm->asid_generation = 0;
-	init_vmcb(svm, false);
+	init_vmcb(svm);
 
 
 	svm_init_osvw(&svm->vcpu);
 	svm_init_osvw(&svm->vcpu);
 
 
@@ -1890,7 +1892,7 @@ static int shutdown_interception(struct vcpu_svm *svm)
 	 * so reinitialize it.
 	 * so reinitialize it.
 	 */
 	 */
 	clear_page(svm->vmcb);
 	clear_page(svm->vmcb);
-	init_vmcb(svm, false);
+	init_vmcb(svm);
 
 
 	kvm_run->exit_reason = KVM_EXIT_SHUTDOWN;
 	kvm_run->exit_reason = KVM_EXIT_SHUTDOWN;
 	return 0;
 	return 0;
@@ -2365,7 +2367,9 @@ static int nested_svm_vmexit(struct vcpu_svm *svm)
 	nested_vmcb->control.exit_info_2       = vmcb->control.exit_info_2;
 	nested_vmcb->control.exit_info_2       = vmcb->control.exit_info_2;
 	nested_vmcb->control.exit_int_info     = vmcb->control.exit_int_info;
 	nested_vmcb->control.exit_int_info     = vmcb->control.exit_int_info;
 	nested_vmcb->control.exit_int_info_err = vmcb->control.exit_int_info_err;
 	nested_vmcb->control.exit_int_info_err = vmcb->control.exit_int_info_err;
-	nested_vmcb->control.next_rip          = vmcb->control.next_rip;
+
+	if (svm->nrips_enabled)
+		nested_vmcb->control.next_rip  = vmcb->control.next_rip;
 
 
 	/*
 	/*
 	 * If we emulate a VMRUN/#VMEXIT in the same host #vmexit cycle we have
 	 * If we emulate a VMRUN/#VMEXIT in the same host #vmexit cycle we have
@@ -3060,7 +3064,7 @@ static int cr8_write_interception(struct vcpu_svm *svm)
 	u8 cr8_prev = kvm_get_cr8(&svm->vcpu);
 	u8 cr8_prev = kvm_get_cr8(&svm->vcpu);
 	/* instruction emulation calls kvm_set_cr8() */
 	/* instruction emulation calls kvm_set_cr8() */
 	r = cr_interception(svm);
 	r = cr_interception(svm);
-	if (irqchip_in_kernel(svm->vcpu.kvm))
+	if (lapic_in_kernel(&svm->vcpu))
 		return r;
 		return r;
 	if (cr8_prev <= kvm_get_cr8(&svm->vcpu))
 	if (cr8_prev <= kvm_get_cr8(&svm->vcpu))
 		return r;
 		return r;
@@ -3294,24 +3298,11 @@ static int msr_interception(struct vcpu_svm *svm)
 
 
 static int interrupt_window_interception(struct vcpu_svm *svm)
 static int interrupt_window_interception(struct vcpu_svm *svm)
 {
 {
-	struct kvm_run *kvm_run = svm->vcpu.run;
-
 	kvm_make_request(KVM_REQ_EVENT, &svm->vcpu);
 	kvm_make_request(KVM_REQ_EVENT, &svm->vcpu);
 	svm_clear_vintr(svm);
 	svm_clear_vintr(svm);
 	svm->vmcb->control.int_ctl &= ~V_IRQ_MASK;
 	svm->vmcb->control.int_ctl &= ~V_IRQ_MASK;
 	mark_dirty(svm->vmcb, VMCB_INTR);
 	mark_dirty(svm->vmcb, VMCB_INTR);
 	++svm->vcpu.stat.irq_window_exits;
 	++svm->vcpu.stat.irq_window_exits;
-	/*
-	 * If the user space waits to inject interrupts, exit as soon as
-	 * possible
-	 */
-	if (!irqchip_in_kernel(svm->vcpu.kvm) &&
-	    kvm_run->request_interrupt_window &&
-	    !kvm_cpu_has_interrupt(&svm->vcpu)) {
-		kvm_run->exit_reason = KVM_EXIT_IRQ_WINDOW_OPEN;
-		return 0;
-	}
-
 	return 1;
 	return 1;
 }
 }
 
 
@@ -3659,12 +3650,12 @@ static void svm_set_virtual_x2apic_mode(struct kvm_vcpu *vcpu, bool set)
 	return;
 	return;
 }
 }
 
 
-static int svm_vm_has_apicv(struct kvm *kvm)
+static int svm_cpu_uses_apicv(struct kvm_vcpu *vcpu)
 {
 {
 	return 0;
 	return 0;
 }
 }
 
 
-static void svm_load_eoi_exitmap(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap)
+static void svm_load_eoi_exitmap(struct kvm_vcpu *vcpu)
 {
 {
 	return;
 	return;
 }
 }
@@ -4098,6 +4089,10 @@ static u64 svm_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio)
 
 
 static void svm_cpuid_update(struct kvm_vcpu *vcpu)
 static void svm_cpuid_update(struct kvm_vcpu *vcpu)
 {
 {
+	struct vcpu_svm *svm = to_svm(vcpu);
+
+	/* Update nrips enabled cache */
+	svm->nrips_enabled = !!guest_cpuid_has_nrips(&svm->vcpu);
 }
 }
 
 
 static void svm_set_supported_cpuid(u32 func, struct kvm_cpuid_entry2 *entry)
 static void svm_set_supported_cpuid(u32 func, struct kvm_cpuid_entry2 *entry)
@@ -4425,7 +4420,7 @@ static struct kvm_x86_ops svm_x86_ops = {
 	.enable_irq_window = enable_irq_window,
 	.enable_irq_window = enable_irq_window,
 	.update_cr8_intercept = update_cr8_intercept,
 	.update_cr8_intercept = update_cr8_intercept,
 	.set_virtual_x2apic_mode = svm_set_virtual_x2apic_mode,
 	.set_virtual_x2apic_mode = svm_set_virtual_x2apic_mode,
-	.vm_has_apicv = svm_vm_has_apicv,
+	.cpu_uses_apicv = svm_cpu_uses_apicv,
 	.load_eoi_exitmap = svm_load_eoi_exitmap,
 	.load_eoi_exitmap = svm_load_eoi_exitmap,
 	.sync_pir_to_irr = svm_sync_pir_to_irr,
 	.sync_pir_to_irr = svm_sync_pir_to_irr,
 
 

+ 51 - 0
arch/x86/kvm/trace.h

@@ -128,6 +128,24 @@ TRACE_EVENT(kvm_pio,
 		  __entry->count > 1 ? "(...)" : "")
 );

+/*
+ * Tracepoint for fast mmio.
+ */
+TRACE_EVENT(kvm_fast_mmio,
+	TP_PROTO(u64 gpa),
+	TP_ARGS(gpa),
+
+	TP_STRUCT__entry(
+		__field(u64,	gpa)
+	),
+
+	TP_fast_assign(
+		__entry->gpa		= gpa;
+	),
+
+	TP_printk("fast mmio at gpa 0x%llx", __entry->gpa)
+);
+
 /*
  * Tracepoint for cpuid.
  */
@@ -974,6 +992,39 @@ TRACE_EVENT(kvm_enter_smm,
 		  __entry->smbase)
 );

+/*
+ * Tracepoint for VT-d posted-interrupts.
+ */
+TRACE_EVENT(kvm_pi_irte_update,
+	TP_PROTO(unsigned int vcpu_id, unsigned int gsi,
+		 unsigned int gvec, u64 pi_desc_addr, bool set),
+	TP_ARGS(vcpu_id, gsi, gvec, pi_desc_addr, set),
+
+	TP_STRUCT__entry(
+		__field(	unsigned int,	vcpu_id		)
+		__field(	unsigned int,	gsi		)
+		__field(	unsigned int,	gvec		)
+		__field(	u64,		pi_desc_addr	)
+		__field(	bool,		set		)
+	),
+
+	TP_fast_assign(
+		__entry->vcpu_id	= vcpu_id;
+		__entry->gsi		= gsi;
+		__entry->gvec		= gvec;
+		__entry->pi_desc_addr	= pi_desc_addr;
+		__entry->set		= set;
+	),
+
+	TP_printk("VT-d PI is %s for this irq, vcpu %u, gsi: 0x%x, "
+		  "gvec: 0x%x, pi_desc_addr: 0x%llx",
+		  __entry->set ? "enabled and being updated" : "disabled",
+		  __entry->vcpu_id,
+		  __entry->gsi,
+		  __entry->gvec,
+		  __entry->pi_desc_addr)
+);
+
 #endif /* _TRACE_KVM_H */

 #undef TRACE_INCLUDE_PATH

+ 605 - 145
arch/x86/kvm/vmx.c

@@ -35,6 +35,7 @@
 #include "kvm_cache_regs.h"
 #include "kvm_cache_regs.h"
 #include "x86.h"
 #include "x86.h"
 
 
+#include <asm/cpu.h>
 #include <asm/io.h>
 #include <asm/io.h>
 #include <asm/desc.h>
 #include <asm/desc.h>
 #include <asm/vmx.h>
 #include <asm/vmx.h>
@@ -45,6 +46,7 @@
 #include <asm/debugreg.h>
 #include <asm/debugreg.h>
 #include <asm/kexec.h>
 #include <asm/kexec.h>
 #include <asm/apic.h>
 #include <asm/apic.h>
+#include <asm/irq_remapping.h>
 
 
 #include "trace.h"
 #include "trace.h"
 #include "pmu.h"
 #include "pmu.h"
@@ -424,6 +426,9 @@ struct nested_vmx {
 	/* to migrate it to L2 if VM_ENTRY_LOAD_DEBUG_CONTROLS is off */
 	/* to migrate it to L2 if VM_ENTRY_LOAD_DEBUG_CONTROLS is off */
 	u64 vmcs01_debugctl;
 	u64 vmcs01_debugctl;
 
 
+	u16 vpid02;
+	u16 last_vpid;
+
 	u32 nested_vmx_procbased_ctls_low;
 	u32 nested_vmx_procbased_ctls_low;
 	u32 nested_vmx_procbased_ctls_high;
 	u32 nested_vmx_procbased_ctls_high;
 	u32 nested_vmx_true_procbased_ctls_low;
 	u32 nested_vmx_true_procbased_ctls_low;
@@ -440,14 +445,33 @@ struct nested_vmx {
 	u32 nested_vmx_misc_low;
 	u32 nested_vmx_misc_low;
 	u32 nested_vmx_misc_high;
 	u32 nested_vmx_misc_high;
 	u32 nested_vmx_ept_caps;
 	u32 nested_vmx_ept_caps;
+	u32 nested_vmx_vpid_caps;
 };
 };
 
 
 #define POSTED_INTR_ON  0
 #define POSTED_INTR_ON  0
+#define POSTED_INTR_SN  1
+
 /* Posted-Interrupt Descriptor */
 /* Posted-Interrupt Descriptor */
 struct pi_desc {
 struct pi_desc {
 	u32 pir[8];     /* Posted interrupt requested */
 	u32 pir[8];     /* Posted interrupt requested */
-	u32 control;	/* bit 0 of control is outstanding notification bit */
-	u32 rsvd[7];
+	union {
+		struct {
+				/* bit 256 - Outstanding Notification */
+			u16	on	: 1,
+				/* bit 257 - Suppress Notification */
+				sn	: 1,
+				/* bit 271:258 - Reserved */
+				rsvd_1	: 14;
+				/* bit 279:272 - Notification Vector */
+			u8	nv;
+				/* bit 287:280 - Reserved */
+			u8	rsvd_2;
+				/* bit 319:288 - Notification Destination */
+			u32	ndst;
+		};
+		u64 control;
+	};
+	u32 rsvd[6];
 } __aligned(64);
 } __aligned(64);
 
 
 static bool pi_test_and_set_on(struct pi_desc *pi_desc)
 static bool pi_test_and_set_on(struct pi_desc *pi_desc)
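
The reworked posted-interrupt descriptor overlays named bitfields (ON, SN, NV, NDST) on a 64-bit control word inside a 64-byte, 64-byte-aligned structure. The user-space reproduction below checks the size, alignment and the control-bit correspondence; it assumes the usual little-endian x86-64 bitfield ABI and is an illustration only, not kernel code.

/* User-space reproduction of the posted-interrupt descriptor layout above. */
#include <assert.h>
#include <stdint.h>
#include <stdio.h>

struct pi_desc {
	uint32_t pir[8];		/* posted interrupt requested */
	union {
		struct {
			uint16_t	on	: 1,	/* outstanding notification */
					sn	: 1,	/* suppress notification    */
					rsvd_1	: 14;
			uint8_t		nv;		/* notification vector      */
			uint8_t		rsvd_2;
			uint32_t	ndst;		/* notification destination */
		};
		uint64_t control;
	};
	uint32_t rsvd[6];
} __attribute__((aligned(64)));

int main(void)
{
	struct pi_desc pi = { { 0 } };

	static_assert(sizeof(struct pi_desc) == 64, "descriptor must be 64 bytes");
	static_assert(_Alignof(struct pi_desc) == 64, "descriptor must be 64-byte aligned");

	pi.control |= 1ull << 0;	/* POSTED_INTR_ON */
	pi.control |= 1ull << 1;	/* POSTED_INTR_SN */
	printf("on=%u sn=%u nv=0x%x ndst=%u\n", pi.on, pi.sn, pi.nv, pi.ndst);

	pi.nv = 0xf2;			/* e.g. a notification vector        */
	pi.ndst = 3 << 8;		/* xAPIC: destination in bits 15:8   */
	printf("control=%#llx\n", (unsigned long long)pi.control);
	return 0;
}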
@@ -467,6 +491,30 @@ static int pi_test_and_set_pir(int vector, struct pi_desc *pi_desc)
 	return test_and_set_bit(vector, (unsigned long *)pi_desc->pir);
 	return test_and_set_bit(vector, (unsigned long *)pi_desc->pir);
 }
 }
 
 
+static inline void pi_clear_sn(struct pi_desc *pi_desc)
+{
+	return clear_bit(POSTED_INTR_SN,
+			(unsigned long *)&pi_desc->control);
+}
+
+static inline void pi_set_sn(struct pi_desc *pi_desc)
+{
+	return set_bit(POSTED_INTR_SN,
+			(unsigned long *)&pi_desc->control);
+}
+
+static inline int pi_test_on(struct pi_desc *pi_desc)
+{
+	return test_bit(POSTED_INTR_ON,
+			(unsigned long *)&pi_desc->control);
+}
+
+static inline int pi_test_sn(struct pi_desc *pi_desc)
+{
+	return test_bit(POSTED_INTR_SN,
+			(unsigned long *)&pi_desc->control);
+}
+
 struct vcpu_vmx {
 struct vcpu_vmx {
 	struct kvm_vcpu       vcpu;
 	struct kvm_vcpu       vcpu;
 	unsigned long         host_rsp;
 	unsigned long         host_rsp;
@@ -532,8 +580,6 @@ struct vcpu_vmx {
 	s64 vnmi_blocked_time;
 	s64 vnmi_blocked_time;
 	u32 exit_reason;
 	u32 exit_reason;
 
 
-	bool rdtscp_enabled;
-
 	/* Posted interrupt descriptor */
 	/* Posted interrupt descriptor */
 	struct pi_desc pi_desc;
 	struct pi_desc pi_desc;
 
 
@@ -563,6 +609,11 @@ static inline struct vcpu_vmx *to_vmx(struct kvm_vcpu *vcpu)
 	return container_of(vcpu, struct vcpu_vmx, vcpu);
 	return container_of(vcpu, struct vcpu_vmx, vcpu);
 }
 }
 
 
+static struct pi_desc *vcpu_to_pi_desc(struct kvm_vcpu *vcpu)
+{
+	return &(to_vmx(vcpu)->pi_desc);
+}
+
 #define VMCS12_OFFSET(x) offsetof(struct vmcs12, x)
 #define VMCS12_OFFSET(x) offsetof(struct vmcs12, x)
 #define FIELD(number, name)	[number] = VMCS12_OFFSET(name)
 #define FIELD(number, name)	[number] = VMCS12_OFFSET(name)
 #define FIELD64(number, name)	[number] = VMCS12_OFFSET(name), \
 #define FIELD64(number, name)	[number] = VMCS12_OFFSET(name), \
@@ -809,7 +860,7 @@ static void kvm_cpu_vmxon(u64 addr);
 static void kvm_cpu_vmxoff(void);
 static void kvm_cpu_vmxoff(void);
 static bool vmx_mpx_supported(void);
 static bool vmx_mpx_supported(void);
 static bool vmx_xsaves_supported(void);
 static bool vmx_xsaves_supported(void);
-static int vmx_vm_has_apicv(struct kvm *kvm);
+static int vmx_cpu_uses_apicv(struct kvm_vcpu *vcpu);
 static int vmx_set_tss_addr(struct kvm *kvm, unsigned int addr);
 static int vmx_set_tss_addr(struct kvm *kvm, unsigned int addr);
 static void vmx_set_segment(struct kvm_vcpu *vcpu,
 static void vmx_set_segment(struct kvm_vcpu *vcpu,
 			    struct kvm_segment *var, int seg);
 			    struct kvm_segment *var, int seg);
@@ -831,6 +882,13 @@ static DEFINE_PER_CPU(struct vmcs *, current_vmcs);
 static DEFINE_PER_CPU(struct list_head, loaded_vmcss_on_cpu);
 static DEFINE_PER_CPU(struct list_head, loaded_vmcss_on_cpu);
 static DEFINE_PER_CPU(struct desc_ptr, host_gdt);
 static DEFINE_PER_CPU(struct desc_ptr, host_gdt);
 
 
+/*
+ * We maintian a per-CPU linked-list of vCPU, so in wakeup_handler() we
+ * can find which vCPU should be waken up.
+ */
+static DEFINE_PER_CPU(struct list_head, blocked_vcpu_on_cpu);
+static DEFINE_PER_CPU(spinlock_t, blocked_vcpu_on_cpu_lock);
+
 static unsigned long *vmx_io_bitmap_a;
 static unsigned long *vmx_io_bitmap_a;
 static unsigned long *vmx_io_bitmap_b;
 static unsigned long *vmx_io_bitmap_b;
 static unsigned long *vmx_msr_bitmap_legacy;
 static unsigned long *vmx_msr_bitmap_legacy;
@@ -946,9 +1004,9 @@ static inline bool cpu_has_vmx_tpr_shadow(void)
 	return vmcs_config.cpu_based_exec_ctrl & CPU_BASED_TPR_SHADOW;
 	return vmcs_config.cpu_based_exec_ctrl & CPU_BASED_TPR_SHADOW;
 }
 }
 
 
-static inline bool vm_need_tpr_shadow(struct kvm *kvm)
+static inline bool cpu_need_tpr_shadow(struct kvm_vcpu *vcpu)
 {
 {
-	return (cpu_has_vmx_tpr_shadow()) && (irqchip_in_kernel(kvm));
+	return cpu_has_vmx_tpr_shadow() && lapic_in_kernel(vcpu);
 }
 }
 
 
 static inline bool cpu_has_secondary_exec_ctrls(void)
 static inline bool cpu_has_secondary_exec_ctrls(void)
@@ -983,7 +1041,8 @@ static inline bool cpu_has_vmx_virtual_intr_delivery(void)
 
 
 static inline bool cpu_has_vmx_posted_intr(void)
 static inline bool cpu_has_vmx_posted_intr(void)
 {
 {
-	return vmcs_config.pin_based_exec_ctrl & PIN_BASED_POSTED_INTR;
+	return IS_ENABLED(CONFIG_X86_LOCAL_APIC) &&
+		vmcs_config.pin_based_exec_ctrl & PIN_BASED_POSTED_INTR;
 }
 }
 
 
 static inline bool cpu_has_vmx_apicv(void)
 static inline bool cpu_has_vmx_apicv(void)
@@ -1062,9 +1121,9 @@ static inline bool cpu_has_vmx_ple(void)
 		SECONDARY_EXEC_PAUSE_LOOP_EXITING;
 		SECONDARY_EXEC_PAUSE_LOOP_EXITING;
 }
 }
 
 
-static inline bool vm_need_virtualize_apic_accesses(struct kvm *kvm)
+static inline bool cpu_need_virtualize_apic_accesses(struct kvm_vcpu *vcpu)
 {
 {
-	return flexpriority_enabled && irqchip_in_kernel(kvm);
+	return flexpriority_enabled && lapic_in_kernel(vcpu);
 }
 }
 
 
 static inline bool cpu_has_vmx_vpid(void)
 static inline bool cpu_has_vmx_vpid(void)
@@ -1157,6 +1216,11 @@ static inline bool nested_cpu_has_virt_x2apic_mode(struct vmcs12 *vmcs12)
 	return nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE);
 	return nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE);
 }
 }
 
 
+static inline bool nested_cpu_has_vpid(struct vmcs12 *vmcs12)
+{
+	return nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENABLE_VPID);
+}
+
 static inline bool nested_cpu_has_apic_reg_virt(struct vmcs12 *vmcs12)
 static inline bool nested_cpu_has_apic_reg_virt(struct vmcs12 *vmcs12)
 {
 {
 	return nested_cpu_has2(vmcs12, SECONDARY_EXEC_APIC_REGISTER_VIRT);
 	return nested_cpu_has2(vmcs12, SECONDARY_EXEC_APIC_REGISTER_VIRT);
@@ -1337,13 +1401,13 @@ static void loaded_vmcs_clear(struct loaded_vmcs *loaded_vmcs)
 			 __loaded_vmcs_clear, loaded_vmcs, 1);
 			 __loaded_vmcs_clear, loaded_vmcs, 1);
 }
 }
 
 
-static inline void vpid_sync_vcpu_single(struct vcpu_vmx *vmx)
+static inline void vpid_sync_vcpu_single(int vpid)
 {
 {
-	if (vmx->vpid == 0)
+	if (vpid == 0)
 		return;
 		return;
 
 
 	if (cpu_has_vmx_invvpid_single())
 	if (cpu_has_vmx_invvpid_single())
-		__invvpid(VMX_VPID_EXTENT_SINGLE_CONTEXT, vmx->vpid, 0);
+		__invvpid(VMX_VPID_EXTENT_SINGLE_CONTEXT, vpid, 0);
 }
 }
 
 
 static inline void vpid_sync_vcpu_global(void)
 static inline void vpid_sync_vcpu_global(void)
@@ -1352,10 +1416,10 @@ static inline void vpid_sync_vcpu_global(void)
 		__invvpid(VMX_VPID_EXTENT_ALL_CONTEXT, 0, 0);
 		__invvpid(VMX_VPID_EXTENT_ALL_CONTEXT, 0, 0);
 }
 }
 
 
-static inline void vpid_sync_context(struct vcpu_vmx *vmx)
+static inline void vpid_sync_context(int vpid)
 {
 {
 	if (cpu_has_vmx_invvpid_single())
 	if (cpu_has_vmx_invvpid_single())
-		vpid_sync_vcpu_single(vmx);
+		vpid_sync_vcpu_single(vpid);
 	else
 	else
 		vpid_sync_vcpu_global();
 		vpid_sync_vcpu_global();
 }
 }
@@ -1895,6 +1959,52 @@ static void vmx_load_host_state(struct vcpu_vmx *vmx)
 	preempt_enable();
 	preempt_enable();
 }
 }
 
 
+static void vmx_vcpu_pi_load(struct kvm_vcpu *vcpu, int cpu)
+{
+	struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu);
+	struct pi_desc old, new;
+	unsigned int dest;
+
+	if (!kvm_arch_has_assigned_device(vcpu->kvm) ||
+		!irq_remapping_cap(IRQ_POSTING_CAP))
+		return;
+
+	do {
+		old.control = new.control = pi_desc->control;
+
+		/*
+		 * If 'nv' field is POSTED_INTR_WAKEUP_VECTOR, there
+		 * are two possible cases:
+		 * 1. After running 'pre_block', context switch
+		 *    happened. For this case, 'sn' was set in
+		 *    vmx_vcpu_put(), so we need to clear it here.
+		 * 2. After running 'pre_block', we were blocked,
+		 *    and woken up by some other guy. For this case,
+		 *    we don't need to do anything, 'pi_post_block'
+		 *    will do everything for us. However, we cannot
+		 *    check whether it is case #1 or case #2 here
+		 *    (maybe, not needed), so we also clear sn here,
+		 *    I think it is not a big deal.
+		 */
+		if (pi_desc->nv != POSTED_INTR_WAKEUP_VECTOR) {
+			if (vcpu->cpu != cpu) {
+				dest = cpu_physical_id(cpu);
+
+				if (x2apic_enabled())
+					new.ndst = dest;
+				else
+					new.ndst = (dest << 8) & 0xFF00;
+			}
+
+			/* set 'NV' to 'notification vector' */
+			new.nv = POSTED_INTR_VECTOR;
+		}
+
+		/* Allow posting non-urgent interrupts */
+		new.sn = 0;
+	} while (cmpxchg(&pi_desc->control, old.control,
+			new.control) != old.control);
+}
 /*
 /*
  * Switches to specified vcpu, until a matching vcpu_put(), but assumes
  * Switches to specified vcpu, until a matching vcpu_put(), but assumes
  * vcpu mutex is already taken.
  * vcpu mutex is already taken.
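
vmx_vcpu_pi_load() above updates the descriptor's control word with a read-modify-write cmpxchg loop, so the NDST/NV update and the SN clear appear as one atomic change to a concurrent interrupt poster. A user-space sketch of the same pattern, using C11 atomics in place of the kernel's cmpxchg() and a simplified field packing (SN = bit 1, NV = bits 23:16, NDST = bits 63:32):

/* Sketch of the cmpxchg update loop used in vmx_vcpu_pi_load() above. */
#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>

#define PI_SN		(1ull << 1)
#define PI_NV_SHIFT	16
#define PI_NDST_SHIFT	32

static _Atomic uint64_t pi_control;

static void pi_update_on_load(uint32_t dest, uint8_t notification_vector)
{
	uint64_t old, new;

	old = atomic_load(&pi_control);
	do {
		new = old;
		new &= ~((0xffull << PI_NV_SHIFT) | (0xffffffffull << PI_NDST_SHIFT));
		new |= (uint64_t)notification_vector << PI_NV_SHIFT;
		new |= (uint64_t)dest << PI_NDST_SHIFT;
		new &= ~PI_SN;			/* allow posting again */
		/* on failure 'old' is refreshed and the loop recomputes */
	} while (!atomic_compare_exchange_weak(&pi_control, &old, new));
}

int main(void)
{
	atomic_store(&pi_control, PI_SN);	/* SN was set while preempted */
	pi_update_on_load(3, 0xf2);
	printf("control=%#llx\n", (unsigned long long)atomic_load(&pi_control));
	return 0;
}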
@@ -1945,10 +2055,27 @@ static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
 		vmcs_writel(HOST_IA32_SYSENTER_ESP, sysenter_esp); /* 22.2.3 */
 		vmcs_writel(HOST_IA32_SYSENTER_ESP, sysenter_esp); /* 22.2.3 */
 		vmx->loaded_vmcs->cpu = cpu;
 		vmx->loaded_vmcs->cpu = cpu;
 	}
 	}
+
+	vmx_vcpu_pi_load(vcpu, cpu);
+}
+
+static void vmx_vcpu_pi_put(struct kvm_vcpu *vcpu)
+{
+	struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu);
+
+	if (!kvm_arch_has_assigned_device(vcpu->kvm) ||
+		!irq_remapping_cap(IRQ_POSTING_CAP))
+		return;
+
+	/* Set SN when the vCPU is preempted */
+	if (vcpu->preempted)
+		pi_set_sn(pi_desc);
 }
 }
 
 
 static void vmx_vcpu_put(struct kvm_vcpu *vcpu)
 static void vmx_vcpu_put(struct kvm_vcpu *vcpu)
 {
 {
+	vmx_vcpu_pi_put(vcpu);
+
 	__vmx_load_host_state(to_vmx(vcpu));
 	__vmx_load_host_state(to_vmx(vcpu));
 	if (!vmm_exclusive) {
 	if (!vmm_exclusive) {
 		__loaded_vmcs_clear(to_vmx(vcpu)->loaded_vmcs);
 		__loaded_vmcs_clear(to_vmx(vcpu)->loaded_vmcs);
@@ -2207,7 +2334,7 @@ static void setup_msrs(struct vcpu_vmx *vmx)
 		if (index >= 0)
 		if (index >= 0)
 			move_msr_up(vmx, index, save_nmsrs++);
 			move_msr_up(vmx, index, save_nmsrs++);
 		index = __find_msr_index(vmx, MSR_TSC_AUX);
 		index = __find_msr_index(vmx, MSR_TSC_AUX);
-		if (index >= 0 && vmx->rdtscp_enabled)
+		if (index >= 0 && guest_cpuid_has_rdtscp(&vmx->vcpu))
 			move_msr_up(vmx, index, save_nmsrs++);
 			move_msr_up(vmx, index, save_nmsrs++);
 		/*
 		/*
 		 * MSR_STAR is only needed on long mode guests, and only
 		 * MSR_STAR is only needed on long mode guests, and only
@@ -2377,7 +2504,7 @@ static void nested_vmx_setup_ctls_msrs(struct vcpu_vmx *vmx)
 	vmx->nested.nested_vmx_pinbased_ctls_high |=
 	vmx->nested.nested_vmx_pinbased_ctls_high |=
 		PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR |
 		PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR |
 		PIN_BASED_VMX_PREEMPTION_TIMER;
 		PIN_BASED_VMX_PREEMPTION_TIMER;
-	if (vmx_vm_has_apicv(vmx->vcpu.kvm))
+	if (vmx_cpu_uses_apicv(&vmx->vcpu))
 		vmx->nested.nested_vmx_pinbased_ctls_high |=
 		vmx->nested.nested_vmx_pinbased_ctls_high |=
 			PIN_BASED_POSTED_INTR;
 			PIN_BASED_POSTED_INTR;
 
 
@@ -2471,10 +2598,12 @@ static void nested_vmx_setup_ctls_msrs(struct vcpu_vmx *vmx)
 		SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES |
 		SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES |
 		SECONDARY_EXEC_RDTSCP |
 		SECONDARY_EXEC_RDTSCP |
 		SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE |
 		SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE |
+		SECONDARY_EXEC_ENABLE_VPID |
 		SECONDARY_EXEC_APIC_REGISTER_VIRT |
 		SECONDARY_EXEC_APIC_REGISTER_VIRT |
 		SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY |
 		SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY |
 		SECONDARY_EXEC_WBINVD_EXITING |
 		SECONDARY_EXEC_WBINVD_EXITING |
-		SECONDARY_EXEC_XSAVES;
+		SECONDARY_EXEC_XSAVES |
+		SECONDARY_EXEC_PCOMMIT;
 
 
 	if (enable_ept) {
 	if (enable_ept) {
 		/* nested EPT: emulate EPT also to L1 */
 		/* nested EPT: emulate EPT also to L1 */
@@ -2493,6 +2622,12 @@ static void nested_vmx_setup_ctls_msrs(struct vcpu_vmx *vmx)
 	} else
 	} else
 		vmx->nested.nested_vmx_ept_caps = 0;
 		vmx->nested.nested_vmx_ept_caps = 0;
 
 
+	if (enable_vpid)
+		vmx->nested.nested_vmx_vpid_caps = VMX_VPID_INVVPID_BIT |
+				VMX_VPID_EXTENT_GLOBAL_CONTEXT_BIT;
+	else
+		vmx->nested.nested_vmx_vpid_caps = 0;
+
 	if (enable_unrestricted_guest)
 	if (enable_unrestricted_guest)
 		vmx->nested.nested_vmx_secondary_ctls_high |=
 		vmx->nested.nested_vmx_secondary_ctls_high |=
 			SECONDARY_EXEC_UNRESTRICTED_GUEST;
 			SECONDARY_EXEC_UNRESTRICTED_GUEST;
@@ -2608,7 +2743,8 @@ static int vmx_get_vmx_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata)
 		break;
 		break;
 	case MSR_IA32_VMX_EPT_VPID_CAP:
 	case MSR_IA32_VMX_EPT_VPID_CAP:
 		/* Currently, no nested vpid support */
 		/* Currently, no nested vpid support */
-		*pdata = vmx->nested.nested_vmx_ept_caps;
+		*pdata = vmx->nested.nested_vmx_ept_caps |
+			((u64)vmx->nested.nested_vmx_vpid_caps << 32);
 		break;
 		break;
 	default:
 	default:
 		return 1;
 		return 1;
@@ -2673,7 +2809,7 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
 		msr_info->data = vcpu->arch.ia32_xss;
 		msr_info->data = vcpu->arch.ia32_xss;
 		break;
 		break;
 	case MSR_TSC_AUX:
 	case MSR_TSC_AUX:
-		if (!to_vmx(vcpu)->rdtscp_enabled)
+		if (!guest_cpuid_has_rdtscp(vcpu))
 			return 1;
 			return 1;
 		/* Otherwise falls through */
 		/* Otherwise falls through */
 	default:
 	default:
@@ -2779,7 +2915,7 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
 			clear_atomic_switch_msr(vmx, MSR_IA32_XSS);
 			clear_atomic_switch_msr(vmx, MSR_IA32_XSS);
 		break;
 		break;
 	case MSR_TSC_AUX:
 	case MSR_TSC_AUX:
-		if (!vmx->rdtscp_enabled)
+		if (!guest_cpuid_has_rdtscp(vcpu))
 			return 1;
 			return 1;
 		/* Check reserved bit, higher 32 bits should be zero */
 		/* Check reserved bit, higher 32 bits should be zero */
 		if ((data >> 32) != 0)
 		if ((data >> 32) != 0)
@@ -2874,6 +3010,8 @@ static int hardware_enable(void)
 		return -EBUSY;
 		return -EBUSY;
 
 
 	INIT_LIST_HEAD(&per_cpu(loaded_vmcss_on_cpu, cpu));
 	INIT_LIST_HEAD(&per_cpu(loaded_vmcss_on_cpu, cpu));
+	INIT_LIST_HEAD(&per_cpu(blocked_vcpu_on_cpu, cpu));
+	spin_lock_init(&per_cpu(blocked_vcpu_on_cpu_lock, cpu));
 
 
 	/*
 	/*
 	 * Now we can enable the vmclear operation in kdump
 	 * Now we can enable the vmclear operation in kdump
@@ -3015,7 +3153,8 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf)
 			SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY |
 			SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY |
 			SECONDARY_EXEC_SHADOW_VMCS |
 			SECONDARY_EXEC_SHADOW_VMCS |
 			SECONDARY_EXEC_XSAVES |
 			SECONDARY_EXEC_XSAVES |
-			SECONDARY_EXEC_ENABLE_PML;
+			SECONDARY_EXEC_ENABLE_PML |
+			SECONDARY_EXEC_PCOMMIT;
 		if (adjust_vmx_controls(min2, opt2,
 		if (adjust_vmx_controls(min2, opt2,
 					MSR_IA32_VMX_PROCBASED_CTLS2,
 					MSR_IA32_VMX_PROCBASED_CTLS2,
 					&_cpu_based_2nd_exec_control) < 0)
 					&_cpu_based_2nd_exec_control) < 0)
@@ -3441,9 +3580,9 @@ static void exit_lmode(struct kvm_vcpu *vcpu)
 
 
 #endif
 #endif
 
 
-static void vmx_flush_tlb(struct kvm_vcpu *vcpu)
+static inline void __vmx_flush_tlb(struct kvm_vcpu *vcpu, int vpid)
 {
 {
-	vpid_sync_context(to_vmx(vcpu));
+	vpid_sync_context(vpid);
 	if (enable_ept) {
 	if (enable_ept) {
 		if (!VALID_PAGE(vcpu->arch.mmu.root_hpa))
 		if (!VALID_PAGE(vcpu->arch.mmu.root_hpa))
 			return;
 			return;
@@ -3451,6 +3590,11 @@ static void vmx_flush_tlb(struct kvm_vcpu *vcpu)
 	}
 	}
 }
 }
 
 
+static void vmx_flush_tlb(struct kvm_vcpu *vcpu)
+{
+	__vmx_flush_tlb(vcpu, to_vmx(vcpu)->vpid);
+}
+
 static void vmx_decache_cr0_guest_bits(struct kvm_vcpu *vcpu)
 static void vmx_decache_cr0_guest_bits(struct kvm_vcpu *vcpu)
 {
 {
 	ulong cr0_guest_owned_bits = vcpu->arch.cr0_guest_owned_bits;
 	ulong cr0_guest_owned_bits = vcpu->arch.cr0_guest_owned_bits;
@@ -3644,20 +3788,21 @@ static int vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
 		if (!is_paging(vcpu)) {
 		if (!is_paging(vcpu)) {
 			hw_cr4 &= ~X86_CR4_PAE;
 			hw_cr4 &= ~X86_CR4_PAE;
 			hw_cr4 |= X86_CR4_PSE;
 			hw_cr4 |= X86_CR4_PSE;
-			/*
-			 * SMEP/SMAP is disabled if CPU is in non-paging mode
-			 * in hardware. However KVM always uses paging mode to
-			 * emulate guest non-paging mode with TDP.
-			 * To emulate this behavior, SMEP/SMAP needs to be
-			 * manually disabled when guest switches to non-paging
-			 * mode.
-			 */
-			hw_cr4 &= ~(X86_CR4_SMEP | X86_CR4_SMAP);
 		} else if (!(cr4 & X86_CR4_PAE)) {
 		} else if (!(cr4 & X86_CR4_PAE)) {
 			hw_cr4 &= ~X86_CR4_PAE;
 			hw_cr4 &= ~X86_CR4_PAE;
 		}
 		}
 	}
 	}
 
 
+	if (!enable_unrestricted_guest && !is_paging(vcpu))
+		/*
+		 * SMEP/SMAP is disabled if CPU is in non-paging mode in
+		 * hardware.  However KVM always uses paging mode without
+		 * unrestricted guest.
+		 * To emulate this behavior, SMEP/SMAP needs to be manually
+		 * disabled when guest switches to non-paging mode.
+		 */
+		hw_cr4 &= ~(X86_CR4_SMEP | X86_CR4_SMAP);
+
 	vmcs_writel(CR4_READ_SHADOW, cr4);
 	vmcs_writel(CR4_READ_SHADOW, cr4);
 	vmcs_writel(GUEST_CR4, hw_cr4);
 	vmcs_writel(GUEST_CR4, hw_cr4);
 	return 0;
 	return 0;
@@ -4146,29 +4291,28 @@ static int alloc_identity_pagetable(struct kvm *kvm)
 	return r;
 	return r;
 }
 }
 
 
-static void allocate_vpid(struct vcpu_vmx *vmx)
+static int allocate_vpid(void)
 {
 {
 	int vpid;
 	int vpid;
 
 
-	vmx->vpid = 0;
 	if (!enable_vpid)
 	if (!enable_vpid)
-		return;
+		return 0;
 	spin_lock(&vmx_vpid_lock);
 	spin_lock(&vmx_vpid_lock);
 	vpid = find_first_zero_bit(vmx_vpid_bitmap, VMX_NR_VPIDS);
 	vpid = find_first_zero_bit(vmx_vpid_bitmap, VMX_NR_VPIDS);
-	if (vpid < VMX_NR_VPIDS) {
-		vmx->vpid = vpid;
+	if (vpid < VMX_NR_VPIDS)
 		__set_bit(vpid, vmx_vpid_bitmap);
 		__set_bit(vpid, vmx_vpid_bitmap);
-	}
+	else
+		vpid = 0;
 	spin_unlock(&vmx_vpid_lock);
 	spin_unlock(&vmx_vpid_lock);
+	return vpid;
 }
 }
 
 
-static void free_vpid(struct vcpu_vmx *vmx)
+static void free_vpid(int vpid)
 {
 {
-	if (!enable_vpid)
+	if (!enable_vpid || vpid == 0)
 		return;
 		return;
 	spin_lock(&vmx_vpid_lock);
 	spin_lock(&vmx_vpid_lock);
-	if (vmx->vpid != 0)
-		__clear_bit(vmx->vpid, vmx_vpid_bitmap);
+	__clear_bit(vpid, vmx_vpid_bitmap);
 	spin_unlock(&vmx_vpid_lock);
 	spin_unlock(&vmx_vpid_lock);
 }
 }
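
allocate_vpid()/free_vpid() now return and take a plain vpid so nested VMX can manage a second VPID per vCPU; the underlying pattern is a lock-protected first-zero-bit allocator over a fixed bitmap, with 0 reserved to mean "no VPID". Below is a user-space sketch of that pattern, with a pthread mutex standing in for vmx_vpid_lock and a small pool size in place of VMX_NR_VPIDS.

/* User-space sketch of the bitmap VPID allocator refactored above. */
#include <pthread.h>
#include <stdint.h>
#include <stdio.h>

#define NR_VPIDS 16	/* small for the demo; the real limit is VMX_NR_VPIDS */

static uint32_t vpid_bitmap[(NR_VPIDS + 31) / 32] = { 1 };	/* bit 0 reserved */
static pthread_mutex_t vpid_lock = PTHREAD_MUTEX_INITIALIZER;

static int allocate_vpid(void)
{
	int vpid = 0;

	pthread_mutex_lock(&vpid_lock);
	for (int i = 1; i < NR_VPIDS; i++) {
		if (!(vpid_bitmap[i / 32] & (1u << (i % 32)))) {
			vpid_bitmap[i / 32] |= 1u << (i % 32);
			vpid = i;
			break;
		}
	}
	pthread_mutex_unlock(&vpid_lock);
	return vpid;		/* 0 means the pool is exhausted */
}

static void free_vpid(int vpid)
{
	if (vpid == 0)
		return;
	pthread_mutex_lock(&vpid_lock);
	vpid_bitmap[vpid / 32] &= ~(1u << (vpid % 32));
	pthread_mutex_unlock(&vpid_lock);
}

int main(void)
{
	int a = allocate_vpid(), b = allocate_vpid();

	printf("a=%d b=%d\n", a, b);		/* 1 and 2 */
	free_vpid(a);
	printf("reused=%d\n", allocate_vpid());	/* 1 again */
	return 0;
}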
 
 
@@ -4323,9 +4467,9 @@ static void vmx_disable_intercept_msr_write_x2apic(u32 msr)
 			msr, MSR_TYPE_W);
 			msr, MSR_TYPE_W);
 }
 }
 
 
-static int vmx_vm_has_apicv(struct kvm *kvm)
+static int vmx_cpu_uses_apicv(struct kvm_vcpu *vcpu)
 {
 {
-	return enable_apicv && irqchip_in_kernel(kvm);
+	return enable_apicv && lapic_in_kernel(vcpu);
 }
 }
 
 
 static int vmx_complete_nested_posted_interrupt(struct kvm_vcpu *vcpu)
 static int vmx_complete_nested_posted_interrupt(struct kvm_vcpu *vcpu)
@@ -4369,6 +4513,22 @@ static inline bool kvm_vcpu_trigger_posted_interrupt(struct kvm_vcpu *vcpu)
 {
 {
 #ifdef CONFIG_SMP
 #ifdef CONFIG_SMP
 	if (vcpu->mode == IN_GUEST_MODE) {
 	if (vcpu->mode == IN_GUEST_MODE) {
+		struct vcpu_vmx *vmx = to_vmx(vcpu);
+
+		/*
+		 * Currently, we don't support urgent interrupt,
+		 * all interrupts are recognized as non-urgent
+		 * interrupt, so we cannot post interrupts when
+		 * 'SN' is set.
+		 *
+		 * If the vcpu is in guest mode, it means it is
+		 * running instead of being scheduled out and
+		 * waiting in the run queue, and that's the only
+		 * case when 'SN' is set currently, warning if
+		 * 'SN' is set.
+		 */
+		WARN_ON_ONCE(pi_test_sn(&vmx->pi_desc));
+
 		apic->send_IPI_mask(get_cpu_mask(vcpu->cpu),
 		apic->send_IPI_mask(get_cpu_mask(vcpu->cpu),
 				POSTED_INTR_VECTOR);
 				POSTED_INTR_VECTOR);
 		return true;
 		return true;
@@ -4505,7 +4665,7 @@ static u32 vmx_pin_based_exec_ctrl(struct vcpu_vmx *vmx)
 {
 {
 	u32 pin_based_exec_ctrl = vmcs_config.pin_based_exec_ctrl;
 	u32 pin_based_exec_ctrl = vmcs_config.pin_based_exec_ctrl;
 
 
-	if (!vmx_vm_has_apicv(vmx->vcpu.kvm))
+	if (!vmx_cpu_uses_apicv(&vmx->vcpu))
 		pin_based_exec_ctrl &= ~PIN_BASED_POSTED_INTR;
 		pin_based_exec_ctrl &= ~PIN_BASED_POSTED_INTR;
 	return pin_based_exec_ctrl;
 	return pin_based_exec_ctrl;
 }
 }
@@ -4517,7 +4677,7 @@ static u32 vmx_exec_control(struct vcpu_vmx *vmx)
 	if (vmx->vcpu.arch.switch_db_regs & KVM_DEBUGREG_WONT_EXIT)
 	if (vmx->vcpu.arch.switch_db_regs & KVM_DEBUGREG_WONT_EXIT)
 		exec_control &= ~CPU_BASED_MOV_DR_EXITING;
 		exec_control &= ~CPU_BASED_MOV_DR_EXITING;
 
 
-	if (!vm_need_tpr_shadow(vmx->vcpu.kvm)) {
+	if (!cpu_need_tpr_shadow(&vmx->vcpu)) {
 		exec_control &= ~CPU_BASED_TPR_SHADOW;
 		exec_control &= ~CPU_BASED_TPR_SHADOW;
 #ifdef CONFIG_X86_64
 #ifdef CONFIG_X86_64
 		exec_control |= CPU_BASED_CR8_STORE_EXITING |
 		exec_control |= CPU_BASED_CR8_STORE_EXITING |
@@ -4534,7 +4694,7 @@ static u32 vmx_exec_control(struct vcpu_vmx *vmx)
 static u32 vmx_secondary_exec_control(struct vcpu_vmx *vmx)
 static u32 vmx_secondary_exec_control(struct vcpu_vmx *vmx)
 {
 {
 	u32 exec_control = vmcs_config.cpu_based_2nd_exec_ctrl;
 	u32 exec_control = vmcs_config.cpu_based_2nd_exec_ctrl;
-	if (!vm_need_virtualize_apic_accesses(vmx->vcpu.kvm))
+	if (!cpu_need_virtualize_apic_accesses(&vmx->vcpu))
 		exec_control &= ~SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
 		exec_control &= ~SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
 	if (vmx->vpid == 0)
 	if (vmx->vpid == 0)
 		exec_control &= ~SECONDARY_EXEC_ENABLE_VPID;
 		exec_control &= ~SECONDARY_EXEC_ENABLE_VPID;
@@ -4548,7 +4708,7 @@ static u32 vmx_secondary_exec_control(struct vcpu_vmx *vmx)
 		exec_control &= ~SECONDARY_EXEC_UNRESTRICTED_GUEST;
 		exec_control &= ~SECONDARY_EXEC_UNRESTRICTED_GUEST;
 	if (!ple_gap)
 	if (!ple_gap)
 		exec_control &= ~SECONDARY_EXEC_PAUSE_LOOP_EXITING;
 		exec_control &= ~SECONDARY_EXEC_PAUSE_LOOP_EXITING;
-	if (!vmx_vm_has_apicv(vmx->vcpu.kvm))
+	if (!vmx_cpu_uses_apicv(&vmx->vcpu))
 		exec_control &= ~(SECONDARY_EXEC_APIC_REGISTER_VIRT |
 		exec_control &= ~(SECONDARY_EXEC_APIC_REGISTER_VIRT |
 				  SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY);
 				  SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY);
 	exec_control &= ~SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE;
 	exec_control &= ~SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE;
@@ -4558,8 +4718,12 @@ static u32 vmx_secondary_exec_control(struct vcpu_vmx *vmx)
 	   a current VMCS12
 	   a current VMCS12
 	*/
 	*/
 	exec_control &= ~SECONDARY_EXEC_SHADOW_VMCS;
 	exec_control &= ~SECONDARY_EXEC_SHADOW_VMCS;
-	/* PML is enabled/disabled in creating/destorying vcpu */
-	exec_control &= ~SECONDARY_EXEC_ENABLE_PML;
+
+	if (!enable_pml)
+		exec_control &= ~SECONDARY_EXEC_ENABLE_PML;
+
+	/* Currently, we allow the L1 guest to run the pcommit instruction directly. */
+	exec_control &= ~SECONDARY_EXEC_PCOMMIT;
 
 
 	return exec_control;
 	return exec_control;
 }
 }
@@ -4604,12 +4768,11 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx)
 
 
 	vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, vmx_exec_control(vmx));
 	vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, vmx_exec_control(vmx));
 
 
-	if (cpu_has_secondary_exec_ctrls()) {
+	if (cpu_has_secondary_exec_ctrls())
 		vmcs_write32(SECONDARY_VM_EXEC_CONTROL,
 		vmcs_write32(SECONDARY_VM_EXEC_CONTROL,
 				vmx_secondary_exec_control(vmx));
 				vmx_secondary_exec_control(vmx));
-	}
 
 
-	if (vmx_vm_has_apicv(vmx->vcpu.kvm)) {
+	if (vmx_cpu_uses_apicv(&vmx->vcpu)) {
 		vmcs_write64(EOI_EXIT_BITMAP0, 0);
 		vmcs_write64(EOI_EXIT_BITMAP0, 0);
 		vmcs_write64(EOI_EXIT_BITMAP1, 0);
 		vmcs_write64(EOI_EXIT_BITMAP1, 0);
 		vmcs_write64(EOI_EXIT_BITMAP2, 0);
 		vmcs_write64(EOI_EXIT_BITMAP2, 0);
@@ -4753,7 +4916,7 @@ static void vmx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
 
 
 	if (cpu_has_vmx_tpr_shadow() && !init_event) {
 	if (cpu_has_vmx_tpr_shadow() && !init_event) {
 		vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, 0);
 		vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, 0);
-		if (vm_need_tpr_shadow(vcpu->kvm))
+		if (cpu_need_tpr_shadow(vcpu))
 			vmcs_write64(VIRTUAL_APIC_PAGE_ADDR,
 			vmcs_write64(VIRTUAL_APIC_PAGE_ADDR,
 				     __pa(vcpu->arch.apic->regs));
 				     __pa(vcpu->arch.apic->regs));
 		vmcs_write32(TPR_THRESHOLD, 0);
 		vmcs_write32(TPR_THRESHOLD, 0);
@@ -4761,7 +4924,7 @@ static void vmx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
 
 
 	kvm_make_request(KVM_REQ_APIC_PAGE_RELOAD, vcpu);
 	kvm_make_request(KVM_REQ_APIC_PAGE_RELOAD, vcpu);
 
 
-	if (vmx_vm_has_apicv(vcpu->kvm))
+	if (vmx_cpu_uses_apicv(vcpu))
 		memset(&vmx->pi_desc, 0, sizeof(struct pi_desc));
 		memset(&vmx->pi_desc, 0, sizeof(struct pi_desc));
 
 
 	if (vmx->vpid != 0)
 	if (vmx->vpid != 0)
@@ -4771,12 +4934,11 @@ static void vmx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
 	vmx_set_cr0(vcpu, cr0); /* enter rmode */
 	vmx_set_cr0(vcpu, cr0); /* enter rmode */
 	vmx->vcpu.arch.cr0 = cr0;
 	vmx->vcpu.arch.cr0 = cr0;
 	vmx_set_cr4(vcpu, 0);
 	vmx_set_cr4(vcpu, 0);
-	if (!init_event)
-		vmx_set_efer(vcpu, 0);
+	vmx_set_efer(vcpu, 0);
 	vmx_fpu_activate(vcpu);
 	vmx_fpu_activate(vcpu);
 	update_exception_bitmap(vcpu);
 	update_exception_bitmap(vcpu);
 
 
-	vpid_sync_context(vmx);
+	vpid_sync_context(vmx->vpid);
 }
 }
 
 
 /*
 /*
@@ -5296,7 +5458,7 @@ static int handle_cr(struct kvm_vcpu *vcpu)
 				u8 cr8 = (u8)val;
 				u8 cr8 = (u8)val;
 				err = kvm_set_cr8(vcpu, cr8);
 				err = kvm_set_cr8(vcpu, cr8);
 				kvm_complete_insn_gp(vcpu, err);
 				kvm_complete_insn_gp(vcpu, err);
-				if (irqchip_in_kernel(vcpu->kvm))
+				if (lapic_in_kernel(vcpu))
 					return 1;
 					return 1;
 				if (cr8_prev <= cr8)
 				if (cr8_prev <= cr8)
 					return 1;
 					return 1;
@@ -5510,17 +5672,6 @@ static int handle_interrupt_window(struct kvm_vcpu *vcpu)
 	kvm_make_request(KVM_REQ_EVENT, vcpu);
 	kvm_make_request(KVM_REQ_EVENT, vcpu);
 
 
 	++vcpu->stat.irq_window_exits;
 	++vcpu->stat.irq_window_exits;
-
-	/*
-	 * If the user space waits to inject interrupts, exit as soon as
-	 * possible
-	 */
-	if (!irqchip_in_kernel(vcpu->kvm) &&
-	    vcpu->run->request_interrupt_window &&
-	    !kvm_cpu_has_interrupt(vcpu)) {
-		vcpu->run->exit_reason = KVM_EXIT_IRQ_WINDOW_OPEN;
-		return 0;
-	}
 	return 1;
 	return 1;
 }
 }
 
 
@@ -5753,6 +5904,7 @@ static int handle_ept_misconfig(struct kvm_vcpu *vcpu)
 	gpa = vmcs_read64(GUEST_PHYSICAL_ADDRESS);
 	gpa = vmcs_read64(GUEST_PHYSICAL_ADDRESS);
 	if (!kvm_io_bus_write(vcpu, KVM_FAST_MMIO_BUS, gpa, 0, NULL)) {
 	if (!kvm_io_bus_write(vcpu, KVM_FAST_MMIO_BUS, gpa, 0, NULL)) {
 		skip_emulated_instruction(vcpu);
 		skip_emulated_instruction(vcpu);
+		trace_kvm_fast_mmio(gpa);
 		return 1;
 		return 1;
 	}
 	}
 
 
@@ -5910,6 +6062,25 @@ static void update_ple_window_actual_max(void)
 			                    ple_window_grow, INT_MIN);
 }
 
+/*
+ * Handler for POSTED_INTR_WAKEUP_VECTOR.
+ */
+static void wakeup_handler(void)
+{
+	struct kvm_vcpu *vcpu;
+	int cpu = smp_processor_id();
+
+	spin_lock(&per_cpu(blocked_vcpu_on_cpu_lock, cpu));
+	list_for_each_entry(vcpu, &per_cpu(blocked_vcpu_on_cpu, cpu),
+			blocked_vcpu_list) {
+		struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu);
+
+		if (pi_test_on(pi_desc) == 1)
+			kvm_vcpu_kick(vcpu);
+	}
+	spin_unlock(&per_cpu(blocked_vcpu_on_cpu_lock, cpu));
+}
+
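/*
 * Editor's note (not part of the patch): wakeup_handler() runs in
 * response to POSTED_INTR_WAKEUP_VECTOR on the CPU recorded as
 * vcpu->pre_pcpu by vmx_pre_block() later in this patch.  It walks
 * that CPU's blocked_vcpu_on_cpu list and kicks any vCPU whose
 * posted-interrupt descriptor has 'ON' set, i.e. any blocked vCPU for
 * which an interrupt was posted while it was sleeping.
 */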
 static __init int hardware_setup(void)
 {
 	int r = -ENOMEM, i, msr;
@@ -6096,6 +6267,8 @@ static __init int hardware_setup(void)
 		kvm_x86_ops->enable_log_dirty_pt_masked = NULL;
 		kvm_x86_ops->enable_log_dirty_pt_masked = NULL;
 	}
 	}
 
 
+	kvm_set_posted_intr_wakeup_handler(wakeup_handler);
+
 	return alloc_kvm_area();
 	return alloc_kvm_area();
 
 
 out8:
 out8:
@@ -6627,7 +6800,6 @@ static int nested_vmx_check_permission(struct kvm_vcpu *vcpu)
 
 
 static inline void nested_release_vmcs12(struct vcpu_vmx *vmx)
 static inline void nested_release_vmcs12(struct vcpu_vmx *vmx)
 {
 {
-	u32 exec_control;
 	if (vmx->nested.current_vmptr == -1ull)
 	if (vmx->nested.current_vmptr == -1ull)
 		return;
 		return;
 
 
@@ -6640,9 +6812,8 @@ static inline void nested_release_vmcs12(struct vcpu_vmx *vmx)
 		   they were modified */
 		   they were modified */
 		copy_shadow_to_vmcs12(vmx);
 		copy_shadow_to_vmcs12(vmx);
 		vmx->nested.sync_shadow_vmcs = false;
 		vmx->nested.sync_shadow_vmcs = false;
-		exec_control = vmcs_read32(SECONDARY_VM_EXEC_CONTROL);
-		exec_control &= ~SECONDARY_EXEC_SHADOW_VMCS;
-		vmcs_write32(SECONDARY_VM_EXEC_CONTROL, exec_control);
+		vmcs_clear_bits(SECONDARY_VM_EXEC_CONTROL,
+				SECONDARY_EXEC_SHADOW_VMCS);
 		vmcs_write64(VMCS_LINK_POINTER, -1ull);
 		vmcs_write64(VMCS_LINK_POINTER, -1ull);
 	}
 	}
 	vmx->nested.posted_intr_nv = -1;
 	vmx->nested.posted_intr_nv = -1;
@@ -6662,6 +6833,7 @@ static void free_nested(struct vcpu_vmx *vmx)
 		return;
 		return;
 
 
 	vmx->nested.vmxon = false;
 	vmx->nested.vmxon = false;
+	free_vpid(vmx->nested.vpid02);
 	nested_release_vmcs12(vmx);
 	nested_release_vmcs12(vmx);
 	if (enable_shadow_vmcs)
 	if (enable_shadow_vmcs)
 		free_vmcs(vmx->nested.current_shadow_vmcs);
 		free_vmcs(vmx->nested.current_shadow_vmcs);
@@ -7038,7 +7210,6 @@ static int handle_vmptrld(struct kvm_vcpu *vcpu)
 {
 {
 	struct vcpu_vmx *vmx = to_vmx(vcpu);
 	struct vcpu_vmx *vmx = to_vmx(vcpu);
 	gpa_t vmptr;
 	gpa_t vmptr;
-	u32 exec_control;
 
 
 	if (!nested_vmx_check_permission(vcpu))
 	if (!nested_vmx_check_permission(vcpu))
 		return 1;
 		return 1;
@@ -7070,9 +7241,8 @@ static int handle_vmptrld(struct kvm_vcpu *vcpu)
 		vmx->nested.current_vmcs12 = new_vmcs12;
 		vmx->nested.current_vmcs12 = new_vmcs12;
 		vmx->nested.current_vmcs12_page = page;
 		vmx->nested.current_vmcs12_page = page;
 		if (enable_shadow_vmcs) {
 		if (enable_shadow_vmcs) {
-			exec_control = vmcs_read32(SECONDARY_VM_EXEC_CONTROL);
-			exec_control |= SECONDARY_EXEC_SHADOW_VMCS;
-			vmcs_write32(SECONDARY_VM_EXEC_CONTROL, exec_control);
+			vmcs_set_bits(SECONDARY_VM_EXEC_CONTROL,
+				      SECONDARY_EXEC_SHADOW_VMCS);
 			vmcs_write64(VMCS_LINK_POINTER,
 			vmcs_write64(VMCS_LINK_POINTER,
 				     __pa(vmx->nested.current_shadow_vmcs));
 				     __pa(vmx->nested.current_shadow_vmcs));
 			vmx->nested.sync_shadow_vmcs = true;
 			vmx->nested.sync_shadow_vmcs = true;
@@ -7178,7 +7348,63 @@ static int handle_invept(struct kvm_vcpu *vcpu)
 
 
 static int handle_invvpid(struct kvm_vcpu *vcpu)
 static int handle_invvpid(struct kvm_vcpu *vcpu)
 {
 {
-	kvm_queue_exception(vcpu, UD_VECTOR);
+	struct vcpu_vmx *vmx = to_vmx(vcpu);
+	u32 vmx_instruction_info;
+	unsigned long type, types;
+	gva_t gva;
+	struct x86_exception e;
+	int vpid;
+
+	if (!(vmx->nested.nested_vmx_secondary_ctls_high &
+	      SECONDARY_EXEC_ENABLE_VPID) ||
+			!(vmx->nested.nested_vmx_vpid_caps & VMX_VPID_INVVPID_BIT)) {
+		kvm_queue_exception(vcpu, UD_VECTOR);
+		return 1;
+	}
+
+	if (!nested_vmx_check_permission(vcpu))
+		return 1;
+
+	vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO);
+	type = kvm_register_readl(vcpu, (vmx_instruction_info >> 28) & 0xf);
+
+	types = (vmx->nested.nested_vmx_vpid_caps >> 8) & 0x7;
+
+	if (!(types & (1UL << type))) {
+		nested_vmx_failValid(vcpu,
+			VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID);
+		return 1;
+	}
+
+	/* According to the Intel VMX instruction reference, the memory
+	 * operand is read even if it isn't needed (e.g., for type==global).
+	 */
+	if (get_vmx_mem_address(vcpu, vmcs_readl(EXIT_QUALIFICATION),
+			vmx_instruction_info, false, &gva))
+		return 1;
+	if (kvm_read_guest_virt(&vcpu->arch.emulate_ctxt, gva, &vpid,
+				sizeof(u32), &e)) {
+		kvm_inject_page_fault(vcpu, &e);
+		return 1;
+	}
+
+	switch (type) {
+	case VMX_VPID_EXTENT_ALL_CONTEXT:
+		if (get_vmcs12(vcpu)->virtual_processor_id == 0) {
+			nested_vmx_failValid(vcpu,
+				VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID);
+			return 1;
+		}
+		__vmx_flush_tlb(vcpu, to_vmx(vcpu)->nested.vpid02);
+		nested_vmx_succeed(vcpu);
+		break;
+	default:
+		/* Trap single context invalidation invvpid calls */
+		BUG_ON(1);
+		break;
+	}
+
+	skip_emulated_instruction(vcpu);
 	return 1;
 	return 1;
 }
 }
 
 
@@ -7207,6 +7433,13 @@ static int handle_pml_full(struct kvm_vcpu *vcpu)
 	return 1;
 	return 1;
 }
 }
 
 
+static int handle_pcommit(struct kvm_vcpu *vcpu)
+{
+	/* We never trap the pcommit instruction for the L1 guest. */
+	WARN_ON(1);
+	return 1;
+}
+
 /*
 /*
  * The exit handlers return 1 if the exit was handled fully and guest execution
  * The exit handlers return 1 if the exit was handled fully and guest execution
  * may resume.  Otherwise they set the kvm_run parameter to indicate what needs
  * may resume.  Otherwise they set the kvm_run parameter to indicate what needs
@@ -7257,6 +7490,7 @@ static int (*const kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = {
 	[EXIT_REASON_XSAVES]                  = handle_xsaves,
 	[EXIT_REASON_XSAVES]                  = handle_xsaves,
 	[EXIT_REASON_XRSTORS]                 = handle_xrstors,
 	[EXIT_REASON_XRSTORS]                 = handle_xrstors,
 	[EXIT_REASON_PML_FULL]		      = handle_pml_full,
 	[EXIT_REASON_PML_FULL]		      = handle_pml_full,
+	[EXIT_REASON_PCOMMIT]                 = handle_pcommit,
 };
 };
 
 
 static const int kvm_vmx_max_exit_handlers =
 static const int kvm_vmx_max_exit_handlers =
@@ -7558,6 +7792,8 @@ static bool nested_vmx_exit_handled(struct kvm_vcpu *vcpu)
 		 * the XSS exit bitmap in vmcs12.
 		 * the XSS exit bitmap in vmcs12.
 		 */
 		 */
 		return nested_cpu_has2(vmcs12, SECONDARY_EXEC_XSAVES);
 		return nested_cpu_has2(vmcs12, SECONDARY_EXEC_XSAVES);
+	case EXIT_REASON_PCOMMIT:
+		return nested_cpu_has2(vmcs12, SECONDARY_EXEC_PCOMMIT);
 	default:
 	default:
 		return true;
 		return true;
 	}
 	}
@@ -7569,10 +7805,9 @@ static void vmx_get_exit_info(struct kvm_vcpu *vcpu, u64 *info1, u64 *info2)
 	*info2 = vmcs_read32(VM_EXIT_INTR_INFO);
 	*info2 = vmcs_read32(VM_EXIT_INTR_INFO);
 }
 }
 
 
-static int vmx_enable_pml(struct vcpu_vmx *vmx)
+static int vmx_create_pml_buffer(struct vcpu_vmx *vmx)
 {
 {
 	struct page *pml_pg;
 	struct page *pml_pg;
-	u32 exec_control;
 
 
 	pml_pg = alloc_page(GFP_KERNEL | __GFP_ZERO);
 	pml_pg = alloc_page(GFP_KERNEL | __GFP_ZERO);
 	if (!pml_pg)
 	if (!pml_pg)
@@ -7583,24 +7818,15 @@ static int vmx_enable_pml(struct vcpu_vmx *vmx)
 	vmcs_write64(PML_ADDRESS, page_to_phys(vmx->pml_pg));
 	vmcs_write64(PML_ADDRESS, page_to_phys(vmx->pml_pg));
 	vmcs_write16(GUEST_PML_INDEX, PML_ENTITY_NUM - 1);
 	vmcs_write16(GUEST_PML_INDEX, PML_ENTITY_NUM - 1);
 
 
-	exec_control = vmcs_read32(SECONDARY_VM_EXEC_CONTROL);
-	exec_control |= SECONDARY_EXEC_ENABLE_PML;
-	vmcs_write32(SECONDARY_VM_EXEC_CONTROL, exec_control);
-
 	return 0;
 	return 0;
 }
 }
 
 
-static void vmx_disable_pml(struct vcpu_vmx *vmx)
+static void vmx_destroy_pml_buffer(struct vcpu_vmx *vmx)
 {
 {
-	u32 exec_control;
-
-	ASSERT(vmx->pml_pg);
-	__free_page(vmx->pml_pg);
-	vmx->pml_pg = NULL;
-
-	exec_control = vmcs_read32(SECONDARY_VM_EXEC_CONTROL);
-	exec_control &= ~SECONDARY_EXEC_ENABLE_PML;
-	vmcs_write32(SECONDARY_VM_EXEC_CONTROL, exec_control);
+	if (vmx->pml_pg) {
+		__free_page(vmx->pml_pg);
+		vmx->pml_pg = NULL;
+	}
 }
 }
 
 
 static void vmx_flush_pml_buffer(struct kvm_vcpu *vcpu)
 static void vmx_flush_pml_buffer(struct kvm_vcpu *vcpu)
@@ -7924,10 +8150,10 @@ static void vmx_set_virtual_x2apic_mode(struct kvm_vcpu *vcpu, bool set)
 	 * apicv
 	 * apicv
 	 */
 	 */
 	if (!cpu_has_vmx_virtualize_x2apic_mode() ||
 	if (!cpu_has_vmx_virtualize_x2apic_mode() ||
-				!vmx_vm_has_apicv(vcpu->kvm))
+				!vmx_cpu_uses_apicv(vcpu))
 		return;
 		return;
 
 
-	if (!vm_need_tpr_shadow(vcpu->kvm))
+	if (!cpu_need_tpr_shadow(vcpu))
 		return;
 		return;
 
 
 	sec_exec_control = vmcs_read32(SECONDARY_VM_EXEC_CONTROL);
 	sec_exec_control = vmcs_read32(SECONDARY_VM_EXEC_CONTROL);
@@ -8029,9 +8255,10 @@ static void vmx_hwapic_irr_update(struct kvm_vcpu *vcpu, int max_irr)
 	}
 	}
 }
 }
 
 
-static void vmx_load_eoi_exitmap(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap)
+static void vmx_load_eoi_exitmap(struct kvm_vcpu *vcpu)
 {
 {
-	if (!vmx_vm_has_apicv(vcpu->kvm))
+	u64 *eoi_exit_bitmap = vcpu->arch.eoi_exit_bitmap;
+	if (!vmx_cpu_uses_apicv(vcpu))
 		return;
 		return;
 
 
 	vmcs_write64(EOI_EXIT_BITMAP0, eoi_exit_bitmap[0]);
 	vmcs_write64(EOI_EXIT_BITMAP0, eoi_exit_bitmap[0]);
@@ -8477,8 +8704,8 @@ static void vmx_free_vcpu(struct kvm_vcpu *vcpu)
 	struct vcpu_vmx *vmx = to_vmx(vcpu);
 	struct vcpu_vmx *vmx = to_vmx(vcpu);
 
 
 	if (enable_pml)
 	if (enable_pml)
-		vmx_disable_pml(vmx);
-	free_vpid(vmx);
+		vmx_destroy_pml_buffer(vmx);
+	free_vpid(vmx->vpid);
 	leave_guest_mode(vcpu);
 	leave_guest_mode(vcpu);
 	vmx_load_vmcs01(vcpu);
 	vmx_load_vmcs01(vcpu);
 	free_nested(vmx);
 	free_nested(vmx);
@@ -8497,7 +8724,7 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id)
 	if (!vmx)
 	if (!vmx)
 		return ERR_PTR(-ENOMEM);
 		return ERR_PTR(-ENOMEM);
 
 
-	allocate_vpid(vmx);
+	vmx->vpid = allocate_vpid();
 
 
 	err = kvm_vcpu_init(&vmx->vcpu, kvm, id);
 	err = kvm_vcpu_init(&vmx->vcpu, kvm, id);
 	if (err)
 	if (err)
@@ -8530,7 +8757,7 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id)
 	put_cpu();
 	put_cpu();
 	if (err)
 	if (err)
 		goto free_vmcs;
 		goto free_vmcs;
-	if (vm_need_virtualize_apic_accesses(kvm)) {
+	if (cpu_need_virtualize_apic_accesses(&vmx->vcpu)) {
 		err = alloc_apic_access_page(kvm);
 		err = alloc_apic_access_page(kvm);
 		if (err)
 		if (err)
 			goto free_vmcs;
 			goto free_vmcs;
@@ -8545,8 +8772,10 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id)
 			goto free_vmcs;
 			goto free_vmcs;
 	}
 	}
 
 
-	if (nested)
+	if (nested) {
 		nested_vmx_setup_ctls_msrs(vmx);
 		nested_vmx_setup_ctls_msrs(vmx);
+		vmx->nested.vpid02 = allocate_vpid();
+	}
 
 
 	vmx->nested.posted_intr_nv = -1;
 	vmx->nested.posted_intr_nv = -1;
 	vmx->nested.current_vmptr = -1ull;
 	vmx->nested.current_vmptr = -1ull;
@@ -8559,7 +8788,7 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id)
 	 * for the guest, etc.
 	 * for the guest, etc.
 	 */
 	 */
 	if (enable_pml) {
 	if (enable_pml) {
-		err = vmx_enable_pml(vmx);
+		err = vmx_create_pml_buffer(vmx);
 		if (err)
 		if (err)
 			goto free_vmcs;
 			goto free_vmcs;
 	}
 	}
@@ -8567,13 +8796,14 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id)
 	return &vmx->vcpu;
 	return &vmx->vcpu;
 
 
 free_vmcs:
 free_vmcs:
+	free_vpid(vmx->nested.vpid02);
 	free_loaded_vmcs(vmx->loaded_vmcs);
 	free_loaded_vmcs(vmx->loaded_vmcs);
 free_msrs:
 free_msrs:
 	kfree(vmx->guest_msrs);
 	kfree(vmx->guest_msrs);
 uninit_vcpu:
 uninit_vcpu:
 	kvm_vcpu_uninit(&vmx->vcpu);
 	kvm_vcpu_uninit(&vmx->vcpu);
 free_vcpu:
 free_vcpu:
-	free_vpid(vmx);
+	free_vpid(vmx->vpid);
 	kmem_cache_free(kvm_vcpu_cache, vmx);
 	kmem_cache_free(kvm_vcpu_cache, vmx);
 	return ERR_PTR(err);
 	return ERR_PTR(err);
 }
 }
@@ -8648,49 +8878,67 @@ static int vmx_get_lpage_level(void)
 		return PT_PDPE_LEVEL;
 }
 
+static void vmcs_set_secondary_exec_control(u32 new_ctl)
+{
+	/*
+	 * These bits in the secondary execution controls field
+	 * are dynamic; the others are mostly based on the hypervisor
+	 * architecture and the guest's CPUID.  Do not touch the
+	 * dynamic bits.
+	 */
+	u32 mask =
+		SECONDARY_EXEC_SHADOW_VMCS |
+		SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE |
+		SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
+
+	u32 cur_ctl = vmcs_read32(SECONDARY_VM_EXEC_CONTROL);
+
+	vmcs_write32(SECONDARY_VM_EXEC_CONTROL,
+		     (new_ctl & ~mask) | (cur_ctl & mask));
+}
+
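/*
 * Editor's note -- illustrative sketch, not part of the patch.
 * vmcs_set_secondary_exec_control() overwrites the CPUID-derived bits
 * of the control while leaving the three "dynamic" bits (shadow VMCS,
 * x2APIC mode, APIC-access virtualization) exactly as other code left
 * them.  The bit arithmetic in isolation:
 */
#include <stdint.h>

static uint32_t merge_keeping(uint32_t cur, uint32_t new, uint32_t keep_mask)
{
	/* bits inside keep_mask come from cur, everything else from new */
	return (new & ~keep_mask) | (cur & keep_mask);
}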
 static void vmx_cpuid_update(struct kvm_vcpu *vcpu)
 {
 	struct kvm_cpuid_entry2 *best;
 	struct vcpu_vmx *vmx = to_vmx(vcpu);
-	u32 exec_control;
+	u32 secondary_exec_ctl = vmx_secondary_exec_control(vmx);
 
 
-	vmx->rdtscp_enabled = false;
 	if (vmx_rdtscp_supported()) {
 	if (vmx_rdtscp_supported()) {
-		exec_control = vmcs_read32(SECONDARY_VM_EXEC_CONTROL);
-		if (exec_control & SECONDARY_EXEC_RDTSCP) {
-			best = kvm_find_cpuid_entry(vcpu, 0x80000001, 0);
-			if (best && (best->edx & bit(X86_FEATURE_RDTSCP)))
-				vmx->rdtscp_enabled = true;
-			else {
-				exec_control &= ~SECONDARY_EXEC_RDTSCP;
-				vmcs_write32(SECONDARY_VM_EXEC_CONTROL,
-						exec_control);
-			}
+		bool rdtscp_enabled = guest_cpuid_has_rdtscp(vcpu);
+		if (!rdtscp_enabled)
+			secondary_exec_ctl &= ~SECONDARY_EXEC_RDTSCP;
+
+		if (nested) {
+			if (rdtscp_enabled)
+				vmx->nested.nested_vmx_secondary_ctls_high |=
+					SECONDARY_EXEC_RDTSCP;
+			else
+				vmx->nested.nested_vmx_secondary_ctls_high &=
+					~SECONDARY_EXEC_RDTSCP;
 		}
 		}
-		if (nested && !vmx->rdtscp_enabled)
-			vmx->nested.nested_vmx_secondary_ctls_high &=
-				~SECONDARY_EXEC_RDTSCP;
 	}
 	}
 
 
 	/* Exposing INVPCID only when PCID is exposed */
 	/* Exposing INVPCID only when PCID is exposed */
 	best = kvm_find_cpuid_entry(vcpu, 0x7, 0);
 	best = kvm_find_cpuid_entry(vcpu, 0x7, 0);
 	if (vmx_invpcid_supported() &&
 	if (vmx_invpcid_supported() &&
-	    best && (best->ebx & bit(X86_FEATURE_INVPCID)) &&
-	    guest_cpuid_has_pcid(vcpu)) {
-		exec_control = vmcs_read32(SECONDARY_VM_EXEC_CONTROL);
-		exec_control |= SECONDARY_EXEC_ENABLE_INVPCID;
-		vmcs_write32(SECONDARY_VM_EXEC_CONTROL,
-			     exec_control);
-	} else {
-		if (cpu_has_secondary_exec_ctrls()) {
-			exec_control = vmcs_read32(SECONDARY_VM_EXEC_CONTROL);
-			exec_control &= ~SECONDARY_EXEC_ENABLE_INVPCID;
-			vmcs_write32(SECONDARY_VM_EXEC_CONTROL,
-				     exec_control);
-		}
+	    (!best || !(best->ebx & bit(X86_FEATURE_INVPCID)) ||
+	    !guest_cpuid_has_pcid(vcpu))) {
+		secondary_exec_ctl &= ~SECONDARY_EXEC_ENABLE_INVPCID;
+
 		if (best)
 		if (best)
 			best->ebx &= ~bit(X86_FEATURE_INVPCID);
 			best->ebx &= ~bit(X86_FEATURE_INVPCID);
 	}
 	}
+
+	vmcs_set_secondary_exec_control(secondary_exec_ctl);
+
+	if (static_cpu_has(X86_FEATURE_PCOMMIT) && nested) {
+		if (guest_cpuid_has_pcommit(vcpu))
+			vmx->nested.nested_vmx_secondary_ctls_high |=
+				SECONDARY_EXEC_PCOMMIT;
+		else
+			vmx->nested.nested_vmx_secondary_ctls_high &=
+				~SECONDARY_EXEC_PCOMMIT;
+	}
 }
 }
 
 
 static void vmx_set_supported_cpuid(u32 func, struct kvm_cpuid_entry2 *entry)
 static void vmx_set_supported_cpuid(u32 func, struct kvm_cpuid_entry2 *entry)
@@ -9298,13 +9546,13 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
 
 
 	if (cpu_has_secondary_exec_ctrls()) {
 	if (cpu_has_secondary_exec_ctrls()) {
 		exec_control = vmx_secondary_exec_control(vmx);
 		exec_control = vmx_secondary_exec_control(vmx);
-		if (!vmx->rdtscp_enabled)
-			exec_control &= ~SECONDARY_EXEC_RDTSCP;
+
 		/* Take the following fields only from vmcs12 */
 		/* Take the following fields only from vmcs12 */
 		exec_control &= ~(SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES |
 		exec_control &= ~(SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES |
 				  SECONDARY_EXEC_RDTSCP |
 				  SECONDARY_EXEC_RDTSCP |
 				  SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY |
 				  SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY |
-				  SECONDARY_EXEC_APIC_REGISTER_VIRT);
+				  SECONDARY_EXEC_APIC_REGISTER_VIRT |
+				  SECONDARY_EXEC_PCOMMIT);
 		if (nested_cpu_has(vmcs12,
 		if (nested_cpu_has(vmcs12,
 				CPU_BASED_ACTIVATE_SECONDARY_CONTROLS))
 				CPU_BASED_ACTIVATE_SECONDARY_CONTROLS))
 			exec_control |= vmcs12->secondary_vm_exec_control;
 			exec_control |= vmcs12->secondary_vm_exec_control;
@@ -9323,7 +9571,7 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
 				vmcs_write64(APIC_ACCESS_ADDR,
 				vmcs_write64(APIC_ACCESS_ADDR,
 				  page_to_phys(vmx->nested.apic_access_page));
 				  page_to_phys(vmx->nested.apic_access_page));
 		} else if (!(nested_cpu_has_virt_x2apic_mode(vmcs12)) &&
 		} else if (!(nested_cpu_has_virt_x2apic_mode(vmcs12)) &&
-			    (vm_need_virtualize_apic_accesses(vmx->vcpu.kvm))) {
+			    cpu_need_virtualize_apic_accesses(&vmx->vcpu)) {
 			exec_control |=
 			exec_control |=
 				SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
 				SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
 			kvm_vcpu_reload_apic_access_page(vcpu);
 			kvm_vcpu_reload_apic_access_page(vcpu);
@@ -9433,12 +9681,24 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
 
 	if (enable_vpid) {
 		/*
-		 * Trivially support vpid by letting L2s share their parent
-		 * L1's vpid. TODO: move to a more elaborate solution, giving
-		 * each L2 its own vpid and exposing the vpid feature to L1.
+		 * There is no direct mapping between vpid02 and vpid12:
+		 * vpid02 is per-vCPU for L0 and is reused, while vpid12
+		 * changes require a single INVVPID during nested vmentry.
+		 * vpid12 is allocated by L1 for L2, so it does not
+		 * influence the global bitmap (used for vpid01 and vpid02
+		 * allocation) even if a lot of nested vCPUs are spawned.
 		 */
-		vmcs_write16(VIRTUAL_PROCESSOR_ID, vmx->vpid);
-		vmx_flush_tlb(vcpu);
+		if (nested_cpu_has_vpid(vmcs12) && vmx->nested.vpid02) {
+			vmcs_write16(VIRTUAL_PROCESSOR_ID, vmx->nested.vpid02);
+			if (vmcs12->virtual_processor_id != vmx->nested.last_vpid) {
+				vmx->nested.last_vpid = vmcs12->virtual_processor_id;
+				__vmx_flush_tlb(vcpu, to_vmx(vcpu)->nested.vpid02);
+			}
+		} else {
+			vmcs_write16(VIRTUAL_PROCESSOR_ID, vmx->vpid);
+			vmx_flush_tlb(vcpu);
+		}
+
 	}
 
 	if (nested_cpu_has_ept(vmcs12)) {
@@ -10278,6 +10538,201 @@ static void vmx_enable_log_dirty_pt_masked(struct kvm *kvm,
 	kvm_mmu_clear_dirty_pt_masked(kvm, memslot, offset, mask);
 }
 
+/*
+ * This routine does the following for a vCPU that is about to block,
+ * when VT-d PI is enabled:
+ * - Add the vCPU to the wakeup list, so that when an interrupt
+ *   arrives we can find the right vCPU to wake up.
+ * - Update the posted-interrupt descriptor as follows:
+ *      'NDST' <-- vcpu->pre_pcpu
+ *      'NV' <-- POSTED_INTR_WAKEUP_VECTOR
+ * - If 'ON' becomes set during this process, at least one interrupt
+ *   has already been posted for this vCPU and it must not block;
+ *   return 1 in that case, otherwise return 0.
+ */
+static int vmx_pre_block(struct kvm_vcpu *vcpu)
+{
+	unsigned long flags;
+	unsigned int dest;
+	struct pi_desc old, new;
+	struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu);
+
+	if (!kvm_arch_has_assigned_device(vcpu->kvm) ||
+		!irq_remapping_cap(IRQ_POSTING_CAP))
+		return 0;
+
+	vcpu->pre_pcpu = vcpu->cpu;
+	spin_lock_irqsave(&per_cpu(blocked_vcpu_on_cpu_lock,
+			  vcpu->pre_pcpu), flags);
+	list_add_tail(&vcpu->blocked_vcpu_list,
+		      &per_cpu(blocked_vcpu_on_cpu,
+		      vcpu->pre_pcpu));
+	spin_unlock_irqrestore(&per_cpu(blocked_vcpu_on_cpu_lock,
+			       vcpu->pre_pcpu), flags);
+
+	do {
+		old.control = new.control = pi_desc->control;
+
+		/*
+		 * We should not block the vCPU if
+		 * an interrupt is posted for it.
+		 */
+		if (pi_test_on(pi_desc) == 1) {
+			spin_lock_irqsave(&per_cpu(blocked_vcpu_on_cpu_lock,
+					  vcpu->pre_pcpu), flags);
+			list_del(&vcpu->blocked_vcpu_list);
+			spin_unlock_irqrestore(
+					&per_cpu(blocked_vcpu_on_cpu_lock,
+					vcpu->pre_pcpu), flags);
+			vcpu->pre_pcpu = -1;
+
+			return 1;
+		}
+
+		WARN((pi_desc->sn == 1),
+		     "Warning: SN field of posted-interrupts "
+		     "is set before blocking\n");
+
+		/*
+		 * Since the vCPU can be preempted during this process,
+		 * vcpu->cpu may differ from pre_pcpu.  We therefore set
+		 * pre_pcpu as the destination of the wakeup notification
+		 * event, so that the wakeup handler can find the right
+		 * vCPU to wake up if an interrupt arrives while the vCPU
+		 * is blocked.
+		 */
+		dest = cpu_physical_id(vcpu->pre_pcpu);
+
+		if (x2apic_enabled())
+			new.ndst = dest;
+		else
+			new.ndst = (dest << 8) & 0xFF00;
+
+		/* set 'NV' to 'wakeup vector' */
+		new.nv = POSTED_INTR_WAKEUP_VECTOR;
+	} while (cmpxchg(&pi_desc->control, old.control,
+			new.control) != old.control);
+
+	return 0;
+}
+
+static void vmx_post_block(struct kvm_vcpu *vcpu)
+{
+	struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu);
+	struct pi_desc old, new;
+	unsigned int dest;
+	unsigned long flags;
+
+	if (!kvm_arch_has_assigned_device(vcpu->kvm) ||
+		!irq_remapping_cap(IRQ_POSTING_CAP))
+		return;
+
+	do {
+		old.control = new.control = pi_desc->control;
+
+		dest = cpu_physical_id(vcpu->cpu);
+
+		if (x2apic_enabled())
+			new.ndst = dest;
+		else
+			new.ndst = (dest << 8) & 0xFF00;
+
+		/* Allow posting non-urgent interrupts */
+		new.sn = 0;
+
+		/* set 'NV' to 'notification vector' */
+		new.nv = POSTED_INTR_VECTOR;
+	} while (cmpxchg(&pi_desc->control, old.control,
+			new.control) != old.control);
+
+	if (vcpu->pre_pcpu != -1) {
+		spin_lock_irqsave(
+			&per_cpu(blocked_vcpu_on_cpu_lock,
+			vcpu->pre_pcpu), flags);
+		list_del(&vcpu->blocked_vcpu_list);
+		spin_unlock_irqrestore(
+			&per_cpu(blocked_vcpu_on_cpu_lock,
+			vcpu->pre_pcpu), flags);
+		vcpu->pre_pcpu = -1;
+	}
+}
+
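/*
 * Editor's note -- illustrative sketch, not part of the patch.
 * vmx_pre_block()/vmx_post_block() rewrite the posted-interrupt
 * descriptor's control word with a cmpxchg() retry loop so that the
 * NV/NDST/SN update is atomic with respect to the hardware.  The same
 * read-modify-write pattern in portable C11 (the field layout below
 * is invented purely for illustration):
 */
#include <stdatomic.h>
#include <stdint.h>

struct pi_ctl {
	_Atomic uint64_t control;
};

static void update_control(struct pi_ctl *pi, uint64_t clear, uint64_t set)
{
	uint64_t old, new;

	old = atomic_load(&pi->control);
	do {
		/* recompute the target value from the latest snapshot */
		new = (old & ~clear) | set;
	} while (!atomic_compare_exchange_weak(&pi->control, &old, new));
}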
+/*
+ * vmx_update_pi_irte - set IRTE for Posted-Interrupts
+ *
+ * @kvm: kvm
+ * @host_irq: host irq of the interrupt
+ * @guest_irq: gsi of the interrupt
+ * @set: set or unset PI
+ * returns 0 on success, < 0 on failure
+ */
+static int vmx_update_pi_irte(struct kvm *kvm, unsigned int host_irq,
+			      uint32_t guest_irq, bool set)
+{
+	struct kvm_kernel_irq_routing_entry *e;
+	struct kvm_irq_routing_table *irq_rt;
+	struct kvm_lapic_irq irq;
+	struct kvm_vcpu *vcpu;
+	struct vcpu_data vcpu_info;
+	int idx, ret = -EINVAL;
+
+	if (!kvm_arch_has_assigned_device(kvm) ||
+		!irq_remapping_cap(IRQ_POSTING_CAP))
+		return 0;
+
+	idx = srcu_read_lock(&kvm->irq_srcu);
+	irq_rt = srcu_dereference(kvm->irq_routing, &kvm->irq_srcu);
+	BUG_ON(guest_irq >= irq_rt->nr_rt_entries);
+
+	hlist_for_each_entry(e, &irq_rt->map[guest_irq], link) {
+		if (e->type != KVM_IRQ_ROUTING_MSI)
+			continue;
+		/*
+		 * VT-d PI cannot post multicast/broadcast interrupts to
+		 * a vCPU, so we keep using interrupt remapping for those
+		 * kinds of interrupts.
+		 *
+		 * For lowest-priority interrupts, we only support those
+		 * with a single CPU as the destination, e.g. the user
+		 * configures the interrupt via /proc/irq or uses
+		 * irqbalance to make it single-CPU.
+		 *
+		 * Full lowest-priority interrupt support will be added
+		 * later.
+		 */
+
+		kvm_set_msi_irq(e, &irq);
+		if (!kvm_intr_is_single_vcpu(kvm, &irq, &vcpu))
+			continue;
+
+		vcpu_info.pi_desc_addr = __pa(vcpu_to_pi_desc(vcpu));
+		vcpu_info.vector = irq.vector;
+
+		trace_kvm_pi_irte_update(vcpu->vcpu_id, e->gsi,
+				vcpu_info.vector, vcpu_info.pi_desc_addr, set);
+
+		if (set)
+			ret = irq_set_vcpu_affinity(host_irq, &vcpu_info);
+		else {
+			/* suppress notification event before unposting */
+			pi_set_sn(vcpu_to_pi_desc(vcpu));
+			ret = irq_set_vcpu_affinity(host_irq, NULL);
+			pi_clear_sn(vcpu_to_pi_desc(vcpu));
+		}
+
+		if (ret < 0) {
+			printk(KERN_INFO "%s: failed to update PI IRTE\n",
+					__func__);
+			goto out;
+		}
+	}
+
+	ret = 0;
+out:
+	srcu_read_unlock(&kvm->irq_srcu, idx);
+	return ret;
+}
+
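/*
 * Editor's note -- illustrative sketch, not part of the patch.
 * vmx_update_pi_irte() is reached through the irqbypass connection
 * between a VFIO MSI producer and a KVM irqfd consumer.  From user
 * space, the VMM only has to tie the eventfd it handed to VFIO to a
 * guest GSI with KVM_IRQFD; the bypass manager then lets KVM program
 * the IRTE in posted format:
 */
#include <linux/kvm.h>
#include <sys/ioctl.h>

static int attach_irqfd(int vm_fd, int event_fd, unsigned int gsi)
{
	struct kvm_irqfd irqfd = {
		.fd  = event_fd,	/* same eventfd used as the VFIO MSI trigger */
		.gsi = gsi,		/* guest interrupt this eventfd feeds */
	};

	return ioctl(vm_fd, KVM_IRQFD, &irqfd);
}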
 static struct kvm_x86_ops vmx_x86_ops = {
 	.cpu_has_kvm_support = cpu_has_kvm_support,
 	.disabled_by_bios = vmx_disabled_by_bios,
@@ -10347,7 +10802,7 @@ static struct kvm_x86_ops vmx_x86_ops = {
 	.update_cr8_intercept = update_cr8_intercept,
 	.update_cr8_intercept = update_cr8_intercept,
 	.set_virtual_x2apic_mode = vmx_set_virtual_x2apic_mode,
 	.set_virtual_x2apic_mode = vmx_set_virtual_x2apic_mode,
 	.set_apic_access_page_addr = vmx_set_apic_access_page_addr,
 	.set_apic_access_page_addr = vmx_set_apic_access_page_addr,
-	.vm_has_apicv = vmx_vm_has_apicv,
+	.cpu_uses_apicv = vmx_cpu_uses_apicv,
 	.load_eoi_exitmap = vmx_load_eoi_exitmap,
 	.load_eoi_exitmap = vmx_load_eoi_exitmap,
 	.hwapic_irr_update = vmx_hwapic_irr_update,
 	.hwapic_irr_update = vmx_hwapic_irr_update,
 	.hwapic_isr_update = vmx_hwapic_isr_update,
 	.hwapic_isr_update = vmx_hwapic_isr_update,
@@ -10394,7 +10849,12 @@ static struct kvm_x86_ops vmx_x86_ops = {
 	.flush_log_dirty = vmx_flush_log_dirty,
 	.flush_log_dirty = vmx_flush_log_dirty,
 	.enable_log_dirty_pt_masked = vmx_enable_log_dirty_pt_masked,
 	.enable_log_dirty_pt_masked = vmx_enable_log_dirty_pt_masked,
 
 
+	.pre_block = vmx_pre_block,
+	.post_block = vmx_post_block,
+
 	.pmu_ops = &intel_pmu_ops,
 	.pmu_ops = &intel_pmu_ops,
+
+	.update_pi_irte = vmx_update_pi_irte,
 };
 };
 
 
 static int __init vmx_init(void)
 static int __init vmx_init(void)

+ 197 - 59
arch/x86/kvm/x86.c

@@ -51,6 +51,8 @@
 #include <linux/pci.h>
 #include <linux/pci.h>
 #include <linux/timekeeper_internal.h>
 #include <linux/timekeeper_internal.h>
 #include <linux/pvclock_gtod.h>
 #include <linux/pvclock_gtod.h>
+#include <linux/kvm_irqfd.h>
+#include <linux/irqbypass.h>
 #include <trace/events/kvm.h>
 #include <trace/events/kvm.h>
 
 
 #define CREATE_TRACE_POINTS
 #define CREATE_TRACE_POINTS
@@ -64,6 +66,7 @@
 #include <asm/fpu/internal.h> /* Ugh! */
 #include <asm/fpu/internal.h> /* Ugh! */
 #include <asm/pvclock.h>
 #include <asm/pvclock.h>
 #include <asm/div64.h>
 #include <asm/div64.h>
+#include <asm/irq_remapping.h>
 
 
 #define MAX_IO_MSRS 256
 #define MAX_IO_MSRS 256
 #define KVM_MAX_MCE_BANKS 32
 #define KVM_MAX_MCE_BANKS 32
@@ -622,7 +625,9 @@ int kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
 	if ((cr0 ^ old_cr0) & update_bits)
 	if ((cr0 ^ old_cr0) & update_bits)
 		kvm_mmu_reset_context(vcpu);
 		kvm_mmu_reset_context(vcpu);
 
 
-	if ((cr0 ^ old_cr0) & X86_CR0_CD)
+	if (((cr0 ^ old_cr0) & X86_CR0_CD) &&
+	    kvm_arch_has_noncoherent_dma(vcpu->kvm) &&
+	    !kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_CD_NW_CLEARED))
 		kvm_zap_gfn_range(vcpu->kvm, 0, ~0ULL);
 		kvm_zap_gfn_range(vcpu->kvm, 0, ~0ULL);
 
 
 	return 0;
 	return 0;
@@ -789,7 +794,7 @@ int kvm_set_cr8(struct kvm_vcpu *vcpu, unsigned long cr8)
 {
 {
 	if (cr8 & CR8_RESERVED_BITS)
 	if (cr8 & CR8_RESERVED_BITS)
 		return 1;
 		return 1;
-	if (irqchip_in_kernel(vcpu->kvm))
+	if (lapic_in_kernel(vcpu))
 		kvm_lapic_set_tpr(vcpu, cr8);
 		kvm_lapic_set_tpr(vcpu, cr8);
 	else
 	else
 		vcpu->arch.cr8 = cr8;
 		vcpu->arch.cr8 = cr8;
@@ -799,7 +804,7 @@ EXPORT_SYMBOL_GPL(kvm_set_cr8);
 
 
 unsigned long kvm_get_cr8(struct kvm_vcpu *vcpu)
 unsigned long kvm_get_cr8(struct kvm_vcpu *vcpu)
 {
 {
-	if (irqchip_in_kernel(vcpu->kvm))
+	if (lapic_in_kernel(vcpu))
 		return kvm_lapic_get_cr8(vcpu);
 		return kvm_lapic_get_cr8(vcpu);
 	else
 	else
 		return vcpu->arch.cr8;
 		return vcpu->arch.cr8;
@@ -953,6 +958,9 @@ static u32 emulated_msrs[] = {
 	HV_X64_MSR_TIME_REF_COUNT, HV_X64_MSR_REFERENCE_TSC,
 	HV_X64_MSR_TIME_REF_COUNT, HV_X64_MSR_REFERENCE_TSC,
 	HV_X64_MSR_CRASH_P0, HV_X64_MSR_CRASH_P1, HV_X64_MSR_CRASH_P2,
 	HV_X64_MSR_CRASH_P0, HV_X64_MSR_CRASH_P1, HV_X64_MSR_CRASH_P2,
 	HV_X64_MSR_CRASH_P3, HV_X64_MSR_CRASH_P4, HV_X64_MSR_CRASH_CTL,
 	HV_X64_MSR_CRASH_P3, HV_X64_MSR_CRASH_P4, HV_X64_MSR_CRASH_CTL,
+	HV_X64_MSR_RESET,
+	HV_X64_MSR_VP_INDEX,
+	HV_X64_MSR_VP_RUNTIME,
 	HV_X64_MSR_APIC_ASSIST_PAGE, MSR_KVM_ASYNC_PF_EN, MSR_KVM_STEAL_TIME,
 	HV_X64_MSR_APIC_ASSIST_PAGE, MSR_KVM_ASYNC_PF_EN, MSR_KVM_STEAL_TIME,
 	MSR_KVM_PV_EOI_EN,
 	MSR_KVM_PV_EOI_EN,
 
 
@@ -1898,6 +1906,8 @@ static void accumulate_steal_time(struct kvm_vcpu *vcpu)
 
 
 static void record_steal_time(struct kvm_vcpu *vcpu)
 static void record_steal_time(struct kvm_vcpu *vcpu)
 {
 {
+	accumulate_steal_time(vcpu);
+
 	if (!(vcpu->arch.st.msr_val & KVM_MSR_ENABLED))
 	if (!(vcpu->arch.st.msr_val & KVM_MSR_ENABLED))
 		return;
 		return;
 
 
@@ -2048,12 +2058,6 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
 		if (!(data & KVM_MSR_ENABLED))
 		if (!(data & KVM_MSR_ENABLED))
 			break;
 			break;
 
 
-		vcpu->arch.st.last_steal = current->sched_info.run_delay;
-
-		preempt_disable();
-		accumulate_steal_time(vcpu);
-		preempt_enable();
-
 		kvm_make_request(KVM_REQ_STEAL_UPDATE, vcpu);
 		kvm_make_request(KVM_REQ_STEAL_UPDATE, vcpu);
 
 
 		break;
 		break;
@@ -2449,6 +2453,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
 	case KVM_CAP_ENABLE_CAP_VM:
 	case KVM_CAP_ENABLE_CAP_VM:
 	case KVM_CAP_DISABLE_QUIRKS:
 	case KVM_CAP_DISABLE_QUIRKS:
 	case KVM_CAP_SET_BOOT_CPU_ID:
 	case KVM_CAP_SET_BOOT_CPU_ID:
+	case KVM_CAP_SPLIT_IRQCHIP:
 #ifdef CONFIG_KVM_DEVICE_ASSIGNMENT
 #ifdef CONFIG_KVM_DEVICE_ASSIGNMENT
 	case KVM_CAP_ASSIGN_DEV_IRQ:
 	case KVM_CAP_ASSIGN_DEV_IRQ:
 	case KVM_CAP_PCI_2_3:
 	case KVM_CAP_PCI_2_3:
@@ -2628,7 +2633,6 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
 		vcpu->cpu = cpu;
 		vcpu->cpu = cpu;
 	}
 	}
 
 
-	accumulate_steal_time(vcpu);
 	kvm_make_request(KVM_REQ_STEAL_UPDATE, vcpu);
 	kvm_make_request(KVM_REQ_STEAL_UPDATE, vcpu);
 }
 }
 
 
@@ -2662,12 +2666,24 @@ static int kvm_vcpu_ioctl_interrupt(struct kvm_vcpu *vcpu,
 {
 	if (irq->irq >= KVM_NR_INTERRUPTS)
 		return -EINVAL;
-	if (irqchip_in_kernel(vcpu->kvm))
+
+	if (!irqchip_in_kernel(vcpu->kvm)) {
+		kvm_queue_interrupt(vcpu, irq->irq, false);
+		kvm_make_request(KVM_REQ_EVENT, vcpu);
+		return 0;
+	}
+
+	/*
+	 * With in-kernel LAPIC, we only use this to inject EXTINT, so
+	 * fail for in-kernel 8259.
+	 */
+	if (pic_in_kernel(vcpu->kvm))
 		return -ENXIO;
 
-	kvm_queue_interrupt(vcpu, irq->irq, false);
-	kvm_make_request(KVM_REQ_EVENT, vcpu);
+	if (vcpu->arch.pending_external_vector != -1)
+		return -EEXIST;
 
+	vcpu->arch.pending_external_vector = irq->irq;
 	return 0;
 }
 
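/*
 * Editor's note -- illustrative sketch, not part of the patch.
 * With the split irqchip, user space still delivers EXTINT through
 * KVM_INTERRUPT; the vector is parked in pending_external_vector
 * until the guest can accept it, and a second call before then now
 * fails with -EEXIST.  Minimal caller:
 */
#include <linux/kvm.h>
#include <sys/ioctl.h>

static int inject_extint(int vcpu_fd, unsigned int vector)
{
	struct kvm_interrupt intr = { .irq = vector };

	return ioctl(vcpu_fd, KVM_INTERRUPT, &intr);
}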
@@ -3176,7 +3192,7 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
 		struct kvm_vapic_addr va;
 		struct kvm_vapic_addr va;
 
 
 		r = -EINVAL;
 		r = -EINVAL;
-		if (!irqchip_in_kernel(vcpu->kvm))
+		if (!lapic_in_kernel(vcpu))
 			goto out;
 			goto out;
 		r = -EFAULT;
 		r = -EFAULT;
 		if (copy_from_user(&va, argp, sizeof va))
 		if (copy_from_user(&va, argp, sizeof va))
@@ -3425,41 +3441,35 @@ static int kvm_vm_ioctl_set_irqchip(struct kvm *kvm, struct kvm_irqchip *chip)
 
 
 static int kvm_vm_ioctl_get_pit(struct kvm *kvm, struct kvm_pit_state *ps)
 static int kvm_vm_ioctl_get_pit(struct kvm *kvm, struct kvm_pit_state *ps)
 {
 {
-	int r = 0;
-
 	mutex_lock(&kvm->arch.vpit->pit_state.lock);
 	mutex_lock(&kvm->arch.vpit->pit_state.lock);
 	memcpy(ps, &kvm->arch.vpit->pit_state, sizeof(struct kvm_pit_state));
 	memcpy(ps, &kvm->arch.vpit->pit_state, sizeof(struct kvm_pit_state));
 	mutex_unlock(&kvm->arch.vpit->pit_state.lock);
 	mutex_unlock(&kvm->arch.vpit->pit_state.lock);
-	return r;
+	return 0;
 }
 }
 
 
 static int kvm_vm_ioctl_set_pit(struct kvm *kvm, struct kvm_pit_state *ps)
 static int kvm_vm_ioctl_set_pit(struct kvm *kvm, struct kvm_pit_state *ps)
 {
 {
-	int r = 0;
-
 	mutex_lock(&kvm->arch.vpit->pit_state.lock);
 	mutex_lock(&kvm->arch.vpit->pit_state.lock);
 	memcpy(&kvm->arch.vpit->pit_state, ps, sizeof(struct kvm_pit_state));
 	memcpy(&kvm->arch.vpit->pit_state, ps, sizeof(struct kvm_pit_state));
 	kvm_pit_load_count(kvm, 0, ps->channels[0].count, 0);
 	kvm_pit_load_count(kvm, 0, ps->channels[0].count, 0);
 	mutex_unlock(&kvm->arch.vpit->pit_state.lock);
 	mutex_unlock(&kvm->arch.vpit->pit_state.lock);
-	return r;
+	return 0;
 }
 }
 
 
 static int kvm_vm_ioctl_get_pit2(struct kvm *kvm, struct kvm_pit_state2 *ps)
 static int kvm_vm_ioctl_get_pit2(struct kvm *kvm, struct kvm_pit_state2 *ps)
 {
 {
-	int r = 0;
-
 	mutex_lock(&kvm->arch.vpit->pit_state.lock);
 	mutex_lock(&kvm->arch.vpit->pit_state.lock);
 	memcpy(ps->channels, &kvm->arch.vpit->pit_state.channels,
 	memcpy(ps->channels, &kvm->arch.vpit->pit_state.channels,
 		sizeof(ps->channels));
 		sizeof(ps->channels));
 	ps->flags = kvm->arch.vpit->pit_state.flags;
 	ps->flags = kvm->arch.vpit->pit_state.flags;
 	mutex_unlock(&kvm->arch.vpit->pit_state.lock);
 	mutex_unlock(&kvm->arch.vpit->pit_state.lock);
 	memset(&ps->reserved, 0, sizeof(ps->reserved));
 	memset(&ps->reserved, 0, sizeof(ps->reserved));
-	return r;
+	return 0;
 }
 }
 
 
 static int kvm_vm_ioctl_set_pit2(struct kvm *kvm, struct kvm_pit_state2 *ps)
 static int kvm_vm_ioctl_set_pit2(struct kvm *kvm, struct kvm_pit_state2 *ps)
 {
 {
-	int r = 0, start = 0;
+	int start = 0;
 	u32 prev_legacy, cur_legacy;
 	u32 prev_legacy, cur_legacy;
 	mutex_lock(&kvm->arch.vpit->pit_state.lock);
 	mutex_lock(&kvm->arch.vpit->pit_state.lock);
 	prev_legacy = kvm->arch.vpit->pit_state.flags & KVM_PIT_FLAGS_HPET_LEGACY;
 	prev_legacy = kvm->arch.vpit->pit_state.flags & KVM_PIT_FLAGS_HPET_LEGACY;
@@ -3471,7 +3481,7 @@ static int kvm_vm_ioctl_set_pit2(struct kvm *kvm, struct kvm_pit_state2 *ps)
 	kvm->arch.vpit->pit_state.flags = ps->flags;
 	kvm->arch.vpit->pit_state.flags = ps->flags;
 	kvm_pit_load_count(kvm, 0, kvm->arch.vpit->pit_state.channels[0].count, start);
 	kvm_pit_load_count(kvm, 0, kvm->arch.vpit->pit_state.channels[0].count, start);
 	mutex_unlock(&kvm->arch.vpit->pit_state.lock);
 	mutex_unlock(&kvm->arch.vpit->pit_state.lock);
-	return r;
+	return 0;
 }
 }
 
 
 static int kvm_vm_ioctl_reinject(struct kvm *kvm,
 static int kvm_vm_ioctl_reinject(struct kvm *kvm,
@@ -3556,6 +3566,28 @@ static int kvm_vm_ioctl_enable_cap(struct kvm *kvm,
 		kvm->arch.disabled_quirks = cap->args[0];
 		r = 0;
 		break;
+	case KVM_CAP_SPLIT_IRQCHIP: {
+		mutex_lock(&kvm->lock);
+		r = -EINVAL;
+		if (cap->args[0] > MAX_NR_RESERVED_IOAPIC_PINS)
+			goto split_irqchip_unlock;
+		r = -EEXIST;
+		if (irqchip_in_kernel(kvm))
+			goto split_irqchip_unlock;
+		if (atomic_read(&kvm->online_vcpus))
+			goto split_irqchip_unlock;
+		r = kvm_setup_empty_irq_routing(kvm);
+		if (r)
+			goto split_irqchip_unlock;
+		/* Pairs with irqchip_in_kernel. */
+		smp_wmb();
+		kvm->arch.irqchip_split = true;
+		kvm->arch.nr_reserved_ioapic_pins = cap->args[0];
+		r = 0;
+split_irqchip_unlock:
+		mutex_unlock(&kvm->lock);
+		break;
+	}
 	default:
 		r = -EINVAL;
 		break;
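/*
 * Editor's note -- illustrative sketch, not part of the patch.
 * A VMM opts in to the split irqchip (LAPIC in the kernel; IOAPIC,
 * PIC and PIT in user space) with KVM_ENABLE_CAP on the VM fd before
 * creating any vCPU, passing the number of IOAPIC routes to reserve
 * in args[0]:
 */
#include <linux/kvm.h>
#include <sys/ioctl.h>

static int enable_split_irqchip(int vm_fd, unsigned int nr_ioapic_pins)
{
	struct kvm_enable_cap cap = {
		.cap  = KVM_CAP_SPLIT_IRQCHIP,
		.args = { nr_ioapic_pins },	/* e.g. 24 for a standard IOAPIC */
	};

	return ioctl(vm_fd, KVM_ENABLE_CAP, &cap);
}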
@@ -3669,7 +3701,7 @@ long kvm_arch_vm_ioctl(struct file *filp,
 		}
 		}
 
 
 		r = -ENXIO;
 		r = -ENXIO;
-		if (!irqchip_in_kernel(kvm))
+		if (!irqchip_in_kernel(kvm) || irqchip_split(kvm))
 			goto get_irqchip_out;
 			goto get_irqchip_out;
 		r = kvm_vm_ioctl_get_irqchip(kvm, chip);
 		r = kvm_vm_ioctl_get_irqchip(kvm, chip);
 		if (r)
 		if (r)
@@ -3693,7 +3725,7 @@ long kvm_arch_vm_ioctl(struct file *filp,
 		}
 		}
 
 
 		r = -ENXIO;
 		r = -ENXIO;
-		if (!irqchip_in_kernel(kvm))
+		if (!irqchip_in_kernel(kvm) || irqchip_split(kvm))
 			goto set_irqchip_out;
 			goto set_irqchip_out;
 		r = kvm_vm_ioctl_set_irqchip(kvm, chip);
 		r = kvm_vm_ioctl_set_irqchip(kvm, chip);
 		if (r)
 		if (r)
@@ -4060,6 +4092,15 @@ static int kvm_read_guest_virt_system(struct x86_emulate_ctxt *ctxt,
 	return kvm_read_guest_virt_helper(addr, val, bytes, vcpu, 0, exception);
 	return kvm_read_guest_virt_helper(addr, val, bytes, vcpu, 0, exception);
 }
 }
 
 
+static int kvm_read_guest_phys_system(struct x86_emulate_ctxt *ctxt,
+		unsigned long addr, void *val, unsigned int bytes)
+{
+	struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
+	int r = kvm_vcpu_read_guest(vcpu, addr, val, bytes);
+
+	return r < 0 ? X86EMUL_IO_NEEDED : X86EMUL_CONTINUE;
+}
+
 int kvm_write_guest_virt_system(struct x86_emulate_ctxt *ctxt,
 int kvm_write_guest_virt_system(struct x86_emulate_ctxt *ctxt,
 				       gva_t addr, void *val,
 				       gva_t addr, void *val,
 				       unsigned int bytes,
 				       unsigned int bytes,
@@ -4795,6 +4836,7 @@ static const struct x86_emulate_ops emulate_ops = {
 	.write_gpr           = emulator_write_gpr,
 	.write_gpr           = emulator_write_gpr,
 	.read_std            = kvm_read_guest_virt_system,
 	.read_std            = kvm_read_guest_virt_system,
 	.write_std           = kvm_write_guest_virt_system,
 	.write_std           = kvm_write_guest_virt_system,
+	.read_phys           = kvm_read_guest_phys_system,
 	.fetch               = kvm_fetch_guest_virt,
 	.fetch               = kvm_fetch_guest_virt,
 	.read_emulated       = emulator_read_emulated,
 	.read_emulated       = emulator_read_emulated,
 	.write_emulated      = emulator_write_emulated,
 	.write_emulated      = emulator_write_emulated,
@@ -5667,7 +5709,7 @@ void kvm_arch_exit(void)
 int kvm_vcpu_halt(struct kvm_vcpu *vcpu)
 int kvm_vcpu_halt(struct kvm_vcpu *vcpu)
 {
 {
 	++vcpu->stat.halt_exits;
 	++vcpu->stat.halt_exits;
-	if (irqchip_in_kernel(vcpu->kvm)) {
+	if (lapic_in_kernel(vcpu)) {
 		vcpu->arch.mp_state = KVM_MP_STATE_HALTED;
 		vcpu->arch.mp_state = KVM_MP_STATE_HALTED;
 		return 1;
 		return 1;
 	} else {
 	} else {
@@ -5774,9 +5816,15 @@ static int emulator_fix_hypercall(struct x86_emulate_ctxt *ctxt)
  */
  */
 static int dm_request_for_irq_injection(struct kvm_vcpu *vcpu)
 static int dm_request_for_irq_injection(struct kvm_vcpu *vcpu)
 {
 {
-	return (!irqchip_in_kernel(vcpu->kvm) && !kvm_cpu_has_interrupt(vcpu) &&
-		vcpu->run->request_interrupt_window &&
-		kvm_arch_interrupt_allowed(vcpu));
+	if (!vcpu->run->request_interrupt_window || pic_in_kernel(vcpu->kvm))
+		return false;
+
+	if (kvm_cpu_has_interrupt(vcpu))
+		return false;
+
+	return (irqchip_split(vcpu->kvm)
+		? kvm_apic_accept_pic_intr(vcpu)
+		: kvm_arch_interrupt_allowed(vcpu));
 }
 
 static void post_kvm_run_save(struct kvm_vcpu *vcpu)
@@ -5787,13 +5835,17 @@ static void post_kvm_run_save(struct kvm_vcpu *vcpu)
 	kvm_run->flags = is_smm(vcpu) ? KVM_RUN_X86_SMM : 0;
 	kvm_run->cr8 = kvm_get_cr8(vcpu);
 	kvm_run->apic_base = kvm_get_apic_base(vcpu);
-	if (irqchip_in_kernel(vcpu->kvm))
-		kvm_run->ready_for_interrupt_injection = 1;
-	else
+	if (!irqchip_in_kernel(vcpu->kvm))
 		kvm_run->ready_for_interrupt_injection =
 			kvm_arch_interrupt_allowed(vcpu) &&
 			!kvm_cpu_has_interrupt(vcpu) &&
 			!kvm_event_needs_reinjection(vcpu);
+	else if (!pic_in_kernel(vcpu->kvm))
+		kvm_run->ready_for_interrupt_injection =
+			kvm_apic_accept_pic_intr(vcpu) &&
+			!kvm_cpu_has_interrupt(vcpu);
+	else
+		kvm_run->ready_for_interrupt_injection = 1;
 }
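/*
 * Editor's note -- illustrative sketch, not part of the patch.
 * With the PIC in user space, the VMM sets request_interrupt_window
 * whenever it has an interrupt queued and injects it once KVM reports
 * ready_for_interrupt_injection (or exits with
 * KVM_EXIT_IRQ_WINDOW_OPEN).  vmm_has_pending_irq() and
 * vmm_next_vector() are invented placeholders for the VMM's own
 * interrupt-controller model:
 */
#include <linux/kvm.h>
#include <stdbool.h>
#include <sys/ioctl.h>

extern bool vmm_has_pending_irq(void);		/* hypothetical helper */
extern unsigned int vmm_next_vector(void);	/* hypothetical helper */

static void run_vcpu_once(int vcpu_fd, struct kvm_run *run)
{
	run->request_interrupt_window = vmm_has_pending_irq();
	ioctl(vcpu_fd, KVM_RUN, 0);

	/* KVM_EXIT_IRQ_WINDOW_OPEN needs no handling beyond this check */
	if (run->ready_for_interrupt_injection && vmm_has_pending_irq()) {
		struct kvm_interrupt intr = { .irq = vmm_next_vector() };

		ioctl(vcpu_fd, KVM_INTERRUPT, &intr);
	}
}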
 
 static void update_cr8_intercept(struct kvm_vcpu *vcpu)
@@ -6144,18 +6196,18 @@ static void process_smi(struct kvm_vcpu *vcpu)
 
 
 static void vcpu_scan_ioapic(struct kvm_vcpu *vcpu)
 static void vcpu_scan_ioapic(struct kvm_vcpu *vcpu)
 {
 {
-	u64 eoi_exit_bitmap[4];
-	u32 tmr[8];
-
 	if (!kvm_apic_hw_enabled(vcpu->arch.apic))
 	if (!kvm_apic_hw_enabled(vcpu->arch.apic))
 		return;
 		return;
 
 
-	memset(eoi_exit_bitmap, 0, 32);
-	memset(tmr, 0, 32);
+	memset(vcpu->arch.eoi_exit_bitmap, 0, 256 / 8);
 
 
-	kvm_ioapic_scan_entry(vcpu, eoi_exit_bitmap, tmr);
-	kvm_x86_ops->load_eoi_exitmap(vcpu, eoi_exit_bitmap);
-	kvm_apic_update_tmr(vcpu, tmr);
+	if (irqchip_split(vcpu->kvm))
+		kvm_scan_ioapic_routes(vcpu, vcpu->arch.eoi_exit_bitmap);
+	else {
+		kvm_x86_ops->sync_pir_to_irr(vcpu);
+		kvm_ioapic_scan_entry(vcpu, vcpu->arch.eoi_exit_bitmap);
+	}
+	kvm_x86_ops->load_eoi_exitmap(vcpu);
 }
 }
 
 
 static void kvm_vcpu_flush_tlb(struct kvm_vcpu *vcpu)
 static void kvm_vcpu_flush_tlb(struct kvm_vcpu *vcpu)
@@ -6168,7 +6220,7 @@ void kvm_vcpu_reload_apic_access_page(struct kvm_vcpu *vcpu)
 {
 {
 	struct page *page = NULL;
 	struct page *page = NULL;
 
 
-	if (!irqchip_in_kernel(vcpu->kvm))
+	if (!lapic_in_kernel(vcpu))
 		return;
 		return;
 
 
 	if (!kvm_x86_ops->set_apic_access_page_addr)
 	if (!kvm_x86_ops->set_apic_access_page_addr)
@@ -6206,7 +6258,7 @@ void kvm_arch_mmu_notifier_invalidate_page(struct kvm *kvm,
 static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
 static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
 {
 {
 	int r;
 	int r;
-	bool req_int_win = !irqchip_in_kernel(vcpu->kvm) &&
+	bool req_int_win = !lapic_in_kernel(vcpu) &&
 		vcpu->run->request_interrupt_window;
 		vcpu->run->request_interrupt_window;
 	bool req_immediate_exit = false;
 	bool req_immediate_exit = false;
 
 
@@ -6258,6 +6310,17 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
 			kvm_pmu_handle_event(vcpu);
 		if (kvm_check_request(KVM_REQ_PMI, vcpu))
 			kvm_pmu_deliver_pmi(vcpu);
+		if (kvm_check_request(KVM_REQ_IOAPIC_EOI_EXIT, vcpu)) {
+			BUG_ON(vcpu->arch.pending_ioapic_eoi > 255);
+			if (test_bit(vcpu->arch.pending_ioapic_eoi,
+				     (void *) vcpu->arch.eoi_exit_bitmap)) {
+				vcpu->run->exit_reason = KVM_EXIT_IOAPIC_EOI;
+				vcpu->run->eoi.vector =
+						vcpu->arch.pending_ioapic_eoi;
+				r = 0;
+				goto out;
+			}
+		}
 		if (kvm_check_request(KVM_REQ_SCAN_IOAPIC, vcpu))
 			vcpu_scan_ioapic(vcpu);
 		if (kvm_check_request(KVM_REQ_APIC_PAGE_RELOAD, vcpu))
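/*
 * Editor's note -- illustrative sketch, not part of the patch.
 * The two exits added above reach user space as KVM_EXIT_IOAPIC_EOI
 * (split irqchip: the EOI must be forwarded to the user-space IOAPIC)
 * and KVM_EXIT_SYSTEM_EVENT with KVM_SYSTEM_EVENT_RESET (Hyper-V
 * reset MSR).  vmm_ioapic_eoi() and vmm_request_reset() are invented
 * placeholders for the VMM's own device models:
 */
#include <linux/kvm.h>

extern void vmm_ioapic_eoi(unsigned int vector);	/* hypothetical */
extern void vmm_request_reset(void);			/* hypothetical */

static void handle_irqchip_exits(struct kvm_run *run)
{
	switch (run->exit_reason) {
	case KVM_EXIT_IOAPIC_EOI:
		vmm_ioapic_eoi(run->eoi.vector);
		break;
	case KVM_EXIT_SYSTEM_EVENT:
		if (run->system_event.type == KVM_SYSTEM_EVENT_RESET)
			vmm_request_reset();
		break;
	}
}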
@@ -6268,6 +6331,26 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
 			r = 0;
 			r = 0;
 			goto out;
 			goto out;
 		}
 		}
+		if (kvm_check_request(KVM_REQ_HV_RESET, vcpu)) {
+			vcpu->run->exit_reason = KVM_EXIT_SYSTEM_EVENT;
+			vcpu->run->system_event.type = KVM_SYSTEM_EVENT_RESET;
+			r = 0;
+			goto out;
+		}
+	}
+
+	/*
+	 * KVM_REQ_EVENT is not set when posted interrupts are set by
+	 * VT-d hardware, so we have to update RVI unconditionally.
+	 */
+	if (kvm_lapic_enabled(vcpu)) {
+		/*
+		 * Update architecture specific hints for APIC
+		 * virtual interrupt delivery.
+		 */
+		if (kvm_x86_ops->hwapic_irr_update)
+			kvm_x86_ops->hwapic_irr_update(vcpu,
+				kvm_lapic_find_highest_irr(vcpu));
 	}
 	}
 
 
 	if (kvm_check_request(KVM_REQ_EVENT, vcpu) || req_int_win) {
 	if (kvm_check_request(KVM_REQ_EVENT, vcpu) || req_int_win) {
@@ -6286,13 +6369,6 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
 			kvm_x86_ops->enable_irq_window(vcpu);
 			kvm_x86_ops->enable_irq_window(vcpu);
 
 
 		if (kvm_lapic_enabled(vcpu)) {
 		if (kvm_lapic_enabled(vcpu)) {
-			/*
-			 * Update architecture specific hints for APIC
-			 * virtual interrupt delivery.
-			 */
-			if (kvm_x86_ops->hwapic_irr_update)
-				kvm_x86_ops->hwapic_irr_update(vcpu,
-					kvm_lapic_find_highest_irr(vcpu));
 			update_cr8_intercept(vcpu);
 			update_cr8_intercept(vcpu);
 			kvm_lapic_sync_to_vapic(vcpu);
 			kvm_lapic_sync_to_vapic(vcpu);
 		}
 		}
@@ -6428,10 +6504,15 @@ out:
 
 
 static inline int vcpu_block(struct kvm *kvm, struct kvm_vcpu *vcpu)
 static inline int vcpu_block(struct kvm *kvm, struct kvm_vcpu *vcpu)
 {
 {
-	if (!kvm_arch_vcpu_runnable(vcpu)) {
+	if (!kvm_arch_vcpu_runnable(vcpu) &&
+	    (!kvm_x86_ops->pre_block || kvm_x86_ops->pre_block(vcpu) == 0)) {
 		srcu_read_unlock(&kvm->srcu, vcpu->srcu_idx);
 		srcu_read_unlock(&kvm->srcu, vcpu->srcu_idx);
 		kvm_vcpu_block(vcpu);
 		kvm_vcpu_block(vcpu);
 		vcpu->srcu_idx = srcu_read_lock(&kvm->srcu);
 		vcpu->srcu_idx = srcu_read_lock(&kvm->srcu);
+
+		if (kvm_x86_ops->post_block)
+			kvm_x86_ops->post_block(vcpu);
+
 		if (!kvm_check_request(KVM_REQ_UNHALT, vcpu))
 		if (!kvm_check_request(KVM_REQ_UNHALT, vcpu))
 			return 1;
 			return 1;
 	}
 	}
@@ -6468,10 +6549,12 @@ static int vcpu_run(struct kvm_vcpu *vcpu)
 	vcpu->srcu_idx = srcu_read_lock(&kvm->srcu);
 	vcpu->srcu_idx = srcu_read_lock(&kvm->srcu);
 
 
 	for (;;) {
 	for (;;) {
-		if (kvm_vcpu_running(vcpu))
+		if (kvm_vcpu_running(vcpu)) {
 			r = vcpu_enter_guest(vcpu);
 			r = vcpu_enter_guest(vcpu);
-		else
+		} else {
 			r = vcpu_block(kvm, vcpu);
 			r = vcpu_block(kvm, vcpu);
+		}
+
 		if (r <= 0)
 		if (r <= 0)
 			break;
 			break;
 
 
@@ -6480,8 +6563,8 @@ static int vcpu_run(struct kvm_vcpu *vcpu)
 			kvm_inject_pending_timer_irqs(vcpu);
 			kvm_inject_pending_timer_irqs(vcpu);
 
 
 		if (dm_request_for_irq_injection(vcpu)) {
 		if (dm_request_for_irq_injection(vcpu)) {
-			r = -EINTR;
-			vcpu->run->exit_reason = KVM_EXIT_INTR;
+			r = 0;
+			vcpu->run->exit_reason = KVM_EXIT_IRQ_WINDOW_OPEN;
 			++vcpu->stat.request_irq_exits;
 			++vcpu->stat.request_irq_exits;
 			break;
 			break;
 		}
 		}
@@ -6608,7 +6691,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
 	}
 	}
 
 
 	/* re-sync apic's tpr */
 	/* re-sync apic's tpr */
-	if (!irqchip_in_kernel(vcpu->kvm)) {
+	if (!lapic_in_kernel(vcpu)) {
 		if (kvm_set_cr8(vcpu, kvm_run->cr8) != 0) {
 		if (kvm_set_cr8(vcpu, kvm_run->cr8) != 0) {
 			r = -EINVAL;
 			r = -EINVAL;
 			goto out;
 			goto out;
@@ -7308,7 +7391,7 @@ bool kvm_vcpu_is_bsp(struct kvm_vcpu *vcpu)
 
 
 bool kvm_vcpu_compatible(struct kvm_vcpu *vcpu)
 bool kvm_vcpu_compatible(struct kvm_vcpu *vcpu)
 {
 {
-	return irqchip_in_kernel(vcpu->kvm) == (vcpu->arch.apic != NULL);
+	return irqchip_in_kernel(vcpu->kvm) == lapic_in_kernel(vcpu);
 }
 }
 
 
 struct static_key kvm_no_apic_vcpu __read_mostly;
 struct static_key kvm_no_apic_vcpu __read_mostly;
@@ -7377,6 +7460,8 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu)
 	kvm_async_pf_hash_reset(vcpu);
 	kvm_async_pf_hash_reset(vcpu);
 	kvm_pmu_init(vcpu);
 	kvm_pmu_init(vcpu);
 
 
+	vcpu->arch.pending_external_vector = -1;
+
 	return 0;
 	return 0;
 
 
 fail_free_mce_banks:
 fail_free_mce_banks:
@@ -7402,7 +7487,7 @@ void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu)
 	kvm_mmu_destroy(vcpu);
 	kvm_mmu_destroy(vcpu);
 	srcu_read_unlock(&vcpu->kvm->srcu, idx);
 	srcu_read_unlock(&vcpu->kvm->srcu, idx);
 	free_page((unsigned long)vcpu->arch.pio_data);
 	free_page((unsigned long)vcpu->arch.pio_data);
-	if (!irqchip_in_kernel(vcpu->kvm))
+	if (!lapic_in_kernel(vcpu))
 		static_key_slow_dec(&kvm_no_apic_vcpu);
 		static_key_slow_dec(&kvm_no_apic_vcpu);
 }
 }
 
 
@@ -8029,7 +8114,59 @@ bool kvm_arch_has_noncoherent_dma(struct kvm *kvm)
 }
 }
 EXPORT_SYMBOL_GPL(kvm_arch_has_noncoherent_dma);
 EXPORT_SYMBOL_GPL(kvm_arch_has_noncoherent_dma);
 
 
+int kvm_arch_irq_bypass_add_producer(struct irq_bypass_consumer *cons,
+				      struct irq_bypass_producer *prod)
+{
+	struct kvm_kernel_irqfd *irqfd =
+		container_of(cons, struct kvm_kernel_irqfd, consumer);
+
+	if (kvm_x86_ops->update_pi_irte) {
+		irqfd->producer = prod;
+		return kvm_x86_ops->update_pi_irte(irqfd->kvm,
+				prod->irq, irqfd->gsi, 1);
+	}
+
+	return -EINVAL;
+}
+
+void kvm_arch_irq_bypass_del_producer(struct irq_bypass_consumer *cons,
+				      struct irq_bypass_producer *prod)
+{
+	int ret;
+	struct kvm_kernel_irqfd *irqfd =
+		container_of(cons, struct kvm_kernel_irqfd, consumer);
+
+	if (!kvm_x86_ops->update_pi_irte) {
+		WARN_ON(irqfd->producer != NULL);
+		return;
+	}
+
+	WARN_ON(irqfd->producer != prod);
+	irqfd->producer = NULL;
+
+	/*
+	 * When the producer of a consumer is unregistered, we change back
+	 * to remapped mode, so we can reuse the current implementation
+	 * when the irq is masked/disabled or the consumer side (KVM
+	 * in this case) doesn't want to receive the interrupts.
+	 */
+	ret = kvm_x86_ops->update_pi_irte(irqfd->kvm, prod->irq, irqfd->gsi, 0);
+	if (ret)
+		printk(KERN_INFO "irq bypass consumer (token %p) unregistration"
+		       " fails: %d\n", irqfd->consumer.token, ret);
+}
+
+int kvm_arch_update_irqfd_routing(struct kvm *kvm, unsigned int host_irq,
+				   uint32_t guest_irq, bool set)
+{
+	if (!kvm_x86_ops->update_pi_irte)
+		return -EINVAL;
+
+	return kvm_x86_ops->update_pi_irte(kvm, host_irq, guest_irq, set);
+}
+
 EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_exit);
 EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_exit);
+EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_fast_mmio);
 EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_inj_virq);
 EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_inj_virq);
 EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_page_fault);
 EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_page_fault);
 EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_msr);
 EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_msr);
@@ -8044,3 +8181,4 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_nested_intercepts);
 EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_write_tsc_offset);
 EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_write_tsc_offset);
 EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_ple_window);
 EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_ple_window);
 EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_pml_full);
 EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_pml_full);
+EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_pi_irte_update);

+ 0 - 5
drivers/hv/hyperv_vmbus.h

@@ -63,9 +63,6 @@ enum hv_cpuid_function {
 /* Define version of the synthetic interrupt controller. */
 /* Define version of the synthetic interrupt controller. */
 #define HV_SYNIC_VERSION		(1)
 #define HV_SYNIC_VERSION		(1)
 
 
-/* Define the expected SynIC version. */
-#define HV_SYNIC_VERSION_1		(0x1)
-
 /* Define synthetic interrupt controller message constants. */
 /* Define synthetic interrupt controller message constants. */
 #define HV_MESSAGE_SIZE			(256)
 #define HV_MESSAGE_SIZE			(256)
 #define HV_MESSAGE_PAYLOAD_BYTE_COUNT	(240)
 #define HV_MESSAGE_PAYLOAD_BYTE_COUNT	(240)
@@ -105,8 +102,6 @@ enum hv_message_type {
 	HVMSG_X64_LEGACY_FP_ERROR		= 0x80010005
 };
 
-/* Define the number of synthetic interrupt sources. */
-#define HV_SYNIC_SINT_COUNT		(16)
 #define HV_SYNIC_STIMER_COUNT		(4)
 
 /* Define invalid partition identifier. */

+ 8 - 4
drivers/iommu/irq_remapping.c

@@ -22,7 +22,7 @@ int irq_remap_broken;
 int disable_sourceid_checking;
 int no_x2apic_optout;
 
-int disable_irq_post = 1;
+int disable_irq_post = 0;
 
 static int disable_irq_remap;
 static struct irq_remap_ops *remap_ops;
@@ -58,14 +58,18 @@ static __init int setup_irqremap(char *str)
 		return -EINVAL;
 
 	while (*str) {
-		if (!strncmp(str, "on", 2))
+		if (!strncmp(str, "on", 2)) {
 			disable_irq_remap = 0;
-		else if (!strncmp(str, "off", 3))
+			disable_irq_post = 0;
+		} else if (!strncmp(str, "off", 3)) {
 			disable_irq_remap = 1;
-		else if (!strncmp(str, "nosid", 5))
+			disable_irq_post = 1;
+		} else if (!strncmp(str, "nosid", 5))
 			disable_sourceid_checking = 1;
 		else if (!strncmp(str, "no_x2apic_optout", 16))
 			no_x2apic_optout = 1;
+		else if (!strncmp(str, "nopost", 6))
+			disable_irq_post = 1;
 
 		str += strcspn(str, ",");
 		while (*str == ',')

+ 1 - 0
drivers/vfio/Kconfig

@@ -33,3 +33,4 @@ menuconfig VFIO
 
 source "drivers/vfio/pci/Kconfig"
 source "drivers/vfio/platform/Kconfig"
+source "virt/lib/Kconfig"

+ 1 - 0
drivers/vfio/pci/Kconfig

@@ -2,6 +2,7 @@ config VFIO_PCI
 	tristate "VFIO support for PCI devices"
 	depends on VFIO && PCI && EVENTFD
 	select VFIO_VIRQFD
+	select IRQ_BYPASS_MANAGER
 	help
 	  Support for the PCI VFIO bus driver.  This is required to make
 	  use of PCI drivers using the VFIO framework.

+ 9 - 0
drivers/vfio/pci/vfio_pci_intrs.c

@@ -319,6 +319,7 @@ static int vfio_msi_set_vector_signal(struct vfio_pci_device *vdev,
 
 	if (vdev->ctx[vector].trigger) {
 		free_irq(irq, vdev->ctx[vector].trigger);
+		irq_bypass_unregister_producer(&vdev->ctx[vector].producer);
 		kfree(vdev->ctx[vector].name);
 		eventfd_ctx_put(vdev->ctx[vector].trigger);
 		vdev->ctx[vector].trigger = NULL;
@@ -360,6 +361,14 @@ static int vfio_msi_set_vector_signal(struct vfio_pci_device *vdev,
 		return ret;
 	}
 
+	vdev->ctx[vector].producer.token = trigger;
+	vdev->ctx[vector].producer.irq = irq;
+	ret = irq_bypass_register_producer(&vdev->ctx[vector].producer);
+	if (unlikely(ret))
+		dev_info(&pdev->dev,
+		"irq bypass producer (token %p) registration fails: %d\n",
+		vdev->ctx[vector].producer.token, ret);
+
 	vdev->ctx[vector].trigger = trigger;
 
 	return 0;

+ 2 - 0
drivers/vfio/pci/vfio_pci_private.h

@@ -13,6 +13,7 @@
 
 #include <linux/mutex.h>
 #include <linux/pci.h>
+#include <linux/irqbypass.h>
 
 #ifndef VFIO_PCI_PRIVATE_H
 #define VFIO_PCI_PRIVATE_H
@@ -29,6 +30,7 @@ struct vfio_pci_irq_ctx {
 	struct virqfd		*mask;
 	char			*name;
 	bool			masked;
+	struct irq_bypass_producer	producer;
 };
 
 struct vfio_pci_device {

+ 3 - 1
include/kvm/arm_arch_timer.h

@@ -51,7 +51,7 @@ struct arch_timer_cpu {
 	bool				armed;
 
 	/* Timer IRQ */
-	const struct kvm_irq_level	*irq;
+	struct kvm_irq_level		irq;
 
 	/* VGIC mapping */
 	struct irq_phys_map		*map;
@@ -71,5 +71,7 @@ u64 kvm_arm_timer_get_reg(struct kvm_vcpu *, u64 regid);
 int kvm_arm_timer_set_reg(struct kvm_vcpu *, u64 regid, u64 value);
 
 bool kvm_timer_should_fire(struct kvm_vcpu *vcpu);
+void kvm_timer_schedule(struct kvm_vcpu *vcpu);
+void kvm_timer_unschedule(struct kvm_vcpu *vcpu);
 
 #endif

+ 3 - 13
include/kvm/arm_vgic.h

@@ -112,7 +112,6 @@ struct vgic_vmcr {
 struct vgic_ops {
 	struct vgic_lr	(*get_lr)(const struct kvm_vcpu *, int);
 	void	(*set_lr)(struct kvm_vcpu *, int, struct vgic_lr);
-	void	(*sync_lr_elrsr)(struct kvm_vcpu *, int, struct vgic_lr);
 	u64	(*get_elrsr)(const struct kvm_vcpu *vcpu);
 	u64	(*get_eisr)(const struct kvm_vcpu *vcpu);
 	void	(*clear_eisr)(struct kvm_vcpu *vcpu);
@@ -159,7 +158,6 @@ struct irq_phys_map {
 	u32			virt_irq;
 	u32			phys_irq;
 	u32			irq;
-	bool			active;
 };
 
 struct irq_phys_map_entry {
@@ -296,22 +294,16 @@ struct vgic_v3_cpu_if {
 };
 
 struct vgic_cpu {
-	/* per IRQ to LR mapping */
-	u8		*vgic_irq_lr_map;
-
 	/* Pending/active/both interrupts on this VCPU */
-	DECLARE_BITMAP(	pending_percpu, VGIC_NR_PRIVATE_IRQS);
-	DECLARE_BITMAP(	active_percpu, VGIC_NR_PRIVATE_IRQS);
-	DECLARE_BITMAP(	pend_act_percpu, VGIC_NR_PRIVATE_IRQS);
+	DECLARE_BITMAP(pending_percpu, VGIC_NR_PRIVATE_IRQS);
+	DECLARE_BITMAP(active_percpu, VGIC_NR_PRIVATE_IRQS);
+	DECLARE_BITMAP(pend_act_percpu, VGIC_NR_PRIVATE_IRQS);
 
 	/* Pending/active/both shared interrupts, dynamically sized */
 	unsigned long	*pending_shared;
 	unsigned long   *active_shared;
 	unsigned long   *pend_act_shared;
 
-	/* Bitmap of used/free list registers */
-	DECLARE_BITMAP(	lr_used, VGIC_V2_MAX_LRS);
-
 	/* Number of list registers on this CPU */
 	int		nr_lr;
 
@@ -354,8 +346,6 @@ int kvm_vgic_vcpu_active_irq(struct kvm_vcpu *vcpu);
 struct irq_phys_map *kvm_vgic_map_phys_irq(struct kvm_vcpu *vcpu,
 					   int virt_irq, int irq);
 int kvm_vgic_unmap_phys_irq(struct kvm_vcpu *vcpu, struct irq_phys_map *map);
-bool kvm_vgic_get_phys_irq_active(struct irq_phys_map *map);
-void kvm_vgic_set_phys_irq_active(struct irq_phys_map *map, bool active);
 
 #define irqchip_in_kernel(k)	(!!((k)->arch.vgic.in_kernel))
 #define vgic_initialized(k)	(!!((k)->arch.vgic.nr_cpus))

+ 1 - 0
include/linux/hyperv.h

@@ -26,6 +26,7 @@
 #define _HYPERV_H
 
 #include <uapi/linux/hyperv.h>
+#include <uapi/asm/hyperv.h>
 
 #include <linux/types.h>
 #include <linux/scatterlist.h>

+ 90 - 0
include/linux/irqbypass.h

@@ -0,0 +1,90 @@
+/*
+ * IRQ offload/bypass manager
+ *
+ * Copyright (C) 2015 Red Hat, Inc.
+ * Copyright (c) 2015 Linaro Ltd.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+#ifndef IRQBYPASS_H
+#define IRQBYPASS_H
+
+#include <linux/list.h>
+
+struct irq_bypass_consumer;
+
+/*
+ * Theory of operation
+ *
+ * The IRQ bypass manager is a simple set of lists and callbacks that allows
+ * IRQ producers (ex. physical interrupt sources) to be matched to IRQ
+ * consumers (ex. virtualization hardware that allows IRQ bypass or offload)
+ * via a shared token (ex. eventfd_ctx).  Producers and consumers register
+ * independently.  When a token match is found, the optional @stop callback
+ * will be called for each participant.  The pair will then be connected via
+ * the @add_* callbacks, and finally the optional @start callback will allow
+ * any final coordination.  When either participant is unregistered, the
+ * process is repeated using the @del_* callbacks in place of the @add_*
+ * callbacks.  Match tokens must be unique per producer/consumer, 1:N pairings
+ * are not supported.
+ */
+
+/**
+ * struct irq_bypass_producer - IRQ bypass producer definition
+ * @node: IRQ bypass manager private list management
+ * @token: opaque token to match between producer and consumer
+ * @irq: Linux IRQ number for the producer device
+ * @add_consumer: Connect the IRQ producer to an IRQ consumer (optional)
+ * @del_consumer: Disconnect the IRQ producer from an IRQ consumer (optional)
+ * @stop: Perform any quiesce operations necessary prior to add/del (optional)
+ * @start: Perform any startup operations necessary after add/del (optional)
+ *
+ * The IRQ bypass producer structure represents an interrupt source for
+ * participation in possible host bypass, for instance an interrupt vector
+ * for a physical device assigned to a VM.
+ */
+struct irq_bypass_producer {
+	struct list_head node;
+	void *token;
+	int irq;
+	int (*add_consumer)(struct irq_bypass_producer *,
+			    struct irq_bypass_consumer *);
+	void (*del_consumer)(struct irq_bypass_producer *,
+			     struct irq_bypass_consumer *);
+	void (*stop)(struct irq_bypass_producer *);
+	void (*start)(struct irq_bypass_producer *);
+};
+
+/**
+ * struct irq_bypass_consumer - IRQ bypass consumer definition
+ * @node: IRQ bypass manager private list management
+ * @token: opaque token to match between producer and consumer
+ * @add_producer: Connect the IRQ consumer to an IRQ producer
+ * @del_producer: Disconnect the IRQ consumer from an IRQ producer
+ * @stop: Perform any quiesce operations necessary prior to add/del (optional)
+ * @start: Perform any startup operations necessary after add/del (optional)
+ *
+ * The IRQ bypass consumer structure represents an interrupt sink for
+ * participation in possible host bypass, for instance a hypervisor may
+ * support offloads to allow bypassing the host entirely or offload
+ * portions of the interrupt handling to the VM.
+ */
+struct irq_bypass_consumer {
+	struct list_head node;
+	void *token;
+	int (*add_producer)(struct irq_bypass_consumer *,
+			    struct irq_bypass_producer *);
+	void (*del_producer)(struct irq_bypass_consumer *,
+			     struct irq_bypass_producer *);
+	void (*stop)(struct irq_bypass_consumer *);
+	void (*start)(struct irq_bypass_consumer *);
+};
+
+int irq_bypass_register_producer(struct irq_bypass_producer *);
+void irq_bypass_unregister_producer(struct irq_bypass_producer *);
+int irq_bypass_register_consumer(struct irq_bypass_consumer *);
+void irq_bypass_unregister_consumer(struct irq_bypass_consumer *);
+
+#endif /* IRQBYPASS_H */

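As an aside, here is a minimal sketch of how a driver and a hypervisor side might pair up through the interface declared above. Only the irq_bypass_* calls and the two structures come from the header; every other name (example_pair, shared_token, device_irq, the consumer's callbacks) is hypothetical and stands in for the VFIO/KVM specifics shown elsewhere in this series.

	/* Sketch only: pair a producer and a consumer via a shared token. */
	#include <linux/irqbypass.h>

	static int example_pair(void *shared_token, int device_irq,
				struct irq_bypass_consumer *cons)
	{
		static struct irq_bypass_producer prod;
		int ret;

		prod.token = shared_token;	/* e.g. the irqfd's eventfd_ctx */
		prod.irq = device_irq;		/* host Linux IRQ of the device vector */

		/* Registration order does not matter; the manager matches on token. */
		ret = irq_bypass_register_producer(&prod);
		if (ret)
			return ret;

		cons->token = shared_token;	/* must equal the producer's token */
		return irq_bypass_register_consumer(cons);
	}

On a token match the manager invokes the optional stop/start callbacks around add_producer/add_consumer, so the two sides never have to know about each other directly.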
+ 40 - 2
include/linux/kvm_host.h

@@ -24,6 +24,7 @@
 #include <linux/err.h>
 #include <linux/irqflags.h>
 #include <linux/context_tracking.h>
+#include <linux/irqbypass.h>
 #include <asm/signal.h>
 
 #include <linux/kvm.h>
@@ -140,6 +141,8 @@ static inline bool is_error_page(struct page *page)
 #define KVM_REQ_APIC_PAGE_RELOAD  25
 #define KVM_REQ_SMI               26
 #define KVM_REQ_HV_CRASH          27
+#define KVM_REQ_IOAPIC_EOI_EXIT   28
+#define KVM_REQ_HV_RESET          29
 
 #define KVM_USERSPACE_IRQ_SOURCE_ID		0
 #define KVM_IRQFD_RESAMPLE_IRQ_SOURCE_ID	1
@@ -231,6 +234,9 @@ struct kvm_vcpu {
 	unsigned long requests;
 	unsigned long guest_debug;
 
+	int pre_pcpu;
+	struct list_head blocked_vcpu_list;
+
 	struct mutex mutex;
 	struct kvm_run *run;
 
@@ -329,6 +335,18 @@ struct kvm_kernel_irq_routing_entry {
 	struct hlist_node link;
 };
 
+#ifdef CONFIG_HAVE_KVM_IRQ_ROUTING
+struct kvm_irq_routing_table {
+	int chip[KVM_NR_IRQCHIPS][KVM_IRQCHIP_NUM_PINS];
+	u32 nr_rt_entries;
+	/*
+	 * Array indexed by gsi. Each entry contains list of irq chips
+	 * the gsi is connected to.
+	 */
+	struct hlist_head map[0];
+};
+#endif
+
 #ifndef KVM_PRIVATE_MEM_SLOTS
 #define KVM_PRIVATE_MEM_SLOTS 0
 #endif
@@ -455,10 +473,14 @@ void vcpu_put(struct kvm_vcpu *vcpu);
 
 #ifdef __KVM_HAVE_IOAPIC
 void kvm_vcpu_request_scan_ioapic(struct kvm *kvm);
+void kvm_arch_irq_routing_update(struct kvm *kvm);
 #else
 static inline void kvm_vcpu_request_scan_ioapic(struct kvm *kvm)
 {
 }
+static inline void kvm_arch_irq_routing_update(struct kvm *kvm)
+{
+}
 #endif
 
 #ifdef CONFIG_HAVE_KVM_IRQFD
@@ -625,6 +647,8 @@ int kvm_vcpu_write_guest(struct kvm_vcpu *vcpu, gpa_t gpa, const void *data,
 void kvm_vcpu_mark_page_dirty(struct kvm_vcpu *vcpu, gfn_t gfn);
 
 void kvm_vcpu_block(struct kvm_vcpu *vcpu);
+void kvm_arch_vcpu_blocking(struct kvm_vcpu *vcpu);
+void kvm_arch_vcpu_unblocking(struct kvm_vcpu *vcpu);
 void kvm_vcpu_kick(struct kvm_vcpu *vcpu);
 int kvm_vcpu_yield_to(struct kvm_vcpu *target);
 void kvm_vcpu_on_spin(struct kvm_vcpu *vcpu);
@@ -803,10 +827,13 @@ int kvm_irq_map_chip_pin(struct kvm *kvm, unsigned irqchip, unsigned pin);
 
 int kvm_set_irq(struct kvm *kvm, int irq_source_id, u32 irq, int level,
 		bool line_status);
-int kvm_set_irq_inatomic(struct kvm *kvm, int irq_source_id, u32 irq, int level);
 int kvm_set_msi(struct kvm_kernel_irq_routing_entry *irq_entry, struct kvm *kvm,
 		int irq_source_id, int level, bool line_status);
+int kvm_arch_set_irq_inatomic(struct kvm_kernel_irq_routing_entry *e,
+			       struct kvm *kvm, int irq_source_id,
+			       int level, bool line_status);
 bool kvm_irq_has_notifier(struct kvm *kvm, unsigned irqchip, unsigned pin);
+void kvm_notify_acked_gsi(struct kvm *kvm, int gsi);
 void kvm_notify_acked_irq(struct kvm *kvm, unsigned irqchip, unsigned pin);
 void kvm_register_irq_ack_notifier(struct kvm *kvm,
 				   struct kvm_irq_ack_notifier *kian);
@@ -1002,6 +1029,7 @@ static inline int mmu_notifier_retry(struct kvm *kvm, unsigned long mmu_seq)
 #endif
 
 int kvm_setup_default_irq_routing(struct kvm *kvm);
+int kvm_setup_empty_irq_routing(struct kvm *kvm);
 int kvm_set_irq_routing(struct kvm *kvm,
 			const struct kvm_irq_routing_entry *entries,
 			unsigned nr,
@@ -1144,5 +1172,15 @@ static inline void kvm_vcpu_set_dy_eligible(struct kvm_vcpu *vcpu, bool val)
 {
 }
 #endif /* CONFIG_HAVE_KVM_CPU_RELAX_INTERCEPT */
-#endif
 
+#ifdef CONFIG_HAVE_KVM_IRQ_BYPASS
+int kvm_arch_irq_bypass_add_producer(struct irq_bypass_consumer *,
+			   struct irq_bypass_producer *);
+void kvm_arch_irq_bypass_del_producer(struct irq_bypass_consumer *,
+			   struct irq_bypass_producer *);
+void kvm_arch_irq_bypass_stop(struct irq_bypass_consumer *);
+void kvm_arch_irq_bypass_start(struct irq_bypass_consumer *);
+int kvm_arch_update_irqfd_routing(struct kvm *kvm, unsigned int host_irq,
+				  uint32_t guest_irq, bool set);
+#endif /* CONFIG_HAVE_KVM_IRQ_BYPASS */
+#endif

+ 71 - 0
include/linux/kvm_irqfd.h

@@ -0,0 +1,71 @@
+/*
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * irqfd: Allows an fd to be used to inject an interrupt to the guest
+ * Credit goes to Avi Kivity for the original idea.
+ */
+
+#ifndef __LINUX_KVM_IRQFD_H
+#define __LINUX_KVM_IRQFD_H
+
+#include <linux/kvm_host.h>
+#include <linux/poll.h>
+
+/*
+ * Resampling irqfds are a special variety of irqfds used to emulate
+ * level triggered interrupts.  The interrupt is asserted on eventfd
+ * trigger.  On acknowledgment through the irq ack notifier, the
+ * interrupt is de-asserted and userspace is notified through the
+ * resamplefd.  All resamplers on the same gsi are de-asserted
+ * together, so we don't need to track the state of each individual
+ * user.  We can also therefore share the same irq source ID.
+ */
+struct kvm_kernel_irqfd_resampler {
+	struct kvm *kvm;
+	/*
+	 * List of resampling struct _irqfd objects sharing this gsi.
+	 * RCU list modified under kvm->irqfds.resampler_lock
+	 */
+	struct list_head list;
+	struct kvm_irq_ack_notifier notifier;
+	/*
+	 * Entry in list of kvm->irqfd.resampler_list.  Use for sharing
+	 * resamplers among irqfds on the same gsi.
+	 * Accessed and modified under kvm->irqfds.resampler_lock
+	 */
+	struct list_head link;
+};
+
+struct kvm_kernel_irqfd {
+	/* Used for MSI fast-path */
+	struct kvm *kvm;
+	wait_queue_t wait;
+	/* Update side is protected by irqfds.lock */
+	struct kvm_kernel_irq_routing_entry irq_entry;
+	seqcount_t irq_entry_sc;
+	/* Used for level IRQ fast-path */
+	int gsi;
+	struct work_struct inject;
+	/* The resampler used by this irqfd (resampler-only) */
+	struct kvm_kernel_irqfd_resampler *resampler;
+	/* Eventfd notified on resample (resampler-only) */
+	struct eventfd_ctx *resamplefd;
+	/* Entry in list of irqfds for a resampler (resampler-only) */
+	struct list_head resampler_link;
+	/* Used for setup/shutdown */
+	struct eventfd_ctx *eventfd;
+	struct list_head list;
+	poll_table pt;
+	struct work_struct shutdown;
+	struct irq_bypass_consumer consumer;
+	struct irq_bypass_producer *producer;
+};
+
+#endif /* __LINUX_KVM_IRQFD_H */

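For context, a hedged sketch of the userspace side of the resampling irqfd described in the comment above. It assumes a VM file descriptor and a GSI already exist and uses only the long-standing KVM_IRQFD ioctl; error handling is omitted.

	/* Userspace sketch: level-triggered interrupt via a resampling irqfd. */
	#include <sys/eventfd.h>
	#include <sys/ioctl.h>
	#include <linux/kvm.h>

	static int setup_resample_irqfd(int vm_fd, unsigned int gsi)
	{
		struct kvm_irqfd irqfd = {
			.fd         = eventfd(0, 0),	/* asserts the GSI when written */
			.resamplefd = eventfd(0, 0),	/* notified when the guest EOIs */
			.gsi        = gsi,
			.flags      = KVM_IRQFD_FLAG_RESAMPLE,
		};

		return ioctl(vm_fd, KVM_IRQFD, &irqfd);
	}

The new producer/consumer fields at the end of struct kvm_kernel_irqfd are what let such an irqfd participate in the IRQ bypass pairing introduced earlier in this series.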
+ 7 - 0
include/uapi/linux/kvm.h

@@ -183,6 +183,7 @@ struct kvm_s390_skeys {
 #define KVM_EXIT_EPR              23
 #define KVM_EXIT_SYSTEM_EVENT     24
 #define KVM_EXIT_S390_STSI        25
+#define KVM_EXIT_IOAPIC_EOI       26
 
 /* For KVM_EXIT_INTERNAL_ERROR */
 /* Emulate instruction failed. */
@@ -333,6 +334,10 @@ struct kvm_run {
 			__u8 sel1;
 			__u16 sel2;
 		} s390_stsi;
+		/* KVM_EXIT_IOAPIC_EOI */
+		struct {
+			__u8 vector;
+		} eoi;
 		/* Fix the size of the union. */
 		char padding[256];
 	};
@@ -824,6 +829,8 @@ struct kvm_ppc_smmu_info {
 #define KVM_CAP_MULTI_ADDRESS_SPACE 118
 #define KVM_CAP_GUEST_DEBUG_HW_BPS 119
 #define KVM_CAP_GUEST_DEBUG_HW_WPS 120
+#define KVM_CAP_SPLIT_IRQCHIP 121
+#define KVM_CAP_IOEVENTFD_ANY_LENGTH 122
 
 #ifdef KVM_CAP_IRQ_ROUTING
 
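The two new capability numbers and the KVM_EXIT_IOAPIC_EOI exit are the uapi face of the "split irqchip" mentioned in the merge summary. A hedged sketch of how userspace might use them follows; the pin count of 24 mirrors a classic IOAPIC and userspace_ioapic_eoi() is a hypothetical helper, not part of the KVM API.

	/* Userspace sketch: opt into the split irqchip and handle EOI exits. */
	#include <sys/ioctl.h>
	#include <linux/kvm.h>

	extern void userspace_ioapic_eoi(unsigned char vector);	/* hypothetical */

	static int enable_split_irqchip(int vm_fd)
	{
		struct kvm_enable_cap cap = {
			.cap = KVM_CAP_SPLIT_IRQCHIP,
			.args = { 24 },	/* routes reserved for the userspace IOAPIC */
		};

		return ioctl(vm_fd, KVM_ENABLE_CAP, &cap);
	}

	static void handle_exit(struct kvm_run *run)
	{
		switch (run->exit_reason) {
		case KVM_EXIT_IOAPIC_EOI:
			/* Propagate the EOI for this vector to the emulated IOAPIC. */
			userspace_ioapic_eoi(run->eoi.vector);
			break;
		}
	}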

+ 2 - 0
kernel/sched/cputime.c

@@ -444,6 +444,7 @@ void task_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st)
 	*ut = p->utime;
 	*st = p->stime;
 }
+EXPORT_SYMBOL_GPL(task_cputime_adjusted);
 
 void thread_group_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st)
 {
@@ -652,6 +653,7 @@ void task_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st)
 	task_cputime(p, &cputime.utime, &cputime.stime);
 	cputime_adjust(&cputime, &p->prev_cputime, ut, st);
 }
+EXPORT_SYMBOL_GPL(task_cputime_adjusted);
 
 void thread_group_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st)
 {

+ 1 - 0
virt/Makefile

@@ -0,0 +1 @@
+obj-y	+= lib/

+ 4 - 1
virt/kvm/Kconfig

@@ -46,4 +46,7 @@ config KVM_GENERIC_DIRTYLOG_READ_PROTECT
 
 config KVM_COMPAT
        def_bool y
-       depends on COMPAT && !S390
+       depends on KVM && COMPAT && !S390
+
+config HAVE_KVM_IRQ_BYPASS
+       bool

+ 117 - 56
virt/kvm/arm/arch_timer.c

@@ -28,6 +28,8 @@
 #include <kvm/arm_vgic.h>
 #include <kvm/arm_arch_timer.h>
 
+#include "trace.h"
+
 static struct timecounter *timecounter;
 static struct workqueue_struct *wqueue;
 static unsigned int host_vtimer_irq;
@@ -59,18 +61,6 @@ static void timer_disarm(struct arch_timer_cpu *timer)
 	}
 }
 
-static void kvm_timer_inject_irq(struct kvm_vcpu *vcpu)
-{
-	int ret;
-	struct arch_timer_cpu *timer = &vcpu->arch.timer_cpu;
-
-	kvm_vgic_set_phys_irq_active(timer->map, true);
-	ret = kvm_vgic_inject_mapped_irq(vcpu->kvm, vcpu->vcpu_id,
-					 timer->map,
-					 timer->irq->level);
-	WARN_ON(ret);
-}
-
 static irqreturn_t kvm_arch_timer_handler(int irq, void *dev_id)
 {
 	struct kvm_vcpu *vcpu = *(struct kvm_vcpu **)dev_id;
@@ -111,14 +101,20 @@ static enum hrtimer_restart kvm_timer_expire(struct hrtimer *hrt)
 	return HRTIMER_NORESTART;
 }
 
+static bool kvm_timer_irq_can_fire(struct kvm_vcpu *vcpu)
+{
+	struct arch_timer_cpu *timer = &vcpu->arch.timer_cpu;
+
+	return !(timer->cntv_ctl & ARCH_TIMER_CTRL_IT_MASK) &&
+		(timer->cntv_ctl & ARCH_TIMER_CTRL_ENABLE);
+}
+
 bool kvm_timer_should_fire(struct kvm_vcpu *vcpu)
 {
 	struct arch_timer_cpu *timer = &vcpu->arch.timer_cpu;
 	cycle_t cval, now;
 
-	if ((timer->cntv_ctl & ARCH_TIMER_CTRL_IT_MASK) ||
-	    !(timer->cntv_ctl & ARCH_TIMER_CTRL_ENABLE) ||
-	    kvm_vgic_get_phys_irq_active(timer->map))
+	if (!kvm_timer_irq_can_fire(vcpu))
 		return false;
 
 	cval = timer->cntv_cval;
@@ -127,12 +123,94 @@ bool kvm_timer_should_fire(struct kvm_vcpu *vcpu)
 	return cval <= now;
 }
 
+static void kvm_timer_update_irq(struct kvm_vcpu *vcpu, bool new_level)
+{
+	int ret;
+	struct arch_timer_cpu *timer = &vcpu->arch.timer_cpu;
+
+	BUG_ON(!vgic_initialized(vcpu->kvm));
+
+	timer->irq.level = new_level;
+	trace_kvm_timer_update_irq(vcpu->vcpu_id, timer->map->virt_irq,
+				   timer->irq.level);
+	ret = kvm_vgic_inject_mapped_irq(vcpu->kvm, vcpu->vcpu_id,
+					 timer->map,
+					 timer->irq.level);
+	WARN_ON(ret);
+}
+
+/*
+ * Check if there was a change in the timer state (should we raise or lower
+ * the line level to the GIC).
+ */
+static void kvm_timer_update_state(struct kvm_vcpu *vcpu)
+{
+	struct arch_timer_cpu *timer = &vcpu->arch.timer_cpu;
+
+	/*
+	 * If userspace modified the timer registers via SET_ONE_REG before
+	 * the vgic was initialized, we mustn't set the timer->irq.level value
+	 * because the guest would never see the interrupt.  Instead wait
+	 * until we call this function from kvm_timer_flush_hwstate.
+	 */
+	if (!vgic_initialized(vcpu->kvm))
+	    return;
+
+	if (kvm_timer_should_fire(vcpu) != timer->irq.level)
+		kvm_timer_update_irq(vcpu, !timer->irq.level);
+}
+
+/*
+ * Schedule the background timer before calling kvm_vcpu_block, so that this
+ * thread is removed from its waitqueue and made runnable when there's a timer
+ * interrupt to handle.
+ */
+void kvm_timer_schedule(struct kvm_vcpu *vcpu)
+{
+	struct arch_timer_cpu *timer = &vcpu->arch.timer_cpu;
+	u64 ns;
+	cycle_t cval, now;
+
+	BUG_ON(timer_is_armed(timer));
+
+	/*
+	 * No need to schedule a background timer if the guest timer has
+	 * already expired, because kvm_vcpu_block will return before putting
+	 * the thread to sleep.
+	 */
+	if (kvm_timer_should_fire(vcpu))
+		return;
+
+	/*
+	 * If the timer is not capable of raising interrupts (disabled or
+	 * masked), then there's no more work for us to do.
+	 */
+	if (!kvm_timer_irq_can_fire(vcpu))
+		return;
+
+	/*  The timer has not yet expired, schedule a background timer */
+	cval = timer->cntv_cval;
+	now = kvm_phys_timer_read() - vcpu->kvm->arch.timer.cntvoff;
+
+	ns = cyclecounter_cyc2ns(timecounter->cc,
+				 cval - now,
+				 timecounter->mask,
+				 &timecounter->frac);
+	timer_arm(timer, ns);
+}
+
+void kvm_timer_unschedule(struct kvm_vcpu *vcpu)
+{
+	struct arch_timer_cpu *timer = &vcpu->arch.timer_cpu;
+	timer_disarm(timer);
+}
+
 /**
  * kvm_timer_flush_hwstate - prepare to move the virt timer to the cpu
  * @vcpu: The vcpu pointer
  *
- * Disarm any pending soft timers, since the world-switch code will write the
- * virtual timer state back to the physical CPU.
+ * Check if the virtual timer has expired while we were running in the host,
+ * and inject an interrupt if that was the case.
  */
 void kvm_timer_flush_hwstate(struct kvm_vcpu *vcpu)
 {
@@ -140,28 +218,20 @@ void kvm_timer_flush_hwstate(struct kvm_vcpu *vcpu)
 	bool phys_active;
 	int ret;
 
-	/*
-	 * We're about to run this vcpu again, so there is no need to
-	 * keep the background timer running, as we're about to
-	 * populate the CPU timer again.
-	 */
-	timer_disarm(timer);
+	kvm_timer_update_state(vcpu);
 
 	/*
-	 * If the timer expired while we were not scheduled, now is the time
-	 * to inject it.
+	 * If we enter the guest with the virtual input level to the VGIC
+	 * asserted, then we have already told the VGIC what we need to, and
+	 * we don't need to exit from the guest until the guest deactivates
+	 * the already injected interrupt, so therefore we should set the
+	 * hardware active state to prevent unnecessary exits from the guest.
+	 *
+	 * Conversely, if the virtual input level is deasserted, then always
+	 * clear the hardware active state to ensure that hardware interrupts
+	 * from the timer triggers a guest exit.
 	 */
-	if (kvm_timer_should_fire(vcpu))
-		kvm_timer_inject_irq(vcpu);
-
-	/*
-	 * We keep track of whether the edge-triggered interrupt has been
-	 * signalled to the vgic/guest, and if so, we mask the interrupt and
-	 * the physical distributor to prevent the timer from raising a
-	 * physical interrupt whenever we run a guest, preventing forward
-	 * VCPU progress.
-	 */
-	if (kvm_vgic_get_phys_irq_active(timer->map))
+	if (timer->irq.level)
 		phys_active = true;
 	else
 		phys_active = false;
@@ -176,32 +246,20 @@ void kvm_timer_flush_hwstate(struct kvm_vcpu *vcpu)
  * kvm_timer_sync_hwstate - sync timer state from cpu
  * @vcpu: The vcpu pointer
  *
- * Check if the virtual timer was armed and either schedule a corresponding
- * soft timer or inject directly if already expired.
+ * Check if the virtual timer has expired while we were running in the guest,
+ * and inject an interrupt if that was the case.
  */
 void kvm_timer_sync_hwstate(struct kvm_vcpu *vcpu)
 {
 	struct arch_timer_cpu *timer = &vcpu->arch.timer_cpu;
-	cycle_t cval, now;
-	u64 ns;
 
 
 	BUG_ON(timer_is_armed(timer));
 
-		/*
-		 * Timer has already expired while we were not
-		 * looking. Inject the interrupt and carry on.
-		 */
-		kvm_timer_inject_irq(vcpu);
-		return;
-	}
-
-	cval = timer->cntv_cval;
-	now = kvm_phys_timer_read() - vcpu->kvm->arch.timer.cntvoff;
-
-	ns = cyclecounter_cyc2ns(timecounter->cc, cval - now, timecounter->mask,
-				 &timecounter->frac);
-	timer_arm(timer, ns);
+	/*
+	 * The guest could have modified the timer registers or the timer
+	 * could have expired, update the timer state.
+	 */
+	kvm_timer_update_state(vcpu);
 }
 
 int kvm_timer_vcpu_reset(struct kvm_vcpu *vcpu,
@@ -216,7 +274,7 @@ int kvm_timer_vcpu_reset(struct kvm_vcpu *vcpu,
 	 * kvm_vcpu_set_target(). To handle this, we determine
 	 * vcpu timer irq number when the vcpu is reset.
 	 */
-	timer->irq = irq;
+	timer->irq.irq = irq->irq;
 
 	/*
 	 * The bits in CNTV_CTL are architecturally reset to UNKNOWN for ARMv8
@@ -225,6 +283,7 @@ int kvm_timer_vcpu_reset(struct kvm_vcpu *vcpu,
 	 * the ARMv7 architecture.
 	 */
 	timer->cntv_ctl = 0;
+	kvm_timer_update_state(vcpu);
 
 	/*
 	 * Tell the VGIC that the virtual interrupt is tied to a
@@ -269,6 +328,8 @@ int kvm_arm_timer_set_reg(struct kvm_vcpu *vcpu, u64 regid, u64 value)
 	default:
 		return -1;
 	}
+
+	kvm_timer_update_state(vcpu);
 	return 0;
 }
 
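The new kvm_timer_schedule()/kvm_timer_unschedule() pair is meant to bracket kvm_vcpu_block(), so a blocked VCPU is still woken when its virtual timer fires. A sketch of the intended call sites follows; the actual wiring lives in the arch/arm/kvm/arm.c changes listed in the merge summary and is not reproduced here.

	/* Sketch of the intended hook usage around kvm_vcpu_block(). */
	void kvm_arch_vcpu_blocking(struct kvm_vcpu *vcpu)
	{
		/* Arm a background hrtimer so a timer expiry wakes the blocked VCPU. */
		kvm_timer_schedule(vcpu);
	}

	void kvm_arch_vcpu_unblocking(struct kvm_vcpu *vcpu)
	{
		/* The world-switch code takes over again; drop the soft timer. */
		kvm_timer_unschedule(vcpu);
	}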

+ 63 - 0
virt/kvm/arm/trace.h

@@ -0,0 +1,63 @@
+#if !defined(_TRACE_KVM_H) || defined(TRACE_HEADER_MULTI_READ)
+#define _TRACE_KVM_H
+
+#include <linux/tracepoint.h>
+
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM kvm
+
+/*
+ * Tracepoints for vgic
+ */
+TRACE_EVENT(vgic_update_irq_pending,
+	TP_PROTO(unsigned long vcpu_id, __u32 irq, bool level),
+	TP_ARGS(vcpu_id, irq, level),
+
+	TP_STRUCT__entry(
+		__field(	unsigned long,	vcpu_id	)
+		__field(	__u32,		irq	)
+		__field(	bool,		level	)
+	),
+
+	TP_fast_assign(
+		__entry->vcpu_id	= vcpu_id;
+		__entry->irq		= irq;
+		__entry->level		= level;
+	),
+
+	TP_printk("VCPU: %ld, IRQ %d, level: %d",
+		  __entry->vcpu_id, __entry->irq, __entry->level)
+);
+
+/*
+ * Tracepoints for arch_timer
+ */
+TRACE_EVENT(kvm_timer_update_irq,
+	TP_PROTO(unsigned long vcpu_id, __u32 irq, int level),
+	TP_ARGS(vcpu_id, irq, level),
+
+	TP_STRUCT__entry(
+		__field(	unsigned long,	vcpu_id	)
+		__field(	__u32,		irq	)
+		__field(	int,		level	)
+	),
+
+	TP_fast_assign(
+		__entry->vcpu_id	= vcpu_id;
+		__entry->irq		= irq;
+		__entry->level		= level;
+	),
+
+	TP_printk("VCPU: %ld, IRQ %d, level %d",
+		  __entry->vcpu_id, __entry->irq, __entry->level)
+);
+
+#endif /* _TRACE_KVM_H */
+
+#undef TRACE_INCLUDE_PATH
+#define TRACE_INCLUDE_PATH ../../../virt/kvm/arm
+#undef TRACE_INCLUDE_FILE
+#define TRACE_INCLUDE_FILE trace
+
+/* This part must be outside protection */
+#include <trace/define_trace.h>

+ 1 - 5
virt/kvm/arm/vgic-v2.c

@@ -79,11 +79,7 @@ static void vgic_v2_set_lr(struct kvm_vcpu *vcpu, int lr,
 		lr_val |= (lr_desc.source << GICH_LR_PHYSID_CPUID_SHIFT);
 
 	vcpu->arch.vgic_cpu.vgic_v2.vgic_lr[lr] = lr_val;
-}
 
-static void vgic_v2_sync_lr_elrsr(struct kvm_vcpu *vcpu, int lr,
-				  struct vgic_lr lr_desc)
-{
 	if (!(lr_desc.state & LR_STATE_MASK))
 		vcpu->arch.vgic_cpu.vgic_v2.vgic_elrsr |= (1ULL << lr);
 	else
@@ -158,6 +154,7 @@ static void vgic_v2_enable(struct kvm_vcpu *vcpu)
 	 * anyway.
 	 */
 	vcpu->arch.vgic_cpu.vgic_v2.vgic_vmcr = 0;
+	vcpu->arch.vgic_cpu.vgic_v2.vgic_elrsr = ~0;
 
 	/* Get the show on the road... */
 	vcpu->arch.vgic_cpu.vgic_v2.vgic_hcr = GICH_HCR_EN;
@@ -166,7 +163,6 @@ static void vgic_v2_enable(struct kvm_vcpu *vcpu)
 static const struct vgic_ops vgic_v2_ops = {
 	.get_lr			= vgic_v2_get_lr,
 	.set_lr			= vgic_v2_set_lr,
-	.sync_lr_elrsr		= vgic_v2_sync_lr_elrsr,
 	.get_elrsr		= vgic_v2_get_elrsr,
 	.get_eisr		= vgic_v2_get_eisr,
 	.clear_eisr		= vgic_v2_clear_eisr,

+ 1 - 5
virt/kvm/arm/vgic-v3.c

@@ -112,11 +112,7 @@ static void vgic_v3_set_lr(struct kvm_vcpu *vcpu, int lr,
 	}
 
 	vcpu->arch.vgic_cpu.vgic_v3.vgic_lr[LR_INDEX(lr)] = lr_val;
-}
 
-static void vgic_v3_sync_lr_elrsr(struct kvm_vcpu *vcpu, int lr,
-				  struct vgic_lr lr_desc)
-{
 	if (!(lr_desc.state & LR_STATE_MASK))
 		vcpu->arch.vgic_cpu.vgic_v3.vgic_elrsr |= (1U << lr);
 	else
@@ -193,6 +189,7 @@ static void vgic_v3_enable(struct kvm_vcpu *vcpu)
 	 * anyway.
 	 */
 	vgic_v3->vgic_vmcr = 0;
+	vgic_v3->vgic_elrsr = ~0;
 
 	/*
 	 * If we are emulating a GICv3, we do it in an non-GICv2-compatible
@@ -211,7 +208,6 @@ static void vgic_v3_enable(struct kvm_vcpu *vcpu)
 static const struct vgic_ops vgic_v3_ops = {
 	.get_lr			= vgic_v3_get_lr,
 	.set_lr			= vgic_v3_set_lr,
-	.sync_lr_elrsr		= vgic_v3_sync_lr_elrsr,
 	.get_elrsr		= vgic_v3_get_elrsr,
 	.get_eisr		= vgic_v3_get_eisr,
 	.clear_eisr		= vgic_v3_clear_eisr,

+ 119 - 189
virt/kvm/arm/vgic.c

@@ -34,6 +34,9 @@
 #include <asm/kvm.h>
 #include <kvm/iodev.h>
 
+#define CREATE_TRACE_POINTS
+#include "trace.h"
+
 /*
  * How the whole thing works (courtesy of Christoffer Dall):
  *
@@ -102,11 +105,13 @@
 #include "vgic.h"
 
 static void vgic_retire_disabled_irqs(struct kvm_vcpu *vcpu);
-static void vgic_retire_lr(int lr_nr, int irq, struct kvm_vcpu *vcpu);
+static void vgic_retire_lr(int lr_nr, struct kvm_vcpu *vcpu);
 static struct vgic_lr vgic_get_lr(const struct kvm_vcpu *vcpu, int lr);
 static void vgic_set_lr(struct kvm_vcpu *vcpu, int lr, struct vgic_lr lr_desc);
+static u64 vgic_get_elrsr(struct kvm_vcpu *vcpu);
 static struct irq_phys_map *vgic_irq_map_search(struct kvm_vcpu *vcpu,
 						int virt_irq);
+static int compute_pending_for_cpu(struct kvm_vcpu *vcpu);
 
 static const struct vgic_ops *vgic_ops;
 static const struct vgic_params *vgic;
@@ -357,6 +362,11 @@ static void vgic_dist_irq_clear_soft_pend(struct kvm_vcpu *vcpu, int irq)
 	struct vgic_dist *dist = &vcpu->kvm->arch.vgic;
 
 	vgic_bitmap_set_irq_val(&dist->irq_soft_pend, vcpu->vcpu_id, irq, 0);
+	if (!vgic_dist_irq_get_level(vcpu, irq)) {
+		vgic_dist_irq_clear_pending(vcpu, irq);
+		if (!compute_pending_for_cpu(vcpu))
+			clear_bit(vcpu->vcpu_id, dist->irq_pending_on_cpu);
+	}
 }
 
 static int vgic_dist_irq_is_pending(struct kvm_vcpu *vcpu, int irq)
@@ -531,34 +541,6 @@ bool vgic_handle_set_pending_reg(struct kvm *kvm,
 	return false;
 }
 
-/*
- * If a mapped interrupt's state has been modified by the guest such that it
- * is no longer active or pending, without it have gone through the sync path,
- * then the map->active field must be cleared so the interrupt can be taken
- * again.
- */
-static void vgic_handle_clear_mapped_irq(struct kvm_vcpu *vcpu)
-{
-	struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu;
-	struct list_head *root;
-	struct irq_phys_map_entry *entry;
-	struct irq_phys_map *map;
-
-	rcu_read_lock();
-
-	/* Check for PPIs */
-	root = &vgic_cpu->irq_phys_map_list;
-	list_for_each_entry_rcu(entry, root, entry) {
-		map = &entry->map;
-
-		if (!vgic_dist_irq_is_pending(vcpu, map->virt_irq) &&
-		    !vgic_irq_is_active(vcpu, map->virt_irq))
-			map->active = false;
-	}
-
-	rcu_read_unlock();
-}
-
 bool vgic_handle_clear_pending_reg(struct kvm *kvm,
 				   struct kvm_exit_mmio *mmio,
 				   phys_addr_t offset, int vcpu_id)
@@ -589,7 +571,6 @@ bool vgic_handle_clear_pending_reg(struct kvm *kvm,
 					  vcpu_id, offset);
 		vgic_reg_access(mmio, reg, offset, mode);
 
-		vgic_handle_clear_mapped_irq(kvm_get_vcpu(kvm, vcpu_id));
 		vgic_update_state(kvm);
 		return true;
 	}
@@ -627,7 +608,6 @@ bool vgic_handle_clear_active_reg(struct kvm *kvm,
 			ACCESS_READ_VALUE | ACCESS_WRITE_CLEARBIT);
 
 	if (mmio->is_write) {
-		vgic_handle_clear_mapped_irq(kvm_get_vcpu(kvm, vcpu_id));
 		vgic_update_state(kvm);
 		return true;
 	}
@@ -684,10 +664,9 @@ bool vgic_handle_cfg_reg(u32 *reg, struct kvm_exit_mmio *mmio,
 	vgic_reg_access(mmio, &val, offset,
 			ACCESS_READ_VALUE | ACCESS_WRITE_VALUE);
 	if (mmio->is_write) {
-		if (offset < 8) {
-			*reg = ~0U; /* Force PPIs/SGIs to 1 */
+		/* Ignore writes to read-only SGI and PPI bits */
+		if (offset < 8)
 			return false;
-		}
 
 		val = vgic_cfg_compress(val);
 		if (offset & 4) {
@@ -713,9 +692,11 @@ bool vgic_handle_cfg_reg(u32 *reg, struct kvm_exit_mmio *mmio,
 void vgic_unqueue_irqs(struct kvm_vcpu *vcpu)
 {
 	struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu;
+	u64 elrsr = vgic_get_elrsr(vcpu);
+	unsigned long *elrsr_ptr = u64_to_bitmask(&elrsr);
 	int i;
 
-	for_each_set_bit(i, vgic_cpu->lr_used, vgic_cpu->nr_lr) {
+	for_each_clear_bit(i, elrsr_ptr, vgic_cpu->nr_lr) {
 		struct vgic_lr lr = vgic_get_lr(vcpu, i);
 
 		/*
@@ -736,30 +717,14 @@ void vgic_unqueue_irqs(struct kvm_vcpu *vcpu)
 		 * interrupt then move the active state to the
 		 * distributor tracking bit.
 		 */
-		if (lr.state & LR_STATE_ACTIVE) {
+		if (lr.state & LR_STATE_ACTIVE)
 			vgic_irq_set_active(vcpu, lr.irq);
-			lr.state &= ~LR_STATE_ACTIVE;
-		}
 
 		/*
 		 * Reestablish the pending state on the distributor and the
-		 * CPU interface.  It may have already been pending, but that
-		 * is fine, then we are only setting a few bits that were
-		 * already set.
+		 * CPU interface and mark the LR as free for other use.
 		 */
-		if (lr.state & LR_STATE_PENDING) {
-			vgic_dist_irq_set_pending(vcpu, lr.irq);
-			lr.state &= ~LR_STATE_PENDING;
-		}
-
-		vgic_set_lr(vcpu, i, lr);
-
-		/*
-		 * Mark the LR as free for other use.
-		 */
-		BUG_ON(lr.state & LR_STATE_MASK);
-		vgic_retire_lr(i, lr.irq, vcpu);
-		vgic_irq_clear_queued(vcpu, lr.irq);
+		vgic_retire_lr(i, vcpu);
 
 		/* Finally update the VGIC state. */
 		vgic_update_state(vcpu->kvm);
@@ -1067,12 +1032,6 @@ static void vgic_set_lr(struct kvm_vcpu *vcpu, int lr,
 	vgic_ops->set_lr(vcpu, lr, vlr);
 }
 
-static void vgic_sync_lr_elrsr(struct kvm_vcpu *vcpu, int lr,
-			       struct vgic_lr vlr)
-{
-	vgic_ops->sync_lr_elrsr(vcpu, lr, vlr);
-}
-
 static inline u64 vgic_get_elrsr(struct kvm_vcpu *vcpu)
 {
 	return vgic_ops->get_elrsr(vcpu);
@@ -1118,25 +1077,23 @@ static inline void vgic_enable(struct kvm_vcpu *vcpu)
 	vgic_ops->enable(vcpu);
 }
 
-static void vgic_retire_lr(int lr_nr, int irq, struct kvm_vcpu *vcpu)
+static void vgic_retire_lr(int lr_nr, struct kvm_vcpu *vcpu)
 {
-	struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu;
 	struct vgic_lr vlr = vgic_get_lr(vcpu, lr_nr);
 
+	vgic_irq_clear_queued(vcpu, vlr.irq);
+
 	/*
 	/*
 	 * We must transfer the pending state back to the distributor before
 	 * retiring the LR, otherwise we may loose edge-triggered interrupts.
 	 */
 	if (vlr.state & LR_STATE_PENDING) {
+		vgic_dist_irq_set_pending(vcpu, vlr.irq);
 		vlr.hwirq = 0;
 		vlr.hwirq = 0;
 	}
 	}
 
 	vlr.state = 0;
 	vgic_set_lr(vcpu, lr_nr, vlr);
-	vgic_cpu->vgic_irq_lr_map[irq] = LR_EMPTY;
-	vgic_sync_lr_elrsr(vcpu, lr_nr, vlr);
 }
 }
 
 /*
  */
  */
 static void vgic_retire_disabled_irqs(struct kvm_vcpu *vcpu)
 {
+	u64 elrsr = vgic_get_elrsr(vcpu);
+	unsigned long *elrsr_ptr = u64_to_bitmask(&elrsr);
 	int lr;
 	int lr;
 
+	for_each_clear_bit(lr, elrsr_ptr, vgic->nr_lr) {
 		struct vgic_lr vlr = vgic_get_lr(vcpu, lr);
 		struct vgic_lr vlr = vgic_get_lr(vcpu, lr);
 
-			vgic_retire_lr(lr, vlr.irq, vcpu);
-			if (vgic_irq_is_queued(vcpu, vlr.irq))
-				vgic_irq_clear_queued(vcpu, vlr.irq);
-		}
+		if (!vgic_irq_is_enabled(vcpu, vlr.irq))
+			vgic_retire_lr(lr, vcpu);
 	}
 }
 
@@ -1200,7 +1155,6 @@ static void vgic_queue_irq_to_lr(struct kvm_vcpu *vcpu, int irq,
 	}
 
 	vgic_set_lr(vcpu, lr_nr, vlr);
-	vgic_sync_lr_elrsr(vcpu, lr_nr, vlr);
 }
 
 /*
@@ -1210,8 +1164,9 @@ static void vgic_queue_irq_to_lr(struct kvm_vcpu *vcpu, int irq,
  */
 bool vgic_queue_irq(struct kvm_vcpu *vcpu, u8 sgi_source_id, int irq)
 {
-	struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu;
 	struct vgic_dist *dist = &vcpu->kvm->arch.vgic;
+	u64 elrsr = vgic_get_elrsr(vcpu);
+	unsigned long *elrsr_ptr = u64_to_bitmask(&elrsr);
 	struct vgic_lr vlr;
 	int lr;
 
@@ -1222,28 +1177,22 @@ bool vgic_queue_irq(struct kvm_vcpu *vcpu, u8 sgi_source_id, int irq)
 
 	kvm_debug("Queue IRQ%d\n", irq);
 
-	lr = vgic_cpu->vgic_irq_lr_map[irq];
-
 	/* Do we have an active interrupt for the same CPUID? */
-	if (lr != LR_EMPTY) {
+	for_each_clear_bit(lr, elrsr_ptr, vgic->nr_lr) {
 		vlr = vgic_get_lr(vcpu, lr);
-		if (vlr.source == sgi_source_id) {
+		if (vlr.irq == irq && vlr.source == sgi_source_id) {
 			kvm_debug("LR%d piggyback for IRQ%d\n", lr, vlr.irq);
-			BUG_ON(!test_bit(lr, vgic_cpu->lr_used));
 			vgic_queue_irq_to_lr(vcpu, irq, lr, vlr);
 			return true;
 		}
 	}
 
 	/* Try to use another LR for this interrupt */
-	lr = find_first_zero_bit((unsigned long *)vgic_cpu->lr_used,
-			       vgic->nr_lr);
+	lr = find_first_bit(elrsr_ptr, vgic->nr_lr);
 	if (lr >= vgic->nr_lr)
 		return false;
 
 	kvm_debug("LR%d allocated for IRQ%d %x\n", lr, irq, sgi_source_id);
-	vgic_cpu->vgic_irq_lr_map[irq] = lr;
-	set_bit(lr, vgic_cpu->lr_used);
 
 	vlr.irq = irq;
 	vlr.source = sgi_source_id;
@@ -1338,12 +1287,60 @@ epilog:
 	}
 }
 
+static int process_queued_irq(struct kvm_vcpu *vcpu,
+				   int lr, struct vgic_lr vlr)
+{
+	int pending = 0;
+
+	/*
+	 * If the IRQ was EOIed (called from vgic_process_maintenance) or it
+	 * went from active to non-active (called from vgic_sync_hwirq) it was
+	 * also ACKed and we therefore assume we can clear the soft pending
+	 * state (should it have been set) for this interrupt.
+	 *
+	 * Note: if the IRQ soft pending state was set after the IRQ was
+	 * acked, it actually shouldn't be cleared, but we have no way of
+	 * knowing that unless we start trapping ACKs when the soft-pending
+	 * state is set.
+	 */
+	vgic_dist_irq_clear_soft_pend(vcpu, vlr.irq);
+
+	/*
+	 * Tell the gic to start sampling this interrupt again.
+	 */
+	vgic_irq_clear_queued(vcpu, vlr.irq);
+
+	/* Any additional pending interrupt? */
+	if (vgic_irq_is_edge(vcpu, vlr.irq)) {
+		BUG_ON(!(vlr.state & LR_HW));
+		pending = vgic_dist_irq_is_pending(vcpu, vlr.irq);
+	} else {
+		if (vgic_dist_irq_get_level(vcpu, vlr.irq)) {
+			vgic_cpu_irq_set(vcpu, vlr.irq);
+			pending = 1;
+		} else {
+			vgic_dist_irq_clear_pending(vcpu, vlr.irq);
+			vgic_cpu_irq_clear(vcpu, vlr.irq);
+		}
+	}
+
+	/*
+	 * Despite being EOIed, the LR may not have
+	 * been marked as empty.
+	 */
+	vlr.state = 0;
+	vlr.hwirq = 0;
+	vgic_set_lr(vcpu, lr, vlr);
+
+	return pending;
+}
+
 static bool vgic_process_maintenance(struct kvm_vcpu *vcpu)
 {
 	u32 status = vgic_get_interrupt_status(vcpu);
 	struct vgic_dist *dist = &vcpu->kvm->arch.vgic;
-	bool level_pending = false;
 	struct kvm *kvm = vcpu->kvm;
+	int level_pending = 0;
 
 	kvm_debug("STATUS = %08x\n", status);
 
@@ -1358,54 +1355,22 @@ static bool vgic_process_maintenance(struct kvm_vcpu *vcpu)
 
 		for_each_set_bit(lr, eisr_ptr, vgic->nr_lr) {
 			struct vgic_lr vlr = vgic_get_lr(vcpu, lr);
-			WARN_ON(vgic_irq_is_edge(vcpu, vlr.irq));
 
-			spin_lock(&dist->lock);
-			vgic_irq_clear_queued(vcpu, vlr.irq);
+			WARN_ON(vgic_irq_is_edge(vcpu, vlr.irq));
 			WARN_ON(vlr.state & LR_STATE_MASK);
-			vlr.state = 0;
-			vgic_set_lr(vcpu, lr, vlr);
 
-			/*
-			 * If the IRQ was EOIed it was also ACKed and we we
-			 * therefore assume we can clear the soft pending
-			 * state (should it had been set) for this interrupt.
-			 *
-			 * Note: if the IRQ soft pending state was set after
-			 * the IRQ was acked, it actually shouldn't be
-			 * cleared, but we have no way of knowing that unless
-			 * we start trapping ACKs when the soft-pending state
-			 * is set.
-			 */
-			vgic_dist_irq_clear_soft_pend(vcpu, vlr.irq);
 
 			/*
 			 * kvm_notify_acked_irq calls kvm_set_irq()
-			 * to reset the IRQ level. Need to release the
-			 * lock for kvm_set_irq to grab it.
+			 * to reset the IRQ level, which grabs the dist->lock
+			 * so we call this before taking the dist->lock.
 			 */
-			spin_unlock(&dist->lock);
-
 			kvm_notify_acked_irq(kvm, 0,
 					     vlr.irq - VGIC_NR_PRIVATE_IRQS);
-			spin_lock(&dist->lock);
-
-			/* Any additional pending interrupt? */
-			if (vgic_dist_irq_get_level(vcpu, vlr.irq)) {
-				vgic_cpu_irq_set(vcpu, vlr.irq);
-				level_pending = true;
-			} else {
-				vgic_dist_irq_clear_pending(vcpu, vlr.irq);
-				vgic_cpu_irq_clear(vcpu, vlr.irq);
-			}
 
+			spin_lock(&dist->lock);
+			level_pending |= process_queued_irq(vcpu, lr, vlr);
 			spin_unlock(&dist->lock);
-
-			/*
-			 * Despite being EOIed, the LR may not have
-			 * been marked as empty.
-			 */
-			vgic_sync_lr_elrsr(vcpu, lr, vlr);
 		}
 	}
 
@@ -1426,35 +1391,40 @@ static bool vgic_process_maintenance(struct kvm_vcpu *vcpu)
 /*
  * Save the physical active state, and reset it to inactive.
  *
- * Return 1 if HW interrupt went from active to inactive, and 0 otherwise.
+ * Return true if there's a pending forwarded interrupt to queue.
  */
-static int vgic_sync_hwirq(struct kvm_vcpu *vcpu, struct vgic_lr vlr)
+static bool vgic_sync_hwirq(struct kvm_vcpu *vcpu, int lr, struct vgic_lr vlr)
 {
+	struct vgic_dist *dist = &vcpu->kvm->arch.vgic;
 	struct irq_phys_map *map;
+	bool phys_active;
+	bool level_pending;
 	int ret;
 	int ret;
 
 	if (!(vlr.state & LR_HW))
+		return false;
 
 
 	map = vgic_irq_map_search(vcpu, vlr.irq);
 	BUG_ON(!map);
 
 	ret = irq_get_irqchip_state(map->irq,
 				    IRQCHIP_STATE_ACTIVE,
+				    &phys_active);
 
 
 	WARN_ON(ret);
 
+	if (phys_active)
 		return 0;
 		return 0;
 
+	spin_lock(&dist->lock);
+	level_pending = process_queued_irq(vcpu, lr, vlr);
+	spin_unlock(&dist->lock);
+	return level_pending;
 }
 
 /* Sync back the VGIC state after a guest run */
 static void __kvm_vgic_sync_hwstate(struct kvm_vcpu *vcpu)
 {
-	struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu;
 	struct vgic_dist *dist = &vcpu->kvm->arch.vgic;
 	u64 elrsr;
 	unsigned long *elrsr_ptr;
@@ -1462,40 +1432,18 @@ static void __kvm_vgic_sync_hwstate(struct kvm_vcpu *vcpu)
 	bool level_pending;
 
 	level_pending = vgic_process_maintenance(vcpu);
-	elrsr = vgic_get_elrsr(vcpu);
-	elrsr_ptr = u64_to_bitmask(&elrsr);
 
 
 	/* Deal with HW interrupts, and clear mappings for empty LRs */
 	for (lr = 0; lr < vgic->nr_lr; lr++) {
-
-		if (!test_bit(lr, vgic_cpu->lr_used))
-			continue;
-
-		vlr = vgic_get_lr(vcpu, lr);
-		if (vgic_sync_hwirq(vcpu, vlr)) {
-			/*
-			 * So this is a HW interrupt that the guest
-			 * EOI-ed. Clean the LR state and allow the
-			 * interrupt to be sampled again.
-			 */
-			vlr.state = 0;
-			vlr.hwirq = 0;
-			vgic_set_lr(vcpu, lr, vlr);
-			vgic_irq_clear_queued(vcpu, vlr.irq);
-			set_bit(lr, elrsr_ptr);
-		}
-
-		if (!test_bit(lr, elrsr_ptr))
-			continue;
-
-		clear_bit(lr, vgic_cpu->lr_used);
+		struct vgic_lr vlr = vgic_get_lr(vcpu, lr);
 
+		level_pending |= vgic_sync_hwirq(vcpu, lr, vlr);
 		BUG_ON(vlr.irq >= dist->nr_irqs);
-		vgic_cpu->vgic_irq_lr_map[vlr.irq] = LR_EMPTY;
 	}
 
 	/* Check if we still have something up our sleeve... */
+	elrsr = vgic_get_elrsr(vcpu);
+	elrsr_ptr = u64_to_bitmask(&elrsr);
 	pending = find_first_zero_bit(elrsr_ptr, vgic->nr_lr);
 	if (level_pending || pending < vgic->nr_lr)
 		set_bit(vcpu->vcpu_id, dist->irq_pending_on_cpu);
@@ -1585,6 +1533,8 @@ static int vgic_update_irq_pending(struct kvm *kvm, int cpuid,
 	int enabled;
 	bool ret = true, can_inject = true;
 
+	trace_vgic_update_irq_pending(cpuid, irq_num, level);
+
 	if (irq_num >= min(kvm->arch.vgic.nr_irqs, 1020))
 		return -EINVAL;
 
@@ -1863,30 +1813,6 @@ static void vgic_free_phys_irq_map_rcu(struct rcu_head *rcu)
 	kfree(entry);
 }
 
-/**
- * kvm_vgic_get_phys_irq_active - Return the active state of a mapped IRQ
- *
- * Return the logical active state of a mapped interrupt. This doesn't
- * necessarily reflects the current HW state.
- */
-bool kvm_vgic_get_phys_irq_active(struct irq_phys_map *map)
-{
-	BUG_ON(!map);
-	return map->active;
-}
-
-/**
- * kvm_vgic_set_phys_irq_active - Set the active state of a mapped IRQ
- *
- * Set the logical active state of a mapped interrupt. This doesn't
- * immediately affects the HW state.
- */
-void kvm_vgic_set_phys_irq_active(struct irq_phys_map *map, bool active)
-{
-	BUG_ON(!map);
-	map->active = active;
-}
-
 /**
  * kvm_vgic_unmap_phys_irq - Remove a virtual to physical IRQ mapping
  * @vcpu: The VCPU pointer
@@ -1942,12 +1868,10 @@ void kvm_vgic_vcpu_destroy(struct kvm_vcpu *vcpu)
 	kfree(vgic_cpu->pending_shared);
 	kfree(vgic_cpu->active_shared);
 	kfree(vgic_cpu->pend_act_shared);
-	kfree(vgic_cpu->vgic_irq_lr_map);
 	vgic_destroy_irq_phys_map(vcpu->kvm, &vgic_cpu->irq_phys_map_list);
 	vgic_cpu->pending_shared = NULL;
 	vgic_cpu->active_shared = NULL;
 	vgic_cpu->pend_act_shared = NULL;
-	vgic_cpu->vgic_irq_lr_map = NULL;
 }
 
 static int vgic_vcpu_init_maps(struct kvm_vcpu *vcpu, int nr_irqs)
@@ -1958,18 +1882,14 @@ static int vgic_vcpu_init_maps(struct kvm_vcpu *vcpu, int nr_irqs)
 	vgic_cpu->pending_shared = kzalloc(sz, GFP_KERNEL);
 	vgic_cpu->active_shared = kzalloc(sz, GFP_KERNEL);
 	vgic_cpu->pend_act_shared = kzalloc(sz, GFP_KERNEL);
-	vgic_cpu->vgic_irq_lr_map = kmalloc(nr_irqs, GFP_KERNEL);
 
 	if (!vgic_cpu->pending_shared
 		|| !vgic_cpu->active_shared
-		|| !vgic_cpu->pend_act_shared
-		|| !vgic_cpu->vgic_irq_lr_map) {
+		|| !vgic_cpu->pend_act_shared) {
 		kvm_vgic_vcpu_destroy(vcpu);
 		return -ENOMEM;
 	}
 
-	memset(vgic_cpu->vgic_irq_lr_map, LR_EMPTY, nr_irqs);
-
 	/*
 	 * Store the number of LRs per vcpu, so we don't have to go
 	 * all the way to the distributor structure to find out. Only
@@ -2111,14 +2031,24 @@ int vgic_init(struct kvm *kvm)
 			break;
 		}
 
-		for (i = 0; i < dist->nr_irqs; i++) {
-			if (i < VGIC_NR_PPIS)
+		/*
+		 * Enable and configure all SGIs to be edge-triggered and
+		 * configure all PPIs as level-triggered.
+		 */
+		for (i = 0; i < VGIC_NR_PRIVATE_IRQS; i++) {
+			if (i < VGIC_NR_SGIS) {
+				/* SGIs */
 				vgic_bitmap_set_irq_val(&dist->irq_enabled,
 							vcpu->vcpu_id, i, 1);
-			if (i < VGIC_NR_PRIVATE_IRQS)
 				vgic_bitmap_set_irq_val(&dist->irq_cfg,
 							vcpu->vcpu_id, i,
 							VGIC_CFG_EDGE);
+			} else if (i < VGIC_NR_PRIVATE_IRQS) {
+				/* PPIs */
+				vgic_bitmap_set_irq_val(&dist->irq_cfg,
+							vcpu->vcpu_id, i,
+							VGIC_CFG_LEVEL);
+			}
 		}
 
 		vgic_enable(vcpu);
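
Editor's note: the trace_vgic_update_irq_pending() call added in this hunk relies on a tracepoint declared in the ARM KVM trace header, which is not shown in this diff. As a hedged sketch only, a TRACE_EVENT declaration of that shape typically looks like the following (field names, types and the format string are illustrative assumptions, and the usual TRACE_INCLUDE_PATH/TRACE_INCLUDE_FILE boilerplate is omitted):

#undef TRACE_SYSTEM
#define TRACE_SYSTEM kvm

#if !defined(_TRACE_VGIC_SKETCH_H) || defined(TRACE_HEADER_MULTI_READ)
#define _TRACE_VGIC_SKETCH_H

#include <linux/tracepoint.h>

TRACE_EVENT(vgic_update_irq_pending,
	/* assumed prototype: vcpu id, interrupt number, requested level */
	TP_PROTO(unsigned long vcpu_id, __u32 irq, bool level),
	TP_ARGS(vcpu_id, irq, level),

	TP_STRUCT__entry(
		__field(unsigned long,	vcpu_id)
		__field(__u32,		irq)
		__field(bool,		level)
	),

	TP_fast_assign(
		__entry->vcpu_id = vcpu_id;
		__entry->irq     = irq;
		__entry->level   = level;
	),

	TP_printk("VCPU: %lu, IRQ %u, level: %d",
		  __entry->vcpu_id, __entry->irq, __entry->level)
);

#endif /* _TRACE_VGIC_SKETCH_H */

/* this include must stay outside the multi-read guard */
#include <trace/define_trace.h>

Once compiled in, an event declared this way shows up under the kvm group in tracefs and can be enabled like any other tracepoint.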

+ 4 - 0
virt/kvm/async_pf.c

@@ -94,6 +94,10 @@ static void async_pf_execute(struct work_struct *work)
 
 
 	trace_kvm_async_pf_completed(addr, gva);
 
+	/*
+	 * This memory barrier pairs with prepare_to_wait's set_current_state()
+	 */
+	smp_mb();
 	if (waitqueue_active(&vcpu->wq))
 		wake_up_interruptible(&vcpu->wq);
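
Editor's note on the barrier just added: smp_mb() orders the stores that publish the completed page-fault work against the waitqueue_active() load, pairing with the full barrier implied by set_current_state() inside prepare_to_wait() on the vCPU side; without it the load could be reordered and a wakeup lost. A minimal user-space analogue of that store-then-check protocol, with C11 seq_cst atomics standing in for the kernel primitives (names are illustrative, not kernel code):

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

static atomic_bool waiter_queued;	/* analogue of "task is on vcpu->wq" */
static atomic_bool work_done;		/* analogue of the completed async #PF */

static void waiter_side(void)		/* analogue of kvm_vcpu_block() */
{
	/* publish "I am waiting" before checking the condition */
	atomic_store(&waiter_queued, true);
	if (!atomic_load(&work_done))
		puts("waiter: no work yet, would schedule() and sleep");
}

static void waker_side(void)		/* analogue of async_pf_execute() */
{
	/* publish the completion before checking for a waiter */
	atomic_store(&work_done, true);
	if (atomic_load(&waiter_queued))
		puts("waker: waiter present, would wake_up_interruptible()");
}

int main(void)
{
	waiter_side();
	waker_side();
	return 0;
}

Because both sides store before they load, at least one of them is guaranteed to observe the other, which is exactly the property the smp_mb()/set_current_state() pairing provides in the kernel.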
 
 

+ 97 - 93
virt/kvm/eventfd.c

@@ -23,6 +23,7 @@
 
 
 #include <linux/kvm_host.h>
 #include <linux/kvm.h>
+#include <linux/kvm_irqfd.h>
 #include <linux/workqueue.h>
 #include <linux/syscalls.h>
 #include <linux/wait.h>
@@ -34,73 +35,20 @@
 #include <linux/srcu.h>
 #include <linux/slab.h>
 #include <linux/seqlock.h>
+#include <linux/irqbypass.h>
 #include <trace/events/kvm.h>
 
 #include <kvm/iodev.h>
 
 #ifdef CONFIG_HAVE_KVM_IRQFD
-/*
- * --------------------------------------------------------------------
- * irqfd: Allows an fd to be used to inject an interrupt to the guest
- *
- * Credit goes to Avi Kivity for the original idea.
- * --------------------------------------------------------------------
- */
-
-/*
- * Resampling irqfds are a special variety of irqfds used to emulate
- * level triggered interrupts.  The interrupt is asserted on eventfd
- * trigger.  On acknowledgement through the irq ack notifier, the
- * interrupt is de-asserted and userspace is notified through the
- * resamplefd.  All resamplers on the same gsi are de-asserted
- * together, so we don't need to track the state of each individual
- * user.  We can also therefore share the same irq source ID.
- */
-struct _irqfd_resampler {
-	struct kvm *kvm;
-	/*
-	 * List of resampling struct _irqfd objects sharing this gsi.
-	 * RCU list modified under kvm->irqfds.resampler_lock
-	 */
-	struct list_head list;
-	struct kvm_irq_ack_notifier notifier;
-	/*
-	 * Entry in list of kvm->irqfd.resampler_list.  Use for sharing
-	 * resamplers among irqfds on the same gsi.
-	 * Accessed and modified under kvm->irqfds.resampler_lock
-	 */
-	struct list_head link;
-};
-
-struct _irqfd {
-	/* Used for MSI fast-path */
-	struct kvm *kvm;
-	wait_queue_t wait;
-	/* Update side is protected by irqfds.lock */
-	struct kvm_kernel_irq_routing_entry irq_entry;
-	seqcount_t irq_entry_sc;
-	/* Used for level IRQ fast-path */
-	int gsi;
-	struct work_struct inject;
-	/* The resampler used by this irqfd (resampler-only) */
-	struct _irqfd_resampler *resampler;
-	/* Eventfd notified on resample (resampler-only) */
-	struct eventfd_ctx *resamplefd;
-	/* Entry in list of irqfds for a resampler (resampler-only) */
-	struct list_head resampler_link;
-	/* Used for setup/shutdown */
-	struct eventfd_ctx *eventfd;
-	struct list_head list;
-	poll_table pt;
-	struct work_struct shutdown;
-};
 
 
 static struct workqueue_struct *irqfd_cleanup_wq;
 
 static void
 irqfd_inject(struct work_struct *work)
 {
-	struct _irqfd *irqfd = container_of(work, struct _irqfd, inject);
+	struct kvm_kernel_irqfd *irqfd =
+		container_of(work, struct kvm_kernel_irqfd, inject);
 	struct kvm *kvm = irqfd->kvm;
 
 	if (!irqfd->resampler) {
@@ -121,12 +69,13 @@ irqfd_inject(struct work_struct *work)
 static void
 irqfd_resampler_ack(struct kvm_irq_ack_notifier *kian)
 {
-	struct _irqfd_resampler *resampler;
+	struct kvm_kernel_irqfd_resampler *resampler;
 	struct kvm *kvm;
-	struct _irqfd *irqfd;
+	struct kvm_kernel_irqfd *irqfd;
 	int idx;
 
-	resampler = container_of(kian, struct _irqfd_resampler, notifier);
+	resampler = container_of(kian,
+			struct kvm_kernel_irqfd_resampler, notifier);
 	kvm = resampler->kvm;
 
 	kvm_set_irq(kvm, KVM_IRQFD_RESAMPLE_IRQ_SOURCE_ID,
@@ -141,9 +90,9 @@ irqfd_resampler_ack(struct kvm_irq_ack_notifier *kian)
 }
 
 static void
-irqfd_resampler_shutdown(struct _irqfd *irqfd)
+irqfd_resampler_shutdown(struct kvm_kernel_irqfd *irqfd)
 {
-	struct _irqfd_resampler *resampler = irqfd->resampler;
+	struct kvm_kernel_irqfd_resampler *resampler = irqfd->resampler;
 	struct kvm *kvm = resampler->kvm;
 
 	mutex_lock(&kvm->irqfds.resampler_lock);
@@ -168,7 +117,8 @@ irqfd_resampler_shutdown(struct _irqfd *irqfd)
 static void
 irqfd_shutdown(struct work_struct *work)
 {
-	struct _irqfd *irqfd = container_of(work, struct _irqfd, shutdown);
+	struct kvm_kernel_irqfd *irqfd =
+		container_of(work, struct kvm_kernel_irqfd, shutdown);
 	u64 cnt;
 
 	/*
@@ -191,6 +141,9 @@ irqfd_shutdown(struct work_struct *work)
 	/*
 	 * It is now safe to release the object's resources
 	 */
+#ifdef CONFIG_HAVE_KVM_IRQ_BYPASS
+	irq_bypass_unregister_consumer(&irqfd->consumer);
+#endif
 	eventfd_ctx_put(irqfd->eventfd);
 	kfree(irqfd);
 }
@@ -198,7 +151,7 @@ irqfd_shutdown(struct work_struct *work)
 
 
 /* assumes kvm->irqfds.lock is held */
 static bool
-irqfd_is_active(struct _irqfd *irqfd)
+irqfd_is_active(struct kvm_kernel_irqfd *irqfd)
 {
 	return list_empty(&irqfd->list) ? false : true;
 }
@@ -209,7 +162,7 @@ irqfd_is_active(struct _irqfd *irqfd)
  * assumes kvm->irqfds.lock is held
  */
 static void
-irqfd_deactivate(struct _irqfd *irqfd)
+irqfd_deactivate(struct kvm_kernel_irqfd *irqfd)
 {
 	BUG_ON(!irqfd_is_active(irqfd));
 
@@ -218,13 +171,23 @@ irqfd_deactivate(struct _irqfd *irqfd)
 	queue_work(irqfd_cleanup_wq, &irqfd->shutdown);
 }
 
+int __attribute__((weak)) kvm_arch_set_irq_inatomic(
+				struct kvm_kernel_irq_routing_entry *irq,
+				struct kvm *kvm, int irq_source_id,
+				int level,
+				bool line_status)
+{
+	return -EWOULDBLOCK;
+}
+
 /*
  * Called with wqh->lock held and interrupts disabled
  */
 static int
 irqfd_wakeup(wait_queue_t *wait, unsigned mode, int sync, void *key)
 {
-	struct _irqfd *irqfd = container_of(wait, struct _irqfd, wait);
+	struct kvm_kernel_irqfd *irqfd =
+		container_of(wait, struct kvm_kernel_irqfd, wait);
 	unsigned long flags = (unsigned long)key;
 	struct kvm_kernel_irq_routing_entry irq;
 	struct kvm *kvm = irqfd->kvm;
@@ -238,10 +201,9 @@ irqfd_wakeup(wait_queue_t *wait, unsigned mode, int sync, void *key)
 			irq = irqfd->irq_entry;
 		} while (read_seqcount_retry(&irqfd->irq_entry_sc, seq));
 		/* An event has been signaled, inject an interrupt */
-		if (irq.type == KVM_IRQ_ROUTING_MSI)
-			kvm_set_msi(&irq, kvm, KVM_USERSPACE_IRQ_SOURCE_ID, 1,
-					false);
-		else
+		if (kvm_arch_set_irq_inatomic(&irq, kvm,
+					      KVM_USERSPACE_IRQ_SOURCE_ID, 1,
+					      false) == -EWOULDBLOCK)
 			schedule_work(&irqfd->inject);
 		srcu_read_unlock(&kvm->irq_srcu, idx);
 	}
@@ -274,37 +236,54 @@ static void
 irqfd_ptable_queue_proc(struct file *file, wait_queue_head_t *wqh,
 			poll_table *pt)
 {
-	struct _irqfd *irqfd = container_of(pt, struct _irqfd, pt);
+	struct kvm_kernel_irqfd *irqfd =
+		container_of(pt, struct kvm_kernel_irqfd, pt);
 	add_wait_queue(wqh, &irqfd->wait);
 }
 
 /* Must be called under irqfds.lock */
-static void irqfd_update(struct kvm *kvm, struct _irqfd *irqfd)
+static void irqfd_update(struct kvm *kvm, struct kvm_kernel_irqfd *irqfd)
 {
 	struct kvm_kernel_irq_routing_entry *e;
 	struct kvm_kernel_irq_routing_entry entries[KVM_NR_IRQCHIPS];
-	int i, n_entries;
+	int n_entries;
 
 	n_entries = kvm_irq_map_gsi(kvm, entries, irqfd->gsi);
 
 	write_seqcount_begin(&irqfd->irq_entry_sc);
 
-	irqfd->irq_entry.type = 0;
-
 	e = entries;
-	for (i = 0; i < n_entries; ++i, ++e) {
-		/* Only fast-path MSI. */
-		if (e->type == KVM_IRQ_ROUTING_MSI)
-			irqfd->irq_entry = *e;
-	}
+	if (n_entries == 1)
+		irqfd->irq_entry = *e;
+	else
+		irqfd->irq_entry.type = 0;
 
 
 	write_seqcount_end(&irqfd->irq_entry_sc);
 }
 
+#ifdef CONFIG_HAVE_KVM_IRQ_BYPASS
+void __attribute__((weak)) kvm_arch_irq_bypass_stop(
+				struct irq_bypass_consumer *cons)
+{
+}
+
+void __attribute__((weak)) kvm_arch_irq_bypass_start(
+				struct irq_bypass_consumer *cons)
+{
+}
+
+int  __attribute__((weak)) kvm_arch_update_irqfd_routing(
+				struct kvm *kvm, unsigned int host_irq,
+				uint32_t guest_irq, bool set)
+{
+	return 0;
+}
+#endif
+
 static int
 kvm_irqfd_assign(struct kvm *kvm, struct kvm_irqfd *args)
 {
-	struct _irqfd *irqfd, *tmp;
+	struct kvm_kernel_irqfd *irqfd, *tmp;
 	struct fd f;
 	struct eventfd_ctx *eventfd = NULL, *resamplefd = NULL;
 	int ret;
@@ -340,7 +319,7 @@ kvm_irqfd_assign(struct kvm *kvm, struct kvm_irqfd *args)
 	irqfd->eventfd = eventfd;
 
 	if (args->flags & KVM_IRQFD_FLAG_RESAMPLE) {
-		struct _irqfd_resampler *resampler;
+		struct kvm_kernel_irqfd_resampler *resampler;
 
 		resamplefd = eventfd_ctx_fdget(args->resamplefd);
 		if (IS_ERR(resamplefd)) {
@@ -428,6 +407,17 @@ kvm_irqfd_assign(struct kvm *kvm, struct kvm_irqfd *args)
 	 * we might race against the POLLHUP
 	 */
 	fdput(f);
+#ifdef CONFIG_HAVE_KVM_IRQ_BYPASS
+	irqfd->consumer.token = (void *)irqfd->eventfd;
+	irqfd->consumer.add_producer = kvm_arch_irq_bypass_add_producer;
+	irqfd->consumer.del_producer = kvm_arch_irq_bypass_del_producer;
+	irqfd->consumer.stop = kvm_arch_irq_bypass_stop;
+	irqfd->consumer.start = kvm_arch_irq_bypass_start;
+	ret = irq_bypass_register_consumer(&irqfd->consumer);
+	if (ret)
+		pr_info("irq bypass consumer (token %p) registration fails: %d\n",
+				irqfd->consumer.token, ret);
+#endif
 
 
 	return 0;
 
@@ -469,9 +459,18 @@ bool kvm_irq_has_notifier(struct kvm *kvm, unsigned irqchip, unsigned pin)
 }
 EXPORT_SYMBOL_GPL(kvm_irq_has_notifier);
 
-void kvm_notify_acked_irq(struct kvm *kvm, unsigned irqchip, unsigned pin)
+void kvm_notify_acked_gsi(struct kvm *kvm, int gsi)
 {
 	struct kvm_irq_ack_notifier *kian;
+
+	hlist_for_each_entry_rcu(kian, &kvm->irq_ack_notifier_list,
+				 link)
+		if (kian->gsi == gsi)
+			kian->irq_acked(kian);
+}
+
+void kvm_notify_acked_irq(struct kvm *kvm, unsigned irqchip, unsigned pin)
+{
 	int gsi, idx;
 
 	trace_kvm_ack_irq(irqchip, pin);
@@ -479,10 +478,7 @@ void kvm_notify_acked_irq(struct kvm *kvm, unsigned irqchip, unsigned pin)
 	idx = srcu_read_lock(&kvm->irq_srcu);
 	gsi = kvm_irq_map_chip_pin(kvm, irqchip, pin);
 	if (gsi != -1)
-		hlist_for_each_entry_rcu(kian, &kvm->irq_ack_notifier_list,
-					 link)
-			if (kian->gsi == gsi)
-				kian->irq_acked(kian);
+		kvm_notify_acked_gsi(kvm, gsi);
 	srcu_read_unlock(&kvm->irq_srcu, idx);
 }
 
@@ -525,7 +521,7 @@ kvm_eventfd_init(struct kvm *kvm)
 static int
 kvm_irqfd_deassign(struct kvm *kvm, struct kvm_irqfd *args)
 {
-	struct _irqfd *irqfd, *tmp;
+	struct kvm_kernel_irqfd *irqfd, *tmp;
 	struct eventfd_ctx *eventfd;
 
 	eventfd = eventfd_ctx_fdget(args->fd);
@@ -581,7 +577,7 @@ kvm_irqfd(struct kvm *kvm, struct kvm_irqfd *args)
 void
 kvm_irqfd_release(struct kvm *kvm)
 {
-	struct _irqfd *irqfd, *tmp;
+	struct kvm_kernel_irqfd *irqfd, *tmp;
 
 	spin_lock_irq(&kvm->irqfds.lock);
 
@@ -604,13 +600,23 @@ kvm_irqfd_release(struct kvm *kvm)
  */
 void kvm_irq_routing_update(struct kvm *kvm)
 {
-	struct _irqfd *irqfd;
+	struct kvm_kernel_irqfd *irqfd;
 
 	spin_lock_irq(&kvm->irqfds.lock);
 
-	list_for_each_entry(irqfd, &kvm->irqfds.items, list)
+	list_for_each_entry(irqfd, &kvm->irqfds.items, list) {
 		irqfd_update(kvm, irqfd);
 
+#ifdef CONFIG_HAVE_KVM_IRQ_BYPASS
+		if (irqfd->producer) {
+			int ret = kvm_arch_update_irqfd_routing(
+					irqfd->kvm, irqfd->producer->irq,
+					irqfd->gsi, 1);
+			WARN_ON(ret);
+		}
+#endif
+	}
+
 	spin_unlock_irq(&kvm->irqfds.lock);
 }
 
@@ -914,9 +920,7 @@ kvm_assign_ioeventfd(struct kvm *kvm, struct kvm_ioeventfd *args)
 		return -EINVAL;
 
 	/* ioeventfd with no length can't be combined with DATAMATCH */
-	if (!args->len &&
-	    args->flags & (KVM_IOEVENTFD_FLAG_PIO |
-			   KVM_IOEVENTFD_FLAG_DATAMATCH))
+	if (!args->len && (args->flags & KVM_IOEVENTFD_FLAG_DATAMATCH))
 		return -EINVAL;
 
 	ret = kvm_assign_ioeventfd_idx(kvm, bus_idx, args);
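
Editor's note: kvm_arch_set_irq_inatomic() above is defined as a weak symbol so that an architecture can provide a strong override that handles the injection without sleeping, while the generic stub simply refuses with -EWOULDBLOCK and irqfd_wakeup() falls back to the workqueue. A stand-alone sketch of that pattern (function names are invented for illustration; only the weak attribute and the -EWOULDBLOCK convention mirror the code above):

#include <errno.h>
#include <stdio.h>

/* generic code: weak default; a strong definition elsewhere overrides it */
int __attribute__((weak)) arch_try_fast_inject(int irq)
{
	return -EWOULDBLOCK;	/* "cannot do this atomically, please defer" */
}

static void deferred_inject(int irq)
{
	printf("deferring injection of irq %d to a work item\n", irq);
}

int main(void)
{
	int irq = 5;

	/* mirrors irqfd_wakeup(): try the atomic fast path, else defer */
	if (arch_try_fast_inject(irq) == -EWOULDBLOCK)
		deferred_inject(irq);
	return 0;
}

Linking another object file that defines a non-weak arch_try_fast_inject() silently replaces the stub, which is how the per-architecture implementations slot in.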

+ 5 - 13
virt/kvm/irqchip.c

@@ -31,16 +31,6 @@
 #include <trace/events/kvm.h>
 #include "irq.h"
 
-struct kvm_irq_routing_table {
-	int chip[KVM_NR_IRQCHIPS][KVM_IRQCHIP_NUM_PINS];
-	u32 nr_rt_entries;
-	/*
-	 * Array indexed by gsi. Each entry contains list of irq chips
-	 * the gsi is connected to.
-	 */
-	struct hlist_head map[0];
-};
-
 int kvm_irq_map_gsi(struct kvm *kvm,
 		    struct kvm_kernel_irq_routing_entry *entries, int gsi)
 {
@@ -154,11 +144,11 @@ static int setup_routing_entry(struct kvm_irq_routing_table *rt,
 
 
 	/*
 	 * Do not allow GSI to be mapped to the same irqchip more than once.
-	 * Allow only one to one mapping between GSI and MSI.
+	 * Allow only one to one mapping between GSI and non-irqchip routing.
 	 */
 	hlist_for_each_entry(ei, &rt->map[ue->gsi], link)
-		if (ei->type == KVM_IRQ_ROUTING_MSI ||
-		    ue->type == KVM_IRQ_ROUTING_MSI ||
+		if (ei->type != KVM_IRQ_ROUTING_IRQCHIP ||
+		    ue->type != KVM_IRQ_ROUTING_IRQCHIP ||
 		    ue->u.irqchip.irqchip == ei->irqchip.irqchip)
 			return r;
 
@@ -231,6 +221,8 @@ int kvm_set_irq_routing(struct kvm *kvm,
 	kvm_irq_routing_update(kvm);
 	mutex_unlock(&kvm->irq_lock);
 
+	kvm_arch_irq_routing_update(kvm);
+
 	synchronize_srcu_expedited(&kvm->irq_srcu);
 
 	new = old;
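
Editor's note: the table that setup_routing_entry() validates is handed to the kernel by userspace through the KVM_SET_GSI_ROUTING ioctl, and the relaxed check above still permits only a single non-irqchip entry per GSI. A hedged userspace sketch of installing one MSI route (vm_fd and the helper are assumptions for illustration; note that the ioctl replaces the entire routing table, so a real VMM passes all of its routes in one call):

#include <linux/kvm.h>
#include <stdlib.h>
#include <sys/ioctl.h>

static int route_gsi_to_msi(int vm_fd, __u32 gsi,
			    __u32 addr_lo, __u32 addr_hi, __u32 data)
{
	struct kvm_irq_routing *table;
	int ret;

	/* room for exactly one entry: this GSI maps to one MSI route */
	table = calloc(1, sizeof(*table) + sizeof(table->entries[0]));
	if (!table)
		return -1;

	table->nr = 1;
	table->entries[0].gsi = gsi;
	table->entries[0].type = KVM_IRQ_ROUTING_MSI;
	table->entries[0].u.msi.address_lo = addr_lo;
	table->entries[0].u.msi.address_hi = addr_hi;
	table->entries[0].u.msi.data = data;

	ret = ioctl(vm_fd, KVM_SET_GSI_ROUTING, table);
	free(table);
	return ret;
}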

+ 9 - 2
virt/kvm/kvm_main.c

@@ -230,6 +230,9 @@ int kvm_vcpu_init(struct kvm_vcpu *vcpu, struct kvm *kvm, unsigned id)
 	init_waitqueue_head(&vcpu->wq);
 	kvm_async_pf_vcpu_init(vcpu);
 
+	vcpu->pre_pcpu = -1;
+	INIT_LIST_HEAD(&vcpu->blocked_vcpu_list);
+
 	page = alloc_page(GFP_KERNEL | __GFP_ZERO);
 	if (!page) {
 		r = -ENOMEM;
@@ -2018,6 +2021,8 @@ void kvm_vcpu_block(struct kvm_vcpu *vcpu)
 		} while (single_task_running() && ktime_before(cur, stop));
 	}
 
+	kvm_arch_vcpu_blocking(vcpu);
+
 	for (;;) {
 		prepare_to_wait(&vcpu->wq, &wait, TASK_INTERRUPTIBLE);
 
@@ -2031,6 +2036,7 @@ void kvm_vcpu_block(struct kvm_vcpu *vcpu)
 	finish_wait(&vcpu->wq, &wait);
 	cur = ktime_get();
 
+	kvm_arch_vcpu_unblocking(vcpu);
 out:
 	block_ns = ktime_to_ns(cur) - ktime_to_ns(start);
 
@@ -2718,6 +2724,7 @@ static long kvm_vm_ioctl_check_extension_generic(struct kvm *kvm, long arg)
 	case KVM_CAP_IRQFD:
 	case KVM_CAP_IRQFD_RESAMPLE:
 #endif
+	case KVM_CAP_IOEVENTFD_ANY_LENGTH:
 	case KVM_CAP_CHECK_EXTENSION_VM:
 		return 1;
 #ifdef CONFIG_HAVE_KVM_IRQ_ROUTING
@@ -3341,7 +3348,7 @@ int kvm_io_bus_register_dev(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr,
 	if (bus->dev_count - bus->ioeventfd_count > NR_IOBUS_DEVS - 1)
 		return -ENOSPC;
 
-	new_bus = kzalloc(sizeof(*bus) + ((bus->dev_count + 1) *
+	new_bus = kmalloc(sizeof(*bus) + ((bus->dev_count + 1) *
 			  sizeof(struct kvm_io_range)), GFP_KERNEL);
 	if (!new_bus)
 		return -ENOMEM;
@@ -3373,7 +3380,7 @@ int kvm_io_bus_unregister_dev(struct kvm *kvm, enum kvm_bus bus_idx,
 	if (r)
 		return r;
 
-	new_bus = kzalloc(sizeof(*bus) + ((bus->dev_count - 1) *
+	new_bus = kmalloc(sizeof(*bus) + ((bus->dev_count - 1) *
 			  sizeof(struct kvm_io_range)), GFP_KERNEL);
 	if (!new_bus)
 		return -ENOMEM;
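
Editor's note: KVM_CAP_IOEVENTFD_ANY_LENGTH, advertised above, goes together with the eventfd.c change earlier in this diff that accepts a zero-length ioeventfd, meaning "match any access width at this address" as long as DATAMATCH is not requested. A hedged userspace sketch of probing and using it (vm_fd and the MMIO address are assumptions; KVM_CHECK_EXTENSION can also be issued on the /dev/kvm fd):

#include <linux/kvm.h>
#include <string.h>
#include <sys/eventfd.h>
#include <sys/ioctl.h>

static int wire_any_length_ioeventfd(int vm_fd, __u64 mmio_addr)
{
	struct kvm_ioeventfd args;
	int efd;

	if (ioctl(vm_fd, KVM_CHECK_EXTENSION, KVM_CAP_IOEVENTFD_ANY_LENGTH) <= 0)
		return -1;	/* kernel predates zero-length ioeventfds */

	efd = eventfd(0, 0);
	if (efd < 0)
		return -1;

	memset(&args, 0, sizeof(args));
	args.addr  = mmio_addr;
	args.len   = 0;		/* any length; must not be combined with DATAMATCH */
	args.fd    = efd;
	args.flags = 0;		/* MMIO; no KVM_IOEVENTFD_FLAG_PIO */

	return ioctl(vm_fd, KVM_IOEVENTFD, &args);
}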

+ 2 - 0
virt/lib/Kconfig

@@ -0,0 +1,2 @@
+config IRQ_BYPASS_MANAGER
+	tristate

+ 1 - 0
virt/lib/Makefile

@@ -0,0 +1 @@
+obj-$(CONFIG_IRQ_BYPASS_MANAGER) += irqbypass.o

+ 257 - 0
virt/lib/irqbypass.c

@@ -0,0 +1,257 @@
+/*
+ * IRQ offload/bypass manager
+ *
+ * Copyright (C) 2015 Red Hat, Inc.
+ * Copyright (c) 2015 Linaro Ltd.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * Various virtualization hardware acceleration techniques allow bypassing or
+ * offloading interrupts received from devices around the host kernel.  Posted
+ * Interrupts on Intel VT-d systems can allow interrupts to be received
+ * directly by a virtual machine.  ARM IRQ Forwarding allows forwarded physical
+ * interrupts to be directly deactivated by the guest.  This manager allows
+ * interrupt producers and consumers to find each other to enable this sort of
+ * bypass.
+ */
+
+#include <linux/irqbypass.h>
+#include <linux/list.h>
+#include <linux/module.h>
+#include <linux/mutex.h>
+
+MODULE_LICENSE("GPL v2");
+MODULE_DESCRIPTION("IRQ bypass manager utility module");
+
+static LIST_HEAD(producers);
+static LIST_HEAD(consumers);
+static DEFINE_MUTEX(lock);
+
+/* @lock must be held when calling connect */
+static int __connect(struct irq_bypass_producer *prod,
+		     struct irq_bypass_consumer *cons)
+{
+	int ret = 0;
+
+	if (prod->stop)
+		prod->stop(prod);
+	if (cons->stop)
+		cons->stop(cons);
+
+	if (prod->add_consumer)
+		ret = prod->add_consumer(prod, cons);
+
+	if (!ret) {
+		ret = cons->add_producer(cons, prod);
+		if (ret && prod->del_consumer)
+			prod->del_consumer(prod, cons);
+	}
+
+	if (cons->start)
+		cons->start(cons);
+	if (prod->start)
+		prod->start(prod);
+
+	return ret;
+}
+
+/* @lock must be held when calling disconnect */
+static void __disconnect(struct irq_bypass_producer *prod,
+			 struct irq_bypass_consumer *cons)
+{
+	if (prod->stop)
+		prod->stop(prod);
+	if (cons->stop)
+		cons->stop(cons);
+
+	cons->del_producer(cons, prod);
+
+	if (prod->del_consumer)
+		prod->del_consumer(prod, cons);
+
+	if (cons->start)
+		cons->start(cons);
+	if (prod->start)
+		prod->start(prod);
+}
+
+/**
+ * irq_bypass_register_producer - register IRQ bypass producer
+ * @producer: pointer to producer structure
+ *
+ * Add the provided IRQ producer to the list of producers and connect
+ * with any matching token found on the IRQ consumers list.
+ */
+int irq_bypass_register_producer(struct irq_bypass_producer *producer)
+{
+	struct irq_bypass_producer *tmp;
+	struct irq_bypass_consumer *consumer;
+
+	might_sleep();
+
+	if (!try_module_get(THIS_MODULE))
+		return -ENODEV;
+
+	mutex_lock(&lock);
+
+	list_for_each_entry(tmp, &producers, node) {
+		if (tmp->token == producer->token) {
+			mutex_unlock(&lock);
+			module_put(THIS_MODULE);
+			return -EBUSY;
+		}
+	}
+
+	list_for_each_entry(consumer, &consumers, node) {
+		if (consumer->token == producer->token) {
+			int ret = __connect(producer, consumer);
+			if (ret) {
+				mutex_unlock(&lock);
+				module_put(THIS_MODULE);
+				return ret;
+			}
+			break;
+		}
+	}
+
+	list_add(&producer->node, &producers);
+
+	mutex_unlock(&lock);
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(irq_bypass_register_producer);
+
+/**
+ * irq_bypass_unregister_producer - unregister IRQ bypass producer
+ * @producer: pointer to producer structure
+ *
+ * Remove a previously registered IRQ producer from the list of producers
+ * and disconnect it from any connected IRQ consumer.
+ */
+void irq_bypass_unregister_producer(struct irq_bypass_producer *producer)
+{
+	struct irq_bypass_producer *tmp;
+	struct irq_bypass_consumer *consumer;
+
+	might_sleep();
+
+	if (!try_module_get(THIS_MODULE))
+		return; /* nothing in the list anyway */
+
+	mutex_lock(&lock);
+
+	list_for_each_entry(tmp, &producers, node) {
+		if (tmp->token != producer->token)
+			continue;
+
+		list_for_each_entry(consumer, &consumers, node) {
+			if (consumer->token == producer->token) {
+				__disconnect(producer, consumer);
+				break;
+			}
+		}
+
+		list_del(&producer->node);
+		module_put(THIS_MODULE);
+		break;
+	}
+
+	mutex_unlock(&lock);
+
+	module_put(THIS_MODULE);
+}
+EXPORT_SYMBOL_GPL(irq_bypass_unregister_producer);
+
+/**
+ * irq_bypass_register_consumer - register IRQ bypass consumer
+ * @consumer: pointer to consumer structure
+ *
+ * Add the provided IRQ consumer to the list of consumers and connect
+ * with any matching token found on the IRQ producer list.
+ */
+int irq_bypass_register_consumer(struct irq_bypass_consumer *consumer)
+{
+	struct irq_bypass_consumer *tmp;
+	struct irq_bypass_producer *producer;
+
+	if (!consumer->add_producer || !consumer->del_producer)
+		return -EINVAL;
+
+	might_sleep();
+
+	if (!try_module_get(THIS_MODULE))
+		return -ENODEV;
+
+	mutex_lock(&lock);
+
+	list_for_each_entry(tmp, &consumers, node) {
+		if (tmp->token == consumer->token) {
+			mutex_unlock(&lock);
+			module_put(THIS_MODULE);
+			return -EBUSY;
+		}
+	}
+
+	list_for_each_entry(producer, &producers, node) {
+		if (producer->token == consumer->token) {
+			int ret = __connect(producer, consumer);
+			if (ret) {
+				mutex_unlock(&lock);
+				module_put(THIS_MODULE);
+				return ret;
+			}
+			break;
+		}
+	}
+
+	list_add(&consumer->node, &consumers);
+
+	mutex_unlock(&lock);
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(irq_bypass_register_consumer);
+
+/**
+ * irq_bypass_unregister_consumer - unregister IRQ bypass consumer
+ * @consumer: pointer to consumer structure
+ *
+ * Remove a previously registered IRQ consumer from the list of consumers
+ * and disconnect it from any connected IRQ producer.
+ */
+void irq_bypass_unregister_consumer(struct irq_bypass_consumer *consumer)
+{
+	struct irq_bypass_consumer *tmp;
+	struct irq_bypass_producer *producer;
+
+	might_sleep();
+
+	if (!try_module_get(THIS_MODULE))
+		return; /* nothing in the list anyway */
+
+	mutex_lock(&lock);
+
+	list_for_each_entry(tmp, &consumers, node) {
+		if (tmp->token != consumer->token)
+			continue;
+
+		list_for_each_entry(producer, &producers, node) {
+			if (producer->token == consumer->token) {
+				__disconnect(producer, consumer);
+				break;
+			}
+		}
+
+		list_del(&consumer->node);
+		module_put(THIS_MODULE);
+		break;
+	}
+
+	mutex_unlock(&lock);
+
+	module_put(THIS_MODULE);
+}
+EXPORT_SYMBOL_GPL(irq_bypass_unregister_consumer);
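
Editor's note: to see how the two halves meet, here is a hedged sketch of a producer (VFIO-like) and a consumer (the irqfd code earlier in this diff) registering with the manager. Both sides advertise the same token, typically the eventfd they share, and __connect() runs as soon as the second side registers. Callback bodies and all names are illustrative; struct irq_bypass_producer/consumer are declared in include/linux/irqbypass.h, which is not part of this diff.

#include <linux/irqbypass.h>

static int demo_add_producer(struct irq_bypass_consumer *cons,
			     struct irq_bypass_producer *prod)
{
	/* e.g. program a VT-d posted-interrupt entry for prod->irq */
	return 0;
}

static void demo_del_producer(struct irq_bypass_consumer *cons,
			      struct irq_bypass_producer *prod)
{
	/* tear the bypass route back down */
}

static struct irq_bypass_producer demo_producer;
static struct irq_bypass_consumer demo_consumer = {
	/* add_producer/del_producer are mandatory, stop/start are optional */
	.add_producer = demo_add_producer,
	.del_producer = demo_del_producer,
};

static int demo_connect(void *shared_token, int host_irq)
{
	int ret;

	/* both sides use the same token so the manager can pair them */
	demo_producer.token = shared_token;
	demo_producer.irq   = host_irq;
	demo_consumer.token = shared_token;

	ret = irq_bypass_register_producer(&demo_producer);
	if (ret)
		return ret;

	/* __connect() fires inside this call because the tokens match */
	return irq_bypass_register_consumer(&demo_consumer);
}

Unregistering either side with irq_bypass_unregister_producer()/irq_bypass_unregister_consumer() runs __disconnect() and drops the entry from the corresponding list.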