
Merge branch 'kvm-updates/2.6.36' of git://git.kernel.org/pub/scm/virt/kvm/kvm

* 'kvm-updates/2.6.36' of git://git.kernel.org/pub/scm/virt/kvm/kvm: (198 commits)
  KVM: VMX: Fix host GDT.LIMIT corruption
  KVM: MMU: using __xchg_spte more smarter
  KVM: MMU: cleanup spte set and accssed/dirty tracking
  KVM: MMU: don't atomicly set spte if it's not present
  KVM: MMU: fix page dirty tracking lost while sync page
  KVM: MMU: fix broken page accessed tracking with ept enabled
  KVM: MMU: add missing reserved bits check in speculative path
  KVM: MMU: fix mmu notifier invalidate handler for huge spte
  KVM: x86 emulator: fix xchg instruction emulation
  KVM: x86: Call mask notifiers from pic
  KVM: x86: never re-execute instruction with enabled tdp
  KVM: Document KVM_GET_SUPPORTED_CPUID2 ioctl
  KVM: x86: emulator: inc/dec can have lock prefix
  KVM: MMU: Eliminate redundant temporaries in FNAME(fetch)
  KVM: MMU: Validate all gptes during fetch, not just those used for new pages
  KVM: MMU: Simplify spte fetch() function
  KVM: MMU: Add gpte_valid() helper
  KVM: MMU: Add validate_direct_spte() helper
  KVM: MMU: Add drop_large_spte() helper
  KVM: MMU: Use __set_spte to link shadow pages
  ...
Linus Torvalds, 15 years ago
commit 5e83f6fbdb
63 changed files with 3328 additions and 2103 deletions
  1. Documentation/feature-removal-schedule.txt (+0 -21)
  2. Documentation/kvm/api.txt (+174 -34)
  3. Documentation/kvm/mmu.txt (+48 -4)
  4. Documentation/kvm/msr.txt (+153 -0)
  5. Documentation/kvm/review-checklist.txt (+38 -0)
  6. arch/ia64/include/asm/kvm_host.h (+1 -0)
  7. arch/ia64/kvm/kvm-ia64.c (+13 -37)
  8. arch/powerpc/include/asm/kvm_book3s.h (+9 -1)
  9. arch/powerpc/include/asm/kvm_fpu.h (+15 -12)
  10. arch/powerpc/include/asm/kvm_host.h (+15 -3)
  11. arch/powerpc/kernel/ppc_ksyms.c (+0 -4)
  12. arch/powerpc/kvm/44x_tlb.c (+2 -1)
  13. arch/powerpc/kvm/Makefile (+2 -0)
  14. arch/powerpc/kvm/book3s.c (+39 -40)
  15. arch/powerpc/kvm/book3s_32_mmu.c (+4 -4)
  16. arch/powerpc/kvm/book3s_32_mmu_host.c (+12 -122)
  17. arch/powerpc/kvm/book3s_64_mmu_host.c (+9 -120)
  18. arch/powerpc/kvm/book3s_mmu_hpte.c (+277 -0)
  19. arch/powerpc/kvm/book3s_paired_singles.c (+37 -57)
  20. arch/powerpc/kvm/booke.c (+1 -11)
  21. arch/powerpc/kvm/fpu.S (+18 -0)
  22. arch/powerpc/kvm/powerpc.c (+3 -11)
  23. arch/s390/include/asm/kvm_host.h (+3 -2)
  24. arch/s390/kvm/intercept.c (+1 -1)
  25. arch/s390/kvm/kvm-s390.c (+19 -45)
  26. arch/s390/kvm/kvm-s390.h (+1 -1)
  27. arch/x86/include/asm/i387.h (+2 -0)
  28. arch/x86/include/asm/kvm.h (+22 -0)
  29. arch/x86/include/asm/kvm_emulate.h (+25 -5)
  30. arch/x86/include/asm/kvm_host.h (+17 -53)
  31. arch/x86/include/asm/msr-index.h (+2 -0)
  32. arch/x86/include/asm/vmx.h (+5 -0)
  33. arch/x86/include/asm/xsave.h (+6 -0)
  34. arch/x86/kernel/i387.c (+2 -1)
  35. arch/x86/kernel/process.c (+1 -0)
  36. arch/x86/kvm/emulate.c (+422 -327)
  37. arch/x86/kvm/i8254.c (+90 -56)
  38. arch/x86/kvm/i8254.h (+3 -1)
  39. arch/x86/kvm/i8259.c (+31 -17)
  40. arch/x86/kvm/irq.c (+1 -1)
  41. arch/x86/kvm/irq.h (+0 -4)
  42. arch/x86/kvm/kvm_cache_regs.h (+8 -0)
  43. arch/x86/kvm/lapic.c (+8 -9)
  44. arch/x86/kvm/mmu.c (+497 -310)
  45. arch/x86/kvm/mmutrace.h (+1 -1)
  46. arch/x86/kvm/paging_tmpl.h (+145 -107)
  47. arch/x86/kvm/svm.c (+121 -17)
  48. arch/x86/kvm/timer.c (+15 -1)
  49. arch/x86/kvm/vmx.c (+175 -78)
  50. arch/x86/kvm/x86.c (+649 -525)
  51. arch/x86/kvm/x86.h (+0 -7)
  52. include/linux/kvm.h (+13 -0)
  53. include/linux/kvm_host.h (+27 -8)
  54. include/linux/kvm_types.h (+2 -2)
  55. include/linux/mm.h (+8 -0)
  56. mm/memory-failure.c (+33 -0)
  57. virt/kvm/assigned-dev.c (+1 -6)
  58. virt/kvm/coalesced_mmio.c (+1 -0)
  59. virt/kvm/eventfd.c (+1 -0)
  60. virt/kvm/ioapic.c (+2 -1)
  61. virt/kvm/iommu.c (+9 -3)
  62. virt/kvm/irq_comm.c (+10 -5)
  63. virt/kvm/kvm_main.c (+79 -27)

+ 0 - 21
Documentation/feature-removal-schedule.txt

@@ -487,17 +487,6 @@ Who:	Jan Kiszka <jan.kiszka@web.de>
 
 ----------------------------
 
-What:	KVM memory aliases support
-When:	July 2010
-Why:	Memory aliasing support is used for speeding up guest vga access
-	through the vga windows.
-
-	Modern userspace no longer uses this feature, so it's just bitrotted
-	code and can be removed with no impact.
-Who:	Avi Kivity <avi@redhat.com>
-
-----------------------------
-
 What:	xtime, wall_to_monotonic
 When:	2.6.36+
 Files:	kernel/time/timekeeping.c include/linux/time.h
@@ -508,16 +497,6 @@ Who:	John Stultz <johnstul@us.ibm.com>
 
 ----------------------------
 
-What:	KVM kernel-allocated memory slots
-When:	July 2010
-Why:	Since 2.6.25, kvm supports user-allocated memory slots, which are
-	much more flexible than kernel-allocated slots.  All current userspace
-	supports the newer interface and this code can be removed with no
-	impact.
-Who:	Avi Kivity <avi@redhat.com>
-
-----------------------------
-
 What:	KVM paravirt mmu host support
 When:	January 2011
 Why:	The paravirt mmu host support is slower than non-paravirt mmu, both

+ 174 - 34
Documentation/kvm/api.txt

@@ -126,6 +126,10 @@ user fills in the size of the indices array in nmsrs, and in return
 kvm adjusts nmsrs to reflect the actual number of msrs and fills in
 the indices array with their numbers.
 
+Note: if kvm indicates supports MCE (KVM_CAP_MCE), then the MCE bank MSRs are
+not returned in the MSR list, as different vcpus can have a different number
+of banks, as set via the KVM_X86_SETUP_MCE ioctl.
+
 4.4 KVM_CHECK_EXTENSION
 
 Capability: basic
@@ -160,29 +164,7 @@ Type: vm ioctl
 Parameters: struct kvm_memory_region (in)
 Returns: 0 on success, -1 on error
 
-struct kvm_memory_region {
-	__u32 slot;
-	__u32 flags;
-	__u64 guest_phys_addr;
-	__u64 memory_size; /* bytes */
-};
-
-/* for kvm_memory_region::flags */
-#define KVM_MEM_LOG_DIRTY_PAGES  1UL
-
-This ioctl allows the user to create or modify a guest physical memory
-slot.  When changing an existing slot, it may be moved in the guest
-physical memory space, or its flags may be modified.  It may not be
-resized.  Slots may not overlap.
-
-The flags field supports just one flag, KVM_MEM_LOG_DIRTY_PAGES, which
-instructs kvm to keep track of writes to memory within the slot.  See
-the KVM_GET_DIRTY_LOG ioctl.
-
-It is recommended to use the KVM_SET_USER_MEMORY_REGION ioctl instead
-of this API, if available.  This newer API allows placing guest memory
-at specified locations in the host address space, yielding better
-control and easy access.
+This ioctl is obsolete and has been removed.
 
 4.6 KVM_CREATE_VCPU
 
@@ -226,17 +208,7 @@ Type: vm ioctl
 Parameters: struct kvm_memory_alias (in)
 Returns: 0 (success), -1 (error)
 
-struct kvm_memory_alias {
-	__u32 slot;  /* this has a different namespace than memory slots */
-	__u32 flags;
-	__u64 guest_phys_addr;
-	__u64 memory_size;
-	__u64 target_phys_addr;
-};
-
-Defines a guest physical address space region as an alias to another
-region.  Useful for aliased address, for example the VGA low memory
-window. Should not be used with userspace memory.
+This ioctl is obsolete and has been removed.
 
 4.9 KVM_RUN
 
@@ -892,6 +864,174 @@ arguments.
 This ioctl is only useful after KVM_CREATE_IRQCHIP.  Without an in-kernel
 irqchip, the multiprocessing state must be maintained by userspace.
 
+4.39 KVM_SET_IDENTITY_MAP_ADDR
+
+Capability: KVM_CAP_SET_IDENTITY_MAP_ADDR
+Architectures: x86
+Type: vm ioctl
+Parameters: unsigned long identity (in)
+Returns: 0 on success, -1 on error
+
+This ioctl defines the physical address of a one-page region in the guest
+physical address space.  The region must be within the first 4GB of the
+guest physical address space and must not conflict with any memory slot
+or any mmio address.  The guest may malfunction if it accesses this memory
+region.
+
+This ioctl is required on Intel-based hosts.  This is needed on Intel hardware
+because of a quirk in the virtualization implementation (see the internals
+documentation when it pops into existence).
+
+4.40 KVM_SET_BOOT_CPU_ID
+
+Capability: KVM_CAP_SET_BOOT_CPU_ID
+Architectures: x86, ia64
+Type: vm ioctl
+Parameters: unsigned long vcpu_id
+Returns: 0 on success, -1 on error
+
+Define which vcpu is the Bootstrap Processor (BSP).  Values are the same
+as the vcpu id in KVM_CREATE_VCPU.  If this ioctl is not called, the default
+is vcpu 0.
+
+4.41 KVM_GET_XSAVE
+
+Capability: KVM_CAP_XSAVE
+Architectures: x86
+Type: vcpu ioctl
+Parameters: struct kvm_xsave (out)
+Returns: 0 on success, -1 on error
+
+struct kvm_xsave {
+	__u32 region[1024];
+};
+
+This ioctl would copy current vcpu's xsave struct to the userspace.
+
+4.42 KVM_SET_XSAVE
+
+Capability: KVM_CAP_XSAVE
+Architectures: x86
+Type: vcpu ioctl
+Parameters: struct kvm_xsave (in)
+Returns: 0 on success, -1 on error
+
+struct kvm_xsave {
+	__u32 region[1024];
+};
+
+This ioctl would copy userspace's xsave struct to the kernel.
+
+4.43 KVM_GET_XCRS
+
+Capability: KVM_CAP_XCRS
+Architectures: x86
+Type: vcpu ioctl
+Parameters: struct kvm_xcrs (out)
+Returns: 0 on success, -1 on error
+
+struct kvm_xcr {
+	__u32 xcr;
+	__u32 reserved;
+	__u64 value;
+};
+
+struct kvm_xcrs {
+	__u32 nr_xcrs;
+	__u32 flags;
+	struct kvm_xcr xcrs[KVM_MAX_XCRS];
+	__u64 padding[16];
+};
+
+This ioctl would copy current vcpu's xcrs to the userspace.
+
+4.44 KVM_SET_XCRS
+
+Capability: KVM_CAP_XCRS
+Architectures: x86
+Type: vcpu ioctl
+Parameters: struct kvm_xcrs (in)
+Returns: 0 on success, -1 on error
+
+struct kvm_xcr {
+	__u32 xcr;
+	__u32 reserved;
+	__u64 value;
+};
+
+struct kvm_xcrs {
+	__u32 nr_xcrs;
+	__u32 flags;
+	struct kvm_xcr xcrs[KVM_MAX_XCRS];
+	__u64 padding[16];
+};
+
+This ioctl would set vcpu's xcr to the value userspace specified.
+
+4.45 KVM_GET_SUPPORTED_CPUID
+
+Capability: KVM_CAP_EXT_CPUID
+Architectures: x86
+Type: system ioctl
+Parameters: struct kvm_cpuid2 (in/out)
+Returns: 0 on success, -1 on error
+
+struct kvm_cpuid2 {
+	__u32 nent;
+	__u32 padding;
+	struct kvm_cpuid_entry2 entries[0];
+};
+
+#define KVM_CPUID_FLAG_SIGNIFCANT_INDEX 1
+#define KVM_CPUID_FLAG_STATEFUL_FUNC    2
+#define KVM_CPUID_FLAG_STATE_READ_NEXT  4
+
+struct kvm_cpuid_entry2 {
+	__u32 function;
+	__u32 index;
+	__u32 flags;
+	__u32 eax;
+	__u32 ebx;
+	__u32 ecx;
+	__u32 edx;
+	__u32 padding[3];
+};
+
+This ioctl returns x86 cpuid features which are supported by both the hardware
+and kvm.  Userspace can use the information returned by this ioctl to
+construct cpuid information (for KVM_SET_CPUID2) that is consistent with
+hardware, kernel, and userspace capabilities, and with user requirements (for
+example, the user may wish to constrain cpuid to emulate older hardware,
+or for feature consistency across a cluster).
+
+Userspace invokes KVM_GET_SUPPORTED_CPUID by passing a kvm_cpuid2 structure
+with the 'nent' field indicating the number of entries in the variable-size
+array 'entries'.  If the number of entries is too low to describe the cpu
+capabilities, an error (E2BIG) is returned.  If the number is too high,
+the 'nent' field is adjusted and an error (ENOMEM) is returned.  If the
+number is just right, the 'nent' field is adjusted to the number of valid
+entries in the 'entries' array, which is then filled.
+
+The entries returned are the host cpuid as returned by the cpuid instruction,
+with unknown or unsupported features masked out.  The fields in each entry
+are defined as follows:
+
+  function: the eax value used to obtain the entry
+  index: the ecx value used to obtain the entry (for entries that are
+         affected by ecx)
+  flags: an OR of zero or more of the following:
+        KVM_CPUID_FLAG_SIGNIFCANT_INDEX:
+           if the index field is valid
+        KVM_CPUID_FLAG_STATEFUL_FUNC:
+           if cpuid for this function returns different values for successive
+           invocations; there will be several entries with the same function,
+           all with this flag set
+        KVM_CPUID_FLAG_STATE_READ_NEXT:
+           for KVM_CPUID_FLAG_STATEFUL_FUNC entries, set if this entry is
+           the first entry to be read by a cpu
+   eax, ebx, ecx, edx: the values returned by the cpuid instruction for
+         this function/index combination
+
 5. The kvm_run structure
 
 Application code obtains a pointer to the kvm_run structure by

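The nent handshake described for KVM_GET_SUPPORTED_CPUID lends itself to a simple retry loop in userspace. The sketch below is illustrative only and is not part of this patch; the starting array size, the doubling policy and the error handling are assumptions.

/* Illustrative sketch: query KVM_GET_SUPPORTED_CPUID from userspace,
 * growing the entries array until the kernel stops returning E2BIG.
 * The initial size of 64 entries is an arbitrary assumption.
 */
#include <errno.h>
#include <stdlib.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>

static struct kvm_cpuid2 *get_supported_cpuid(int kvm_fd)
{
	int nent = 64;

	for (;;) {
		struct kvm_cpuid2 *cpuid;

		cpuid = calloc(1, sizeof(*cpuid) +
				  nent * sizeof(struct kvm_cpuid_entry2));
		if (!cpuid)
			return NULL;
		cpuid->nent = nent;

		if (ioctl(kvm_fd, KVM_GET_SUPPORTED_CPUID, cpuid) == 0)
			return cpuid;	/* nent now holds the valid entry count */

		free(cpuid);
		if (errno != E2BIG)
			return NULL;	/* e.g. ENOMEM when nent was too large */
		nent *= 2;		/* too small: retry with a bigger array */
	}
}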
+ 48 - 4
Documentation/kvm/mmu.txt

@@ -77,10 +77,10 @@ Memory
 
 Guest memory (gpa) is part of the user address space of the process that is
 using kvm.  Userspace defines the translation between guest addresses and user
-addresses (gpa->hva); note that two gpas may alias to the same gva, but not
+addresses (gpa->hva); note that two gpas may alias to the same hva, but not
 vice versa.
 
-These gvas may be backed using any method available to the host: anonymous
+These hvas may be backed using any method available to the host: anonymous
 memory, file backed memory, and device memory.  Memory might be paged by the
 host at any time.
 
@@ -161,7 +161,7 @@ Shadow pages contain the following information:
   role.cr4_pae:
     Contains the value of cr4.pae for which the page is valid (e.g. whether
     32-bit or 64-bit gptes are in use).
-  role.cr4_nxe:
+  role.nxe:
     Contains the value of efer.nxe for which the page is valid.
   role.cr0_wp:
     Contains the value of cr0.wp for which the page is valid.
@@ -180,7 +180,9 @@ Shadow pages contain the following information:
     guest pages as leaves.
   gfns:
     An array of 512 guest frame numbers, one for each present pte.  Used to
-    perform a reverse map from a pte to a gfn.
+    perform a reverse map from a pte to a gfn. When role.direct is set, any
+    element of this array can be calculated from the gfn field when used, in
+    this case, the array of gfns is not allocated. See role.direct and gfn.
   slot_bitmap:
     A bitmap containing one bit per memory slot.  If the page contains a pte
     mapping a page from memory slot n, then bit n of slot_bitmap will be set
@@ -296,6 +298,48 @@ Host translation updates:
   - look up affected sptes through reverse map
   - drop (or update) translations
 
+Emulating cr0.wp
+================
+
+If tdp is not enabled, the host must keep cr0.wp=1 so page write protection
+works for the guest kernel, not guest guest userspace.  When the guest
+cr0.wp=1, this does not present a problem.  However when the guest cr0.wp=0,
+we cannot map the permissions for gpte.u=1, gpte.w=0 to any spte (the
+semantics require allowing any guest kernel access plus user read access).
+
+We handle this by mapping the permissions to two possible sptes, depending
+on fault type:
+
+- kernel write fault: spte.u=0, spte.w=1 (allows full kernel access,
+  disallows user access)
+- read fault: spte.u=1, spte.w=0 (allows full read access, disallows kernel
+  write access)
+
+(user write faults generate a #PF)
+
+Large pages
+===========
+
+The mmu supports all combinations of large and small guest and host pages.
+Supported page sizes include 4k, 2M, 4M, and 1G.  4M pages are treated as
+two separate 2M pages, on both guest and host, since the mmu always uses PAE
+paging.
+
+To instantiate a large spte, four constraints must be satisfied:
+
+- the spte must point to a large host page
+- the guest pte must be a large pte of at least equivalent size (if tdp is
+  enabled, there is no guest pte and this condition is satisified)
+- if the spte will be writeable, the large page frame may not overlap any
+  write-protected pages
+- the guest page must be wholly contained by a single memory slot
+
+To check the last two conditions, the mmu maintains a ->write_count set of
+arrays for each memory slot and large page size.  Every write protected page
+causes its write_count to be incremented, thus preventing instantiation of
+a large spte.  The frames at the end of an unaligned memory slot have
+artificically inflated ->write_counts so they can never be instantiated.
+
 Further reading
 ===============
 

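The cr0.wp=0 rule in the new "Emulating cr0.wp" section maps one guest permission combination (gpte.u=1, gpte.w=0) onto one of two sptes depending on the type of the faulting access. The helper below is purely hypothetical (the names do not exist in this patch); it only restates that decision table in executable form.

/* Hypothetical illustration of the cr0.wp=0 mapping described above. */
struct spte_perm {
	int user;	/* spte.u */
	int write;	/* spte.w */
};

static struct spte_perm spte_for_wp0(int write_fault, int user_fault)
{
	struct spte_perm s;

	if (write_fault && !user_fault) {
		/* kernel write fault: full kernel access, no user access */
		s.user = 0;
		s.write = 1;
	} else {
		/* read fault: full read access, no kernel write access */
		s.user = 1;
		s.write = 0;
	}
	/* user write faults are not mapped; they raise #PF in the guest */
	return s;
}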
+ 153 - 0
Documentation/kvm/msr.txt

@@ -0,0 +1,153 @@
+KVM-specific MSRs.
+Glauber Costa <glommer@redhat.com>, Red Hat Inc, 2010
+=====================================================
+
+KVM makes use of some custom MSRs to service some requests.
+At present, this facility is only used by kvmclock.
+
+Custom MSRs have a range reserved for them, that goes from
+0x4b564d00 to 0x4b564dff. There are MSRs outside this area,
+but they are deprecated and their use is discouraged.
+
+Custom MSR list
+--------
+
+The current supported Custom MSR list is:
+
+MSR_KVM_WALL_CLOCK_NEW:   0x4b564d00
+
+	data: 4-byte alignment physical address of a memory area which must be
+	in guest RAM. This memory is expected to hold a copy of the following
+	structure:
+
+	struct pvclock_wall_clock {
+		u32   version;
+		u32   sec;
+		u32   nsec;
+	} __attribute__((__packed__));
+
+	whose data will be filled in by the hypervisor. The hypervisor is only
+	guaranteed to update this data at the moment of MSR write.
+	Users that want to reliably query this information more than once have
+	to write more than once to this MSR. Fields have the following meanings:
+
+		version: guest has to check version before and after grabbing
+		time information and check that they are both equal and even.
+		An odd version indicates an in-progress update.
+
+		sec: number of seconds for wallclock.
+
+		nsec: number of nanoseconds for wallclock.
+
+	Note that although MSRs are per-CPU entities, the effect of this
+	particular MSR is global.
+
+	Availability of this MSR must be checked via bit 3 in 0x4000001 cpuid
+	leaf prior to usage.
+
+MSR_KVM_SYSTEM_TIME_NEW:  0x4b564d01
+
+	data: 4-byte aligned physical address of a memory area which must be in
+	guest RAM, plus an enable bit in bit 0. This memory is expected to hold
+	a copy of the following structure:
+
+	struct pvclock_vcpu_time_info {
+		u32   version;
+		u32   pad0;
+		u64   tsc_timestamp;
+		u64   system_time;
+		u32   tsc_to_system_mul;
+		s8    tsc_shift;
+		u8    flags;
+		u8    pad[2];
+	} __attribute__((__packed__)); /* 32 bytes */
+
+	whose data will be filled in by the hypervisor periodically. Only one
+	write, or registration, is needed for each VCPU. The interval between
+	updates of this structure is arbitrary and implementation-dependent.
+	The hypervisor may update this structure at any time it sees fit until
+	anything with bit0 == 0 is written to it.
+
+	Fields have the following meanings:
+
+		version: guest has to check version before and after grabbing
+		time information and check that they are both equal and even.
+		An odd version indicates an in-progress update.
+
+		tsc_timestamp: the tsc value at the current VCPU at the time
+		of the update of this structure. Guests can subtract this value
+		from current tsc to derive a notion of elapsed time since the
+		structure update.
+
+		system_time: a host notion of monotonic time, including sleep
+		time at the time this structure was last updated. Unit is
+		nanoseconds.
+
+		tsc_to_system_mul: a function of the tsc frequency. One has
+		to multiply any tsc-related quantity by this value to get
+		a value in nanoseconds, besides dividing by 2^tsc_shift
+
+		tsc_shift: cycle to nanosecond divider, as a power of two, to
+		allow for shift rights. One has to shift right any tsc-related
+		quantity by this value to get a value in nanoseconds, besides
+		multiplying by tsc_to_system_mul.
+
+		With this information, guests can derive per-CPU time by
+		doing:
+
+			time = (current_tsc - tsc_timestamp)
+			time = (time * tsc_to_system_mul) >> tsc_shift
+			time = time + system_time
+
+		flags: bits in this field indicate extended capabilities
+		coordinated between the guest and the hypervisor. Availability
+		of specific flags has to be checked in 0x40000001 cpuid leaf.
+		Current flags are:
+
+		 flag bit   | cpuid bit    | meaning
+		-------------------------------------------------------------
+			    |	           | time measures taken across
+		     0      |	   24      | multiple cpus are guaranteed to
+			    |		   | be monotonic
+		-------------------------------------------------------------
+
+	Availability of this MSR must be checked via bit 3 in 0x4000001 cpuid
+	leaf prior to usage.
+
+
+MSR_KVM_WALL_CLOCK:  0x11
+
+	data and functioning: same as MSR_KVM_WALL_CLOCK_NEW. Use that instead.
+
+	This MSR falls outside the reserved KVM range and may be removed in the
+	future. Its usage is deprecated.
+
+	Availability of this MSR must be checked via bit 0 in 0x4000001 cpuid
+	leaf prior to usage.
+
+MSR_KVM_SYSTEM_TIME: 0x12
+
+	data and functioning: same as MSR_KVM_SYSTEM_TIME_NEW. Use that instead.
+
+	This MSR falls outside the reserved KVM range and may be removed in the
+	future. Its usage is deprecated.
+
+	Availability of this MSR must be checked via bit 0 in 0x4000001 cpuid
+	leaf prior to usage.
+
+	The suggested algorithm for detecting kvmclock presence is then:
+
+		if (!kvm_para_available())    /* refer to cpuid.txt */
+			return NON_PRESENT;
+
+		flags = cpuid_eax(0x40000001);
+		if (flags & 3) {
+			msr_kvm_system_time = MSR_KVM_SYSTEM_TIME_NEW;
+			msr_kvm_wall_clock = MSR_KVM_WALL_CLOCK_NEW;
+			return PRESENT;
+		} else if (flags & 0) {
+			msr_kvm_system_time = MSR_KVM_SYSTEM_TIME;
+			msr_kvm_wall_clock = MSR_KVM_WALL_CLOCK;
+			return PRESENT;
+		} else
+			return NON_PRESENT;

+ 38 - 0
Documentation/kvm/review-checklist.txt

@@ -0,0 +1,38 @@
+Review checklist for kvm patches
+================================
+
+1.  The patch must follow Documentation/CodingStyle and
+    Documentation/SubmittingPatches.
+
+2.  Patches should be against kvm.git master branch.
+
+3.  If the patch introduces or modifies a new userspace API:
+    - the API must be documented in Documentation/kvm/api.txt
+    - the API must be discoverable using KVM_CHECK_EXTENSION
+
+4.  New state must include support for save/restore.
+
+5.  New features must default to off (userspace should explicitly request them).
+    Performance improvements can and should default to on.
+
+6.  New cpu features should be exposed via KVM_GET_SUPPORTED_CPUID2
+
+7.  Emulator changes should be accompanied by unit tests for qemu-kvm.git
+    kvm/test directory.
+
+8.  Changes should be vendor neutral when possible.  Changes to common code
+    are better than duplicating changes to vendor code.
+
+9.  Similarly, prefer changes to arch independent code than to arch dependent
+    code.
+
+10. User/kernel interfaces and guest/host interfaces must be 64-bit clean
+    (all variables and sizes naturally aligned on 64-bit; use specific types
+    only - u64 rather than ulong).
+
+11. New guest visible features must either be documented in a hardware manual
+    or be accompanied by documentation.
+
+12. Features must be robust against reset and kexec - for example, shared
+    host/guest memory must be unshared to prevent the host from writing to
+    guest memory that the guest has not reserved for this purpose.

+ 1 - 0
arch/ia64/include/asm/kvm_host.h

@@ -235,6 +235,7 @@ struct kvm_vm_data {
 #define KVM_REQ_PTC_G		32
 #define KVM_REQ_RESUME		33
 
+#define KVM_HPAGE_GFN_SHIFT(x)	0
 #define KVM_NR_PAGE_SIZES	1
 #define KVM_PAGES_PER_HPAGE(x)	1
 

+ 13 - 37
arch/ia64/kvm/kvm-ia64.c

@@ -725,8 +725,6 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
 	int r;
 	int r;
 	sigset_t sigsaved;
 	sigset_t sigsaved;
 
 
-	vcpu_load(vcpu);
-
 	if (vcpu->sigset_active)
 	if (vcpu->sigset_active)
 		sigprocmask(SIG_SETMASK, &vcpu->sigset, &sigsaved);
 		sigprocmask(SIG_SETMASK, &vcpu->sigset, &sigsaved);
 
 
@@ -748,7 +746,6 @@ out:
 	if (vcpu->sigset_active)
 	if (vcpu->sigset_active)
 		sigprocmask(SIG_SETMASK, &sigsaved, NULL);
 		sigprocmask(SIG_SETMASK, &sigsaved, NULL);
 
 
-	vcpu_put(vcpu);
 	return r;
 	return r;
 }
 }
 
 
@@ -883,8 +880,6 @@ int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
 	struct vpd *vpd = to_host(vcpu->kvm, vcpu->arch.vpd);
 	struct vpd *vpd = to_host(vcpu->kvm, vcpu->arch.vpd);
 	int i;
 	int i;
 
 
-	vcpu_load(vcpu);
-
 	for (i = 0; i < 16; i++) {
 	for (i = 0; i < 16; i++) {
 		vpd->vgr[i] = regs->vpd.vgr[i];
 		vpd->vgr[i] = regs->vpd.vgr[i];
 		vpd->vbgr[i] = regs->vpd.vbgr[i];
 		vpd->vbgr[i] = regs->vpd.vbgr[i];
@@ -931,8 +926,6 @@ int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
 	vcpu->arch.itc_offset = regs->saved_itc - kvm_get_itc(vcpu);
 	vcpu->arch.itc_offset = regs->saved_itc - kvm_get_itc(vcpu);
 	set_bit(KVM_REQ_RESUME, &vcpu->requests);
 	set_bit(KVM_REQ_RESUME, &vcpu->requests);
 
 
-	vcpu_put(vcpu);
-
 	return 0;
 	return 0;
 }
 }
 
 
@@ -1802,35 +1795,24 @@ void kvm_arch_exit(void)
 	kvm_vmm_info = NULL;
 	kvm_vmm_info = NULL;
 }
 }
 
 
-static int kvm_ia64_sync_dirty_log(struct kvm *kvm,
-		struct kvm_dirty_log *log)
+static void kvm_ia64_sync_dirty_log(struct kvm *kvm,
+				    struct kvm_memory_slot *memslot)
 {
 {
-	struct kvm_memory_slot *memslot;
-	int r, i;
+	int i;
 	long base;
 	long base;
 	unsigned long n;
 	unsigned long n;
 	unsigned long *dirty_bitmap = (unsigned long *)(kvm->arch.vm_base +
 	unsigned long *dirty_bitmap = (unsigned long *)(kvm->arch.vm_base +
 			offsetof(struct kvm_vm_data, kvm_mem_dirty_log));
 			offsetof(struct kvm_vm_data, kvm_mem_dirty_log));
 
 
-	r = -EINVAL;
-	if (log->slot >= KVM_MEMORY_SLOTS)
-		goto out;
-
-	memslot = &kvm->memslots->memslots[log->slot];
-	r = -ENOENT;
-	if (!memslot->dirty_bitmap)
-		goto out;
-
 	n = kvm_dirty_bitmap_bytes(memslot);
 	n = kvm_dirty_bitmap_bytes(memslot);
 	base = memslot->base_gfn / BITS_PER_LONG;
 	base = memslot->base_gfn / BITS_PER_LONG;
 
 
+	spin_lock(&kvm->arch.dirty_log_lock);
 	for (i = 0; i < n/sizeof(long); ++i) {
 	for (i = 0; i < n/sizeof(long); ++i) {
 		memslot->dirty_bitmap[i] = dirty_bitmap[base + i];
 		memslot->dirty_bitmap[i] = dirty_bitmap[base + i];
 		dirty_bitmap[base + i] = 0;
 		dirty_bitmap[base + i] = 0;
 	}
 	}
-	r = 0;
-out:
-	return r;
+	spin_unlock(&kvm->arch.dirty_log_lock);
 }
 }
 
 
 int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm,
 int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm,
@@ -1842,12 +1824,17 @@ int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm,
 	int is_dirty = 0;
 	int is_dirty = 0;
 
 
 	mutex_lock(&kvm->slots_lock);
 	mutex_lock(&kvm->slots_lock);
-	spin_lock(&kvm->arch.dirty_log_lock);
 
 
-	r = kvm_ia64_sync_dirty_log(kvm, log);
-	if (r)
+	r = -EINVAL;
+	if (log->slot >= KVM_MEMORY_SLOTS)
+		goto out;
+
+	memslot = &kvm->memslots->memslots[log->slot];
+	r = -ENOENT;
+	if (!memslot->dirty_bitmap)
 		goto out;
 		goto out;
 
 
+	kvm_ia64_sync_dirty_log(kvm, memslot);
 	r = kvm_get_dirty_log(kvm, log, &is_dirty);
 	r = kvm_get_dirty_log(kvm, log, &is_dirty);
 	if (r)
 	if (r)
 		goto out;
 		goto out;
@@ -1855,14 +1842,12 @@ int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm,
 	/* If nothing is dirty, don't bother messing with page tables. */
 	/* If nothing is dirty, don't bother messing with page tables. */
 	if (is_dirty) {
 	if (is_dirty) {
 		kvm_flush_remote_tlbs(kvm);
 		kvm_flush_remote_tlbs(kvm);
-		memslot = &kvm->memslots->memslots[log->slot];
 		n = kvm_dirty_bitmap_bytes(memslot);
 		n = kvm_dirty_bitmap_bytes(memslot);
 		memset(memslot->dirty_bitmap, 0, n);
 		memset(memslot->dirty_bitmap, 0, n);
 	}
 	}
 	r = 0;
 	r = 0;
 out:
 out:
 	mutex_unlock(&kvm->slots_lock);
 	mutex_unlock(&kvm->slots_lock);
-	spin_unlock(&kvm->arch.dirty_log_lock);
 	return r;
 	return r;
 }
 }
 
 
@@ -1953,11 +1938,6 @@ int kvm_cpu_has_pending_timer(struct kvm_vcpu *vcpu)
 	return vcpu->arch.timer_fired;
 	return vcpu->arch.timer_fired;
 }
 }
 
 
-gfn_t unalias_gfn(struct kvm *kvm, gfn_t gfn)
-{
-	return gfn;
-}
-
 int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu)
 int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu)
 {
 {
 	return (vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE) ||
 	return (vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE) ||
@@ -1967,9 +1947,7 @@ int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu)
 int kvm_arch_vcpu_ioctl_get_mpstate(struct kvm_vcpu *vcpu,
 int kvm_arch_vcpu_ioctl_get_mpstate(struct kvm_vcpu *vcpu,
 				    struct kvm_mp_state *mp_state)
 				    struct kvm_mp_state *mp_state)
 {
 {
-	vcpu_load(vcpu);
 	mp_state->mp_state = vcpu->arch.mp_state;
 	mp_state->mp_state = vcpu->arch.mp_state;
-	vcpu_put(vcpu);
 	return 0;
 	return 0;
 }
 }
 
 
@@ -2000,10 +1978,8 @@ int kvm_arch_vcpu_ioctl_set_mpstate(struct kvm_vcpu *vcpu,
 {
 {
 	int r = 0;
 	int r = 0;
 
 
-	vcpu_load(vcpu);
 	vcpu->arch.mp_state = mp_state->mp_state;
 	vcpu->arch.mp_state = mp_state->mp_state;
 	if (vcpu->arch.mp_state == KVM_MP_STATE_UNINITIALIZED)
 	if (vcpu->arch.mp_state == KVM_MP_STATE_UNINITIALIZED)
 		r = vcpu_reset(vcpu);
 		r = vcpu_reset(vcpu);
-	vcpu_put(vcpu);
 	return r;
 	return r;
 }
 }

+ 9 - 1
arch/powerpc/include/asm/kvm_book3s.h

@@ -115,7 +115,15 @@ extern void kvmppc_mmu_book3s_32_init(struct kvm_vcpu *vcpu);
 extern int kvmppc_mmu_map_page(struct kvm_vcpu *vcpu, struct kvmppc_pte *pte);
 extern int kvmppc_mmu_map_segment(struct kvm_vcpu *vcpu, ulong eaddr);
 extern void kvmppc_mmu_flush_segments(struct kvm_vcpu *vcpu);
-extern struct kvmppc_pte *kvmppc_mmu_find_pte(struct kvm_vcpu *vcpu, u64 ea, bool data);
+
+extern void kvmppc_mmu_hpte_cache_map(struct kvm_vcpu *vcpu, struct hpte_cache *pte);
+extern struct hpte_cache *kvmppc_mmu_hpte_cache_next(struct kvm_vcpu *vcpu);
+extern void kvmppc_mmu_hpte_destroy(struct kvm_vcpu *vcpu);
+extern int kvmppc_mmu_hpte_init(struct kvm_vcpu *vcpu);
+extern void kvmppc_mmu_invalidate_pte(struct kvm_vcpu *vcpu, struct hpte_cache *pte);
+extern int kvmppc_mmu_hpte_sysinit(void);
+extern void kvmppc_mmu_hpte_sysexit(void);
+
 extern int kvmppc_ld(struct kvm_vcpu *vcpu, ulong *eaddr, int size, void *ptr, bool data);
 extern int kvmppc_st(struct kvm_vcpu *vcpu, ulong *eaddr, int size, void *ptr, bool data);
 extern void kvmppc_book3s_queue_irqprio(struct kvm_vcpu *vcpu, unsigned int vec);

+ 15 - 12
arch/powerpc/include/asm/kvm_fpu.h

@@ -22,24 +22,24 @@
 
 #include <linux/types.h>
 
-extern void fps_fres(struct thread_struct *t, u32 *dst, u32 *src1);
-extern void fps_frsqrte(struct thread_struct *t, u32 *dst, u32 *src1);
-extern void fps_fsqrts(struct thread_struct *t, u32 *dst, u32 *src1);
+extern void fps_fres(u64 *fpscr, u32 *dst, u32 *src1);
+extern void fps_frsqrte(u64 *fpscr, u32 *dst, u32 *src1);
+extern void fps_fsqrts(u64 *fpscr, u32 *dst, u32 *src1);
 
-extern void fps_fadds(struct thread_struct *t, u32 *dst, u32 *src1, u32 *src2);
-extern void fps_fdivs(struct thread_struct *t, u32 *dst, u32 *src1, u32 *src2);
-extern void fps_fmuls(struct thread_struct *t, u32 *dst, u32 *src1, u32 *src2);
-extern void fps_fsubs(struct thread_struct *t, u32 *dst, u32 *src1, u32 *src2);
+extern void fps_fadds(u64 *fpscr, u32 *dst, u32 *src1, u32 *src2);
+extern void fps_fdivs(u64 *fpscr, u32 *dst, u32 *src1, u32 *src2);
+extern void fps_fmuls(u64 *fpscr, u32 *dst, u32 *src1, u32 *src2);
+extern void fps_fsubs(u64 *fpscr, u32 *dst, u32 *src1, u32 *src2);
 
-extern void fps_fmadds(struct thread_struct *t, u32 *dst, u32 *src1, u32 *src2,
+extern void fps_fmadds(u64 *fpscr, u32 *dst, u32 *src1, u32 *src2,
 		       u32 *src3);
-extern void fps_fmsubs(struct thread_struct *t, u32 *dst, u32 *src1, u32 *src2,
+extern void fps_fmsubs(u64 *fpscr, u32 *dst, u32 *src1, u32 *src2,
 		       u32 *src3);
-extern void fps_fnmadds(struct thread_struct *t, u32 *dst, u32 *src1, u32 *src2,
+extern void fps_fnmadds(u64 *fpscr, u32 *dst, u32 *src1, u32 *src2,
 		        u32 *src3);
-extern void fps_fnmsubs(struct thread_struct *t, u32 *dst, u32 *src1, u32 *src2,
+extern void fps_fnmsubs(u64 *fpscr, u32 *dst, u32 *src1, u32 *src2,
 		        u32 *src3);
-extern void fps_fsel(struct thread_struct *t, u32 *dst, u32 *src1, u32 *src2,
+extern void fps_fsel(u64 *fpscr, u32 *dst, u32 *src1, u32 *src2,
 		     u32 *src3);
 
 #define FPD_ONE_IN(name) extern void fpd_ ## name(u64 *fpscr, u32 *cr, \
@@ -82,4 +82,7 @@ FPD_THREE_IN(fmadd)
 FPD_THREE_IN(fnmsub)
 FPD_THREE_IN(fnmadd)
 
+extern void kvm_cvt_fd(u32 *from, u64 *to, u64 *fpscr);
+extern void kvm_cvt_df(u64 *from, u32 *to, u64 *fpscr);
+
 #endif

+ 15 - 3
arch/powerpc/include/asm/kvm_host.h

@@ -35,10 +35,17 @@
 #define KVM_COALESCED_MMIO_PAGE_OFFSET 1
 
 /* We don't currently support large pages. */
+#define KVM_HPAGE_GFN_SHIFT(x)	0
 #define KVM_NR_PAGE_SIZES	1
 #define KVM_PAGES_PER_HPAGE(x)	(1UL<<31)
 
-#define HPTEG_CACHE_NUM 1024
+#define HPTEG_CACHE_NUM			(1 << 15)
+#define HPTEG_HASH_BITS_PTE		13
+#define HPTEG_HASH_BITS_VPTE		13
+#define HPTEG_HASH_BITS_VPTE_LONG	5
+#define HPTEG_HASH_NUM_PTE		(1 << HPTEG_HASH_BITS_PTE)
+#define HPTEG_HASH_NUM_VPTE		(1 << HPTEG_HASH_BITS_VPTE)
+#define HPTEG_HASH_NUM_VPTE_LONG	(1 << HPTEG_HASH_BITS_VPTE_LONG)
 
 
 struct kvm;
 struct kvm_run;
 };
 };
 
 struct hpte_cache {
+	struct hlist_node list_vpte;
+	struct hlist_node list_vpte_long;
 	u64 host_va;
 	u64 host_va;
 	u64 pfn;
 	ulong slot;
 	unsigned long pending_exceptions;
 	unsigned long pending_exceptions;
 
 #ifdef CONFIG_PPC_BOOK3S
-	int hpte_cache_offset;
+	struct hlist_head hpte_hash_pte[HPTEG_HASH_NUM_PTE];
+	struct hlist_head hpte_hash_vpte[HPTEG_HASH_NUM_VPTE];
+	struct hlist_head hpte_hash_vpte_long[HPTEG_HASH_NUM_VPTE_LONG];
+	int hpte_cache_count;
 #endif
 };
 

+ 0 - 4
arch/powerpc/kernel/ppc_ksyms.c

@@ -101,10 +101,6 @@ EXPORT_SYMBOL(pci_dram_offset);
 EXPORT_SYMBOL(start_thread);
 EXPORT_SYMBOL(kernel_thread);
 
-#ifdef CONFIG_PPC_FPU
-EXPORT_SYMBOL_GPL(cvt_df);
-EXPORT_SYMBOL_GPL(cvt_fd);
-#endif
 EXPORT_SYMBOL(giveup_fpu);
 #ifdef CONFIG_ALTIVEC
 EXPORT_SYMBOL(giveup_altivec);

+ 2 - 1
arch/powerpc/kvm/44x_tlb.c

@@ -316,7 +316,8 @@ void kvmppc_mmu_map(struct kvm_vcpu *vcpu, u64 gvaddr, gpa_t gpaddr,
 	gfn = gpaddr >> PAGE_SHIFT;
 	new_page = gfn_to_page(vcpu->kvm, gfn);
 	if (is_error_page(new_page)) {
-		printk(KERN_ERR "Couldn't get guest page for gfn %lx!\n", gfn);
+		printk(KERN_ERR "Couldn't get guest page for gfn %llx!\n",
+			(unsigned long long)gfn);
 		kvm_release_page_clean(new_page);
 		return;
 	}

+ 2 - 0
arch/powerpc/kvm/Makefile

@@ -45,6 +45,7 @@ kvm-book3s_64-objs := \
 	book3s.o \
 	book3s_emulate.o \
 	book3s_interrupts.o \
+	book3s_mmu_hpte.o \
 	book3s_64_mmu_host.o \
 	book3s_64_mmu.o \
 	book3s_32_mmu.o
@@ -57,6 +58,7 @@ kvm-book3s_32-objs := \
 	book3s.o \
 	book3s_emulate.o \
 	book3s_interrupts.o \
+	book3s_mmu_hpte.o \
 	book3s_32_mmu_host.o \
 	book3s_32_mmu.o
 kvm-objs-$(CONFIG_KVM_BOOK3S_32) := $(kvm-book3s_32-objs)

+ 39 - 40
arch/powerpc/kvm/book3s.c

@@ -1047,8 +1047,6 @@ int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
 {
 {
 	int i;
 	int i;
 
 
-	vcpu_load(vcpu);
-
 	regs->pc = kvmppc_get_pc(vcpu);
 	regs->pc = kvmppc_get_pc(vcpu);
 	regs->cr = kvmppc_get_cr(vcpu);
 	regs->cr = kvmppc_get_cr(vcpu);
 	regs->ctr = kvmppc_get_ctr(vcpu);
 	regs->ctr = kvmppc_get_ctr(vcpu);
@@ -1069,8 +1067,6 @@ int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
 	for (i = 0; i < ARRAY_SIZE(regs->gpr); i++)
 	for (i = 0; i < ARRAY_SIZE(regs->gpr); i++)
 		regs->gpr[i] = kvmppc_get_gpr(vcpu, i);
 		regs->gpr[i] = kvmppc_get_gpr(vcpu, i);
 
 
-	vcpu_put(vcpu);
-
 	return 0;
 	return 0;
 }
 }
 
 
@@ -1078,8 +1074,6 @@ int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
 {
 {
 	int i;
 	int i;
 
 
-	vcpu_load(vcpu);
-
 	kvmppc_set_pc(vcpu, regs->pc);
 	kvmppc_set_pc(vcpu, regs->pc);
 	kvmppc_set_cr(vcpu, regs->cr);
 	kvmppc_set_cr(vcpu, regs->cr);
 	kvmppc_set_ctr(vcpu, regs->ctr);
 	kvmppc_set_ctr(vcpu, regs->ctr);
@@ -1099,8 +1093,6 @@ int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
 	for (i = 0; i < ARRAY_SIZE(regs->gpr); i++)
 	for (i = 0; i < ARRAY_SIZE(regs->gpr); i++)
 		kvmppc_set_gpr(vcpu, i, regs->gpr[i]);
 		kvmppc_set_gpr(vcpu, i, regs->gpr[i]);
 
 
-	vcpu_put(vcpu);
-
 	return 0;
 	return 0;
 }
 }
 
 
@@ -1110,8 +1102,6 @@ int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu,
 	struct kvmppc_vcpu_book3s *vcpu3s = to_book3s(vcpu);
 	struct kvmppc_vcpu_book3s *vcpu3s = to_book3s(vcpu);
 	int i;
 	int i;
 
 
-	vcpu_load(vcpu);
-
 	sregs->pvr = vcpu->arch.pvr;
 	sregs->pvr = vcpu->arch.pvr;
 
 
 	sregs->u.s.sdr1 = to_book3s(vcpu)->sdr1;
 	sregs->u.s.sdr1 = to_book3s(vcpu)->sdr1;
@@ -1131,8 +1121,6 @@ int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu,
 		}
 		}
 	}
 	}
 
 
-	vcpu_put(vcpu);
-
 	return 0;
 	return 0;
 }
 }
 
 
@@ -1142,8 +1130,6 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
 	struct kvmppc_vcpu_book3s *vcpu3s = to_book3s(vcpu);
 	struct kvmppc_vcpu_book3s *vcpu3s = to_book3s(vcpu);
 	int i;
 	int i;
 
 
-	vcpu_load(vcpu);
-
 	kvmppc_set_pvr(vcpu, sregs->pvr);
 	kvmppc_set_pvr(vcpu, sregs->pvr);
 
 
 	vcpu3s->sdr1 = sregs->u.s.sdr1;
 	vcpu3s->sdr1 = sregs->u.s.sdr1;
@@ -1171,8 +1157,6 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
 	/* Flush the MMU after messing with the segments */
 	/* Flush the MMU after messing with the segments */
 	kvmppc_mmu_pte_flush(vcpu, 0, 0);
 	kvmppc_mmu_pte_flush(vcpu, 0, 0);
 
 
-	vcpu_put(vcpu);
-
 	return 0;
 	return 0;
 }
 }
 
 
@@ -1309,12 +1293,17 @@ extern int __kvmppc_vcpu_entry(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu);
 int __kvmppc_vcpu_run(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
 int __kvmppc_vcpu_run(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
 {
 {
 	int ret;
 	int ret;
-	struct thread_struct ext_bkp;
+	double fpr[32][TS_FPRWIDTH];
+	unsigned int fpscr;
+	int fpexc_mode;
 #ifdef CONFIG_ALTIVEC
 #ifdef CONFIG_ALTIVEC
-	bool save_vec = current->thread.used_vr;
+	vector128 vr[32];
+	vector128 vscr;
+	unsigned long uninitialized_var(vrsave);
+	int used_vr;
 #endif
 #endif
 #ifdef CONFIG_VSX
 #ifdef CONFIG_VSX
-	bool save_vsx = current->thread.used_vsr;
+	int used_vsr;
 #endif
 #endif
 	ulong ext_msr;
 	ulong ext_msr;
 
 
@@ -1327,27 +1316,27 @@ int __kvmppc_vcpu_run(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
 	/* Save FPU state in stack */
 	/* Save FPU state in stack */
 	if (current->thread.regs->msr & MSR_FP)
 	if (current->thread.regs->msr & MSR_FP)
 		giveup_fpu(current);
 		giveup_fpu(current);
-	memcpy(ext_bkp.fpr, current->thread.fpr, sizeof(current->thread.fpr));
-	ext_bkp.fpscr = current->thread.fpscr;
-	ext_bkp.fpexc_mode = current->thread.fpexc_mode;
+	memcpy(fpr, current->thread.fpr, sizeof(current->thread.fpr));
+	fpscr = current->thread.fpscr.val;
+	fpexc_mode = current->thread.fpexc_mode;
 
 
 #ifdef CONFIG_ALTIVEC
 #ifdef CONFIG_ALTIVEC
 	/* Save Altivec state in stack */
 	/* Save Altivec state in stack */
-	if (save_vec) {
+	used_vr = current->thread.used_vr;
+	if (used_vr) {
 		if (current->thread.regs->msr & MSR_VEC)
 		if (current->thread.regs->msr & MSR_VEC)
 			giveup_altivec(current);
 			giveup_altivec(current);
-		memcpy(ext_bkp.vr, current->thread.vr, sizeof(ext_bkp.vr));
-		ext_bkp.vscr = current->thread.vscr;
-		ext_bkp.vrsave = current->thread.vrsave;
+		memcpy(vr, current->thread.vr, sizeof(current->thread.vr));
+		vscr = current->thread.vscr;
+		vrsave = current->thread.vrsave;
 	}
 	}
-	ext_bkp.used_vr = current->thread.used_vr;
 #endif
 #endif
 
 
 #ifdef CONFIG_VSX
 #ifdef CONFIG_VSX
 	/* Save VSX state in stack */
 	/* Save VSX state in stack */
-	if (save_vsx && (current->thread.regs->msr & MSR_VSX))
+	used_vsr = current->thread.used_vsr;
+	if (used_vsr && (current->thread.regs->msr & MSR_VSX))
 			__giveup_vsx(current);
 			__giveup_vsx(current);
-	ext_bkp.used_vsr = current->thread.used_vsr;
 #endif
 #endif
 
 
 	/* Remember the MSR with disabled extensions */
 	/* Remember the MSR with disabled extensions */
@@ -1372,22 +1361,22 @@ int __kvmppc_vcpu_run(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
 	kvmppc_giveup_ext(vcpu, MSR_VSX);
 	kvmppc_giveup_ext(vcpu, MSR_VSX);
 
 
 	/* Restore FPU state from stack */
 	/* Restore FPU state from stack */
-	memcpy(current->thread.fpr, ext_bkp.fpr, sizeof(ext_bkp.fpr));
-	current->thread.fpscr = ext_bkp.fpscr;
-	current->thread.fpexc_mode = ext_bkp.fpexc_mode;
+	memcpy(current->thread.fpr, fpr, sizeof(current->thread.fpr));
+	current->thread.fpscr.val = fpscr;
+	current->thread.fpexc_mode = fpexc_mode;
 
 
 #ifdef CONFIG_ALTIVEC
 #ifdef CONFIG_ALTIVEC
 	/* Restore Altivec state from stack */
 	/* Restore Altivec state from stack */
-	if (save_vec && current->thread.used_vr) {
-		memcpy(current->thread.vr, ext_bkp.vr, sizeof(ext_bkp.vr));
-		current->thread.vscr = ext_bkp.vscr;
-		current->thread.vrsave= ext_bkp.vrsave;
+	if (used_vr && current->thread.used_vr) {
+		memcpy(current->thread.vr, vr, sizeof(current->thread.vr));
+		current->thread.vscr = vscr;
+		current->thread.vrsave = vrsave;
 	}
 	}
-	current->thread.used_vr = ext_bkp.used_vr;
+	current->thread.used_vr = used_vr;
 #endif
 #endif
 
 
 #ifdef CONFIG_VSX
 #ifdef CONFIG_VSX
-	current->thread.used_vsr = ext_bkp.used_vsr;
+	current->thread.used_vsr = used_vsr;
 #endif
 #endif
 
 
 	return ret;
 	return ret;
@@ -1395,12 +1384,22 @@ int __kvmppc_vcpu_run(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
 
 
 static int kvmppc_book3s_init(void)
 static int kvmppc_book3s_init(void)
 {
 {
-	return kvm_init(NULL, sizeof(struct kvmppc_vcpu_book3s), 0,
-			THIS_MODULE);
+	int r;
+
+	r = kvm_init(NULL, sizeof(struct kvmppc_vcpu_book3s), 0,
+		     THIS_MODULE);
+
+	if (r)
+		return r;
+
+	r = kvmppc_mmu_hpte_sysinit();
+
+	return r;
 }
 }
 
 
 static void kvmppc_book3s_exit(void)
 static void kvmppc_book3s_exit(void)
 {
 {
+	kvmppc_mmu_hpte_sysexit();
 	kvm_exit();
 	kvm_exit();
 }
 }
 
 

+ 4 - 4
arch/powerpc/kvm/book3s_32_mmu.c

@@ -354,10 +354,10 @@ static int kvmppc_mmu_book3s_32_esid_to_vsid(struct kvm_vcpu *vcpu, ulong esid,
 		*vsid = VSID_REAL_DR | gvsid;
 		break;
 	case MSR_DR|MSR_IR:
-		if (!sr->valid)
-			return -1;
-
-		*vsid = sr->vsid;
+		if (sr->valid)
+			*vsid = sr->vsid;
+		else
+			*vsid = VSID_BAT | gvsid;
 		break;
 	default:
 		BUG();

+ 12 - 122
arch/powerpc/kvm/book3s_32_mmu_host.c

@@ -19,6 +19,7 @@
  */
  */
 
 
 #include <linux/kvm_host.h>
 #include <linux/kvm_host.h>
+#include <linux/hash.h>
 
 
 #include <asm/kvm_ppc.h>
 #include <asm/kvm_ppc.h>
 #include <asm/kvm_book3s.h>
 #include <asm/kvm_book3s.h>
@@ -57,139 +58,26 @@
 static ulong htab;
 static ulong htab;
 static u32 htabmask;
 static u32 htabmask;
 
 
-static void invalidate_pte(struct kvm_vcpu *vcpu, struct hpte_cache *pte)
+void kvmppc_mmu_invalidate_pte(struct kvm_vcpu *vcpu, struct hpte_cache *pte)
 {
 {
 	volatile u32 *pteg;
 	volatile u32 *pteg;
 
 
-	dprintk_mmu("KVM: Flushing SPTE: 0x%llx (0x%llx) -> 0x%llx\n",
-		    pte->pte.eaddr, pte->pte.vpage, pte->host_va);
-
+	/* Remove from host HTAB */
 	pteg = (u32*)pte->slot;
 	pteg = (u32*)pte->slot;
-
 	pteg[0] = 0;
 	pteg[0] = 0;
+
+	/* And make sure it's gone from the TLB too */
 	asm volatile ("sync");
 	asm volatile ("sync");
 	asm volatile ("tlbie %0" : : "r" (pte->pte.eaddr) : "memory");
 	asm volatile ("tlbie %0" : : "r" (pte->pte.eaddr) : "memory");
 	asm volatile ("sync");
 	asm volatile ("sync");
 	asm volatile ("tlbsync");
 	asm volatile ("tlbsync");
-
-	pte->host_va = 0;
-
-	if (pte->pte.may_write)
-		kvm_release_pfn_dirty(pte->pfn);
-	else
-		kvm_release_pfn_clean(pte->pfn);
-}
-
-void kvmppc_mmu_pte_flush(struct kvm_vcpu *vcpu, ulong guest_ea, ulong ea_mask)
-{
-	int i;
-
-	dprintk_mmu("KVM: Flushing %d Shadow PTEs: 0x%x & 0x%x\n",
-		    vcpu->arch.hpte_cache_offset, guest_ea, ea_mask);
-	BUG_ON(vcpu->arch.hpte_cache_offset > HPTEG_CACHE_NUM);
-
-	guest_ea &= ea_mask;
-	for (i = 0; i < vcpu->arch.hpte_cache_offset; i++) {
-		struct hpte_cache *pte;
-
-		pte = &vcpu->arch.hpte_cache[i];
-		if (!pte->host_va)
-			continue;
-
-		if ((pte->pte.eaddr & ea_mask) == guest_ea) {
-			invalidate_pte(vcpu, pte);
-		}
-	}
-
-	/* Doing a complete flush -> start from scratch */
-	if (!ea_mask)
-		vcpu->arch.hpte_cache_offset = 0;
-}
-
-void kvmppc_mmu_pte_vflush(struct kvm_vcpu *vcpu, u64 guest_vp, u64 vp_mask)
-{
-	int i;
-
-	dprintk_mmu("KVM: Flushing %d Shadow vPTEs: 0x%llx & 0x%llx\n",
-		    vcpu->arch.hpte_cache_offset, guest_vp, vp_mask);
-	BUG_ON(vcpu->arch.hpte_cache_offset > HPTEG_CACHE_NUM);
-
-	guest_vp &= vp_mask;
-	for (i = 0; i < vcpu->arch.hpte_cache_offset; i++) {
-		struct hpte_cache *pte;
-
-		pte = &vcpu->arch.hpte_cache[i];
-		if (!pte->host_va)
-			continue;
-
-		if ((pte->pte.vpage & vp_mask) == guest_vp) {
-			invalidate_pte(vcpu, pte);
-		}
-	}
-}
-
-void kvmppc_mmu_pte_pflush(struct kvm_vcpu *vcpu, ulong pa_start, ulong pa_end)
-{
-	int i;
-
-	dprintk_mmu("KVM: Flushing %d Shadow pPTEs: 0x%llx & 0x%llx\n",
-		    vcpu->arch.hpte_cache_offset, pa_start, pa_end);
-	BUG_ON(vcpu->arch.hpte_cache_offset > HPTEG_CACHE_NUM);
-
-	for (i = 0; i < vcpu->arch.hpte_cache_offset; i++) {
-		struct hpte_cache *pte;
-
-		pte = &vcpu->arch.hpte_cache[i];
-		if (!pte->host_va)
-			continue;
-
-		if ((pte->pte.raddr >= pa_start) &&
-		    (pte->pte.raddr < pa_end)) {
-			invalidate_pte(vcpu, pte);
-		}
-	}
-}
-
-struct kvmppc_pte *kvmppc_mmu_find_pte(struct kvm_vcpu *vcpu, u64 ea, bool data)
-{
-	int i;
-	u64 guest_vp;
-
-	guest_vp = vcpu->arch.mmu.ea_to_vp(vcpu, ea, false);
-	for (i=0; i<vcpu->arch.hpte_cache_offset; i++) {
-		struct hpte_cache *pte;
-
-		pte = &vcpu->arch.hpte_cache[i];
-		if (!pte->host_va)
-			continue;
-
-		if (pte->pte.vpage == guest_vp)
-			return &pte->pte;
-	}
-
-	return NULL;
-}
-
-static int kvmppc_mmu_hpte_cache_next(struct kvm_vcpu *vcpu)
-{
-	if (vcpu->arch.hpte_cache_offset == HPTEG_CACHE_NUM)
-		kvmppc_mmu_pte_flush(vcpu, 0, 0);
-
-	return vcpu->arch.hpte_cache_offset++;
 }
 }
 
 
 /* We keep 512 gvsid->hvsid entries, mapping the guest ones to the array using
 /* We keep 512 gvsid->hvsid entries, mapping the guest ones to the array using
  * a hash, so we don't waste cycles on looping */
  * a hash, so we don't waste cycles on looping */
 static u16 kvmppc_sid_hash(struct kvm_vcpu *vcpu, u64 gvsid)
 static u16 kvmppc_sid_hash(struct kvm_vcpu *vcpu, u64 gvsid)
 {
 {
-	return (u16)(((gvsid >> (SID_MAP_BITS * 7)) & SID_MAP_MASK) ^
-		     ((gvsid >> (SID_MAP_BITS * 6)) & SID_MAP_MASK) ^
-		     ((gvsid >> (SID_MAP_BITS * 5)) & SID_MAP_MASK) ^
-		     ((gvsid >> (SID_MAP_BITS * 4)) & SID_MAP_MASK) ^
-		     ((gvsid >> (SID_MAP_BITS * 3)) & SID_MAP_MASK) ^
-		     ((gvsid >> (SID_MAP_BITS * 2)) & SID_MAP_MASK) ^
-		     ((gvsid >> (SID_MAP_BITS * 1)) & SID_MAP_MASK) ^
-		     ((gvsid >> (SID_MAP_BITS * 0)) & SID_MAP_MASK));
+	return hash_64(gvsid, SID_MAP_BITS);
 }
 }
 
 
 
 
@@ -256,7 +144,6 @@ int kvmppc_mmu_map_page(struct kvm_vcpu *vcpu, struct kvmppc_pte *orig_pte)
 	register int rr = 0;
 	register int rr = 0;
 	bool primary = false;
 	bool primary = false;
 	bool evict = false;
 	bool evict = false;
-	int hpte_id;
 	struct hpte_cache *pte;
 	struct hpte_cache *pte;
 
 
 	/* Get host physical address for gpa */
 	/* Get host physical address for gpa */
@@ -341,8 +228,7 @@ next_pteg:
 
 
 	/* Now tell our Shadow PTE code about the new page */
 	/* Now tell our Shadow PTE code about the new page */
 
 
-	hpte_id = kvmppc_mmu_hpte_cache_next(vcpu);
-	pte = &vcpu->arch.hpte_cache[hpte_id];
+	pte = kvmppc_mmu_hpte_cache_next(vcpu);
 
 
 	dprintk_mmu("KVM: %c%c Map 0x%llx: [%lx] 0x%llx (0x%llx) -> %lx\n",
 	dprintk_mmu("KVM: %c%c Map 0x%llx: [%lx] 0x%llx (0x%llx) -> %lx\n",
 		    orig_pte->may_write ? 'w' : '-',
 		    orig_pte->may_write ? 'w' : '-',
@@ -355,6 +241,8 @@ next_pteg:
 	pte->pte = *orig_pte;
 	pte->pte = *orig_pte;
 	pte->pfn = hpaddr >> PAGE_SHIFT;
 	pte->pfn = hpaddr >> PAGE_SHIFT;
 
 
+	kvmppc_mmu_hpte_cache_map(vcpu, pte);
+
 	return 0;
 	return 0;
 }
 }
 
 
@@ -439,7 +327,7 @@ void kvmppc_mmu_flush_segments(struct kvm_vcpu *vcpu)
 
 
 void kvmppc_mmu_destroy(struct kvm_vcpu *vcpu)
 void kvmppc_mmu_destroy(struct kvm_vcpu *vcpu)
 {
 {
-	kvmppc_mmu_pte_flush(vcpu, 0, 0);
+	kvmppc_mmu_hpte_destroy(vcpu);
 	preempt_disable();
 	preempt_disable();
 	__destroy_context(to_book3s(vcpu)->context_id);
 	__destroy_context(to_book3s(vcpu)->context_id);
 	preempt_enable();
 	preempt_enable();
@@ -479,5 +367,7 @@ int kvmppc_mmu_init(struct kvm_vcpu *vcpu)
 	htabmask = ((sdr1 & 0x1FF) << 16) | 0xFFC0;
 	htabmask = ((sdr1 & 0x1FF) << 16) | 0xFFC0;
 	htab = (ulong)__va(sdr1 & 0xffff0000);
 	htab = (ulong)__va(sdr1 & 0xffff0000);
 
 
+	kvmppc_mmu_hpte_init(vcpu);
+
 	return 0;
 	return 0;
 }
 }

+ 9 - 120
arch/powerpc/kvm/book3s_64_mmu_host.c

@@ -20,6 +20,7 @@
  */
  */
 
 
 #include <linux/kvm_host.h>
 #include <linux/kvm_host.h>
+#include <linux/hash.h>
 
 
 #include <asm/kvm_ppc.h>
 #include <asm/kvm_ppc.h>
 #include <asm/kvm_book3s.h>
 #include <asm/kvm_book3s.h>
@@ -46,135 +47,20 @@
 #define dprintk_slb(a, ...) do { } while(0)
 #define dprintk_slb(a, ...) do { } while(0)
 #endif
 #endif
 
 
-static void invalidate_pte(struct hpte_cache *pte)
+void kvmppc_mmu_invalidate_pte(struct kvm_vcpu *vcpu, struct hpte_cache *pte)
 {
 {
-	dprintk_mmu("KVM: Flushing SPT: 0x%lx (0x%llx) -> 0x%llx\n",
-		    pte->pte.eaddr, pte->pte.vpage, pte->host_va);
-
 	ppc_md.hpte_invalidate(pte->slot, pte->host_va,
 	ppc_md.hpte_invalidate(pte->slot, pte->host_va,
 			       MMU_PAGE_4K, MMU_SEGSIZE_256M,
 			       MMU_PAGE_4K, MMU_SEGSIZE_256M,
 			       false);
 			       false);
-	pte->host_va = 0;
-
-	if (pte->pte.may_write)
-		kvm_release_pfn_dirty(pte->pfn);
-	else
-		kvm_release_pfn_clean(pte->pfn);
-}
-
-void kvmppc_mmu_pte_flush(struct kvm_vcpu *vcpu, ulong guest_ea, ulong ea_mask)
-{
-	int i;
-
-	dprintk_mmu("KVM: Flushing %d Shadow PTEs: 0x%lx & 0x%lx\n",
-		    vcpu->arch.hpte_cache_offset, guest_ea, ea_mask);
-	BUG_ON(vcpu->arch.hpte_cache_offset > HPTEG_CACHE_NUM);
-
-	guest_ea &= ea_mask;
-	for (i = 0; i < vcpu->arch.hpte_cache_offset; i++) {
-		struct hpte_cache *pte;
-
-		pte = &vcpu->arch.hpte_cache[i];
-		if (!pte->host_va)
-			continue;
-
-		if ((pte->pte.eaddr & ea_mask) == guest_ea) {
-			invalidate_pte(pte);
-		}
-	}
-
-	/* Doing a complete flush -> start from scratch */
-	if (!ea_mask)
-		vcpu->arch.hpte_cache_offset = 0;
-}
-
-void kvmppc_mmu_pte_vflush(struct kvm_vcpu *vcpu, u64 guest_vp, u64 vp_mask)
-{
-	int i;
-
-	dprintk_mmu("KVM: Flushing %d Shadow vPTEs: 0x%llx & 0x%llx\n",
-		    vcpu->arch.hpte_cache_offset, guest_vp, vp_mask);
-	BUG_ON(vcpu->arch.hpte_cache_offset > HPTEG_CACHE_NUM);
-
-	guest_vp &= vp_mask;
-	for (i = 0; i < vcpu->arch.hpte_cache_offset; i++) {
-		struct hpte_cache *pte;
-
-		pte = &vcpu->arch.hpte_cache[i];
-		if (!pte->host_va)
-			continue;
-
-		if ((pte->pte.vpage & vp_mask) == guest_vp) {
-			invalidate_pte(pte);
-		}
-	}
-}
-
-void kvmppc_mmu_pte_pflush(struct kvm_vcpu *vcpu, ulong pa_start, ulong pa_end)
-{
-	int i;
-
-	dprintk_mmu("KVM: Flushing %d Shadow pPTEs: 0x%lx & 0x%lx\n",
-		    vcpu->arch.hpte_cache_offset, pa_start, pa_end);
-	BUG_ON(vcpu->arch.hpte_cache_offset > HPTEG_CACHE_NUM);
-
-	for (i = 0; i < vcpu->arch.hpte_cache_offset; i++) {
-		struct hpte_cache *pte;
-
-		pte = &vcpu->arch.hpte_cache[i];
-		if (!pte->host_va)
-			continue;
-
-		if ((pte->pte.raddr >= pa_start) &&
-		    (pte->pte.raddr < pa_end)) {
-			invalidate_pte(pte);
-		}
-	}
-}
-
-struct kvmppc_pte *kvmppc_mmu_find_pte(struct kvm_vcpu *vcpu, u64 ea, bool data)
-{
-	int i;
-	u64 guest_vp;
-
-	guest_vp = vcpu->arch.mmu.ea_to_vp(vcpu, ea, false);
-	for (i=0; i<vcpu->arch.hpte_cache_offset; i++) {
-		struct hpte_cache *pte;
-
-		pte = &vcpu->arch.hpte_cache[i];
-		if (!pte->host_va)
-			continue;
-
-		if (pte->pte.vpage == guest_vp)
-			return &pte->pte;
-	}
-
-	return NULL;
-}
-
-static int kvmppc_mmu_hpte_cache_next(struct kvm_vcpu *vcpu)
-{
-	if (vcpu->arch.hpte_cache_offset == HPTEG_CACHE_NUM)
-		kvmppc_mmu_pte_flush(vcpu, 0, 0);
-
-	return vcpu->arch.hpte_cache_offset++;
 }
 }
 
 
 /* We keep 512 gvsid->hvsid entries, mapping the guest ones to the array using
 /* We keep 512 gvsid->hvsid entries, mapping the guest ones to the array using
  * a hash, so we don't waste cycles on looping */
  * a hash, so we don't waste cycles on looping */
 static u16 kvmppc_sid_hash(struct kvm_vcpu *vcpu, u64 gvsid)
 static u16 kvmppc_sid_hash(struct kvm_vcpu *vcpu, u64 gvsid)
 {
 {
-	return (u16)(((gvsid >> (SID_MAP_BITS * 7)) & SID_MAP_MASK) ^
-		     ((gvsid >> (SID_MAP_BITS * 6)) & SID_MAP_MASK) ^
-		     ((gvsid >> (SID_MAP_BITS * 5)) & SID_MAP_MASK) ^
-		     ((gvsid >> (SID_MAP_BITS * 4)) & SID_MAP_MASK) ^
-		     ((gvsid >> (SID_MAP_BITS * 3)) & SID_MAP_MASK) ^
-		     ((gvsid >> (SID_MAP_BITS * 2)) & SID_MAP_MASK) ^
-		     ((gvsid >> (SID_MAP_BITS * 1)) & SID_MAP_MASK) ^
-		     ((gvsid >> (SID_MAP_BITS * 0)) & SID_MAP_MASK));
+	return hash_64(gvsid, SID_MAP_BITS);
 }
 }
 
 
-
 static struct kvmppc_sid_map *find_sid_vsid(struct kvm_vcpu *vcpu, u64 gvsid)
 static struct kvmppc_sid_map *find_sid_vsid(struct kvm_vcpu *vcpu, u64 gvsid)
 {
 {
 	struct kvmppc_sid_map *map;
 	struct kvmppc_sid_map *map;
@@ -273,8 +159,7 @@ map_again:
 		attempt++;
 		goto map_again;
 	} else {
-		int hpte_id = kvmppc_mmu_hpte_cache_next(vcpu);
-		struct hpte_cache *pte = &vcpu->arch.hpte_cache[hpte_id];
+		struct hpte_cache *pte = kvmppc_mmu_hpte_cache_next(vcpu);
 
 		dprintk_mmu("KVM: %c%c Map 0x%lx: [%lx] 0x%lx (0x%llx) -> %lx\n",
 			    ((rflags & HPTE_R_PP) == 3) ? '-' : 'w',
@@ -292,6 +177,8 @@ map_again:
 		pte->host_va = va;
 		pte->pte = *orig_pte;
 		pte->pfn = hpaddr >> PAGE_SHIFT;
+
+		kvmppc_mmu_hpte_cache_map(vcpu, pte);
 	}
 
 	return 0;
@@ -418,7 +305,7 @@ void kvmppc_mmu_flush_segments(struct kvm_vcpu *vcpu)
 
 void kvmppc_mmu_destroy(struct kvm_vcpu *vcpu)
 {
-	kvmppc_mmu_pte_flush(vcpu, 0, 0);
+	kvmppc_mmu_hpte_destroy(vcpu);
 	__destroy_context(to_book3s(vcpu)->context_id);
 }
 
@@ -436,5 +323,7 @@ int kvmppc_mmu_init(struct kvm_vcpu *vcpu)
 	vcpu3s->vsid_first = vcpu3s->context_id << USER_ESID_BITS;
 	vcpu3s->vsid_next = vcpu3s->vsid_first;
 
+	kvmppc_mmu_hpte_init(vcpu);
+
 	return 0;
 }
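
Editor's note: the hunk above replaces the open-coded XOR fold with the generic hash_64() from <linux/hash.h>. The standalone sketch below (not part of the patch) contrasts the two hashing styles; SID_MAP_BITS is assumed to be 9 to match the 512-entry map mentioned in the comment, and the multiplier is an illustrative constant rather than the kernel's exact value.

#include <stdint.h>
#include <stdio.h>

#define SID_MAP_BITS 9
#define SID_MAP_MASK ((1u << SID_MAP_BITS) - 1)

/* Old scheme: XOR-fold eight SID_MAP_BITS-wide slices of the 64-bit gvsid. */
static uint16_t sid_hash_xor_fold(uint64_t gvsid)
{
	uint16_t h = 0;
	int i;

	for (i = 0; i < 8; i++)
		h ^= (gvsid >> (SID_MAP_BITS * i)) & SID_MAP_MASK;
	return h;
}

/* New scheme: one multiply, keep the top SID_MAP_BITS bits (hash_64() style). */
static uint16_t sid_hash_multiplicative(uint64_t gvsid)
{
	const uint64_t mult = 0x9e3779b97f4a7c15ULL;	/* illustrative constant */

	return (uint16_t)((gvsid * mult) >> (64 - SID_MAP_BITS));
}

int main(void)
{
	uint64_t gvsid = 0x0123456789abcdefULL;

	printf("xor fold: %u, multiplicative: %u\n",
	       (unsigned)sid_hash_xor_fold(gvsid),
	       (unsigned)sid_hash_multiplicative(gvsid));
	return 0;
}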

+ 277 - 0
arch/powerpc/kvm/book3s_mmu_hpte.c

@@ -0,0 +1,277 @@
+/*
+ * Copyright (C) 2010 SUSE Linux Products GmbH. All rights reserved.
+ *
+ * Authors:
+ *     Alexander Graf <agraf@suse.de>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License, version 2, as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
+ */
+
+#include <linux/kvm_host.h>
+#include <linux/hash.h>
+#include <linux/slab.h>
+
+#include <asm/kvm_ppc.h>
+#include <asm/kvm_book3s.h>
+#include <asm/machdep.h>
+#include <asm/mmu_context.h>
+#include <asm/hw_irq.h>
+
+#define PTE_SIZE	12
+
+/* #define DEBUG_MMU */
+
+#ifdef DEBUG_MMU
+#define dprintk_mmu(a, ...) printk(KERN_INFO a, __VA_ARGS__)
+#else
+#define dprintk_mmu(a, ...) do { } while(0)
+#endif
+
+static struct kmem_cache *hpte_cache;
+
+static inline u64 kvmppc_mmu_hash_pte(u64 eaddr)
+{
+	return hash_64(eaddr >> PTE_SIZE, HPTEG_HASH_BITS_PTE);
+}
+
+static inline u64 kvmppc_mmu_hash_vpte(u64 vpage)
+{
+	return hash_64(vpage & 0xfffffffffULL, HPTEG_HASH_BITS_VPTE);
+}
+
+static inline u64 kvmppc_mmu_hash_vpte_long(u64 vpage)
+{
+	return hash_64((vpage & 0xffffff000ULL) >> 12,
+		       HPTEG_HASH_BITS_VPTE_LONG);
+}
+
+void kvmppc_mmu_hpte_cache_map(struct kvm_vcpu *vcpu, struct hpte_cache *pte)
+{
+	u64 index;
+
+	/* Add to ePTE list */
+	index = kvmppc_mmu_hash_pte(pte->pte.eaddr);
+	hlist_add_head(&pte->list_pte, &vcpu->arch.hpte_hash_pte[index]);
+
+	/* Add to vPTE list */
+	index = kvmppc_mmu_hash_vpte(pte->pte.vpage);
+	hlist_add_head(&pte->list_vpte, &vcpu->arch.hpte_hash_vpte[index]);
+
+	/* Add to vPTE_long list */
+	index = kvmppc_mmu_hash_vpte_long(pte->pte.vpage);
+	hlist_add_head(&pte->list_vpte_long,
+		       &vcpu->arch.hpte_hash_vpte_long[index]);
+}
+
+static void invalidate_pte(struct kvm_vcpu *vcpu, struct hpte_cache *pte)
+{
+	dprintk_mmu("KVM: Flushing SPT: 0x%lx (0x%llx) -> 0x%llx\n",
+		    pte->pte.eaddr, pte->pte.vpage, pte->host_va);
+
+	/* Different for 32 and 64 bit */
+	kvmppc_mmu_invalidate_pte(vcpu, pte);
+
+	if (pte->pte.may_write)
+		kvm_release_pfn_dirty(pte->pfn);
+	else
+		kvm_release_pfn_clean(pte->pfn);
+
+	hlist_del(&pte->list_pte);
+	hlist_del(&pte->list_vpte);
+	hlist_del(&pte->list_vpte_long);
+
+	vcpu->arch.hpte_cache_count--;
+	kmem_cache_free(hpte_cache, pte);
+}
+
+static void kvmppc_mmu_pte_flush_all(struct kvm_vcpu *vcpu)
+{
+	struct hpte_cache *pte;
+	struct hlist_node *node, *tmp;
+	int i;
+
+	for (i = 0; i < HPTEG_HASH_NUM_VPTE_LONG; i++) {
+		struct hlist_head *list = &vcpu->arch.hpte_hash_vpte_long[i];
+
+		hlist_for_each_entry_safe(pte, node, tmp, list, list_vpte_long)
+			invalidate_pte(vcpu, pte);
+	}
+}
+
+static void kvmppc_mmu_pte_flush_page(struct kvm_vcpu *vcpu, ulong guest_ea)
+{
+	struct hlist_head *list;
+	struct hlist_node *node, *tmp;
+	struct hpte_cache *pte;
+
+	/* Find the list of entries in the map */
+	list = &vcpu->arch.hpte_hash_pte[kvmppc_mmu_hash_pte(guest_ea)];
+
+	/* Check the list for matching entries and invalidate */
+	hlist_for_each_entry_safe(pte, node, tmp, list, list_pte)
+		if ((pte->pte.eaddr & ~0xfffUL) == guest_ea)
+			invalidate_pte(vcpu, pte);
+}
+
+void kvmppc_mmu_pte_flush(struct kvm_vcpu *vcpu, ulong guest_ea, ulong ea_mask)
+{
+	u64 i;
+
+	dprintk_mmu("KVM: Flushing %d Shadow PTEs: 0x%lx & 0x%lx\n",
+		    vcpu->arch.hpte_cache_count, guest_ea, ea_mask);
+
+	guest_ea &= ea_mask;
+
+	switch (ea_mask) {
+	case ~0xfffUL:
+		kvmppc_mmu_pte_flush_page(vcpu, guest_ea);
+		break;
+	case 0x0ffff000:
+		/* 32-bit flush w/o segment, go through all possible segments */
+		for (i = 0; i < 0x100000000ULL; i += 0x10000000ULL)
+			kvmppc_mmu_pte_flush(vcpu, guest_ea | i, ~0xfffUL);
+		break;
+	case 0:
+		/* Doing a complete flush -> start from scratch */
+		kvmppc_mmu_pte_flush_all(vcpu);
+		break;
+	default:
+		WARN_ON(1);
+		break;
+	}
+}
+
+/* Flush with mask 0xfffffffff */
+static void kvmppc_mmu_pte_vflush_short(struct kvm_vcpu *vcpu, u64 guest_vp)
+{
+	struct hlist_head *list;
+	struct hlist_node *node, *tmp;
+	struct hpte_cache *pte;
+	u64 vp_mask = 0xfffffffffULL;
+
+	list = &vcpu->arch.hpte_hash_vpte[kvmppc_mmu_hash_vpte(guest_vp)];
+
+	/* Check the list for matching entries and invalidate */
+	hlist_for_each_entry_safe(pte, node, tmp, list, list_vpte)
+		if ((pte->pte.vpage & vp_mask) == guest_vp)
+			invalidate_pte(vcpu, pte);
+}
+
+/* Flush with mask 0xffffff000 */
+static void kvmppc_mmu_pte_vflush_long(struct kvm_vcpu *vcpu, u64 guest_vp)
+{
+	struct hlist_head *list;
+	struct hlist_node *node, *tmp;
+	struct hpte_cache *pte;
+	u64 vp_mask = 0xffffff000ULL;
+
+	list = &vcpu->arch.hpte_hash_vpte_long[
+		kvmppc_mmu_hash_vpte_long(guest_vp)];
+
+	/* Check the list for matching entries and invalidate */
+	hlist_for_each_entry_safe(pte, node, tmp, list, list_vpte_long)
+		if ((pte->pte.vpage & vp_mask) == guest_vp)
+			invalidate_pte(vcpu, pte);
+}
+
+void kvmppc_mmu_pte_vflush(struct kvm_vcpu *vcpu, u64 guest_vp, u64 vp_mask)
+{
+	dprintk_mmu("KVM: Flushing %d Shadow vPTEs: 0x%llx & 0x%llx\n",
+		    vcpu->arch.hpte_cache_count, guest_vp, vp_mask);
+	guest_vp &= vp_mask;
+
+	switch(vp_mask) {
+	case 0xfffffffffULL:
+		kvmppc_mmu_pte_vflush_short(vcpu, guest_vp);
+		break;
+	case 0xffffff000ULL:
+		kvmppc_mmu_pte_vflush_long(vcpu, guest_vp);
+		break;
+	default:
+		WARN_ON(1);
+		return;
+	}
+}
+
+void kvmppc_mmu_pte_pflush(struct kvm_vcpu *vcpu, ulong pa_start, ulong pa_end)
+{
+	struct hlist_node *node, *tmp;
+	struct hpte_cache *pte;
+	int i;
+
+	dprintk_mmu("KVM: Flushing %d Shadow pPTEs: 0x%lx - 0x%lx\n",
+		    vcpu->arch.hpte_cache_count, pa_start, pa_end);
+
+	for (i = 0; i < HPTEG_HASH_NUM_VPTE_LONG; i++) {
+		struct hlist_head *list = &vcpu->arch.hpte_hash_vpte_long[i];
+
+		hlist_for_each_entry_safe(pte, node, tmp, list, list_vpte_long)
+			if ((pte->pte.raddr >= pa_start) &&
+			    (pte->pte.raddr < pa_end))
+				invalidate_pte(vcpu, pte);
+	}
+}
+
+struct hpte_cache *kvmppc_mmu_hpte_cache_next(struct kvm_vcpu *vcpu)
+{
+	struct hpte_cache *pte;
+
+	pte = kmem_cache_zalloc(hpte_cache, GFP_KERNEL);
+	vcpu->arch.hpte_cache_count++;
+
+	if (vcpu->arch.hpte_cache_count == HPTEG_CACHE_NUM)
+		kvmppc_mmu_pte_flush_all(vcpu);
+
+	return pte;
+}
+
+void kvmppc_mmu_hpte_destroy(struct kvm_vcpu *vcpu)
+{
+	kvmppc_mmu_pte_flush(vcpu, 0, 0);
+}
+
+static void kvmppc_mmu_hpte_init_hash(struct hlist_head *hash_list, int len)
+{
+	int i;
+
+	for (i = 0; i < len; i++)
+		INIT_HLIST_HEAD(&hash_list[i]);
+}
+
+int kvmppc_mmu_hpte_init(struct kvm_vcpu *vcpu)
+{
+	/* init hpte lookup hashes */
+	kvmppc_mmu_hpte_init_hash(vcpu->arch.hpte_hash_pte,
+				  ARRAY_SIZE(vcpu->arch.hpte_hash_pte));
+	kvmppc_mmu_hpte_init_hash(vcpu->arch.hpte_hash_vpte,
+				  ARRAY_SIZE(vcpu->arch.hpte_hash_vpte));
+	kvmppc_mmu_hpte_init_hash(vcpu->arch.hpte_hash_vpte_long,
+				  ARRAY_SIZE(vcpu->arch.hpte_hash_vpte_long));
+
+	return 0;
+}
+
+int kvmppc_mmu_hpte_sysinit(void)
+{
+	/* init hpte slab cache */
+	hpte_cache = kmem_cache_create("kvm-spt", sizeof(struct hpte_cache),
+				       sizeof(struct hpte_cache), 0, NULL);
+
+	return 0;
+}
+
+void kvmppc_mmu_hpte_sysexit(void)
+{
+	kmem_cache_destroy(hpte_cache);
+}
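
Editor's note: the new file above links every shadowed guest HPTE into several hash tables at once (by effective address, by virtual page, and by a coarser virtual-page key), so that each kind of flush walks one short bucket instead of the whole cache. The self-contained sketch below only illustrates that indexing idea; the bucket counts, key splits and names are made up and are not the kernel's HPTEG_HASH_BITS_* values.

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

#define EA_BUCKETS 16	/* illustrative sizes only */
#define VP_BUCKETS 16

struct hpte_sketch {
	uint64_t eaddr;			/* guest effective address */
	uint64_t vpage;			/* guest virtual page number */
	struct hpte_sketch *next_ea;	/* link in the EA-indexed table */
	struct hpte_sketch *next_vp;	/* link in the VP-indexed table */
};

static struct hpte_sketch *ea_hash[EA_BUCKETS];
static struct hpte_sketch *vp_hash[VP_BUCKETS];

static unsigned int hash_ea(uint64_t ea) { return (ea >> 12) & (EA_BUCKETS - 1); }
static unsigned int hash_vp(uint64_t vp) { return vp & (VP_BUCKETS - 1); }

/* Rough analogue of kvmppc_mmu_hpte_cache_map(): one entry, several index lists. */
static void cache_map(struct hpte_sketch *pte)
{
	unsigned int e = hash_ea(pte->eaddr);
	unsigned int v = hash_vp(pte->vpage);

	pte->next_ea = ea_hash[e];
	ea_hash[e] = pte;
	pte->next_vp = vp_hash[v];
	vp_hash[v] = pte;
}

/* Flush by effective address: only the matching EA bucket is walked. */
static void flush_ea(uint64_t guest_ea)
{
	struct hpte_sketch *pte;

	for (pte = ea_hash[hash_ea(guest_ea)]; pte; pte = pte->next_ea)
		if ((pte->eaddr & ~0xfffULL) == guest_ea)
			printf("would invalidate EA 0x%llx (vpage 0x%llx)\n",
			       (unsigned long long)pte->eaddr,
			       (unsigned long long)pte->vpage);
}

int main(void)
{
	struct hpte_sketch *pte = calloc(1, sizeof(*pte));

	pte->eaddr = 0x4000;
	pte->vpage = 0x4;
	cache_map(pte);
	flush_ea(0x4000);
	free(pte);
	return 0;
}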

+ 37 - 57
arch/powerpc/kvm/book3s_paired_singles.c

@@ -159,10 +159,7 @@
 
 
 static inline void kvmppc_sync_qpr(struct kvm_vcpu *vcpu, int rt)
 static inline void kvmppc_sync_qpr(struct kvm_vcpu *vcpu, int rt)
 {
 {
-	struct thread_struct t;
-
-	t.fpscr.val = vcpu->arch.fpscr;
-	cvt_df((double*)&vcpu->arch.fpr[rt], (float*)&vcpu->arch.qpr[rt], &t);
+	kvm_cvt_df(&vcpu->arch.fpr[rt], &vcpu->arch.qpr[rt], &vcpu->arch.fpscr);
 }
 }
 
 
 static void kvmppc_inject_pf(struct kvm_vcpu *vcpu, ulong eaddr, bool is_store)
 static void kvmppc_inject_pf(struct kvm_vcpu *vcpu, ulong eaddr, bool is_store)
@@ -183,7 +180,6 @@ static int kvmppc_emulate_fpr_load(struct kvm_run *run, struct kvm_vcpu *vcpu,
 				   int rs, ulong addr, int ls_type)
 				   int rs, ulong addr, int ls_type)
 {
 {
 	int emulated = EMULATE_FAIL;
 	int emulated = EMULATE_FAIL;
-	struct thread_struct t;
 	int r;
 	int r;
 	char tmp[8];
 	char tmp[8];
 	int len = sizeof(u32);
 	int len = sizeof(u32);
@@ -191,8 +187,6 @@ static int kvmppc_emulate_fpr_load(struct kvm_run *run, struct kvm_vcpu *vcpu,
 	if (ls_type == FPU_LS_DOUBLE)
 	if (ls_type == FPU_LS_DOUBLE)
 		len = sizeof(u64);
 		len = sizeof(u64);
 
 
-	t.fpscr.val = vcpu->arch.fpscr;
-
 	/* read from memory */
 	/* read from memory */
 	r = kvmppc_ld(vcpu, &addr, len, tmp, true);
 	r = kvmppc_ld(vcpu, &addr, len, tmp, true);
 	vcpu->arch.paddr_accessed = addr;
 	vcpu->arch.paddr_accessed = addr;
@@ -210,7 +204,7 @@ static int kvmppc_emulate_fpr_load(struct kvm_run *run, struct kvm_vcpu *vcpu,
 	/* put in registers */
 	/* put in registers */
 	switch (ls_type) {
 	switch (ls_type) {
 	case FPU_LS_SINGLE:
 	case FPU_LS_SINGLE:
-		cvt_fd((float*)tmp, (double*)&vcpu->arch.fpr[rs], &t);
+		kvm_cvt_fd((u32*)tmp, &vcpu->arch.fpr[rs], &vcpu->arch.fpscr);
 		vcpu->arch.qpr[rs] = *((u32*)tmp);
 		vcpu->arch.qpr[rs] = *((u32*)tmp);
 		break;
 		break;
 	case FPU_LS_DOUBLE:
 	case FPU_LS_DOUBLE:
@@ -229,17 +223,14 @@ static int kvmppc_emulate_fpr_store(struct kvm_run *run, struct kvm_vcpu *vcpu,
 				    int rs, ulong addr, int ls_type)
 				    int rs, ulong addr, int ls_type)
 {
 {
 	int emulated = EMULATE_FAIL;
 	int emulated = EMULATE_FAIL;
-	struct thread_struct t;
 	int r;
 	int r;
 	char tmp[8];
 	char tmp[8];
 	u64 val;
 	u64 val;
 	int len;
 	int len;
 
 
-	t.fpscr.val = vcpu->arch.fpscr;
-
 	switch (ls_type) {
 	switch (ls_type) {
 	case FPU_LS_SINGLE:
 	case FPU_LS_SINGLE:
-		cvt_df((double*)&vcpu->arch.fpr[rs], (float*)tmp, &t);
+		kvm_cvt_df(&vcpu->arch.fpr[rs], (u32*)tmp, &vcpu->arch.fpscr);
 		val = *((u32*)tmp);
 		val = *((u32*)tmp);
 		len = sizeof(u32);
 		len = sizeof(u32);
 		break;
 		break;
@@ -278,13 +269,10 @@ static int kvmppc_emulate_psq_load(struct kvm_run *run, struct kvm_vcpu *vcpu,
 				   int rs, ulong addr, bool w, int i)
 				   int rs, ulong addr, bool w, int i)
 {
 {
 	int emulated = EMULATE_FAIL;
 	int emulated = EMULATE_FAIL;
-	struct thread_struct t;
 	int r;
 	int r;
 	float one = 1.0;
 	float one = 1.0;
 	u32 tmp[2];
 	u32 tmp[2];
 
 
-	t.fpscr.val = vcpu->arch.fpscr;
-
 	/* read from memory */
 	/* read from memory */
 	if (w) {
 	if (w) {
 		r = kvmppc_ld(vcpu, &addr, sizeof(u32), tmp, true);
 		r = kvmppc_ld(vcpu, &addr, sizeof(u32), tmp, true);
@@ -308,7 +296,7 @@ static int kvmppc_emulate_psq_load(struct kvm_run *run, struct kvm_vcpu *vcpu,
 	emulated = EMULATE_DONE;
 	emulated = EMULATE_DONE;
 
 
 	/* put in registers */
 	/* put in registers */
-	cvt_fd((float*)&tmp[0], (double*)&vcpu->arch.fpr[rs], &t);
+	kvm_cvt_fd(&tmp[0], &vcpu->arch.fpr[rs], &vcpu->arch.fpscr);
 	vcpu->arch.qpr[rs] = tmp[1];
 	vcpu->arch.qpr[rs] = tmp[1];
 
 
 	dprintk(KERN_INFO "KVM: PSQ_LD [0x%x, 0x%x] at 0x%lx (%d)\n", tmp[0],
 	dprintk(KERN_INFO "KVM: PSQ_LD [0x%x, 0x%x] at 0x%lx (%d)\n", tmp[0],
@@ -322,14 +310,11 @@ static int kvmppc_emulate_psq_store(struct kvm_run *run, struct kvm_vcpu *vcpu,
 				    int rs, ulong addr, bool w, int i)
 				    int rs, ulong addr, bool w, int i)
 {
 {
 	int emulated = EMULATE_FAIL;
 	int emulated = EMULATE_FAIL;
-	struct thread_struct t;
 	int r;
 	int r;
 	u32 tmp[2];
 	u32 tmp[2];
 	int len = w ? sizeof(u32) : sizeof(u64);
 	int len = w ? sizeof(u32) : sizeof(u64);
 
 
-	t.fpscr.val = vcpu->arch.fpscr;
-
-	cvt_df((double*)&vcpu->arch.fpr[rs], (float*)&tmp[0], &t);
+	kvm_cvt_df(&vcpu->arch.fpr[rs], &tmp[0], &vcpu->arch.fpscr);
 	tmp[1] = vcpu->arch.qpr[rs];
 	tmp[1] = vcpu->arch.qpr[rs];
 
 
 	r = kvmppc_st(vcpu, &addr, len, tmp, true);
 	r = kvmppc_st(vcpu, &addr, len, tmp, true);
@@ -517,7 +502,7 @@ static int get_d_signext(u32 inst)
 static int kvmppc_ps_three_in(struct kvm_vcpu *vcpu, bool rc,
 static int kvmppc_ps_three_in(struct kvm_vcpu *vcpu, bool rc,
 				      int reg_out, int reg_in1, int reg_in2,
 				      int reg_out, int reg_in1, int reg_in2,
 				      int reg_in3, int scalar,
 				      int reg_in3, int scalar,
-				      void (*func)(struct thread_struct *t,
+				      void (*func)(u64 *fpscr,
 						 u32 *dst, u32 *src1,
 						 u32 *dst, u32 *src1,
 						 u32 *src2, u32 *src3))
 						 u32 *src2, u32 *src3))
 {
 {
@@ -526,27 +511,25 @@ static int kvmppc_ps_three_in(struct kvm_vcpu *vcpu, bool rc,
 	u32 ps0_out;
 	u32 ps0_out;
 	u32 ps0_in1, ps0_in2, ps0_in3;
 	u32 ps0_in1, ps0_in2, ps0_in3;
 	u32 ps1_in1, ps1_in2, ps1_in3;
 	u32 ps1_in1, ps1_in2, ps1_in3;
-	struct thread_struct t;
-	t.fpscr.val = vcpu->arch.fpscr;
 
 
 	/* RC */
 	/* RC */
 	WARN_ON(rc);
 	WARN_ON(rc);
 
 
 	/* PS0 */
 	/* PS0 */
-	cvt_df((double*)&fpr[reg_in1], (float*)&ps0_in1, &t);
-	cvt_df((double*)&fpr[reg_in2], (float*)&ps0_in2, &t);
-	cvt_df((double*)&fpr[reg_in3], (float*)&ps0_in3, &t);
+	kvm_cvt_df(&fpr[reg_in1], &ps0_in1, &vcpu->arch.fpscr);
+	kvm_cvt_df(&fpr[reg_in2], &ps0_in2, &vcpu->arch.fpscr);
+	kvm_cvt_df(&fpr[reg_in3], &ps0_in3, &vcpu->arch.fpscr);
 
 
 	if (scalar & SCALAR_LOW)
 	if (scalar & SCALAR_LOW)
 		ps0_in2 = qpr[reg_in2];
 		ps0_in2 = qpr[reg_in2];
 
 
-	func(&t, &ps0_out, &ps0_in1, &ps0_in2, &ps0_in3);
+	func(&vcpu->arch.fpscr, &ps0_out, &ps0_in1, &ps0_in2, &ps0_in3);
 
 
 	dprintk(KERN_INFO "PS3 ps0 -> f(0x%x, 0x%x, 0x%x) = 0x%x\n",
 	dprintk(KERN_INFO "PS3 ps0 -> f(0x%x, 0x%x, 0x%x) = 0x%x\n",
 			  ps0_in1, ps0_in2, ps0_in3, ps0_out);
 			  ps0_in1, ps0_in2, ps0_in3, ps0_out);
 
 
 	if (!(scalar & SCALAR_NO_PS0))
 	if (!(scalar & SCALAR_NO_PS0))
-		cvt_fd((float*)&ps0_out, (double*)&fpr[reg_out], &t);
+		kvm_cvt_fd(&ps0_out, &fpr[reg_out], &vcpu->arch.fpscr);
 
 
 	/* PS1 */
 	/* PS1 */
 	ps1_in1 = qpr[reg_in1];
 	ps1_in1 = qpr[reg_in1];
@@ -557,7 +540,7 @@ static int kvmppc_ps_three_in(struct kvm_vcpu *vcpu, bool rc,
 		ps1_in2 = ps0_in2;
 		ps1_in2 = ps0_in2;
 
 
 	if (!(scalar & SCALAR_NO_PS1))
 	if (!(scalar & SCALAR_NO_PS1))
-		func(&t, &qpr[reg_out], &ps1_in1, &ps1_in2, &ps1_in3);
+		func(&vcpu->arch.fpscr, &qpr[reg_out], &ps1_in1, &ps1_in2, &ps1_in3);
 
 
 	dprintk(KERN_INFO "PS3 ps1 -> f(0x%x, 0x%x, 0x%x) = 0x%x\n",
 	dprintk(KERN_INFO "PS3 ps1 -> f(0x%x, 0x%x, 0x%x) = 0x%x\n",
 			  ps1_in1, ps1_in2, ps1_in3, qpr[reg_out]);
 			  ps1_in1, ps1_in2, ps1_in3, qpr[reg_out]);
@@ -568,7 +551,7 @@ static int kvmppc_ps_three_in(struct kvm_vcpu *vcpu, bool rc,
 static int kvmppc_ps_two_in(struct kvm_vcpu *vcpu, bool rc,
 static int kvmppc_ps_two_in(struct kvm_vcpu *vcpu, bool rc,
 				    int reg_out, int reg_in1, int reg_in2,
 				    int reg_out, int reg_in1, int reg_in2,
 				    int scalar,
 				    int scalar,
-				    void (*func)(struct thread_struct *t,
+				    void (*func)(u64 *fpscr,
 						 u32 *dst, u32 *src1,
 						 u32 *dst, u32 *src1,
 						 u32 *src2))
 						 u32 *src2))
 {
 {
@@ -578,27 +561,25 @@ static int kvmppc_ps_two_in(struct kvm_vcpu *vcpu, bool rc,
 	u32 ps0_in1, ps0_in2;
 	u32 ps0_in1, ps0_in2;
 	u32 ps1_out;
 	u32 ps1_out;
 	u32 ps1_in1, ps1_in2;
 	u32 ps1_in1, ps1_in2;
-	struct thread_struct t;
-	t.fpscr.val = vcpu->arch.fpscr;
 
 
 	/* RC */
 	/* RC */
 	WARN_ON(rc);
 	WARN_ON(rc);
 
 
 	/* PS0 */
 	/* PS0 */
-	cvt_df((double*)&fpr[reg_in1], (float*)&ps0_in1, &t);
+	kvm_cvt_df(&fpr[reg_in1], &ps0_in1, &vcpu->arch.fpscr);
 
 
 	if (scalar & SCALAR_LOW)
 	if (scalar & SCALAR_LOW)
 		ps0_in2 = qpr[reg_in2];
 		ps0_in2 = qpr[reg_in2];
 	else
 	else
-		cvt_df((double*)&fpr[reg_in2], (float*)&ps0_in2, &t);
+		kvm_cvt_df(&fpr[reg_in2], &ps0_in2, &vcpu->arch.fpscr);
 
 
-	func(&t, &ps0_out, &ps0_in1, &ps0_in2);
+	func(&vcpu->arch.fpscr, &ps0_out, &ps0_in1, &ps0_in2);
 
 
 	if (!(scalar & SCALAR_NO_PS0)) {
 	if (!(scalar & SCALAR_NO_PS0)) {
 		dprintk(KERN_INFO "PS2 ps0 -> f(0x%x, 0x%x) = 0x%x\n",
 		dprintk(KERN_INFO "PS2 ps0 -> f(0x%x, 0x%x) = 0x%x\n",
 				  ps0_in1, ps0_in2, ps0_out);
 				  ps0_in1, ps0_in2, ps0_out);
 
 
-		cvt_fd((float*)&ps0_out, (double*)&fpr[reg_out], &t);
+		kvm_cvt_fd(&ps0_out, &fpr[reg_out], &vcpu->arch.fpscr);
 	}
 	}
 
 
 	/* PS1 */
 	/* PS1 */
@@ -608,7 +589,7 @@ static int kvmppc_ps_two_in(struct kvm_vcpu *vcpu, bool rc,
 	if (scalar & SCALAR_HIGH)
 	if (scalar & SCALAR_HIGH)
 		ps1_in2 = ps0_in2;
 		ps1_in2 = ps0_in2;
 
 
-	func(&t, &ps1_out, &ps1_in1, &ps1_in2);
+	func(&vcpu->arch.fpscr, &ps1_out, &ps1_in1, &ps1_in2);
 
 
 	if (!(scalar & SCALAR_NO_PS1)) {
 	if (!(scalar & SCALAR_NO_PS1)) {
 		qpr[reg_out] = ps1_out;
 		qpr[reg_out] = ps1_out;
@@ -622,31 +603,29 @@ static int kvmppc_ps_two_in(struct kvm_vcpu *vcpu, bool rc,
 
 
 static int kvmppc_ps_one_in(struct kvm_vcpu *vcpu, bool rc,
 static int kvmppc_ps_one_in(struct kvm_vcpu *vcpu, bool rc,
 				    int reg_out, int reg_in,
 				    int reg_out, int reg_in,
-				    void (*func)(struct thread_struct *t,
+				    void (*func)(u64 *t,
 						 u32 *dst, u32 *src1))
 						 u32 *dst, u32 *src1))
 {
 {
 	u32 *qpr = vcpu->arch.qpr;
 	u32 *qpr = vcpu->arch.qpr;
 	u64 *fpr = vcpu->arch.fpr;
 	u64 *fpr = vcpu->arch.fpr;
 	u32 ps0_out, ps0_in;
 	u32 ps0_out, ps0_in;
 	u32 ps1_in;
 	u32 ps1_in;
-	struct thread_struct t;
-	t.fpscr.val = vcpu->arch.fpscr;
 
 
 	/* RC */
 	/* RC */
 	WARN_ON(rc);
 	WARN_ON(rc);
 
 
 	/* PS0 */
 	/* PS0 */
-	cvt_df((double*)&fpr[reg_in], (float*)&ps0_in, &t);
-	func(&t, &ps0_out, &ps0_in);
+	kvm_cvt_df(&fpr[reg_in], &ps0_in, &vcpu->arch.fpscr);
+	func(&vcpu->arch.fpscr, &ps0_out, &ps0_in);
 
 
 	dprintk(KERN_INFO "PS1 ps0 -> f(0x%x) = 0x%x\n",
 	dprintk(KERN_INFO "PS1 ps0 -> f(0x%x) = 0x%x\n",
 			  ps0_in, ps0_out);
 			  ps0_in, ps0_out);
 
 
-	cvt_fd((float*)&ps0_out, (double*)&fpr[reg_out], &t);
+	kvm_cvt_fd(&ps0_out, &fpr[reg_out], &vcpu->arch.fpscr);
 
 
 	/* PS1 */
 	/* PS1 */
 	ps1_in = qpr[reg_in];
 	ps1_in = qpr[reg_in];
-	func(&t, &qpr[reg_out], &ps1_in);
+	func(&vcpu->arch.fpscr, &qpr[reg_out], &ps1_in);
 
 
 	dprintk(KERN_INFO "PS1 ps1 -> f(0x%x) = 0x%x\n",
 	dprintk(KERN_INFO "PS1 ps1 -> f(0x%x) = 0x%x\n",
 			  ps1_in, qpr[reg_out]);
 			  ps1_in, qpr[reg_out]);
@@ -672,13 +651,10 @@ int kvmppc_emulate_paired_single(struct kvm_run *run, struct kvm_vcpu *vcpu)
 
 
 	bool rcomp = (inst & 1) ? true : false;
 	bool rcomp = (inst & 1) ? true : false;
 	u32 cr = kvmppc_get_cr(vcpu);
 	u32 cr = kvmppc_get_cr(vcpu);
-	struct thread_struct t;
 #ifdef DEBUG
 #ifdef DEBUG
 	int i;
 	int i;
 #endif
 #endif
 
 
-	t.fpscr.val = vcpu->arch.fpscr;
-
 	if (!kvmppc_inst_is_paired_single(vcpu, inst))
 	if (!kvmppc_inst_is_paired_single(vcpu, inst))
 		return EMULATE_FAIL;
 		return EMULATE_FAIL;
 
 
@@ -695,7 +671,7 @@ int kvmppc_emulate_paired_single(struct kvm_run *run, struct kvm_vcpu *vcpu)
 #ifdef DEBUG
 #ifdef DEBUG
 	for (i = 0; i < ARRAY_SIZE(vcpu->arch.fpr); i++) {
 	for (i = 0; i < ARRAY_SIZE(vcpu->arch.fpr); i++) {
 		u32 f;
 		u32 f;
-		cvt_df((double*)&vcpu->arch.fpr[i], (float*)&f, &t);
+		kvm_cvt_df(&vcpu->arch.fpr[i], &f, &vcpu->arch.fpscr);
 		dprintk(KERN_INFO "FPR[%d] = 0x%x / 0x%llx    QPR[%d] = 0x%x\n",
 		dprintk(KERN_INFO "FPR[%d] = 0x%x / 0x%llx    QPR[%d] = 0x%x\n",
 			i, f, vcpu->arch.fpr[i], i, vcpu->arch.qpr[i]);
 			i, f, vcpu->arch.fpr[i], i, vcpu->arch.qpr[i]);
 	}
 	}
@@ -819,8 +795,9 @@ int kvmppc_emulate_paired_single(struct kvm_run *run, struct kvm_vcpu *vcpu)
 			WARN_ON(rcomp);
 			WARN_ON(rcomp);
 			vcpu->arch.fpr[ax_rd] = vcpu->arch.fpr[ax_ra];
 			vcpu->arch.fpr[ax_rd] = vcpu->arch.fpr[ax_ra];
 			/* vcpu->arch.qpr[ax_rd] = vcpu->arch.fpr[ax_rb]; */
 			/* vcpu->arch.qpr[ax_rd] = vcpu->arch.fpr[ax_rb]; */
-			cvt_df((double*)&vcpu->arch.fpr[ax_rb],
-			       (float*)&vcpu->arch.qpr[ax_rd], &t);
+			kvm_cvt_df(&vcpu->arch.fpr[ax_rb],
+				   &vcpu->arch.qpr[ax_rd],
+				   &vcpu->arch.fpscr);
 			break;
 			break;
 		case OP_4X_PS_MERGE01:
 		case OP_4X_PS_MERGE01:
 			WARN_ON(rcomp);
 			WARN_ON(rcomp);
@@ -830,17 +807,20 @@ int kvmppc_emulate_paired_single(struct kvm_run *run, struct kvm_vcpu *vcpu)
 		case OP_4X_PS_MERGE10:
 		case OP_4X_PS_MERGE10:
 			WARN_ON(rcomp);
 			WARN_ON(rcomp);
 			/* vcpu->arch.fpr[ax_rd] = vcpu->arch.qpr[ax_ra]; */
 			/* vcpu->arch.fpr[ax_rd] = vcpu->arch.qpr[ax_ra]; */
-			cvt_fd((float*)&vcpu->arch.qpr[ax_ra],
-			       (double*)&vcpu->arch.fpr[ax_rd], &t);
+			kvm_cvt_fd(&vcpu->arch.qpr[ax_ra],
+				   &vcpu->arch.fpr[ax_rd],
+				   &vcpu->arch.fpscr);
 			/* vcpu->arch.qpr[ax_rd] = vcpu->arch.fpr[ax_rb]; */
 			/* vcpu->arch.qpr[ax_rd] = vcpu->arch.fpr[ax_rb]; */
-			cvt_df((double*)&vcpu->arch.fpr[ax_rb],
-			       (float*)&vcpu->arch.qpr[ax_rd], &t);
+			kvm_cvt_df(&vcpu->arch.fpr[ax_rb],
+				   &vcpu->arch.qpr[ax_rd],
+				   &vcpu->arch.fpscr);
 			break;
 			break;
 		case OP_4X_PS_MERGE11:
 		case OP_4X_PS_MERGE11:
 			WARN_ON(rcomp);
 			WARN_ON(rcomp);
 			/* vcpu->arch.fpr[ax_rd] = vcpu->arch.qpr[ax_ra]; */
 			/* vcpu->arch.fpr[ax_rd] = vcpu->arch.qpr[ax_ra]; */
-			cvt_fd((float*)&vcpu->arch.qpr[ax_ra],
-			       (double*)&vcpu->arch.fpr[ax_rd], &t);
+			kvm_cvt_fd(&vcpu->arch.qpr[ax_ra],
+				   &vcpu->arch.fpr[ax_rd],
+				   &vcpu->arch.fpscr);
 			vcpu->arch.qpr[ax_rd] = vcpu->arch.qpr[ax_rb];
 			vcpu->arch.qpr[ax_rd] = vcpu->arch.qpr[ax_rb];
 			break;
 			break;
 		}
 		}
@@ -1275,7 +1255,7 @@ int kvmppc_emulate_paired_single(struct kvm_run *run, struct kvm_vcpu *vcpu)
 #ifdef DEBUG
 #ifdef DEBUG
 	for (i = 0; i < ARRAY_SIZE(vcpu->arch.fpr); i++) {
 	for (i = 0; i < ARRAY_SIZE(vcpu->arch.fpr); i++) {
 		u32 f;
 		u32 f;
-		cvt_df((double*)&vcpu->arch.fpr[i], (float*)&f, &t);
+		kvm_cvt_df(&vcpu->arch.fpr[i], &f, &vcpu->arch.fpscr);
 		dprintk(KERN_INFO "FPR[%d] = 0x%x\n", i, f);
 		dprintk(KERN_INFO "FPR[%d] = 0x%x\n", i, f);
 	}
 	}
 #endif
 #endif

+ 1 - 11
arch/powerpc/kvm/booke.c

@@ -144,7 +144,7 @@ static int kvmppc_booke_irqprio_deliver(struct kvm_vcpu *vcpu,
                                         unsigned int priority)
 {
 	int allowed = 0;
-	ulong msr_mask;
+	ulong uninitialized_var(msr_mask);
 	bool update_esr = false, update_dear = false;
 
 	switch (priority) {
@@ -485,8 +485,6 @@ int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
 {
 	int i;
 
-	vcpu_load(vcpu);
-
 	regs->pc = vcpu->arch.pc;
 	regs->cr = kvmppc_get_cr(vcpu);
 	regs->ctr = vcpu->arch.ctr;
@@ -507,8 +505,6 @@ int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
 	for (i = 0; i < ARRAY_SIZE(regs->gpr); i++)
 		regs->gpr[i] = kvmppc_get_gpr(vcpu, i);
 
-	vcpu_put(vcpu);
-
 	return 0;
 }
 
@@ -516,8 +512,6 @@ int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
 {
 	int i;
 
-	vcpu_load(vcpu);
-
 	vcpu->arch.pc = regs->pc;
 	kvmppc_set_cr(vcpu, regs->cr);
 	vcpu->arch.ctr = regs->ctr;
@@ -537,8 +531,6 @@ int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
 	for (i = 0; i < ARRAY_SIZE(regs->gpr); i++)
 		kvmppc_set_gpr(vcpu, i, regs->gpr[i]);
 
-	vcpu_put(vcpu);
-
 	return 0;
 }
 
@@ -569,9 +561,7 @@ int kvm_arch_vcpu_ioctl_translate(struct kvm_vcpu *vcpu,
 {
 	int r;
 
-	vcpu_load(vcpu);
 	r = kvmppc_core_vcpu_translate(vcpu, tr);
-	vcpu_put(vcpu);
 	return r;
 }
 

+ 18 - 0
arch/powerpc/kvm/fpu.S

@@ -271,3 +271,21 @@ FPD_THREE_IN(fmsub)
 FPD_THREE_IN(fmadd)
 FPD_THREE_IN(fnmsub)
 FPD_THREE_IN(fnmadd)
+
+_GLOBAL(kvm_cvt_fd)
+	lfd	0,0(r5)			/* load up fpscr value */
+	MTFSF_L(0)
+	lfs	0,0(r3)
+	stfd	0,0(r4)
+	mffs	0
+	stfd	0,0(r5)			/* save new fpscr value */
+	blr
+
+_GLOBAL(kvm_cvt_df)
+	lfd	0,0(r5)			/* load up fpscr value */
+	MTFSF_L(0)
+	lfd	0,0(r3)
+	stfs	0,0(r4)
+	mffs	0
+	stfd	0,0(r5)			/* save new fpscr value */
+	blr
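
Editor's note: at the C level these two helpers appear to be called as kvm_cvt_fd(u32 *from, u64 *to, u64 *fpscr) and kvm_cvt_df(u64 *from, u32 *to, u64 *fpscr); the signatures are inferred from the call sites in book3s_paired_singles.c, not restated from the header. The sketch below only models the data movement; the real routines perform the conversion on the FPU under the guest's FPSCR, which is loaded from and written back through the third argument.

#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* Rough C model of kvm_cvt_fd: single -> double (FPSCR handling omitted). */
static void cvt_fd_sketch(const uint32_t *from, uint64_t *to, uint64_t *fpscr)
{
	float f;
	double d;

	(void)fpscr;		/* the assembly loads and stores the real FPSCR here */
	memcpy(&f, from, sizeof(f));
	d = (double)f;
	memcpy(to, &d, sizeof(d));
}

/* Rough C model of kvm_cvt_df: double -> single. */
static void cvt_df_sketch(const uint64_t *from, uint32_t *to, uint64_t *fpscr)
{
	double d;
	float f;

	(void)fpscr;
	memcpy(&d, from, sizeof(d));
	f = (float)d;
	memcpy(to, &f, sizeof(f));
}

int main(void)
{
	float in = 1.5f;
	uint32_t s;
	uint64_t d;
	uint64_t fpscr = 0;

	memcpy(&s, &in, sizeof(s));
	cvt_fd_sketch(&s, &d, &fpscr);
	cvt_df_sketch(&d, &s, &fpscr);
	printf("round-tripped single: 0x%08x\n", s);
	return 0;
}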

+ 3 - 11
arch/powerpc/kvm/powerpc.c

@@ -36,11 +36,6 @@
 #define CREATE_TRACE_POINTS
 #include "trace.h"
 
-gfn_t unalias_gfn(struct kvm *kvm, gfn_t gfn)
-{
-	return gfn;
-}
-
 int kvm_arch_vcpu_runnable(struct kvm_vcpu *v)
 {
 	return !(v->arch.msr & MSR_WE) || !!(v->arch.pending_exceptions);
@@ -287,7 +282,7 @@ static void kvmppc_complete_dcr_load(struct kvm_vcpu *vcpu,
 static void kvmppc_complete_mmio_load(struct kvm_vcpu *vcpu,
                                       struct kvm_run *run)
 {
-	u64 gpr;
+	u64 uninitialized_var(gpr);
 
 	if (run->mmio.len > sizeof(gpr)) {
 		printk(KERN_ERR "bad MMIO length: %d\n", run->mmio.len);
@@ -423,8 +418,6 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run)
 	int r;
 	sigset_t sigsaved;
 
-	vcpu_load(vcpu);
-
 	if (vcpu->sigset_active)
 		sigprocmask(SIG_SETMASK, &vcpu->sigset, &sigsaved);
 
@@ -456,8 +449,6 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run)
 	if (vcpu->sigset_active)
 		sigprocmask(SIG_SETMASK, &sigsaved, NULL);
 
-	vcpu_put(vcpu);
-
 	return r;
 }
 
@@ -523,8 +514,9 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
 		if (copy_from_user(&irq, argp, sizeof(irq)))
 			goto out;
 		r = kvm_vcpu_ioctl_interrupt(vcpu, &irq);
-		break;
+		goto out;
 	}
+
 	case KVM_ENABLE_CAP:
 	{
 		struct kvm_enable_cap cap;

+ 3 - 2
arch/s390/include/asm/kvm_host.h

@@ -26,7 +26,7 @@
 
 struct sca_entry {
 	atomic_t scn;
-	__u64	reserved;
+	__u32	reserved;
 	__u64	sda;
 	__u64	reserved2[2];
 } __attribute__((packed));
@@ -41,7 +41,8 @@ struct sca_block {
 } __attribute__((packed));
 
 #define KVM_NR_PAGE_SIZES 2
-#define KVM_HPAGE_SHIFT(x) (PAGE_SHIFT + ((x) - 1) * 8)
+#define KVM_HPAGE_GFN_SHIFT(x) (((x) - 1) * 8)
+#define KVM_HPAGE_SHIFT(x) (PAGE_SHIFT + KVM_HPAGE_GFN_SHIFT(x))
 #define KVM_HPAGE_SIZE(x) (1UL << KVM_HPAGE_SHIFT(x))
 #define KVM_HPAGE_MASK(x)	(~(KVM_HPAGE_SIZE(x) - 1))
 #define KVM_PAGES_PER_HPAGE(x)	(KVM_HPAGE_SIZE(x) / PAGE_SIZE)

+ 1 - 1
arch/s390/kvm/intercept.c

@@ -135,7 +135,7 @@ static int handle_stop(struct kvm_vcpu *vcpu)
 	spin_lock_bh(&vcpu->arch.local_int.lock);
 	if (vcpu->arch.local_int.action_bits & ACTION_STORE_ON_STOP) {
 		vcpu->arch.local_int.action_bits &= ~ACTION_STORE_ON_STOP;
-		rc = __kvm_s390_vcpu_store_status(vcpu,
+		rc = kvm_s390_vcpu_store_status(vcpu,
 						  KVM_S390_STORE_STATUS_NOADDR);
 		if (rc >= 0)
 			rc = -EOPNOTSUPP;

+ 19 - 45
arch/s390/kvm/kvm-s390.c

@@ -207,6 +207,7 @@ out_nokvm:
 void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu)
 void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu)
 {
 {
 	VCPU_EVENT(vcpu, 3, "%s", "free cpu");
 	VCPU_EVENT(vcpu, 3, "%s", "free cpu");
+	clear_bit(63 - vcpu->vcpu_id, (unsigned long *) &vcpu->kvm->arch.sca->mcn);
 	if (vcpu->kvm->arch.sca->cpu[vcpu->vcpu_id].sda ==
 	if (vcpu->kvm->arch.sca->cpu[vcpu->vcpu_id].sda ==
 		(__u64) vcpu->arch.sie_block)
 		(__u64) vcpu->arch.sie_block)
 		vcpu->kvm->arch.sca->cpu[vcpu->vcpu_id].sda = 0;
 		vcpu->kvm->arch.sca->cpu[vcpu->vcpu_id].sda = 0;
@@ -296,7 +297,7 @@ int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu)
 {
 {
 	atomic_set(&vcpu->arch.sie_block->cpuflags, CPUSTAT_ZARCH);
 	atomic_set(&vcpu->arch.sie_block->cpuflags, CPUSTAT_ZARCH);
 	set_bit(KVM_REQ_MMU_RELOAD, &vcpu->requests);
 	set_bit(KVM_REQ_MMU_RELOAD, &vcpu->requests);
-	vcpu->arch.sie_block->ecb   = 2;
+	vcpu->arch.sie_block->ecb   = 6;
 	vcpu->arch.sie_block->eca   = 0xC1002001U;
 	vcpu->arch.sie_block->eca   = 0xC1002001U;
 	vcpu->arch.sie_block->fac   = (int) (long) facilities;
 	vcpu->arch.sie_block->fac   = (int) (long) facilities;
 	hrtimer_init(&vcpu->arch.ckc_timer, CLOCK_REALTIME, HRTIMER_MODE_ABS);
 	hrtimer_init(&vcpu->arch.ckc_timer, CLOCK_REALTIME, HRTIMER_MODE_ABS);
@@ -329,6 +330,7 @@ struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm,
 		kvm->arch.sca->cpu[id].sda = (__u64) vcpu->arch.sie_block;
 		kvm->arch.sca->cpu[id].sda = (__u64) vcpu->arch.sie_block;
 	vcpu->arch.sie_block->scaoh = (__u32)(((__u64)kvm->arch.sca) >> 32);
 	vcpu->arch.sie_block->scaoh = (__u32)(((__u64)kvm->arch.sca) >> 32);
 	vcpu->arch.sie_block->scaol = (__u32)(__u64)kvm->arch.sca;
 	vcpu->arch.sie_block->scaol = (__u32)(__u64)kvm->arch.sca;
+	set_bit(63 - id, (unsigned long *) &kvm->arch.sca->mcn);
 
 
 	spin_lock_init(&vcpu->arch.local_int.lock);
 	spin_lock_init(&vcpu->arch.local_int.lock);
 	INIT_LIST_HEAD(&vcpu->arch.local_int.list);
 	INIT_LIST_HEAD(&vcpu->arch.local_int.list);
@@ -363,63 +365,49 @@ int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu)
 
 
 static int kvm_arch_vcpu_ioctl_initial_reset(struct kvm_vcpu *vcpu)
 static int kvm_arch_vcpu_ioctl_initial_reset(struct kvm_vcpu *vcpu)
 {
 {
-	vcpu_load(vcpu);
 	kvm_s390_vcpu_initial_reset(vcpu);
 	kvm_s390_vcpu_initial_reset(vcpu);
-	vcpu_put(vcpu);
 	return 0;
 	return 0;
 }
 }
 
 
 int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
 int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
 {
 {
-	vcpu_load(vcpu);
 	memcpy(&vcpu->arch.guest_gprs, &regs->gprs, sizeof(regs->gprs));
 	memcpy(&vcpu->arch.guest_gprs, &regs->gprs, sizeof(regs->gprs));
-	vcpu_put(vcpu);
 	return 0;
 	return 0;
 }
 }
 
 
 int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
 int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
 {
 {
-	vcpu_load(vcpu);
 	memcpy(&regs->gprs, &vcpu->arch.guest_gprs, sizeof(regs->gprs));
 	memcpy(&regs->gprs, &vcpu->arch.guest_gprs, sizeof(regs->gprs));
-	vcpu_put(vcpu);
 	return 0;
 	return 0;
 }
 }
 
 
 int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
 int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
 				  struct kvm_sregs *sregs)
 				  struct kvm_sregs *sregs)
 {
 {
-	vcpu_load(vcpu);
 	memcpy(&vcpu->arch.guest_acrs, &sregs->acrs, sizeof(sregs->acrs));
 	memcpy(&vcpu->arch.guest_acrs, &sregs->acrs, sizeof(sregs->acrs));
 	memcpy(&vcpu->arch.sie_block->gcr, &sregs->crs, sizeof(sregs->crs));
 	memcpy(&vcpu->arch.sie_block->gcr, &sregs->crs, sizeof(sregs->crs));
-	vcpu_put(vcpu);
 	return 0;
 	return 0;
 }
 }
 
 
 int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu,
 int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu,
 				  struct kvm_sregs *sregs)
 				  struct kvm_sregs *sregs)
 {
 {
-	vcpu_load(vcpu);
 	memcpy(&sregs->acrs, &vcpu->arch.guest_acrs, sizeof(sregs->acrs));
 	memcpy(&sregs->acrs, &vcpu->arch.guest_acrs, sizeof(sregs->acrs));
 	memcpy(&sregs->crs, &vcpu->arch.sie_block->gcr, sizeof(sregs->crs));
 	memcpy(&sregs->crs, &vcpu->arch.sie_block->gcr, sizeof(sregs->crs));
-	vcpu_put(vcpu);
 	return 0;
 	return 0;
 }
 }
 
 
 int kvm_arch_vcpu_ioctl_set_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu)
 int kvm_arch_vcpu_ioctl_set_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu)
 {
 {
-	vcpu_load(vcpu);
 	memcpy(&vcpu->arch.guest_fpregs.fprs, &fpu->fprs, sizeof(fpu->fprs));
 	memcpy(&vcpu->arch.guest_fpregs.fprs, &fpu->fprs, sizeof(fpu->fprs));
 	vcpu->arch.guest_fpregs.fpc = fpu->fpc;
 	vcpu->arch.guest_fpregs.fpc = fpu->fpc;
-	vcpu_put(vcpu);
 	return 0;
 	return 0;
 }
 }
 
 
 int kvm_arch_vcpu_ioctl_get_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu)
 int kvm_arch_vcpu_ioctl_get_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu)
 {
 {
-	vcpu_load(vcpu);
 	memcpy(&fpu->fprs, &vcpu->arch.guest_fpregs.fprs, sizeof(fpu->fprs));
 	memcpy(&fpu->fprs, &vcpu->arch.guest_fpregs.fprs, sizeof(fpu->fprs));
 	fpu->fpc = vcpu->arch.guest_fpregs.fpc;
 	fpu->fpc = vcpu->arch.guest_fpregs.fpc;
-	vcpu_put(vcpu);
 	return 0;
 	return 0;
 }
 }
 
 
@@ -427,14 +415,12 @@ static int kvm_arch_vcpu_ioctl_set_initial_psw(struct kvm_vcpu *vcpu, psw_t psw)
 {
 {
 	int rc = 0;
 	int rc = 0;
 
 
-	vcpu_load(vcpu);
 	if (atomic_read(&vcpu->arch.sie_block->cpuflags) & CPUSTAT_RUNNING)
 	if (atomic_read(&vcpu->arch.sie_block->cpuflags) & CPUSTAT_RUNNING)
 		rc = -EBUSY;
 		rc = -EBUSY;
 	else {
 	else {
 		vcpu->run->psw_mask = psw.mask;
 		vcpu->run->psw_mask = psw.mask;
 		vcpu->run->psw_addr = psw.addr;
 		vcpu->run->psw_addr = psw.addr;
 	}
 	}
-	vcpu_put(vcpu);
 	return rc;
 	return rc;
 }
 }
 
 
@@ -498,8 +484,6 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
 	int rc;
 	int rc;
 	sigset_t sigsaved;
 	sigset_t sigsaved;
 
 
-	vcpu_load(vcpu);
-
 rerun_vcpu:
 rerun_vcpu:
 	if (vcpu->requests)
 	if (vcpu->requests)
 		if (test_and_clear_bit(KVM_REQ_MMU_RELOAD, &vcpu->requests))
 		if (test_and_clear_bit(KVM_REQ_MMU_RELOAD, &vcpu->requests))
@@ -568,8 +552,6 @@ rerun_vcpu:
 	if (vcpu->sigset_active)
 	if (vcpu->sigset_active)
 		sigprocmask(SIG_SETMASK, &sigsaved, NULL);
 		sigprocmask(SIG_SETMASK, &sigsaved, NULL);
 
 
-	vcpu_put(vcpu);
-
 	vcpu->stat.exit_userspace++;
 	vcpu->stat.exit_userspace++;
 	return rc;
 	return rc;
 }
 }
@@ -589,7 +571,7 @@ static int __guestcopy(struct kvm_vcpu *vcpu, u64 guestdest, const void *from,
  * KVM_S390_STORE_STATUS_NOADDR: -> 0x1200 on 64 bit
  * KVM_S390_STORE_STATUS_NOADDR: -> 0x1200 on 64 bit
  * KVM_S390_STORE_STATUS_PREFIXED: -> prefix
  * KVM_S390_STORE_STATUS_PREFIXED: -> prefix
  */
  */
-int __kvm_s390_vcpu_store_status(struct kvm_vcpu *vcpu, unsigned long addr)
+int kvm_s390_vcpu_store_status(struct kvm_vcpu *vcpu, unsigned long addr)
 {
 {
 	const unsigned char archmode = 1;
 	const unsigned char archmode = 1;
 	int prefix;
 	int prefix;
@@ -651,45 +633,42 @@ int __kvm_s390_vcpu_store_status(struct kvm_vcpu *vcpu, unsigned long addr)
 	return 0;
 	return 0;
 }
 }
 
 
-static int kvm_s390_vcpu_store_status(struct kvm_vcpu *vcpu, unsigned long addr)
-{
-	int rc;
-
-	vcpu_load(vcpu);
-	rc = __kvm_s390_vcpu_store_status(vcpu, addr);
-	vcpu_put(vcpu);
-	return rc;
-}
-
 long kvm_arch_vcpu_ioctl(struct file *filp,
 long kvm_arch_vcpu_ioctl(struct file *filp,
 			 unsigned int ioctl, unsigned long arg)
 			 unsigned int ioctl, unsigned long arg)
 {
 {
 	struct kvm_vcpu *vcpu = filp->private_data;
 	struct kvm_vcpu *vcpu = filp->private_data;
 	void __user *argp = (void __user *)arg;
 	void __user *argp = (void __user *)arg;
+	long r;
 
 
 	switch (ioctl) {
 	switch (ioctl) {
 	case KVM_S390_INTERRUPT: {
 	case KVM_S390_INTERRUPT: {
 		struct kvm_s390_interrupt s390int;
 		struct kvm_s390_interrupt s390int;
 
 
+		r = -EFAULT;
 		if (copy_from_user(&s390int, argp, sizeof(s390int)))
 		if (copy_from_user(&s390int, argp, sizeof(s390int)))
-			return -EFAULT;
-		return kvm_s390_inject_vcpu(vcpu, &s390int);
+			break;
+		r = kvm_s390_inject_vcpu(vcpu, &s390int);
+		break;
 	}
 	}
 	case KVM_S390_STORE_STATUS:
 	case KVM_S390_STORE_STATUS:
-		return kvm_s390_vcpu_store_status(vcpu, arg);
+		r = kvm_s390_vcpu_store_status(vcpu, arg);
+		break;
 	case KVM_S390_SET_INITIAL_PSW: {
 	case KVM_S390_SET_INITIAL_PSW: {
 		psw_t psw;
 		psw_t psw;
 
 
+		r = -EFAULT;
 		if (copy_from_user(&psw, argp, sizeof(psw)))
 		if (copy_from_user(&psw, argp, sizeof(psw)))
-			return -EFAULT;
-		return kvm_arch_vcpu_ioctl_set_initial_psw(vcpu, psw);
+			break;
+		r = kvm_arch_vcpu_ioctl_set_initial_psw(vcpu, psw);
+		break;
 	}
 	}
 	case KVM_S390_INITIAL_RESET:
 	case KVM_S390_INITIAL_RESET:
-		return kvm_arch_vcpu_ioctl_initial_reset(vcpu);
+		r = kvm_arch_vcpu_ioctl_initial_reset(vcpu);
+		break;
 	default:
 	default:
-		;
+		r = -EINVAL;
 	}
 	}
-	return -EINVAL;
+	return r;
 }
 }
 
 
 /* Section: memory related */
 /* Section: memory related */
@@ -744,11 +723,6 @@ void kvm_arch_flush_shadow(struct kvm *kvm)
 {
 {
 }
 }
 
 
-gfn_t unalias_gfn(struct kvm *kvm, gfn_t gfn)
-{
-	return gfn;
-}
-
 static int __init kvm_s390_init(void)
 static int __init kvm_s390_init(void)
 {
 {
 	int ret;
 	int ret;

+ 1 - 1
arch/s390/kvm/kvm-s390.h

@@ -92,7 +92,7 @@ int kvm_s390_handle_b2(struct kvm_vcpu *vcpu);
 int kvm_s390_handle_sigp(struct kvm_vcpu *vcpu);
 
 /* implemented in kvm-s390.c */
-int __kvm_s390_vcpu_store_status(struct kvm_vcpu *vcpu,
+int kvm_s390_vcpu_store_status(struct kvm_vcpu *vcpu,
 				 unsigned long addr);
 /* implemented in diag.c */
 int kvm_s390_handle_diag(struct kvm_vcpu *vcpu);

+ 2 - 0
arch/x86/include/asm/i387.h

@@ -482,6 +482,8 @@ static inline void fpu_copy(struct fpu *dst, struct fpu *src)
 	memcpy(dst->state, src->state, xstate_size);
 }
 
+extern void fpu_finit(struct fpu *fpu);
+
 #endif /* __ASSEMBLY__ */
 
 #define PSHUFB_XMM5_XMM0 .byte 0x66, 0x0f, 0x38, 0x00, 0xc5

+ 22 - 0
arch/x86/include/asm/kvm.h

@@ -22,6 +22,8 @@
 #define __KVM_HAVE_XEN_HVM
 #define __KVM_HAVE_VCPU_EVENTS
 #define __KVM_HAVE_DEBUGREGS
+#define __KVM_HAVE_XSAVE
+#define __KVM_HAVE_XCRS
 
 /* Architectural interrupt line count. */
 #define KVM_NR_INTERRUPTS 256
@@ -299,4 +301,24 @@ struct kvm_debugregs {
 	__u64 reserved[9];
 };
 
+/* for KVM_CAP_XSAVE */
+struct kvm_xsave {
+	__u32 region[1024];
+};
+
+#define KVM_MAX_XCRS	16
+
+struct kvm_xcr {
+	__u32 xcr;
+	__u32 reserved;
+	__u64 value;
+};
+
+struct kvm_xcrs {
+	__u32 nr_xcrs;
+	__u32 flags;
+	struct kvm_xcr xcrs[KVM_MAX_XCRS];
+	__u64 padding[16];
+};
+
 #endif /* _ASM_X86_KVM_H */
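
Editor's note: the new kvm_xsave and kvm_xcrs structures back the XSAVE/XCR vcpu state ioctls introduced by this series and advertised through KVM_CAP_XSAVE and KVM_CAP_XCRS. A minimal, hedged userspace sketch (assuming an updated <linux/kvm.h>, a kernel exposing these capabilities, and an already-created vcpu file descriptor; not taken from the patch itself):

#include <linux/kvm.h>
#include <stdio.h>
#include <sys/ioctl.h>

/* Read one vcpu's extended FPU state and XCR registers (sketch only). */
int dump_xstate(int vcpu_fd)
{
	struct kvm_xsave xsave;
	struct kvm_xcrs xcrs;

	if (ioctl(vcpu_fd, KVM_GET_XSAVE, &xsave) < 0) {
		perror("KVM_GET_XSAVE");
		return -1;
	}
	if (ioctl(vcpu_fd, KVM_GET_XCRS, &xcrs) < 0) {
		perror("KVM_GET_XCRS");
		return -1;
	}

	printf("nr_xcrs=%u, XCR0=0x%llx, xsave[0]=0x%x\n",
	       xcrs.nr_xcrs,
	       (unsigned long long)xcrs.xcrs[0].value,
	       xsave.region[0]);
	return 0;
}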

+ 25 - 5
arch/x86/include/asm/kvm_emulate.h

@@ -51,8 +51,10 @@ struct x86_emulate_ctxt;
 #define X86EMUL_UNHANDLEABLE    1
 #define X86EMUL_UNHANDLEABLE    1
 /* Terminate emulation but return success to the caller. */
 /* Terminate emulation but return success to the caller. */
 #define X86EMUL_PROPAGATE_FAULT 2 /* propagate a generated fault to guest */
 #define X86EMUL_PROPAGATE_FAULT 2 /* propagate a generated fault to guest */
-#define X86EMUL_RETRY_INSTR     2 /* retry the instruction for some reason */
-#define X86EMUL_CMPXCHG_FAILED  2 /* cmpxchg did not see expected value */
+#define X86EMUL_RETRY_INSTR     3 /* retry the instruction for some reason */
+#define X86EMUL_CMPXCHG_FAILED  4 /* cmpxchg did not see expected value */
+#define X86EMUL_IO_NEEDED       5 /* IO is needed to complete emulation */
+
 struct x86_emulate_ops {
 struct x86_emulate_ops {
 	/*
 	/*
 	 * read_std: Read bytes of standard (non-emulated/special) memory.
 	 * read_std: Read bytes of standard (non-emulated/special) memory.
@@ -92,6 +94,7 @@ struct x86_emulate_ops {
 	int (*read_emulated)(unsigned long addr,
 	int (*read_emulated)(unsigned long addr,
 			     void *val,
 			     void *val,
 			     unsigned int bytes,
 			     unsigned int bytes,
+			     unsigned int *error,
 			     struct kvm_vcpu *vcpu);
 			     struct kvm_vcpu *vcpu);
 
 
 	/*
 	/*
@@ -104,6 +107,7 @@ struct x86_emulate_ops {
 	int (*write_emulated)(unsigned long addr,
 	int (*write_emulated)(unsigned long addr,
 			      const void *val,
 			      const void *val,
 			      unsigned int bytes,
 			      unsigned int bytes,
+			      unsigned int *error,
 			      struct kvm_vcpu *vcpu);
 			      struct kvm_vcpu *vcpu);
 
 
 	/*
 	/*
@@ -118,6 +122,7 @@ struct x86_emulate_ops {
 				const void *old,
 				const void *old,
 				const void *new,
 				const void *new,
 				unsigned int bytes,
 				unsigned int bytes,
+				unsigned int *error,
 				struct kvm_vcpu *vcpu);
 				struct kvm_vcpu *vcpu);
 
 
 	int (*pio_in_emulated)(int size, unsigned short port, void *val,
 	int (*pio_in_emulated)(int size, unsigned short port, void *val,
@@ -132,18 +137,26 @@ struct x86_emulate_ops {
 				      int seg, struct kvm_vcpu *vcpu);
 				      int seg, struct kvm_vcpu *vcpu);
 	u16 (*get_segment_selector)(int seg, struct kvm_vcpu *vcpu);
 	u16 (*get_segment_selector)(int seg, struct kvm_vcpu *vcpu);
 	void (*set_segment_selector)(u16 sel, int seg, struct kvm_vcpu *vcpu);
 	void (*set_segment_selector)(u16 sel, int seg, struct kvm_vcpu *vcpu);
+	unsigned long (*get_cached_segment_base)(int seg, struct kvm_vcpu *vcpu);
 	void (*get_gdt)(struct desc_ptr *dt, struct kvm_vcpu *vcpu);
 	void (*get_gdt)(struct desc_ptr *dt, struct kvm_vcpu *vcpu);
 	ulong (*get_cr)(int cr, struct kvm_vcpu *vcpu);
 	ulong (*get_cr)(int cr, struct kvm_vcpu *vcpu);
-	void (*set_cr)(int cr, ulong val, struct kvm_vcpu *vcpu);
+	int (*set_cr)(int cr, ulong val, struct kvm_vcpu *vcpu);
 	int (*cpl)(struct kvm_vcpu *vcpu);
 	int (*cpl)(struct kvm_vcpu *vcpu);
-	void (*set_rflags)(struct kvm_vcpu *vcpu, unsigned long rflags);
+	int (*get_dr)(int dr, unsigned long *dest, struct kvm_vcpu *vcpu);
+	int (*set_dr)(int dr, unsigned long value, struct kvm_vcpu *vcpu);
+	int (*set_msr)(struct kvm_vcpu *vcpu, u32 msr_index, u64 data);
+	int (*get_msr)(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata);
 };
 };
 
 
 /* Type, address-of, and value of an instruction's operand. */
 /* Type, address-of, and value of an instruction's operand. */
 struct operand {
 struct operand {
 	enum { OP_REG, OP_MEM, OP_IMM, OP_NONE } type;
 	enum { OP_REG, OP_MEM, OP_IMM, OP_NONE } type;
 	unsigned int bytes;
 	unsigned int bytes;
-	unsigned long val, orig_val, *ptr;
+	unsigned long orig_val, *ptr;
+	union {
+		unsigned long val;
+		char valptr[sizeof(unsigned long) + 2];
+	};
 };
 };
 
 
 struct fetch_cache {
 struct fetch_cache {
@@ -186,6 +199,7 @@ struct decode_cache {
 	unsigned long modrm_val;
 	unsigned long modrm_val;
 	struct fetch_cache fetch;
 	struct fetch_cache fetch;
 	struct read_cache io_read;
 	struct read_cache io_read;
+	struct read_cache mem_read;
 };
 };
 
 
 struct x86_emulate_ctxt {
 struct x86_emulate_ctxt {
@@ -202,6 +216,12 @@ struct x86_emulate_ctxt {
 	int interruptibility;
 	int interruptibility;
 
 
 	bool restart; /* restart string instruction after writeback */
 	bool restart; /* restart string instruction after writeback */
+
+	int exception; /* exception that happens during emulation or -1 */
+	u32 error_code; /* error code for exception */
+	bool error_code_valid;
+	unsigned long cr2; /* faulted address in case of #PF */
+
 	/* decode cache */
 	/* decode cache */
 	struct decode_cache decode;
 	struct decode_cache decode;
 };
 };

+ 17 - 53
arch/x86/include/asm/kvm_host.h

@@ -15,6 +15,7 @@
 #include <linux/mm.h>
 #include <linux/mm.h>
 #include <linux/mmu_notifier.h>
 #include <linux/mmu_notifier.h>
 #include <linux/tracepoint.h>
 #include <linux/tracepoint.h>
+#include <linux/cpumask.h>
 
 
 #include <linux/kvm.h>
 #include <linux/kvm.h>
 #include <linux/kvm_para.h>
 #include <linux/kvm_para.h>
@@ -39,11 +40,14 @@
 				  0xFFFFFF0000000000ULL)
 				  0xFFFFFF0000000000ULL)
 
 
 #define INVALID_PAGE (~(hpa_t)0)
 #define INVALID_PAGE (~(hpa_t)0)
+#define VALID_PAGE(x) ((x) != INVALID_PAGE)
+
 #define UNMAPPED_GVA (~(gpa_t)0)
 #define UNMAPPED_GVA (~(gpa_t)0)
 
 
 /* KVM Hugepage definitions for x86 */
 /* KVM Hugepage definitions for x86 */
 #define KVM_NR_PAGE_SIZES	3
 #define KVM_NR_PAGE_SIZES	3
-#define KVM_HPAGE_SHIFT(x)	(PAGE_SHIFT + (((x) - 1) * 9))
+#define KVM_HPAGE_GFN_SHIFT(x)	(((x) - 1) * 9)
+#define KVM_HPAGE_SHIFT(x)	(PAGE_SHIFT + KVM_HPAGE_GFN_SHIFT(x))
 #define KVM_HPAGE_SIZE(x)	(1UL << KVM_HPAGE_SHIFT(x))
 #define KVM_HPAGE_SIZE(x)	(1UL << KVM_HPAGE_SHIFT(x))
 #define KVM_HPAGE_MASK(x)	(~(KVM_HPAGE_SIZE(x) - 1))
 #define KVM_HPAGE_MASK(x)	(~(KVM_HPAGE_SIZE(x) - 1))
 #define KVM_PAGES_PER_HPAGE(x)	(KVM_HPAGE_SIZE(x) / PAGE_SIZE)
 #define KVM_PAGES_PER_HPAGE(x)	(KVM_HPAGE_SIZE(x) / PAGE_SIZE)
@@ -69,8 +73,6 @@
 
 
 #define IOPL_SHIFT 12
 #define IOPL_SHIFT 12
 
 
-#define KVM_ALIAS_SLOTS 4
-
 #define KVM_PERMILLE_MMU_PAGES 20
 #define KVM_PERMILLE_MMU_PAGES 20
 #define KVM_MIN_ALLOC_MMU_PAGES 64
 #define KVM_MIN_ALLOC_MMU_PAGES 64
 #define KVM_MMU_HASH_SHIFT 10
 #define KVM_MMU_HASH_SHIFT 10
@@ -241,7 +243,7 @@ struct kvm_mmu {
 	void (*prefetch_page)(struct kvm_vcpu *vcpu,
 	void (*prefetch_page)(struct kvm_vcpu *vcpu,
 			      struct kvm_mmu_page *page);
 			      struct kvm_mmu_page *page);
 	int (*sync_page)(struct kvm_vcpu *vcpu,
 	int (*sync_page)(struct kvm_vcpu *vcpu,
-			 struct kvm_mmu_page *sp);
+			 struct kvm_mmu_page *sp, bool clear_unsync);
 	void (*invlpg)(struct kvm_vcpu *vcpu, gva_t gva);
 	void (*invlpg)(struct kvm_vcpu *vcpu, gva_t gva);
 	hpa_t root_hpa;
 	hpa_t root_hpa;
 	int root_level;
 	int root_level;
@@ -301,8 +303,8 @@ struct kvm_vcpu_arch {
 		unsigned long mmu_seq;
 		unsigned long mmu_seq;
 	} update_pte;
 	} update_pte;
 
 
-	struct i387_fxsave_struct host_fx_image;
-	struct i387_fxsave_struct guest_fx_image;
+	struct fpu guest_fpu;
+	u64 xcr0;
 
 
 	gva_t mmio_fault_cr2;
 	gva_t mmio_fault_cr2;
 	struct kvm_pio_request pio;
 	struct kvm_pio_request pio;
@@ -360,26 +362,11 @@ struct kvm_vcpu_arch {
 
 
 	/* fields used by HYPER-V emulation */
 	/* fields used by HYPER-V emulation */
 	u64 hv_vapic;
 	u64 hv_vapic;
-};
-
-struct kvm_mem_alias {
-	gfn_t base_gfn;
-	unsigned long npages;
-	gfn_t target_gfn;
-#define KVM_ALIAS_INVALID     1UL
-	unsigned long flags;
-};
 
 
-#define KVM_ARCH_HAS_UNALIAS_INSTANTIATION
-
-struct kvm_mem_aliases {
-	struct kvm_mem_alias aliases[KVM_ALIAS_SLOTS];
-	int naliases;
+	cpumask_var_t wbinvd_dirty_mask;
 };
 };
 
 
 struct kvm_arch {
 struct kvm_arch {
-	struct kvm_mem_aliases *aliases;
-
 	unsigned int n_free_mmu_pages;
 	unsigned int n_free_mmu_pages;
 	unsigned int n_requested_mmu_pages;
 	unsigned int n_requested_mmu_pages;
 	unsigned int n_alloc_mmu_pages;
 	unsigned int n_alloc_mmu_pages;
@@ -533,6 +520,8 @@ struct kvm_x86_ops {
 
 
 	void (*set_supported_cpuid)(u32 func, struct kvm_cpuid_entry2 *entry);
 	void (*set_supported_cpuid)(u32 func, struct kvm_cpuid_entry2 *entry);
 
 
+	bool (*has_wbinvd_exit)(void);
+
 	const struct trace_print_flags *exit_reasons_str;
 	const struct trace_print_flags *exit_reasons_str;
 };
 };
 
 
@@ -576,7 +565,6 @@ enum emulation_result {
 #define EMULTYPE_SKIP		    (1 << 2)
 #define EMULTYPE_SKIP		    (1 << 2)
 int emulate_instruction(struct kvm_vcpu *vcpu,
 int emulate_instruction(struct kvm_vcpu *vcpu,
 			unsigned long cr2, u16 error_code, int emulation_type);
 			unsigned long cr2, u16 error_code, int emulation_type);
-void kvm_report_emulation_failure(struct kvm_vcpu *cvpu, const char *context);
 void realmode_lgdt(struct kvm_vcpu *vcpu, u16 size, unsigned long address);
 void realmode_lgdt(struct kvm_vcpu *vcpu, u16 size, unsigned long address);
 void realmode_lidt(struct kvm_vcpu *vcpu, u16 size, unsigned long address);
 void realmode_lidt(struct kvm_vcpu *vcpu, u16 size, unsigned long address);
 
 
@@ -591,10 +579,7 @@ void kvm_emulate_cpuid(struct kvm_vcpu *vcpu);
 int kvm_emulate_halt(struct kvm_vcpu *vcpu);
 int kvm_emulate_halt(struct kvm_vcpu *vcpu);
 int emulate_invlpg(struct kvm_vcpu *vcpu, gva_t address);
 int emulate_invlpg(struct kvm_vcpu *vcpu, gva_t address);
 int emulate_clts(struct kvm_vcpu *vcpu);
 int emulate_clts(struct kvm_vcpu *vcpu);
-int emulator_get_dr(struct x86_emulate_ctxt *ctxt, int dr,
-		    unsigned long *dest);
-int emulator_set_dr(struct x86_emulate_ctxt *ctxt, int dr,
-		    unsigned long value);
+int kvm_emulate_wbinvd(struct kvm_vcpu *vcpu);
 
 
 void kvm_get_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg);
 void kvm_get_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg);
 int kvm_load_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector, int seg);
 int kvm_load_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector, int seg);
@@ -602,15 +587,16 @@ int kvm_load_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector, int seg);
 int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int reason,
 int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int reason,
 		    bool has_error_code, u32 error_code);
 		    bool has_error_code, u32 error_code);
 
 
-void kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0);
-void kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3);
-void kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4);
+int kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0);
+int kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3);
+int kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4);
 void kvm_set_cr8(struct kvm_vcpu *vcpu, unsigned long cr8);
 void kvm_set_cr8(struct kvm_vcpu *vcpu, unsigned long cr8);
 int kvm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long val);
 int kvm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long val);
 int kvm_get_dr(struct kvm_vcpu *vcpu, int dr, unsigned long *val);
 int kvm_get_dr(struct kvm_vcpu *vcpu, int dr, unsigned long *val);
 unsigned long kvm_get_cr8(struct kvm_vcpu *vcpu);
 unsigned long kvm_get_cr8(struct kvm_vcpu *vcpu);
 void kvm_lmsw(struct kvm_vcpu *vcpu, unsigned long msw);
 void kvm_lmsw(struct kvm_vcpu *vcpu, unsigned long msw);
 void kvm_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l);
 void kvm_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l);
+int kvm_set_xcr(struct kvm_vcpu *vcpu, u32 index, u64 xcr);
 
 
 int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata);
 int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata);
 int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data);
 int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data);
@@ -630,12 +616,7 @@ int kvm_pic_set_irq(void *opaque, int irq, int level);
 
 
 void kvm_inject_nmi(struct kvm_vcpu *vcpu);
 void kvm_inject_nmi(struct kvm_vcpu *vcpu);
 
 
-void fx_init(struct kvm_vcpu *vcpu);
-
-int emulator_write_emulated(unsigned long addr,
-			    const void *val,
-			    unsigned int bytes,
-			    struct kvm_vcpu *vcpu);
+int fx_init(struct kvm_vcpu *vcpu);
 
 
 void kvm_mmu_flush_tlb(struct kvm_vcpu *vcpu);
 void kvm_mmu_flush_tlb(struct kvm_vcpu *vcpu);
 void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
 void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
@@ -664,8 +645,6 @@ void kvm_disable_tdp(void);
 int complete_pio(struct kvm_vcpu *vcpu);
 int complete_pio(struct kvm_vcpu *vcpu);
 bool kvm_check_iopl(struct kvm_vcpu *vcpu);
 bool kvm_check_iopl(struct kvm_vcpu *vcpu);
 
 
-struct kvm_memory_slot *gfn_to_memslot_unaliased(struct kvm *kvm, gfn_t gfn);
-
 static inline struct kvm_mmu_page *page_header(hpa_t shadow_page)
 static inline struct kvm_mmu_page *page_header(hpa_t shadow_page)
 {
 {
 	struct page *page = pfn_to_page(shadow_page >> PAGE_SHIFT);
 	struct page *page = pfn_to_page(shadow_page >> PAGE_SHIFT);
@@ -719,21 +698,6 @@ static inline unsigned long read_msr(unsigned long msr)
 }
 }
 #endif
 #endif
 
 
-static inline void kvm_fx_save(struct i387_fxsave_struct *image)
-{
-	asm("fxsave (%0)":: "r" (image));
-}
-
-static inline void kvm_fx_restore(struct i387_fxsave_struct *image)
-{
-	asm("fxrstor (%0)":: "r" (image));
-}
-
-static inline void kvm_fx_finit(void)
-{
-	asm("finit");
-}
-
 static inline u32 get_rdx_init_val(void)
 static inline u32 get_rdx_init_val(void)
 {
 {
 	return 0x600; /* P6 family */
 	return 0x600; /* P6 family */

+ 2 - 0
arch/x86/include/asm/msr-index.h

@@ -20,6 +20,7 @@
 #define _EFER_LMA		10 /* Long mode active (read-only) */
 #define _EFER_NX		11 /* No execute enable */
 #define _EFER_SVME		12 /* Enable virtualization */
+#define _EFER_LMSLE		13 /* Long Mode Segment Limit Enable */
 #define _EFER_FFXSR		14 /* Enable Fast FXSAVE/FXRSTOR */
 
 #define EFER_SCE		(1<<_EFER_SCE)
@@ -27,6 +28,7 @@
 #define EFER_LMA		(1<<_EFER_LMA)
 #define EFER_NX			(1<<_EFER_NX)
 #define EFER_SVME		(1<<_EFER_SVME)
+#define EFER_LMSLE		(1<<_EFER_LMSLE)
 #define EFER_FFXSR		(1<<_EFER_FFXSR)
 
 /* Intel MSRs. Some also available on other CPUs */

+ 5 - 0
arch/x86/include/asm/vmx.h

@@ -257,6 +257,7 @@ enum vmcs_field {
 #define EXIT_REASON_IO_INSTRUCTION      30
 #define EXIT_REASON_MSR_READ            31
 #define EXIT_REASON_MSR_WRITE           32
+#define EXIT_REASON_INVALID_STATE	33
 #define EXIT_REASON_MWAIT_INSTRUCTION   36
 #define EXIT_REASON_MONITOR_INSTRUCTION 39
 #define EXIT_REASON_PAUSE_INSTRUCTION   40
@@ -266,6 +267,7 @@ enum vmcs_field {
 #define EXIT_REASON_EPT_VIOLATION       48
 #define EXIT_REASON_EPT_MISCONFIG       49
 #define EXIT_REASON_WBINVD		54
+#define EXIT_REASON_XSETBV		55
 
 /*
  * Interruption-information format
@@ -375,6 +377,9 @@ enum vmcs_field {
 #define VMX_EPT_EXTENT_CONTEXT_BIT		(1ull << 25)
 #define VMX_EPT_EXTENT_GLOBAL_BIT		(1ull << 26)
 
+#define VMX_VPID_EXTENT_SINGLE_CONTEXT_BIT      (1ull << 9) /* (41 - 32) */
+#define VMX_VPID_EXTENT_GLOBAL_CONTEXT_BIT      (1ull << 10) /* (42 - 32) */
+
 #define VMX_EPT_DEFAULT_GAW			3
 #define VMX_EPT_MAX_GAW				0x4
 #define VMX_EPT_MT_EPTE_SHIFT			3
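The "(41 - 32)" comments on the new VPID extent bits indicate they are meant to be tested against the high 32 bits of the IA32_VMX_EPT_VPID_CAP MSR (bits 41 and 42 of the full 64-bit value). A standalone sketch, with a made-up capability value, assuming the caller splits the MSR into low/high dwords the way rdmsr() does.

/* Standalone sketch: VPID extent capability bits live at bits 41/42 of the
 * full MSR, i.e. bits 9/10 of its high dword. */
#include <stdint.h>
#include <stdio.h>

#define VMX_VPID_EXTENT_SINGLE_CONTEXT_BIT  (1ull << 9)   /* (41 - 32) */
#define VMX_VPID_EXTENT_GLOBAL_CONTEXT_BIT  (1ull << 10)  /* (42 - 32) */

int main(void)
{
	uint64_t msr = (1ull << 41) | (1ull << 42);   /* hypothetical cap MSR */
	uint32_t vpid_cap = (uint32_t)(msr >> 32);    /* high dword */

	printf("single-context invvpid: %s\n",
	       (vpid_cap & VMX_VPID_EXTENT_SINGLE_CONTEXT_BIT) ? "yes" : "no");
	printf("global-context invvpid: %s\n",
	       (vpid_cap & VMX_VPID_EXTENT_GLOBAL_CONTEXT_BIT) ? "yes" : "no");
	return 0;
}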

+ 6 - 0
arch/x86/include/asm/xsave.h

@@ -13,6 +13,12 @@
 
 #define FXSAVE_SIZE	512
 
+#define XSAVE_HDR_SIZE	    64
+#define XSAVE_HDR_OFFSET    FXSAVE_SIZE
+
+#define XSAVE_YMM_SIZE	    256
+#define XSAVE_YMM_OFFSET    (XSAVE_HDR_SIZE + XSAVE_HDR_OFFSET)
+
 /*
  * These are the features that the OS can handle currently.
  */
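The new constants describe the fixed layout of an XSAVE area: the legacy FXSAVE image at offset 0, the 64-byte XSAVE header at 512, and the YMM high halves directly after the header at 576. A quick standalone check of that arithmetic.

/* Standalone sketch: the XSAVE-area layout implied by the constants above.
 * Offsets only; no hardware access. */
#include <assert.h>
#include <stdio.h>

#define FXSAVE_SIZE       512
#define XSAVE_HDR_SIZE    64
#define XSAVE_HDR_OFFSET  FXSAVE_SIZE
#define XSAVE_YMM_SIZE    256
#define XSAVE_YMM_OFFSET  (XSAVE_HDR_SIZE + XSAVE_HDR_OFFSET)

int main(void)
{
	assert(XSAVE_HDR_OFFSET == 512);   /* header follows the legacy region */
	assert(XSAVE_YMM_OFFSET == 576);   /* YMM state follows the header */
	printf("legacy: [0,512) header: [512,576) ymm: [576,%d)\n",
	       XSAVE_YMM_OFFSET + XSAVE_YMM_SIZE);
	return 0;
}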

+ 2 - 1
arch/x86/kernel/i387.c

@@ -107,7 +107,7 @@ void __cpuinit fpu_init(void)
 }
 #endif	/* CONFIG_X86_64 */
 
-static void fpu_finit(struct fpu *fpu)
+void fpu_finit(struct fpu *fpu)
 {
 #ifdef CONFIG_X86_32
 	if (!HAVE_HWFP) {
@@ -132,6 +132,7 @@ static void fpu_finit(struct fpu *fpu)
 		fp->fos = 0xffff0000u;
 	}
 }
+EXPORT_SYMBOL_GPL(fpu_finit);
 
 /*
  * The _current_ task is using the FPU for the first time

+ 1 - 0
arch/x86/kernel/process.c

@@ -28,6 +28,7 @@ unsigned long idle_nomwait;
 EXPORT_SYMBOL(idle_nomwait);
 
 struct kmem_cache *task_xstate_cachep;
+EXPORT_SYMBOL_GPL(task_xstate_cachep);
 
 int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src)
 {
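Both exports (fpu_finit above, task_xstate_cachep here) exist so that a GPL module such as kvm.ko can create and initialize per-guest FPU state outside of normal task creation. A hedged kernel-style sketch, not the real KVM code: fpu_alloc() is assumed to be the <asm/i387.h> inline helper that allocates the extended-state buffer from task_xstate_cachep, and example_vcpu_fpu_init() is a hypothetical name.

/* Hedged sketch of what a GPL module can now do; only fpu_finit() and
 * task_xstate_cachep are confirmed by this patch, fpu_alloc() is assumed. */
#include <linux/sched.h>
#include <asm/i387.h>

static int example_vcpu_fpu_init(struct fpu *guest_fpu)	/* hypothetical */
{
	int err;

	err = fpu_alloc(guest_fpu);	/* needs the exported task_xstate_cachep */
	if (err)
		return err;
	fpu_finit(guest_fpu);		/* exported above: architectural init state */
	return 0;
}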

+ 422 - 327
arch/x86/kvm/emulate.c

@@ -9,6 +9,7 @@
  * privileged instructions:
  * privileged instructions:
  *
  *
  * Copyright (C) 2006 Qumranet
  * Copyright (C) 2006 Qumranet
+ * Copyright 2010 Red Hat, Inc. and/or its affilates.
  *
  *
  *   Avi Kivity <avi@qumranet.com>
  *   Avi Kivity <avi@qumranet.com>
  *   Yaniv Kamay <yaniv@qumranet.com>
  *   Yaniv Kamay <yaniv@qumranet.com>
@@ -67,6 +68,9 @@
 #define SrcImmUByte (8<<4)      /* 8-bit unsigned immediate operand. */
 #define SrcImmUByte (8<<4)      /* 8-bit unsigned immediate operand. */
 #define SrcImmU     (9<<4)      /* Immediate operand, unsigned */
 #define SrcImmU     (9<<4)      /* Immediate operand, unsigned */
 #define SrcSI       (0xa<<4)	/* Source is in the DS:RSI */
 #define SrcSI       (0xa<<4)	/* Source is in the DS:RSI */
+#define SrcImmFAddr (0xb<<4)	/* Source is immediate far address */
+#define SrcMemFAddr (0xc<<4)	/* Source is far address in memory */
+#define SrcAcc      (0xd<<4)	/* Source Accumulator */
 #define SrcMask     (0xf<<4)
 #define SrcMask     (0xf<<4)
 /* Generic ModRM decode. */
 /* Generic ModRM decode. */
 #define ModRM       (1<<8)
 #define ModRM       (1<<8)
@@ -88,10 +92,6 @@
 #define Src2CL      (1<<29)
 #define Src2CL      (1<<29)
 #define Src2ImmByte (2<<29)
 #define Src2ImmByte (2<<29)
 #define Src2One     (3<<29)
 #define Src2One     (3<<29)
-#define Src2Imm16   (4<<29)
-#define Src2Mem16   (5<<29) /* Used for Ep encoding. First argument has to be
-			       in memory and second argument is located
-			       immediately after the first one in memory. */
 #define Src2Mask    (7<<29)
 #define Src2Mask    (7<<29)
 
 
 enum {
 enum {
@@ -124,15 +124,15 @@ static u32 opcode_table[256] = {
 	/* 0x20 - 0x27 */
 	/* 0x20 - 0x27 */
 	ByteOp | DstMem | SrcReg | ModRM | Lock, DstMem | SrcReg | ModRM | Lock,
 	ByteOp | DstMem | SrcReg | ModRM | Lock, DstMem | SrcReg | ModRM | Lock,
 	ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM,
 	ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM,
-	DstAcc | SrcImmByte, DstAcc | SrcImm, 0, 0,
+	ByteOp | DstAcc | SrcImmByte, DstAcc | SrcImm, 0, 0,
 	/* 0x28 - 0x2F */
 	/* 0x28 - 0x2F */
 	ByteOp | DstMem | SrcReg | ModRM | Lock, DstMem | SrcReg | ModRM | Lock,
 	ByteOp | DstMem | SrcReg | ModRM | Lock, DstMem | SrcReg | ModRM | Lock,
 	ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM,
 	ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM,
-	0, 0, 0, 0,
+	ByteOp | DstAcc | SrcImmByte, DstAcc | SrcImm, 0, 0,
 	/* 0x30 - 0x37 */
 	/* 0x30 - 0x37 */
 	ByteOp | DstMem | SrcReg | ModRM | Lock, DstMem | SrcReg | ModRM | Lock,
 	ByteOp | DstMem | SrcReg | ModRM | Lock, DstMem | SrcReg | ModRM | Lock,
 	ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM,
 	ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM,
-	0, 0, 0, 0,
+	ByteOp | DstAcc | SrcImmByte, DstAcc | SrcImm, 0, 0,
 	/* 0x38 - 0x3F */
 	/* 0x38 - 0x3F */
 	ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM,
 	ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM,
 	ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM,
 	ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM,
@@ -170,20 +170,20 @@ static u32 opcode_table[256] = {
 	/* 0x88 - 0x8F */
 	/* 0x88 - 0x8F */
 	ByteOp | DstMem | SrcReg | ModRM | Mov, DstMem | SrcReg | ModRM | Mov,
 	ByteOp | DstMem | SrcReg | ModRM | Mov, DstMem | SrcReg | ModRM | Mov,
 	ByteOp | DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov,
 	ByteOp | DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov,
-	DstMem | SrcReg | ModRM | Mov, ModRM | DstReg,
-	DstReg | SrcMem | ModRM | Mov, Group | Group1A,
+	DstMem | SrcNone | ModRM | Mov, ModRM | DstReg,
+	ImplicitOps | SrcMem16 | ModRM, Group | Group1A,
 	/* 0x90 - 0x97 */
 	/* 0x90 - 0x97 */
 	DstReg, DstReg, DstReg, DstReg,	DstReg, DstReg, DstReg, DstReg,
 	DstReg, DstReg, DstReg, DstReg,	DstReg, DstReg, DstReg, DstReg,
 	/* 0x98 - 0x9F */
 	/* 0x98 - 0x9F */
-	0, 0, SrcImm | Src2Imm16 | No64, 0,
+	0, 0, SrcImmFAddr | No64, 0,
 	ImplicitOps | Stack, ImplicitOps | Stack, 0, 0,
 	ImplicitOps | Stack, ImplicitOps | Stack, 0, 0,
 	/* 0xA0 - 0xA7 */
 	/* 0xA0 - 0xA7 */
-	ByteOp | DstReg | SrcMem | Mov | MemAbs, DstReg | SrcMem | Mov | MemAbs,
-	ByteOp | DstMem | SrcReg | Mov | MemAbs, DstMem | SrcReg | Mov | MemAbs,
+	ByteOp | DstAcc | SrcMem | Mov | MemAbs, DstAcc | SrcMem | Mov | MemAbs,
+	ByteOp | DstMem | SrcAcc | Mov | MemAbs, DstMem | SrcAcc | Mov | MemAbs,
 	ByteOp | SrcSI | DstDI | Mov | String, SrcSI | DstDI | Mov | String,
 	ByteOp | SrcSI | DstDI | Mov | String, SrcSI | DstDI | Mov | String,
 	ByteOp | SrcSI | DstDI | String, SrcSI | DstDI | String,
 	ByteOp | SrcSI | DstDI | String, SrcSI | DstDI | String,
 	/* 0xA8 - 0xAF */
 	/* 0xA8 - 0xAF */
-	0, 0, ByteOp | DstDI | Mov | String, DstDI | Mov | String,
+	DstAcc | SrcImmByte | ByteOp, DstAcc | SrcImm, ByteOp | DstDI | Mov | String, DstDI | Mov | String,
 	ByteOp | SrcSI | DstAcc | Mov | String, SrcSI | DstAcc | Mov | String,
 	ByteOp | SrcSI | DstAcc | Mov | String, SrcSI | DstAcc | Mov | String,
 	ByteOp | DstDI | String, DstDI | String,
 	ByteOp | DstDI | String, DstDI | String,
 	/* 0xB0 - 0xB7 */
 	/* 0xB0 - 0xB7 */
@@ -215,7 +215,7 @@ static u32 opcode_table[256] = {
 	ByteOp | SrcImmUByte | DstAcc, SrcImmUByte | DstAcc,
 	ByteOp | SrcImmUByte | DstAcc, SrcImmUByte | DstAcc,
 	/* 0xE8 - 0xEF */
 	/* 0xE8 - 0xEF */
 	SrcImm | Stack, SrcImm | ImplicitOps,
 	SrcImm | Stack, SrcImm | ImplicitOps,
-	SrcImmU | Src2Imm16 | No64, SrcImmByte | ImplicitOps,
+	SrcImmFAddr | No64, SrcImmByte | ImplicitOps,
 	SrcNone | ByteOp | DstAcc, SrcNone | DstAcc,
 	SrcNone | ByteOp | DstAcc, SrcNone | DstAcc,
 	SrcNone | ByteOp | DstAcc, SrcNone | DstAcc,
 	SrcNone | ByteOp | DstAcc, SrcNone | DstAcc,
 	/* 0xF0 - 0xF7 */
 	/* 0xF0 - 0xF7 */
@@ -337,20 +337,20 @@ static u32 group_table[] = {
 	[Group1A*8] =
 	[Group1A*8] =
 	DstMem | SrcNone | ModRM | Mov | Stack, 0, 0, 0, 0, 0, 0, 0,
 	DstMem | SrcNone | ModRM | Mov | Stack, 0, 0, 0, 0, 0, 0, 0,
 	[Group3_Byte*8] =
 	[Group3_Byte*8] =
-	ByteOp | SrcImm | DstMem | ModRM, 0,
+	ByteOp | SrcImm | DstMem | ModRM, ByteOp | SrcImm | DstMem | ModRM,
 	ByteOp | DstMem | SrcNone | ModRM, ByteOp | DstMem | SrcNone | ModRM,
 	ByteOp | DstMem | SrcNone | ModRM, ByteOp | DstMem | SrcNone | ModRM,
 	0, 0, 0, 0,
 	0, 0, 0, 0,
 	[Group3*8] =
 	[Group3*8] =
-	DstMem | SrcImm | ModRM, 0,
+	DstMem | SrcImm | ModRM, DstMem | SrcImm | ModRM,
 	DstMem | SrcNone | ModRM, DstMem | SrcNone | ModRM,
 	DstMem | SrcNone | ModRM, DstMem | SrcNone | ModRM,
 	0, 0, 0, 0,
 	0, 0, 0, 0,
 	[Group4*8] =
 	[Group4*8] =
-	ByteOp | DstMem | SrcNone | ModRM, ByteOp | DstMem | SrcNone | ModRM,
+	ByteOp | DstMem | SrcNone | ModRM | Lock, ByteOp | DstMem | SrcNone | ModRM | Lock,
 	0, 0, 0, 0, 0, 0,
 	0, 0, 0, 0, 0, 0,
 	[Group5*8] =
 	[Group5*8] =
-	DstMem | SrcNone | ModRM, DstMem | SrcNone | ModRM,
+	DstMem | SrcNone | ModRM | Lock, DstMem | SrcNone | ModRM | Lock,
 	SrcMem | ModRM | Stack, 0,
 	SrcMem | ModRM | Stack, 0,
-	SrcMem | ModRM | Stack, SrcMem | ModRM | Src2Mem16 | ImplicitOps,
+	SrcMem | ModRM | Stack, SrcMemFAddr | ModRM | ImplicitOps,
 	SrcMem | ModRM | Stack, 0,
 	SrcMem | ModRM | Stack, 0,
 	[Group7*8] =
 	[Group7*8] =
 	0, 0, ModRM | SrcMem | Priv, ModRM | SrcMem | Priv,
 	0, 0, ModRM | SrcMem | Priv, ModRM | SrcMem | Priv,
@@ -576,6 +576,13 @@ static u32 group2_table[] = {
 	(_type)_x;							\
 	(_type)_x;							\
 })
 })
 
 
+#define insn_fetch_arr(_arr, _size, _eip)                                \
+({	rc = do_insn_fetch(ctxt, ops, (_eip), _arr, (_size));		\
+	if (rc != X86EMUL_CONTINUE)					\
+		goto done;						\
+	(_eip) += (_size);						\
+})
+
 static inline unsigned long ad_mask(struct decode_cache *c)
 static inline unsigned long ad_mask(struct decode_cache *c)
 {
 {
 	return (1UL << (c->ad_bytes << 3)) - 1;
 	return (1UL << (c->ad_bytes << 3)) - 1;
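The existing insn_fetch() macro casts the fetched bytes to a scalar type, which cannot express a far immediate (offset plus 16-bit selector); the new insn_fetch_arr() copies an arbitrary number of instruction bytes into a caller-supplied buffer instead. A standalone sketch of the difference, with fetch_bytes() standing in for do_insn_fetch() and a made-up instruction stream.

/* Standalone sketch: scalar fetch vs. array fetch of instruction bytes. */
#include <stdint.h>
#include <string.h>
#include <stdio.h>

static const uint8_t stream[] = { 0x78, 0x56, 0x34, 0x12, 0x08, 0x00 };

static void fetch_bytes(unsigned long *eip, void *dest, unsigned size)
{
	memcpy(dest, stream + *eip, size);	/* no bounds checks in this toy */
	*eip += size;
}

int main(void)
{
	unsigned long eip = 0;
	uint32_t imm, off;
	uint16_t sel;
	uint8_t faddr[6];			/* 32-bit offset + 16-bit selector */

	fetch_bytes(&eip, &imm, sizeof(imm));	/* scalar fetch: one value */
	printf("imm32    = 0x%x\n", imm);

	eip = 0;
	fetch_bytes(&eip, faddr, sizeof(faddr));	/* what insn_fetch_arr() enables */
	memcpy(&off, faddr, 4);
	memcpy(&sel, faddr + 4, 2);
	printf("far addr = 0x%x:0x%x\n", sel, off);
	return 0;
}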
@@ -617,31 +624,66 @@ static void set_seg_override(struct decode_cache *c, int seg)
 	c->seg_override = seg;
 	c->seg_override = seg;
 }
 }
 
 
-static unsigned long seg_base(struct x86_emulate_ctxt *ctxt, int seg)
+static unsigned long seg_base(struct x86_emulate_ctxt *ctxt,
+			      struct x86_emulate_ops *ops, int seg)
 {
 {
 	if (ctxt->mode == X86EMUL_MODE_PROT64 && seg < VCPU_SREG_FS)
 	if (ctxt->mode == X86EMUL_MODE_PROT64 && seg < VCPU_SREG_FS)
 		return 0;
 		return 0;
 
 
-	return kvm_x86_ops->get_segment_base(ctxt->vcpu, seg);
+	return ops->get_cached_segment_base(seg, ctxt->vcpu);
 }
 }
 
 
 static unsigned long seg_override_base(struct x86_emulate_ctxt *ctxt,
 static unsigned long seg_override_base(struct x86_emulate_ctxt *ctxt,
+				       struct x86_emulate_ops *ops,
 				       struct decode_cache *c)
 				       struct decode_cache *c)
 {
 {
 	if (!c->has_seg_override)
 	if (!c->has_seg_override)
 		return 0;
 		return 0;
 
 
-	return seg_base(ctxt, c->seg_override);
+	return seg_base(ctxt, ops, c->seg_override);
+}
+
+static unsigned long es_base(struct x86_emulate_ctxt *ctxt,
+			     struct x86_emulate_ops *ops)
+{
+	return seg_base(ctxt, ops, VCPU_SREG_ES);
+}
+
+static unsigned long ss_base(struct x86_emulate_ctxt *ctxt,
+			     struct x86_emulate_ops *ops)
+{
+	return seg_base(ctxt, ops, VCPU_SREG_SS);
+}
+
+static void emulate_exception(struct x86_emulate_ctxt *ctxt, int vec,
+				      u32 error, bool valid)
+{
+	ctxt->exception = vec;
+	ctxt->error_code = error;
+	ctxt->error_code_valid = valid;
+	ctxt->restart = false;
+}
+
+static void emulate_gp(struct x86_emulate_ctxt *ctxt, int err)
+{
+	emulate_exception(ctxt, GP_VECTOR, err, true);
 }
 }
 
 
-static unsigned long es_base(struct x86_emulate_ctxt *ctxt)
+static void emulate_pf(struct x86_emulate_ctxt *ctxt, unsigned long addr,
+		       int err)
 {
 {
-	return seg_base(ctxt, VCPU_SREG_ES);
+	ctxt->cr2 = addr;
+	emulate_exception(ctxt, PF_VECTOR, err, true);
 }
 }
 
 
-static unsigned long ss_base(struct x86_emulate_ctxt *ctxt)
+static void emulate_ud(struct x86_emulate_ctxt *ctxt)
 {
 {
-	return seg_base(ctxt, VCPU_SREG_SS);
+	emulate_exception(ctxt, UD_VECTOR, 0, false);
+}
+
+static void emulate_ts(struct x86_emulate_ctxt *ctxt, int err)
+{
+	emulate_exception(ctxt, TS_VECTOR, err, true);
 }
 }
 
 
 static int do_fetch_insn_byte(struct x86_emulate_ctxt *ctxt,
 static int do_fetch_insn_byte(struct x86_emulate_ctxt *ctxt,
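The new emulate_*() helpers stop the emulator from calling kvm_queue_exception()/kvm_inject_gp() directly: the pending fault is recorded in the emulation context and the caller injects it afterwards. A standalone sketch of that record-then-inject split; the struct and the inject step are stand-ins, not the real ctxt fields or x86.c code.

/* Standalone sketch of the record-then-inject pattern used above. */
#include <stdbool.h>
#include <stdio.h>

#define GP_VECTOR 13

struct toy_ctxt {
	int exception;
	unsigned error_code;
	bool error_code_valid;
	bool restart;
};

static void toy_emulate_exception(struct toy_ctxt *c, int vec,
				  unsigned err, bool valid)
{
	c->exception = vec;		/* remember the fault ... */
	c->error_code = err;
	c->error_code_valid = valid;
	c->restart = false;		/* ... and abandon any restartable insn */
}

static void toy_emulate_gp(struct toy_ctxt *c, unsigned err)
{
	toy_emulate_exception(c, GP_VECTOR, err, true);
}

int main(void)
{
	struct toy_ctxt c = { .exception = -1 };

	toy_emulate_gp(&c, 0);		/* emulator side: just record */
	if (c.exception >= 0)		/* caller side: inject into the guest */
		printf("inject vector %d, error code %u (valid=%d)\n",
		       c.exception, c.error_code, c.error_code_valid);
	return 0;
}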
@@ -932,12 +974,9 @@ x86_decode_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
 	/* we cannot decode insn before we complete previous rep insn */
 	/* we cannot decode insn before we complete previous rep insn */
 	WARN_ON(ctxt->restart);
 	WARN_ON(ctxt->restart);
 
 
-	/* Shadow copy of register state. Committed on successful emulation. */
-	memset(c, 0, sizeof(struct decode_cache));
 	c->eip = ctxt->eip;
 	c->eip = ctxt->eip;
 	c->fetch.start = c->fetch.end = c->eip;
 	c->fetch.start = c->fetch.end = c->eip;
-	ctxt->cs_base = seg_base(ctxt, VCPU_SREG_CS);
-	memcpy(c->regs, ctxt->vcpu->arch.regs, sizeof c->regs);
+	ctxt->cs_base = seg_base(ctxt, ops, VCPU_SREG_CS);
 
 
 	switch (mode) {
 	switch (mode) {
 	case X86EMUL_MODE_REAL:
 	case X86EMUL_MODE_REAL:
@@ -1060,7 +1099,7 @@ done_prefixes:
 		set_seg_override(c, VCPU_SREG_DS);
 		set_seg_override(c, VCPU_SREG_DS);
 
 
 	if (!(!c->twobyte && c->b == 0x8d))
 	if (!(!c->twobyte && c->b == 0x8d))
-		c->modrm_ea += seg_override_base(ctxt, c);
+		c->modrm_ea += seg_override_base(ctxt, ops, c);
 
 
 	if (c->ad_bytes != 8)
 	if (c->ad_bytes != 8)
 		c->modrm_ea = (u32)c->modrm_ea;
 		c->modrm_ea = (u32)c->modrm_ea;
@@ -1148,6 +1187,25 @@ done_prefixes:
 		else
 		else
 			c->src.val = insn_fetch(u8, 1, c->eip);
 			c->src.val = insn_fetch(u8, 1, c->eip);
 		break;
 		break;
+	case SrcAcc:
+		c->src.type = OP_REG;
+		c->src.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
+		c->src.ptr = &c->regs[VCPU_REGS_RAX];
+		switch (c->src.bytes) {
+			case 1:
+				c->src.val = *(u8 *)c->src.ptr;
+				break;
+			case 2:
+				c->src.val = *(u16 *)c->src.ptr;
+				break;
+			case 4:
+				c->src.val = *(u32 *)c->src.ptr;
+				break;
+			case 8:
+				c->src.val = *(u64 *)c->src.ptr;
+				break;
+		}
+		break;
 	case SrcOne:
 	case SrcOne:
 		c->src.bytes = 1;
 		c->src.bytes = 1;
 		c->src.val = 1;
 		c->src.val = 1;
@@ -1156,10 +1214,21 @@ done_prefixes:
 		c->src.type = OP_MEM;
 		c->src.type = OP_MEM;
 		c->src.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
 		c->src.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
 		c->src.ptr = (unsigned long *)
 		c->src.ptr = (unsigned long *)
-			register_address(c,  seg_override_base(ctxt, c),
+			register_address(c,  seg_override_base(ctxt, ops, c),
 					 c->regs[VCPU_REGS_RSI]);
 					 c->regs[VCPU_REGS_RSI]);
 		c->src.val = 0;
 		c->src.val = 0;
 		break;
 		break;
+	case SrcImmFAddr:
+		c->src.type = OP_IMM;
+		c->src.ptr = (unsigned long *)c->eip;
+		c->src.bytes = c->op_bytes + 2;
+		insn_fetch_arr(c->src.valptr, c->src.bytes, c->eip);
+		break;
+	case SrcMemFAddr:
+		c->src.type = OP_MEM;
+		c->src.ptr = (unsigned long *)c->modrm_ea;
+		c->src.bytes = c->op_bytes + 2;
+		break;
 	}
 	}
 
 
 	/*
 	/*
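The new SrcAcc case reads the accumulator at the width chosen by ByteOp/op_bytes. A standalone sketch of that width dispatch; rax stands in for c->regs[VCPU_REGS_RAX] and the sample value is made up. On little-endian x86 the pointer casts pick up the low bytes, matching the decode above.

/* Standalone sketch of the SrcAcc width dispatch. */
#include <stdint.h>
#include <stdio.h>

static uint64_t read_acc(const uint64_t *rax, unsigned bytes)
{
	switch (bytes) {
	case 1: return *(const uint8_t  *)rax;	/* AL  */
	case 2: return *(const uint16_t *)rax;	/* AX  */
	case 4: return *(const uint32_t *)rax;	/* EAX */
	default: return *rax;			/* RAX */
	}
}

int main(void)
{
	uint64_t rax = 0x1122334455667788ull;

	printf("1: %#llx 2: %#llx 4: %#llx 8: %#llx\n",
	       (unsigned long long)read_acc(&rax, 1),
	       (unsigned long long)read_acc(&rax, 2),
	       (unsigned long long)read_acc(&rax, 4),
	       (unsigned long long)read_acc(&rax, 8));
	return 0;
}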
@@ -1179,22 +1248,10 @@ done_prefixes:
 		c->src2.bytes = 1;
 		c->src2.bytes = 1;
 		c->src2.val = insn_fetch(u8, 1, c->eip);
 		c->src2.val = insn_fetch(u8, 1, c->eip);
 		break;
 		break;
-	case Src2Imm16:
-		c->src2.type = OP_IMM;
-		c->src2.ptr = (unsigned long *)c->eip;
-		c->src2.bytes = 2;
-		c->src2.val = insn_fetch(u16, 2, c->eip);
-		break;
 	case Src2One:
 	case Src2One:
 		c->src2.bytes = 1;
 		c->src2.bytes = 1;
 		c->src2.val = 1;
 		c->src2.val = 1;
 		break;
 		break;
-	case Src2Mem16:
-		c->src2.type = OP_MEM;
-		c->src2.bytes = 2;
-		c->src2.ptr = (unsigned long *)(c->modrm_ea + c->src.bytes);
-		c->src2.val = 0;
-		break;
 	}
 	}
 
 
 	/* Decode and fetch the destination operand: register or memory. */
 	/* Decode and fetch the destination operand: register or memory. */
@@ -1253,7 +1310,7 @@ done_prefixes:
 		c->dst.type = OP_MEM;
 		c->dst.type = OP_MEM;
 		c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
 		c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
 		c->dst.ptr = (unsigned long *)
 		c->dst.ptr = (unsigned long *)
-			register_address(c, es_base(ctxt),
+			register_address(c, es_base(ctxt, ops),
 					 c->regs[VCPU_REGS_RDI]);
 					 c->regs[VCPU_REGS_RDI]);
 		c->dst.val = 0;
 		c->dst.val = 0;
 		break;
 		break;
@@ -1263,6 +1320,37 @@ done:
 	return (rc == X86EMUL_UNHANDLEABLE) ? -1 : 0;
 	return (rc == X86EMUL_UNHANDLEABLE) ? -1 : 0;
 }
 }
 
 
+static int read_emulated(struct x86_emulate_ctxt *ctxt,
+			 struct x86_emulate_ops *ops,
+			 unsigned long addr, void *dest, unsigned size)
+{
+	int rc;
+	struct read_cache *mc = &ctxt->decode.mem_read;
+	u32 err;
+
+	while (size) {
+		int n = min(size, 8u);
+		size -= n;
+		if (mc->pos < mc->end)
+			goto read_cached;
+
+		rc = ops->read_emulated(addr, mc->data + mc->end, n, &err,
+					ctxt->vcpu);
+		if (rc == X86EMUL_PROPAGATE_FAULT)
+			emulate_pf(ctxt, addr, err);
+		if (rc != X86EMUL_CONTINUE)
+			return rc;
+		mc->end += n;
+
+	read_cached:
+		memcpy(dest, mc->data + mc->pos, n);
+		mc->pos += n;
+		dest += n;
+		addr += n;
+	}
+	return X86EMUL_CONTINUE;
+}
+
 static int pio_in_emulated(struct x86_emulate_ctxt *ctxt,
 static int pio_in_emulated(struct x86_emulate_ctxt *ctxt,
 			   struct x86_emulate_ops *ops,
 			   struct x86_emulate_ops *ops,
 			   unsigned int size, unsigned short port,
 			   unsigned int size, unsigned short port,
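read_emulated() above splits a guest memory read into chunks of at most 8 bytes and keeps them in ctxt->decode.mem_read, so that when an instruction has to be restarted the bytes already read are replayed from the cache instead of being fetched again. A standalone sketch of that chunk-and-replay logic; backing_read() stands in for ops->read_emulated and never faults here.

/* Standalone sketch of the chunked, replayable read used by read_emulated(). */
#include <stdint.h>
#include <string.h>
#include <stdio.h>

struct read_cache {
	uint8_t data[1024];
	unsigned pos, end;
};

static void backing_read(unsigned long addr, void *dst, unsigned n)
{
	for (unsigned i = 0; i < n; i++)	/* fake guest memory */
		((uint8_t *)dst)[i] = (uint8_t)(addr + i);
}

static void cached_read(struct read_cache *mc, unsigned long addr,
			void *dest, unsigned size)
{
	while (size) {
		unsigned n = size < 8u ? size : 8u;	/* min(size, 8u) */
		size -= n;
		if (mc->pos >= mc->end) {		/* miss: fill n more bytes */
			backing_read(addr, mc->data + mc->end, n);
			mc->end += n;
		}
		memcpy(dest, mc->data + mc->pos, n);	/* hit or freshly filled */
		mc->pos += n;
		dest = (uint8_t *)dest + n;
		addr += n;
	}
}

int main(void)
{
	struct read_cache mc = { .pos = 0, .end = 0 };
	uint8_t buf[12];

	cached_read(&mc, 0x100, buf, sizeof(buf));	/* first pass fills the cache */
	mc.pos = 0;					/* restart: replay, no re-read */
	cached_read(&mc, 0x100, buf, sizeof(buf));
	printf("buf[0]=%#x buf[11]=%#x cached=%u bytes\n", buf[0], buf[11], mc.end);
	return 0;
}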
@@ -1330,13 +1418,13 @@ static int read_segment_descriptor(struct x86_emulate_ctxt *ctxt,
 	get_descriptor_table_ptr(ctxt, ops, selector, &dt);
 	get_descriptor_table_ptr(ctxt, ops, selector, &dt);
 
 
 	if (dt.size < index * 8 + 7) {
 	if (dt.size < index * 8 + 7) {
-		kvm_inject_gp(ctxt->vcpu, selector & 0xfffc);
+		emulate_gp(ctxt, selector & 0xfffc);
 		return X86EMUL_PROPAGATE_FAULT;
 		return X86EMUL_PROPAGATE_FAULT;
 	}
 	}
 	addr = dt.address + index * 8;
 	addr = dt.address + index * 8;
 	ret = ops->read_std(addr, desc, sizeof *desc, ctxt->vcpu,  &err);
 	ret = ops->read_std(addr, desc, sizeof *desc, ctxt->vcpu,  &err);
 	if (ret == X86EMUL_PROPAGATE_FAULT)
 	if (ret == X86EMUL_PROPAGATE_FAULT)
-		kvm_inject_page_fault(ctxt->vcpu, addr, err);
+		emulate_pf(ctxt, addr, err);
 
 
        return ret;
        return ret;
 }
 }
@@ -1355,14 +1443,14 @@ static int write_segment_descriptor(struct x86_emulate_ctxt *ctxt,
 	get_descriptor_table_ptr(ctxt, ops, selector, &dt);
 	get_descriptor_table_ptr(ctxt, ops, selector, &dt);
 
 
 	if (dt.size < index * 8 + 7) {
 	if (dt.size < index * 8 + 7) {
-		kvm_inject_gp(ctxt->vcpu, selector & 0xfffc);
+		emulate_gp(ctxt, selector & 0xfffc);
 		return X86EMUL_PROPAGATE_FAULT;
 		return X86EMUL_PROPAGATE_FAULT;
 	}
 	}
 
 
 	addr = dt.address + index * 8;
 	addr = dt.address + index * 8;
 	ret = ops->write_std(addr, desc, sizeof *desc, ctxt->vcpu, &err);
 	ret = ops->write_std(addr, desc, sizeof *desc, ctxt->vcpu, &err);
 	if (ret == X86EMUL_PROPAGATE_FAULT)
 	if (ret == X86EMUL_PROPAGATE_FAULT)
-		kvm_inject_page_fault(ctxt->vcpu, addr, err);
+		emulate_pf(ctxt, addr, err);
 
 
 	return ret;
 	return ret;
 }
 }
@@ -1481,11 +1569,70 @@ load:
 	ops->set_cached_descriptor(&seg_desc, seg, ctxt->vcpu);
 	ops->set_cached_descriptor(&seg_desc, seg, ctxt->vcpu);
 	return X86EMUL_CONTINUE;
 	return X86EMUL_CONTINUE;
 exception:
 exception:
-	kvm_queue_exception_e(ctxt->vcpu, err_vec, err_code);
+	emulate_exception(ctxt, err_vec, err_code, true);
 	return X86EMUL_PROPAGATE_FAULT;
 	return X86EMUL_PROPAGATE_FAULT;
 }
 }
 
 
-static inline void emulate_push(struct x86_emulate_ctxt *ctxt)
+static inline int writeback(struct x86_emulate_ctxt *ctxt,
+			    struct x86_emulate_ops *ops)
+{
+	int rc;
+	struct decode_cache *c = &ctxt->decode;
+	u32 err;
+
+	switch (c->dst.type) {
+	case OP_REG:
+		/* The 4-byte case *is* correct:
+		 * in 64-bit mode we zero-extend.
+		 */
+		switch (c->dst.bytes) {
+		case 1:
+			*(u8 *)c->dst.ptr = (u8)c->dst.val;
+			break;
+		case 2:
+			*(u16 *)c->dst.ptr = (u16)c->dst.val;
+			break;
+		case 4:
+			*c->dst.ptr = (u32)c->dst.val;
+			break;	/* 64b: zero-ext */
+		case 8:
+			*c->dst.ptr = c->dst.val;
+			break;
+		}
+		break;
+	case OP_MEM:
+		if (c->lock_prefix)
+			rc = ops->cmpxchg_emulated(
+					(unsigned long)c->dst.ptr,
+					&c->dst.orig_val,
+					&c->dst.val,
+					c->dst.bytes,
+					&err,
+					ctxt->vcpu);
+		else
+			rc = ops->write_emulated(
+					(unsigned long)c->dst.ptr,
+					&c->dst.val,
+					c->dst.bytes,
+					&err,
+					ctxt->vcpu);
+		if (rc == X86EMUL_PROPAGATE_FAULT)
+			emulate_pf(ctxt,
+					      (unsigned long)c->dst.ptr, err);
+		if (rc != X86EMUL_CONTINUE)
+			return rc;
+		break;
+	case OP_NONE:
+		/* no writeback */
+		break;
+	default:
+		break;
+	}
+	return X86EMUL_CONTINUE;
+}
+
+static inline void emulate_push(struct x86_emulate_ctxt *ctxt,
+				struct x86_emulate_ops *ops)
 {
 {
 	struct decode_cache *c = &ctxt->decode;
 	struct decode_cache *c = &ctxt->decode;
 
 
@@ -1493,7 +1640,7 @@ static inline void emulate_push(struct x86_emulate_ctxt *ctxt)
 	c->dst.bytes = c->op_bytes;
 	c->dst.bytes = c->op_bytes;
 	c->dst.val = c->src.val;
 	c->dst.val = c->src.val;
 	register_address_increment(c, &c->regs[VCPU_REGS_RSP], -c->op_bytes);
 	register_address_increment(c, &c->regs[VCPU_REGS_RSP], -c->op_bytes);
-	c->dst.ptr = (void *) register_address(c, ss_base(ctxt),
+	c->dst.ptr = (void *) register_address(c, ss_base(ctxt, ops),
 					       c->regs[VCPU_REGS_RSP]);
 					       c->regs[VCPU_REGS_RSP]);
 }
 }
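The OP_REG arm of the writeback() helper moved above keeps the long-standing width rule: 4-byte register writes go through the full unsigned long pointer because 32-bit destinations are zero-extended on x86-64, while 1- and 2-byte writes leave the upper bytes untouched. A standalone sketch of the difference; slot stands in for one entry of c->regs[].

/* Standalone sketch of the register-writeback width rules. */
#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint64_t slot = 0xffffffffffffffffull;

	*(uint16_t *)&slot = 0x1234;		/* like "mov ax, ...": low bytes only */
	printf("16-bit write: %#llx\n", (unsigned long long)slot);

	slot = 0xffffffffffffffffull;
	slot = (uint32_t)0x1234;		/* like "mov eax, ...": zero-extends */
	printf("32-bit write: %#llx\n", (unsigned long long)slot);
	return 0;
}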
 
 
@@ -1504,9 +1651,9 @@ static int emulate_pop(struct x86_emulate_ctxt *ctxt,
 	struct decode_cache *c = &ctxt->decode;
 	struct decode_cache *c = &ctxt->decode;
 	int rc;
 	int rc;
 
 
-	rc = ops->read_emulated(register_address(c, ss_base(ctxt),
-						 c->regs[VCPU_REGS_RSP]),
-				dest, len, ctxt->vcpu);
+	rc = read_emulated(ctxt, ops, register_address(c, ss_base(ctxt, ops),
+						       c->regs[VCPU_REGS_RSP]),
+			   dest, len);
 	if (rc != X86EMUL_CONTINUE)
 	if (rc != X86EMUL_CONTINUE)
 		return rc;
 		return rc;
 
 
@@ -1541,7 +1688,7 @@ static int emulate_popf(struct x86_emulate_ctxt *ctxt,
 		break;
 		break;
 	case X86EMUL_MODE_VM86:
 	case X86EMUL_MODE_VM86:
 		if (iopl < 3) {
 		if (iopl < 3) {
-			kvm_inject_gp(ctxt->vcpu, 0);
+			emulate_gp(ctxt, 0);
 			return X86EMUL_PROPAGATE_FAULT;
 			return X86EMUL_PROPAGATE_FAULT;
 		}
 		}
 		change_mask |= EFLG_IF;
 		change_mask |= EFLG_IF;
@@ -1557,15 +1704,14 @@ static int emulate_popf(struct x86_emulate_ctxt *ctxt,
 	return rc;
 	return rc;
 }
 }
 
 
-static void emulate_push_sreg(struct x86_emulate_ctxt *ctxt, int seg)
+static void emulate_push_sreg(struct x86_emulate_ctxt *ctxt,
+			      struct x86_emulate_ops *ops, int seg)
 {
 {
 	struct decode_cache *c = &ctxt->decode;
 	struct decode_cache *c = &ctxt->decode;
-	struct kvm_segment segment;
 
 
-	kvm_x86_ops->get_segment(ctxt->vcpu, &segment, seg);
+	c->src.val = ops->get_segment_selector(seg, ctxt->vcpu);
 
 
-	c->src.val = segment.selector;
-	emulate_push(ctxt);
+	emulate_push(ctxt, ops);
 }
 }
 
 
 static int emulate_pop_sreg(struct x86_emulate_ctxt *ctxt,
 static int emulate_pop_sreg(struct x86_emulate_ctxt *ctxt,
@@ -1583,19 +1729,31 @@ static int emulate_pop_sreg(struct x86_emulate_ctxt *ctxt,
 	return rc;
 	return rc;
 }
 }
 
 
-static void emulate_pusha(struct x86_emulate_ctxt *ctxt)
+static int emulate_pusha(struct x86_emulate_ctxt *ctxt,
+			  struct x86_emulate_ops *ops)
 {
 {
 	struct decode_cache *c = &ctxt->decode;
 	struct decode_cache *c = &ctxt->decode;
 	unsigned long old_esp = c->regs[VCPU_REGS_RSP];
 	unsigned long old_esp = c->regs[VCPU_REGS_RSP];
+	int rc = X86EMUL_CONTINUE;
 	int reg = VCPU_REGS_RAX;
 	int reg = VCPU_REGS_RAX;
 
 
 	while (reg <= VCPU_REGS_RDI) {
 	while (reg <= VCPU_REGS_RDI) {
 		(reg == VCPU_REGS_RSP) ?
 		(reg == VCPU_REGS_RSP) ?
 		(c->src.val = old_esp) : (c->src.val = c->regs[reg]);
 		(c->src.val = old_esp) : (c->src.val = c->regs[reg]);
 
 
-		emulate_push(ctxt);
+		emulate_push(ctxt, ops);
+
+		rc = writeback(ctxt, ops);
+		if (rc != X86EMUL_CONTINUE)
+			return rc;
+
 		++reg;
 		++reg;
 	}
 	}
+
+	/* Disable writeback. */
+	c->dst.type = OP_NONE;
+
+	return rc;
 }
 }
 
 
 static int emulate_popa(struct x86_emulate_ctxt *ctxt,
 static int emulate_popa(struct x86_emulate_ctxt *ctxt,
@@ -1695,14 +1853,14 @@ static inline int emulate_grp45(struct x86_emulate_ctxt *ctxt,
 		old_eip = c->eip;
 		old_eip = c->eip;
 		c->eip = c->src.val;
 		c->eip = c->src.val;
 		c->src.val = old_eip;
 		c->src.val = old_eip;
-		emulate_push(ctxt);
+		emulate_push(ctxt, ops);
 		break;
 		break;
 	}
 	}
 	case 4: /* jmp abs */
 	case 4: /* jmp abs */
 		c->eip = c->src.val;
 		c->eip = c->src.val;
 		break;
 		break;
 	case 6:	/* push */
 	case 6:	/* push */
-		emulate_push(ctxt);
+		emulate_push(ctxt, ops);
 		break;
 		break;
 	}
 	}
 	return X86EMUL_CONTINUE;
 	return X86EMUL_CONTINUE;
@@ -1748,145 +1906,82 @@ static int emulate_ret_far(struct x86_emulate_ctxt *ctxt,
 	return rc;
 	return rc;
 }
 }
 
 
-static inline int writeback(struct x86_emulate_ctxt *ctxt,
-			    struct x86_emulate_ops *ops)
-{
-	int rc;
-	struct decode_cache *c = &ctxt->decode;
-
-	switch (c->dst.type) {
-	case OP_REG:
-		/* The 4-byte case *is* correct:
-		 * in 64-bit mode we zero-extend.
-		 */
-		switch (c->dst.bytes) {
-		case 1:
-			*(u8 *)c->dst.ptr = (u8)c->dst.val;
-			break;
-		case 2:
-			*(u16 *)c->dst.ptr = (u16)c->dst.val;
-			break;
-		case 4:
-			*c->dst.ptr = (u32)c->dst.val;
-			break;	/* 64b: zero-ext */
-		case 8:
-			*c->dst.ptr = c->dst.val;
-			break;
-		}
-		break;
-	case OP_MEM:
-		if (c->lock_prefix)
-			rc = ops->cmpxchg_emulated(
-					(unsigned long)c->dst.ptr,
-					&c->dst.orig_val,
-					&c->dst.val,
-					c->dst.bytes,
-					ctxt->vcpu);
-		else
-			rc = ops->write_emulated(
-					(unsigned long)c->dst.ptr,
-					&c->dst.val,
-					c->dst.bytes,
-					ctxt->vcpu);
-		if (rc != X86EMUL_CONTINUE)
-			return rc;
-		break;
-	case OP_NONE:
-		/* no writeback */
-		break;
-	default:
-		break;
-	}
-	return X86EMUL_CONTINUE;
-}
-
-static void toggle_interruptibility(struct x86_emulate_ctxt *ctxt, u32 mask)
-{
-	u32 int_shadow = kvm_x86_ops->get_interrupt_shadow(ctxt->vcpu, mask);
-	/*
-	 * an sti; sti; sequence only disable interrupts for the first
-	 * instruction. So, if the last instruction, be it emulated or
-	 * not, left the system with the INT_STI flag enabled, it
-	 * means that the last instruction is an sti. We should not
-	 * leave the flag on in this case. The same goes for mov ss
-	 */
-	if (!(int_shadow & mask))
-		ctxt->interruptibility = mask;
-}
-
 static inline void
 static inline void
 setup_syscalls_segments(struct x86_emulate_ctxt *ctxt,
 setup_syscalls_segments(struct x86_emulate_ctxt *ctxt,
-	struct kvm_segment *cs, struct kvm_segment *ss)
+			struct x86_emulate_ops *ops, struct desc_struct *cs,
+			struct desc_struct *ss)
 {
 {
-	memset(cs, 0, sizeof(struct kvm_segment));
-	kvm_x86_ops->get_segment(ctxt->vcpu, cs, VCPU_SREG_CS);
-	memset(ss, 0, sizeof(struct kvm_segment));
+	memset(cs, 0, sizeof(struct desc_struct));
+	ops->get_cached_descriptor(cs, VCPU_SREG_CS, ctxt->vcpu);
+	memset(ss, 0, sizeof(struct desc_struct));
 
 
 	cs->l = 0;		/* will be adjusted later */
 	cs->l = 0;		/* will be adjusted later */
-	cs->base = 0;		/* flat segment */
+	set_desc_base(cs, 0);	/* flat segment */
 	cs->g = 1;		/* 4kb granularity */
 	cs->g = 1;		/* 4kb granularity */
-	cs->limit = 0xffffffff;	/* 4GB limit */
+	set_desc_limit(cs, 0xfffff);	/* 4GB limit */
 	cs->type = 0x0b;	/* Read, Execute, Accessed */
 	cs->type = 0x0b;	/* Read, Execute, Accessed */
 	cs->s = 1;
 	cs->s = 1;
 	cs->dpl = 0;		/* will be adjusted later */
 	cs->dpl = 0;		/* will be adjusted later */
-	cs->present = 1;
-	cs->db = 1;
+	cs->p = 1;
+	cs->d = 1;
 
 
-	ss->unusable = 0;
-	ss->base = 0;		/* flat segment */
-	ss->limit = 0xffffffff;	/* 4GB limit */
+	set_desc_base(ss, 0);	/* flat segment */
+	set_desc_limit(ss, 0xfffff);	/* 4GB limit */
 	ss->g = 1;		/* 4kb granularity */
 	ss->g = 1;		/* 4kb granularity */
 	ss->s = 1;
 	ss->s = 1;
 	ss->type = 0x03;	/* Read/Write, Accessed */
 	ss->type = 0x03;	/* Read/Write, Accessed */
-	ss->db = 1;		/* 32bit stack segment */
+	ss->d = 1;		/* 32bit stack segment */
 	ss->dpl = 0;
 	ss->dpl = 0;
-	ss->present = 1;
+	ss->p = 1;
 }
 }
 
 
 static int
 static int
-emulate_syscall(struct x86_emulate_ctxt *ctxt)
+emulate_syscall(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
 {
 {
 	struct decode_cache *c = &ctxt->decode;
 	struct decode_cache *c = &ctxt->decode;
-	struct kvm_segment cs, ss;
+	struct desc_struct cs, ss;
 	u64 msr_data;
 	u64 msr_data;
+	u16 cs_sel, ss_sel;
 
 
 	/* syscall is not available in real mode */
 	/* syscall is not available in real mode */
 	if (ctxt->mode == X86EMUL_MODE_REAL ||
 	if (ctxt->mode == X86EMUL_MODE_REAL ||
 	    ctxt->mode == X86EMUL_MODE_VM86) {
 	    ctxt->mode == X86EMUL_MODE_VM86) {
-		kvm_queue_exception(ctxt->vcpu, UD_VECTOR);
+		emulate_ud(ctxt);
 		return X86EMUL_PROPAGATE_FAULT;
 		return X86EMUL_PROPAGATE_FAULT;
 	}
 	}
 
 
-	setup_syscalls_segments(ctxt, &cs, &ss);
+	setup_syscalls_segments(ctxt, ops, &cs, &ss);
 
 
-	kvm_x86_ops->get_msr(ctxt->vcpu, MSR_STAR, &msr_data);
+	ops->get_msr(ctxt->vcpu, MSR_STAR, &msr_data);
 	msr_data >>= 32;
 	msr_data >>= 32;
-	cs.selector = (u16)(msr_data & 0xfffc);
-	ss.selector = (u16)(msr_data + 8);
+	cs_sel = (u16)(msr_data & 0xfffc);
+	ss_sel = (u16)(msr_data + 8);
 
 
 	if (is_long_mode(ctxt->vcpu)) {
 	if (is_long_mode(ctxt->vcpu)) {
-		cs.db = 0;
+		cs.d = 0;
 		cs.l = 1;
 		cs.l = 1;
 	}
 	}
-	kvm_x86_ops->set_segment(ctxt->vcpu, &cs, VCPU_SREG_CS);
-	kvm_x86_ops->set_segment(ctxt->vcpu, &ss, VCPU_SREG_SS);
+	ops->set_cached_descriptor(&cs, VCPU_SREG_CS, ctxt->vcpu);
+	ops->set_segment_selector(cs_sel, VCPU_SREG_CS, ctxt->vcpu);
+	ops->set_cached_descriptor(&ss, VCPU_SREG_SS, ctxt->vcpu);
+	ops->set_segment_selector(ss_sel, VCPU_SREG_SS, ctxt->vcpu);
 
 
 	c->regs[VCPU_REGS_RCX] = c->eip;
 	c->regs[VCPU_REGS_RCX] = c->eip;
 	if (is_long_mode(ctxt->vcpu)) {
 	if (is_long_mode(ctxt->vcpu)) {
 #ifdef CONFIG_X86_64
 #ifdef CONFIG_X86_64
 		c->regs[VCPU_REGS_R11] = ctxt->eflags & ~EFLG_RF;
 		c->regs[VCPU_REGS_R11] = ctxt->eflags & ~EFLG_RF;
 
 
-		kvm_x86_ops->get_msr(ctxt->vcpu,
-			ctxt->mode == X86EMUL_MODE_PROT64 ?
-			MSR_LSTAR : MSR_CSTAR, &msr_data);
+		ops->get_msr(ctxt->vcpu,
+			     ctxt->mode == X86EMUL_MODE_PROT64 ?
+			     MSR_LSTAR : MSR_CSTAR, &msr_data);
 		c->eip = msr_data;
 		c->eip = msr_data;
 
 
-		kvm_x86_ops->get_msr(ctxt->vcpu, MSR_SYSCALL_MASK, &msr_data);
+		ops->get_msr(ctxt->vcpu, MSR_SYSCALL_MASK, &msr_data);
 		ctxt->eflags &= ~(msr_data | EFLG_RF);
 		ctxt->eflags &= ~(msr_data | EFLG_RF);
 #endif
 #endif
 	} else {
 	} else {
 		/* legacy mode */
 		/* legacy mode */
-		kvm_x86_ops->get_msr(ctxt->vcpu, MSR_STAR, &msr_data);
+		ops->get_msr(ctxt->vcpu, MSR_STAR, &msr_data);
 		c->eip = (u32)msr_data;
 		c->eip = (u32)msr_data;
 
 
 		ctxt->eflags &= ~(EFLG_VM | EFLG_IF | EFLG_RF);
 		ctxt->eflags &= ~(EFLG_VM | EFLG_IF | EFLG_RF);
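setup_syscalls_segments() above now fills a struct desc_struct rather than a struct kvm_segment, so the "4GB limit" is expressed as a raw 20-bit limit of 0xfffff plus the granularity bit; the effective byte limit is the raw limit scaled by 4 KiB pages. A standalone sketch of that scaling, the same arithmetic desc_limit_scaled() performs elsewhere in this patch.

/* Standalone sketch: raw 0xfffff with G=1 covers the full 4 GiB, which is
 * why set_desc_limit(cs, 0xfffff) plus cs->g = 1 replaces cs->limit = 0xffffffff. */
#include <stdint.h>
#include <stdio.h>

static uint32_t limit_scaled(uint32_t raw_limit, int g)
{
	return g ? (raw_limit << 12) | 0xfff : raw_limit;
}

int main(void)
{
	printf("raw 0xfffff, G=1 -> %#x\n", limit_scaled(0xfffff, 1)); /* 0xffffffff */
	printf("raw 0x00067, G=0 -> %#x\n", limit_scaled(0x00067, 0)); /* minimal TSS */
	return 0;
}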
@@ -1896,15 +1991,16 @@ emulate_syscall(struct x86_emulate_ctxt *ctxt)
 }
 }
 
 
 static int
 static int
-emulate_sysenter(struct x86_emulate_ctxt *ctxt)
+emulate_sysenter(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
 {
 {
 	struct decode_cache *c = &ctxt->decode;
 	struct decode_cache *c = &ctxt->decode;
-	struct kvm_segment cs, ss;
+	struct desc_struct cs, ss;
 	u64 msr_data;
 	u64 msr_data;
+	u16 cs_sel, ss_sel;
 
 
 	/* inject #GP if in real mode */
 	/* inject #GP if in real mode */
 	if (ctxt->mode == X86EMUL_MODE_REAL) {
 	if (ctxt->mode == X86EMUL_MODE_REAL) {
-		kvm_inject_gp(ctxt->vcpu, 0);
+		emulate_gp(ctxt, 0);
 		return X86EMUL_PROPAGATE_FAULT;
 		return X86EMUL_PROPAGATE_FAULT;
 	}
 	}
 
 
@@ -1912,67 +2008,70 @@ emulate_sysenter(struct x86_emulate_ctxt *ctxt)
 	* Therefore, we inject an #UD.
 	* Therefore, we inject an #UD.
 	*/
 	*/
 	if (ctxt->mode == X86EMUL_MODE_PROT64) {
 	if (ctxt->mode == X86EMUL_MODE_PROT64) {
-		kvm_queue_exception(ctxt->vcpu, UD_VECTOR);
+		emulate_ud(ctxt);
 		return X86EMUL_PROPAGATE_FAULT;
 		return X86EMUL_PROPAGATE_FAULT;
 	}
 	}
 
 
-	setup_syscalls_segments(ctxt, &cs, &ss);
+	setup_syscalls_segments(ctxt, ops, &cs, &ss);
 
 
-	kvm_x86_ops->get_msr(ctxt->vcpu, MSR_IA32_SYSENTER_CS, &msr_data);
+	ops->get_msr(ctxt->vcpu, MSR_IA32_SYSENTER_CS, &msr_data);
 	switch (ctxt->mode) {
 	switch (ctxt->mode) {
 	case X86EMUL_MODE_PROT32:
 	case X86EMUL_MODE_PROT32:
 		if ((msr_data & 0xfffc) == 0x0) {
 		if ((msr_data & 0xfffc) == 0x0) {
-			kvm_inject_gp(ctxt->vcpu, 0);
+			emulate_gp(ctxt, 0);
 			return X86EMUL_PROPAGATE_FAULT;
 			return X86EMUL_PROPAGATE_FAULT;
 		}
 		}
 		break;
 		break;
 	case X86EMUL_MODE_PROT64:
 	case X86EMUL_MODE_PROT64:
 		if (msr_data == 0x0) {
 		if (msr_data == 0x0) {
-			kvm_inject_gp(ctxt->vcpu, 0);
+			emulate_gp(ctxt, 0);
 			return X86EMUL_PROPAGATE_FAULT;
 			return X86EMUL_PROPAGATE_FAULT;
 		}
 		}
 		break;
 		break;
 	}
 	}
 
 
 	ctxt->eflags &= ~(EFLG_VM | EFLG_IF | EFLG_RF);
 	ctxt->eflags &= ~(EFLG_VM | EFLG_IF | EFLG_RF);
-	cs.selector = (u16)msr_data;
-	cs.selector &= ~SELECTOR_RPL_MASK;
-	ss.selector = cs.selector + 8;
-	ss.selector &= ~SELECTOR_RPL_MASK;
+	cs_sel = (u16)msr_data;
+	cs_sel &= ~SELECTOR_RPL_MASK;
+	ss_sel = cs_sel + 8;
+	ss_sel &= ~SELECTOR_RPL_MASK;
 	if (ctxt->mode == X86EMUL_MODE_PROT64
 	if (ctxt->mode == X86EMUL_MODE_PROT64
 		|| is_long_mode(ctxt->vcpu)) {
 		|| is_long_mode(ctxt->vcpu)) {
-		cs.db = 0;
+		cs.d = 0;
 		cs.l = 1;
 		cs.l = 1;
 	}
 	}
 
 
-	kvm_x86_ops->set_segment(ctxt->vcpu, &cs, VCPU_SREG_CS);
-	kvm_x86_ops->set_segment(ctxt->vcpu, &ss, VCPU_SREG_SS);
+	ops->set_cached_descriptor(&cs, VCPU_SREG_CS, ctxt->vcpu);
+	ops->set_segment_selector(cs_sel, VCPU_SREG_CS, ctxt->vcpu);
+	ops->set_cached_descriptor(&ss, VCPU_SREG_SS, ctxt->vcpu);
+	ops->set_segment_selector(ss_sel, VCPU_SREG_SS, ctxt->vcpu);
 
 
-	kvm_x86_ops->get_msr(ctxt->vcpu, MSR_IA32_SYSENTER_EIP, &msr_data);
+	ops->get_msr(ctxt->vcpu, MSR_IA32_SYSENTER_EIP, &msr_data);
 	c->eip = msr_data;
 	c->eip = msr_data;
 
 
-	kvm_x86_ops->get_msr(ctxt->vcpu, MSR_IA32_SYSENTER_ESP, &msr_data);
+	ops->get_msr(ctxt->vcpu, MSR_IA32_SYSENTER_ESP, &msr_data);
 	c->regs[VCPU_REGS_RSP] = msr_data;
 	c->regs[VCPU_REGS_RSP] = msr_data;
 
 
 	return X86EMUL_CONTINUE;
 	return X86EMUL_CONTINUE;
 }
 }
 
 
 static int
 static int
-emulate_sysexit(struct x86_emulate_ctxt *ctxt)
+emulate_sysexit(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
 {
 {
 	struct decode_cache *c = &ctxt->decode;
 	struct decode_cache *c = &ctxt->decode;
-	struct kvm_segment cs, ss;
+	struct desc_struct cs, ss;
 	u64 msr_data;
 	u64 msr_data;
 	int usermode;
 	int usermode;
+	u16 cs_sel, ss_sel;
 
 
 	/* inject #GP if in real mode or Virtual 8086 mode */
 	/* inject #GP if in real mode or Virtual 8086 mode */
 	if (ctxt->mode == X86EMUL_MODE_REAL ||
 	if (ctxt->mode == X86EMUL_MODE_REAL ||
 	    ctxt->mode == X86EMUL_MODE_VM86) {
 	    ctxt->mode == X86EMUL_MODE_VM86) {
-		kvm_inject_gp(ctxt->vcpu, 0);
+		emulate_gp(ctxt, 0);
 		return X86EMUL_PROPAGATE_FAULT;
 		return X86EMUL_PROPAGATE_FAULT;
 	}
 	}
 
 
-	setup_syscalls_segments(ctxt, &cs, &ss);
+	setup_syscalls_segments(ctxt, ops, &cs, &ss);
 
 
 	if ((c->rex_prefix & 0x8) != 0x0)
 	if ((c->rex_prefix & 0x8) != 0x0)
 		usermode = X86EMUL_MODE_PROT64;
 		usermode = X86EMUL_MODE_PROT64;
@@ -1981,35 +2080,37 @@ emulate_sysexit(struct x86_emulate_ctxt *ctxt)
 
 
 	cs.dpl = 3;
 	cs.dpl = 3;
 	ss.dpl = 3;
 	ss.dpl = 3;
-	kvm_x86_ops->get_msr(ctxt->vcpu, MSR_IA32_SYSENTER_CS, &msr_data);
+	ops->get_msr(ctxt->vcpu, MSR_IA32_SYSENTER_CS, &msr_data);
 	switch (usermode) {
 	switch (usermode) {
 	case X86EMUL_MODE_PROT32:
 	case X86EMUL_MODE_PROT32:
-		cs.selector = (u16)(msr_data + 16);
+		cs_sel = (u16)(msr_data + 16);
 		if ((msr_data & 0xfffc) == 0x0) {
 		if ((msr_data & 0xfffc) == 0x0) {
-			kvm_inject_gp(ctxt->vcpu, 0);
+			emulate_gp(ctxt, 0);
 			return X86EMUL_PROPAGATE_FAULT;
 			return X86EMUL_PROPAGATE_FAULT;
 		}
 		}
-		ss.selector = (u16)(msr_data + 24);
+		ss_sel = (u16)(msr_data + 24);
 		break;
 		break;
 	case X86EMUL_MODE_PROT64:
 	case X86EMUL_MODE_PROT64:
-		cs.selector = (u16)(msr_data + 32);
+		cs_sel = (u16)(msr_data + 32);
 		if (msr_data == 0x0) {
 		if (msr_data == 0x0) {
-			kvm_inject_gp(ctxt->vcpu, 0);
+			emulate_gp(ctxt, 0);
 			return X86EMUL_PROPAGATE_FAULT;
 			return X86EMUL_PROPAGATE_FAULT;
 		}
 		}
-		ss.selector = cs.selector + 8;
-		cs.db = 0;
+		ss_sel = cs_sel + 8;
+		cs.d = 0;
 		cs.l = 1;
 		cs.l = 1;
 		break;
 		break;
 	}
 	}
-	cs.selector |= SELECTOR_RPL_MASK;
-	ss.selector |= SELECTOR_RPL_MASK;
+	cs_sel |= SELECTOR_RPL_MASK;
+	ss_sel |= SELECTOR_RPL_MASK;
 
 
-	kvm_x86_ops->set_segment(ctxt->vcpu, &cs, VCPU_SREG_CS);
-	kvm_x86_ops->set_segment(ctxt->vcpu, &ss, VCPU_SREG_SS);
+	ops->set_cached_descriptor(&cs, VCPU_SREG_CS, ctxt->vcpu);
+	ops->set_segment_selector(cs_sel, VCPU_SREG_CS, ctxt->vcpu);
+	ops->set_cached_descriptor(&ss, VCPU_SREG_SS, ctxt->vcpu);
+	ops->set_segment_selector(ss_sel, VCPU_SREG_SS, ctxt->vcpu);
 
 
-	c->eip = ctxt->vcpu->arch.regs[VCPU_REGS_RDX];
-	c->regs[VCPU_REGS_RSP] = ctxt->vcpu->arch.regs[VCPU_REGS_RCX];
+	c->eip = c->regs[VCPU_REGS_RDX];
+	c->regs[VCPU_REGS_RSP] = c->regs[VCPU_REGS_RCX];
 
 
 	return X86EMUL_CONTINUE;
 	return X86EMUL_CONTINUE;
 }
 }
@@ -2030,25 +2131,25 @@ static bool emulator_io_port_access_allowed(struct x86_emulate_ctxt *ctxt,
 					    struct x86_emulate_ops *ops,
 					    struct x86_emulate_ops *ops,
 					    u16 port, u16 len)
 					    u16 port, u16 len)
 {
 {
-	struct kvm_segment tr_seg;
+	struct desc_struct tr_seg;
 	int r;
 	int r;
 	u16 io_bitmap_ptr;
 	u16 io_bitmap_ptr;
 	u8 perm, bit_idx = port & 0x7;
 	u8 perm, bit_idx = port & 0x7;
 	unsigned mask = (1 << len) - 1;
 	unsigned mask = (1 << len) - 1;
 
 
-	kvm_get_segment(ctxt->vcpu, &tr_seg, VCPU_SREG_TR);
-	if (tr_seg.unusable)
+	ops->get_cached_descriptor(&tr_seg, VCPU_SREG_TR, ctxt->vcpu);
+	if (!tr_seg.p)
 		return false;
 		return false;
-	if (tr_seg.limit < 103)
+	if (desc_limit_scaled(&tr_seg) < 103)
 		return false;
 		return false;
-	r = ops->read_std(tr_seg.base + 102, &io_bitmap_ptr, 2, ctxt->vcpu,
-			  NULL);
+	r = ops->read_std(get_desc_base(&tr_seg) + 102, &io_bitmap_ptr, 2,
+			  ctxt->vcpu, NULL);
 	if (r != X86EMUL_CONTINUE)
 	if (r != X86EMUL_CONTINUE)
 		return false;
 		return false;
-	if (io_bitmap_ptr + port/8 > tr_seg.limit)
+	if (io_bitmap_ptr + port/8 > desc_limit_scaled(&tr_seg))
 		return false;
 		return false;
-	r = ops->read_std(tr_seg.base + io_bitmap_ptr + port/8, &perm, 1,
-			  ctxt->vcpu, NULL);
+	r = ops->read_std(get_desc_base(&tr_seg) + io_bitmap_ptr + port/8,
+			  &perm, 1, ctxt->vcpu, NULL);
 	if (r != X86EMUL_CONTINUE)
 	if (r != X86EMUL_CONTINUE)
 		return false;
 		return false;
 	if ((perm >> bit_idx) & mask)
 	if ((perm >> bit_idx) & mask)
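The rewritten emulator_io_port_access_allowed() walks the TSS I/O permission bitmap directly: the 16-bit bitmap offset lives at byte 102 of the TSS, and a port access of len bytes is allowed only if all len bits starting at the port's bit are clear. A standalone sketch over an in-memory TSS image; the layout constants mirror the code above, everything else is made up.

/* Standalone sketch of the TSS I/O-bitmap check; only port 0x70 is denied. */
#include <stdbool.h>
#include <stdint.h>
#include <string.h>
#include <stdio.h>

#define IO_BITMAP_PTR_OFF 102		/* 16-bit bitmap offset at TSS byte 102 */

static bool io_allowed(const uint8_t *tss, unsigned tss_limit,
		       uint16_t port, uint16_t len)
{
	uint16_t bitmap_off;
	uint8_t perm, mask = (1 << len) - 1, bit_idx = port & 0x7;

	if (tss_limit < 103)
		return false;
	memcpy(&bitmap_off, tss + IO_BITMAP_PTR_OFF, 2);
	if (bitmap_off + port / 8 > tss_limit)
		return false;
	perm = tss[bitmap_off + port / 8];
	return ((perm >> bit_idx) & mask) == 0;	/* any set bit denies access */
}

int main(void)
{
	uint8_t tss[256] = { 0 };
	uint16_t bitmap_off = 136;

	memcpy(tss + IO_BITMAP_PTR_OFF, &bitmap_off, 2);
	tss[bitmap_off + 0x70 / 8] = 1 << (0x70 & 7);	/* deny port 0x70 */

	printf("port 0x70: %s\n", io_allowed(tss, sizeof(tss) - 1, 0x70, 1) ? "ok" : "denied");
	printf("port 0x71: %s\n", io_allowed(tss, sizeof(tss) - 1, 0x71, 1) ? "ok" : "denied");
	return 0;
}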
@@ -2066,17 +2167,6 @@ static bool emulator_io_permited(struct x86_emulate_ctxt *ctxt,
 	return true;
 	return true;
 }
 }
 
 
-static u32 get_cached_descriptor_base(struct x86_emulate_ctxt *ctxt,
-				      struct x86_emulate_ops *ops,
-				      int seg)
-{
-	struct desc_struct desc;
-	if (ops->get_cached_descriptor(&desc, seg, ctxt->vcpu))
-		return get_desc_base(&desc);
-	else
-		return ~0;
-}
-
 static void save_state_to_tss16(struct x86_emulate_ctxt *ctxt,
 static void save_state_to_tss16(struct x86_emulate_ctxt *ctxt,
 				struct x86_emulate_ops *ops,
 				struct x86_emulate_ops *ops,
 				struct tss_segment_16 *tss)
 				struct tss_segment_16 *tss)
@@ -2165,7 +2255,7 @@ static int task_switch_16(struct x86_emulate_ctxt *ctxt,
 			    &err);
 			    &err);
 	if (ret == X86EMUL_PROPAGATE_FAULT) {
 	if (ret == X86EMUL_PROPAGATE_FAULT) {
 		/* FIXME: need to provide precise fault address */
 		/* FIXME: need to provide precise fault address */
-		kvm_inject_page_fault(ctxt->vcpu, old_tss_base, err);
+		emulate_pf(ctxt, old_tss_base, err);
 		return ret;
 		return ret;
 	}
 	}
 
 
@@ -2175,7 +2265,7 @@ static int task_switch_16(struct x86_emulate_ctxt *ctxt,
 			     &err);
 			     &err);
 	if (ret == X86EMUL_PROPAGATE_FAULT) {
 	if (ret == X86EMUL_PROPAGATE_FAULT) {
 		/* FIXME: need to provide precise fault address */
 		/* FIXME: need to provide precise fault address */
-		kvm_inject_page_fault(ctxt->vcpu, old_tss_base, err);
+		emulate_pf(ctxt, old_tss_base, err);
 		return ret;
 		return ret;
 	}
 	}
 
 
@@ -2183,7 +2273,7 @@ static int task_switch_16(struct x86_emulate_ctxt *ctxt,
 			    &err);
 			    &err);
 	if (ret == X86EMUL_PROPAGATE_FAULT) {
 	if (ret == X86EMUL_PROPAGATE_FAULT) {
 		/* FIXME: need to provide precise fault address */
 		/* FIXME: need to provide precise fault address */
-		kvm_inject_page_fault(ctxt->vcpu, new_tss_base, err);
+		emulate_pf(ctxt, new_tss_base, err);
 		return ret;
 		return ret;
 	}
 	}
 
 
@@ -2196,7 +2286,7 @@ static int task_switch_16(struct x86_emulate_ctxt *ctxt,
 				     ctxt->vcpu, &err);
 				     ctxt->vcpu, &err);
 		if (ret == X86EMUL_PROPAGATE_FAULT) {
 		if (ret == X86EMUL_PROPAGATE_FAULT) {
 			/* FIXME: need to provide precise fault address */
 			/* FIXME: need to provide precise fault address */
-			kvm_inject_page_fault(ctxt->vcpu, new_tss_base, err);
+			emulate_pf(ctxt, new_tss_base, err);
 			return ret;
 			return ret;
 		}
 		}
 	}
 	}
@@ -2238,7 +2328,10 @@ static int load_state_from_tss32(struct x86_emulate_ctxt *ctxt,
 	struct decode_cache *c = &ctxt->decode;
 	struct decode_cache *c = &ctxt->decode;
 	int ret;
 	int ret;
 
 
-	ops->set_cr(3, tss->cr3, ctxt->vcpu);
+	if (ops->set_cr(3, tss->cr3, ctxt->vcpu)) {
+		emulate_gp(ctxt, 0);
+		return X86EMUL_PROPAGATE_FAULT;
+	}
 	c->eip = tss->eip;
 	c->eip = tss->eip;
 	ctxt->eflags = tss->eflags | 2;
 	ctxt->eflags = tss->eflags | 2;
 	c->regs[VCPU_REGS_RAX] = tss->eax;
 	c->regs[VCPU_REGS_RAX] = tss->eax;
@@ -2304,7 +2397,7 @@ static int task_switch_32(struct x86_emulate_ctxt *ctxt,
 			    &err);
 			    &err);
 	if (ret == X86EMUL_PROPAGATE_FAULT) {
 	if (ret == X86EMUL_PROPAGATE_FAULT) {
 		/* FIXME: need to provide precise fault address */
 		/* FIXME: need to provide precise fault address */
-		kvm_inject_page_fault(ctxt->vcpu, old_tss_base, err);
+		emulate_pf(ctxt, old_tss_base, err);
 		return ret;
 		return ret;
 	}
 	}
 
 
@@ -2314,7 +2407,7 @@ static int task_switch_32(struct x86_emulate_ctxt *ctxt,
 			     &err);
 			     &err);
 	if (ret == X86EMUL_PROPAGATE_FAULT) {
 	if (ret == X86EMUL_PROPAGATE_FAULT) {
 		/* FIXME: need to provide precise fault address */
 		/* FIXME: need to provide precise fault address */
-		kvm_inject_page_fault(ctxt->vcpu, old_tss_base, err);
+		emulate_pf(ctxt, old_tss_base, err);
 		return ret;
 		return ret;
 	}
 	}
 
 
@@ -2322,7 +2415,7 @@ static int task_switch_32(struct x86_emulate_ctxt *ctxt,
 			    &err);
 			    &err);
 	if (ret == X86EMUL_PROPAGATE_FAULT) {
 	if (ret == X86EMUL_PROPAGATE_FAULT) {
 		/* FIXME: need to provide precise fault address */
 		/* FIXME: need to provide precise fault address */
-		kvm_inject_page_fault(ctxt->vcpu, new_tss_base, err);
+		emulate_pf(ctxt, new_tss_base, err);
 		return ret;
 		return ret;
 	}
 	}
 
 
@@ -2335,7 +2428,7 @@ static int task_switch_32(struct x86_emulate_ctxt *ctxt,
 				     ctxt->vcpu, &err);
 				     ctxt->vcpu, &err);
 		if (ret == X86EMUL_PROPAGATE_FAULT) {
 		if (ret == X86EMUL_PROPAGATE_FAULT) {
 			/* FIXME: need to provide precise fault address */
 			/* FIXME: need to provide precise fault address */
-			kvm_inject_page_fault(ctxt->vcpu, new_tss_base, err);
+			emulate_pf(ctxt, new_tss_base, err);
 			return ret;
 			return ret;
 		}
 		}
 	}
 	}
@@ -2352,7 +2445,7 @@ static int emulator_do_task_switch(struct x86_emulate_ctxt *ctxt,
 	int ret;
 	int ret;
 	u16 old_tss_sel = ops->get_segment_selector(VCPU_SREG_TR, ctxt->vcpu);
 	u16 old_tss_sel = ops->get_segment_selector(VCPU_SREG_TR, ctxt->vcpu);
 	ulong old_tss_base =
 	ulong old_tss_base =
-		get_cached_descriptor_base(ctxt, ops, VCPU_SREG_TR);
+		ops->get_cached_segment_base(VCPU_SREG_TR, ctxt->vcpu);
 	u32 desc_limit;
 	u32 desc_limit;
 
 
 	/* FIXME: old_tss_base == ~0 ? */
 	/* FIXME: old_tss_base == ~0 ? */
@@ -2369,7 +2462,7 @@ static int emulator_do_task_switch(struct x86_emulate_ctxt *ctxt,
 	if (reason != TASK_SWITCH_IRET) {
 	if (reason != TASK_SWITCH_IRET) {
 		if ((tss_selector & 3) > next_tss_desc.dpl ||
 		if ((tss_selector & 3) > next_tss_desc.dpl ||
 		    ops->cpl(ctxt->vcpu) > next_tss_desc.dpl) {
 		    ops->cpl(ctxt->vcpu) > next_tss_desc.dpl) {
-			kvm_inject_gp(ctxt->vcpu, 0);
+			emulate_gp(ctxt, 0);
 			return X86EMUL_PROPAGATE_FAULT;
 			return X86EMUL_PROPAGATE_FAULT;
 		}
 		}
 	}
 	}
@@ -2378,8 +2471,7 @@ static int emulator_do_task_switch(struct x86_emulate_ctxt *ctxt,
 	if (!next_tss_desc.p ||
 	if (!next_tss_desc.p ||
 	    ((desc_limit < 0x67 && (next_tss_desc.type & 8)) ||
 	    ((desc_limit < 0x67 && (next_tss_desc.type & 8)) ||
 	     desc_limit < 0x2b)) {
 	     desc_limit < 0x2b)) {
-		kvm_queue_exception_e(ctxt->vcpu, TS_VECTOR,
-				      tss_selector & 0xfffc);
+		emulate_ts(ctxt, tss_selector & 0xfffc);
 		return X86EMUL_PROPAGATE_FAULT;
 		return X86EMUL_PROPAGATE_FAULT;
 	}
 	}
 
 
@@ -2425,7 +2517,7 @@ static int emulator_do_task_switch(struct x86_emulate_ctxt *ctxt,
 		c->op_bytes = c->ad_bytes = (next_tss_desc.type & 8) ? 4 : 2;
 		c->op_bytes = c->ad_bytes = (next_tss_desc.type & 8) ? 4 : 2;
 		c->lock_prefix = 0;
 		c->lock_prefix = 0;
 		c->src.val = (unsigned long) error_code;
 		c->src.val = (unsigned long) error_code;
-		emulate_push(ctxt);
+		emulate_push(ctxt, ops);
 	}
 	}
 
 
 	return ret;
 	return ret;
@@ -2439,18 +2531,16 @@ int emulator_task_switch(struct x86_emulate_ctxt *ctxt,
 	struct decode_cache *c = &ctxt->decode;
 	struct decode_cache *c = &ctxt->decode;
 	int rc;
 	int rc;
 
 
-	memset(c, 0, sizeof(struct decode_cache));
 	c->eip = ctxt->eip;
 	c->eip = ctxt->eip;
-	memcpy(c->regs, ctxt->vcpu->arch.regs, sizeof c->regs);
 	c->dst.type = OP_NONE;
 	c->dst.type = OP_NONE;
 
 
 	rc = emulator_do_task_switch(ctxt, ops, tss_selector, reason,
 	rc = emulator_do_task_switch(ctxt, ops, tss_selector, reason,
 				     has_error_code, error_code);
 				     has_error_code, error_code);
 
 
 	if (rc == X86EMUL_CONTINUE) {
 	if (rc == X86EMUL_CONTINUE) {
-		memcpy(ctxt->vcpu->arch.regs, c->regs, sizeof c->regs);
-		kvm_rip_write(ctxt->vcpu, c->eip);
 		rc = writeback(ctxt, ops);
 		rc = writeback(ctxt, ops);
+		if (rc == X86EMUL_CONTINUE)
+			ctxt->eip = c->eip;
 	}
 	}
 
 
 	return (rc == X86EMUL_UNHANDLEABLE) ? -1 : 0;
 	return (rc == X86EMUL_UNHANDLEABLE) ? -1 : 0;
@@ -2474,29 +2564,22 @@ x86_emulate_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
 	int rc = X86EMUL_CONTINUE;
 	int rc = X86EMUL_CONTINUE;
 	int saved_dst_type = c->dst.type;
 	int saved_dst_type = c->dst.type;
 
 
-	ctxt->interruptibility = 0;
-
-	/* Shadow copy of register state. Committed on successful emulation.
-	 * NOTE: we can copy them from vcpu as x86_decode_insn() doesn't
-	 * modify them.
-	 */
-
-	memcpy(c->regs, ctxt->vcpu->arch.regs, sizeof c->regs);
+	ctxt->decode.mem_read.pos = 0;
 
 
 	if (ctxt->mode == X86EMUL_MODE_PROT64 && (c->d & No64)) {
 	if (ctxt->mode == X86EMUL_MODE_PROT64 && (c->d & No64)) {
-		kvm_queue_exception(ctxt->vcpu, UD_VECTOR);
+		emulate_ud(ctxt);
 		goto done;
 		goto done;
 	}
 	}
 
 
 	/* LOCK prefix is allowed only with some instructions */
 	/* LOCK prefix is allowed only with some instructions */
 	if (c->lock_prefix && (!(c->d & Lock) || c->dst.type != OP_MEM)) {
 	if (c->lock_prefix && (!(c->d & Lock) || c->dst.type != OP_MEM)) {
-		kvm_queue_exception(ctxt->vcpu, UD_VECTOR);
+		emulate_ud(ctxt);
 		goto done;
 		goto done;
 	}
 	}
 
 
 	/* Privileged instruction can be executed only in CPL=0 */
 	/* Privileged instruction can be executed only in CPL=0 */
 	if ((c->d & Priv) && ops->cpl(ctxt->vcpu)) {
 	if ((c->d & Priv) && ops->cpl(ctxt->vcpu)) {
-		kvm_inject_gp(ctxt->vcpu, 0);
+		emulate_gp(ctxt, 0);
 		goto done;
 		goto done;
 	}
 	}
 
 
@@ -2506,7 +2589,7 @@ x86_emulate_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
 		if (address_mask(c, c->regs[VCPU_REGS_RCX]) == 0) {
 		if (address_mask(c, c->regs[VCPU_REGS_RCX]) == 0) {
 		string_done:
 		string_done:
 			ctxt->restart = false;
 			ctxt->restart = false;
-			kvm_rip_write(ctxt->vcpu, c->eip);
+			ctxt->eip = c->eip;
 			goto done;
 			goto done;
 		}
 		}
 		/* The second termination condition only applies for REPE
 		/* The second termination condition only applies for REPE
@@ -2529,20 +2612,16 @@ x86_emulate_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
 	}
 	}
 
 
 	if (c->src.type == OP_MEM) {
 	if (c->src.type == OP_MEM) {
-		rc = ops->read_emulated((unsigned long)c->src.ptr,
-					&c->src.val,
-					c->src.bytes,
-					ctxt->vcpu);
+		rc = read_emulated(ctxt, ops, (unsigned long)c->src.ptr,
+					c->src.valptr, c->src.bytes);
 		if (rc != X86EMUL_CONTINUE)
 		if (rc != X86EMUL_CONTINUE)
 			goto done;
 			goto done;
 		c->src.orig_val = c->src.val;
 		c->src.orig_val = c->src.val;
 	}
 	}
 
 
 	if (c->src2.type == OP_MEM) {
 	if (c->src2.type == OP_MEM) {
-		rc = ops->read_emulated((unsigned long)c->src2.ptr,
-					&c->src2.val,
-					c->src2.bytes,
-					ctxt->vcpu);
+		rc = read_emulated(ctxt, ops, (unsigned long)c->src2.ptr,
+					&c->src2.val, c->src2.bytes);
 		if (rc != X86EMUL_CONTINUE)
 		if (rc != X86EMUL_CONTINUE)
 			goto done;
 			goto done;
 	}
 	}
@@ -2553,8 +2632,8 @@ x86_emulate_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
 
 
 	if ((c->dst.type == OP_MEM) && !(c->d & Mov)) {
 	if ((c->dst.type == OP_MEM) && !(c->d & Mov)) {
 		/* optimisation - avoid slow emulated read if Mov */
 		/* optimisation - avoid slow emulated read if Mov */
-		rc = ops->read_emulated((unsigned long)c->dst.ptr, &c->dst.val,
-					c->dst.bytes, ctxt->vcpu);
+		rc = read_emulated(ctxt, ops, (unsigned long)c->dst.ptr,
+				   &c->dst.val, c->dst.bytes);
 		if (rc != X86EMUL_CONTINUE)
 		if (rc != X86EMUL_CONTINUE)
 			goto done;
 			goto done;
 	}
 	}
@@ -2571,7 +2650,7 @@ special_insn:
 		emulate_2op_SrcV("add", c->src, c->dst, ctxt->eflags);
 		emulate_2op_SrcV("add", c->src, c->dst, ctxt->eflags);
 		break;
 		break;
 	case 0x06:		/* push es */
 	case 0x06:		/* push es */
-		emulate_push_sreg(ctxt, VCPU_SREG_ES);
+		emulate_push_sreg(ctxt, ops, VCPU_SREG_ES);
 		break;
 		break;
 	case 0x07:		/* pop es */
 	case 0x07:		/* pop es */
 		rc = emulate_pop_sreg(ctxt, ops, VCPU_SREG_ES);
 		rc = emulate_pop_sreg(ctxt, ops, VCPU_SREG_ES);
@@ -2583,14 +2662,14 @@ special_insn:
 		emulate_2op_SrcV("or", c->src, c->dst, ctxt->eflags);
 		emulate_2op_SrcV("or", c->src, c->dst, ctxt->eflags);
 		break;
 		break;
 	case 0x0e:		/* push cs */
 	case 0x0e:		/* push cs */
-		emulate_push_sreg(ctxt, VCPU_SREG_CS);
+		emulate_push_sreg(ctxt, ops, VCPU_SREG_CS);
 		break;
 		break;
 	case 0x10 ... 0x15:
 	case 0x10 ... 0x15:
 	      adc:		/* adc */
 	      adc:		/* adc */
 		emulate_2op_SrcV("adc", c->src, c->dst, ctxt->eflags);
 		emulate_2op_SrcV("adc", c->src, c->dst, ctxt->eflags);
 		break;
 		break;
 	case 0x16:		/* push ss */
 	case 0x16:		/* push ss */
-		emulate_push_sreg(ctxt, VCPU_SREG_SS);
+		emulate_push_sreg(ctxt, ops, VCPU_SREG_SS);
 		break;
 		break;
 	case 0x17:		/* pop ss */
 	case 0x17:		/* pop ss */
 		rc = emulate_pop_sreg(ctxt, ops, VCPU_SREG_SS);
 		rc = emulate_pop_sreg(ctxt, ops, VCPU_SREG_SS);
@@ -2602,7 +2681,7 @@ special_insn:
 		emulate_2op_SrcV("sbb", c->src, c->dst, ctxt->eflags);
 		emulate_2op_SrcV("sbb", c->src, c->dst, ctxt->eflags);
 		break;
 		break;
 	case 0x1e:		/* push ds */
 	case 0x1e:		/* push ds */
-		emulate_push_sreg(ctxt, VCPU_SREG_DS);
+		emulate_push_sreg(ctxt, ops, VCPU_SREG_DS);
 		break;
 		break;
 	case 0x1f:		/* pop ds */
 	case 0x1f:		/* pop ds */
 		rc = emulate_pop_sreg(ctxt, ops, VCPU_SREG_DS);
 		rc = emulate_pop_sreg(ctxt, ops, VCPU_SREG_DS);
@@ -2632,7 +2711,7 @@ special_insn:
 		emulate_1op("dec", c->dst, ctxt->eflags);
 		emulate_1op("dec", c->dst, ctxt->eflags);
 		break;
 		break;
 	case 0x50 ... 0x57:  /* push reg */
 	case 0x50 ... 0x57:  /* push reg */
-		emulate_push(ctxt);
+		emulate_push(ctxt, ops);
 		break;
 		break;
 	case 0x58 ... 0x5f: /* pop reg */
 	case 0x58 ... 0x5f: /* pop reg */
 	pop_instruction:
 	pop_instruction:
@@ -2641,7 +2720,9 @@ special_insn:
 			goto done;
 			goto done;
 		break;
 		break;
 	case 0x60:	/* pusha */
 	case 0x60:	/* pusha */
-		emulate_pusha(ctxt);
+		rc = emulate_pusha(ctxt, ops);
+		if (rc != X86EMUL_CONTINUE)
+			goto done;
 		break;
 		break;
 	case 0x61:	/* popa */
 	case 0x61:	/* popa */
 		rc = emulate_popa(ctxt, ops);
 		rc = emulate_popa(ctxt, ops);
@@ -2655,14 +2736,14 @@ special_insn:
 		break;
 		break;
 	case 0x68: /* push imm */
 	case 0x68: /* push imm */
 	case 0x6a: /* push imm8 */
 	case 0x6a: /* push imm8 */
-		emulate_push(ctxt);
+		emulate_push(ctxt, ops);
 		break;
 		break;
 	case 0x6c:		/* insb */
 	case 0x6c:		/* insb */
 	case 0x6d:		/* insw/insd */
 	case 0x6d:		/* insw/insd */
 		c->dst.bytes = min(c->dst.bytes, 4u);
 		c->dst.bytes = min(c->dst.bytes, 4u);
 		if (!emulator_io_permited(ctxt, ops, c->regs[VCPU_REGS_RDX],
 		if (!emulator_io_permited(ctxt, ops, c->regs[VCPU_REGS_RDX],
 					  c->dst.bytes)) {
 					  c->dst.bytes)) {
-			kvm_inject_gp(ctxt->vcpu, 0);
+			emulate_gp(ctxt, 0);
 			goto done;
 			goto done;
 		}
 		}
 		if (!pio_in_emulated(ctxt, ops, c->dst.bytes,
 		if (!pio_in_emulated(ctxt, ops, c->dst.bytes,
@@ -2674,7 +2755,7 @@ special_insn:
 		c->src.bytes = min(c->src.bytes, 4u);
 		c->src.bytes = min(c->src.bytes, 4u);
 		if (!emulator_io_permited(ctxt, ops, c->regs[VCPU_REGS_RDX],
 		if (!emulator_io_permited(ctxt, ops, c->regs[VCPU_REGS_RDX],
 					  c->src.bytes)) {
 					  c->src.bytes)) {
-			kvm_inject_gp(ctxt->vcpu, 0);
+			emulate_gp(ctxt, 0);
 			goto done;
 			goto done;
 		}
 		}
 		ops->pio_out_emulated(c->src.bytes, c->regs[VCPU_REGS_RDX],
 		ops->pio_out_emulated(c->src.bytes, c->regs[VCPU_REGS_RDX],
@@ -2707,6 +2788,7 @@ special_insn:
 		}
 		}
 		break;
 		break;
 	case 0x84 ... 0x85:
 	case 0x84 ... 0x85:
+	test:
 		emulate_2op_SrcV("test", c->src, c->dst, ctxt->eflags);
 		emulate_2op_SrcV("test", c->src, c->dst, ctxt->eflags);
 		break;
 		break;
 	case 0x86 ... 0x87:	/* xchg */
 	case 0x86 ... 0x87:	/* xchg */
@@ -2735,18 +2817,13 @@ special_insn:
 		break;
 		break;
 	case 0x88 ... 0x8b:	/* mov */
 	case 0x88 ... 0x8b:	/* mov */
 		goto mov;
 		goto mov;
-	case 0x8c: { /* mov r/m, sreg */
-		struct kvm_segment segreg;
-
-		if (c->modrm_reg <= VCPU_SREG_GS)
-			kvm_get_segment(ctxt->vcpu, &segreg, c->modrm_reg);
-		else {
-			kvm_queue_exception(ctxt->vcpu, UD_VECTOR);
+	case 0x8c:  /* mov r/m, sreg */
+		if (c->modrm_reg > VCPU_SREG_GS) {
+			emulate_ud(ctxt);
 			goto done;
 			goto done;
 		}
 		}
-		c->dst.val = segreg.selector;
+		c->dst.val = ops->get_segment_selector(c->modrm_reg, ctxt->vcpu);
 		break;
 		break;
-	}
 	case 0x8d: /* lea r16/r32, m */
 	case 0x8d: /* lea r16/r32, m */
 		c->dst.val = c->modrm_ea;
 		c->dst.val = c->modrm_ea;
 		break;
 		break;
@@ -2757,12 +2834,12 @@ special_insn:
 
 
 		if (c->modrm_reg == VCPU_SREG_CS ||
 		if (c->modrm_reg == VCPU_SREG_CS ||
 		    c->modrm_reg > VCPU_SREG_GS) {
 		    c->modrm_reg > VCPU_SREG_GS) {
-			kvm_queue_exception(ctxt->vcpu, UD_VECTOR);
+			emulate_ud(ctxt);
 			goto done;
 			goto done;
 		}
 		}
 
 
 		if (c->modrm_reg == VCPU_SREG_SS)
 		if (c->modrm_reg == VCPU_SREG_SS)
-			toggle_interruptibility(ctxt, KVM_X86_SHADOW_INT_MOV_SS);
+			ctxt->interruptibility = KVM_X86_SHADOW_INT_MOV_SS;
 
 
 		rc = load_segment_descriptor(ctxt, ops, sel, c->modrm_reg);
 		rc = load_segment_descriptor(ctxt, ops, sel, c->modrm_reg);
 
 
@@ -2775,19 +2852,19 @@ special_insn:
 			goto done;
 			goto done;
 		break;
 		break;
 	case 0x90: /* nop / xchg r8,rax */
 	case 0x90: /* nop / xchg r8,rax */
-		if (!(c->rex_prefix & 1)) { /* nop */
-			c->dst.type = OP_NONE;
+		if (c->dst.ptr == (unsigned long *)&c->regs[VCPU_REGS_RAX]) {
+			c->dst.type = OP_NONE;  /* nop */
 			break;
 			break;
 		}
 		}
 	case 0x91 ... 0x97: /* xchg reg,rax */
 	case 0x91 ... 0x97: /* xchg reg,rax */
-		c->src.type = c->dst.type = OP_REG;
-		c->src.bytes = c->dst.bytes = c->op_bytes;
+		c->src.type = OP_REG;
+		c->src.bytes = c->op_bytes;
 		c->src.ptr = (unsigned long *) &c->regs[VCPU_REGS_RAX];
 		c->src.ptr = (unsigned long *) &c->regs[VCPU_REGS_RAX];
 		c->src.val = *(c->src.ptr);
 		c->src.val = *(c->src.ptr);
 		goto xchg;
 		goto xchg;
 	case 0x9c: /* pushf */
 	case 0x9c: /* pushf */
 		c->src.val =  (unsigned long) ctxt->eflags;
 		c->src.val =  (unsigned long) ctxt->eflags;
-		emulate_push(ctxt);
+		emulate_push(ctxt, ops);
 		break;
 		break;
 	case 0x9d: /* popf */
 	case 0x9d: /* popf */
 		c->dst.type = OP_REG;
 		c->dst.type = OP_REG;
@@ -2797,19 +2874,15 @@ special_insn:
 		if (rc != X86EMUL_CONTINUE)
 		if (rc != X86EMUL_CONTINUE)
 			goto done;
 			goto done;
 		break;
 		break;
-	case 0xa0 ... 0xa1:	/* mov */
-		c->dst.ptr = (unsigned long *)&c->regs[VCPU_REGS_RAX];
-		c->dst.val = c->src.val;
-		break;
-	case 0xa2 ... 0xa3:	/* mov */
-		c->dst.val = (unsigned long)c->regs[VCPU_REGS_RAX];
-		break;
+	case 0xa0 ... 0xa3:	/* mov */
 	case 0xa4 ... 0xa5:	/* movs */
 	case 0xa4 ... 0xa5:	/* movs */
 		goto mov;
 		goto mov;
 	case 0xa6 ... 0xa7:	/* cmps */
 	case 0xa6 ... 0xa7:	/* cmps */
 		c->dst.type = OP_NONE; /* Disable writeback. */
 		c->dst.type = OP_NONE; /* Disable writeback. */
 		DPRINTF("cmps: mem1=0x%p mem2=0x%p\n", c->src.ptr, c->dst.ptr);
 		DPRINTF("cmps: mem1=0x%p mem2=0x%p\n", c->src.ptr, c->dst.ptr);
 		goto cmp;
 		goto cmp;
+	case 0xa8 ... 0xa9:	/* test ax, imm */
+		goto test;
 	case 0xaa ... 0xab:	/* stos */
 	case 0xaa ... 0xab:	/* stos */
 		c->dst.val = c->regs[VCPU_REGS_RAX];
 		c->dst.val = c->regs[VCPU_REGS_RAX];
 		break;
 		break;
@@ -2855,19 +2928,23 @@ special_insn:
 		long int rel = c->src.val;
 		long int rel = c->src.val;
 		c->src.val = (unsigned long) c->eip;
 		c->src.val = (unsigned long) c->eip;
 		jmp_rel(c, rel);
 		jmp_rel(c, rel);
-		emulate_push(ctxt);
+		emulate_push(ctxt, ops);
 		break;
 		break;
 	}
 	}
 	case 0xe9: /* jmp rel */
 	case 0xe9: /* jmp rel */
 		goto jmp;
 		goto jmp;
-	case 0xea: /* jmp far */
+	case 0xea: { /* jmp far */
+		unsigned short sel;
 	jump_far:
 	jump_far:
-		if (load_segment_descriptor(ctxt, ops, c->src2.val,
-					    VCPU_SREG_CS))
+		memcpy(&sel, c->src.valptr + c->op_bytes, 2);
+
+		if (load_segment_descriptor(ctxt, ops, sel, VCPU_SREG_CS))
 			goto done;
 			goto done;
 
 
-		c->eip = c->src.val;
+		c->eip = 0;
+		memcpy(&c->eip, c->src.valptr, c->op_bytes);
 		break;
 		break;
+	}
 	case 0xeb:
 	case 0xeb:
 	      jmp:		/* jmp rel short */
 	      jmp:		/* jmp rel short */
 		jmp_rel(c, c->src.val);
 		jmp_rel(c, c->src.val);
@@ -2879,20 +2956,20 @@ special_insn:
 	do_io_in:
 	do_io_in:
 		c->dst.bytes = min(c->dst.bytes, 4u);
 		c->dst.bytes = min(c->dst.bytes, 4u);
 		if (!emulator_io_permited(ctxt, ops, c->src.val, c->dst.bytes)) {
 		if (!emulator_io_permited(ctxt, ops, c->src.val, c->dst.bytes)) {
-			kvm_inject_gp(ctxt->vcpu, 0);
+			emulate_gp(ctxt, 0);
 			goto done;
 			goto done;
 		}
 		}
 		if (!pio_in_emulated(ctxt, ops, c->dst.bytes, c->src.val,
 		if (!pio_in_emulated(ctxt, ops, c->dst.bytes, c->src.val,
 				     &c->dst.val))
 				     &c->dst.val))
 			goto done; /* IO is needed */
 			goto done; /* IO is needed */
 		break;
 		break;
-	case 0xee: /* out al,dx */
-	case 0xef: /* out (e/r)ax,dx */
+	case 0xee: /* out dx,al */
+	case 0xef: /* out dx,(e/r)ax */
 		c->src.val = c->regs[VCPU_REGS_RDX];
 		c->src.val = c->regs[VCPU_REGS_RDX];
 	do_io_out:
 	do_io_out:
 		c->dst.bytes = min(c->dst.bytes, 4u);
 		c->dst.bytes = min(c->dst.bytes, 4u);
 		if (!emulator_io_permited(ctxt, ops, c->src.val, c->dst.bytes)) {
 		if (!emulator_io_permited(ctxt, ops, c->src.val, c->dst.bytes)) {
-			kvm_inject_gp(ctxt->vcpu, 0);
+			emulate_gp(ctxt, 0);
 			goto done;
 			goto done;
 		}
 		}
 		ops->pio_out_emulated(c->dst.bytes, c->src.val, &c->dst.val, 1,
 		ops->pio_out_emulated(c->dst.bytes, c->src.val, &c->dst.val, 1,
@@ -2916,18 +2993,20 @@ special_insn:
 		c->dst.type = OP_NONE;	/* Disable writeback. */
 		c->dst.type = OP_NONE;	/* Disable writeback. */
 		break;
 		break;
 	case 0xfa: /* cli */
 	case 0xfa: /* cli */
-		if (emulator_bad_iopl(ctxt, ops))
-			kvm_inject_gp(ctxt->vcpu, 0);
-		else {
+		if (emulator_bad_iopl(ctxt, ops)) {
+			emulate_gp(ctxt, 0);
+			goto done;
+		} else {
 			ctxt->eflags &= ~X86_EFLAGS_IF;
 			ctxt->eflags &= ~X86_EFLAGS_IF;
 			c->dst.type = OP_NONE;	/* Disable writeback. */
 			c->dst.type = OP_NONE;	/* Disable writeback. */
 		}
 		}
 		break;
 		break;
 	case 0xfb: /* sti */
 	case 0xfb: /* sti */
-		if (emulator_bad_iopl(ctxt, ops))
-			kvm_inject_gp(ctxt->vcpu, 0);
-		else {
-			toggle_interruptibility(ctxt, KVM_X86_SHADOW_INT_STI);
+		if (emulator_bad_iopl(ctxt, ops)) {
+			emulate_gp(ctxt, 0);
+			goto done;
+		} else {
+			ctxt->interruptibility = KVM_X86_SHADOW_INT_STI;
 			ctxt->eflags |= X86_EFLAGS_IF;
 			ctxt->eflags |= X86_EFLAGS_IF;
 			c->dst.type = OP_NONE;	/* Disable writeback. */
 			c->dst.type = OP_NONE;	/* Disable writeback. */
 		}
 		}
@@ -2964,11 +3043,12 @@ writeback:
 	c->dst.type = saved_dst_type;
 	c->dst.type = saved_dst_type;
 
 
 	if ((c->d & SrcMask) == SrcSI)
 	if ((c->d & SrcMask) == SrcSI)
-		string_addr_inc(ctxt, seg_override_base(ctxt, c), VCPU_REGS_RSI,
-				&c->src);
+		string_addr_inc(ctxt, seg_override_base(ctxt, ops, c),
+				VCPU_REGS_RSI, &c->src);
 
 
 	if ((c->d & DstMask) == DstDI)
 	if ((c->d & DstMask) == DstDI)
-		string_addr_inc(ctxt, es_base(ctxt), VCPU_REGS_RDI, &c->dst);
+		string_addr_inc(ctxt, es_base(ctxt, ops), VCPU_REGS_RDI,
+				&c->dst);
 
 
 	if (c->rep_prefix && (c->d & String)) {
 	if (c->rep_prefix && (c->d & String)) {
 		struct read_cache *rc = &ctxt->decode.io_read;
 		struct read_cache *rc = &ctxt->decode.io_read;
@@ -2981,11 +3061,12 @@ writeback:
 		    (rc->end != 0 && rc->end == rc->pos))
 		    (rc->end != 0 && rc->end == rc->pos))
 			ctxt->restart = false;
 			ctxt->restart = false;
 	}
 	}
-
-	/* Commit shadow register state. */
-	memcpy(ctxt->vcpu->arch.regs, c->regs, sizeof c->regs);
-	kvm_rip_write(ctxt->vcpu, c->eip);
-	ops->set_rflags(ctxt->vcpu, ctxt->eflags);
+	/*
+	 * Reset the read cache here in case the string instruction is
+	 * restarted without being decoded again.
+	 */
+	ctxt->decode.mem_read.end = 0;
+	ctxt->eip = c->eip;
 
 
 done:
 done:
 	return (rc == X86EMUL_UNHANDLEABLE) ? -1 : 0;
 	return (rc == X86EMUL_UNHANDLEABLE) ? -1 : 0;
@@ -3051,7 +3132,7 @@ twobyte_insn:
 			c->dst.type = OP_NONE;
 			c->dst.type = OP_NONE;
 			break;
 			break;
 		case 5: /* not defined */
 		case 5: /* not defined */
-			kvm_queue_exception(ctxt->vcpu, UD_VECTOR);
+			emulate_ud(ctxt);
 			goto done;
 			goto done;
 		case 7: /* invlpg*/
 		case 7: /* invlpg*/
 			emulate_invlpg(ctxt->vcpu, c->modrm_ea);
 			emulate_invlpg(ctxt->vcpu, c->modrm_ea);
@@ -3063,7 +3144,7 @@ twobyte_insn:
 		}
 		}
 		break;
 		break;
 	case 0x05: 		/* syscall */
 	case 0x05: 		/* syscall */
-		rc = emulate_syscall(ctxt);
+		rc = emulate_syscall(ctxt, ops);
 		if (rc != X86EMUL_CONTINUE)
 		if (rc != X86EMUL_CONTINUE)
 			goto done;
 			goto done;
 		else
 		else
@@ -3073,8 +3154,11 @@ twobyte_insn:
 		emulate_clts(ctxt->vcpu);
 		emulate_clts(ctxt->vcpu);
 		c->dst.type = OP_NONE;
 		c->dst.type = OP_NONE;
 		break;
 		break;
-	case 0x08:		/* invd */
 	case 0x09:		/* wbinvd */
 	case 0x09:		/* wbinvd */
+		kvm_emulate_wbinvd(ctxt->vcpu);
+		c->dst.type = OP_NONE;
+		break;
+	case 0x08:		/* invd */
 	case 0x0d:		/* GrpP (prefetch) */
 	case 0x0d:		/* GrpP (prefetch) */
 	case 0x18:		/* Grp16 (prefetch/nop) */
 	case 0x18:		/* Grp16 (prefetch/nop) */
 		c->dst.type = OP_NONE;
 		c->dst.type = OP_NONE;
@@ -3084,7 +3168,7 @@ twobyte_insn:
 		case 1:
 		case 1:
 		case 5 ... 7:
 		case 5 ... 7:
 		case 9 ... 15:
 		case 9 ... 15:
-			kvm_queue_exception(ctxt->vcpu, UD_VECTOR);
+			emulate_ud(ctxt);
 			goto done;
 			goto done;
 		}
 		}
 		c->regs[c->modrm_rm] = ops->get_cr(c->modrm_reg, ctxt->vcpu);
 		c->regs[c->modrm_rm] = ops->get_cr(c->modrm_reg, ctxt->vcpu);
@@ -3093,31 +3177,42 @@ twobyte_insn:
 	case 0x21: /* mov from dr to reg */
 	case 0x21: /* mov from dr to reg */
 		if ((ops->get_cr(4, ctxt->vcpu) & X86_CR4_DE) &&
 		if ((ops->get_cr(4, ctxt->vcpu) & X86_CR4_DE) &&
 		    (c->modrm_reg == 4 || c->modrm_reg == 5)) {
 		    (c->modrm_reg == 4 || c->modrm_reg == 5)) {
-			kvm_queue_exception(ctxt->vcpu, UD_VECTOR);
+			emulate_ud(ctxt);
 			goto done;
 			goto done;
 		}
 		}
-		emulator_get_dr(ctxt, c->modrm_reg, &c->regs[c->modrm_rm]);
+		ops->get_dr(c->modrm_reg, &c->regs[c->modrm_rm], ctxt->vcpu);
 		c->dst.type = OP_NONE;	/* no writeback */
 		c->dst.type = OP_NONE;	/* no writeback */
 		break;
 		break;
 	case 0x22: /* mov reg, cr */
 	case 0x22: /* mov reg, cr */
-		ops->set_cr(c->modrm_reg, c->modrm_val, ctxt->vcpu);
+		if (ops->set_cr(c->modrm_reg, c->modrm_val, ctxt->vcpu)) {
+			emulate_gp(ctxt, 0);
+			goto done;
+		}
 		c->dst.type = OP_NONE;
 		c->dst.type = OP_NONE;
 		break;
 		break;
 	case 0x23: /* mov from reg to dr */
 	case 0x23: /* mov from reg to dr */
 		if ((ops->get_cr(4, ctxt->vcpu) & X86_CR4_DE) &&
 		if ((ops->get_cr(4, ctxt->vcpu) & X86_CR4_DE) &&
 		    (c->modrm_reg == 4 || c->modrm_reg == 5)) {
 		    (c->modrm_reg == 4 || c->modrm_reg == 5)) {
-			kvm_queue_exception(ctxt->vcpu, UD_VECTOR);
+			emulate_ud(ctxt);
+			goto done;
+		}
+
+		if (ops->set_dr(c->modrm_reg, c->regs[c->modrm_rm] &
+				((ctxt->mode == X86EMUL_MODE_PROT64) ?
+				 ~0ULL : ~0U), ctxt->vcpu) < 0) {
+			/* #UD condition is already handled by the code above */
+			emulate_gp(ctxt, 0);
 			goto done;
 			goto done;
 		}
 		}
-		emulator_set_dr(ctxt, c->modrm_reg, c->regs[c->modrm_rm]);
+
 		c->dst.type = OP_NONE;	/* no writeback */
 		c->dst.type = OP_NONE;	/* no writeback */
 		break;
 		break;
 	case 0x30:
 	case 0x30:
 		/* wrmsr */
 		/* wrmsr */
 		msr_data = (u32)c->regs[VCPU_REGS_RAX]
 		msr_data = (u32)c->regs[VCPU_REGS_RAX]
 			| ((u64)c->regs[VCPU_REGS_RDX] << 32);
 			| ((u64)c->regs[VCPU_REGS_RDX] << 32);
-		if (kvm_set_msr(ctxt->vcpu, c->regs[VCPU_REGS_RCX], msr_data)) {
-			kvm_inject_gp(ctxt->vcpu, 0);
+		if (ops->set_msr(ctxt->vcpu, c->regs[VCPU_REGS_RCX], msr_data)) {
+			emulate_gp(ctxt, 0);
 			goto done;
 			goto done;
 		}
 		}
 		rc = X86EMUL_CONTINUE;
 		rc = X86EMUL_CONTINUE;
@@ -3125,8 +3220,8 @@ twobyte_insn:
 		break;
 		break;
 	case 0x32:
 	case 0x32:
 		/* rdmsr */
 		/* rdmsr */
-		if (kvm_get_msr(ctxt->vcpu, c->regs[VCPU_REGS_RCX], &msr_data)) {
-			kvm_inject_gp(ctxt->vcpu, 0);
+		if (ops->get_msr(ctxt->vcpu, c->regs[VCPU_REGS_RCX], &msr_data)) {
+			emulate_gp(ctxt, 0);
 			goto done;
 			goto done;
 		} else {
 		} else {
 			c->regs[VCPU_REGS_RAX] = (u32)msr_data;
 			c->regs[VCPU_REGS_RAX] = (u32)msr_data;
@@ -3136,14 +3231,14 @@ twobyte_insn:
 		c->dst.type = OP_NONE;
 		c->dst.type = OP_NONE;
 		break;
 		break;
 	case 0x34:		/* sysenter */
 	case 0x34:		/* sysenter */
-		rc = emulate_sysenter(ctxt);
+		rc = emulate_sysenter(ctxt, ops);
 		if (rc != X86EMUL_CONTINUE)
 		if (rc != X86EMUL_CONTINUE)
 			goto done;
 			goto done;
 		else
 		else
 			goto writeback;
 			goto writeback;
 		break;
 		break;
 	case 0x35:		/* sysexit */
 	case 0x35:		/* sysexit */
-		rc = emulate_sysexit(ctxt);
+		rc = emulate_sysexit(ctxt, ops);
 		if (rc != X86EMUL_CONTINUE)
 		if (rc != X86EMUL_CONTINUE)
 			goto done;
 			goto done;
 		else
 		else
@@ -3160,7 +3255,7 @@ twobyte_insn:
 		c->dst.type = OP_NONE;
 		c->dst.type = OP_NONE;
 		break;
 		break;
 	case 0xa0:	  /* push fs */
 	case 0xa0:	  /* push fs */
-		emulate_push_sreg(ctxt, VCPU_SREG_FS);
+		emulate_push_sreg(ctxt, ops, VCPU_SREG_FS);
 		break;
 		break;
 	case 0xa1:	 /* pop fs */
 	case 0xa1:	 /* pop fs */
 		rc = emulate_pop_sreg(ctxt, ops, VCPU_SREG_FS);
 		rc = emulate_pop_sreg(ctxt, ops, VCPU_SREG_FS);
@@ -3179,7 +3274,7 @@ twobyte_insn:
 		emulate_2op_cl("shld", c->src2, c->src, c->dst, ctxt->eflags);
 		emulate_2op_cl("shld", c->src2, c->src, c->dst, ctxt->eflags);
 		break;
 		break;
 	case 0xa8:	/* push gs */
 	case 0xa8:	/* push gs */
-		emulate_push_sreg(ctxt, VCPU_SREG_GS);
+		emulate_push_sreg(ctxt, ops, VCPU_SREG_GS);
 		break;
 		break;
 	case 0xa9:	/* pop gs */
 	case 0xa9:	/* pop gs */
 		rc = emulate_pop_sreg(ctxt, ops, VCPU_SREG_GS);
 		rc = emulate_pop_sreg(ctxt, ops, VCPU_SREG_GS);
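The jmp far (0xea) hunk above stops using a pre-decoded src2 value and instead pulls the new EIP and CS selector straight out of the raw immediate bytes: op_bytes of offset followed by a 16-bit selector, exactly as the two memcpy calls read them. A minimal user-space sketch of that unpacking, assuming a little-endian buffer and using invented names (unpack_far_ptr is not an emulator function):

#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* Unpack a far-pointer immediate: op_bytes of offset, then a 2-byte selector. */
static void unpack_far_ptr(const uint8_t *imm, unsigned int op_bytes,
			   uint32_t *offset, uint16_t *selector)
{
	*offset = 0;				/* zero-extend 16-bit offsets */
	memcpy(offset, imm, op_bytes);		/* little-endian guest layout */
	memcpy(selector, imm + op_bytes, 2);
}

int main(void)
{
	/* ea 78 56 34 12 1b 00  =  jmp 0x001b:0x12345678 (32-bit operand size) */
	const uint8_t imm[] = { 0x78, 0x56, 0x34, 0x12, 0x1b, 0x00 };
	uint32_t off;
	uint16_t sel;

	unpack_far_ptr(imm, 4, &off, &sel);
	printf("sel=0x%04x off=0x%08x\n", (unsigned int)sel, (unsigned int)off);
	return 0;
}

Zeroing the offset first is what makes the 16-bit operand-size case work, where only two bytes are copied; the patch does the same thing with c->eip = 0 before its memcpy.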

+ 90 - 56
arch/x86/kvm/i8254.c

@@ -5,6 +5,7 @@
  * Copyright (c) 2006 Intel Corporation
  * Copyright (c) 2006 Intel Corporation
  * Copyright (c) 2007 Keir Fraser, XenSource Inc
  * Copyright (c) 2007 Keir Fraser, XenSource Inc
  * Copyright (c) 2008 Intel Corporation
  * Copyright (c) 2008 Intel Corporation
+ * Copyright 2009 Red Hat, Inc. and/or its affiliates.
  *
  *
  * Permission is hereby granted, free of charge, to any person obtaining a copy
  * Permission is hereby granted, free of charge, to any person obtaining a copy
  * of this software and associated documentation files (the "Software"), to deal
  * of this software and associated documentation files (the "Software"), to deal
@@ -33,6 +34,7 @@
 
 
 #include <linux/kvm_host.h>
 #include <linux/kvm_host.h>
 #include <linux/slab.h>
 #include <linux/slab.h>
+#include <linux/workqueue.h>
 
 
 #include "irq.h"
 #include "irq.h"
 #include "i8254.h"
 #include "i8254.h"
@@ -243,11 +245,22 @@ static void kvm_pit_ack_irq(struct kvm_irq_ack_notifier *kian)
 {
 {
 	struct kvm_kpit_state *ps = container_of(kian, struct kvm_kpit_state,
 	struct kvm_kpit_state *ps = container_of(kian, struct kvm_kpit_state,
 						 irq_ack_notifier);
 						 irq_ack_notifier);
-	raw_spin_lock(&ps->inject_lock);
-	if (atomic_dec_return(&ps->pit_timer.pending) < 0)
+	int value;
+
+	spin_lock(&ps->inject_lock);
+	value = atomic_dec_return(&ps->pit_timer.pending);
+	if (value < 0)
+		/* spurious acks can be generated if, for example, the
+		 * PIC is being reset.  Handle it gracefully here
+		 */
 		atomic_inc(&ps->pit_timer.pending);
 		atomic_inc(&ps->pit_timer.pending);
+	else if (value > 0)
+		/* in this case, we had multiple outstanding pit interrupts
+		 * that we needed to inject.  Reinject
+		 */
+		queue_work(ps->pit->wq, &ps->pit->expired);
 	ps->irq_ack = 1;
 	ps->irq_ack = 1;
-	raw_spin_unlock(&ps->inject_lock);
+	spin_unlock(&ps->inject_lock);
 }
 }
 
 
 void __kvm_migrate_pit_timer(struct kvm_vcpu *vcpu)
 void __kvm_migrate_pit_timer(struct kvm_vcpu *vcpu)
@@ -263,10 +276,10 @@ void __kvm_migrate_pit_timer(struct kvm_vcpu *vcpu)
 		hrtimer_start_expires(timer, HRTIMER_MODE_ABS);
 		hrtimer_start_expires(timer, HRTIMER_MODE_ABS);
 }
 }
 
 
-static void destroy_pit_timer(struct kvm_timer *pt)
+static void destroy_pit_timer(struct kvm_pit *pit)
 {
 {
-	pr_debug("execute del timer!\n");
-	hrtimer_cancel(&pt->timer);
+	hrtimer_cancel(&pit->pit_state.pit_timer.timer);
+	cancel_work_sync(&pit->expired);
 }
 }
 
 
 static bool kpit_is_periodic(struct kvm_timer *ktimer)
 static bool kpit_is_periodic(struct kvm_timer *ktimer)
@@ -280,6 +293,60 @@ static struct kvm_timer_ops kpit_ops = {
 	.is_periodic = kpit_is_periodic,
 	.is_periodic = kpit_is_periodic,
 };
 };
 
 
+static void pit_do_work(struct work_struct *work)
+{
+	struct kvm_pit *pit = container_of(work, struct kvm_pit, expired);
+	struct kvm *kvm = pit->kvm;
+	struct kvm_vcpu *vcpu;
+	int i;
+	struct kvm_kpit_state *ps = &pit->pit_state;
+	int inject = 0;
+
+	/* Try to inject pending interrupts when
+	 * last one has been acked.
+	 */
+	spin_lock(&ps->inject_lock);
+	if (ps->irq_ack) {
+		ps->irq_ack = 0;
+		inject = 1;
+	}
+	spin_unlock(&ps->inject_lock);
+	if (inject) {
+		kvm_set_irq(kvm, kvm->arch.vpit->irq_source_id, 0, 1);
+		kvm_set_irq(kvm, kvm->arch.vpit->irq_source_id, 0, 0);
+
+		/*
+		 * Provides NMI watchdog support via Virtual Wire mode.
+		 * The route is: PIT -> PIC -> LVT0 in NMI mode.
+		 *
+		 * Note: Our Virtual Wire implementation is simplified, only
+		 * propagating PIT interrupts to all VCPUs when they have set
+		 * LVT0 to NMI delivery. Other PIC interrupts are just sent to
+		 * VCPU0, and only if its LVT0 is in EXTINT mode.
+		 */
+		if (kvm->arch.vapics_in_nmi_mode > 0)
+			kvm_for_each_vcpu(i, vcpu, kvm)
+				kvm_apic_nmi_wd_deliver(vcpu);
+	}
+}
+
+static enum hrtimer_restart pit_timer_fn(struct hrtimer *data)
+{
+	struct kvm_timer *ktimer = container_of(data, struct kvm_timer, timer);
+	struct kvm_pit *pt = ktimer->kvm->arch.vpit;
+
+	if (ktimer->reinject || !atomic_read(&ktimer->pending)) {
+		atomic_inc(&ktimer->pending);
+		queue_work(pt->wq, &pt->expired);
+	}
+
+	if (ktimer->t_ops->is_periodic(ktimer)) {
+		hrtimer_add_expires_ns(&ktimer->timer, ktimer->period);
+		return HRTIMER_RESTART;
+	} else
+		return HRTIMER_NORESTART;
+}
+
 static void create_pit_timer(struct kvm_kpit_state *ps, u32 val, int is_period)
 static void create_pit_timer(struct kvm_kpit_state *ps, u32 val, int is_period)
 {
 {
 	struct kvm_timer *pt = &ps->pit_timer;
 	struct kvm_timer *pt = &ps->pit_timer;
@@ -291,13 +358,13 @@ static void create_pit_timer(struct kvm_kpit_state *ps, u32 val, int is_period)
 
 
 	/* TODO The new value only affected after the retriggered */
 	/* TODO The new value only affected after the retriggered */
 	hrtimer_cancel(&pt->timer);
 	hrtimer_cancel(&pt->timer);
+	cancel_work_sync(&ps->pit->expired);
 	pt->period = interval;
 	pt->period = interval;
 	ps->is_periodic = is_period;
 	ps->is_periodic = is_period;
 
 
-	pt->timer.function = kvm_timer_fn;
+	pt->timer.function = pit_timer_fn;
 	pt->t_ops = &kpit_ops;
 	pt->t_ops = &kpit_ops;
 	pt->kvm = ps->pit->kvm;
 	pt->kvm = ps->pit->kvm;
-	pt->vcpu = pt->kvm->bsp_vcpu;
 
 
 	atomic_set(&pt->pending, 0);
 	atomic_set(&pt->pending, 0);
 	ps->irq_ack = 1;
 	ps->irq_ack = 1;
@@ -346,7 +413,7 @@ static void pit_load_count(struct kvm *kvm, int channel, u32 val)
 		}
 		}
 		break;
 		break;
 	default:
 	default:
-		destroy_pit_timer(&ps->pit_timer);
+		destroy_pit_timer(kvm->arch.vpit);
 	}
 	}
 }
 }
 
 
@@ -625,7 +692,15 @@ struct kvm_pit *kvm_create_pit(struct kvm *kvm, u32 flags)
 
 
 	mutex_init(&pit->pit_state.lock);
 	mutex_init(&pit->pit_state.lock);
 	mutex_lock(&pit->pit_state.lock);
 	mutex_lock(&pit->pit_state.lock);
-	raw_spin_lock_init(&pit->pit_state.inject_lock);
+	spin_lock_init(&pit->pit_state.inject_lock);
+
+	pit->wq = create_singlethread_workqueue("kvm-pit-wq");
+	if (!pit->wq) {
+		mutex_unlock(&pit->pit_state.lock);
+		kfree(pit);
+		return NULL;
+	}
+	INIT_WORK(&pit->expired, pit_do_work);
 
 
 	kvm->arch.vpit = pit;
 	kvm->arch.vpit = pit;
 	pit->kvm = kvm;
 	pit->kvm = kvm;
@@ -677,6 +752,9 @@ void kvm_free_pit(struct kvm *kvm)
 	struct hrtimer *timer;
 	struct hrtimer *timer;
 
 
 	if (kvm->arch.vpit) {
 	if (kvm->arch.vpit) {
+		kvm_io_bus_unregister_dev(kvm, KVM_PIO_BUS, &kvm->arch.vpit->dev);
+		kvm_io_bus_unregister_dev(kvm, KVM_PIO_BUS,
+					      &kvm->arch.vpit->speaker_dev);
 		kvm_unregister_irq_mask_notifier(kvm, 0,
 		kvm_unregister_irq_mask_notifier(kvm, 0,
 					       &kvm->arch.vpit->mask_notifier);
 					       &kvm->arch.vpit->mask_notifier);
 		kvm_unregister_irq_ack_notifier(kvm,
 		kvm_unregister_irq_ack_notifier(kvm,
@@ -684,54 +762,10 @@ void kvm_free_pit(struct kvm *kvm)
 		mutex_lock(&kvm->arch.vpit->pit_state.lock);
 		mutex_lock(&kvm->arch.vpit->pit_state.lock);
 		timer = &kvm->arch.vpit->pit_state.pit_timer.timer;
 		timer = &kvm->arch.vpit->pit_state.pit_timer.timer;
 		hrtimer_cancel(timer);
 		hrtimer_cancel(timer);
+		cancel_work_sync(&kvm->arch.vpit->expired);
 		kvm_free_irq_source_id(kvm, kvm->arch.vpit->irq_source_id);
 		kvm_free_irq_source_id(kvm, kvm->arch.vpit->irq_source_id);
 		mutex_unlock(&kvm->arch.vpit->pit_state.lock);
 		mutex_unlock(&kvm->arch.vpit->pit_state.lock);
+		destroy_workqueue(kvm->arch.vpit->wq);
 		kfree(kvm->arch.vpit);
 		kfree(kvm->arch.vpit);
 	}
 	}
 }
 }
-
-static void __inject_pit_timer_intr(struct kvm *kvm)
-{
-	struct kvm_vcpu *vcpu;
-	int i;
-
-	kvm_set_irq(kvm, kvm->arch.vpit->irq_source_id, 0, 1);
-	kvm_set_irq(kvm, kvm->arch.vpit->irq_source_id, 0, 0);
-
-	/*
-	 * Provides NMI watchdog support via Virtual Wire mode.
-	 * The route is: PIT -> PIC -> LVT0 in NMI mode.
-	 *
-	 * Note: Our Virtual Wire implementation is simplified, only
-	 * propagating PIT interrupts to all VCPUs when they have set
-	 * LVT0 to NMI delivery. Other PIC interrupts are just sent to
-	 * VCPU0, and only if its LVT0 is in EXTINT mode.
-	 */
-	if (kvm->arch.vapics_in_nmi_mode > 0)
-		kvm_for_each_vcpu(i, vcpu, kvm)
-			kvm_apic_nmi_wd_deliver(vcpu);
-}
-
-void kvm_inject_pit_timer_irqs(struct kvm_vcpu *vcpu)
-{
-	struct kvm_pit *pit = vcpu->kvm->arch.vpit;
-	struct kvm *kvm = vcpu->kvm;
-	struct kvm_kpit_state *ps;
-
-	if (pit) {
-		int inject = 0;
-		ps = &pit->pit_state;
-
-		/* Try to inject pending interrupts when
-		 * last one has been acked.
-		 */
-		raw_spin_lock(&ps->inject_lock);
-		if (atomic_read(&ps->pit_timer.pending) && ps->irq_ack) {
-			ps->irq_ack = 0;
-			inject = 1;
-		}
-		raw_spin_unlock(&ps->inject_lock);
-		if (inject)
-			__inject_pit_timer_intr(kvm);
-	}
-}
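The i8254.c changes above move PIT interrupt injection off the vcpu entry path: the hrtimer callback now only bumps the pending count and queues a work item on the PIT's own single-threaded workqueue, and pit_do_work() performs the actual kvm_set_irq() calls (plus the NMI watchdog fan-out) from process context. A condensed sketch of that hrtimer-defers-to-workqueue shape; all names here (my_dev, my_timer_fn, my_work_fn, my_dev_start) are invented, and this is a sketch of the pattern, not the KVM code itself:

#include <linux/kernel.h>
#include <linux/types.h>
#include <linux/errno.h>
#include <linux/hrtimer.h>
#include <linux/workqueue.h>
#include <linux/atomic.h>
#include <linux/ktime.h>

struct my_dev {
	struct hrtimer timer;
	struct work_struct expired;
	struct workqueue_struct *wq;
	atomic_t pending;
	u64 period_ns;
};

static void my_work_fn(struct work_struct *work)
{
	struct my_dev *d = container_of(work, struct my_dev, expired);

	if (atomic_dec_if_positive(&d->pending) < 0)
		return;		/* nothing pending (e.g. spurious queueing) */

	/* Process context: safe to take mutexes, raise interrupts through
	 * helpers that may sleep, kick other CPUs, and so on. */
}

static enum hrtimer_restart my_timer_fn(struct hrtimer *t)
{
	struct my_dev *d = container_of(t, struct my_dev, timer);

	/* Hard-irq context: record the tick and defer everything else. */
	atomic_inc(&d->pending);
	queue_work(d->wq, &d->expired);

	hrtimer_add_expires_ns(t, d->period_ns);
	return HRTIMER_RESTART;
}

static int my_dev_start(struct my_dev *d, u64 period_ns)
{
	d->wq = create_singlethread_workqueue("my-dev-wq");
	if (!d->wq)
		return -ENOMEM;

	INIT_WORK(&d->expired, my_work_fn);
	atomic_set(&d->pending, 0);
	d->period_ns = period_ns;

	hrtimer_init(&d->timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
	d->timer.function = my_timer_fn;
	hrtimer_start(&d->timer, ns_to_ktime(period_ns), HRTIMER_MODE_REL);
	return 0;
}

Keeping the timer callback this small is the point of the pattern; everything that may sleep runs from the work item, which is also why kvm_free_pit() above now pairs hrtimer_cancel() with cancel_work_sync() before destroying the workqueue.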

+ 3 - 1
arch/x86/kvm/i8254.h

@@ -27,7 +27,7 @@ struct kvm_kpit_state {
 	u32    speaker_data_on;
 	u32    speaker_data_on;
 	struct mutex lock;
 	struct mutex lock;
 	struct kvm_pit *pit;
 	struct kvm_pit *pit;
-	raw_spinlock_t inject_lock;
+	spinlock_t inject_lock;
 	unsigned long irq_ack;
 	unsigned long irq_ack;
 	struct kvm_irq_ack_notifier irq_ack_notifier;
 	struct kvm_irq_ack_notifier irq_ack_notifier;
 };
 };
@@ -40,6 +40,8 @@ struct kvm_pit {
 	struct kvm_kpit_state pit_state;
 	struct kvm_kpit_state pit_state;
 	int irq_source_id;
 	int irq_source_id;
 	struct kvm_irq_mask_notifier mask_notifier;
 	struct kvm_irq_mask_notifier mask_notifier;
+	struct workqueue_struct *wq;
+	struct work_struct expired;
 };
 };
 
 
 #define KVM_PIT_BASE_ADDRESS	    0x40
 #define KVM_PIT_BASE_ADDRESS	    0x40

+ 31 - 17
arch/x86/kvm/i8259.c

@@ -3,6 +3,7 @@
  *
  *
  * Copyright (c) 2003-2004 Fabrice Bellard
  * Copyright (c) 2003-2004 Fabrice Bellard
  * Copyright (c) 2007 Intel Corporation
  * Copyright (c) 2007 Intel Corporation
+ * Copyright 2009 Red Hat, Inc. and/or its affiliates.
  *
  *
  * Permission is hereby granted, free of charge, to any person obtaining a copy
  * Permission is hereby granted, free of charge, to any person obtaining a copy
  * of this software and associated documentation files (the "Software"), to deal
  * of this software and associated documentation files (the "Software"), to deal
@@ -33,6 +34,8 @@
 #include <linux/kvm_host.h>
 #include <linux/kvm_host.h>
 #include "trace.h"
 #include "trace.h"
 
 
+static void pic_irq_request(struct kvm *kvm, int level);
+
 static void pic_lock(struct kvm_pic *s)
 static void pic_lock(struct kvm_pic *s)
 	__acquires(&s->lock)
 	__acquires(&s->lock)
 {
 {
@@ -43,16 +46,25 @@ static void pic_unlock(struct kvm_pic *s)
 	__releases(&s->lock)
 	__releases(&s->lock)
 {
 {
 	bool wakeup = s->wakeup_needed;
 	bool wakeup = s->wakeup_needed;
-	struct kvm_vcpu *vcpu;
+	struct kvm_vcpu *vcpu, *found = NULL;
+	int i;
 
 
 	s->wakeup_needed = false;
 	s->wakeup_needed = false;
 
 
 	raw_spin_unlock(&s->lock);
 	raw_spin_unlock(&s->lock);
 
 
 	if (wakeup) {
 	if (wakeup) {
-		vcpu = s->kvm->bsp_vcpu;
-		if (vcpu)
-			kvm_vcpu_kick(vcpu);
+		kvm_for_each_vcpu(i, vcpu, s->kvm) {
+			if (kvm_apic_accept_pic_intr(vcpu)) {
+				found = vcpu;
+				break;
+			}
+		}
+
+		if (!found)
+			found = s->kvm->bsp_vcpu;
+
+		kvm_vcpu_kick(found);
 	}
 	}
 }
 }
 
 
@@ -173,10 +185,7 @@ static void pic_update_irq(struct kvm_pic *s)
 		pic_set_irq1(&s->pics[0], 2, 0);
 		pic_set_irq1(&s->pics[0], 2, 0);
 	}
 	}
 	irq = pic_get_irq(&s->pics[0]);
 	irq = pic_get_irq(&s->pics[0]);
-	if (irq >= 0)
-		s->irq_request(s->irq_request_opaque, 1);
-	else
-		s->irq_request(s->irq_request_opaque, 0);
+	pic_irq_request(s->kvm, irq >= 0);
 }
 }
 
 
 void kvm_pic_update_irq(struct kvm_pic *s)
 void kvm_pic_update_irq(struct kvm_pic *s)
@@ -261,8 +270,7 @@ int kvm_pic_read_irq(struct kvm *kvm)
 void kvm_pic_reset(struct kvm_kpic_state *s)
 void kvm_pic_reset(struct kvm_kpic_state *s)
 {
 {
 	int irq;
 	int irq;
-	struct kvm *kvm = s->pics_state->irq_request_opaque;
-	struct kvm_vcpu *vcpu0 = kvm->bsp_vcpu;
+	struct kvm_vcpu *vcpu0 = s->pics_state->kvm->bsp_vcpu;
 	u8 irr = s->irr, isr = s->imr;
 	u8 irr = s->irr, isr = s->imr;
 
 
 	s->last_irr = 0;
 	s->last_irr = 0;
@@ -301,8 +309,7 @@ static void pic_ioport_write(void *opaque, u32 addr, u32 val)
 			/*
 			/*
 			 * deassert a pending interrupt
 			 * deassert a pending interrupt
 			 */
 			 */
-			s->pics_state->irq_request(s->pics_state->
-						   irq_request_opaque, 0);
+			pic_irq_request(s->pics_state->kvm, 0);
 			s->init_state = 1;
 			s->init_state = 1;
 			s->init4 = val & 1;
 			s->init4 = val & 1;
 			if (val & 0x02)
 			if (val & 0x02)
@@ -356,10 +363,20 @@ static void pic_ioport_write(void *opaque, u32 addr, u32 val)
 		}
 		}
 	} else
 	} else
 		switch (s->init_state) {
 		switch (s->init_state) {
-		case 0:		/* normal mode */
+		case 0: { /* normal mode */
+			u8 imr_diff = s->imr ^ val,
+				off = (s == &s->pics_state->pics[0]) ? 0 : 8;
 			s->imr = val;
 			s->imr = val;
+			for (irq = 0; irq < PIC_NUM_PINS/2; irq++)
+				if (imr_diff & (1 << irq))
+					kvm_fire_mask_notifiers(
+						s->pics_state->kvm,
+						SELECT_PIC(irq + off),
+						irq + off,
+						!!(s->imr & (1 << irq)));
 			pic_update_irq(s->pics_state);
 			pic_update_irq(s->pics_state);
 			break;
 			break;
+		}
 		case 1:
 		case 1:
 			s->irq_base = val & 0xf8;
 			s->irq_base = val & 0xf8;
 			s->init_state = 2;
 			s->init_state = 2;
@@ -518,9 +535,8 @@ static int picdev_read(struct kvm_io_device *this,
 /*
 /*
  * callback when PIC0 irq status changed
  * callback when PIC0 irq status changed
  */
  */
-static void pic_irq_request(void *opaque, int level)
+static void pic_irq_request(struct kvm *kvm, int level)
 {
 {
-	struct kvm *kvm = opaque;
 	struct kvm_vcpu *vcpu = kvm->bsp_vcpu;
 	struct kvm_vcpu *vcpu = kvm->bsp_vcpu;
 	struct kvm_pic *s = pic_irqchip(kvm);
 	struct kvm_pic *s = pic_irqchip(kvm);
 	int irq = pic_get_irq(&s->pics[0]);
 	int irq = pic_get_irq(&s->pics[0]);
@@ -549,8 +565,6 @@ struct kvm_pic *kvm_create_pic(struct kvm *kvm)
 	s->kvm = kvm;
 	s->kvm = kvm;
 	s->pics[0].elcr_mask = 0xf8;
 	s->pics[0].elcr_mask = 0xf8;
 	s->pics[1].elcr_mask = 0xde;
 	s->pics[1].elcr_mask = 0xde;
-	s->irq_request = pic_irq_request;
-	s->irq_request_opaque = kvm;
 	s->pics[0].pics_state = s;
 	s->pics[0].pics_state = s;
 	s->pics[1].pics_state = s;
 	s->pics[1].pics_state = s;
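In pic_ioport_write() above, an OCW1 write now XORs the old and new IMR and fires a mask notifier only for the pins whose mask bit actually changed. The bit-diff pattern on its own, as a small stand-alone C program with a placeholder callback (mask_notify and update_imr are illustrative names, not KVM functions):

#include <stdint.h>
#include <stdio.h>

static void mask_notify(int irq, int masked)
{
	printf("irq %d is now %s\n", irq, masked ? "masked" : "unmasked");
}

/* Notify only for the pins whose mask bit changed between old and new IMR. */
static void update_imr(uint8_t *imr, uint8_t val, int base_irq)
{
	uint8_t diff = *imr ^ val;
	int pin;

	*imr = val;
	for (pin = 0; pin < 8; pin++)
		if (diff & (1u << pin))
			mask_notify(base_irq + pin, !!(val & (1u << pin)));
}

int main(void)
{
	uint8_t imr = 0x00;

	update_imr(&imr, 0x05, 0);	/* masks IRQ0 and IRQ2 */
	update_imr(&imr, 0x04, 0);	/* unmasks IRQ0 only   */
	return 0;
}

The base_irq parameter plays the role of the patch's off value: 0 for the master PIC, 8 for the slave.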
 
 

+ 1 - 1
arch/x86/kvm/irq.c

@@ -1,6 +1,7 @@
 /*
 /*
  * irq.c: API for in kernel interrupt controller
  * irq.c: API for in kernel interrupt controller
  * Copyright (c) 2007, Intel Corporation.
  * Copyright (c) 2007, Intel Corporation.
+ * Copyright 2009 Red Hat, Inc. and/or its affiliates.
  *
  *
  * This program is free software; you can redistribute it and/or modify it
  * This program is free software; you can redistribute it and/or modify it
  * under the terms and conditions of the GNU General Public License,
  * under the terms and conditions of the GNU General Public License,
@@ -89,7 +90,6 @@ EXPORT_SYMBOL_GPL(kvm_cpu_get_interrupt);
 void kvm_inject_pending_timer_irqs(struct kvm_vcpu *vcpu)
 void kvm_inject_pending_timer_irqs(struct kvm_vcpu *vcpu)
 {
 {
 	kvm_inject_apic_timer_irqs(vcpu);
 	kvm_inject_apic_timer_irqs(vcpu);
-	kvm_inject_pit_timer_irqs(vcpu);
 	/* TODO: PIT, RTC etc. */
 	/* TODO: PIT, RTC etc. */
 }
 }
 EXPORT_SYMBOL_GPL(kvm_inject_pending_timer_irqs);
 EXPORT_SYMBOL_GPL(kvm_inject_pending_timer_irqs);

+ 0 - 4
arch/x86/kvm/irq.h

@@ -38,8 +38,6 @@
 struct kvm;
 struct kvm;
 struct kvm_vcpu;
 struct kvm_vcpu;
 
 
-typedef void irq_request_func(void *opaque, int level);
-
 struct kvm_kpic_state {
 struct kvm_kpic_state {
 	u8 last_irr;	/* edge detection */
 	u8 last_irr;	/* edge detection */
 	u8 irr;		/* interrupt request register */
 	u8 irr;		/* interrupt request register */
@@ -67,8 +65,6 @@ struct kvm_pic {
 	unsigned pending_acks;
 	unsigned pending_acks;
 	struct kvm *kvm;
 	struct kvm *kvm;
 	struct kvm_kpic_state pics[2]; /* 0 is master pic, 1 is slave pic */
 	struct kvm_kpic_state pics[2]; /* 0 is master pic, 1 is slave pic */
-	irq_request_func *irq_request;
-	void *irq_request_opaque;
 	int output;		/* intr from master PIC */
 	int output;		/* intr from master PIC */
 	struct kvm_io_device dev;
 	struct kvm_io_device dev;
 	void (*ack_notifier)(void *opaque, int irq);
 	void (*ack_notifier)(void *opaque, int irq);

+ 8 - 0
arch/x86/kvm/kvm_cache_regs.h

@@ -36,6 +36,8 @@ static inline void kvm_rip_write(struct kvm_vcpu *vcpu, unsigned long val)
 
 
 static inline u64 kvm_pdptr_read(struct kvm_vcpu *vcpu, int index)
 static inline u64 kvm_pdptr_read(struct kvm_vcpu *vcpu, int index)
 {
 {
+	might_sleep();  /* on svm */
+
 	if (!test_bit(VCPU_EXREG_PDPTR,
 	if (!test_bit(VCPU_EXREG_PDPTR,
 		      (unsigned long *)&vcpu->arch.regs_avail))
 		      (unsigned long *)&vcpu->arch.regs_avail))
 		kvm_x86_ops->cache_reg(vcpu, VCPU_EXREG_PDPTR);
 		kvm_x86_ops->cache_reg(vcpu, VCPU_EXREG_PDPTR);
@@ -69,4 +71,10 @@ static inline ulong kvm_read_cr4(struct kvm_vcpu *vcpu)
 	return kvm_read_cr4_bits(vcpu, ~0UL);
 	return kvm_read_cr4_bits(vcpu, ~0UL);
 }
 }
 
 
+static inline u64 kvm_read_edx_eax(struct kvm_vcpu *vcpu)
+{
+	return (kvm_register_read(vcpu, VCPU_REGS_RAX) & -1u)
+		| ((u64)(kvm_register_read(vcpu, VCPU_REGS_RDX) & -1u) << 32);
+}
+
 #endif
 #endif
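The new kvm_read_edx_eax() helper above folds the EDX:EAX register pair into one 64-bit value, the layout used by WRMSR-style instructions. The same arithmetic as a tiny stand-alone check (the register values here are invented):

#include <stdint.h>
#include <stdio.h>

static uint64_t edx_eax_to_u64(uint32_t eax, uint32_t edx)
{
	/* EDX supplies bits 63:32, EAX bits 31:0. */
	return ((uint64_t)edx << 32) | eax;
}

int main(void)
{
	printf("0x%016llx\n",
	       (unsigned long long)edx_eax_to_u64(0x9abcdef0u, 0x12345678u));
	return 0;
}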

+ 8 - 9
arch/x86/kvm/lapic.c

@@ -5,6 +5,7 @@
  * Copyright (C) 2006 Qumranet, Inc.
  * Copyright (C) 2006 Qumranet, Inc.
  * Copyright (C) 2007 Novell
  * Copyright (C) 2007 Novell
  * Copyright (C) 2007 Intel
  * Copyright (C) 2007 Intel
+ * Copyright 2009 Red Hat, Inc. and/or its affiliates.
  *
  *
  * Authors:
  * Authors:
  *   Dor Laor <dor.laor@qumranet.com>
  *   Dor Laor <dor.laor@qumranet.com>
@@ -328,7 +329,7 @@ int kvm_apic_match_dest(struct kvm_vcpu *vcpu, struct kvm_lapic *source,
 		   "dest_mode 0x%x, short_hand 0x%x\n",
 		   "dest_mode 0x%x, short_hand 0x%x\n",
 		   target, source, dest, dest_mode, short_hand);
 		   target, source, dest, dest_mode, short_hand);
 
 
-	ASSERT(!target);
+	ASSERT(target);
 	switch (short_hand) {
 	switch (short_hand) {
 	case APIC_DEST_NOSHORT:
 	case APIC_DEST_NOSHORT:
 		if (dest_mode == 0)
 		if (dest_mode == 0)
@@ -533,7 +534,7 @@ static void __report_tpr_access(struct kvm_lapic *apic, bool write)
 	struct kvm_vcpu *vcpu = apic->vcpu;
 	struct kvm_vcpu *vcpu = apic->vcpu;
 	struct kvm_run *run = vcpu->run;
 	struct kvm_run *run = vcpu->run;
 
 
-	set_bit(KVM_REQ_REPORT_TPR_ACCESS, &vcpu->requests);
+	kvm_make_request(KVM_REQ_REPORT_TPR_ACCESS, vcpu);
 	run->tpr_access.rip = kvm_rip_read(vcpu);
 	run->tpr_access.rip = kvm_rip_read(vcpu);
 	run->tpr_access.is_write = write;
 	run->tpr_access.is_write = write;
 }
 }
@@ -1106,13 +1107,11 @@ int kvm_apic_accept_pic_intr(struct kvm_vcpu *vcpu)
 	u32 lvt0 = apic_get_reg(vcpu->arch.apic, APIC_LVT0);
 	u32 lvt0 = apic_get_reg(vcpu->arch.apic, APIC_LVT0);
 	int r = 0;
 	int r = 0;
 
 
-	if (kvm_vcpu_is_bsp(vcpu)) {
-		if (!apic_hw_enabled(vcpu->arch.apic))
-			r = 1;
-		if ((lvt0 & APIC_LVT_MASKED) == 0 &&
-		    GET_APIC_DELIVERY_MODE(lvt0) == APIC_MODE_EXTINT)
-			r = 1;
-	}
+	if (!apic_hw_enabled(vcpu->arch.apic))
+		r = 1;
+	if ((lvt0 & APIC_LVT_MASKED) == 0 &&
+	    GET_APIC_DELIVERY_MODE(lvt0) == APIC_MODE_EXTINT)
+		r = 1;
 	return r;
 	return r;
 }
 }
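In the mmu.c diff that follows, plain __set_spte() stores are replaced by update_spte()/__xchg_spte() so that an accessed bit the CPU may set concurrently is read back atomically rather than silently overwritten. On x86-64 a 64-bit xchg does the job; 32-bit hosts have no 64-bit exchange instruction, so the new code loops on cmpxchg64. A stand-alone C11 sketch of that retry loop, with illustrative names only:

#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>

/* Atomically install new_val and return the previous value, emulating a
 * 64-bit xchg with a compare-and-swap retry loop (the 32-bit fallback). */
static uint64_t xchg_u64(_Atomic uint64_t *p, uint64_t new_val)
{
	uint64_t old = atomic_load(p);

	/* On failure, 'old' is refreshed with the current value; retry. */
	while (!atomic_compare_exchange_weak(p, &old, new_val))
		;
	return old;
}

int main(void)
{
	_Atomic uint64_t spte = 0x80000000000003ffULL;	/* made-up PTE bits */
	uint64_t old = xchg_u64(&spte, 0);

	printf("old spte: 0x%016llx\n", (unsigned long long)old);
	return 0;
}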
 
 

+ 497 - 310
arch/x86/kvm/mmu.c

@@ -7,6 +7,7 @@
  * MMU support
  * MMU support
  *
  *
  * Copyright (C) 2006 Qumranet, Inc.
  * Copyright (C) 2006 Qumranet, Inc.
+ * Copyright 2010 Red Hat, Inc. and/or its affiliates.
  *
  *
  * Authors:
  * Authors:
  *   Yaniv Kamay  <yaniv@qumranet.com>
  *   Yaniv Kamay  <yaniv@qumranet.com>
@@ -32,6 +33,7 @@
 #include <linux/compiler.h>
 #include <linux/compiler.h>
 #include <linux/srcu.h>
 #include <linux/srcu.h>
 #include <linux/slab.h>
 #include <linux/slab.h>
+#include <linux/uaccess.h>
 
 
 #include <asm/page.h>
 #include <asm/page.h>
 #include <asm/cmpxchg.h>
 #include <asm/cmpxchg.h>
@@ -90,8 +92,6 @@ module_param(oos_shadow, bool, 0644);
 #define PT_FIRST_AVAIL_BITS_SHIFT 9
 #define PT_FIRST_AVAIL_BITS_SHIFT 9
 #define PT64_SECOND_AVAIL_BITS_SHIFT 52
 #define PT64_SECOND_AVAIL_BITS_SHIFT 52
 
 
-#define VALID_PAGE(x) ((x) != INVALID_PAGE)
-
 #define PT64_LEVEL_BITS 9
 #define PT64_LEVEL_BITS 9
 
 
 #define PT64_LEVEL_SHIFT(level) \
 #define PT64_LEVEL_SHIFT(level) \
@@ -173,7 +173,7 @@ struct kvm_shadow_walk_iterator {
 	     shadow_walk_okay(&(_walker));			\
 	     shadow_walk_okay(&(_walker));			\
 	     shadow_walk_next(&(_walker)))
 	     shadow_walk_next(&(_walker)))
 
 
-typedef int (*mmu_parent_walk_fn) (struct kvm_mmu_page *sp);
+typedef void (*mmu_parent_walk_fn) (struct kvm_mmu_page *sp, u64 *spte);
 
 
 static struct kmem_cache *pte_chain_cache;
 static struct kmem_cache *pte_chain_cache;
 static struct kmem_cache *rmap_desc_cache;
 static struct kmem_cache *rmap_desc_cache;
@@ -288,6 +288,35 @@ static void __set_spte(u64 *sptep, u64 spte)
 #endif
 #endif
 }
 }
 
 
+static u64 __xchg_spte(u64 *sptep, u64 new_spte)
+{
+#ifdef CONFIG_X86_64
+	return xchg(sptep, new_spte);
+#else
+	u64 old_spte;
+
+	do {
+		old_spte = *sptep;
+	} while (cmpxchg64(sptep, old_spte, new_spte) != old_spte);
+
+	return old_spte;
+#endif
+}
+
+static void update_spte(u64 *sptep, u64 new_spte)
+{
+	u64 old_spte;
+
+	if (!shadow_accessed_mask || (new_spte & shadow_accessed_mask) ||
+	      !is_rmap_spte(*sptep))
+		__set_spte(sptep, new_spte);
+	else {
+		old_spte = __xchg_spte(sptep, new_spte);
+		if (old_spte & shadow_accessed_mask)
+			mark_page_accessed(pfn_to_page(spte_to_pfn(old_spte)));
+	}
+}
+
 static int mmu_topup_memory_cache(struct kvm_mmu_memory_cache *cache,
 static int mmu_topup_memory_cache(struct kvm_mmu_memory_cache *cache,
 				  struct kmem_cache *base_cache, int min)
 				  struct kmem_cache *base_cache, int min)
 {
 {
@@ -304,10 +333,11 @@ static int mmu_topup_memory_cache(struct kvm_mmu_memory_cache *cache,
 	return 0;
 	return 0;
 }
 }
 
 
-static void mmu_free_memory_cache(struct kvm_mmu_memory_cache *mc)
+static void mmu_free_memory_cache(struct kvm_mmu_memory_cache *mc,
+				  struct kmem_cache *cache)
 {
 {
 	while (mc->nobjs)
 	while (mc->nobjs)
-		kfree(mc->objects[--mc->nobjs]);
+		kmem_cache_free(cache, mc->objects[--mc->nobjs]);
 }
 }
 
 
 static int mmu_topup_memory_cache_page(struct kvm_mmu_memory_cache *cache,
 static int mmu_topup_memory_cache_page(struct kvm_mmu_memory_cache *cache,
@@ -355,10 +385,11 @@ out:
 
 
 static void mmu_free_memory_caches(struct kvm_vcpu *vcpu)
 static void mmu_free_memory_caches(struct kvm_vcpu *vcpu)
 {
 {
-	mmu_free_memory_cache(&vcpu->arch.mmu_pte_chain_cache);
-	mmu_free_memory_cache(&vcpu->arch.mmu_rmap_desc_cache);
+	mmu_free_memory_cache(&vcpu->arch.mmu_pte_chain_cache, pte_chain_cache);
+	mmu_free_memory_cache(&vcpu->arch.mmu_rmap_desc_cache, rmap_desc_cache);
 	mmu_free_memory_cache_page(&vcpu->arch.mmu_page_cache);
 	mmu_free_memory_cache_page(&vcpu->arch.mmu_page_cache);
-	mmu_free_memory_cache(&vcpu->arch.mmu_page_header_cache);
+	mmu_free_memory_cache(&vcpu->arch.mmu_page_header_cache,
+				mmu_page_header_cache);
 }
 }
 
 
 static void *mmu_memory_cache_alloc(struct kvm_mmu_memory_cache *mc,
 static void *mmu_memory_cache_alloc(struct kvm_mmu_memory_cache *mc,
@@ -379,7 +410,7 @@ static struct kvm_pte_chain *mmu_alloc_pte_chain(struct kvm_vcpu *vcpu)
 
 
 static void mmu_free_pte_chain(struct kvm_pte_chain *pc)
 static void mmu_free_pte_chain(struct kvm_pte_chain *pc)
 {
 {
-	kfree(pc);
+	kmem_cache_free(pte_chain_cache, pc);
 }
 }
 
 
 static struct kvm_rmap_desc *mmu_alloc_rmap_desc(struct kvm_vcpu *vcpu)
 static struct kvm_rmap_desc *mmu_alloc_rmap_desc(struct kvm_vcpu *vcpu)
@@ -390,7 +421,23 @@ static struct kvm_rmap_desc *mmu_alloc_rmap_desc(struct kvm_vcpu *vcpu)
 
 
 static void mmu_free_rmap_desc(struct kvm_rmap_desc *rd)
 static void mmu_free_rmap_desc(struct kvm_rmap_desc *rd)
 {
 {
-	kfree(rd);
+	kmem_cache_free(rmap_desc_cache, rd);
+}
+
+static gfn_t kvm_mmu_page_get_gfn(struct kvm_mmu_page *sp, int index)
+{
+	if (!sp->role.direct)
+		return sp->gfns[index];
+
+	return sp->gfn + (index << ((sp->role.level - 1) * PT64_LEVEL_BITS));
+}
+
+static void kvm_mmu_page_set_gfn(struct kvm_mmu_page *sp, int index, gfn_t gfn)
+{
+	if (sp->role.direct)
+		BUG_ON(gfn != kvm_mmu_page_get_gfn(sp, index));
+	else
+		sp->gfns[index] = gfn;
 }
 }
 
 
 /*
 /*
@@ -403,8 +450,8 @@ static int *slot_largepage_idx(gfn_t gfn,
 {
 {
 	unsigned long idx;
 	unsigned long idx;
 
 
-	idx = (gfn / KVM_PAGES_PER_HPAGE(level)) -
-	      (slot->base_gfn / KVM_PAGES_PER_HPAGE(level));
+	idx = (gfn >> KVM_HPAGE_GFN_SHIFT(level)) -
+	      (slot->base_gfn >> KVM_HPAGE_GFN_SHIFT(level));
 	return &slot->lpage_info[level - 2][idx].write_count;
 	return &slot->lpage_info[level - 2][idx].write_count;
 }
 }
 
 
@@ -414,9 +461,7 @@ static void account_shadowed(struct kvm *kvm, gfn_t gfn)
 	int *write_count;
 	int *write_count;
 	int i;
 	int i;
 
 
-	gfn = unalias_gfn(kvm, gfn);
-
-	slot = gfn_to_memslot_unaliased(kvm, gfn);
+	slot = gfn_to_memslot(kvm, gfn);
 	for (i = PT_DIRECTORY_LEVEL;
 	for (i = PT_DIRECTORY_LEVEL;
 	     i < PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES; ++i) {
 	     i < PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES; ++i) {
 		write_count   = slot_largepage_idx(gfn, slot, i);
 		write_count   = slot_largepage_idx(gfn, slot, i);
@@ -430,8 +475,7 @@ static void unaccount_shadowed(struct kvm *kvm, gfn_t gfn)
 	int *write_count;
 	int *write_count;
 	int i;
 	int i;
 
 
-	gfn = unalias_gfn(kvm, gfn);
-	slot = gfn_to_memslot_unaliased(kvm, gfn);
+	slot = gfn_to_memslot(kvm, gfn);
 	for (i = PT_DIRECTORY_LEVEL;
 	for (i = PT_DIRECTORY_LEVEL;
 	     i < PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES; ++i) {
 	     i < PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES; ++i) {
 		write_count   = slot_largepage_idx(gfn, slot, i);
 		write_count   = slot_largepage_idx(gfn, slot, i);
@@ -447,8 +491,7 @@ static int has_wrprotected_page(struct kvm *kvm,
 	struct kvm_memory_slot *slot;
 	struct kvm_memory_slot *slot;
 	int *largepage_idx;
 	int *largepage_idx;
 
 
-	gfn = unalias_gfn(kvm, gfn);
-	slot = gfn_to_memslot_unaliased(kvm, gfn);
+	slot = gfn_to_memslot(kvm, gfn);
 	if (slot) {
 	if (slot) {
 		largepage_idx = slot_largepage_idx(gfn, slot, level);
 		largepage_idx = slot_largepage_idx(gfn, slot, level);
 		return *largepage_idx;
 		return *largepage_idx;
@@ -501,7 +544,6 @@ static int mapping_level(struct kvm_vcpu *vcpu, gfn_t large_gfn)
 
 
 /*
 /*
  * Take gfn and return the reverse mapping to it.
  * Take gfn and return the reverse mapping to it.
- * Note: gfn must be unaliased before this function get called
  */
  */
 
 
 static unsigned long *gfn_to_rmap(struct kvm *kvm, gfn_t gfn, int level)
 static unsigned long *gfn_to_rmap(struct kvm *kvm, gfn_t gfn, int level)
@@ -513,8 +555,8 @@ static unsigned long *gfn_to_rmap(struct kvm *kvm, gfn_t gfn, int level)
 	if (likely(level == PT_PAGE_TABLE_LEVEL))
 	if (likely(level == PT_PAGE_TABLE_LEVEL))
 		return &slot->rmap[gfn - slot->base_gfn];
 		return &slot->rmap[gfn - slot->base_gfn];
 
 
-	idx = (gfn / KVM_PAGES_PER_HPAGE(level)) -
-		(slot->base_gfn / KVM_PAGES_PER_HPAGE(level));
+	idx = (gfn >> KVM_HPAGE_GFN_SHIFT(level)) -
+		(slot->base_gfn >> KVM_HPAGE_GFN_SHIFT(level));
 
 
 	return &slot->lpage_info[level - 2][idx].rmap_pde;
 	return &slot->lpage_info[level - 2][idx].rmap_pde;
 }
 }
@@ -541,9 +583,8 @@ static int rmap_add(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn)
 
 
 	if (!is_rmap_spte(*spte))
 	if (!is_rmap_spte(*spte))
 		return count;
 		return count;
-	gfn = unalias_gfn(vcpu->kvm, gfn);
 	sp = page_header(__pa(spte));
 	sp = page_header(__pa(spte));
-	sp->gfns[spte - sp->spt] = gfn;
+	kvm_mmu_page_set_gfn(sp, spte - sp->spt, gfn);
 	rmapp = gfn_to_rmap(vcpu->kvm, gfn, sp->role.level);
 	rmapp = gfn_to_rmap(vcpu->kvm, gfn, sp->role.level);
 	if (!*rmapp) {
 	if (!*rmapp) {
 		rmap_printk("rmap_add: %p %llx 0->1\n", spte, *spte);
 		rmap_printk("rmap_add: %p %llx 0->1\n", spte, *spte);
@@ -600,19 +641,13 @@ static void rmap_remove(struct kvm *kvm, u64 *spte)
 	struct kvm_rmap_desc *desc;
 	struct kvm_rmap_desc *desc;
 	struct kvm_rmap_desc *prev_desc;
 	struct kvm_rmap_desc *prev_desc;
 	struct kvm_mmu_page *sp;
 	struct kvm_mmu_page *sp;
-	pfn_t pfn;
+	gfn_t gfn;
 	unsigned long *rmapp;
 	unsigned long *rmapp;
 	int i;
 	int i;
 
 
-	if (!is_rmap_spte(*spte))
-		return;
 	sp = page_header(__pa(spte));
 	sp = page_header(__pa(spte));
-	pfn = spte_to_pfn(*spte);
-	if (*spte & shadow_accessed_mask)
-		kvm_set_pfn_accessed(pfn);
-	if (is_writable_pte(*spte))
-		kvm_set_pfn_dirty(pfn);
-	rmapp = gfn_to_rmap(kvm, sp->gfns[spte - sp->spt], sp->role.level);
+	gfn = kvm_mmu_page_get_gfn(sp, spte - sp->spt);
+	rmapp = gfn_to_rmap(kvm, gfn, sp->role.level);
 	if (!*rmapp) {
 	if (!*rmapp) {
 		printk(KERN_ERR "rmap_remove: %p %llx 0->BUG\n", spte, *spte);
 		printk(KERN_ERR "rmap_remove: %p %llx 0->BUG\n", spte, *spte);
 		BUG();
 		BUG();
@@ -644,6 +679,32 @@ static void rmap_remove(struct kvm *kvm, u64 *spte)
 	}
 	}
 }
 }
 
 
+static void set_spte_track_bits(u64 *sptep, u64 new_spte)
+{
+	pfn_t pfn;
+	u64 old_spte = *sptep;
+
+	if (!shadow_accessed_mask || !is_shadow_present_pte(old_spte) ||
+	      old_spte & shadow_accessed_mask) {
+		__set_spte(sptep, new_spte);
+	} else
+		old_spte = __xchg_spte(sptep, new_spte);
+
+	if (!is_rmap_spte(old_spte))
+		return;
+	pfn = spte_to_pfn(old_spte);
+	if (!shadow_accessed_mask || old_spte & shadow_accessed_mask)
+		kvm_set_pfn_accessed(pfn);
+	if (is_writable_pte(old_spte))
+		kvm_set_pfn_dirty(pfn);
+}
+
+static void drop_spte(struct kvm *kvm, u64 *sptep, u64 new_spte)
+{
+	set_spte_track_bits(sptep, new_spte);
+	rmap_remove(kvm, sptep);
+}
+
 static u64 *rmap_next(struct kvm *kvm, unsigned long *rmapp, u64 *spte)
 static u64 *rmap_next(struct kvm *kvm, unsigned long *rmapp, u64 *spte)
 {
 {
 	struct kvm_rmap_desc *desc;
 	struct kvm_rmap_desc *desc;
@@ -676,7 +737,6 @@ static int rmap_write_protect(struct kvm *kvm, u64 gfn)
 	u64 *spte;
 	u64 *spte;
 	int i, write_protected = 0;
 	int i, write_protected = 0;
 
 
-	gfn = unalias_gfn(kvm, gfn);
 	rmapp = gfn_to_rmap(kvm, gfn, PT_PAGE_TABLE_LEVEL);
 	rmapp = gfn_to_rmap(kvm, gfn, PT_PAGE_TABLE_LEVEL);
 
 
 	spte = rmap_next(kvm, rmapp, NULL);
 	spte = rmap_next(kvm, rmapp, NULL);
@@ -685,7 +745,7 @@ static int rmap_write_protect(struct kvm *kvm, u64 gfn)
 		BUG_ON(!(*spte & PT_PRESENT_MASK));
 		BUG_ON(!(*spte & PT_PRESENT_MASK));
 		rmap_printk("rmap_write_protect: spte %p %llx\n", spte, *spte);
 		rmap_printk("rmap_write_protect: spte %p %llx\n", spte, *spte);
 		if (is_writable_pte(*spte)) {
 		if (is_writable_pte(*spte)) {
-			__set_spte(spte, *spte & ~PT_WRITABLE_MASK);
+			update_spte(spte, *spte & ~PT_WRITABLE_MASK);
 			write_protected = 1;
 			write_protected = 1;
 		}
 		}
 		spte = rmap_next(kvm, rmapp, spte);
 		spte = rmap_next(kvm, rmapp, spte);
@@ -709,9 +769,9 @@ static int rmap_write_protect(struct kvm *kvm, u64 gfn)
 			BUG_ON((*spte & (PT_PAGE_SIZE_MASK|PT_PRESENT_MASK)) != (PT_PAGE_SIZE_MASK|PT_PRESENT_MASK));
 			BUG_ON((*spte & (PT_PAGE_SIZE_MASK|PT_PRESENT_MASK)) != (PT_PAGE_SIZE_MASK|PT_PRESENT_MASK));
 			pgprintk("rmap_write_protect(large): spte %p %llx %lld\n", spte, *spte, gfn);
 			pgprintk("rmap_write_protect(large): spte %p %llx %lld\n", spte, *spte, gfn);
 			if (is_writable_pte(*spte)) {
 			if (is_writable_pte(*spte)) {
-				rmap_remove(kvm, spte);
+				drop_spte(kvm, spte,
+					  shadow_trap_nonpresent_pte);
 				--kvm->stat.lpages;
 				--kvm->stat.lpages;
-				__set_spte(spte, shadow_trap_nonpresent_pte);
 				spte = NULL;
 				spte = NULL;
 				write_protected = 1;
 				write_protected = 1;
 			}
 			}
@@ -731,8 +791,7 @@ static int kvm_unmap_rmapp(struct kvm *kvm, unsigned long *rmapp,
 	while ((spte = rmap_next(kvm, rmapp, NULL))) {
 	while ((spte = rmap_next(kvm, rmapp, NULL))) {
 		BUG_ON(!(*spte & PT_PRESENT_MASK));
 		BUG_ON(!(*spte & PT_PRESENT_MASK));
 		rmap_printk("kvm_rmap_unmap_hva: spte %p %llx\n", spte, *spte);
 		rmap_printk("kvm_rmap_unmap_hva: spte %p %llx\n", spte, *spte);
-		rmap_remove(kvm, spte);
-		__set_spte(spte, shadow_trap_nonpresent_pte);
+		drop_spte(kvm, spte, shadow_trap_nonpresent_pte);
 		need_tlb_flush = 1;
 		need_tlb_flush = 1;
 	}
 	}
 	return need_tlb_flush;
 	return need_tlb_flush;
@@ -754,8 +813,7 @@ static int kvm_set_pte_rmapp(struct kvm *kvm, unsigned long *rmapp,
 		rmap_printk("kvm_set_pte_rmapp: spte %p %llx\n", spte, *spte);
 		rmap_printk("kvm_set_pte_rmapp: spte %p %llx\n", spte, *spte);
 		need_flush = 1;
 		need_flush = 1;
 		if (pte_write(*ptep)) {
 		if (pte_write(*ptep)) {
-			rmap_remove(kvm, spte);
-			__set_spte(spte, shadow_trap_nonpresent_pte);
+			drop_spte(kvm, spte, shadow_trap_nonpresent_pte);
 			spte = rmap_next(kvm, rmapp, NULL);
 			spte = rmap_next(kvm, rmapp, NULL);
 		} else {
 		} else {
 			new_spte = *spte &~ (PT64_BASE_ADDR_MASK);
 			new_spte = *spte &~ (PT64_BASE_ADDR_MASK);
@@ -763,9 +821,8 @@ static int kvm_set_pte_rmapp(struct kvm *kvm, unsigned long *rmapp,
 
 
 			new_spte &= ~PT_WRITABLE_MASK;
 			new_spte &= ~PT_WRITABLE_MASK;
 			new_spte &= ~SPTE_HOST_WRITEABLE;
 			new_spte &= ~SPTE_HOST_WRITEABLE;
-			if (is_writable_pte(*spte))
-				kvm_set_pfn_dirty(spte_to_pfn(*spte));
-			__set_spte(spte, new_spte);
+			new_spte &= ~shadow_accessed_mask;
+			set_spte_track_bits(spte, new_spte);
 			spte = rmap_next(kvm, rmapp, spte);
 			spte = rmap_next(kvm, rmapp, spte);
 		}
 		}
 	}
 	}
@@ -799,8 +856,12 @@ static int kvm_handle_hva(struct kvm *kvm, unsigned long hva,
 			ret = handler(kvm, &memslot->rmap[gfn_offset], data);
 			ret = handler(kvm, &memslot->rmap[gfn_offset], data);
 
 
 			for (j = 0; j < KVM_NR_PAGE_SIZES - 1; ++j) {
 			for (j = 0; j < KVM_NR_PAGE_SIZES - 1; ++j) {
-				int idx = gfn_offset;
-				idx /= KVM_PAGES_PER_HPAGE(PT_DIRECTORY_LEVEL + j);
+				unsigned long idx;
+				int sh;
+
+				sh = KVM_HPAGE_GFN_SHIFT(PT_DIRECTORY_LEVEL+j);
+				idx = ((memslot->base_gfn+gfn_offset) >> sh) -
+					(memslot->base_gfn >> sh);
 				ret |= handler(kvm,
 				ret |= handler(kvm,
 					&memslot->lpage_info[j][idx].rmap_pde,
 					&memslot->lpage_info[j][idx].rmap_pde,
 					data);
 					data);
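The kvm_handle_hva() hunk just above computes the large-page index from the absolute gfn before subtracting the shifted slot base, rather than shifting the offset alone; the two differ whenever a memslot does not start on a huge-page boundary. A quick stand-alone check of that arithmetic, taking the per-level shift as 9 (512 small pages per huge page) purely for illustration:

#include <stdint.h>
#include <stdio.h>

/* Index of the huge page covering base_gfn + gfn_offset within the slot. */
static unsigned long lpage_idx(uint64_t base_gfn, uint64_t gfn_offset, int shift)
{
	return ((base_gfn + gfn_offset) >> shift) - (base_gfn >> shift);
}

int main(void)
{
	uint64_t base_gfn = 456;	/* slot base, not 512-aligned */
	uint64_t gfn_offset = 400;

	/* The naive gfn_offset >> 9 gives 0, but base + offset has already
	 * crossed into the next huge page, so the correct index is 1. */
	printf("idx = %lu\n", lpage_idx(base_gfn, gfn_offset, 9));
	return 0;
}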
@@ -863,7 +924,6 @@ static void rmap_recycle(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn)
 
 
 	sp = page_header(__pa(spte));
 	sp = page_header(__pa(spte));
 
 
-	gfn = unalias_gfn(vcpu->kvm, gfn);
 	rmapp = gfn_to_rmap(vcpu->kvm, gfn, sp->role.level);
 	rmapp = gfn_to_rmap(vcpu->kvm, gfn, sp->role.level);
 
 
 	kvm_unmap_rmapp(vcpu->kvm, rmapp, 0);
 	kvm_unmap_rmapp(vcpu->kvm, rmapp, 0);
@@ -894,10 +954,12 @@ static int is_empty_shadow_page(u64 *spt)
 static void kvm_mmu_free_page(struct kvm *kvm, struct kvm_mmu_page *sp)
 static void kvm_mmu_free_page(struct kvm *kvm, struct kvm_mmu_page *sp)
 {
 {
 	ASSERT(is_empty_shadow_page(sp->spt));
 	ASSERT(is_empty_shadow_page(sp->spt));
+	hlist_del(&sp->hash_link);
 	list_del(&sp->link);
 	list_del(&sp->link);
 	__free_page(virt_to_page(sp->spt));
 	__free_page(virt_to_page(sp->spt));
-	__free_page(virt_to_page(sp->gfns));
-	kfree(sp);
+	if (!sp->role.direct)
+		__free_page(virt_to_page(sp->gfns));
+	kmem_cache_free(mmu_page_header_cache, sp);
 	++kvm->arch.n_free_mmu_pages;
 	++kvm->arch.n_free_mmu_pages;
 }
 }
 
 
@@ -907,13 +969,15 @@ static unsigned kvm_page_table_hashfn(gfn_t gfn)
 }
 }
 
 
 static struct kvm_mmu_page *kvm_mmu_alloc_page(struct kvm_vcpu *vcpu,
 static struct kvm_mmu_page *kvm_mmu_alloc_page(struct kvm_vcpu *vcpu,
-					       u64 *parent_pte)
+					       u64 *parent_pte, int direct)
 {
 {
 	struct kvm_mmu_page *sp;
 	struct kvm_mmu_page *sp;
 
 
 	sp = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_header_cache, sizeof *sp);
 	sp = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_header_cache, sizeof *sp);
 	sp->spt = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_cache, PAGE_SIZE);
 	sp->spt = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_cache, PAGE_SIZE);
-	sp->gfns = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_cache, PAGE_SIZE);
+	if (!direct)
+		sp->gfns = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_cache,
+						  PAGE_SIZE);
 	set_page_private(virt_to_page(sp->spt), (unsigned long)sp);
 	set_page_private(virt_to_page(sp->spt), (unsigned long)sp);
 	list_add(&sp->link, &vcpu->kvm->arch.active_mmu_pages);
 	list_add(&sp->link, &vcpu->kvm->arch.active_mmu_pages);
 	bitmap_zero(sp->slot_bitmap, KVM_MEMORY_SLOTS + KVM_PRIVATE_MEM_SLOTS);
 	bitmap_zero(sp->slot_bitmap, KVM_MEMORY_SLOTS + KVM_PRIVATE_MEM_SLOTS);
@@ -998,7 +1062,6 @@ static void mmu_page_remove_parent_pte(struct kvm_mmu_page *sp,
 	BUG();
 	BUG();
 }
 }
 
 
-
 static void mmu_parent_walk(struct kvm_mmu_page *sp, mmu_parent_walk_fn fn)
 static void mmu_parent_walk(struct kvm_mmu_page *sp, mmu_parent_walk_fn fn)
 {
 {
 	struct kvm_pte_chain *pte_chain;
 	struct kvm_pte_chain *pte_chain;
@@ -1008,63 +1071,37 @@ static void mmu_parent_walk(struct kvm_mmu_page *sp, mmu_parent_walk_fn fn)
 
 
 	if (!sp->multimapped && sp->parent_pte) {
 	if (!sp->multimapped && sp->parent_pte) {
 		parent_sp = page_header(__pa(sp->parent_pte));
 		parent_sp = page_header(__pa(sp->parent_pte));
-		fn(parent_sp);
-		mmu_parent_walk(parent_sp, fn);
+		fn(parent_sp, sp->parent_pte);
 		return;
 		return;
 	}
 	}
+
 	hlist_for_each_entry(pte_chain, node, &sp->parent_ptes, link)
 	hlist_for_each_entry(pte_chain, node, &sp->parent_ptes, link)
 		for (i = 0; i < NR_PTE_CHAIN_ENTRIES; ++i) {
 		for (i = 0; i < NR_PTE_CHAIN_ENTRIES; ++i) {
-			if (!pte_chain->parent_ptes[i])
+			u64 *spte = pte_chain->parent_ptes[i];
+
+			if (!spte)
 				break;
 				break;
-			parent_sp = page_header(__pa(pte_chain->parent_ptes[i]));
-			fn(parent_sp);
-			mmu_parent_walk(parent_sp, fn);
+			parent_sp = page_header(__pa(spte));
+			fn(parent_sp, spte);
 		}
 		}
 }
 }
 
 
-static void kvm_mmu_update_unsync_bitmap(u64 *spte)
+static void mark_unsync(struct kvm_mmu_page *sp, u64 *spte);
+static void kvm_mmu_mark_parents_unsync(struct kvm_mmu_page *sp)
 {
 {
-	unsigned int index;
-	struct kvm_mmu_page *sp = page_header(__pa(spte));
-
-	index = spte - sp->spt;
-	if (!__test_and_set_bit(index, sp->unsync_child_bitmap))
-		sp->unsync_children++;
-	WARN_ON(!sp->unsync_children);
+	mmu_parent_walk(sp, mark_unsync);
 }
 }
 
 
-static void kvm_mmu_update_parents_unsync(struct kvm_mmu_page *sp)
+static void mark_unsync(struct kvm_mmu_page *sp, u64 *spte)
 {
 {
-	struct kvm_pte_chain *pte_chain;
-	struct hlist_node *node;
-	int i;
+	unsigned int index;
 
 
-	if (!sp->parent_pte)
+	index = spte - sp->spt;
+	if (__test_and_set_bit(index, sp->unsync_child_bitmap))
 		return;
 		return;
-
-	if (!sp->multimapped) {
-		kvm_mmu_update_unsync_bitmap(sp->parent_pte);
+	if (sp->unsync_children++)
 		return;
 		return;
-	}
-
-	hlist_for_each_entry(pte_chain, node, &sp->parent_ptes, link)
-		for (i = 0; i < NR_PTE_CHAIN_ENTRIES; ++i) {
-			if (!pte_chain->parent_ptes[i])
-				break;
-			kvm_mmu_update_unsync_bitmap(pte_chain->parent_ptes[i]);
-		}
-}
-
-static int unsync_walk_fn(struct kvm_mmu_page *sp)
-{
-	kvm_mmu_update_parents_unsync(sp);
-	return 1;
-}
-
-static void kvm_mmu_mark_parents_unsync(struct kvm_mmu_page *sp)
-{
-	mmu_parent_walk(sp, unsync_walk_fn);
-	kvm_mmu_update_parents_unsync(sp);
+	kvm_mmu_mark_parents_unsync(sp);
 }
 }
 
 
 static void nonpaging_prefetch_page(struct kvm_vcpu *vcpu,
 static void nonpaging_prefetch_page(struct kvm_vcpu *vcpu,
@@ -1077,7 +1114,7 @@ static void nonpaging_prefetch_page(struct kvm_vcpu *vcpu,
 }
 }
 
 
 static int nonpaging_sync_page(struct kvm_vcpu *vcpu,
 static int nonpaging_sync_page(struct kvm_vcpu *vcpu,
-			       struct kvm_mmu_page *sp)
+			       struct kvm_mmu_page *sp, bool clear_unsync)
 {
 {
 	return 1;
 	return 1;
 }
 }
@@ -1123,35 +1160,40 @@ static int __mmu_unsync_walk(struct kvm_mmu_page *sp,
 	int i, ret, nr_unsync_leaf = 0;
 	int i, ret, nr_unsync_leaf = 0;
 
 
 	for_each_unsync_children(sp->unsync_child_bitmap, i) {
 	for_each_unsync_children(sp->unsync_child_bitmap, i) {
+		struct kvm_mmu_page *child;
 		u64 ent = sp->spt[i];
 		u64 ent = sp->spt[i];
 
 
-		if (is_shadow_present_pte(ent) && !is_large_pte(ent)) {
-			struct kvm_mmu_page *child;
-			child = page_header(ent & PT64_BASE_ADDR_MASK);
-
-			if (child->unsync_children) {
-				if (mmu_pages_add(pvec, child, i))
-					return -ENOSPC;
-
-				ret = __mmu_unsync_walk(child, pvec);
-				if (!ret)
-					__clear_bit(i, sp->unsync_child_bitmap);
-				else if (ret > 0)
-					nr_unsync_leaf += ret;
-				else
-					return ret;
-			}
+		if (!is_shadow_present_pte(ent) || is_large_pte(ent))
+			goto clear_child_bitmap;
+
+		child = page_header(ent & PT64_BASE_ADDR_MASK);
+
+		if (child->unsync_children) {
+			if (mmu_pages_add(pvec, child, i))
+				return -ENOSPC;
+
+			ret = __mmu_unsync_walk(child, pvec);
+			if (!ret)
+				goto clear_child_bitmap;
+			else if (ret > 0)
+				nr_unsync_leaf += ret;
+			else
+				return ret;
+		} else if (child->unsync) {
+			nr_unsync_leaf++;
+			if (mmu_pages_add(pvec, child, i))
+				return -ENOSPC;
+		} else
+			 goto clear_child_bitmap;
 
 
-			if (child->unsync) {
-				nr_unsync_leaf++;
-				if (mmu_pages_add(pvec, child, i))
-					return -ENOSPC;
-			}
-		}
+		continue;
+
+clear_child_bitmap:
+		__clear_bit(i, sp->unsync_child_bitmap);
+		sp->unsync_children--;
+		WARN_ON((int)sp->unsync_children < 0);
 	}
 	}
 
 
-	if (find_first_bit(sp->unsync_child_bitmap, 512) == 512)
-		sp->unsync_children = 0;
 
 
 	return nr_unsync_leaf;
 	return nr_unsync_leaf;
 }
 }
@@ -1166,26 +1208,6 @@ static int mmu_unsync_walk(struct kvm_mmu_page *sp,
 	return __mmu_unsync_walk(sp, pvec);
 	return __mmu_unsync_walk(sp, pvec);
 }
 }
 
 
-static struct kvm_mmu_page *kvm_mmu_lookup_page(struct kvm *kvm, gfn_t gfn)
-{
-	unsigned index;
-	struct hlist_head *bucket;
-	struct kvm_mmu_page *sp;
-	struct hlist_node *node;
-
-	pgprintk("%s: looking for gfn %lx\n", __func__, gfn);
-	index = kvm_page_table_hashfn(gfn);
-	bucket = &kvm->arch.mmu_page_hash[index];
-	hlist_for_each_entry(sp, node, bucket, hash_link)
-		if (sp->gfn == gfn && !sp->role.direct
-		    && !sp->role.invalid) {
-			pgprintk("%s: found role %x\n",
-				 __func__, sp->role.word);
-			return sp;
-		}
-	return NULL;
-}
-
 static void kvm_unlink_unsync_page(struct kvm *kvm, struct kvm_mmu_page *sp)
 static void kvm_unlink_unsync_page(struct kvm *kvm, struct kvm_mmu_page *sp)
 {
 {
 	WARN_ON(!sp->unsync);
 	WARN_ON(!sp->unsync);
@@ -1194,20 +1216,36 @@ static void kvm_unlink_unsync_page(struct kvm *kvm, struct kvm_mmu_page *sp)
 	--kvm->stat.mmu_unsync;
 	--kvm->stat.mmu_unsync;
 }
 }
 
 
-static int kvm_mmu_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp);
+static int kvm_mmu_prepare_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp,
+				    struct list_head *invalid_list);
+static void kvm_mmu_commit_zap_page(struct kvm *kvm,
+				    struct list_head *invalid_list);
+
+#define for_each_gfn_sp(kvm, sp, gfn, pos)				\
+  hlist_for_each_entry(sp, pos,						\
+   &(kvm)->arch.mmu_page_hash[kvm_page_table_hashfn(gfn)], hash_link)	\
+	if ((sp)->gfn != (gfn)) {} else
+
+#define for_each_gfn_indirect_valid_sp(kvm, sp, gfn, pos)		\
+  hlist_for_each_entry(sp, pos,						\
+   &(kvm)->arch.mmu_page_hash[kvm_page_table_hashfn(gfn)], hash_link)	\
+		if ((sp)->gfn != (gfn) || (sp)->role.direct ||		\
+			(sp)->role.invalid) {} else
 
 
-static int kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
+/* @sp->gfn should be write-protected at the call site */
+static int __kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
+			   struct list_head *invalid_list, bool clear_unsync)
 {
 {
 	if (sp->role.cr4_pae != !!is_pae(vcpu)) {
 	if (sp->role.cr4_pae != !!is_pae(vcpu)) {
-		kvm_mmu_zap_page(vcpu->kvm, sp);
+		kvm_mmu_prepare_zap_page(vcpu->kvm, sp, invalid_list);
 		return 1;
 		return 1;
 	}
 	}
 
 
-	if (rmap_write_protect(vcpu->kvm, sp->gfn))
-		kvm_flush_remote_tlbs(vcpu->kvm);
-	kvm_unlink_unsync_page(vcpu->kvm, sp);
-	if (vcpu->arch.mmu.sync_page(vcpu, sp)) {
-		kvm_mmu_zap_page(vcpu->kvm, sp);
+	if (clear_unsync)
+		kvm_unlink_unsync_page(vcpu->kvm, sp);
+
+	if (vcpu->arch.mmu.sync_page(vcpu, sp, clear_unsync)) {
+		kvm_mmu_prepare_zap_page(vcpu->kvm, sp, invalid_list);
 		return 1;
 		return 1;
 	}
 	}
 
 
@@ -1215,6 +1253,52 @@ static int kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
 	return 0;
 	return 0;
 }
 }
 
 
+static int kvm_sync_page_transient(struct kvm_vcpu *vcpu,
+				   struct kvm_mmu_page *sp)
+{
+	LIST_HEAD(invalid_list);
+	int ret;
+
+	ret = __kvm_sync_page(vcpu, sp, &invalid_list, false);
+	if (ret)
+		kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list);
+
+	return ret;
+}
+
+static int kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
+			 struct list_head *invalid_list)
+{
+	return __kvm_sync_page(vcpu, sp, invalid_list, true);
+}
+
+/* @gfn should be write-protected at the call site */
+static void kvm_sync_pages(struct kvm_vcpu *vcpu,  gfn_t gfn)
+{
+	struct kvm_mmu_page *s;
+	struct hlist_node *node;
+	LIST_HEAD(invalid_list);
+	bool flush = false;
+
+	for_each_gfn_indirect_valid_sp(vcpu->kvm, s, gfn, node) {
+		if (!s->unsync)
+			continue;
+
+		WARN_ON(s->role.level != PT_PAGE_TABLE_LEVEL);
+		if ((s->role.cr4_pae != !!is_pae(vcpu)) ||
+			(vcpu->arch.mmu.sync_page(vcpu, s, true))) {
+			kvm_mmu_prepare_zap_page(vcpu->kvm, s, &invalid_list);
+			continue;
+		}
+		kvm_unlink_unsync_page(vcpu->kvm, s);
+		flush = true;
+	}
+
+	kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list);
+	if (flush)
+		kvm_mmu_flush_tlb(vcpu);
+}
+
 struct mmu_page_path {
 struct mmu_page_path {
 	struct kvm_mmu_page *parent[PT64_ROOT_LEVEL-1];
 	struct kvm_mmu_page *parent[PT64_ROOT_LEVEL-1];
 	unsigned int idx[PT64_ROOT_LEVEL-1];
 	unsigned int idx[PT64_ROOT_LEVEL-1];
@@ -1281,6 +1365,7 @@ static void mmu_sync_children(struct kvm_vcpu *vcpu,
 	struct kvm_mmu_page *sp;
 	struct kvm_mmu_page *sp;
 	struct mmu_page_path parents;
 	struct mmu_page_path parents;
 	struct kvm_mmu_pages pages;
 	struct kvm_mmu_pages pages;
+	LIST_HEAD(invalid_list);
 
 
 	kvm_mmu_pages_init(parent, &parents, &pages);
 	kvm_mmu_pages_init(parent, &parents, &pages);
 	while (mmu_unsync_walk(parent, &pages)) {
 	while (mmu_unsync_walk(parent, &pages)) {
@@ -1293,9 +1378,10 @@ static void mmu_sync_children(struct kvm_vcpu *vcpu,
 			kvm_flush_remote_tlbs(vcpu->kvm);
 			kvm_flush_remote_tlbs(vcpu->kvm);
 
 
 		for_each_sp(pages, sp, parents, i) {
 		for_each_sp(pages, sp, parents, i) {
-			kvm_sync_page(vcpu, sp);
+			kvm_sync_page(vcpu, sp, &invalid_list);
 			mmu_pages_clear_parents(&parents);
 			mmu_pages_clear_parents(&parents);
 		}
 		}
+		kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list);
 		cond_resched_lock(&vcpu->kvm->mmu_lock);
 		cond_resched_lock(&vcpu->kvm->mmu_lock);
 		kvm_mmu_pages_init(parent, &parents, &pages);
 		kvm_mmu_pages_init(parent, &parents, &pages);
 	}
 	}
@@ -1310,11 +1396,10 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
 					     u64 *parent_pte)
 					     u64 *parent_pte)
 {
 {
 	union kvm_mmu_page_role role;
 	union kvm_mmu_page_role role;
-	unsigned index;
 	unsigned quadrant;
 	unsigned quadrant;
-	struct hlist_head *bucket;
 	struct kvm_mmu_page *sp;
 	struct kvm_mmu_page *sp;
-	struct hlist_node *node, *tmp;
+	struct hlist_node *node;
+	bool need_sync = false;
 
 
 	role = vcpu->arch.mmu.base_role;
 	role = vcpu->arch.mmu.base_role;
 	role.level = level;
 	role.level = level;
@@ -1322,40 +1407,45 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
 	if (role.direct)
 	if (role.direct)
 		role.cr4_pae = 0;
 		role.cr4_pae = 0;
 	role.access = access;
 	role.access = access;
-	if (vcpu->arch.mmu.root_level <= PT32_ROOT_LEVEL) {
+	if (!tdp_enabled && vcpu->arch.mmu.root_level <= PT32_ROOT_LEVEL) {
 		quadrant = gaddr >> (PAGE_SHIFT + (PT64_PT_BITS * level));
 		quadrant = gaddr >> (PAGE_SHIFT + (PT64_PT_BITS * level));
 		quadrant &= (1 << ((PT32_PT_BITS - PT64_PT_BITS) * level)) - 1;
 		quadrant &= (1 << ((PT32_PT_BITS - PT64_PT_BITS) * level)) - 1;
 		role.quadrant = quadrant;
 		role.quadrant = quadrant;
 	}
 	}
-	index = kvm_page_table_hashfn(gfn);
-	bucket = &vcpu->kvm->arch.mmu_page_hash[index];
-	hlist_for_each_entry_safe(sp, node, tmp, bucket, hash_link)
-		if (sp->gfn == gfn) {
-			if (sp->unsync)
-				if (kvm_sync_page(vcpu, sp))
-					continue;
+	for_each_gfn_sp(vcpu->kvm, sp, gfn, node) {
+		if (!need_sync && sp->unsync)
+			need_sync = true;
 
 
-			if (sp->role.word != role.word)
-				continue;
+		if (sp->role.word != role.word)
+			continue;
 
 
-			mmu_page_add_parent_pte(vcpu, sp, parent_pte);
-			if (sp->unsync_children) {
-				set_bit(KVM_REQ_MMU_SYNC, &vcpu->requests);
-				kvm_mmu_mark_parents_unsync(sp);
-			}
-			trace_kvm_mmu_get_page(sp, false);
-			return sp;
-		}
+		if (sp->unsync && kvm_sync_page_transient(vcpu, sp))
+			break;
+
+		mmu_page_add_parent_pte(vcpu, sp, parent_pte);
+		if (sp->unsync_children) {
+			kvm_make_request(KVM_REQ_MMU_SYNC, vcpu);
+			kvm_mmu_mark_parents_unsync(sp);
+		} else if (sp->unsync)
+			kvm_mmu_mark_parents_unsync(sp);
+
+		trace_kvm_mmu_get_page(sp, false);
+		return sp;
+	}
 	++vcpu->kvm->stat.mmu_cache_miss;
 	++vcpu->kvm->stat.mmu_cache_miss;
-	sp = kvm_mmu_alloc_page(vcpu, parent_pte);
+	sp = kvm_mmu_alloc_page(vcpu, parent_pte, direct);
 	if (!sp)
 	if (!sp)
 		return sp;
 		return sp;
 	sp->gfn = gfn;
 	sp->gfn = gfn;
 	sp->role = role;
 	sp->role = role;
-	hlist_add_head(&sp->hash_link, bucket);
+	hlist_add_head(&sp->hash_link,
+		&vcpu->kvm->arch.mmu_page_hash[kvm_page_table_hashfn(gfn)]);
 	if (!direct) {
 	if (!direct) {
 		if (rmap_write_protect(vcpu->kvm, gfn))
 		if (rmap_write_protect(vcpu->kvm, gfn))
 			kvm_flush_remote_tlbs(vcpu->kvm);
 			kvm_flush_remote_tlbs(vcpu->kvm);
+		if (level > PT_PAGE_TABLE_LEVEL && need_sync)
+			kvm_sync_pages(vcpu, gfn);
+
 		account_shadowed(vcpu->kvm, gfn);
 		account_shadowed(vcpu->kvm, gfn);
 	}
 	}
 	if (shadow_trap_nonpresent_pte != shadow_notrap_nonpresent_pte)
 	if (shadow_trap_nonpresent_pte != shadow_notrap_nonpresent_pte)
@@ -1402,6 +1492,47 @@ static void shadow_walk_next(struct kvm_shadow_walk_iterator *iterator)
 	--iterator->level;
 	--iterator->level;
 }
 }
 
 
+static void link_shadow_page(u64 *sptep, struct kvm_mmu_page *sp)
+{
+	u64 spte;
+
+	spte = __pa(sp->spt)
+		| PT_PRESENT_MASK | PT_ACCESSED_MASK
+		| PT_WRITABLE_MASK | PT_USER_MASK;
+	__set_spte(sptep, spte);
+}
+
+static void drop_large_spte(struct kvm_vcpu *vcpu, u64 *sptep)
+{
+	if (is_large_pte(*sptep)) {
+		drop_spte(vcpu->kvm, sptep, shadow_trap_nonpresent_pte);
+		kvm_flush_remote_tlbs(vcpu->kvm);
+	}
+}
+
+static void validate_direct_spte(struct kvm_vcpu *vcpu, u64 *sptep,
+				   unsigned direct_access)
+{
+	if (is_shadow_present_pte(*sptep) && !is_large_pte(*sptep)) {
+		struct kvm_mmu_page *child;
+
+		/*
+		 * For the direct sp, if the guest pte's dirty bit
+		 * changed from clean to dirty, it will corrupt the
+		 * sp's access: writes would be allowed in the read-only sp,
+		 * so we should update the spte at this point to get
+		 * a new sp with the correct access.
+		 */
+		child = page_header(*sptep & PT64_BASE_ADDR_MASK);
+		if (child->role.access == direct_access)
+			return;
+
+		mmu_page_remove_parent_pte(child, sptep);
+		__set_spte(sptep, shadow_trap_nonpresent_pte);
+		kvm_flush_remote_tlbs(vcpu->kvm);
+	}
+}
+
 static void kvm_mmu_page_unlink_children(struct kvm *kvm,
 static void kvm_mmu_page_unlink_children(struct kvm *kvm,
 					 struct kvm_mmu_page *sp)
 					 struct kvm_mmu_page *sp)
 {
 {
@@ -1422,7 +1553,8 @@ static void kvm_mmu_page_unlink_children(struct kvm *kvm,
 			} else {
 			} else {
 				if (is_large_pte(ent))
 				if (is_large_pte(ent))
 					--kvm->stat.lpages;
 					--kvm->stat.lpages;
-				rmap_remove(kvm, &pt[i]);
+				drop_spte(kvm, &pt[i],
+					  shadow_trap_nonpresent_pte);
 			}
 			}
 		}
 		}
 		pt[i] = shadow_trap_nonpresent_pte;
 		pt[i] = shadow_trap_nonpresent_pte;
@@ -1464,7 +1596,8 @@ static void kvm_mmu_unlink_parents(struct kvm *kvm, struct kvm_mmu_page *sp)
 }
 }
 
 
 static int mmu_zap_unsync_children(struct kvm *kvm,
 static int mmu_zap_unsync_children(struct kvm *kvm,
-				   struct kvm_mmu_page *parent)
+				   struct kvm_mmu_page *parent,
+				   struct list_head *invalid_list)
 {
 {
 	int i, zapped = 0;
 	int i, zapped = 0;
 	struct mmu_page_path parents;
 	struct mmu_page_path parents;
@@ -1478,7 +1611,7 @@ static int mmu_zap_unsync_children(struct kvm *kvm,
 		struct kvm_mmu_page *sp;
 		struct kvm_mmu_page *sp;
 
 
 		for_each_sp(pages, sp, parents, i) {
 		for_each_sp(pages, sp, parents, i) {
-			kvm_mmu_zap_page(kvm, sp);
+			kvm_mmu_prepare_zap_page(kvm, sp, invalid_list);
 			mmu_pages_clear_parents(&parents);
 			mmu_pages_clear_parents(&parents);
 			zapped++;
 			zapped++;
 		}
 		}
@@ -1488,32 +1621,52 @@ static int mmu_zap_unsync_children(struct kvm *kvm,
 	return zapped;
 	return zapped;
 }
 }
 
 
-static int kvm_mmu_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp)
+static int kvm_mmu_prepare_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp,
+				    struct list_head *invalid_list)
 {
 {
 	int ret;
 	int ret;
 
 
-	trace_kvm_mmu_zap_page(sp);
+	trace_kvm_mmu_prepare_zap_page(sp);
 	++kvm->stat.mmu_shadow_zapped;
 	++kvm->stat.mmu_shadow_zapped;
-	ret = mmu_zap_unsync_children(kvm, sp);
+	ret = mmu_zap_unsync_children(kvm, sp, invalid_list);
 	kvm_mmu_page_unlink_children(kvm, sp);
 	kvm_mmu_page_unlink_children(kvm, sp);
 	kvm_mmu_unlink_parents(kvm, sp);
 	kvm_mmu_unlink_parents(kvm, sp);
-	kvm_flush_remote_tlbs(kvm);
 	if (!sp->role.invalid && !sp->role.direct)
 	if (!sp->role.invalid && !sp->role.direct)
 		unaccount_shadowed(kvm, sp->gfn);
 		unaccount_shadowed(kvm, sp->gfn);
 	if (sp->unsync)
 	if (sp->unsync)
 		kvm_unlink_unsync_page(kvm, sp);
 		kvm_unlink_unsync_page(kvm, sp);
 	if (!sp->root_count) {
 	if (!sp->root_count) {
-		hlist_del(&sp->hash_link);
-		kvm_mmu_free_page(kvm, sp);
+		/* Count self */
+		ret++;
+		list_move(&sp->link, invalid_list);
 	} else {
 	} else {
-		sp->role.invalid = 1;
 		list_move(&sp->link, &kvm->arch.active_mmu_pages);
 		list_move(&sp->link, &kvm->arch.active_mmu_pages);
 		kvm_reload_remote_mmus(kvm);
 		kvm_reload_remote_mmus(kvm);
 	}
 	}
+
+	sp->role.invalid = 1;
 	kvm_mmu_reset_last_pte_updated(kvm);
 	kvm_mmu_reset_last_pte_updated(kvm);
 	return ret;
 	return ret;
 }
 }
 
 
+static void kvm_mmu_commit_zap_page(struct kvm *kvm,
+				    struct list_head *invalid_list)
+{
+	struct kvm_mmu_page *sp;
+
+	if (list_empty(invalid_list))
+		return;
+
+	kvm_flush_remote_tlbs(kvm);
+
+	do {
+		sp = list_first_entry(invalid_list, struct kvm_mmu_page, link);
+		WARN_ON(!sp->role.invalid || sp->root_count);
+		kvm_mmu_free_page(kvm, sp);
+	} while (!list_empty(invalid_list));
+
+}
+
 /*
 /*
  * Changing the number of mmu pages allocated to the vm
  * Changing the number of mmu pages allocated to the vm
  * Note: if kvm_nr_mmu_pages is too small, you will get dead lock
  * Note: if kvm_nr_mmu_pages is too small, you will get dead lock
@@ -1521,6 +1674,7 @@ static int kvm_mmu_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp)
 void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned int kvm_nr_mmu_pages)
 void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned int kvm_nr_mmu_pages)
 {
 {
 	int used_pages;
 	int used_pages;
+	LIST_HEAD(invalid_list);
 
 
 	used_pages = kvm->arch.n_alloc_mmu_pages - kvm->arch.n_free_mmu_pages;
 	used_pages = kvm->arch.n_alloc_mmu_pages - kvm->arch.n_free_mmu_pages;
 	used_pages = max(0, used_pages);
 	used_pages = max(0, used_pages);
@@ -1538,9 +1692,10 @@ void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned int kvm_nr_mmu_pages)
 
 
 			page = container_of(kvm->arch.active_mmu_pages.prev,
 			page = container_of(kvm->arch.active_mmu_pages.prev,
 					    struct kvm_mmu_page, link);
 					    struct kvm_mmu_page, link);
-			used_pages -= kvm_mmu_zap_page(kvm, page);
-			used_pages--;
+			used_pages -= kvm_mmu_prepare_zap_page(kvm, page,
+							       &invalid_list);
 		}
 		}
+		kvm_mmu_commit_zap_page(kvm, &invalid_list);
 		kvm_nr_mmu_pages = used_pages;
 		kvm_nr_mmu_pages = used_pages;
 		kvm->arch.n_free_mmu_pages = 0;
 		kvm->arch.n_free_mmu_pages = 0;
 	}
 	}
@@ -1553,47 +1708,36 @@ void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned int kvm_nr_mmu_pages)
 
 
 static int kvm_mmu_unprotect_page(struct kvm *kvm, gfn_t gfn)
 static int kvm_mmu_unprotect_page(struct kvm *kvm, gfn_t gfn)
 {
 {
-	unsigned index;
-	struct hlist_head *bucket;
 	struct kvm_mmu_page *sp;
 	struct kvm_mmu_page *sp;
-	struct hlist_node *node, *n;
+	struct hlist_node *node;
+	LIST_HEAD(invalid_list);
 	int r;
 	int r;
 
 
 	pgprintk("%s: looking for gfn %lx\n", __func__, gfn);
 	pgprintk("%s: looking for gfn %lx\n", __func__, gfn);
 	r = 0;
 	r = 0;
-	index = kvm_page_table_hashfn(gfn);
-	bucket = &kvm->arch.mmu_page_hash[index];
-restart:
-	hlist_for_each_entry_safe(sp, node, n, bucket, hash_link)
-		if (sp->gfn == gfn && !sp->role.direct) {
-			pgprintk("%s: gfn %lx role %x\n", __func__, gfn,
-				 sp->role.word);
-			r = 1;
-			if (kvm_mmu_zap_page(kvm, sp))
-				goto restart;
-		}
+
+	for_each_gfn_indirect_valid_sp(kvm, sp, gfn, node) {
+		pgprintk("%s: gfn %lx role %x\n", __func__, gfn,
+			 sp->role.word);
+		r = 1;
+		kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list);
+	}
+	kvm_mmu_commit_zap_page(kvm, &invalid_list);
 	return r;
 	return r;
 }
 }
 
 
 static void mmu_unshadow(struct kvm *kvm, gfn_t gfn)
 static void mmu_unshadow(struct kvm *kvm, gfn_t gfn)
 {
 {
-	unsigned index;
-	struct hlist_head *bucket;
 	struct kvm_mmu_page *sp;
 	struct kvm_mmu_page *sp;
-	struct hlist_node *node, *nn;
+	struct hlist_node *node;
+	LIST_HEAD(invalid_list);
 
 
-	index = kvm_page_table_hashfn(gfn);
-	bucket = &kvm->arch.mmu_page_hash[index];
-restart:
-	hlist_for_each_entry_safe(sp, node, nn, bucket, hash_link) {
-		if (sp->gfn == gfn && !sp->role.direct
-		    && !sp->role.invalid) {
-			pgprintk("%s: zap %lx %x\n",
-				 __func__, gfn, sp->role.word);
-			if (kvm_mmu_zap_page(kvm, sp))
-				goto restart;
-		}
+	for_each_gfn_indirect_valid_sp(kvm, sp, gfn, node) {
+		pgprintk("%s: zap %lx %x\n",
+			 __func__, gfn, sp->role.word);
+		kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list);
 	}
 	}
+	kvm_mmu_commit_zap_page(kvm, &invalid_list);
 }
 }
 
 
 static void page_header_update_slot(struct kvm *kvm, void *pte, gfn_t gfn)
 static void page_header_update_slot(struct kvm *kvm, void *pte, gfn_t gfn)
@@ -1723,47 +1867,51 @@ u8 kvm_get_guest_memory_type(struct kvm_vcpu *vcpu, gfn_t gfn)
 }
 }
 EXPORT_SYMBOL_GPL(kvm_get_guest_memory_type);
 EXPORT_SYMBOL_GPL(kvm_get_guest_memory_type);
 
 
-static int kvm_unsync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
+static void __kvm_unsync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
 {
 {
-	unsigned index;
-	struct hlist_head *bucket;
-	struct kvm_mmu_page *s;
-	struct hlist_node *node, *n;
-
-	index = kvm_page_table_hashfn(sp->gfn);
-	bucket = &vcpu->kvm->arch.mmu_page_hash[index];
-	/* don't unsync if pagetable is shadowed with multiple roles */
-	hlist_for_each_entry_safe(s, node, n, bucket, hash_link) {
-		if (s->gfn != sp->gfn || s->role.direct)
-			continue;
-		if (s->role.word != sp->role.word)
-			return 1;
-	}
 	trace_kvm_mmu_unsync_page(sp);
 	trace_kvm_mmu_unsync_page(sp);
 	++vcpu->kvm->stat.mmu_unsync;
 	++vcpu->kvm->stat.mmu_unsync;
 	sp->unsync = 1;
 	sp->unsync = 1;
 
 
 	kvm_mmu_mark_parents_unsync(sp);
 	kvm_mmu_mark_parents_unsync(sp);
-
 	mmu_convert_notrap(sp);
 	mmu_convert_notrap(sp);
-	return 0;
+}
+
+static void kvm_unsync_pages(struct kvm_vcpu *vcpu,  gfn_t gfn)
+{
+	struct kvm_mmu_page *s;
+	struct hlist_node *node;
+
+	for_each_gfn_indirect_valid_sp(vcpu->kvm, s, gfn, node) {
+		if (s->unsync)
+			continue;
+		WARN_ON(s->role.level != PT_PAGE_TABLE_LEVEL);
+		__kvm_unsync_page(vcpu, s);
+	}
 }
 }
 
 
 static int mmu_need_write_protect(struct kvm_vcpu *vcpu, gfn_t gfn,
 static int mmu_need_write_protect(struct kvm_vcpu *vcpu, gfn_t gfn,
 				  bool can_unsync)
 				  bool can_unsync)
 {
 {
-	struct kvm_mmu_page *shadow;
+	struct kvm_mmu_page *s;
+	struct hlist_node *node;
+	bool need_unsync = false;
 
 
-	shadow = kvm_mmu_lookup_page(vcpu->kvm, gfn);
-	if (shadow) {
-		if (shadow->role.level != PT_PAGE_TABLE_LEVEL)
+	for_each_gfn_indirect_valid_sp(vcpu->kvm, s, gfn, node) {
+		if (!can_unsync)
 			return 1;
 			return 1;
-		if (shadow->unsync)
-			return 0;
-		if (can_unsync && oos_shadow)
-			return kvm_unsync_page(vcpu, shadow);
-		return 1;
+
+		if (s->role.level != PT_PAGE_TABLE_LEVEL)
+			return 1;
+
+		if (!need_unsync && !s->unsync) {
+			if (!oos_shadow)
+				return 1;
+			need_unsync = true;
+		}
 	}
 	}
+	if (need_unsync)
+		kvm_unsync_pages(vcpu, gfn);
 	return 0;
 	return 0;
 }
 }
 
 
@@ -1804,13 +1952,14 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
 	spte |= (u64)pfn << PAGE_SHIFT;
 	spte |= (u64)pfn << PAGE_SHIFT;
 
 
 	if ((pte_access & ACC_WRITE_MASK)
 	if ((pte_access & ACC_WRITE_MASK)
-	    || (write_fault && !is_write_protection(vcpu) && !user_fault)) {
+	    || (!tdp_enabled && write_fault && !is_write_protection(vcpu)
+		&& !user_fault)) {
 
 
 		if (level > PT_PAGE_TABLE_LEVEL &&
 		if (level > PT_PAGE_TABLE_LEVEL &&
 		    has_wrprotected_page(vcpu->kvm, gfn, level)) {
 		    has_wrprotected_page(vcpu->kvm, gfn, level)) {
 			ret = 1;
 			ret = 1;
-			spte = shadow_trap_nonpresent_pte;
-			goto set_pte;
+			drop_spte(vcpu->kvm, sptep, shadow_trap_nonpresent_pte);
+			goto done;
 		}
 		}
 
 
 		spte |= PT_WRITABLE_MASK;
 		spte |= PT_WRITABLE_MASK;
@@ -1841,7 +1990,10 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
 		mark_page_dirty(vcpu->kvm, gfn);
 		mark_page_dirty(vcpu->kvm, gfn);
 
 
 set_pte:
 set_pte:
-	__set_spte(sptep, spte);
+	if (is_writable_pte(*sptep) && !is_writable_pte(spte))
+		kvm_set_pfn_dirty(pfn);
+	update_spte(sptep, spte);
+done:
 	return ret;
 	return ret;
 }
 }
 
 
@@ -1853,7 +2005,6 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
 			 bool reset_host_protection)
 			 bool reset_host_protection)
 {
 {
 	int was_rmapped = 0;
 	int was_rmapped = 0;
-	int was_writable = is_writable_pte(*sptep);
 	int rmap_count;
 	int rmap_count;
 
 
 	pgprintk("%s: spte %llx access %x write_fault %d"
 	pgprintk("%s: spte %llx access %x write_fault %d"
@@ -1878,8 +2029,7 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
 		} else if (pfn != spte_to_pfn(*sptep)) {
 		} else if (pfn != spte_to_pfn(*sptep)) {
 			pgprintk("hfn old %lx new %lx\n",
 			pgprintk("hfn old %lx new %lx\n",
 				 spte_to_pfn(*sptep), pfn);
 				 spte_to_pfn(*sptep), pfn);
-			rmap_remove(vcpu->kvm, sptep);
-			__set_spte(sptep, shadow_trap_nonpresent_pte);
+			drop_spte(vcpu->kvm, sptep, shadow_trap_nonpresent_pte);
 			kvm_flush_remote_tlbs(vcpu->kvm);
 			kvm_flush_remote_tlbs(vcpu->kvm);
 		} else
 		} else
 			was_rmapped = 1;
 			was_rmapped = 1;
@@ -1890,7 +2040,7 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
 		      reset_host_protection)) {
 		      reset_host_protection)) {
 		if (write_fault)
 		if (write_fault)
 			*ptwrite = 1;
 			*ptwrite = 1;
-		kvm_x86_ops->tlb_flush(vcpu);
+		kvm_mmu_flush_tlb(vcpu);
 	}
 	}
 
 
 	pgprintk("%s: setting spte %llx\n", __func__, *sptep);
 	pgprintk("%s: setting spte %llx\n", __func__, *sptep);
@@ -1904,15 +2054,10 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
 	page_header_update_slot(vcpu->kvm, sptep, gfn);
 	page_header_update_slot(vcpu->kvm, sptep, gfn);
 	if (!was_rmapped) {
 	if (!was_rmapped) {
 		rmap_count = rmap_add(vcpu, sptep, gfn);
 		rmap_count = rmap_add(vcpu, sptep, gfn);
-		kvm_release_pfn_clean(pfn);
 		if (rmap_count > RMAP_RECYCLE_THRESHOLD)
 		if (rmap_count > RMAP_RECYCLE_THRESHOLD)
 			rmap_recycle(vcpu, sptep, gfn);
 			rmap_recycle(vcpu, sptep, gfn);
-	} else {
-		if (was_writable)
-			kvm_release_pfn_dirty(pfn);
-		else
-			kvm_release_pfn_clean(pfn);
 	}
 	}
+	kvm_release_pfn_clean(pfn);
 	if (speculative) {
 	if (speculative) {
 		vcpu->arch.last_pte_updated = sptep;
 		vcpu->arch.last_pte_updated = sptep;
 		vcpu->arch.last_pte_gfn = gfn;
 		vcpu->arch.last_pte_gfn = gfn;
@@ -1941,7 +2086,10 @@ static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write,
 		}
 		}
 
 
 		if (*iterator.sptep == shadow_trap_nonpresent_pte) {
 		if (*iterator.sptep == shadow_trap_nonpresent_pte) {
-			pseudo_gfn = (iterator.addr & PT64_DIR_BASE_ADDR_MASK) >> PAGE_SHIFT;
+			u64 base_addr = iterator.addr;
+
+			base_addr &= PT64_LVL_ADDR_MASK(iterator.level);
+			pseudo_gfn = base_addr >> PAGE_SHIFT;
 			sp = kvm_mmu_get_page(vcpu, pseudo_gfn, iterator.addr,
 			sp = kvm_mmu_get_page(vcpu, pseudo_gfn, iterator.addr,
 					      iterator.level - 1,
 					      iterator.level - 1,
 					      1, ACC_ALL, iterator.sptep);
 					      1, ACC_ALL, iterator.sptep);
@@ -1960,6 +2108,29 @@ static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write,
 	return pt_write;
 	return pt_write;
 }
 }
 
 
+static void kvm_send_hwpoison_signal(struct kvm *kvm, gfn_t gfn)
+{
+	char buf[1];
+	void __user *hva;
+	int r;
+
+	/* Touch the page so that the hwpoison handler sends SIGBUS */
+	hva = (void __user *)gfn_to_hva(kvm, gfn);
+	r = copy_from_user(buf, hva, 1);
+}
+
+static int kvm_handle_bad_page(struct kvm *kvm, gfn_t gfn, pfn_t pfn)
+{
+	kvm_release_pfn_clean(pfn);
+	if (is_hwpoison_pfn(pfn)) {
+		kvm_send_hwpoison_signal(kvm, gfn);
+		return 0;
+	} else if (is_fault_pfn(pfn))
+		return -EFAULT;
+
+	return 1;
+}
+
 static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn)
 static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn)
 {
 {
 	int r;
 	int r;
@@ -1983,10 +2154,8 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn)
 	pfn = gfn_to_pfn(vcpu->kvm, gfn);
 	pfn = gfn_to_pfn(vcpu->kvm, gfn);
 
 
 	/* mmio */
 	/* mmio */
-	if (is_error_pfn(pfn)) {
-		kvm_release_pfn_clean(pfn);
-		return 1;
-	}
+	if (is_error_pfn(pfn))
+		return kvm_handle_bad_page(vcpu->kvm, gfn, pfn);
 
 
 	spin_lock(&vcpu->kvm->mmu_lock);
 	spin_lock(&vcpu->kvm->mmu_lock);
 	if (mmu_notifier_retry(vcpu, mmu_seq))
 	if (mmu_notifier_retry(vcpu, mmu_seq))
@@ -2009,6 +2178,7 @@ static void mmu_free_roots(struct kvm_vcpu *vcpu)
 {
 {
 	int i;
 	int i;
 	struct kvm_mmu_page *sp;
 	struct kvm_mmu_page *sp;
+	LIST_HEAD(invalid_list);
 
 
 	if (!VALID_PAGE(vcpu->arch.mmu.root_hpa))
 	if (!VALID_PAGE(vcpu->arch.mmu.root_hpa))
 		return;
 		return;
@@ -2018,8 +2188,10 @@ static void mmu_free_roots(struct kvm_vcpu *vcpu)
 
 
 		sp = page_header(root);
 		sp = page_header(root);
 		--sp->root_count;
 		--sp->root_count;
-		if (!sp->root_count && sp->role.invalid)
-			kvm_mmu_zap_page(vcpu->kvm, sp);
+		if (!sp->root_count && sp->role.invalid) {
+			kvm_mmu_prepare_zap_page(vcpu->kvm, sp, &invalid_list);
+			kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list);
+		}
 		vcpu->arch.mmu.root_hpa = INVALID_PAGE;
 		vcpu->arch.mmu.root_hpa = INVALID_PAGE;
 		spin_unlock(&vcpu->kvm->mmu_lock);
 		spin_unlock(&vcpu->kvm->mmu_lock);
 		return;
 		return;
@@ -2032,10 +2204,12 @@ static void mmu_free_roots(struct kvm_vcpu *vcpu)
 			sp = page_header(root);
 			sp = page_header(root);
 			--sp->root_count;
 			--sp->root_count;
 			if (!sp->root_count && sp->role.invalid)
 			if (!sp->root_count && sp->role.invalid)
-				kvm_mmu_zap_page(vcpu->kvm, sp);
+				kvm_mmu_prepare_zap_page(vcpu->kvm, sp,
+							 &invalid_list);
 		}
 		}
 		vcpu->arch.mmu.pae_root[i] = INVALID_PAGE;
 		vcpu->arch.mmu.pae_root[i] = INVALID_PAGE;
 	}
 	}
+	kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list);
 	spin_unlock(&vcpu->kvm->mmu_lock);
 	spin_unlock(&vcpu->kvm->mmu_lock);
 	vcpu->arch.mmu.root_hpa = INVALID_PAGE;
 	vcpu->arch.mmu.root_hpa = INVALID_PAGE;
 }
 }
@@ -2045,7 +2219,7 @@ static int mmu_check_root(struct kvm_vcpu *vcpu, gfn_t root_gfn)
 	int ret = 0;
 	int ret = 0;
 
 
 	if (!kvm_is_visible_gfn(vcpu->kvm, root_gfn)) {
 	if (!kvm_is_visible_gfn(vcpu->kvm, root_gfn)) {
-		set_bit(KVM_REQ_TRIPLE_FAULT, &vcpu->requests);
+		kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
 		ret = 1;
 		ret = 1;
 	}
 	}
 
 
@@ -2073,6 +2247,7 @@ static int mmu_alloc_roots(struct kvm_vcpu *vcpu)
 			root_gfn = 0;
 			root_gfn = 0;
 		}
 		}
 		spin_lock(&vcpu->kvm->mmu_lock);
 		spin_lock(&vcpu->kvm->mmu_lock);
+		kvm_mmu_free_some_pages(vcpu);
 		sp = kvm_mmu_get_page(vcpu, root_gfn, 0,
 		sp = kvm_mmu_get_page(vcpu, root_gfn, 0,
 				      PT64_ROOT_LEVEL, direct,
 				      PT64_ROOT_LEVEL, direct,
 				      ACC_ALL, NULL);
 				      ACC_ALL, NULL);
@@ -2103,6 +2278,7 @@ static int mmu_alloc_roots(struct kvm_vcpu *vcpu)
 			root_gfn = i << 30;
 			root_gfn = i << 30;
 		}
 		}
 		spin_lock(&vcpu->kvm->mmu_lock);
 		spin_lock(&vcpu->kvm->mmu_lock);
+		kvm_mmu_free_some_pages(vcpu);
 		sp = kvm_mmu_get_page(vcpu, root_gfn, i << 30,
 		sp = kvm_mmu_get_page(vcpu, root_gfn, i << 30,
 				      PT32_ROOT_LEVEL, direct,
 				      PT32_ROOT_LEVEL, direct,
 				      ACC_ALL, NULL);
 				      ACC_ALL, NULL);
@@ -2198,10 +2374,8 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa,
 	mmu_seq = vcpu->kvm->mmu_notifier_seq;
 	mmu_seq = vcpu->kvm->mmu_notifier_seq;
 	smp_rmb();
 	smp_rmb();
 	pfn = gfn_to_pfn(vcpu->kvm, gfn);
 	pfn = gfn_to_pfn(vcpu->kvm, gfn);
-	if (is_error_pfn(pfn)) {
-		kvm_release_pfn_clean(pfn);
-		return 1;
-	}
+	if (is_error_pfn(pfn))
+		return kvm_handle_bad_page(vcpu->kvm, gfn, pfn);
 	spin_lock(&vcpu->kvm->mmu_lock);
 	spin_lock(&vcpu->kvm->mmu_lock);
 	if (mmu_notifier_retry(vcpu, mmu_seq))
 	if (mmu_notifier_retry(vcpu, mmu_seq))
 		goto out_unlock;
 		goto out_unlock;
@@ -2243,7 +2417,7 @@ static int nonpaging_init_context(struct kvm_vcpu *vcpu)
 void kvm_mmu_flush_tlb(struct kvm_vcpu *vcpu)
 void kvm_mmu_flush_tlb(struct kvm_vcpu *vcpu)
 {
 {
 	++vcpu->stat.tlb_flush;
 	++vcpu->stat.tlb_flush;
-	kvm_x86_ops->tlb_flush(vcpu);
+	kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
 }
 }
 
 
 static void paging_new_cr3(struct kvm_vcpu *vcpu)
 static void paging_new_cr3(struct kvm_vcpu *vcpu)
@@ -2457,10 +2631,9 @@ static int init_kvm_mmu(struct kvm_vcpu *vcpu)
 static void destroy_kvm_mmu(struct kvm_vcpu *vcpu)
 static void destroy_kvm_mmu(struct kvm_vcpu *vcpu)
 {
 {
 	ASSERT(vcpu);
 	ASSERT(vcpu);
-	if (VALID_PAGE(vcpu->arch.mmu.root_hpa)) {
+	if (VALID_PAGE(vcpu->arch.mmu.root_hpa))
+		/* mmu.free() should set root_hpa = INVALID_PAGE */
 		vcpu->arch.mmu.free(vcpu);
 		vcpu->arch.mmu.free(vcpu);
-		vcpu->arch.mmu.root_hpa = INVALID_PAGE;
-	}
 }
 }
 
 
 int kvm_mmu_reset_context(struct kvm_vcpu *vcpu)
 int kvm_mmu_reset_context(struct kvm_vcpu *vcpu)
@@ -2477,9 +2650,6 @@ int kvm_mmu_load(struct kvm_vcpu *vcpu)
 	r = mmu_topup_memory_caches(vcpu);
 	r = mmu_topup_memory_caches(vcpu);
 	if (r)
 	if (r)
 		goto out;
 		goto out;
-	spin_lock(&vcpu->kvm->mmu_lock);
-	kvm_mmu_free_some_pages(vcpu);
-	spin_unlock(&vcpu->kvm->mmu_lock);
 	r = mmu_alloc_roots(vcpu);
 	r = mmu_alloc_roots(vcpu);
 	spin_lock(&vcpu->kvm->mmu_lock);
 	spin_lock(&vcpu->kvm->mmu_lock);
 	mmu_sync_roots(vcpu);
 	mmu_sync_roots(vcpu);
@@ -2508,7 +2678,7 @@ static void mmu_pte_write_zap_pte(struct kvm_vcpu *vcpu,
 	pte = *spte;
 	pte = *spte;
 	if (is_shadow_present_pte(pte)) {
 	if (is_shadow_present_pte(pte)) {
 		if (is_last_spte(pte, sp->role.level))
 		if (is_last_spte(pte, sp->role.level))
-			rmap_remove(vcpu->kvm, spte);
+			drop_spte(vcpu->kvm, spte, shadow_trap_nonpresent_pte);
 		else {
 		else {
 			child = page_header(pte & PT64_BASE_ADDR_MASK);
 			child = page_header(pte & PT64_BASE_ADDR_MASK);
 			mmu_page_remove_parent_pte(child, spte);
 			mmu_page_remove_parent_pte(child, spte);
@@ -2529,6 +2699,9 @@ static void mmu_pte_write_new_pte(struct kvm_vcpu *vcpu,
 		return;
 		return;
         }
         }
 
 
+	if (is_rsvd_bits_set(vcpu, *(u64 *)new, PT_PAGE_TABLE_LEVEL))
+		return;
+
 	++vcpu->kvm->stat.mmu_pte_updated;
 	++vcpu->kvm->stat.mmu_pte_updated;
 	if (!sp->role.cr4_pae)
 	if (!sp->role.cr4_pae)
 		paging32_update_pte(vcpu, sp, spte, new);
 		paging32_update_pte(vcpu, sp, spte, new);
@@ -2549,11 +2722,15 @@ static bool need_remote_flush(u64 old, u64 new)
 	return (old & ~new & PT64_PERM_MASK) != 0;
 	return (old & ~new & PT64_PERM_MASK) != 0;
 }
 }
 
 
-static void mmu_pte_write_flush_tlb(struct kvm_vcpu *vcpu, u64 old, u64 new)
+static void mmu_pte_write_flush_tlb(struct kvm_vcpu *vcpu, bool zap_page,
+				    bool remote_flush, bool local_flush)
 {
 {
-	if (need_remote_flush(old, new))
+	if (zap_page)
+		return;
+
+	if (remote_flush)
 		kvm_flush_remote_tlbs(vcpu->kvm);
 		kvm_flush_remote_tlbs(vcpu->kvm);
-	else
+	else if (local_flush)
 		kvm_mmu_flush_tlb(vcpu);
 		kvm_mmu_flush_tlb(vcpu);
 }
 }
 
 
@@ -2603,10 +2780,10 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
 		       bool guest_initiated)
 		       bool guest_initiated)
 {
 {
 	gfn_t gfn = gpa >> PAGE_SHIFT;
 	gfn_t gfn = gpa >> PAGE_SHIFT;
+	union kvm_mmu_page_role mask = { .word = 0 };
 	struct kvm_mmu_page *sp;
 	struct kvm_mmu_page *sp;
-	struct hlist_node *node, *n;
-	struct hlist_head *bucket;
-	unsigned index;
+	struct hlist_node *node;
+	LIST_HEAD(invalid_list);
 	u64 entry, gentry;
 	u64 entry, gentry;
 	u64 *spte;
 	u64 *spte;
 	unsigned offset = offset_in_page(gpa);
 	unsigned offset = offset_in_page(gpa);
@@ -2619,6 +2796,9 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
 	int npte;
 	int npte;
 	int r;
 	int r;
 	int invlpg_counter;
 	int invlpg_counter;
+	bool remote_flush, local_flush, zap_page;
+
+	zap_page = remote_flush = local_flush = false;
 
 
 	pgprintk("%s: gpa %llx bytes %d\n", __func__, gpa, bytes);
 	pgprintk("%s: gpa %llx bytes %d\n", __func__, gpa, bytes);
 
 
@@ -2674,13 +2854,9 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
 			vcpu->arch.last_pte_updated = NULL;
 			vcpu->arch.last_pte_updated = NULL;
 		}
 		}
 	}
 	}
-	index = kvm_page_table_hashfn(gfn);
-	bucket = &vcpu->kvm->arch.mmu_page_hash[index];
 
 
-restart:
-	hlist_for_each_entry_safe(sp, node, n, bucket, hash_link) {
-		if (sp->gfn != gfn || sp->role.direct || sp->role.invalid)
-			continue;
+	mask.cr0_wp = mask.cr4_pae = mask.nxe = 1;
+	for_each_gfn_indirect_valid_sp(vcpu->kvm, sp, gfn, node) {
 		pte_size = sp->role.cr4_pae ? 8 : 4;
 		pte_size = sp->role.cr4_pae ? 8 : 4;
 		misaligned = (offset ^ (offset + bytes - 1)) & ~(pte_size - 1);
 		misaligned = (offset ^ (offset + bytes - 1)) & ~(pte_size - 1);
 		misaligned |= bytes < 4;
 		misaligned |= bytes < 4;
@@ -2697,8 +2873,8 @@ restart:
 			 */
 			 */
 			pgprintk("misaligned: gpa %llx bytes %d role %x\n",
 			pgprintk("misaligned: gpa %llx bytes %d role %x\n",
 				 gpa, bytes, sp->role.word);
 				 gpa, bytes, sp->role.word);
-			if (kvm_mmu_zap_page(vcpu->kvm, sp))
-				goto restart;
+			zap_page |= !!kvm_mmu_prepare_zap_page(vcpu->kvm, sp,
+						     &invalid_list);
 			++vcpu->kvm->stat.mmu_flooded;
 			++vcpu->kvm->stat.mmu_flooded;
 			continue;
 			continue;
 		}
 		}
@@ -2722,16 +2898,22 @@ restart:
 			if (quadrant != sp->role.quadrant)
 			if (quadrant != sp->role.quadrant)
 				continue;
 				continue;
 		}
 		}
+		local_flush = true;
 		spte = &sp->spt[page_offset / sizeof(*spte)];
 		spte = &sp->spt[page_offset / sizeof(*spte)];
 		while (npte--) {
 		while (npte--) {
 			entry = *spte;
 			entry = *spte;
 			mmu_pte_write_zap_pte(vcpu, sp, spte);
 			mmu_pte_write_zap_pte(vcpu, sp, spte);
-			if (gentry)
+			if (gentry &&
+			      !((sp->role.word ^ vcpu->arch.mmu.base_role.word)
+			      & mask.word))
 				mmu_pte_write_new_pte(vcpu, sp, spte, &gentry);
 				mmu_pte_write_new_pte(vcpu, sp, spte, &gentry);
-			mmu_pte_write_flush_tlb(vcpu, entry, *spte);
+			if (!remote_flush && need_remote_flush(entry, *spte))
+				remote_flush = true;
 			++spte;
 			++spte;
 		}
 		}
 	}
 	}
+	mmu_pte_write_flush_tlb(vcpu, zap_page, remote_flush, local_flush);
+	kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list);
 	kvm_mmu_audit(vcpu, "post pte write");
 	kvm_mmu_audit(vcpu, "post pte write");
 	spin_unlock(&vcpu->kvm->mmu_lock);
 	spin_unlock(&vcpu->kvm->mmu_lock);
 	if (!is_error_pfn(vcpu->arch.update_pte.pfn)) {
 	if (!is_error_pfn(vcpu->arch.update_pte.pfn)) {
@@ -2759,15 +2941,21 @@ EXPORT_SYMBOL_GPL(kvm_mmu_unprotect_page_virt);
 
 
 void __kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu)
 void __kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu)
 {
 {
-	while (vcpu->kvm->arch.n_free_mmu_pages < KVM_REFILL_PAGES &&
+	int free_pages;
+	LIST_HEAD(invalid_list);
+
+	free_pages = vcpu->kvm->arch.n_free_mmu_pages;
+	while (free_pages < KVM_REFILL_PAGES &&
 	       !list_empty(&vcpu->kvm->arch.active_mmu_pages)) {
 	       !list_empty(&vcpu->kvm->arch.active_mmu_pages)) {
 		struct kvm_mmu_page *sp;
 		struct kvm_mmu_page *sp;
 
 
 		sp = container_of(vcpu->kvm->arch.active_mmu_pages.prev,
 		sp = container_of(vcpu->kvm->arch.active_mmu_pages.prev,
 				  struct kvm_mmu_page, link);
 				  struct kvm_mmu_page, link);
-		kvm_mmu_zap_page(vcpu->kvm, sp);
+		free_pages += kvm_mmu_prepare_zap_page(vcpu->kvm, sp,
+						       &invalid_list);
 		++vcpu->kvm->stat.mmu_recycled;
 		++vcpu->kvm->stat.mmu_recycled;
 	}
 	}
+	kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list);
 }
 }
 
 
 int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, u32 error_code)
 int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, u32 error_code)
@@ -2795,11 +2983,8 @@ int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, u32 error_code)
 		return 1;
 		return 1;
 	case EMULATE_DO_MMIO:
 	case EMULATE_DO_MMIO:
 		++vcpu->stat.mmio_exits;
 		++vcpu->stat.mmio_exits;
-		return 0;
+		/* fall through */
 	case EMULATE_FAIL:
 	case EMULATE_FAIL:
-		vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
-		vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION;
-		vcpu->run->internal.ndata = 0;
 		return 0;
 		return 0;
 	default:
 	default:
 		BUG();
 		BUG();
@@ -2896,7 +3081,7 @@ void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot)
 		pt = sp->spt;
 		pt = sp->spt;
 		for (i = 0; i < PT64_ENT_PER_PAGE; ++i)
 		for (i = 0; i < PT64_ENT_PER_PAGE; ++i)
 			/* avoid RMW */
 			/* avoid RMW */
-			if (pt[i] & PT_WRITABLE_MASK)
+			if (is_writable_pte(pt[i]))
 				pt[i] &= ~PT_WRITABLE_MASK;
 				pt[i] &= ~PT_WRITABLE_MASK;
 	}
 	}
 	kvm_flush_remote_tlbs(kvm);
 	kvm_flush_remote_tlbs(kvm);
@@ -2905,25 +3090,26 @@ void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot)
 void kvm_mmu_zap_all(struct kvm *kvm)
 void kvm_mmu_zap_all(struct kvm *kvm)
 {
 {
 	struct kvm_mmu_page *sp, *node;
 	struct kvm_mmu_page *sp, *node;
+	LIST_HEAD(invalid_list);
 
 
 	spin_lock(&kvm->mmu_lock);
 	spin_lock(&kvm->mmu_lock);
 restart:
 restart:
 	list_for_each_entry_safe(sp, node, &kvm->arch.active_mmu_pages, link)
 	list_for_each_entry_safe(sp, node, &kvm->arch.active_mmu_pages, link)
-		if (kvm_mmu_zap_page(kvm, sp))
+		if (kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list))
 			goto restart;
 			goto restart;
 
 
+	kvm_mmu_commit_zap_page(kvm, &invalid_list);
 	spin_unlock(&kvm->mmu_lock);
 	spin_unlock(&kvm->mmu_lock);
-
-	kvm_flush_remote_tlbs(kvm);
 }
 }
 
 
-static int kvm_mmu_remove_some_alloc_mmu_pages(struct kvm *kvm)
+static int kvm_mmu_remove_some_alloc_mmu_pages(struct kvm *kvm,
+					       struct list_head *invalid_list)
 {
 {
 	struct kvm_mmu_page *page;
 	struct kvm_mmu_page *page;
 
 
 	page = container_of(kvm->arch.active_mmu_pages.prev,
 	page = container_of(kvm->arch.active_mmu_pages.prev,
 			    struct kvm_mmu_page, link);
 			    struct kvm_mmu_page, link);
-	return kvm_mmu_zap_page(kvm, page) + 1;
+	return kvm_mmu_prepare_zap_page(kvm, page, invalid_list);
 }
 }
 
 
 static int mmu_shrink(struct shrinker *shrink, int nr_to_scan, gfp_t gfp_mask)
 static int mmu_shrink(struct shrinker *shrink, int nr_to_scan, gfp_t gfp_mask)
@@ -2936,6 +3122,7 @@ static int mmu_shrink(struct shrinker *shrink, int nr_to_scan, gfp_t gfp_mask)
 
 
 	list_for_each_entry(kvm, &vm_list, vm_list) {
 	list_for_each_entry(kvm, &vm_list, vm_list) {
 		int npages, idx, freed_pages;
 		int npages, idx, freed_pages;
+		LIST_HEAD(invalid_list);
 
 
 		idx = srcu_read_lock(&kvm->srcu);
 		idx = srcu_read_lock(&kvm->srcu);
 		spin_lock(&kvm->mmu_lock);
 		spin_lock(&kvm->mmu_lock);
@@ -2943,12 +3130,14 @@ static int mmu_shrink(struct shrinker *shrink, int nr_to_scan, gfp_t gfp_mask)
 			 kvm->arch.n_free_mmu_pages;
 			 kvm->arch.n_free_mmu_pages;
 		cache_count += npages;
 		cache_count += npages;
 		if (!kvm_freed && nr_to_scan > 0 && npages > 0) {
 		if (!kvm_freed && nr_to_scan > 0 && npages > 0) {
-			freed_pages = kvm_mmu_remove_some_alloc_mmu_pages(kvm);
+			freed_pages = kvm_mmu_remove_some_alloc_mmu_pages(kvm,
+							  &invalid_list);
 			cache_count -= freed_pages;
 			cache_count -= freed_pages;
 			kvm_freed = kvm;
 			kvm_freed = kvm;
 		}
 		}
 		nr_to_scan--;
 		nr_to_scan--;
 
 
+		kvm_mmu_commit_zap_page(kvm, &invalid_list);
 		spin_unlock(&kvm->mmu_lock);
 		spin_unlock(&kvm->mmu_lock);
 		srcu_read_unlock(&kvm->srcu, idx);
 		srcu_read_unlock(&kvm->srcu, idx);
 	}
 	}
@@ -3074,7 +3263,7 @@ static int kvm_pv_mmu_write(struct kvm_vcpu *vcpu,
 
 
 static int kvm_pv_mmu_flush_tlb(struct kvm_vcpu *vcpu)
 static int kvm_pv_mmu_flush_tlb(struct kvm_vcpu *vcpu)
 {
 {
-	kvm_set_cr3(vcpu, vcpu->arch.cr3);
+	(void)kvm_set_cr3(vcpu, vcpu->arch.cr3);
 	return 1;
 	return 1;
 }
 }
 
 
@@ -3331,9 +3520,9 @@ void inspect_spte_has_rmap(struct kvm *kvm, u64 *sptep)
 	struct kvm_mmu_page *rev_sp;
 	struct kvm_mmu_page *rev_sp;
 	gfn_t gfn;
 	gfn_t gfn;
 
 
-	if (*sptep & PT_WRITABLE_MASK) {
+	if (is_writable_pte(*sptep)) {
 		rev_sp = page_header(__pa(sptep));
 		rev_sp = page_header(__pa(sptep));
-		gfn = rev_sp->gfns[sptep - rev_sp->spt];
+		gfn = kvm_mmu_page_get_gfn(rev_sp, sptep - rev_sp->spt);
 
 
 		if (!gfn_to_memslot(kvm, gfn)) {
 		if (!gfn_to_memslot(kvm, gfn)) {
 			if (!printk_ratelimit())
 			if (!printk_ratelimit())
@@ -3347,8 +3536,7 @@ void inspect_spte_has_rmap(struct kvm *kvm, u64 *sptep)
 			return;
 			return;
 		}
 		}
 
 
-		rmapp = gfn_to_rmap(kvm, rev_sp->gfns[sptep - rev_sp->spt],
-				    rev_sp->role.level);
+		rmapp = gfn_to_rmap(kvm, gfn, rev_sp->role.level);
 		if (!*rmapp) {
 		if (!*rmapp) {
 			if (!printk_ratelimit())
 			if (!printk_ratelimit())
 				return;
 				return;
@@ -3381,7 +3569,7 @@ static void check_writable_mappings_rmap(struct kvm_vcpu *vcpu)
 
 
 			if (!(ent & PT_PRESENT_MASK))
 			if (!(ent & PT_PRESENT_MASK))
 				continue;
 				continue;
-			if (!(ent & PT_WRITABLE_MASK))
+			if (!is_writable_pte(ent))
 				continue;
 				continue;
 			inspect_spte_has_rmap(vcpu->kvm, &pt[i]);
 			inspect_spte_has_rmap(vcpu->kvm, &pt[i]);
 		}
 		}
@@ -3409,13 +3597,12 @@ static void audit_write_protection(struct kvm_vcpu *vcpu)
 		if (sp->unsync)
 		if (sp->unsync)
 			continue;
 			continue;
 
 
-		gfn = unalias_gfn(vcpu->kvm, sp->gfn);
-		slot = gfn_to_memslot_unaliased(vcpu->kvm, sp->gfn);
+		slot = gfn_to_memslot(vcpu->kvm, sp->gfn);
 		rmapp = &slot->rmap[gfn - slot->base_gfn];
 		rmapp = &slot->rmap[gfn - slot->base_gfn];
 
 
 		spte = rmap_next(vcpu->kvm, rmapp, NULL);
 		spte = rmap_next(vcpu->kvm, rmapp, NULL);
 		while (spte) {
 		while (spte) {
-			if (*spte & PT_WRITABLE_MASK)
+			if (is_writable_pte(*spte))
 				printk(KERN_ERR "%s: (%s) shadow page has "
 				printk(KERN_ERR "%s: (%s) shadow page has "
 				"writable mappings: gfn %lx role %x\n",
 				"writable mappings: gfn %lx role %x\n",
 			       __func__, audit_msg, sp->gfn,
 			       __func__, audit_msg, sp->gfn,

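The mmu.c hunks above replace every open-coded walk of mmu_page_hash with the new for_each_gfn_sp()/for_each_gfn_indirect_valid_sp() iterators and route all zapping through kvm_mmu_prepare_zap_page()/kvm_mmu_commit_zap_page(). A minimal sketch of the resulting caller pattern, modelled on kvm_mmu_unprotect_page() above; the wrapper function itself is illustrative and not part of the commit:

/*
 * Sketch only: queue every valid, indirect shadow page for @gfn and
 * release the whole batch with the two-phase zap API from this commit.
 */
static int zap_indirect_sps_for_gfn(struct kvm *kvm, gfn_t gfn)
{
	struct kvm_mmu_page *sp;
	struct hlist_node *node;
	LIST_HEAD(invalid_list);
	int zapped = 0;

	/*
	 * The iterator hides the kvm_page_table_hashfn() bucket lookup and
	 * skips direct or invalid pages, so the loop body only sees matches.
	 */
	for_each_gfn_indirect_valid_sp(kvm, sp, gfn, node)
		zapped += kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list);

	kvm_mmu_commit_zap_page(kvm, &invalid_list);
	return zapped;
}

Because kvm_mmu_prepare_zap_page() no longer frees or unlinks pages while the bucket is being walked, the plain hlist iteration is safe and the old hlist_for_each_entry_safe()/restart pattern disappears from the callers above.
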
+ 1 - 1
arch/x86/kvm/mmutrace.h

@@ -190,7 +190,7 @@ DEFINE_EVENT(kvm_mmu_page_class, kvm_mmu_unsync_page,
 	TP_ARGS(sp)
 	TP_ARGS(sp)
 );
 );
 
 
-DEFINE_EVENT(kvm_mmu_page_class, kvm_mmu_zap_page,
+DEFINE_EVENT(kvm_mmu_page_class, kvm_mmu_prepare_zap_page,
 	TP_PROTO(struct kvm_mmu_page *sp),
 	TP_PROTO(struct kvm_mmu_page *sp),
 
 
 	TP_ARGS(sp)
 	TP_ARGS(sp)

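The tracepoint rename above follows the split of kvm_mmu_zap_page() into kvm_mmu_prepare_zap_page() and kvm_mmu_commit_zap_page() in the mmu.c hunks. A hedged sketch of the calling convention those hunks establish; the batching helper below is hypothetical, while the locking and function names are taken from the diff:

/*
 * Sketch only: every prepare call and the final commit run under
 * mmu_lock.  kvm_mmu_commit_zap_page() does one kvm_flush_remote_tlbs()
 * for the whole batch and then frees the pages that prepare moved onto
 * invalid_list; pages still referenced as roots stay on
 * active_mmu_pages with role.invalid set and are freed once the last
 * root reference is dropped.
 */
static void zap_sp_batch(struct kvm *kvm, struct kvm_mmu_page **sps, int n)
{
	LIST_HEAD(invalid_list);
	int i;

	spin_lock(&kvm->mmu_lock);
	for (i = 0; i < n; i++)
		kvm_mmu_prepare_zap_page(kvm, sps[i], &invalid_list);
	kvm_mmu_commit_zap_page(kvm, &invalid_list);
	spin_unlock(&kvm->mmu_lock);
}
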
+ 145 - 107
arch/x86/kvm/paging_tmpl.h

@@ -7,6 +7,7 @@
  * MMU support
  * MMU support
  *
  *
  * Copyright (C) 2006 Qumranet, Inc.
  * Copyright (C) 2006 Qumranet, Inc.
+ * Copyright 2010 Red Hat, Inc. and/or its affiliates.
  *
  *
  * Authors:
  * Authors:
  *   Yaniv Kamay  <yaniv@qumranet.com>
  *   Yaniv Kamay  <yaniv@qumranet.com>
@@ -118,21 +119,25 @@ static int FNAME(walk_addr)(struct guest_walker *walker,
 {
 {
 	pt_element_t pte;
 	pt_element_t pte;
 	gfn_t table_gfn;
 	gfn_t table_gfn;
-	unsigned index, pt_access, pte_access;
+	unsigned index, pt_access, uninitialized_var(pte_access);
 	gpa_t pte_gpa;
 	gpa_t pte_gpa;
-	int rsvd_fault = 0;
+	bool eperm, present, rsvd_fault;
 
 
 	trace_kvm_mmu_pagetable_walk(addr, write_fault, user_fault,
 	trace_kvm_mmu_pagetable_walk(addr, write_fault, user_fault,
 				     fetch_fault);
 				     fetch_fault);
 walk:
 walk:
+	present = true;
+	eperm = rsvd_fault = false;
 	walker->level = vcpu->arch.mmu.root_level;
 	walker->level = vcpu->arch.mmu.root_level;
 	pte = vcpu->arch.cr3;
 	pte = vcpu->arch.cr3;
 #if PTTYPE == 64
 #if PTTYPE == 64
 	if (!is_long_mode(vcpu)) {
 	if (!is_long_mode(vcpu)) {
 		pte = kvm_pdptr_read(vcpu, (addr >> 30) & 3);
 		pte = kvm_pdptr_read(vcpu, (addr >> 30) & 3);
 		trace_kvm_mmu_paging_element(pte, walker->level);
 		trace_kvm_mmu_paging_element(pte, walker->level);
-		if (!is_present_gpte(pte))
-			goto not_present;
+		if (!is_present_gpte(pte)) {
+			present = false;
+			goto error;
+		}
 		--walker->level;
 		--walker->level;
 	}
 	}
 #endif
 #endif
@@ -150,37 +155,42 @@ walk:
 		walker->table_gfn[walker->level - 1] = table_gfn;
 		walker->table_gfn[walker->level - 1] = table_gfn;
 		walker->pte_gpa[walker->level - 1] = pte_gpa;
 		walker->pte_gpa[walker->level - 1] = pte_gpa;
 
 
-		if (kvm_read_guest(vcpu->kvm, pte_gpa, &pte, sizeof(pte)))
-			goto not_present;
+		if (kvm_read_guest(vcpu->kvm, pte_gpa, &pte, sizeof(pte))) {
+			present = false;
+			break;
+		}
 
 
 		trace_kvm_mmu_paging_element(pte, walker->level);
 		trace_kvm_mmu_paging_element(pte, walker->level);
 
 
-		if (!is_present_gpte(pte))
-			goto not_present;
+		if (!is_present_gpte(pte)) {
+			present = false;
+			break;
+		}
 
 
-		rsvd_fault = is_rsvd_bits_set(vcpu, pte, walker->level);
-		if (rsvd_fault)
-			goto access_error;
+		if (is_rsvd_bits_set(vcpu, pte, walker->level)) {
+			rsvd_fault = true;
+			break;
+		}
 
 
 		if (write_fault && !is_writable_pte(pte))
 		if (write_fault && !is_writable_pte(pte))
 			if (user_fault || is_write_protection(vcpu))
 			if (user_fault || is_write_protection(vcpu))
-				goto access_error;
+				eperm = true;
 
 
 		if (user_fault && !(pte & PT_USER_MASK))
 		if (user_fault && !(pte & PT_USER_MASK))
-			goto access_error;
+			eperm = true;
 
 
 #if PTTYPE == 64
 #if PTTYPE == 64
 		if (fetch_fault && (pte & PT64_NX_MASK))
 		if (fetch_fault && (pte & PT64_NX_MASK))
-			goto access_error;
+			eperm = true;
 #endif
 #endif
 
 
-		if (!(pte & PT_ACCESSED_MASK)) {
+		if (!eperm && !rsvd_fault && !(pte & PT_ACCESSED_MASK)) {
 			trace_kvm_mmu_set_accessed_bit(table_gfn, index,
 			trace_kvm_mmu_set_accessed_bit(table_gfn, index,
 						       sizeof(pte));
 						       sizeof(pte));
-			mark_page_dirty(vcpu->kvm, table_gfn);
 			if (FNAME(cmpxchg_gpte)(vcpu->kvm, table_gfn,
 			if (FNAME(cmpxchg_gpte)(vcpu->kvm, table_gfn,
 			    index, pte, pte|PT_ACCESSED_MASK))
 			    index, pte, pte|PT_ACCESSED_MASK))
 				goto walk;
 				goto walk;
+			mark_page_dirty(vcpu->kvm, table_gfn);
 			pte |= PT_ACCESSED_MASK;
 			pte |= PT_ACCESSED_MASK;
 		}
 		}
 
 
@@ -213,15 +223,18 @@ walk:
 		--walker->level;
 		--walker->level;
 	}
 	}
 
 
+	if (!present || eperm || rsvd_fault)
+		goto error;
+
 	if (write_fault && !is_dirty_gpte(pte)) {
 	if (write_fault && !is_dirty_gpte(pte)) {
 		bool ret;
 		bool ret;
 
 
 		trace_kvm_mmu_set_dirty_bit(table_gfn, index, sizeof(pte));
 		trace_kvm_mmu_set_dirty_bit(table_gfn, index, sizeof(pte));
-		mark_page_dirty(vcpu->kvm, table_gfn);
 		ret = FNAME(cmpxchg_gpte)(vcpu->kvm, table_gfn, index, pte,
 		ret = FNAME(cmpxchg_gpte)(vcpu->kvm, table_gfn, index, pte,
 			    pte|PT_DIRTY_MASK);
 			    pte|PT_DIRTY_MASK);
 		if (ret)
 		if (ret)
 			goto walk;
 			goto walk;
+		mark_page_dirty(vcpu->kvm, table_gfn);
 		pte |= PT_DIRTY_MASK;
 		pte |= PT_DIRTY_MASK;
 		walker->ptes[walker->level - 1] = pte;
 		walker->ptes[walker->level - 1] = pte;
 	}
 	}
@@ -229,22 +242,18 @@ walk:
 	walker->pt_access = pt_access;
 	walker->pt_access = pt_access;
 	walker->pte_access = pte_access;
 	walker->pte_access = pte_access;
 	pgprintk("%s: pte %llx pte_access %x pt_access %x\n",
 	pgprintk("%s: pte %llx pte_access %x pt_access %x\n",
-		 __func__, (u64)pte, pt_access, pte_access);
+		 __func__, (u64)pte, pte_access, pt_access);
 	return 1;
 	return 1;
 
 
-not_present:
+error:
 	walker->error_code = 0;
 	walker->error_code = 0;
-	goto err;
-
-access_error:
-	walker->error_code = PFERR_PRESENT_MASK;
-
-err:
+	if (present)
+		walker->error_code |= PFERR_PRESENT_MASK;
 	if (write_fault)
 	if (write_fault)
 		walker->error_code |= PFERR_WRITE_MASK;
 		walker->error_code |= PFERR_WRITE_MASK;
 	if (user_fault)
 	if (user_fault)
 		walker->error_code |= PFERR_USER_MASK;
 		walker->error_code |= PFERR_USER_MASK;
-	if (fetch_fault)
+	if (fetch_fault && is_nx(vcpu))
 		walker->error_code |= PFERR_FETCH_MASK;
 		walker->error_code |= PFERR_FETCH_MASK;
 	if (rsvd_fault)
 	if (rsvd_fault)
 		walker->error_code |= PFERR_RSVD_MASK;
 		walker->error_code |= PFERR_RSVD_MASK;
@@ -252,7 +261,7 @@ err:
 	return 0;
 	return 0;
 }
 }
 
 
-static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *page,
+static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
 			      u64 *spte, const void *pte)
 			      u64 *spte, const void *pte)
 {
 {
 	pt_element_t gpte;
 	pt_element_t gpte;
@@ -263,7 +272,7 @@ static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *page,
 	gpte = *(const pt_element_t *)pte;
 	gpte = *(const pt_element_t *)pte;
 	if (~gpte & (PT_PRESENT_MASK | PT_ACCESSED_MASK)) {
 	if (~gpte & (PT_PRESENT_MASK | PT_ACCESSED_MASK)) {
 		if (!is_present_gpte(gpte)) {
 		if (!is_present_gpte(gpte)) {
-			if (page->unsync)
+			if (sp->unsync)
 				new_spte = shadow_trap_nonpresent_pte;
 				new_spte = shadow_trap_nonpresent_pte;
 			else
 			else
 				new_spte = shadow_notrap_nonpresent_pte;
 				new_spte = shadow_notrap_nonpresent_pte;
@@ -272,7 +281,7 @@ static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *page,
 		return;
 		return;
 	}
 	}
 	pgprintk("%s: gpte %llx spte %p\n", __func__, (u64)gpte, spte);
 	pgprintk("%s: gpte %llx spte %p\n", __func__, (u64)gpte, spte);
-	pte_access = page->role.access & FNAME(gpte_access)(vcpu, gpte);
+	pte_access = sp->role.access & FNAME(gpte_access)(vcpu, gpte);
 	if (gpte_to_gfn(gpte) != vcpu->arch.update_pte.gfn)
 	if (gpte_to_gfn(gpte) != vcpu->arch.update_pte.gfn)
 		return;
 		return;
 	pfn = vcpu->arch.update_pte.pfn;
 	pfn = vcpu->arch.update_pte.pfn;
@@ -285,11 +294,22 @@ static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *page,
 	 * we call mmu_set_spte() with reset_host_protection = true because that
 	 * we call mmu_set_spte() with reset_host_protection = true because that
 	 * vcpu->arch.update_pte.pfn was fetched from get_user_pages(write = 1).
 	 * vcpu->arch.update_pte.pfn was fetched from get_user_pages(write = 1).
 	 */
 	 */
-	mmu_set_spte(vcpu, spte, page->role.access, pte_access, 0, 0,
-		     gpte & PT_DIRTY_MASK, NULL, PT_PAGE_TABLE_LEVEL,
+	mmu_set_spte(vcpu, spte, sp->role.access, pte_access, 0, 0,
+		     is_dirty_gpte(gpte), NULL, PT_PAGE_TABLE_LEVEL,
 		     gpte_to_gfn(gpte), pfn, true, true);
 		     gpte_to_gfn(gpte), pfn, true, true);
 }
 }
 
 
+static bool FNAME(gpte_changed)(struct kvm_vcpu *vcpu,
+				struct guest_walker *gw, int level)
+{
+	int r;
+	pt_element_t curr_pte;
+
+	r = kvm_read_guest_atomic(vcpu->kvm, gw->pte_gpa[level - 1],
+				  &curr_pte, sizeof(curr_pte));
+	return r || curr_pte != gw->ptes[level - 1];
+}
+
 /*
 /*
  * Fetch a shadow pte for a specific level in the paging hierarchy.
  * Fetch a shadow pte for a specific level in the paging hierarchy.
  */
  */
@@ -299,75 +319,86 @@ static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr,
 			 int *ptwrite, pfn_t pfn)
 			 int *ptwrite, pfn_t pfn)
 {
 {
 	unsigned access = gw->pt_access;
 	unsigned access = gw->pt_access;
-	struct kvm_mmu_page *shadow_page;
-	u64 spte, *sptep = NULL;
-	int direct;
-	gfn_t table_gfn;
-	int r;
-	int level;
-	pt_element_t curr_pte;
-	struct kvm_shadow_walk_iterator iterator;
+	struct kvm_mmu_page *sp = NULL;
+	bool dirty = is_dirty_gpte(gw->ptes[gw->level - 1]);
+	int top_level;
+	unsigned direct_access;
+	struct kvm_shadow_walk_iterator it;
 
 
 	if (!is_present_gpte(gw->ptes[gw->level - 1]))
 	if (!is_present_gpte(gw->ptes[gw->level - 1]))
 		return NULL;
 		return NULL;
 
 
-	for_each_shadow_entry(vcpu, addr, iterator) {
-		level = iterator.level;
-		sptep = iterator.sptep;
-		if (iterator.level == hlevel) {
-			mmu_set_spte(vcpu, sptep, access,
-				     gw->pte_access & access,
-				     user_fault, write_fault,
-				     gw->ptes[gw->level-1] & PT_DIRTY_MASK,
-				     ptwrite, level,
-				     gw->gfn, pfn, false, true);
-			break;
-		}
+	direct_access = gw->pt_access & gw->pte_access;
+	if (!dirty)
+		direct_access &= ~ACC_WRITE_MASK;
 
 
-		if (is_shadow_present_pte(*sptep) && !is_large_pte(*sptep))
-			continue;
+	top_level = vcpu->arch.mmu.root_level;
+	if (top_level == PT32E_ROOT_LEVEL)
+		top_level = PT32_ROOT_LEVEL;
+	/*
+	 * Verify that the top-level gpte is still there.  Since the page
+	 * is a root page, it is either write protected (and cannot be
+	 * changed from now on) or it is invalid (in which case, we don't
+	 * really care if it changes underneath us after this point).
+	 */
+	if (FNAME(gpte_changed)(vcpu, gw, top_level))
+		goto out_gpte_changed;
 
 
-		if (is_large_pte(*sptep)) {
-			rmap_remove(vcpu->kvm, sptep);
-			__set_spte(sptep, shadow_trap_nonpresent_pte);
-			kvm_flush_remote_tlbs(vcpu->kvm);
-		}
+	for (shadow_walk_init(&it, vcpu, addr);
+	     shadow_walk_okay(&it) && it.level > gw->level;
+	     shadow_walk_next(&it)) {
+		gfn_t table_gfn;
 
 
-		if (level <= gw->level) {
-			int delta = level - gw->level + 1;
-			direct = 1;
-			if (!is_dirty_gpte(gw->ptes[level - delta]))
-				access &= ~ACC_WRITE_MASK;
-			table_gfn = gpte_to_gfn(gw->ptes[level - delta]);
-			/* advance table_gfn when emulating 1gb pages with 4k */
-			if (delta == 0)
-				table_gfn += PT_INDEX(addr, level);
-			access &= gw->pte_access;
-		} else {
-			direct = 0;
-			table_gfn = gw->table_gfn[level - 2];
-		}
-		shadow_page = kvm_mmu_get_page(vcpu, table_gfn, addr, level-1,
-					       direct, access, sptep);
-		if (!direct) {
-			r = kvm_read_guest_atomic(vcpu->kvm,
-						  gw->pte_gpa[level - 2],
-						  &curr_pte, sizeof(curr_pte));
-			if (r || curr_pte != gw->ptes[level - 2]) {
-				kvm_mmu_put_page(shadow_page, sptep);
-				kvm_release_pfn_clean(pfn);
-				sptep = NULL;
-				break;
-			}
+		drop_large_spte(vcpu, it.sptep);
+
+		sp = NULL;
+		if (!is_shadow_present_pte(*it.sptep)) {
+			table_gfn = gw->table_gfn[it.level - 2];
+			sp = kvm_mmu_get_page(vcpu, table_gfn, addr, it.level-1,
+					      false, access, it.sptep);
 		}
 		}
 
 
-		spte = __pa(shadow_page->spt)
-			| PT_PRESENT_MASK | PT_ACCESSED_MASK
-			| PT_WRITABLE_MASK | PT_USER_MASK;
-		*sptep = spte;
+		/*
+		 * Verify that the gpte in the page we've just write
+		 * protected is still there.
+		 */
+		if (FNAME(gpte_changed)(vcpu, gw, it.level - 1))
+			goto out_gpte_changed;
+
+		if (sp)
+			link_shadow_page(it.sptep, sp);
 	}
 	}
 
 
-	return sptep;
+	for (;
+	     shadow_walk_okay(&it) && it.level > hlevel;
+	     shadow_walk_next(&it)) {
+		gfn_t direct_gfn;
+
+		validate_direct_spte(vcpu, it.sptep, direct_access);
+
+		drop_large_spte(vcpu, it.sptep);
+
+		if (is_shadow_present_pte(*it.sptep))
+			continue;
+
+		direct_gfn = gw->gfn & ~(KVM_PAGES_PER_HPAGE(it.level) - 1);
+
+		sp = kvm_mmu_get_page(vcpu, direct_gfn, addr, it.level-1,
+				      true, direct_access, it.sptep);
+		link_shadow_page(it.sptep, sp);
+	}
+
+	mmu_set_spte(vcpu, it.sptep, access, gw->pte_access & access,
+		     user_fault, write_fault, dirty, ptwrite, it.level,
+		     gw->gfn, pfn, false, true);
+
+	return it.sptep;
+
+out_gpte_changed:
+	if (sp)
+		kvm_mmu_put_page(sp, it.sptep);
+	kvm_release_pfn_clean(pfn);
+	return NULL;
 }
 }
 
 
 /*
 /*
@@ -431,11 +462,8 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr,
 	pfn = gfn_to_pfn(vcpu->kvm, walker.gfn);
 	pfn = gfn_to_pfn(vcpu->kvm, walker.gfn);
 
 
 	/* mmio */
 	/* mmio */
-	if (is_error_pfn(pfn)) {
-		pgprintk("gfn %lx is mmio\n", walker.gfn);
-		kvm_release_pfn_clean(pfn);
-		return 1;
-	}
+	if (is_error_pfn(pfn))
+		return kvm_handle_bad_page(vcpu->kvm, walker.gfn, pfn);
 
 
 	spin_lock(&vcpu->kvm->mmu_lock);
 	spin_lock(&vcpu->kvm->mmu_lock);
 	if (mmu_notifier_retry(vcpu, mmu_seq))
 	if (mmu_notifier_retry(vcpu, mmu_seq))
@@ -443,6 +471,7 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr,
 	kvm_mmu_free_some_pages(vcpu);
 	kvm_mmu_free_some_pages(vcpu);
 	sptep = FNAME(fetch)(vcpu, addr, &walker, user_fault, write_fault,
 	sptep = FNAME(fetch)(vcpu, addr, &walker, user_fault, write_fault,
 			     level, &write_pt, pfn);
 			     level, &write_pt, pfn);
+	(void)sptep;
 	pgprintk("%s: shadow pte %p %llx ptwrite %d\n", __func__,
 	pgprintk("%s: shadow pte %p %llx ptwrite %d\n", __func__,
 		 sptep, *sptep, write_pt);
 		 sptep, *sptep, write_pt);
 
 
@@ -464,6 +493,7 @@ out_unlock:
 static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva)
 static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva)
 {
 {
 	struct kvm_shadow_walk_iterator iterator;
 	struct kvm_shadow_walk_iterator iterator;
+	struct kvm_mmu_page *sp;
 	gpa_t pte_gpa = -1;
 	gpa_t pte_gpa = -1;
 	int level;
 	int level;
 	u64 *sptep;
 	u64 *sptep;
@@ -475,10 +505,13 @@ static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva)
 		level = iterator.level;
 		level = iterator.level;
 		sptep = iterator.sptep;
 		sptep = iterator.sptep;
 
 
+		sp = page_header(__pa(sptep));
 		if (is_last_spte(*sptep, level)) {
 		if (is_last_spte(*sptep, level)) {
-			struct kvm_mmu_page *sp = page_header(__pa(sptep));
 			int offset, shift;
 			int offset, shift;
 
 
+			if (!sp->unsync)
+				break;
+
 			shift = PAGE_SHIFT -
 			shift = PAGE_SHIFT -
 				  (PT_LEVEL_BITS - PT64_LEVEL_BITS) * level;
 				  (PT_LEVEL_BITS - PT64_LEVEL_BITS) * level;
 			offset = sp->role.quadrant << shift;
 			offset = sp->role.quadrant << shift;
@@ -487,16 +520,17 @@ static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva)
 			pte_gpa += (sptep - sp->spt) * sizeof(pt_element_t);
 			pte_gpa += (sptep - sp->spt) * sizeof(pt_element_t);
 
 
 			if (is_shadow_present_pte(*sptep)) {
 			if (is_shadow_present_pte(*sptep)) {
-				rmap_remove(vcpu->kvm, sptep);
 				if (is_large_pte(*sptep))
 				if (is_large_pte(*sptep))
 					--vcpu->kvm->stat.lpages;
 					--vcpu->kvm->stat.lpages;
+				drop_spte(vcpu->kvm, sptep,
+					  shadow_trap_nonpresent_pte);
 				need_flush = 1;
 				need_flush = 1;
-			}
-			__set_spte(sptep, shadow_trap_nonpresent_pte);
+			} else
+				__set_spte(sptep, shadow_trap_nonpresent_pte);
 			break;
 			break;
 		}
 		}
 
 
-		if (!is_shadow_present_pte(*sptep))
+		if (!is_shadow_present_pte(*sptep) || !sp->unsync_children)
 			break;
 			break;
 	}
 	}
 
 
@@ -570,9 +604,9 @@ static void FNAME(prefetch_page)(struct kvm_vcpu *vcpu,
  * Using the cached information from sp->gfns is safe because:
  * Using the cached information from sp->gfns is safe because:
  * - The spte has a reference to the struct page, so the pfn for a given gfn
  * - The spte has a reference to the struct page, so the pfn for a given gfn
  *   can't change unless all sptes pointing to it are nuked first.
  *   can't change unless all sptes pointing to it are nuked first.
- * - Alias changes zap the entire shadow cache.
  */
  */
-static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
+static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
+			    bool clear_unsync)
 {
 {
 	int i, offset, nr_present;
 	int i, offset, nr_present;
 	bool reset_host_protection;
 	bool reset_host_protection;
@@ -580,6 +614,9 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
 
 
 	offset = nr_present = 0;
 	offset = nr_present = 0;
 
 
+	/* A direct kvm_mmu_page cannot be unsync. */
+	BUG_ON(sp->role.direct);
+
 	if (PTTYPE == 32)
 	if (PTTYPE == 32)
 		offset = sp->role.quadrant << PT64_LEVEL_BITS;
 		offset = sp->role.quadrant << PT64_LEVEL_BITS;
 
 
@@ -589,7 +626,7 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
 		unsigned pte_access;
 		unsigned pte_access;
 		pt_element_t gpte;
 		pt_element_t gpte;
 		gpa_t pte_gpa;
 		gpa_t pte_gpa;
-		gfn_t gfn = sp->gfns[i];
+		gfn_t gfn;
 
 
 		if (!is_shadow_present_pte(sp->spt[i]))
 		if (!is_shadow_present_pte(sp->spt[i]))
 			continue;
 			continue;
@@ -600,16 +637,17 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
 					  sizeof(pt_element_t)))
 					  sizeof(pt_element_t)))
 			return -EINVAL;
 			return -EINVAL;
 
 
-		if (gpte_to_gfn(gpte) != gfn || !is_present_gpte(gpte) ||
-		    !(gpte & PT_ACCESSED_MASK)) {
+		gfn = gpte_to_gfn(gpte);
+		if (is_rsvd_bits_set(vcpu, gpte, PT_PAGE_TABLE_LEVEL)
+		      || gfn != sp->gfns[i] || !is_present_gpte(gpte)
+		      || !(gpte & PT_ACCESSED_MASK)) {
 			u64 nonpresent;
 			u64 nonpresent;
 
 
-			rmap_remove(vcpu->kvm, &sp->spt[i]);
-			if (is_present_gpte(gpte))
+			if (is_present_gpte(gpte) || !clear_unsync)
 				nonpresent = shadow_trap_nonpresent_pte;
 				nonpresent = shadow_trap_nonpresent_pte;
 			else
 			else
 				nonpresent = shadow_notrap_nonpresent_pte;
 				nonpresent = shadow_notrap_nonpresent_pte;
-			__set_spte(&sp->spt[i], nonpresent);
+			drop_spte(vcpu->kvm, &sp->spt[i], nonpresent);
 			continue;
 			continue;
 		}
 		}
 
 

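For reference, a minimal standalone sketch (not from this commit) of the kind of gpte validity test sync_page now applies: reserved bits, gfn match, present and accessed bits. The bit masks are placeholders; the real checks live in the kernel helpers named above.

#include <stdbool.h>
#include <stdint.h>

/* Placeholder bit layout for a 64-bit guest PTE (illustrative only). */
#define GPTE_PRESENT   (1ULL << 0)
#define GPTE_ACCESSED  (1ULL << 5)
#define GPTE_RSVD_MASK (0xffULL << 52)   /* assumed reserved-bit mask */
#define GPTE_PFN_MASK  0x000ffffffffff000ULL

static uint64_t gpte_to_gfn(uint64_t gpte)
{
	return (gpte & GPTE_PFN_MASK) >> 12;
}

/* Mirrors the shape of the new check: any failure means the cached
 * shadow entry must be dropped rather than resynced. */
static bool gpte_still_valid(uint64_t gpte, uint64_t cached_gfn)
{
	if (gpte & GPTE_RSVD_MASK)            /* reserved bits set */
		return false;
	if (gpte_to_gfn(gpte) != cached_gfn)  /* gpte points elsewhere now */
		return false;
	if (!(gpte & GPTE_PRESENT))
		return false;
	if (!(gpte & GPTE_ACCESSED))
		return false;
	return true;
}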
+ 121 - 17
arch/x86/kvm/svm.c

@@ -4,6 +4,7 @@
  * AMD SVM support
  *
  * Copyright (C) 2006 Qumranet, Inc.
+ * Copyright 2010 Red Hat, Inc. and/or its affilates.
  *
  * Authors:
  *   Yaniv Kamay  <yaniv@qumranet.com>
@@ -285,11 +286,11 @@ static inline void flush_guest_tlb(struct kvm_vcpu *vcpu)
 
 static void svm_set_efer(struct kvm_vcpu *vcpu, u64 efer)
 {
+	vcpu->arch.efer = efer;
 	if (!npt_enabled && !(efer & EFER_LMA))
 		efer &= ~EFER_LME;
 
 	to_svm(vcpu)->vmcb->save.efer = efer | EFER_SVME;
-	vcpu->arch.efer = efer;
 }
 
 static int is_external_interrupt(u32 info)
@@ -640,7 +641,7 @@ static __init int svm_hardware_setup(void)
 
 	if (nested) {
 		printk(KERN_INFO "kvm: Nested Virtualization enabled\n");
-		kvm_enable_efer_bits(EFER_SVME);
+		kvm_enable_efer_bits(EFER_SVME | EFER_LMSLE);
 	}
 
 	for_each_possible_cpu(cpu) {
@@ -806,7 +807,7 @@ static void init_vmcb(struct vcpu_svm *svm)
 	 * svm_set_cr0() sets PG and WP and clears NW and CD on save->cr0.
 	 */
 	svm->vcpu.arch.cr0 = X86_CR0_NW | X86_CR0_CD | X86_CR0_ET;
-	kvm_set_cr0(&svm->vcpu, svm->vcpu.arch.cr0);
+	(void)kvm_set_cr0(&svm->vcpu, svm->vcpu.arch.cr0);
 
 	save->cr4 = X86_CR4_PAE;
 	/* rdx = ?? */
@@ -903,13 +904,18 @@ static struct kvm_vcpu *svm_create_vcpu(struct kvm *kvm, unsigned int id)
 	svm->asid_generation = 0;
 	init_vmcb(svm);
 
-	fx_init(&svm->vcpu);
+	err = fx_init(&svm->vcpu);
+	if (err)
+		goto free_page4;
+
 	svm->vcpu.arch.apic_base = 0xfee00000 | MSR_IA32_APICBASE_ENABLE;
 	if (kvm_vcpu_is_bsp(&svm->vcpu))
 		svm->vcpu.arch.apic_base |= MSR_IA32_APICBASE_BSP;
 
 	return &svm->vcpu;
 
+free_page4:
+	__free_page(hsave_page);
 free_page3:
 	__free_pages(nested_msrpm_pages, MSRPM_ALLOC_ORDER);
 free_page2:
@@ -1488,7 +1494,7 @@ static void svm_handle_mce(struct vcpu_svm *svm)
 		 */
 		pr_err("KVM: Guest triggered AMD Erratum 383\n");
 
-		set_bit(KVM_REQ_TRIPLE_FAULT, &svm->vcpu.requests);
+		kvm_make_request(KVM_REQ_TRIPLE_FAULT, &svm->vcpu);
 
 		return;
 	}
@@ -1535,7 +1541,7 @@ static int io_interception(struct vcpu_svm *svm)
 	string = (io_info & SVM_IOIO_STR_MASK) != 0;
 	in = (io_info & SVM_IOIO_TYPE_MASK) != 0;
 	if (string || in)
-		return !(emulate_instruction(vcpu, 0, 0, 0) == EMULATE_DO_MMIO);
+		return emulate_instruction(vcpu, 0, 0, 0) == EMULATE_DONE;
 
 	port = io_info >> 16;
 	size = (io_info & SVM_IOIO_SIZE_MASK) >> SVM_IOIO_SIZE_SHIFT;
@@ -1957,7 +1963,7 @@ static int nested_svm_vmexit(struct vcpu_svm *svm)
 		svm->vmcb->save.cr3 = hsave->save.cr3;
 		svm->vcpu.arch.cr3 = hsave->save.cr3;
 	} else {
-		kvm_set_cr3(&svm->vcpu, hsave->save.cr3);
+		(void)kvm_set_cr3(&svm->vcpu, hsave->save.cr3);
 	}
 	kvm_register_write(&svm->vcpu, VCPU_REGS_RAX, hsave->save.rax);
 	kvm_register_write(&svm->vcpu, VCPU_REGS_RSP, hsave->save.rsp);
@@ -2080,7 +2086,7 @@ static bool nested_svm_vmrun(struct vcpu_svm *svm)
 		svm->vmcb->save.cr3 = nested_vmcb->save.cr3;
 		svm->vcpu.arch.cr3 = nested_vmcb->save.cr3;
 	} else
-		kvm_set_cr3(&svm->vcpu, nested_vmcb->save.cr3);
+		(void)kvm_set_cr3(&svm->vcpu, nested_vmcb->save.cr3);
 
 	/* Guest paging mode is active - reset mmu */
 	kvm_mmu_reset_context(&svm->vcpu);
@@ -2386,16 +2392,12 @@ static int iret_interception(struct vcpu_svm *svm)
 
 static int invlpg_interception(struct vcpu_svm *svm)
 {
-	if (emulate_instruction(&svm->vcpu, 0, 0, 0) != EMULATE_DONE)
-		pr_unimpl(&svm->vcpu, "%s: failed\n", __func__);
-	return 1;
+	return emulate_instruction(&svm->vcpu, 0, 0, 0) == EMULATE_DONE;
 }
 
 static int emulate_on_interception(struct vcpu_svm *svm)
 {
-	if (emulate_instruction(&svm->vcpu, 0, 0, 0) != EMULATE_DONE)
-		pr_unimpl(&svm->vcpu, "%s: failed\n", __func__);
-	return 1;
+	return emulate_instruction(&svm->vcpu, 0, 0, 0) == EMULATE_DONE;
 }
 
 static int cr8_write_interception(struct vcpu_svm *svm)
@@ -2726,6 +2728,99 @@ static int (*svm_exit_handlers[])(struct vcpu_svm *svm) = {
 	[SVM_EXIT_NPF]				= pf_interception,
 };
 
+void dump_vmcb(struct kvm_vcpu *vcpu)
+{
+	struct vcpu_svm *svm = to_svm(vcpu);
+	struct vmcb_control_area *control = &svm->vmcb->control;
+	struct vmcb_save_area *save = &svm->vmcb->save;
+
+	pr_err("VMCB Control Area:\n");
+	pr_err("cr_read:            %04x\n", control->intercept_cr_read);
+	pr_err("cr_write:           %04x\n", control->intercept_cr_write);
+	pr_err("dr_read:            %04x\n", control->intercept_dr_read);
+	pr_err("dr_write:           %04x\n", control->intercept_dr_write);
+	pr_err("exceptions:         %08x\n", control->intercept_exceptions);
+	pr_err("intercepts:         %016llx\n", control->intercept);
+	pr_err("pause filter count: %d\n", control->pause_filter_count);
+	pr_err("iopm_base_pa:       %016llx\n", control->iopm_base_pa);
+	pr_err("msrpm_base_pa:      %016llx\n", control->msrpm_base_pa);
+	pr_err("tsc_offset:         %016llx\n", control->tsc_offset);
+	pr_err("asid:               %d\n", control->asid);
+	pr_err("tlb_ctl:            %d\n", control->tlb_ctl);
+	pr_err("int_ctl:            %08x\n", control->int_ctl);
+	pr_err("int_vector:         %08x\n", control->int_vector);
+	pr_err("int_state:          %08x\n", control->int_state);
+	pr_err("exit_code:          %08x\n", control->exit_code);
+	pr_err("exit_info1:         %016llx\n", control->exit_info_1);
+	pr_err("exit_info2:         %016llx\n", control->exit_info_2);
+	pr_err("exit_int_info:      %08x\n", control->exit_int_info);
+	pr_err("exit_int_info_err:  %08x\n", control->exit_int_info_err);
+	pr_err("nested_ctl:         %lld\n", control->nested_ctl);
+	pr_err("nested_cr3:         %016llx\n", control->nested_cr3);
+	pr_err("event_inj:          %08x\n", control->event_inj);
+	pr_err("event_inj_err:      %08x\n", control->event_inj_err);
+	pr_err("lbr_ctl:            %lld\n", control->lbr_ctl);
+	pr_err("next_rip:           %016llx\n", control->next_rip);
+	pr_err("VMCB State Save Area:\n");
+	pr_err("es:   s: %04x a: %04x l: %08x b: %016llx\n",
+		save->es.selector, save->es.attrib,
+		save->es.limit, save->es.base);
+	pr_err("cs:   s: %04x a: %04x l: %08x b: %016llx\n",
+		save->cs.selector, save->cs.attrib,
+		save->cs.limit, save->cs.base);
+	pr_err("ss:   s: %04x a: %04x l: %08x b: %016llx\n",
+		save->ss.selector, save->ss.attrib,
+		save->ss.limit, save->ss.base);
+	pr_err("ds:   s: %04x a: %04x l: %08x b: %016llx\n",
+		save->ds.selector, save->ds.attrib,
+		save->ds.limit, save->ds.base);
+	pr_err("fs:   s: %04x a: %04x l: %08x b: %016llx\n",
+		save->fs.selector, save->fs.attrib,
+		save->fs.limit, save->fs.base);
+	pr_err("gs:   s: %04x a: %04x l: %08x b: %016llx\n",
+		save->gs.selector, save->gs.attrib,
+		save->gs.limit, save->gs.base);
+	pr_err("gdtr: s: %04x a: %04x l: %08x b: %016llx\n",
+		save->gdtr.selector, save->gdtr.attrib,
+		save->gdtr.limit, save->gdtr.base);
+	pr_err("ldtr: s: %04x a: %04x l: %08x b: %016llx\n",
+		save->ldtr.selector, save->ldtr.attrib,
+		save->ldtr.limit, save->ldtr.base);
+	pr_err("idtr: s: %04x a: %04x l: %08x b: %016llx\n",
+		save->idtr.selector, save->idtr.attrib,
+		save->idtr.limit, save->idtr.base);
+	pr_err("tr:   s: %04x a: %04x l: %08x b: %016llx\n",
+		save->tr.selector, save->tr.attrib,
+		save->tr.limit, save->tr.base);
+	pr_err("cpl:            %d                efer:         %016llx\n",
+		save->cpl, save->efer);
+	pr_err("cr0:            %016llx cr2:          %016llx\n",
+		save->cr0, save->cr2);
+	pr_err("cr3:            %016llx cr4:          %016llx\n",
+		save->cr3, save->cr4);
+	pr_err("dr6:            %016llx dr7:          %016llx\n",
+		save->dr6, save->dr7);
+	pr_err("rip:            %016llx rflags:       %016llx\n",
+		save->rip, save->rflags);
+	pr_err("rsp:            %016llx rax:          %016llx\n",
+		save->rsp, save->rax);
+	pr_err("star:           %016llx lstar:        %016llx\n",
+		save->star, save->lstar);
+	pr_err("cstar:          %016llx sfmask:       %016llx\n",
+		save->cstar, save->sfmask);
+	pr_err("kernel_gs_base: %016llx sysenter_cs:  %016llx\n",
+		save->kernel_gs_base, save->sysenter_cs);
+	pr_err("sysenter_esp:   %016llx sysenter_eip: %016llx\n",
+		save->sysenter_esp, save->sysenter_eip);
+	pr_err("gpat:           %016llx dbgctl:       %016llx\n",
+		save->g_pat, save->dbgctl);
+	pr_err("br_from:        %016llx br_to:        %016llx\n",
+		save->br_from, save->br_to);
+	pr_err("excp_from:      %016llx excp_to:      %016llx\n",
+		save->last_excp_from, save->last_excp_to);
+
+}
+
 static int handle_exit(struct kvm_vcpu *vcpu)
 {
 	struct vcpu_svm *svm = to_svm(vcpu);
@@ -2770,6 +2865,8 @@ static int handle_exit(struct kvm_vcpu *vcpu)
 		kvm_run->exit_reason = KVM_EXIT_FAIL_ENTRY;
 		kvm_run->fail_entry.hardware_entry_failure_reason
 			= svm->vmcb->control.exit_code;
+		pr_err("KVM: FAILED VMRUN WITH VMCB:\n");
+		dump_vmcb(vcpu);
 		return 0;
 	}
 
@@ -2826,9 +2923,6 @@ static inline void svm_inject_irq(struct vcpu_svm *svm, int irq)
 {
 	struct vmcb_control_area *control;
 
-	trace_kvm_inj_virq(irq);
-
-	++svm->vcpu.stat.irq_injections;
 	control = &svm->vmcb->control;
 	control->int_vector = irq;
 	control->int_ctl &= ~V_INTR_PRIO_MASK;
@@ -2842,6 +2936,9 @@ static void svm_set_irq(struct kvm_vcpu *vcpu)
 
 	BUG_ON(!(gif_set(svm)));
 
+	trace_kvm_inj_virq(vcpu->arch.interrupt.nr);
+	++vcpu->stat.irq_injections;
+
 	svm->vmcb->control.event_inj = vcpu->arch.interrupt.nr |
 		SVM_EVTINJ_VALID | SVM_EVTINJ_TYPE_INTR;
 }
@@ -3327,6 +3424,11 @@ static bool svm_rdtscp_supported(void)
 	return false;
 }
 
+static bool svm_has_wbinvd_exit(void)
+{
+	return true;
+}
+
 static void svm_fpu_deactivate(struct kvm_vcpu *vcpu)
 {
 	struct vcpu_svm *svm = to_svm(vcpu);
@@ -3411,6 +3513,8 @@ static struct kvm_x86_ops svm_x86_ops = {
 	.rdtscp_supported = svm_rdtscp_supported,
 
 	.set_supported_cpuid = svm_set_supported_cpuid,
+
+	.has_wbinvd_exit = svm_has_wbinvd_exit,
 };
 
 static int __init svm_init(void)

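As a rough illustration of the simplified handler convention above (a standalone sketch, not kernel code): a handler now just reports whether emulation completed, and the caller decides what a zero return means. The stub names are invented for the example.

#include <stdio.h>

enum emulation_result { EMULATE_DONE, EMULATE_FAIL, EMULATE_DO_MMIO };

/* Stand-in for emulate_instruction(); always "succeeds" here. */
static enum emulation_result emulate_instruction_stub(void)
{
	return EMULATE_DONE;
}

/* New style: return 1 to keep running the guest, 0 to exit to userspace. */
static int emulate_on_interception(void)
{
	return emulate_instruction_stub() == EMULATE_DONE;
}

int main(void)
{
	if (!emulate_on_interception())
		printf("would exit to userspace\n");
	else
		printf("guest continues\n");
	return 0;
}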
+ 15 - 1
arch/x86/kvm/timer.c

@@ -1,3 +1,17 @@
+/*
+ * Kernel-based Virtual Machine driver for Linux
+ *
+ * This module enables machines with Intel VT-x extensions to run virtual
+ * machines without emulation or binary translation.
+ *
+ * timer support
+ *
+ * Copyright 2010 Red Hat, Inc. and/or its affilates.
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2.  See
+ * the COPYING file in the top-level directory.
+ */
+
 #include <linux/kvm_host.h>
 #include <linux/kvm.h>
 #include <linux/hrtimer.h>
@@ -18,7 +32,7 @@ static int __kvm_timer_fn(struct kvm_vcpu *vcpu, struct kvm_timer *ktimer)
 	if (ktimer->reinject || !atomic_read(&ktimer->pending)) {
 		atomic_inc(&ktimer->pending);
 		/* FIXME: this code should not know anything about vcpus */
-		set_bit(KVM_REQ_PENDING_TIMER, &vcpu->requests);
+		kvm_make_request(KVM_REQ_PENDING_TIMER, vcpu);
 	}
 
 	if (waitqueue_active(q))

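The kvm_make_request() conversions above wrap the old open-coded set_bit() on vcpu->requests. A minimal user-space sketch of the same request-flag idea, with assumed names rather than the kernel API:

#include <stdatomic.h>
#include <stdio.h>

#define REQ_PENDING_TIMER 0
#define REQ_TLB_FLUSH     1

struct toy_vcpu {
	atomic_ulong requests;
};

/* Set a request bit; safe against concurrent setters. */
static void toy_make_request(int req, struct toy_vcpu *vcpu)
{
	atomic_fetch_or(&vcpu->requests, 1UL << req);
}

/* Test and clear a request bit, returning whether it was set. */
static int toy_check_request(int req, struct toy_vcpu *vcpu)
{
	unsigned long mask = 1UL << req;

	return (atomic_fetch_and(&vcpu->requests, ~mask) & mask) != 0;
}

int main(void)
{
	struct toy_vcpu vcpu = { .requests = 0 };

	toy_make_request(REQ_PENDING_TIMER, &vcpu);
	printf("timer pending: %d\n", toy_check_request(REQ_PENDING_TIMER, &vcpu));
	printf("tlb flush:     %d\n", toy_check_request(REQ_TLB_FLUSH, &vcpu));
	return 0;
}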
+ 175 - 78
arch/x86/kvm/vmx.c

@@ -5,6 +5,7 @@
  * machines without emulation or binary translation.
  * machines without emulation or binary translation.
  *
  *
  * Copyright (C) 2006 Qumranet, Inc.
  * Copyright (C) 2006 Qumranet, Inc.
+ * Copyright 2010 Red Hat, Inc. and/or its affilates.
  *
  *
  * Authors:
  * Authors:
  *   Avi Kivity   <avi@qumranet.com>
  *   Avi Kivity   <avi@qumranet.com>
@@ -36,6 +37,8 @@
 #include <asm/vmx.h>
 #include <asm/vmx.h>
 #include <asm/virtext.h>
 #include <asm/virtext.h>
 #include <asm/mce.h>
 #include <asm/mce.h>
+#include <asm/i387.h>
+#include <asm/xcr.h>
 
 
 #include "trace.h"
 #include "trace.h"
 
 
@@ -63,6 +66,9 @@ module_param_named(unrestricted_guest,
 static int __read_mostly emulate_invalid_guest_state = 0;
 static int __read_mostly emulate_invalid_guest_state = 0;
 module_param(emulate_invalid_guest_state, bool, S_IRUGO);
 module_param(emulate_invalid_guest_state, bool, S_IRUGO);
 
 
+static int __read_mostly vmm_exclusive = 1;
+module_param(vmm_exclusive, bool, S_IRUGO);
+
 #define KVM_GUEST_CR0_MASK_UNRESTRICTED_GUEST				\
 #define KVM_GUEST_CR0_MASK_UNRESTRICTED_GUEST				\
 	(X86_CR0_WP | X86_CR0_NE | X86_CR0_NW | X86_CR0_CD)
 	(X86_CR0_WP | X86_CR0_NE | X86_CR0_NW | X86_CR0_CD)
 #define KVM_GUEST_CR0_MASK						\
 #define KVM_GUEST_CR0_MASK						\
@@ -173,10 +179,13 @@ static inline struct vcpu_vmx *to_vmx(struct kvm_vcpu *vcpu)
 
 
 static int init_rmode(struct kvm *kvm);
 static int init_rmode(struct kvm *kvm);
 static u64 construct_eptp(unsigned long root_hpa);
 static u64 construct_eptp(unsigned long root_hpa);
+static void kvm_cpu_vmxon(u64 addr);
+static void kvm_cpu_vmxoff(void);
 
 
 static DEFINE_PER_CPU(struct vmcs *, vmxarea);
 static DEFINE_PER_CPU(struct vmcs *, vmxarea);
 static DEFINE_PER_CPU(struct vmcs *, current_vmcs);
 static DEFINE_PER_CPU(struct vmcs *, current_vmcs);
 static DEFINE_PER_CPU(struct list_head, vcpus_on_cpu);
 static DEFINE_PER_CPU(struct list_head, vcpus_on_cpu);
+static DEFINE_PER_CPU(struct desc_ptr, host_gdt);
 
 
 static unsigned long *vmx_io_bitmap_a;
 static unsigned long *vmx_io_bitmap_a;
 static unsigned long *vmx_io_bitmap_b;
 static unsigned long *vmx_io_bitmap_b;
@@ -334,6 +343,11 @@ static inline bool cpu_has_vmx_ept_1g_page(void)
 	return vmx_capability.ept & VMX_EPT_1GB_PAGE_BIT;
 	return vmx_capability.ept & VMX_EPT_1GB_PAGE_BIT;
 }
 }
 
 
+static inline bool cpu_has_vmx_ept_4levels(void)
+{
+	return vmx_capability.ept & VMX_EPT_PAGE_WALK_4_BIT;
+}
+
 static inline bool cpu_has_vmx_invept_individual_addr(void)
 static inline bool cpu_has_vmx_invept_individual_addr(void)
 {
 {
 	return vmx_capability.ept & VMX_EPT_EXTENT_INDIVIDUAL_BIT;
 	return vmx_capability.ept & VMX_EPT_EXTENT_INDIVIDUAL_BIT;
@@ -349,6 +363,16 @@ static inline bool cpu_has_vmx_invept_global(void)
 	return vmx_capability.ept & VMX_EPT_EXTENT_GLOBAL_BIT;
 	return vmx_capability.ept & VMX_EPT_EXTENT_GLOBAL_BIT;
 }
 }
 
 
+static inline bool cpu_has_vmx_invvpid_single(void)
+{
+	return vmx_capability.vpid & VMX_VPID_EXTENT_SINGLE_CONTEXT_BIT;
+}
+
+static inline bool cpu_has_vmx_invvpid_global(void)
+{
+	return vmx_capability.vpid & VMX_VPID_EXTENT_GLOBAL_CONTEXT_BIT;
+}
+
 static inline bool cpu_has_vmx_ept(void)
 static inline bool cpu_has_vmx_ept(void)
 {
 {
 	return vmcs_config.cpu_based_2nd_exec_ctrl &
 	return vmcs_config.cpu_based_2nd_exec_ctrl &
@@ -389,6 +413,12 @@ static inline bool cpu_has_virtual_nmis(void)
 	return vmcs_config.pin_based_exec_ctrl & PIN_BASED_VIRTUAL_NMIS;
 	return vmcs_config.pin_based_exec_ctrl & PIN_BASED_VIRTUAL_NMIS;
 }
 }
 
 
+static inline bool cpu_has_vmx_wbinvd_exit(void)
+{
+	return vmcs_config.cpu_based_2nd_exec_ctrl &
+		SECONDARY_EXEC_WBINVD_EXITING;
+}
+
 static inline bool report_flexpriority(void)
 static inline bool report_flexpriority(void)
 {
 {
 	return flexpriority_enabled;
 	return flexpriority_enabled;
@@ -453,6 +483,19 @@ static void vmcs_clear(struct vmcs *vmcs)
 		       vmcs, phys_addr);
 		       vmcs, phys_addr);
 }
 }
 
 
+static void vmcs_load(struct vmcs *vmcs)
+{
+	u64 phys_addr = __pa(vmcs);
+	u8 error;
+
+	asm volatile (__ex(ASM_VMX_VMPTRLD_RAX) "; setna %0"
+			: "=g"(error) : "a"(&phys_addr), "m"(phys_addr)
+			: "cc", "memory");
+	if (error)
+		printk(KERN_ERR "kvm: vmptrld %p/%llx fail\n",
+		       vmcs, phys_addr);
+}
+
 static void __vcpu_clear(void *arg)
 static void __vcpu_clear(void *arg)
 {
 {
 	struct vcpu_vmx *vmx = arg;
 	struct vcpu_vmx *vmx = arg;
@@ -475,12 +518,27 @@ static void vcpu_clear(struct vcpu_vmx *vmx)
 	smp_call_function_single(vmx->vcpu.cpu, __vcpu_clear, vmx, 1);
 }
 
-static inline void vpid_sync_vcpu_all(struct vcpu_vmx *vmx)
+static inline void vpid_sync_vcpu_single(struct vcpu_vmx *vmx)
 {
 	if (vmx->vpid == 0)
 		return;
 
-	__invvpid(VMX_VPID_EXTENT_SINGLE_CONTEXT, vmx->vpid, 0);
+	if (cpu_has_vmx_invvpid_single())
+		__invvpid(VMX_VPID_EXTENT_SINGLE_CONTEXT, vmx->vpid, 0);
+}
+
+static inline void vpid_sync_vcpu_global(void)
+{
+	if (cpu_has_vmx_invvpid_global())
+		__invvpid(VMX_VPID_EXTENT_ALL_CONTEXT, 0, 0);
+}
+
+static inline void vpid_sync_context(struct vcpu_vmx *vmx)
+{
+	if (cpu_has_vmx_invvpid_single())
+		vpid_sync_vcpu_single(vmx);
+	else
+		vpid_sync_vcpu_global();
 }
 
 static inline void ept_sync_global(void)
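A compact sketch of the fallback logic introduced above, using stand-in capability flags: prefer the narrower single-context invalidation when the CPU advertises it, otherwise fall back to a global flush. Names and the printed output are illustrative only.

#include <stdbool.h>
#include <stdio.h>

/* Pretend capability bits read from the VPID capability MSR at setup. */
static bool has_invvpid_single = false;
static bool has_invvpid_global = true;

static void invalidate_single_context(int vpid)
{
	printf("INVVPID single-context, vpid=%d\n", vpid);
}

static void invalidate_all_contexts(void)
{
	printf("INVVPID all-contexts\n");
}

static void sync_vpid_context(int vpid)
{
	if (vpid == 0)
		return;                   /* vpid 0 is the host; nothing to do */
	if (has_invvpid_single)
		invalidate_single_context(vpid);
	else if (has_invvpid_global)
		invalidate_all_contexts();
}

int main(void)
{
	sync_vpid_context(3);             /* falls back to the global flush here */
	return 0;
}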
@@ -812,6 +870,9 @@ static void __vmx_load_host_state(struct vcpu_vmx *vmx)
 		wrmsrl(MSR_KERNEL_GS_BASE, vmx->msr_host_kernel_gs_base);
 		wrmsrl(MSR_KERNEL_GS_BASE, vmx->msr_host_kernel_gs_base);
 	}
 	}
 #endif
 #endif
+	if (current_thread_info()->status & TS_USEDFPU)
+		clts();
+	load_gdt(&__get_cpu_var(host_gdt));
 }
 }
 
 
 static void vmx_load_host_state(struct vcpu_vmx *vmx)
 static void vmx_load_host_state(struct vcpu_vmx *vmx)
@@ -828,35 +889,30 @@ static void vmx_load_host_state(struct vcpu_vmx *vmx)
 static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
 static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
 {
 {
 	struct vcpu_vmx *vmx = to_vmx(vcpu);
 	struct vcpu_vmx *vmx = to_vmx(vcpu);
-	u64 phys_addr = __pa(vmx->vmcs);
 	u64 tsc_this, delta, new_offset;
 	u64 tsc_this, delta, new_offset;
+	u64 phys_addr = __pa(per_cpu(vmxarea, cpu));
 
 
-	if (vcpu->cpu != cpu) {
+	if (!vmm_exclusive)
+		kvm_cpu_vmxon(phys_addr);
+	else if (vcpu->cpu != cpu)
 		vcpu_clear(vmx);
 		vcpu_clear(vmx);
-		kvm_migrate_timers(vcpu);
-		set_bit(KVM_REQ_TLB_FLUSH, &vcpu->requests);
-		local_irq_disable();
-		list_add(&vmx->local_vcpus_link,
-			 &per_cpu(vcpus_on_cpu, cpu));
-		local_irq_enable();
-	}
 
 
 	if (per_cpu(current_vmcs, cpu) != vmx->vmcs) {
 	if (per_cpu(current_vmcs, cpu) != vmx->vmcs) {
-		u8 error;
-
 		per_cpu(current_vmcs, cpu) = vmx->vmcs;
 		per_cpu(current_vmcs, cpu) = vmx->vmcs;
-		asm volatile (__ex(ASM_VMX_VMPTRLD_RAX) "; setna %0"
-			      : "=g"(error) : "a"(&phys_addr), "m"(phys_addr)
-			      : "cc");
-		if (error)
-			printk(KERN_ERR "kvm: vmptrld %p/%llx fail\n",
-			       vmx->vmcs, phys_addr);
+		vmcs_load(vmx->vmcs);
 	}
 	}
 
 
 	if (vcpu->cpu != cpu) {
 	if (vcpu->cpu != cpu) {
 		struct desc_ptr dt;
 		struct desc_ptr dt;
 		unsigned long sysenter_esp;
 		unsigned long sysenter_esp;
 
 
+		kvm_migrate_timers(vcpu);
+		kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
+		local_irq_disable();
+		list_add(&vmx->local_vcpus_link,
+			 &per_cpu(vcpus_on_cpu, cpu));
+		local_irq_enable();
+
 		vcpu->cpu = cpu;
 		vcpu->cpu = cpu;
 		/*
 		/*
 		 * Linux uses per-cpu TSS and GDT, so set these when switching
 		 * Linux uses per-cpu TSS and GDT, so set these when switching
@@ -884,6 +940,10 @@ static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
 static void vmx_vcpu_put(struct kvm_vcpu *vcpu)
 static void vmx_vcpu_put(struct kvm_vcpu *vcpu)
 {
 {
 	__vmx_load_host_state(to_vmx(vcpu));
 	__vmx_load_host_state(to_vmx(vcpu));
+	if (!vmm_exclusive) {
+		__vcpu_clear(to_vmx(vcpu));
+		kvm_cpu_vmxoff();
+	}
 }
 }
 
 
 static void vmx_fpu_activate(struct kvm_vcpu *vcpu)
 static void vmx_fpu_activate(struct kvm_vcpu *vcpu)
@@ -1286,6 +1346,13 @@ static __init int vmx_disabled_by_bios(void)
 	/* locked but not enabled */
 	/* locked but not enabled */
 }
 }
 
 
+static void kvm_cpu_vmxon(u64 addr)
+{
+	asm volatile (ASM_VMX_VMXON_RAX
+			: : "a"(&addr), "m"(addr)
+			: "memory", "cc");
+}
+
 static int hardware_enable(void *garbage)
 static int hardware_enable(void *garbage)
 {
 {
 	int cpu = raw_smp_processor_id();
 	int cpu = raw_smp_processor_id();
@@ -1308,11 +1375,13 @@ static int hardware_enable(void *garbage)
 		wrmsrl(MSR_IA32_FEATURE_CONTROL, old | test_bits);
 		wrmsrl(MSR_IA32_FEATURE_CONTROL, old | test_bits);
 	}
 	}
 	write_cr4(read_cr4() | X86_CR4_VMXE); /* FIXME: not cpu hotplug safe */
 	write_cr4(read_cr4() | X86_CR4_VMXE); /* FIXME: not cpu hotplug safe */
-	asm volatile (ASM_VMX_VMXON_RAX
-		      : : "a"(&phys_addr), "m"(phys_addr)
-		      : "memory", "cc");
 
 
-	ept_sync_global();
+	if (vmm_exclusive) {
+		kvm_cpu_vmxon(phys_addr);
+		ept_sync_global();
+	}
+
+	store_gdt(&__get_cpu_var(host_gdt));
 
 
 	return 0;
 	return 0;
 }
 }
@@ -1334,13 +1403,15 @@ static void vmclear_local_vcpus(void)
 static void kvm_cpu_vmxoff(void)
 static void kvm_cpu_vmxoff(void)
 {
 {
 	asm volatile (__ex(ASM_VMX_VMXOFF) : : : "cc");
 	asm volatile (__ex(ASM_VMX_VMXOFF) : : : "cc");
-	write_cr4(read_cr4() & ~X86_CR4_VMXE);
 }
 }
 
 
 static void hardware_disable(void *garbage)
 static void hardware_disable(void *garbage)
 {
 {
-	vmclear_local_vcpus();
-	kvm_cpu_vmxoff();
+	if (vmm_exclusive) {
+		vmclear_local_vcpus();
+		kvm_cpu_vmxoff();
+	}
+	write_cr4(read_cr4() & ~X86_CR4_VMXE);
 }
 }
 
 
 static __init int adjust_vmx_controls(u32 ctl_min, u32 ctl_opt,
 static __init int adjust_vmx_controls(u32 ctl_min, u32 ctl_opt,
@@ -1539,7 +1610,8 @@ static __init int hardware_setup(void)
 	if (!cpu_has_vmx_vpid())
 	if (!cpu_has_vmx_vpid())
 		enable_vpid = 0;
 		enable_vpid = 0;
 
 
-	if (!cpu_has_vmx_ept()) {
+	if (!cpu_has_vmx_ept() ||
+	    !cpu_has_vmx_ept_4levels()) {
 		enable_ept = 0;
 		enable_ept = 0;
 		enable_unrestricted_guest = 0;
 		enable_unrestricted_guest = 0;
 	}
 	}
@@ -1628,7 +1700,7 @@ static gva_t rmode_tss_base(struct kvm *kvm)
 		gfn_t base_gfn;
 		gfn_t base_gfn;
 
 
 		slots = kvm_memslots(kvm);
 		slots = kvm_memslots(kvm);
-		base_gfn = kvm->memslots->memslots[0].base_gfn +
+		base_gfn = slots->memslots[0].base_gfn +
 				 kvm->memslots->memslots[0].npages - 3;
 				 kvm->memslots->memslots[0].npages - 3;
 		return base_gfn << PAGE_SHIFT;
 		return base_gfn << PAGE_SHIFT;
 	}
 	}
@@ -1759,9 +1831,12 @@ static void exit_lmode(struct kvm_vcpu *vcpu)
 
 
 static void vmx_flush_tlb(struct kvm_vcpu *vcpu)
 static void vmx_flush_tlb(struct kvm_vcpu *vcpu)
 {
 {
-	vpid_sync_vcpu_all(to_vmx(vcpu));
-	if (enable_ept)
+	vpid_sync_context(to_vmx(vcpu));
+	if (enable_ept) {
+		if (!VALID_PAGE(vcpu->arch.mmu.root_hpa))
+			return;
 		ept_sync_context(construct_eptp(vcpu->arch.mmu.root_hpa));
 		ept_sync_context(construct_eptp(vcpu->arch.mmu.root_hpa));
+	}
 }
 }
 
 
 static void vmx_decache_cr0_guest_bits(struct kvm_vcpu *vcpu)
 static void vmx_decache_cr0_guest_bits(struct kvm_vcpu *vcpu)
@@ -2507,7 +2582,7 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx)
 	vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH, !!bypass_guest_pf);
 	vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH, !!bypass_guest_pf);
 	vmcs_write32(CR3_TARGET_COUNT, 0);           /* 22.2.1 */
 	vmcs_write32(CR3_TARGET_COUNT, 0);           /* 22.2.1 */
 
 
-	vmcs_writel(HOST_CR0, read_cr0());  /* 22.2.3 */
+	vmcs_writel(HOST_CR0, read_cr0() | X86_CR0_TS);  /* 22.2.3 */
 	vmcs_writel(HOST_CR4, read_cr4());  /* 22.2.3, 22.2.5 */
 	vmcs_writel(HOST_CR4, read_cr4());  /* 22.2.3, 22.2.5 */
 	vmcs_writel(HOST_CR3, read_cr3());  /* 22.2.3  FIXME: shadow tables */
 	vmcs_writel(HOST_CR3, read_cr3());  /* 22.2.3  FIXME: shadow tables */
 
 
@@ -2599,21 +2674,27 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx)
 
 
 static int init_rmode(struct kvm *kvm)
 static int init_rmode(struct kvm *kvm)
 {
 {
+	int idx, ret = 0;
+
+	idx = srcu_read_lock(&kvm->srcu);
 	if (!init_rmode_tss(kvm))
 	if (!init_rmode_tss(kvm))
-		return 0;
+		goto exit;
 	if (!init_rmode_identity_map(kvm))
 	if (!init_rmode_identity_map(kvm))
-		return 0;
-	return 1;
+		goto exit;
+
+	ret = 1;
+exit:
+	srcu_read_unlock(&kvm->srcu, idx);
+	return ret;
 }
 }
 
 
 static int vmx_vcpu_reset(struct kvm_vcpu *vcpu)
 static int vmx_vcpu_reset(struct kvm_vcpu *vcpu)
 {
 {
 	struct vcpu_vmx *vmx = to_vmx(vcpu);
 	struct vcpu_vmx *vmx = to_vmx(vcpu);
 	u64 msr;
 	u64 msr;
-	int ret, idx;
+	int ret;
 
 
 	vcpu->arch.regs_avail = ~((1 << VCPU_REGS_RIP) | (1 << VCPU_REGS_RSP));
 	vcpu->arch.regs_avail = ~((1 << VCPU_REGS_RIP) | (1 << VCPU_REGS_RSP));
-	idx = srcu_read_lock(&vcpu->kvm->srcu);
 	if (!init_rmode(vmx->vcpu.kvm)) {
 	if (!init_rmode(vmx->vcpu.kvm)) {
 		ret = -ENOMEM;
 		ret = -ENOMEM;
 		goto out;
 		goto out;
@@ -2630,7 +2711,9 @@ static int vmx_vcpu_reset(struct kvm_vcpu *vcpu)
 		msr |= MSR_IA32_APICBASE_BSP;
 		msr |= MSR_IA32_APICBASE_BSP;
 	kvm_set_apic_base(&vmx->vcpu, msr);
 	kvm_set_apic_base(&vmx->vcpu, msr);
 
 
-	fx_init(&vmx->vcpu);
+	ret = fx_init(&vmx->vcpu);
+	if (ret != 0)
+		goto out;
 
 
 	seg_setup(VCPU_SREG_CS);
 	seg_setup(VCPU_SREG_CS);
 	/*
 	/*
@@ -2713,7 +2796,7 @@ static int vmx_vcpu_reset(struct kvm_vcpu *vcpu)
 	vmx_fpu_activate(&vmx->vcpu);
 	vmx_fpu_activate(&vmx->vcpu);
 	update_exception_bitmap(&vmx->vcpu);
 	update_exception_bitmap(&vmx->vcpu);
 
 
-	vpid_sync_vcpu_all(vmx);
+	vpid_sync_context(vmx);
 
 
 	ret = 0;
 	ret = 0;
 
 
@@ -2721,7 +2804,6 @@ static int vmx_vcpu_reset(struct kvm_vcpu *vcpu)
 	vmx->emulation_required = 0;
 	vmx->emulation_required = 0;
 
 
 out:
 out:
-	srcu_read_unlock(&vcpu->kvm->srcu, idx);
 	return ret;
 	return ret;
 }
 }
 
 
@@ -2826,9 +2908,7 @@ static bool vmx_get_nmi_mask(struct kvm_vcpu *vcpu)
 {
 {
 	if (!cpu_has_virtual_nmis())
 	if (!cpu_has_virtual_nmis())
 		return to_vmx(vcpu)->soft_vnmi_blocked;
 		return to_vmx(vcpu)->soft_vnmi_blocked;
-	else
-		return !!(vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) &
-			  GUEST_INTR_STATE_NMI);
+	return vmcs_read32(GUEST_INTERRUPTIBILITY_INFO)	& GUEST_INTR_STATE_NMI;
 }
 }
 
 
 static void vmx_set_nmi_mask(struct kvm_vcpu *vcpu, bool masked)
 static void vmx_set_nmi_mask(struct kvm_vcpu *vcpu, bool masked)
@@ -3070,7 +3150,7 @@ static int handle_io(struct kvm_vcpu *vcpu)
 	++vcpu->stat.io_exits;
 	++vcpu->stat.io_exits;
 
 
 	if (string || in)
 	if (string || in)
-		return !(emulate_instruction(vcpu, 0, 0, 0) == EMULATE_DO_MMIO);
+		return emulate_instruction(vcpu, 0, 0, 0) == EMULATE_DONE;
 
 
 	port = exit_qualification >> 16;
 	port = exit_qualification >> 16;
 	size = (exit_qualification & 7) + 1;
 	size = (exit_qualification & 7) + 1;
@@ -3090,11 +3170,20 @@ vmx_patch_hypercall(struct kvm_vcpu *vcpu, unsigned char *hypercall)
 	hypercall[2] = 0xc1;
 	hypercall[2] = 0xc1;
 }
 }
 
 
+static void complete_insn_gp(struct kvm_vcpu *vcpu, int err)
+{
+	if (err)
+		kvm_inject_gp(vcpu, 0);
+	else
+		skip_emulated_instruction(vcpu);
+}
+
 static int handle_cr(struct kvm_vcpu *vcpu)
 static int handle_cr(struct kvm_vcpu *vcpu)
 {
 {
 	unsigned long exit_qualification, val;
 	unsigned long exit_qualification, val;
 	int cr;
 	int cr;
 	int reg;
 	int reg;
+	int err;
 
 
 	exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
 	exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
 	cr = exit_qualification & 15;
 	cr = exit_qualification & 15;
@@ -3105,16 +3194,16 @@ static int handle_cr(struct kvm_vcpu *vcpu)
 		trace_kvm_cr_write(cr, val);
 		trace_kvm_cr_write(cr, val);
 		switch (cr) {
 		switch (cr) {
 		case 0:
 		case 0:
-			kvm_set_cr0(vcpu, val);
-			skip_emulated_instruction(vcpu);
+			err = kvm_set_cr0(vcpu, val);
+			complete_insn_gp(vcpu, err);
 			return 1;
 			return 1;
 		case 3:
 		case 3:
-			kvm_set_cr3(vcpu, val);
-			skip_emulated_instruction(vcpu);
+			err = kvm_set_cr3(vcpu, val);
+			complete_insn_gp(vcpu, err);
 			return 1;
 			return 1;
 		case 4:
 		case 4:
-			kvm_set_cr4(vcpu, val);
-			skip_emulated_instruction(vcpu);
+			err = kvm_set_cr4(vcpu, val);
+			complete_insn_gp(vcpu, err);
 			return 1;
 			return 1;
 		case 8: {
 		case 8: {
 				u8 cr8_prev = kvm_get_cr8(vcpu);
 				u8 cr8_prev = kvm_get_cr8(vcpu);
@@ -3321,30 +3410,25 @@ static int handle_invlpg(struct kvm_vcpu *vcpu)
 static int handle_wbinvd(struct kvm_vcpu *vcpu)
 static int handle_wbinvd(struct kvm_vcpu *vcpu)
 {
 {
 	skip_emulated_instruction(vcpu);
 	skip_emulated_instruction(vcpu);
-	/* TODO: Add support for VT-d/pass-through device */
+	kvm_emulate_wbinvd(vcpu);
 	return 1;
 	return 1;
 }
 }
 
 
-static int handle_apic_access(struct kvm_vcpu *vcpu)
+static int handle_xsetbv(struct kvm_vcpu *vcpu)
 {
 {
-	unsigned long exit_qualification;
-	enum emulation_result er;
-	unsigned long offset;
+	u64 new_bv = kvm_read_edx_eax(vcpu);
+	u32 index = kvm_register_read(vcpu, VCPU_REGS_RCX);
 
 
-	exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
-	offset = exit_qualification & 0xffful;
-
-	er = emulate_instruction(vcpu, 0, 0, 0);
-
-	if (er !=  EMULATE_DONE) {
-		printk(KERN_ERR
-		       "Fail to handle apic access vmexit! Offset is 0x%lx\n",
-		       offset);
-		return -ENOEXEC;
-	}
+	if (kvm_set_xcr(vcpu, index, new_bv) == 0)
+		skip_emulated_instruction(vcpu);
 	return 1;
 	return 1;
 }
 }
 
 
+static int handle_apic_access(struct kvm_vcpu *vcpu)
+{
+	return emulate_instruction(vcpu, 0, 0, 0) == EMULATE_DONE;
+}
+
 static int handle_task_switch(struct kvm_vcpu *vcpu)
 static int handle_task_switch(struct kvm_vcpu *vcpu)
 {
 {
 	struct vcpu_vmx *vmx = to_vmx(vcpu);
 	struct vcpu_vmx *vmx = to_vmx(vcpu);
@@ -3554,13 +3638,8 @@ static int handle_invalid_guest_state(struct kvm_vcpu *vcpu)
 			goto out;
 			goto out;
 		}
 		}
 
 
-		if (err != EMULATE_DONE) {
-			vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
-			vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION;
-			vcpu->run->internal.ndata = 0;
-			ret = 0;
-			goto out;
-		}
+		if (err != EMULATE_DONE)
+			return 0;
 
 
 		if (signal_pending(current))
 		if (signal_pending(current))
 			goto out;
 			goto out;
@@ -3623,6 +3702,7 @@ static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = {
 	[EXIT_REASON_TPR_BELOW_THRESHOLD]     = handle_tpr_below_threshold,
 	[EXIT_REASON_TPR_BELOW_THRESHOLD]     = handle_tpr_below_threshold,
 	[EXIT_REASON_APIC_ACCESS]             = handle_apic_access,
 	[EXIT_REASON_APIC_ACCESS]             = handle_apic_access,
 	[EXIT_REASON_WBINVD]                  = handle_wbinvd,
 	[EXIT_REASON_WBINVD]                  = handle_wbinvd,
+	[EXIT_REASON_XSETBV]                  = handle_xsetbv,
 	[EXIT_REASON_TASK_SWITCH]             = handle_task_switch,
 	[EXIT_REASON_TASK_SWITCH]             = handle_task_switch,
 	[EXIT_REASON_MCE_DURING_VMENTRY]      = handle_machine_check,
 	[EXIT_REASON_MCE_DURING_VMENTRY]      = handle_machine_check,
 	[EXIT_REASON_EPT_VIOLATION]	      = handle_ept_violation,
 	[EXIT_REASON_EPT_VIOLATION]	      = handle_ept_violation,
@@ -3656,6 +3736,13 @@ static int vmx_handle_exit(struct kvm_vcpu *vcpu)
 	if (enable_ept && is_paging(vcpu))
 	if (enable_ept && is_paging(vcpu))
 		vcpu->arch.cr3 = vmcs_readl(GUEST_CR3);
 		vcpu->arch.cr3 = vmcs_readl(GUEST_CR3);
 
 
+	if (exit_reason & VMX_EXIT_REASONS_FAILED_VMENTRY) {
+		vcpu->run->exit_reason = KVM_EXIT_FAIL_ENTRY;
+		vcpu->run->fail_entry.hardware_entry_failure_reason
+			= exit_reason;
+		return 0;
+	}
+
 	if (unlikely(vmx->fail)) {
 	if (unlikely(vmx->fail)) {
 		vcpu->run->exit_reason = KVM_EXIT_FAIL_ENTRY;
 		vcpu->run->exit_reason = KVM_EXIT_FAIL_ENTRY;
 		vcpu->run->fail_entry.hardware_entry_failure_reason
 		vcpu->run->fail_entry.hardware_entry_failure_reason
@@ -3861,11 +3948,6 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu)
 	if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP)
 	if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP)
 		vmx_set_interrupt_shadow(vcpu, 0);
 		vmx_set_interrupt_shadow(vcpu, 0);
 
 
-	/*
-	 * Loading guest fpu may have cleared host cr0.ts
-	 */
-	vmcs_writel(HOST_CR0, read_cr0());
-
 	asm(
 	asm(
 		/* Store host registers */
 		/* Store host registers */
 		"push %%"R"dx; push %%"R"bp;"
 		"push %%"R"dx; push %%"R"bp;"
@@ -4001,6 +4083,19 @@ static void vmx_free_vcpu(struct kvm_vcpu *vcpu)
 	kmem_cache_free(kvm_vcpu_cache, vmx);
 	kmem_cache_free(kvm_vcpu_cache, vmx);
 }
 }
 
 
+static inline void vmcs_init(struct vmcs *vmcs)
+{
+	u64 phys_addr = __pa(per_cpu(vmxarea, raw_smp_processor_id()));
+
+	if (!vmm_exclusive)
+		kvm_cpu_vmxon(phys_addr);
+
+	vmcs_clear(vmcs);
+
+	if (!vmm_exclusive)
+		kvm_cpu_vmxoff();
+}
+
 static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id)
 static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id)
 {
 {
 	int err;
 	int err;
@@ -4026,7 +4121,7 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id)
 	if (!vmx->vmcs)
 	if (!vmx->vmcs)
 		goto free_msrs;
 		goto free_msrs;
 
 
-	vmcs_clear(vmx->vmcs);
+	vmcs_init(vmx->vmcs);
 
 
 	cpu = get_cpu();
 	cpu = get_cpu();
 	vmx_vcpu_load(&vmx->vcpu, cpu);
 	vmx_vcpu_load(&vmx->vcpu, cpu);
@@ -4265,6 +4360,8 @@ static struct kvm_x86_ops vmx_x86_ops = {
 	.rdtscp_supported = vmx_rdtscp_supported,
 
 	.set_supported_cpuid = vmx_set_supported_cpuid,
+
+	.has_wbinvd_exit = cpu_has_vmx_wbinvd_exit,
 };
 
 static int __init vmx_init(void)

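The new XSETBV exit handler above reads the requested XCR0 from edx:eax and lets kvm_set_xcr() validate it. A standalone sketch of that validation shape, with illustrative constants rather than the kernel's definitions:

#include <stdint.h>
#include <stdio.h>

#define XSTATE_FP  (1ULL << 0)
#define XSTATE_SSE (1ULL << 1)
#define XSTATE_YMM (1ULL << 2)

/* Host-supported features; assume FP+SSE+YMM for the example. */
static const uint64_t host_xcr0 = XSTATE_FP | XSTATE_SSE | XSTATE_YMM;

/* Returns 0 on success, 1 if the guest value should raise #GP. */
static int check_xcr0(uint32_t eax, uint32_t edx)
{
	uint64_t xcr0 = ((uint64_t)edx << 32) | eax;

	if (!(xcr0 & XSTATE_FP))
		return 1;                     /* x87 state is mandatory */
	if ((xcr0 & XSTATE_YMM) && !(xcr0 & XSTATE_SSE))
		return 1;                     /* AVX state requires SSE state */
	if (xcr0 & ~host_xcr0)
		return 1;                     /* unknown/unsupported bits */
	return 0;
}

int main(void)
{
	printf("%d\n", check_xcr0(XSTATE_FP | XSTATE_SSE, 0));   /* 0: accepted */
	printf("%d\n", check_xcr0(XSTATE_FP | XSTATE_YMM, 0));   /* 1: #GP */
	return 0;
}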
+ 649 - 525
arch/x86/kvm/x86.c

@@ -6,6 +6,7 @@
  * Copyright (C) 2006 Qumranet, Inc.
  * Copyright (C) 2006 Qumranet, Inc.
  * Copyright (C) 2008 Qumranet, Inc.
  * Copyright (C) 2008 Qumranet, Inc.
  * Copyright IBM Corporation, 2008
  * Copyright IBM Corporation, 2008
+ * Copyright 2010 Red Hat, Inc. and/or its affilates.
  *
  *
  * Authors:
  * Authors:
  *   Avi Kivity   <avi@qumranet.com>
  *   Avi Kivity   <avi@qumranet.com>
@@ -41,17 +42,19 @@
 #include <linux/srcu.h>
 #include <linux/srcu.h>
 #include <linux/slab.h>
 #include <linux/slab.h>
 #include <linux/perf_event.h>
 #include <linux/perf_event.h>
+#include <linux/uaccess.h>
 #include <trace/events/kvm.h>
 #include <trace/events/kvm.h>
 
 
 #define CREATE_TRACE_POINTS
 #define CREATE_TRACE_POINTS
 #include "trace.h"
 #include "trace.h"
 
 
 #include <asm/debugreg.h>
 #include <asm/debugreg.h>
-#include <asm/uaccess.h>
 #include <asm/msr.h>
 #include <asm/msr.h>
 #include <asm/desc.h>
 #include <asm/desc.h>
 #include <asm/mtrr.h>
 #include <asm/mtrr.h>
 #include <asm/mce.h>
 #include <asm/mce.h>
+#include <asm/i387.h>
+#include <asm/xcr.h>
 
 
 #define MAX_IO_MSRS 256
 #define MAX_IO_MSRS 256
 #define CR0_RESERVED_BITS						\
 #define CR0_RESERVED_BITS						\
@@ -62,6 +65,7 @@
 	(~(unsigned long)(X86_CR4_VME | X86_CR4_PVI | X86_CR4_TSD | X86_CR4_DE\
 	(~(unsigned long)(X86_CR4_VME | X86_CR4_PVI | X86_CR4_TSD | X86_CR4_DE\
 			  | X86_CR4_PSE | X86_CR4_PAE | X86_CR4_MCE	\
 			  | X86_CR4_PSE | X86_CR4_PAE | X86_CR4_MCE	\
 			  | X86_CR4_PGE | X86_CR4_PCE | X86_CR4_OSFXSR	\
 			  | X86_CR4_PGE | X86_CR4_PCE | X86_CR4_OSFXSR	\
+			  | X86_CR4_OSXSAVE \
 			  | X86_CR4_OSXMMEXCPT | X86_CR4_VMXE))
 			  | X86_CR4_OSXMMEXCPT | X86_CR4_VMXE))
 
 
 #define CR8_RESERVED_BITS (~(unsigned long)X86_CR8_TPR)
 #define CR8_RESERVED_BITS (~(unsigned long)X86_CR8_TPR)
@@ -147,6 +151,13 @@ struct kvm_stats_debugfs_item debugfs_entries[] = {
 	{ NULL }
 	{ NULL }
 };
 };
 
 
+u64 __read_mostly host_xcr0;
+
+static inline u32 bit(int bitno)
+{
+	return 1 << (bitno & 31);
+}
+
 static void kvm_on_user_return(struct user_return_notifier *urn)
 static void kvm_on_user_return(struct user_return_notifier *urn)
 {
 {
 	unsigned slot;
 	unsigned slot;
@@ -285,7 +296,7 @@ static void kvm_multiple_exception(struct kvm_vcpu *vcpu,
 	prev_nr = vcpu->arch.exception.nr;
 	prev_nr = vcpu->arch.exception.nr;
 	if (prev_nr == DF_VECTOR) {
 	if (prev_nr == DF_VECTOR) {
 		/* triple fault -> shutdown */
 		/* triple fault -> shutdown */
-		set_bit(KVM_REQ_TRIPLE_FAULT, &vcpu->requests);
+		kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
 		return;
 		return;
 	}
 	}
 	class1 = exception_class(prev_nr);
 	class1 = exception_class(prev_nr);
@@ -414,121 +425,163 @@ out:
 	return changed;
 	return changed;
 }
 }
 
 
-void kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
+int kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
 {
 {
+	unsigned long old_cr0 = kvm_read_cr0(vcpu);
+	unsigned long update_bits = X86_CR0_PG | X86_CR0_WP |
+				    X86_CR0_CD | X86_CR0_NW;
+
 	cr0 |= X86_CR0_ET;
 	cr0 |= X86_CR0_ET;
 
 
 #ifdef CONFIG_X86_64
 #ifdef CONFIG_X86_64
-	if (cr0 & 0xffffffff00000000UL) {
-		kvm_inject_gp(vcpu, 0);
-		return;
-	}
+	if (cr0 & 0xffffffff00000000UL)
+		return 1;
 #endif
 #endif
 
 
 	cr0 &= ~CR0_RESERVED_BITS;
 	cr0 &= ~CR0_RESERVED_BITS;
 
 
-	if ((cr0 & X86_CR0_NW) && !(cr0 & X86_CR0_CD)) {
-		kvm_inject_gp(vcpu, 0);
-		return;
-	}
+	if ((cr0 & X86_CR0_NW) && !(cr0 & X86_CR0_CD))
+		return 1;
 
 
-	if ((cr0 & X86_CR0_PG) && !(cr0 & X86_CR0_PE)) {
-		kvm_inject_gp(vcpu, 0);
-		return;
-	}
+	if ((cr0 & X86_CR0_PG) && !(cr0 & X86_CR0_PE))
+		return 1;
 
 
 	if (!is_paging(vcpu) && (cr0 & X86_CR0_PG)) {
 	if (!is_paging(vcpu) && (cr0 & X86_CR0_PG)) {
 #ifdef CONFIG_X86_64
 #ifdef CONFIG_X86_64
 		if ((vcpu->arch.efer & EFER_LME)) {
 		if ((vcpu->arch.efer & EFER_LME)) {
 			int cs_db, cs_l;
 			int cs_db, cs_l;
 
 
-			if (!is_pae(vcpu)) {
-				kvm_inject_gp(vcpu, 0);
-				return;
-			}
+			if (!is_pae(vcpu))
+				return 1;
 			kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l);
 			kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l);
-			if (cs_l) {
-				kvm_inject_gp(vcpu, 0);
-				return;
-
-			}
+			if (cs_l)
+				return 1;
 		} else
 		} else
 #endif
 #endif
-		if (is_pae(vcpu) && !load_pdptrs(vcpu, vcpu->arch.cr3)) {
-			kvm_inject_gp(vcpu, 0);
-			return;
-		}
-
+		if (is_pae(vcpu) && !load_pdptrs(vcpu, vcpu->arch.cr3))
+			return 1;
 	}
 	}
 
 
 	kvm_x86_ops->set_cr0(vcpu, cr0);
 	kvm_x86_ops->set_cr0(vcpu, cr0);
 
 
-	kvm_mmu_reset_context(vcpu);
-	return;
+	if ((cr0 ^ old_cr0) & update_bits)
+		kvm_mmu_reset_context(vcpu);
+	return 0;
 }
 }
 EXPORT_SYMBOL_GPL(kvm_set_cr0);
 EXPORT_SYMBOL_GPL(kvm_set_cr0);
 
 
 void kvm_lmsw(struct kvm_vcpu *vcpu, unsigned long msw)
 void kvm_lmsw(struct kvm_vcpu *vcpu, unsigned long msw)
 {
 {
-	kvm_set_cr0(vcpu, kvm_read_cr0_bits(vcpu, ~0x0eul) | (msw & 0x0f));
+	(void)kvm_set_cr0(vcpu, kvm_read_cr0_bits(vcpu, ~0x0eul) | (msw & 0x0f));
 }
 }
 EXPORT_SYMBOL_GPL(kvm_lmsw);
 EXPORT_SYMBOL_GPL(kvm_lmsw);
 
 
-void kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
+int __kvm_set_xcr(struct kvm_vcpu *vcpu, u32 index, u64 xcr)
 {
 {
-	unsigned long old_cr4 = kvm_read_cr4(vcpu);
-	unsigned long pdptr_bits = X86_CR4_PGE | X86_CR4_PSE | X86_CR4_PAE;
+	u64 xcr0;
 
 
-	if (cr4 & CR4_RESERVED_BITS) {
+	/* Only support XCR_XFEATURE_ENABLED_MASK(xcr0) now  */
+	if (index != XCR_XFEATURE_ENABLED_MASK)
+		return 1;
+	xcr0 = xcr;
+	if (kvm_x86_ops->get_cpl(vcpu) != 0)
+		return 1;
+	if (!(xcr0 & XSTATE_FP))
+		return 1;
+	if ((xcr0 & XSTATE_YMM) && !(xcr0 & XSTATE_SSE))
+		return 1;
+	if (xcr0 & ~host_xcr0)
+		return 1;
+	vcpu->arch.xcr0 = xcr0;
+	vcpu->guest_xcr0_loaded = 0;
+	return 0;
+}
+
+int kvm_set_xcr(struct kvm_vcpu *vcpu, u32 index, u64 xcr)
+{
+	if (__kvm_set_xcr(vcpu, index, xcr)) {
 		kvm_inject_gp(vcpu, 0);
 		kvm_inject_gp(vcpu, 0);
+		return 1;
+	}
+	return 0;
+}
+EXPORT_SYMBOL_GPL(kvm_set_xcr);
+
+static bool guest_cpuid_has_xsave(struct kvm_vcpu *vcpu)
+{
+	struct kvm_cpuid_entry2 *best;
+
+	best = kvm_find_cpuid_entry(vcpu, 1, 0);
+	return best && (best->ecx & bit(X86_FEATURE_XSAVE));
+}
+
+static void update_cpuid(struct kvm_vcpu *vcpu)
+{
+	struct kvm_cpuid_entry2 *best;
+
+	best = kvm_find_cpuid_entry(vcpu, 1, 0);
+	if (!best)
 		return;
 		return;
+
+	/* Update OSXSAVE bit */
+	if (cpu_has_xsave && best->function == 0x1) {
+		best->ecx &= ~(bit(X86_FEATURE_OSXSAVE));
+		if (kvm_read_cr4_bits(vcpu, X86_CR4_OSXSAVE))
+			best->ecx |= bit(X86_FEATURE_OSXSAVE);
 	}
 	}
+}
+
+int kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
+{
+	unsigned long old_cr4 = kvm_read_cr4(vcpu);
+	unsigned long pdptr_bits = X86_CR4_PGE | X86_CR4_PSE | X86_CR4_PAE;
+
+	if (cr4 & CR4_RESERVED_BITS)
+		return 1;
+
+	if (!guest_cpuid_has_xsave(vcpu) && (cr4 & X86_CR4_OSXSAVE))
+		return 1;
 
 
 	if (is_long_mode(vcpu)) {
 	if (is_long_mode(vcpu)) {
-		if (!(cr4 & X86_CR4_PAE)) {
-			kvm_inject_gp(vcpu, 0);
-			return;
-		}
+		if (!(cr4 & X86_CR4_PAE))
+			return 1;
 	} else if (is_paging(vcpu) && (cr4 & X86_CR4_PAE)
 	} else if (is_paging(vcpu) && (cr4 & X86_CR4_PAE)
 		   && ((cr4 ^ old_cr4) & pdptr_bits)
 		   && ((cr4 ^ old_cr4) & pdptr_bits)
-		   && !load_pdptrs(vcpu, vcpu->arch.cr3)) {
-		kvm_inject_gp(vcpu, 0);
-		return;
-	}
+		   && !load_pdptrs(vcpu, vcpu->arch.cr3))
+		return 1;
+
+	if (cr4 & X86_CR4_VMXE)
+		return 1;
 
 
-	if (cr4 & X86_CR4_VMXE) {
-		kvm_inject_gp(vcpu, 0);
-		return;
-	}
 	kvm_x86_ops->set_cr4(vcpu, cr4);
 	kvm_x86_ops->set_cr4(vcpu, cr4);
-	vcpu->arch.cr4 = cr4;
-	kvm_mmu_reset_context(vcpu);
+
+	if ((cr4 ^ old_cr4) & pdptr_bits)
+		kvm_mmu_reset_context(vcpu);
+
+	if ((cr4 ^ old_cr4) & X86_CR4_OSXSAVE)
+		update_cpuid(vcpu);
+
+	return 0;
 }
 }
 EXPORT_SYMBOL_GPL(kvm_set_cr4);
 EXPORT_SYMBOL_GPL(kvm_set_cr4);
 
 
-void kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
+int kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
 {
 {
 	if (cr3 == vcpu->arch.cr3 && !pdptrs_changed(vcpu)) {
 	if (cr3 == vcpu->arch.cr3 && !pdptrs_changed(vcpu)) {
 		kvm_mmu_sync_roots(vcpu);
 		kvm_mmu_sync_roots(vcpu);
 		kvm_mmu_flush_tlb(vcpu);
 		kvm_mmu_flush_tlb(vcpu);
-		return;
+		return 0;
 	}
 	}
 
 
 	if (is_long_mode(vcpu)) {
 	if (is_long_mode(vcpu)) {
-		if (cr3 & CR3_L_MODE_RESERVED_BITS) {
-			kvm_inject_gp(vcpu, 0);
-			return;
-		}
+		if (cr3 & CR3_L_MODE_RESERVED_BITS)
+			return 1;
 	} else {
 	} else {
 		if (is_pae(vcpu)) {
 		if (is_pae(vcpu)) {
-			if (cr3 & CR3_PAE_RESERVED_BITS) {
-				kvm_inject_gp(vcpu, 0);
-				return;
-			}
-			if (is_paging(vcpu) && !load_pdptrs(vcpu, cr3)) {
-				kvm_inject_gp(vcpu, 0);
-				return;
-			}
+			if (cr3 & CR3_PAE_RESERVED_BITS)
+				return 1;
+			if (is_paging(vcpu) && !load_pdptrs(vcpu, cr3))
+				return 1;
 		}
 		}
 		/*
 		/*
 		 * We don't check reserved bits in nonpae mode, because
 		 * We don't check reserved bits in nonpae mode, because
@@ -546,24 +599,28 @@ void kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
 	 * to debug) behavior on the guest side.
 	 * to debug) behavior on the guest side.
 	 */
 	 */
 	if (unlikely(!gfn_to_memslot(vcpu->kvm, cr3 >> PAGE_SHIFT)))
 	if (unlikely(!gfn_to_memslot(vcpu->kvm, cr3 >> PAGE_SHIFT)))
-		kvm_inject_gp(vcpu, 0);
-	else {
-		vcpu->arch.cr3 = cr3;
-		vcpu->arch.mmu.new_cr3(vcpu);
-	}
+		return 1;
+	vcpu->arch.cr3 = cr3;
+	vcpu->arch.mmu.new_cr3(vcpu);
+	return 0;
 }
 }
 EXPORT_SYMBOL_GPL(kvm_set_cr3);
 EXPORT_SYMBOL_GPL(kvm_set_cr3);
 
 
-void kvm_set_cr8(struct kvm_vcpu *vcpu, unsigned long cr8)
+int __kvm_set_cr8(struct kvm_vcpu *vcpu, unsigned long cr8)
 {
 {
-	if (cr8 & CR8_RESERVED_BITS) {
-		kvm_inject_gp(vcpu, 0);
-		return;
-	}
+	if (cr8 & CR8_RESERVED_BITS)
+		return 1;
 	if (irqchip_in_kernel(vcpu->kvm))
 	if (irqchip_in_kernel(vcpu->kvm))
 		kvm_lapic_set_tpr(vcpu, cr8);
 		kvm_lapic_set_tpr(vcpu, cr8);
 	else
 	else
 		vcpu->arch.cr8 = cr8;
 		vcpu->arch.cr8 = cr8;
+	return 0;
+}
+
+void kvm_set_cr8(struct kvm_vcpu *vcpu, unsigned long cr8)
+{
+	if (__kvm_set_cr8(vcpu, cr8))
+		kvm_inject_gp(vcpu, 0);
 }
 }
 EXPORT_SYMBOL_GPL(kvm_set_cr8);
 EXPORT_SYMBOL_GPL(kvm_set_cr8);
 
 
@@ -576,7 +633,7 @@ unsigned long kvm_get_cr8(struct kvm_vcpu *vcpu)
 }
 }
 EXPORT_SYMBOL_GPL(kvm_get_cr8);
 EXPORT_SYMBOL_GPL(kvm_get_cr8);
 
 
-int kvm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long val)
+static int __kvm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long val)
 {
 {
 	switch (dr) {
 	switch (dr) {
 	case 0 ... 3:
 	case 0 ... 3:
@@ -585,29 +642,21 @@ int kvm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long val)
 			vcpu->arch.eff_db[dr] = val;
 			vcpu->arch.eff_db[dr] = val;
 		break;
 		break;
 	case 4:
 	case 4:
-		if (kvm_read_cr4_bits(vcpu, X86_CR4_DE)) {
-			kvm_queue_exception(vcpu, UD_VECTOR);
-			return 1;
-		}
+		if (kvm_read_cr4_bits(vcpu, X86_CR4_DE))
+			return 1; /* #UD */
 		/* fall through */
 		/* fall through */
 	case 6:
 	case 6:
-		if (val & 0xffffffff00000000ULL) {
-			kvm_inject_gp(vcpu, 0);
-			return 1;
-		}
+		if (val & 0xffffffff00000000ULL)
+			return -1; /* #GP */
 		vcpu->arch.dr6 = (val & DR6_VOLATILE) | DR6_FIXED_1;
 		vcpu->arch.dr6 = (val & DR6_VOLATILE) | DR6_FIXED_1;
 		break;
 		break;
 	case 5:
 	case 5:
-		if (kvm_read_cr4_bits(vcpu, X86_CR4_DE)) {
-			kvm_queue_exception(vcpu, UD_VECTOR);
-			return 1;
-		}
+		if (kvm_read_cr4_bits(vcpu, X86_CR4_DE))
+			return 1; /* #UD */
 		/* fall through */
 		/* fall through */
 	default: /* 7 */
 	default: /* 7 */
-		if (val & 0xffffffff00000000ULL) {
-			kvm_inject_gp(vcpu, 0);
-			return 1;
-		}
+		if (val & 0xffffffff00000000ULL)
+			return -1; /* #GP */
 		vcpu->arch.dr7 = (val & DR7_VOLATILE) | DR7_FIXED_1;
 		vcpu->arch.dr7 = (val & DR7_VOLATILE) | DR7_FIXED_1;
 		if (!(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)) {
 		if (!(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)) {
 			kvm_x86_ops->set_dr7(vcpu, vcpu->arch.dr7);
 			kvm_x86_ops->set_dr7(vcpu, vcpu->arch.dr7);
@@ -618,28 +667,37 @@ int kvm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long val)
 
 
 	return 0;
 	return 0;
 }
 }
+
+int kvm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long val)
+{
+	int res;
+
+	res = __kvm_set_dr(vcpu, dr, val);
+	if (res > 0)
+		kvm_queue_exception(vcpu, UD_VECTOR);
+	else if (res < 0)
+		kvm_inject_gp(vcpu, 0);
+
+	return res;
+}
 EXPORT_SYMBOL_GPL(kvm_set_dr);
 EXPORT_SYMBOL_GPL(kvm_set_dr);
 
 
-int kvm_get_dr(struct kvm_vcpu *vcpu, int dr, unsigned long *val)
+static int _kvm_get_dr(struct kvm_vcpu *vcpu, int dr, unsigned long *val)
 {
 {
 	switch (dr) {
 	switch (dr) {
 	case 0 ... 3:
 	case 0 ... 3:
 		*val = vcpu->arch.db[dr];
 		*val = vcpu->arch.db[dr];
 		break;
 		break;
 	case 4:
 	case 4:
-		if (kvm_read_cr4_bits(vcpu, X86_CR4_DE)) {
-			kvm_queue_exception(vcpu, UD_VECTOR);
+		if (kvm_read_cr4_bits(vcpu, X86_CR4_DE))
 			return 1;
 			return 1;
-		}
 		/* fall through */
 		/* fall through */
 	case 6:
 	case 6:
 		*val = vcpu->arch.dr6;
 		*val = vcpu->arch.dr6;
 		break;
 		break;
 	case 5:
 	case 5:
-		if (kvm_read_cr4_bits(vcpu, X86_CR4_DE)) {
-			kvm_queue_exception(vcpu, UD_VECTOR);
+		if (kvm_read_cr4_bits(vcpu, X86_CR4_DE))
 			return 1;
 			return 1;
-		}
 		/* fall through */
 		/* fall through */
 	default: /* 7 */
 	default: /* 7 */
 		*val = vcpu->arch.dr7;
 		*val = vcpu->arch.dr7;
@@ -648,12 +706,16 @@ int kvm_get_dr(struct kvm_vcpu *vcpu, int dr, unsigned long *val)
 
 	return 0;
 }
-EXPORT_SYMBOL_GPL(kvm_get_dr);
 
-static inline u32 bit(int bitno)
+int kvm_get_dr(struct kvm_vcpu *vcpu, int dr, unsigned long *val)
 {
-	return 1 << (bitno & 31);
+	if (_kvm_get_dr(vcpu, dr, val)) {
+		kvm_queue_exception(vcpu, UD_VECTOR);
+		return 1;
+	}
+	return 0;
 }
+EXPORT_SYMBOL_GPL(kvm_get_dr);
 
 /*
  * List of msr numbers which we expose to userspace through KVM_GET_MSRS
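The debug-register rework above splits "what went wrong" from "what to inject": the __kvm_set_dr()/_kvm_get_dr() helpers return a code and only the public wrappers raise the exception. A small standalone sketch of that split, with invented names and simplified checks:

#include <stdio.h>

enum fault { FAULT_NONE = 0, FAULT_UD = 1, FAULT_GP = -1 };

static int cr4_de_set = 1;   /* pretend CR4.DE is set */

/* Helper: reports a fault code but injects nothing. */
static int set_dr_helper(int dr, unsigned long long val)
{
	if ((dr == 4 || dr == 5) && cr4_de_set)
		return FAULT_UD;                 /* DR4/DR5 aliasing disabled */
	if (dr >= 6 && (val >> 32))
		return FAULT_GP;                 /* upper 32 bits are reserved */
	return FAULT_NONE;
}

/* Wrapper: decides which exception the fault code maps to. */
static int set_dr(int dr, unsigned long long val)
{
	int res = set_dr_helper(dr, val);

	if (res > 0)
		printf("inject #UD\n");
	else if (res < 0)
		printf("inject #GP\n");
	return res;
}

int main(void)
{
	set_dr(4, 0);                    /* takes the #UD path */
	set_dr(7, 1ULL << 40);           /* takes the #GP path */
	return 0;
}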
@@ -682,10 +744,14 @@ static unsigned num_msrs_to_save;
 
 
 static u32 emulated_msrs[] = {
 static u32 emulated_msrs[] = {
 	MSR_IA32_MISC_ENABLE,
 	MSR_IA32_MISC_ENABLE,
+	MSR_IA32_MCG_STATUS,
+	MSR_IA32_MCG_CTL,
 };
 };
 
 
 static int set_efer(struct kvm_vcpu *vcpu, u64 efer)
 static int set_efer(struct kvm_vcpu *vcpu, u64 efer)
 {
 {
+	u64 old_efer = vcpu->arch.efer;
+
 	if (efer & efer_reserved_bits)
 	if (efer & efer_reserved_bits)
 		return 1;
 		return 1;
 
 
@@ -714,11 +780,13 @@ static int set_efer(struct kvm_vcpu *vcpu, u64 efer)
 
 
 	kvm_x86_ops->set_efer(vcpu, efer);
 	kvm_x86_ops->set_efer(vcpu, efer);
 
 
-	vcpu->arch.efer = efer;
-
 	vcpu->arch.mmu.base_role.nxe = (efer & EFER_NX) && !tdp_enabled;
 	vcpu->arch.mmu.base_role.nxe = (efer & EFER_NX) && !tdp_enabled;
 	kvm_mmu_reset_context(vcpu);
 	kvm_mmu_reset_context(vcpu);
 
 
+	/* Update reserved bits */
+	if ((efer ^ old_efer) & EFER_NX)
+		kvm_mmu_reset_context(vcpu);
+
 	return 0;
 	return 0;
 }
 }
 
 
@@ -882,7 +950,7 @@ static int kvm_request_guest_time_update(struct kvm_vcpu *v)
 
 
 	if (!vcpu->time_page)
 	if (!vcpu->time_page)
 		return 0;
 		return 0;
-	set_bit(KVM_REQ_KVMCLOCK_UPDATE, &v->requests);
+	kvm_make_request(KVM_REQ_KVMCLOCK_UPDATE, v);
 	return 1;
 	return 1;
 }
 }
 
 
@@ -1524,16 +1592,12 @@ static int __msr_io(struct kvm_vcpu *vcpu, struct kvm_msrs *msrs,
 {
 {
 	int i, idx;
 	int i, idx;
 
 
-	vcpu_load(vcpu);
-
 	idx = srcu_read_lock(&vcpu->kvm->srcu);
 	idx = srcu_read_lock(&vcpu->kvm->srcu);
 	for (i = 0; i < msrs->nmsrs; ++i)
 	for (i = 0; i < msrs->nmsrs; ++i)
 		if (do_msr(vcpu, entries[i].index, &entries[i].data))
 		if (do_msr(vcpu, entries[i].index, &entries[i].data))
 			break;
 			break;
 	srcu_read_unlock(&vcpu->kvm->srcu, idx);
 	srcu_read_unlock(&vcpu->kvm->srcu, idx);
 
 
-	vcpu_put(vcpu);
-
 	return i;
 	return i;
 }
 }
 
 
@@ -1618,6 +1682,7 @@ int kvm_dev_ioctl_check_extension(long ext)
 	case KVM_CAP_PCI_SEGMENT:
 	case KVM_CAP_PCI_SEGMENT:
 	case KVM_CAP_DEBUGREGS:
 	case KVM_CAP_DEBUGREGS:
 	case KVM_CAP_X86_ROBUST_SINGLESTEP:
 	case KVM_CAP_X86_ROBUST_SINGLESTEP:
+	case KVM_CAP_XSAVE:
 		r = 1;
 		r = 1;
 		break;
 		break;
 	case KVM_CAP_COALESCED_MMIO:
 	case KVM_CAP_COALESCED_MMIO:
@@ -1641,6 +1706,9 @@ int kvm_dev_ioctl_check_extension(long ext)
 	case KVM_CAP_MCE:
 	case KVM_CAP_MCE:
 		r = KVM_MAX_MCE_BANKS;
 		r = KVM_MAX_MCE_BANKS;
 		break;
 		break;
+	case KVM_CAP_XCRS:
+		r = cpu_has_xsave;
+		break;
 	default:
 	default:
 		r = 0;
 		r = 0;
 		break;
 		break;
@@ -1717,8 +1785,28 @@ out:
 	return r;
 }
 
+static void wbinvd_ipi(void *garbage)
+{
+	wbinvd();
+}
+
+static bool need_emulate_wbinvd(struct kvm_vcpu *vcpu)
+{
+	return vcpu->kvm->arch.iommu_domain &&
+		!(vcpu->kvm->arch.iommu_flags & KVM_IOMMU_CACHE_COHERENCY);
+}
+
 void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
 {
+	/* Address WBINVD may be executed by guest */
+	if (need_emulate_wbinvd(vcpu)) {
+		if (kvm_x86_ops->has_wbinvd_exit())
+			cpumask_set_cpu(cpu, vcpu->arch.wbinvd_dirty_mask);
+		else if (vcpu->cpu != -1 && vcpu->cpu != cpu)
+			smp_call_function_single(vcpu->cpu,
+					wbinvd_ipi, NULL, 1);
+	}
+
 	kvm_x86_ops->vcpu_load(vcpu, cpu);
 	if (unlikely(per_cpu(cpu_tsc_khz, cpu) == 0)) {
 		unsigned long khz = cpufreq_quick_get(cpu);
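A sketch of the scheduling decision added above for guests that need WBINVD emulated (assigned devices behind a non-coherent IOMMU): if the CPU can intercept WBINVD, just remember which physical CPUs may need a flush later; otherwise flush the old CPU immediately via IPI. Names and types are simplified stand-ins, not the kernel's.

#include <stdbool.h>
#include <stdio.h>

#define MAX_CPUS 8

struct toy_vcpu {
	int  last_cpu;                   /* -1 if never loaded */
	bool wbinvd_dirty[MAX_CPUS];     /* cpus that may need a flush */
};

static bool has_wbinvd_exit = true;  /* pretend the CPU intercepts WBINVD */

static void send_wbinvd_ipi(int cpu)
{
	printf("IPI: wbinvd on cpu %d\n", cpu);
}

static void vcpu_load(struct toy_vcpu *vcpu, int cpu)
{
	if (has_wbinvd_exit)
		vcpu->wbinvd_dirty[cpu] = true;       /* flush lazily on exit */
	else if (vcpu->last_cpu != -1 && vcpu->last_cpu != cpu)
		send_wbinvd_ipi(vcpu->last_cpu);      /* flush eagerly */
	vcpu->last_cpu = cpu;
}

int main(void)
{
	struct toy_vcpu v = { .last_cpu = -1 };

	vcpu_load(&v, 2);
	vcpu_load(&v, 5);
	printf("cpu2 dirty: %d\n", v.wbinvd_dirty[2]);
	return 0;
}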
@@ -1731,8 +1819,8 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
 
 
 void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
 void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
 {
 {
-	kvm_put_guest_fpu(vcpu);
 	kvm_x86_ops->vcpu_put(vcpu);
 	kvm_x86_ops->vcpu_put(vcpu);
+	kvm_put_guest_fpu(vcpu);
 }
 }
 
 
 static int is_efer_nx(void)
 static int is_efer_nx(void)
@@ -1781,7 +1869,6 @@ static int kvm_vcpu_ioctl_set_cpuid(struct kvm_vcpu *vcpu,
 	if (copy_from_user(cpuid_entries, entries,
 	if (copy_from_user(cpuid_entries, entries,
 			   cpuid->nent * sizeof(struct kvm_cpuid_entry)))
 			   cpuid->nent * sizeof(struct kvm_cpuid_entry)))
 		goto out_free;
 		goto out_free;
-	vcpu_load(vcpu);
 	for (i = 0; i < cpuid->nent; i++) {
 	for (i = 0; i < cpuid->nent; i++) {
 		vcpu->arch.cpuid_entries[i].function = cpuid_entries[i].function;
 		vcpu->arch.cpuid_entries[i].function = cpuid_entries[i].function;
 		vcpu->arch.cpuid_entries[i].eax = cpuid_entries[i].eax;
 		vcpu->arch.cpuid_entries[i].eax = cpuid_entries[i].eax;
@@ -1799,7 +1886,7 @@ static int kvm_vcpu_ioctl_set_cpuid(struct kvm_vcpu *vcpu,
 	r = 0;
 	r = 0;
 	kvm_apic_set_version(vcpu);
 	kvm_apic_set_version(vcpu);
 	kvm_x86_ops->cpuid_update(vcpu);
 	kvm_x86_ops->cpuid_update(vcpu);
-	vcpu_put(vcpu);
+	update_cpuid(vcpu);
 
 
 out_free:
 out_free:
 	vfree(cpuid_entries);
 	vfree(cpuid_entries);
@@ -1820,11 +1907,10 @@ static int kvm_vcpu_ioctl_set_cpuid2(struct kvm_vcpu *vcpu,
 	if (copy_from_user(&vcpu->arch.cpuid_entries, entries,
 	if (copy_from_user(&vcpu->arch.cpuid_entries, entries,
 			   cpuid->nent * sizeof(struct kvm_cpuid_entry2)))
 			   cpuid->nent * sizeof(struct kvm_cpuid_entry2)))
 		goto out;
 		goto out;
-	vcpu_load(vcpu);
 	vcpu->arch.cpuid_nent = cpuid->nent;
 	vcpu->arch.cpuid_nent = cpuid->nent;
 	kvm_apic_set_version(vcpu);
 	kvm_apic_set_version(vcpu);
 	kvm_x86_ops->cpuid_update(vcpu);
 	kvm_x86_ops->cpuid_update(vcpu);
-	vcpu_put(vcpu);
+	update_cpuid(vcpu);
 	return 0;
 	return 0;
 
 
 out:
 out:
@@ -1837,7 +1923,6 @@ static int kvm_vcpu_ioctl_get_cpuid2(struct kvm_vcpu *vcpu,
 {
 {
 	int r;
 	int r;
 
 
-	vcpu_load(vcpu);
 	r = -E2BIG;
 	r = -E2BIG;
 	if (cpuid->nent < vcpu->arch.cpuid_nent)
 	if (cpuid->nent < vcpu->arch.cpuid_nent)
 		goto out;
 		goto out;
@@ -1849,7 +1934,6 @@ static int kvm_vcpu_ioctl_get_cpuid2(struct kvm_vcpu *vcpu,
 
 
 out:
 out:
 	cpuid->nent = vcpu->arch.cpuid_nent;
 	cpuid->nent = vcpu->arch.cpuid_nent;
-	vcpu_put(vcpu);
 	return r;
 	return r;
 }
 }
 
 
@@ -1901,13 +1985,13 @@ static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
 		0 /* Reserved */ | f_lm | F(3DNOWEXT) | F(3DNOW);
 		0 /* Reserved */ | f_lm | F(3DNOWEXT) | F(3DNOW);
 	/* cpuid 1.ecx */
 	/* cpuid 1.ecx */
 	const u32 kvm_supported_word4_x86_features =
 	const u32 kvm_supported_word4_x86_features =
-		F(XMM3) | 0 /* Reserved, DTES64, MONITOR */ |
+		F(XMM3) | F(PCLMULQDQ) | 0 /* DTES64, MONITOR */ |
 		0 /* DS-CPL, VMX, SMX, EST */ |
 		0 /* DS-CPL, VMX, SMX, EST */ |
 		0 /* TM2 */ | F(SSSE3) | 0 /* CNXT-ID */ | 0 /* Reserved */ |
 		0 /* TM2 */ | F(SSSE3) | 0 /* CNXT-ID */ | 0 /* Reserved */ |
 		0 /* Reserved */ | F(CX16) | 0 /* xTPR Update, PDCM */ |
 		0 /* Reserved */ | F(CX16) | 0 /* xTPR Update, PDCM */ |
 		0 /* Reserved, DCA */ | F(XMM4_1) |
 		0 /* Reserved, DCA */ | F(XMM4_1) |
 		F(XMM4_2) | F(X2APIC) | F(MOVBE) | F(POPCNT) |
 		F(XMM4_2) | F(X2APIC) | F(MOVBE) | F(POPCNT) |
-		0 /* Reserved, XSAVE, OSXSAVE */;
+		0 /* Reserved, AES */ | F(XSAVE) | 0 /* OSXSAVE */ | F(AVX);
 	/* cpuid 0x80000001.ecx */
 	/* cpuid 0x80000001.ecx */
 	const u32 kvm_supported_word6_x86_features =
 	const u32 kvm_supported_word6_x86_features =
 		F(LAHF_LM) | F(CMP_LEGACY) | F(SVM) | 0 /* ExtApicSpace */ |
 		F(LAHF_LM) | F(CMP_LEGACY) | F(SVM) | 0 /* ExtApicSpace */ |
@@ -1922,7 +2006,7 @@ static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
 
 
 	switch (function) {
 	switch (function) {
 	case 0:
 	case 0:
-		entry->eax = min(entry->eax, (u32)0xb);
+		entry->eax = min(entry->eax, (u32)0xd);
 		break;
 		break;
 	case 1:
 	case 1:
 		entry->edx &= kvm_supported_word0_x86_features;
 		entry->edx &= kvm_supported_word0_x86_features;
@@ -1980,6 +2064,20 @@ static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
 		}
 		}
 		break;
 		break;
 	}
 	}
+	case 0xd: {
+		int i;
+
+		entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
+		for (i = 1; *nent < maxnent; ++i) {
+			if (entry[i - 1].eax == 0 && i != 2)
+				break;
+			do_cpuid_1_ent(&entry[i], function, i);
+			entry[i].flags |=
+			       KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
+			++*nent;
+		}
+		break;
+	}
 	case KVM_CPUID_SIGNATURE: {
 	case KVM_CPUID_SIGNATURE: {
 		char signature[12] = "KVMKVMKVM\0\0";
 		char signature[12] = "KVMKVMKVM\0\0";
 		u32 *sigptr = (u32 *)signature;
 		u32 *sigptr = (u32 *)signature;
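/*
 * Illustration only, not part of this patch: the new "case 0xd" above walks
 * CPUID leaf 0xd by sub-leaf to report the XSAVE state components KVM
 * supports (paired with the F(XSAVE)/F(AVX) bits now exposed in leaf 1).
 * A minimal userspace sketch of querying the same leaf directly, assuming a
 * GCC-style compiler on x86; raw_cpuid_count() is a hypothetical helper,
 * not a KVM or kernel API.
 */
#include <stdint.h>
#include <stdio.h>

static void raw_cpuid_count(uint32_t leaf, uint32_t subleaf,
			    uint32_t *eax, uint32_t *ebx,
			    uint32_t *ecx, uint32_t *edx)
{
	__asm__ volatile("cpuid"
			 : "=a" (*eax), "=b" (*ebx), "=c" (*ecx), "=d" (*edx)
			 : "0" (leaf), "2" (subleaf));
}

static void dump_xsave_leaf(void)
{
	uint32_t eax, ebx, ecx, edx, i;

	raw_cpuid_count(0xd, 0, &eax, &ebx, &ecx, &edx);
	printf("xstate bits 0x%08x%08x, max save area %u bytes\n",
	       edx, eax, ecx);

	for (i = 2; i < 63; i++) {
		raw_cpuid_count(0xd, i, &eax, &ebx, &ecx, &edx);
		if (eax)	/* size/offset of state component i */
			printf("  component %u: %u bytes at offset %u\n",
			       i, eax, ebx);
	}
}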
@@ -2081,9 +2179,7 @@ out:
 static int kvm_vcpu_ioctl_get_lapic(struct kvm_vcpu *vcpu,
 static int kvm_vcpu_ioctl_get_lapic(struct kvm_vcpu *vcpu,
 				    struct kvm_lapic_state *s)
 				    struct kvm_lapic_state *s)
 {
 {
-	vcpu_load(vcpu);
 	memcpy(s->regs, vcpu->arch.apic->regs, sizeof *s);
 	memcpy(s->regs, vcpu->arch.apic->regs, sizeof *s);
-	vcpu_put(vcpu);
 
 
 	return 0;
 	return 0;
 }
 }
@@ -2091,11 +2187,9 @@ static int kvm_vcpu_ioctl_get_lapic(struct kvm_vcpu *vcpu,
 static int kvm_vcpu_ioctl_set_lapic(struct kvm_vcpu *vcpu,
 static int kvm_vcpu_ioctl_set_lapic(struct kvm_vcpu *vcpu,
 				    struct kvm_lapic_state *s)
 				    struct kvm_lapic_state *s)
 {
 {
-	vcpu_load(vcpu);
 	memcpy(vcpu->arch.apic->regs, s->regs, sizeof *s);
 	memcpy(vcpu->arch.apic->regs, s->regs, sizeof *s);
 	kvm_apic_post_state_restore(vcpu);
 	kvm_apic_post_state_restore(vcpu);
 	update_cr8_intercept(vcpu);
 	update_cr8_intercept(vcpu);
-	vcpu_put(vcpu);
 
 
 	return 0;
 	return 0;
 }
 }
@@ -2107,20 +2201,15 @@ static int kvm_vcpu_ioctl_interrupt(struct kvm_vcpu *vcpu,
 		return -EINVAL;
 		return -EINVAL;
 	if (irqchip_in_kernel(vcpu->kvm))
 	if (irqchip_in_kernel(vcpu->kvm))
 		return -ENXIO;
 		return -ENXIO;
-	vcpu_load(vcpu);
 
 
 	kvm_queue_interrupt(vcpu, irq->irq, false);
 	kvm_queue_interrupt(vcpu, irq->irq, false);
 
 
-	vcpu_put(vcpu);
-
 	return 0;
 	return 0;
 }
 }
 
 
 static int kvm_vcpu_ioctl_nmi(struct kvm_vcpu *vcpu)
 static int kvm_vcpu_ioctl_nmi(struct kvm_vcpu *vcpu)
 {
 {
-	vcpu_load(vcpu);
 	kvm_inject_nmi(vcpu);
 	kvm_inject_nmi(vcpu);
-	vcpu_put(vcpu);
 
 
 	return 0;
 	return 0;
 }
 }
@@ -2140,7 +2229,6 @@ static int kvm_vcpu_ioctl_x86_setup_mce(struct kvm_vcpu *vcpu,
 	int r;
 	int r;
 	unsigned bank_num = mcg_cap & 0xff, bank;
 	unsigned bank_num = mcg_cap & 0xff, bank;
 
 
-	vcpu_load(vcpu);
 	r = -EINVAL;
 	r = -EINVAL;
 	if (!bank_num || bank_num >= KVM_MAX_MCE_BANKS)
 	if (!bank_num || bank_num >= KVM_MAX_MCE_BANKS)
 		goto out;
 		goto out;
@@ -2155,7 +2243,6 @@ static int kvm_vcpu_ioctl_x86_setup_mce(struct kvm_vcpu *vcpu,
 	for (bank = 0; bank < bank_num; bank++)
 	for (bank = 0; bank < bank_num; bank++)
 		vcpu->arch.mce_banks[bank*4] = ~(u64)0;
 		vcpu->arch.mce_banks[bank*4] = ~(u64)0;
 out:
 out:
-	vcpu_put(vcpu);
 	return r;
 	return r;
 }
 }
 
 
@@ -2188,7 +2275,7 @@ static int kvm_vcpu_ioctl_x86_set_mce(struct kvm_vcpu *vcpu,
 			printk(KERN_DEBUG "kvm: set_mce: "
 			printk(KERN_DEBUG "kvm: set_mce: "
 			       "injects mce exception while "
 			       "injects mce exception while "
 			       "previous one is in progress!\n");
 			       "previous one is in progress!\n");
-			set_bit(KVM_REQ_TRIPLE_FAULT, &vcpu->requests);
+			kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
 			return 0;
 			return 0;
 		}
 		}
 		if (banks[1] & MCI_STATUS_VAL)
 		if (banks[1] & MCI_STATUS_VAL)
@@ -2213,8 +2300,6 @@ static int kvm_vcpu_ioctl_x86_set_mce(struct kvm_vcpu *vcpu,
 static void kvm_vcpu_ioctl_x86_get_vcpu_events(struct kvm_vcpu *vcpu,
 static void kvm_vcpu_ioctl_x86_get_vcpu_events(struct kvm_vcpu *vcpu,
 					       struct kvm_vcpu_events *events)
 					       struct kvm_vcpu_events *events)
 {
 {
-	vcpu_load(vcpu);
-
 	events->exception.injected =
 	events->exception.injected =
 		vcpu->arch.exception.pending &&
 		vcpu->arch.exception.pending &&
 		!kvm_exception_is_soft(vcpu->arch.exception.nr);
 		!kvm_exception_is_soft(vcpu->arch.exception.nr);
@@ -2239,8 +2324,6 @@ static void kvm_vcpu_ioctl_x86_get_vcpu_events(struct kvm_vcpu *vcpu,
 	events->flags = (KVM_VCPUEVENT_VALID_NMI_PENDING
 	events->flags = (KVM_VCPUEVENT_VALID_NMI_PENDING
 			 | KVM_VCPUEVENT_VALID_SIPI_VECTOR
 			 | KVM_VCPUEVENT_VALID_SIPI_VECTOR
 			 | KVM_VCPUEVENT_VALID_SHADOW);
 			 | KVM_VCPUEVENT_VALID_SHADOW);
-
-	vcpu_put(vcpu);
 }
 }
 
 
 static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu,
 static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu,
@@ -2251,8 +2334,6 @@ static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu,
 			      | KVM_VCPUEVENT_VALID_SHADOW))
 			      | KVM_VCPUEVENT_VALID_SHADOW))
 		return -EINVAL;
 		return -EINVAL;
 
 
-	vcpu_load(vcpu);
-
 	vcpu->arch.exception.pending = events->exception.injected;
 	vcpu->arch.exception.pending = events->exception.injected;
 	vcpu->arch.exception.nr = events->exception.nr;
 	vcpu->arch.exception.nr = events->exception.nr;
 	vcpu->arch.exception.has_error_code = events->exception.has_error_code;
 	vcpu->arch.exception.has_error_code = events->exception.has_error_code;
@@ -2275,22 +2356,16 @@ static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu,
 	if (events->flags & KVM_VCPUEVENT_VALID_SIPI_VECTOR)
 	if (events->flags & KVM_VCPUEVENT_VALID_SIPI_VECTOR)
 		vcpu->arch.sipi_vector = events->sipi_vector;
 		vcpu->arch.sipi_vector = events->sipi_vector;
 
 
-	vcpu_put(vcpu);
-
 	return 0;
 	return 0;
 }
 }
 
 
 static void kvm_vcpu_ioctl_x86_get_debugregs(struct kvm_vcpu *vcpu,
 static void kvm_vcpu_ioctl_x86_get_debugregs(struct kvm_vcpu *vcpu,
 					     struct kvm_debugregs *dbgregs)
 					     struct kvm_debugregs *dbgregs)
 {
 {
-	vcpu_load(vcpu);
-
 	memcpy(dbgregs->db, vcpu->arch.db, sizeof(vcpu->arch.db));
 	memcpy(dbgregs->db, vcpu->arch.db, sizeof(vcpu->arch.db));
 	dbgregs->dr6 = vcpu->arch.dr6;
 	dbgregs->dr6 = vcpu->arch.dr6;
 	dbgregs->dr7 = vcpu->arch.dr7;
 	dbgregs->dr7 = vcpu->arch.dr7;
 	dbgregs->flags = 0;
 	dbgregs->flags = 0;
-
-	vcpu_put(vcpu);
 }
 }
 
 
 static int kvm_vcpu_ioctl_x86_set_debugregs(struct kvm_vcpu *vcpu,
 static int kvm_vcpu_ioctl_x86_set_debugregs(struct kvm_vcpu *vcpu,
@@ -2299,40 +2374,113 @@ static int kvm_vcpu_ioctl_x86_set_debugregs(struct kvm_vcpu *vcpu,
 	if (dbgregs->flags)
 	if (dbgregs->flags)
 		return -EINVAL;
 		return -EINVAL;
 
 
-	vcpu_load(vcpu);
-
 	memcpy(vcpu->arch.db, dbgregs->db, sizeof(vcpu->arch.db));
 	memcpy(vcpu->arch.db, dbgregs->db, sizeof(vcpu->arch.db));
 	vcpu->arch.dr6 = dbgregs->dr6;
 	vcpu->arch.dr6 = dbgregs->dr6;
 	vcpu->arch.dr7 = dbgregs->dr7;
 	vcpu->arch.dr7 = dbgregs->dr7;
 
 
-	vcpu_put(vcpu);
+	return 0;
+}
+
+static void kvm_vcpu_ioctl_x86_get_xsave(struct kvm_vcpu *vcpu,
+					 struct kvm_xsave *guest_xsave)
+{
+	if (cpu_has_xsave)
+		memcpy(guest_xsave->region,
+			&vcpu->arch.guest_fpu.state->xsave,
+			sizeof(struct xsave_struct));
+	else {
+		memcpy(guest_xsave->region,
+			&vcpu->arch.guest_fpu.state->fxsave,
+			sizeof(struct i387_fxsave_struct));
+		*(u64 *)&guest_xsave->region[XSAVE_HDR_OFFSET / sizeof(u32)] =
+			XSTATE_FPSSE;
+	}
+}
+
+static int kvm_vcpu_ioctl_x86_set_xsave(struct kvm_vcpu *vcpu,
+					struct kvm_xsave *guest_xsave)
+{
+	u64 xstate_bv =
+		*(u64 *)&guest_xsave->region[XSAVE_HDR_OFFSET / sizeof(u32)];
 
 
+	if (cpu_has_xsave)
+		memcpy(&vcpu->arch.guest_fpu.state->xsave,
+			guest_xsave->region, sizeof(struct xsave_struct));
+	else {
+		if (xstate_bv & ~XSTATE_FPSSE)
+			return -EINVAL;
+		memcpy(&vcpu->arch.guest_fpu.state->fxsave,
+			guest_xsave->region, sizeof(struct i387_fxsave_struct));
+	}
 	return 0;
 	return 0;
 }
 }
 
 
+static void kvm_vcpu_ioctl_x86_get_xcrs(struct kvm_vcpu *vcpu,
+					struct kvm_xcrs *guest_xcrs)
+{
+	if (!cpu_has_xsave) {
+		guest_xcrs->nr_xcrs = 0;
+		return;
+	}
+
+	guest_xcrs->nr_xcrs = 1;
+	guest_xcrs->flags = 0;
+	guest_xcrs->xcrs[0].xcr = XCR_XFEATURE_ENABLED_MASK;
+	guest_xcrs->xcrs[0].value = vcpu->arch.xcr0;
+}
+
+static int kvm_vcpu_ioctl_x86_set_xcrs(struct kvm_vcpu *vcpu,
+				       struct kvm_xcrs *guest_xcrs)
+{
+	int i, r = 0;
+
+	if (!cpu_has_xsave)
+		return -EINVAL;
+
+	if (guest_xcrs->nr_xcrs > KVM_MAX_XCRS || guest_xcrs->flags)
+		return -EINVAL;
+
+	for (i = 0; i < guest_xcrs->nr_xcrs; i++)
+		/* Only support XCR0 currently */
+		if (guest_xcrs->xcrs[0].xcr == XCR_XFEATURE_ENABLED_MASK) {
+			r = __kvm_set_xcr(vcpu, XCR_XFEATURE_ENABLED_MASK,
+				guest_xcrs->xcrs[0].value);
+			break;
+		}
+	if (r)
+		r = -EINVAL;
+	return r;
+}
+
 long kvm_arch_vcpu_ioctl(struct file *filp,
 long kvm_arch_vcpu_ioctl(struct file *filp,
 			 unsigned int ioctl, unsigned long arg)
 			 unsigned int ioctl, unsigned long arg)
 {
 {
 	struct kvm_vcpu *vcpu = filp->private_data;
 	struct kvm_vcpu *vcpu = filp->private_data;
 	void __user *argp = (void __user *)arg;
 	void __user *argp = (void __user *)arg;
 	int r;
 	int r;
-	struct kvm_lapic_state *lapic = NULL;
+	union {
+		struct kvm_lapic_state *lapic;
+		struct kvm_xsave *xsave;
+		struct kvm_xcrs *xcrs;
+		void *buffer;
+	} u;
 
 
+	u.buffer = NULL;
 	switch (ioctl) {
 	switch (ioctl) {
 	case KVM_GET_LAPIC: {
 	case KVM_GET_LAPIC: {
 		r = -EINVAL;
 		r = -EINVAL;
 		if (!vcpu->arch.apic)
 		if (!vcpu->arch.apic)
 			goto out;
 			goto out;
-		lapic = kzalloc(sizeof(struct kvm_lapic_state), GFP_KERNEL);
+		u.lapic = kzalloc(sizeof(struct kvm_lapic_state), GFP_KERNEL);
 
 
 		r = -ENOMEM;
 		r = -ENOMEM;
-		if (!lapic)
+		if (!u.lapic)
 			goto out;
 			goto out;
-		r = kvm_vcpu_ioctl_get_lapic(vcpu, lapic);
+		r = kvm_vcpu_ioctl_get_lapic(vcpu, u.lapic);
 		if (r)
 		if (r)
 			goto out;
 			goto out;
 		r = -EFAULT;
 		r = -EFAULT;
-		if (copy_to_user(argp, lapic, sizeof(struct kvm_lapic_state)))
+		if (copy_to_user(argp, u.lapic, sizeof(struct kvm_lapic_state)))
 			goto out;
 			goto out;
 		r = 0;
 		r = 0;
 		break;
 		break;
@@ -2341,14 +2489,14 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
 		r = -EINVAL;
 		r = -EINVAL;
 		if (!vcpu->arch.apic)
 		if (!vcpu->arch.apic)
 			goto out;
 			goto out;
-		lapic = kmalloc(sizeof(struct kvm_lapic_state), GFP_KERNEL);
+		u.lapic = kmalloc(sizeof(struct kvm_lapic_state), GFP_KERNEL);
 		r = -ENOMEM;
 		r = -ENOMEM;
-		if (!lapic)
+		if (!u.lapic)
 			goto out;
 			goto out;
 		r = -EFAULT;
 		r = -EFAULT;
-		if (copy_from_user(lapic, argp, sizeof(struct kvm_lapic_state)))
+		if (copy_from_user(u.lapic, argp, sizeof(struct kvm_lapic_state)))
 			goto out;
 			goto out;
-		r = kvm_vcpu_ioctl_set_lapic(vcpu, lapic);
+		r = kvm_vcpu_ioctl_set_lapic(vcpu, u.lapic);
 		if (r)
 		if (r)
 			goto out;
 			goto out;
 		r = 0;
 		r = 0;
@@ -2464,9 +2612,7 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
 		r = -EFAULT;
 		r = -EFAULT;
 		if (copy_from_user(&mce, argp, sizeof mce))
 		if (copy_from_user(&mce, argp, sizeof mce))
 			goto out;
 			goto out;
-		vcpu_load(vcpu);
 		r = kvm_vcpu_ioctl_x86_set_mce(vcpu, &mce);
 		r = kvm_vcpu_ioctl_x86_set_mce(vcpu, &mce);
-		vcpu_put(vcpu);
 		break;
 		break;
 	}
 	}
 	case KVM_GET_VCPU_EVENTS: {
 	case KVM_GET_VCPU_EVENTS: {
@@ -2513,11 +2659,67 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
 		r = kvm_vcpu_ioctl_x86_set_debugregs(vcpu, &dbgregs);
 		r = kvm_vcpu_ioctl_x86_set_debugregs(vcpu, &dbgregs);
 		break;
 		break;
 	}
 	}
+	case KVM_GET_XSAVE: {
+		u.xsave = kzalloc(sizeof(struct kvm_xsave), GFP_KERNEL);
+		r = -ENOMEM;
+		if (!u.xsave)
+			break;
+
+		kvm_vcpu_ioctl_x86_get_xsave(vcpu, u.xsave);
+
+		r = -EFAULT;
+		if (copy_to_user(argp, u.xsave, sizeof(struct kvm_xsave)))
+			break;
+		r = 0;
+		break;
+	}
+	case KVM_SET_XSAVE: {
+		u.xsave = kzalloc(sizeof(struct kvm_xsave), GFP_KERNEL);
+		r = -ENOMEM;
+		if (!u.xsave)
+			break;
+
+		r = -EFAULT;
+		if (copy_from_user(u.xsave, argp, sizeof(struct kvm_xsave)))
+			break;
+
+		r = kvm_vcpu_ioctl_x86_set_xsave(vcpu, u.xsave);
+		break;
+	}
+	case KVM_GET_XCRS: {
+		u.xcrs = kzalloc(sizeof(struct kvm_xcrs), GFP_KERNEL);
+		r = -ENOMEM;
+		if (!u.xcrs)
+			break;
+
+		kvm_vcpu_ioctl_x86_get_xcrs(vcpu, u.xcrs);
+
+		r = -EFAULT;
+		if (copy_to_user(argp, u.xcrs,
+				 sizeof(struct kvm_xcrs)))
+			break;
+		r = 0;
+		break;
+	}
+	case KVM_SET_XCRS: {
+		u.xcrs = kzalloc(sizeof(struct kvm_xcrs), GFP_KERNEL);
+		r = -ENOMEM;
+		if (!u.xcrs)
+			break;
+
+		r = -EFAULT;
+		if (copy_from_user(u.xcrs, argp,
+				   sizeof(struct kvm_xcrs)))
+			break;
+
+		r = kvm_vcpu_ioctl_x86_set_xcrs(vcpu, u.xcrs);
+		break;
+	}
 	default:
 	default:
 		r = -EINVAL;
 		r = -EINVAL;
 	}
 	}
 out:
 out:
-	kfree(lapic);
+	kfree(u.buffer);
 	return r;
 	return r;
 }
 }
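/*
 * Illustration only, not part of this patch: a minimal userspace sketch of
 * the KVM_GET_XSAVE/KVM_SET_XSAVE and KVM_GET_XCRS/KVM_SET_XCRS ioctls
 * handled above, assuming a <linux/kvm.h> that already carries the
 * kvm_xsave/kvm_xcrs definitions from this series and already-created
 * vcpu fds.
 */
#include <linux/kvm.h>
#include <sys/ioctl.h>
#include <stdio.h>

static int migrate_xstate(int src_vcpu_fd, int dst_vcpu_fd)
{
	struct kvm_xsave xsave;
	struct kvm_xcrs xcrs;

	if (ioctl(src_vcpu_fd, KVM_GET_XSAVE, &xsave) < 0)
		return -1;		/* kernel or headers too old */
	if (ioctl(src_vcpu_fd, KVM_GET_XCRS, &xcrs) < 0)
		return -1;

	if (xcrs.nr_xcrs)
		printf("xcr0 = 0x%llx\n",
		       (unsigned long long)xcrs.xcrs[0].value);

	/* Feed the blobs into the destination vcpu unchanged. */
	if (ioctl(dst_vcpu_fd, KVM_SET_XCRS, &xcrs) < 0)
		return -1;
	return ioctl(dst_vcpu_fd, KVM_SET_XSAVE, &xsave);
}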
 
 
@@ -2560,115 +2762,6 @@ static int kvm_vm_ioctl_get_nr_mmu_pages(struct kvm *kvm)
 	return kvm->arch.n_alloc_mmu_pages;
 	return kvm->arch.n_alloc_mmu_pages;
 }
 }
 
 
-gfn_t unalias_gfn_instantiation(struct kvm *kvm, gfn_t gfn)
-{
-	int i;
-	struct kvm_mem_alias *alias;
-	struct kvm_mem_aliases *aliases;
-
-	aliases = kvm_aliases(kvm);
-
-	for (i = 0; i < aliases->naliases; ++i) {
-		alias = &aliases->aliases[i];
-		if (alias->flags & KVM_ALIAS_INVALID)
-			continue;
-		if (gfn >= alias->base_gfn
-		    && gfn < alias->base_gfn + alias->npages)
-			return alias->target_gfn + gfn - alias->base_gfn;
-	}
-	return gfn;
-}
-
-gfn_t unalias_gfn(struct kvm *kvm, gfn_t gfn)
-{
-	int i;
-	struct kvm_mem_alias *alias;
-	struct kvm_mem_aliases *aliases;
-
-	aliases = kvm_aliases(kvm);
-
-	for (i = 0; i < aliases->naliases; ++i) {
-		alias = &aliases->aliases[i];
-		if (gfn >= alias->base_gfn
-		    && gfn < alias->base_gfn + alias->npages)
-			return alias->target_gfn + gfn - alias->base_gfn;
-	}
-	return gfn;
-}
-
-/*
- * Set a new alias region.  Aliases map a portion of physical memory into
- * another portion.  This is useful for memory windows, for example the PC
- * VGA region.
- */
-static int kvm_vm_ioctl_set_memory_alias(struct kvm *kvm,
-					 struct kvm_memory_alias *alias)
-{
-	int r, n;
-	struct kvm_mem_alias *p;
-	struct kvm_mem_aliases *aliases, *old_aliases;
-
-	r = -EINVAL;
-	/* General sanity checks */
-	if (alias->memory_size & (PAGE_SIZE - 1))
-		goto out;
-	if (alias->guest_phys_addr & (PAGE_SIZE - 1))
-		goto out;
-	if (alias->slot >= KVM_ALIAS_SLOTS)
-		goto out;
-	if (alias->guest_phys_addr + alias->memory_size
-	    < alias->guest_phys_addr)
-		goto out;
-	if (alias->target_phys_addr + alias->memory_size
-	    < alias->target_phys_addr)
-		goto out;
-
-	r = -ENOMEM;
-	aliases = kzalloc(sizeof(struct kvm_mem_aliases), GFP_KERNEL);
-	if (!aliases)
-		goto out;
-
-	mutex_lock(&kvm->slots_lock);
-
-	/* invalidate any gfn reference in case of deletion/shrinking */
-	memcpy(aliases, kvm->arch.aliases, sizeof(struct kvm_mem_aliases));
-	aliases->aliases[alias->slot].flags |= KVM_ALIAS_INVALID;
-	old_aliases = kvm->arch.aliases;
-	rcu_assign_pointer(kvm->arch.aliases, aliases);
-	synchronize_srcu_expedited(&kvm->srcu);
-	kvm_mmu_zap_all(kvm);
-	kfree(old_aliases);
-
-	r = -ENOMEM;
-	aliases = kzalloc(sizeof(struct kvm_mem_aliases), GFP_KERNEL);
-	if (!aliases)
-		goto out_unlock;
-
-	memcpy(aliases, kvm->arch.aliases, sizeof(struct kvm_mem_aliases));
-
-	p = &aliases->aliases[alias->slot];
-	p->base_gfn = alias->guest_phys_addr >> PAGE_SHIFT;
-	p->npages = alias->memory_size >> PAGE_SHIFT;
-	p->target_gfn = alias->target_phys_addr >> PAGE_SHIFT;
-	p->flags &= ~(KVM_ALIAS_INVALID);
-
-	for (n = KVM_ALIAS_SLOTS; n > 0; --n)
-		if (aliases->aliases[n - 1].npages)
-			break;
-	aliases->naliases = n;
-
-	old_aliases = kvm->arch.aliases;
-	rcu_assign_pointer(kvm->arch.aliases, aliases);
-	synchronize_srcu_expedited(&kvm->srcu);
-	kfree(old_aliases);
-	r = 0;
-
-out_unlock:
-	mutex_unlock(&kvm->slots_lock);
-out:
-	return r;
-}
-
 static int kvm_vm_ioctl_get_irqchip(struct kvm *kvm, struct kvm_irqchip *chip)
 static int kvm_vm_ioctl_get_irqchip(struct kvm *kvm, struct kvm_irqchip *chip)
 {
 {
 	int r;
 	int r;
@@ -2797,7 +2890,6 @@ int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm,
 	struct kvm_memory_slot *memslot;
 	struct kvm_memory_slot *memslot;
 	unsigned long n;
 	unsigned long n;
 	unsigned long is_dirty = 0;
 	unsigned long is_dirty = 0;
-	unsigned long *dirty_bitmap = NULL;
 
 
 	mutex_lock(&kvm->slots_lock);
 	mutex_lock(&kvm->slots_lock);
 
 
@@ -2812,27 +2904,30 @@ int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm,
 
 
 	n = kvm_dirty_bitmap_bytes(memslot);
 	n = kvm_dirty_bitmap_bytes(memslot);
 
 
-	r = -ENOMEM;
-	dirty_bitmap = vmalloc(n);
-	if (!dirty_bitmap)
-		goto out;
-	memset(dirty_bitmap, 0, n);
-
 	for (i = 0; !is_dirty && i < n/sizeof(long); i++)
 	for (i = 0; !is_dirty && i < n/sizeof(long); i++)
 		is_dirty = memslot->dirty_bitmap[i];
 		is_dirty = memslot->dirty_bitmap[i];
 
 
 	/* If nothing is dirty, don't bother messing with page tables. */
 	/* If nothing is dirty, don't bother messing with page tables. */
 	if (is_dirty) {
 	if (is_dirty) {
 		struct kvm_memslots *slots, *old_slots;
 		struct kvm_memslots *slots, *old_slots;
+		unsigned long *dirty_bitmap;
 
 
 		spin_lock(&kvm->mmu_lock);
 		spin_lock(&kvm->mmu_lock);
 		kvm_mmu_slot_remove_write_access(kvm, log->slot);
 		kvm_mmu_slot_remove_write_access(kvm, log->slot);
 		spin_unlock(&kvm->mmu_lock);
 		spin_unlock(&kvm->mmu_lock);
 
 
-		slots = kzalloc(sizeof(struct kvm_memslots), GFP_KERNEL);
-		if (!slots)
-			goto out_free;
+		r = -ENOMEM;
+		dirty_bitmap = vmalloc(n);
+		if (!dirty_bitmap)
+			goto out;
+		memset(dirty_bitmap, 0, n);
 
 
+		r = -ENOMEM;
+		slots = kzalloc(sizeof(struct kvm_memslots), GFP_KERNEL);
+		if (!slots) {
+			vfree(dirty_bitmap);
+			goto out;
+		}
 		memcpy(slots, kvm->memslots, sizeof(struct kvm_memslots));
 		memcpy(slots, kvm->memslots, sizeof(struct kvm_memslots));
 		slots->memslots[log->slot].dirty_bitmap = dirty_bitmap;
 		slots->memslots[log->slot].dirty_bitmap = dirty_bitmap;
 
 
@@ -2841,13 +2936,20 @@ int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm,
 		synchronize_srcu_expedited(&kvm->srcu);
 		synchronize_srcu_expedited(&kvm->srcu);
 		dirty_bitmap = old_slots->memslots[log->slot].dirty_bitmap;
 		dirty_bitmap = old_slots->memslots[log->slot].dirty_bitmap;
 		kfree(old_slots);
 		kfree(old_slots);
+
+		r = -EFAULT;
+		if (copy_to_user(log->dirty_bitmap, dirty_bitmap, n)) {
+			vfree(dirty_bitmap);
+			goto out;
+		}
+		vfree(dirty_bitmap);
+	} else {
+		r = -EFAULT;
+		if (clear_user(log->dirty_bitmap, n))
+			goto out;
 	}
 	}
 
 
 	r = 0;
 	r = 0;
-	if (copy_to_user(log->dirty_bitmap, dirty_bitmap, n))
-		r = -EFAULT;
-out_free:
-	vfree(dirty_bitmap);
 out:
 out:
 	mutex_unlock(&kvm->slots_lock);
 	mutex_unlock(&kvm->slots_lock);
 	return r;
 	return r;
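/*
 * Illustration only, not part of this patch: the rework above allocates the
 * scratch bitmap only when the slot is actually dirty and clear_user()s the
 * caller's buffer otherwise, but the userspace contract of
 * KVM_GET_DIRTY_LOG is unchanged.  A minimal sketch of the caller side,
 * assuming the caller sized the bitmap at one bit per page in the slot.
 */
#include <linux/kvm.h>
#include <sys/ioctl.h>
#include <string.h>

static int fetch_dirty_log(int vm_fd, unsigned int slot,
			   unsigned long *bitmap, size_t bitmap_bytes)
{
	struct kvm_dirty_log log;

	memset(bitmap, 0, bitmap_bytes);	/* the ioctl overwrites it anyway */
	memset(&log, 0, sizeof(log));
	log.slot = slot;
	log.dirty_bitmap = bitmap;

	/*
	 * Returns the pages dirtied since the previous call and re-arms
	 * write protection so that future writes are tracked again.
	 */
	return ioctl(vm_fd, KVM_GET_DIRTY_LOG, &log);
}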
@@ -2867,7 +2969,6 @@ long kvm_arch_vm_ioctl(struct file *filp,
 	union {
 	union {
 		struct kvm_pit_state ps;
 		struct kvm_pit_state ps;
 		struct kvm_pit_state2 ps2;
 		struct kvm_pit_state2 ps2;
-		struct kvm_memory_alias alias;
 		struct kvm_pit_config pit_config;
 		struct kvm_pit_config pit_config;
 	} u;
 	} u;
 
 
@@ -2888,22 +2989,6 @@ long kvm_arch_vm_ioctl(struct file *filp,
 			goto out;
 			goto out;
 		break;
 		break;
 	}
 	}
-	case KVM_SET_MEMORY_REGION: {
-		struct kvm_memory_region kvm_mem;
-		struct kvm_userspace_memory_region kvm_userspace_mem;
-
-		r = -EFAULT;
-		if (copy_from_user(&kvm_mem, argp, sizeof kvm_mem))
-			goto out;
-		kvm_userspace_mem.slot = kvm_mem.slot;
-		kvm_userspace_mem.flags = kvm_mem.flags;
-		kvm_userspace_mem.guest_phys_addr = kvm_mem.guest_phys_addr;
-		kvm_userspace_mem.memory_size = kvm_mem.memory_size;
-		r = kvm_vm_ioctl_set_memory_region(kvm, &kvm_userspace_mem, 0);
-		if (r)
-			goto out;
-		break;
-	}
 	case KVM_SET_NR_MMU_PAGES:
 	case KVM_SET_NR_MMU_PAGES:
 		r = kvm_vm_ioctl_set_nr_mmu_pages(kvm, arg);
 		r = kvm_vm_ioctl_set_nr_mmu_pages(kvm, arg);
 		if (r)
 		if (r)
@@ -2912,14 +2997,6 @@ long kvm_arch_vm_ioctl(struct file *filp,
 	case KVM_GET_NR_MMU_PAGES:
 	case KVM_GET_NR_MMU_PAGES:
 		r = kvm_vm_ioctl_get_nr_mmu_pages(kvm);
 		r = kvm_vm_ioctl_get_nr_mmu_pages(kvm);
 		break;
 		break;
-	case KVM_SET_MEMORY_ALIAS:
-		r = -EFAULT;
-		if (copy_from_user(&u.alias, argp, sizeof(struct kvm_memory_alias)))
-			goto out;
-		r = kvm_vm_ioctl_set_memory_alias(kvm, &u.alias);
-		if (r)
-			goto out;
-		break;
 	case KVM_CREATE_IRQCHIP: {
 	case KVM_CREATE_IRQCHIP: {
 		struct kvm_pic *vpic;
 		struct kvm_pic *vpic;
 
 
@@ -3259,7 +3336,7 @@ static int kvm_read_guest_virt_helper(gva_t addr, void *val, unsigned int bytes,
 		}
 		}
 		ret = kvm_read_guest(vcpu->kvm, gpa, data, toread);
 		ret = kvm_read_guest(vcpu->kvm, gpa, data, toread);
 		if (ret < 0) {
 		if (ret < 0) {
-			r = X86EMUL_UNHANDLEABLE;
+			r = X86EMUL_IO_NEEDED;
 			goto out;
 			goto out;
 		}
 		}
 
 
@@ -3315,7 +3392,7 @@ static int kvm_write_guest_virt_system(gva_t addr, void *val,
 		}
 		}
 		ret = kvm_write_guest(vcpu->kvm, gpa, data, towrite);
 		ret = kvm_write_guest(vcpu->kvm, gpa, data, towrite);
 		if (ret < 0) {
 		if (ret < 0) {
-			r = X86EMUL_UNHANDLEABLE;
+			r = X86EMUL_IO_NEEDED;
 			goto out;
 			goto out;
 		}
 		}
 
 
@@ -3330,10 +3407,10 @@ out:
 static int emulator_read_emulated(unsigned long addr,
 static int emulator_read_emulated(unsigned long addr,
 				  void *val,
 				  void *val,
 				  unsigned int bytes,
 				  unsigned int bytes,
+				  unsigned int *error_code,
 				  struct kvm_vcpu *vcpu)
 				  struct kvm_vcpu *vcpu)
 {
 {
 	gpa_t                 gpa;
 	gpa_t                 gpa;
-	u32 error_code;
 
 
 	if (vcpu->mmio_read_completed) {
 	if (vcpu->mmio_read_completed) {
 		memcpy(val, vcpu->mmio_data, bytes);
 		memcpy(val, vcpu->mmio_data, bytes);
@@ -3343,12 +3420,10 @@ static int emulator_read_emulated(unsigned long addr,
 		return X86EMUL_CONTINUE;
 		return X86EMUL_CONTINUE;
 	}
 	}
 
 
-	gpa = kvm_mmu_gva_to_gpa_read(vcpu, addr, &error_code);
+	gpa = kvm_mmu_gva_to_gpa_read(vcpu, addr, error_code);
 
 
-	if (gpa == UNMAPPED_GVA) {
-		kvm_inject_page_fault(vcpu, addr, error_code);
+	if (gpa == UNMAPPED_GVA)
 		return X86EMUL_PROPAGATE_FAULT;
 		return X86EMUL_PROPAGATE_FAULT;
-	}
 
 
 	/* For APIC access vmexit */
 	/* For APIC access vmexit */
 	if ((gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE)
 	if ((gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE)
@@ -3370,11 +3445,12 @@ mmio:
 	trace_kvm_mmio(KVM_TRACE_MMIO_READ_UNSATISFIED, bytes, gpa, 0);
 	trace_kvm_mmio(KVM_TRACE_MMIO_READ_UNSATISFIED, bytes, gpa, 0);
 
 
 	vcpu->mmio_needed = 1;
 	vcpu->mmio_needed = 1;
-	vcpu->mmio_phys_addr = gpa;
-	vcpu->mmio_size = bytes;
-	vcpu->mmio_is_write = 0;
+	vcpu->run->exit_reason = KVM_EXIT_MMIO;
+	vcpu->run->mmio.phys_addr = vcpu->mmio_phys_addr = gpa;
+	vcpu->run->mmio.len = vcpu->mmio_size = bytes;
+	vcpu->run->mmio.is_write = vcpu->mmio_is_write = 0;
 
 
-	return X86EMUL_UNHANDLEABLE;
+	return X86EMUL_IO_NEEDED;
 }
 }
 
 
 int emulator_write_phys(struct kvm_vcpu *vcpu, gpa_t gpa,
 int emulator_write_phys(struct kvm_vcpu *vcpu, gpa_t gpa,
@@ -3392,17 +3468,15 @@ int emulator_write_phys(struct kvm_vcpu *vcpu, gpa_t gpa,
 static int emulator_write_emulated_onepage(unsigned long addr,
 static int emulator_write_emulated_onepage(unsigned long addr,
 					   const void *val,
 					   const void *val,
 					   unsigned int bytes,
 					   unsigned int bytes,
+					   unsigned int *error_code,
 					   struct kvm_vcpu *vcpu)
 					   struct kvm_vcpu *vcpu)
 {
 {
 	gpa_t                 gpa;
 	gpa_t                 gpa;
-	u32 error_code;
 
 
-	gpa = kvm_mmu_gva_to_gpa_write(vcpu, addr, &error_code);
+	gpa = kvm_mmu_gva_to_gpa_write(vcpu, addr, error_code);
 
 
-	if (gpa == UNMAPPED_GVA) {
-		kvm_inject_page_fault(vcpu, addr, error_code);
+	if (gpa == UNMAPPED_GVA)
 		return X86EMUL_PROPAGATE_FAULT;
 		return X86EMUL_PROPAGATE_FAULT;
-	}
 
 
 	/* For APIC access vmexit */
 	/* For APIC access vmexit */
 	if ((gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE)
 	if ((gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE)
@@ -3420,10 +3494,11 @@ mmio:
 		return X86EMUL_CONTINUE;
 		return X86EMUL_CONTINUE;
 
 
 	vcpu->mmio_needed = 1;
 	vcpu->mmio_needed = 1;
-	vcpu->mmio_phys_addr = gpa;
-	vcpu->mmio_size = bytes;
-	vcpu->mmio_is_write = 1;
-	memcpy(vcpu->mmio_data, val, bytes);
+	vcpu->run->exit_reason = KVM_EXIT_MMIO;
+	vcpu->run->mmio.phys_addr = vcpu->mmio_phys_addr = gpa;
+	vcpu->run->mmio.len = vcpu->mmio_size = bytes;
+	vcpu->run->mmio.is_write = vcpu->mmio_is_write = 1;
+	memcpy(vcpu->run->mmio.data, val, bytes);
 
 
 	return X86EMUL_CONTINUE;
 	return X86EMUL_CONTINUE;
 }
 }
@@ -3431,6 +3506,7 @@ mmio:
 int emulator_write_emulated(unsigned long addr,
 int emulator_write_emulated(unsigned long addr,
 			    const void *val,
 			    const void *val,
 			    unsigned int bytes,
 			    unsigned int bytes,
+			    unsigned int *error_code,
 			    struct kvm_vcpu *vcpu)
 			    struct kvm_vcpu *vcpu)
 {
 {
 	/* Crossing a page boundary? */
 	/* Crossing a page boundary? */
@@ -3438,16 +3514,17 @@ int emulator_write_emulated(unsigned long addr,
 		int rc, now;
 		int rc, now;
 
 
 		now = -addr & ~PAGE_MASK;
 		now = -addr & ~PAGE_MASK;
-		rc = emulator_write_emulated_onepage(addr, val, now, vcpu);
+		rc = emulator_write_emulated_onepage(addr, val, now, error_code,
+						     vcpu);
 		if (rc != X86EMUL_CONTINUE)
 		if (rc != X86EMUL_CONTINUE)
 			return rc;
 			return rc;
 		addr += now;
 		addr += now;
 		val += now;
 		val += now;
 		bytes -= now;
 		bytes -= now;
 	}
 	}
-	return emulator_write_emulated_onepage(addr, val, bytes, vcpu);
+	return emulator_write_emulated_onepage(addr, val, bytes, error_code,
+					       vcpu);
 }
 }
-EXPORT_SYMBOL_GPL(emulator_write_emulated);
 
 
 #define CMPXCHG_TYPE(t, ptr, old, new) \
 #define CMPXCHG_TYPE(t, ptr, old, new) \
 	(cmpxchg((t *)(ptr), *(t *)(old), *(t *)(new)) == *(t *)(old))
 	(cmpxchg((t *)(ptr), *(t *)(old), *(t *)(new)) == *(t *)(old))
@@ -3463,6 +3540,7 @@ static int emulator_cmpxchg_emulated(unsigned long addr,
 				     const void *old,
 				     const void *old,
 				     const void *new,
 				     const void *new,
 				     unsigned int bytes,
 				     unsigned int bytes,
+				     unsigned int *error_code,
 				     struct kvm_vcpu *vcpu)
 				     struct kvm_vcpu *vcpu)
 {
 {
 	gpa_t gpa;
 	gpa_t gpa;
@@ -3484,6 +3562,10 @@ static int emulator_cmpxchg_emulated(unsigned long addr,
 		goto emul_write;
 		goto emul_write;
 
 
 	page = gfn_to_page(vcpu->kvm, gpa >> PAGE_SHIFT);
 	page = gfn_to_page(vcpu->kvm, gpa >> PAGE_SHIFT);
+	if (is_error_page(page)) {
+		kvm_release_page_clean(page);
+		goto emul_write;
+	}
 
 
 	kaddr = kmap_atomic(page, KM_USER0);
 	kaddr = kmap_atomic(page, KM_USER0);
 	kaddr += offset_in_page(gpa);
 	kaddr += offset_in_page(gpa);
@@ -3516,7 +3598,7 @@ static int emulator_cmpxchg_emulated(unsigned long addr,
 emul_write:
 emul_write:
 	printk_once(KERN_WARNING "kvm: emulating exchange as write\n");
 	printk_once(KERN_WARNING "kvm: emulating exchange as write\n");
 
 
-	return emulator_write_emulated(addr, new, bytes, vcpu);
+	return emulator_write_emulated(addr, new, bytes, error_code, vcpu);
 }
 }
 
 
 static int kernel_pio(struct kvm_vcpu *vcpu, void *pd)
 static int kernel_pio(struct kvm_vcpu *vcpu, void *pd)
@@ -3604,42 +3686,38 @@ int emulate_invlpg(struct kvm_vcpu *vcpu, gva_t address)
 	return X86EMUL_CONTINUE;
 	return X86EMUL_CONTINUE;
 }
 }
 
 
-int emulate_clts(struct kvm_vcpu *vcpu)
+int kvm_emulate_wbinvd(struct kvm_vcpu *vcpu)
 {
 {
-	kvm_x86_ops->set_cr0(vcpu, kvm_read_cr0_bits(vcpu, ~X86_CR0_TS));
-	kvm_x86_ops->fpu_activate(vcpu);
+	if (!need_emulate_wbinvd(vcpu))
+		return X86EMUL_CONTINUE;
+
+	if (kvm_x86_ops->has_wbinvd_exit()) {
+		smp_call_function_many(vcpu->arch.wbinvd_dirty_mask,
+				wbinvd_ipi, NULL, 1);
+		cpumask_clear(vcpu->arch.wbinvd_dirty_mask);
+	}
+	wbinvd();
 	return X86EMUL_CONTINUE;
 	return X86EMUL_CONTINUE;
 }
 }
+EXPORT_SYMBOL_GPL(kvm_emulate_wbinvd);
 
 
-int emulator_get_dr(struct x86_emulate_ctxt *ctxt, int dr, unsigned long *dest)
+int emulate_clts(struct kvm_vcpu *vcpu)
 {
 {
-	return kvm_get_dr(ctxt->vcpu, dr, dest);
+	kvm_x86_ops->set_cr0(vcpu, kvm_read_cr0_bits(vcpu, ~X86_CR0_TS));
+	kvm_x86_ops->fpu_activate(vcpu);
+	return X86EMUL_CONTINUE;
 }
 }
 
 
-int emulator_set_dr(struct x86_emulate_ctxt *ctxt, int dr, unsigned long value)
+int emulator_get_dr(int dr, unsigned long *dest, struct kvm_vcpu *vcpu)
 {
 {
-	unsigned long mask = (ctxt->mode == X86EMUL_MODE_PROT64) ? ~0ULL : ~0U;
-
-	return kvm_set_dr(ctxt->vcpu, dr, value & mask);
+	return _kvm_get_dr(vcpu, dr, dest);
 }
 }
 
 
-void kvm_report_emulation_failure(struct kvm_vcpu *vcpu, const char *context)
+int emulator_set_dr(int dr, unsigned long value, struct kvm_vcpu *vcpu)
 {
 {
-	u8 opcodes[4];
-	unsigned long rip = kvm_rip_read(vcpu);
-	unsigned long rip_linear;
-
-	if (!printk_ratelimit())
-		return;
 
 
-	rip_linear = rip + get_segment_base(vcpu, VCPU_SREG_CS);
-
-	kvm_read_guest_virt(rip_linear, (void *)opcodes, 4, vcpu, NULL);
-
-	printk(KERN_ERR "emulation failed (%s) rip %lx %02x %02x %02x %02x\n",
-	       context, rip, opcodes[0], opcodes[1], opcodes[2], opcodes[3]);
+	return __kvm_set_dr(vcpu, dr, value);
 }
 }
-EXPORT_SYMBOL_GPL(kvm_report_emulation_failure);
 
 
 static u64 mk_cr_64(u64 curr_cr, u32 new_val)
 static u64 mk_cr_64(u64 curr_cr, u32 new_val)
 {
 {
@@ -3674,27 +3752,32 @@ static unsigned long emulator_get_cr(int cr, struct kvm_vcpu *vcpu)
 	return value;
 	return value;
 }
 }
 
 
-static void emulator_set_cr(int cr, unsigned long val, struct kvm_vcpu *vcpu)
+static int emulator_set_cr(int cr, unsigned long val, struct kvm_vcpu *vcpu)
 {
 {
+	int res = 0;
+
 	switch (cr) {
 	switch (cr) {
 	case 0:
 	case 0:
-		kvm_set_cr0(vcpu, mk_cr_64(kvm_read_cr0(vcpu), val));
+		res = kvm_set_cr0(vcpu, mk_cr_64(kvm_read_cr0(vcpu), val));
 		break;
 		break;
 	case 2:
 	case 2:
 		vcpu->arch.cr2 = val;
 		vcpu->arch.cr2 = val;
 		break;
 		break;
 	case 3:
 	case 3:
-		kvm_set_cr3(vcpu, val);
+		res = kvm_set_cr3(vcpu, val);
 		break;
 		break;
 	case 4:
 	case 4:
-		kvm_set_cr4(vcpu, mk_cr_64(kvm_read_cr4(vcpu), val));
+		res = kvm_set_cr4(vcpu, mk_cr_64(kvm_read_cr4(vcpu), val));
 		break;
 		break;
 	case 8:
 	case 8:
-		kvm_set_cr8(vcpu, val & 0xfUL);
+		res = __kvm_set_cr8(vcpu, val & 0xfUL);
 		break;
 		break;
 	default:
 	default:
 		vcpu_printf(vcpu, "%s: unexpected cr %u\n", __func__, cr);
 		vcpu_printf(vcpu, "%s: unexpected cr %u\n", __func__, cr);
+		res = -1;
 	}
 	}
+
+	return res;
 }
 }
 
 
 static int emulator_get_cpl(struct kvm_vcpu *vcpu)
 static int emulator_get_cpl(struct kvm_vcpu *vcpu)
@@ -3707,6 +3790,12 @@ static void emulator_get_gdt(struct desc_ptr *dt, struct kvm_vcpu *vcpu)
 	kvm_x86_ops->get_gdt(vcpu, dt);
 	kvm_x86_ops->get_gdt(vcpu, dt);
 }
 }
 
 
+static unsigned long emulator_get_cached_segment_base(int seg,
+						      struct kvm_vcpu *vcpu)
+{
+	return get_segment_base(vcpu, seg);
+}
+
 static bool emulator_get_cached_descriptor(struct desc_struct *desc, int seg,
 static bool emulator_get_cached_descriptor(struct desc_struct *desc, int seg,
 					   struct kvm_vcpu *vcpu)
 					   struct kvm_vcpu *vcpu)
 {
 {
@@ -3779,11 +3868,6 @@ static void emulator_set_segment_selector(u16 sel, int seg,
 	kvm_set_segment(vcpu, &kvm_seg, seg);
 	kvm_set_segment(vcpu, &kvm_seg, seg);
 }
 }
 
 
-static void emulator_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags)
-{
-	kvm_x86_ops->set_rflags(vcpu, rflags);
-}
-
 static struct x86_emulate_ops emulate_ops = {
 static struct x86_emulate_ops emulate_ops = {
 	.read_std            = kvm_read_guest_virt_system,
 	.read_std            = kvm_read_guest_virt_system,
 	.write_std           = kvm_write_guest_virt_system,
 	.write_std           = kvm_write_guest_virt_system,
@@ -3797,11 +3881,15 @@ static struct x86_emulate_ops emulate_ops = {
 	.set_cached_descriptor = emulator_set_cached_descriptor,
 	.set_cached_descriptor = emulator_set_cached_descriptor,
 	.get_segment_selector = emulator_get_segment_selector,
 	.get_segment_selector = emulator_get_segment_selector,
 	.set_segment_selector = emulator_set_segment_selector,
 	.set_segment_selector = emulator_set_segment_selector,
+	.get_cached_segment_base = emulator_get_cached_segment_base,
 	.get_gdt             = emulator_get_gdt,
 	.get_gdt             = emulator_get_gdt,
 	.get_cr              = emulator_get_cr,
 	.get_cr              = emulator_get_cr,
 	.set_cr              = emulator_set_cr,
 	.set_cr              = emulator_set_cr,
 	.cpl                 = emulator_get_cpl,
 	.cpl                 = emulator_get_cpl,
-	.set_rflags          = emulator_set_rflags,
+	.get_dr              = emulator_get_dr,
+	.set_dr              = emulator_set_dr,
+	.set_msr             = kvm_set_msr,
+	.get_msr             = kvm_get_msr,
 };
 };
 
 
 static void cache_all_regs(struct kvm_vcpu *vcpu)
 static void cache_all_regs(struct kvm_vcpu *vcpu)
@@ -3812,14 +3900,75 @@ static void cache_all_regs(struct kvm_vcpu *vcpu)
 	vcpu->arch.regs_dirty = ~0;
 	vcpu->arch.regs_dirty = ~0;
 }
 }
 
 
+static void toggle_interruptibility(struct kvm_vcpu *vcpu, u32 mask)
+{
+	u32 int_shadow = kvm_x86_ops->get_interrupt_shadow(vcpu, mask);
+	/*
+	 * An sti; sti sequence only disables interrupts for the first
+	 * instruction. So, if the last instruction, be it emulated or
+	 * not, left the system with the INT_STI flag enabled, it
+	 * means that the last instruction was an sti. We should not
+	 * leave the flag on in this case. The same goes for mov ss.
+	 */
+	if (!(int_shadow & mask))
+		kvm_x86_ops->set_interrupt_shadow(vcpu, mask);
+}
+
+static void inject_emulated_exception(struct kvm_vcpu *vcpu)
+{
+	struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt;
+	if (ctxt->exception == PF_VECTOR)
+		kvm_inject_page_fault(vcpu, ctxt->cr2, ctxt->error_code);
+	else if (ctxt->error_code_valid)
+		kvm_queue_exception_e(vcpu, ctxt->exception, ctxt->error_code);
+	else
+		kvm_queue_exception(vcpu, ctxt->exception);
+}
+
+static int handle_emulation_failure(struct kvm_vcpu *vcpu)
+{
+	++vcpu->stat.insn_emulation_fail;
+	trace_kvm_emulate_insn_failed(vcpu);
+	vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
+	vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION;
+	vcpu->run->internal.ndata = 0;
+	kvm_queue_exception(vcpu, UD_VECTOR);
+	return EMULATE_FAIL;
+}
+
+static bool reexecute_instruction(struct kvm_vcpu *vcpu, gva_t gva)
+{
+	gpa_t gpa;
+
+	if (tdp_enabled)
+		return false;
+
+	/*
+	 * If emulation was due to an access to a shadowed page table
+	 * and it failed, try to unshadow the page and re-enter the
+	 * guest to let the CPU execute the instruction.
+	 */
+	if (kvm_mmu_unprotect_page_virt(vcpu, gva))
+		return true;
+
+	gpa = kvm_mmu_gva_to_gpa_system(vcpu, gva, NULL);
+
+	if (gpa == UNMAPPED_GVA)
+		return true; /* let the CPU generate the fault */
+
+	if (!kvm_is_error_hva(gfn_to_hva(vcpu->kvm, gpa >> PAGE_SHIFT)))
+		return true;
+
+	return false;
+}
+
 int emulate_instruction(struct kvm_vcpu *vcpu,
 int emulate_instruction(struct kvm_vcpu *vcpu,
 			unsigned long cr2,
 			unsigned long cr2,
 			u16 error_code,
 			u16 error_code,
 			int emulation_type)
 			int emulation_type)
 {
 {
-	int r, shadow_mask;
-	struct decode_cache *c;
-	struct kvm_run *run = vcpu->run;
+	int r;
+	struct decode_cache *c = &vcpu->arch.emulate_ctxt.decode;
 
 
 	kvm_clear_exception_queue(vcpu);
 	kvm_clear_exception_queue(vcpu);
 	vcpu->arch.mmio_fault_cr2 = cr2;
 	vcpu->arch.mmio_fault_cr2 = cr2;
@@ -3831,8 +3980,6 @@ int emulate_instruction(struct kvm_vcpu *vcpu,
 	 */
 	 */
 	cache_all_regs(vcpu);
 	cache_all_regs(vcpu);
 
 
-	vcpu->mmio_is_write = 0;
-
 	if (!(emulation_type & EMULTYPE_NO_DECODE)) {
 	if (!(emulation_type & EMULTYPE_NO_DECODE)) {
 		int cs_db, cs_l;
 		int cs_db, cs_l;
 		kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l);
 		kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l);
@@ -3846,13 +3993,16 @@ int emulate_instruction(struct kvm_vcpu *vcpu,
 			? X86EMUL_MODE_VM86 : cs_l
 			? X86EMUL_MODE_VM86 : cs_l
 			? X86EMUL_MODE_PROT64 :	cs_db
 			? X86EMUL_MODE_PROT64 :	cs_db
 			? X86EMUL_MODE_PROT32 : X86EMUL_MODE_PROT16;
 			? X86EMUL_MODE_PROT32 : X86EMUL_MODE_PROT16;
+		memset(c, 0, sizeof(struct decode_cache));
+		memcpy(c->regs, vcpu->arch.regs, sizeof c->regs);
+		vcpu->arch.emulate_ctxt.interruptibility = 0;
+		vcpu->arch.emulate_ctxt.exception = -1;
 
 
 		r = x86_decode_insn(&vcpu->arch.emulate_ctxt, &emulate_ops);
 		r = x86_decode_insn(&vcpu->arch.emulate_ctxt, &emulate_ops);
 		trace_kvm_emulate_insn_start(vcpu);
 		trace_kvm_emulate_insn_start(vcpu);
 
 
 		/* Only allow emulation of specific instructions on #UD
 		/* Only allow emulation of specific instructions on #UD
 		 * (namely VMMCALL, sysenter, sysexit, syscall)*/
 		 * (namely VMMCALL, sysenter, sysexit, syscall)*/
-		c = &vcpu->arch.emulate_ctxt.decode;
 		if (emulation_type & EMULTYPE_TRAP_UD) {
 		if (emulation_type & EMULTYPE_TRAP_UD) {
 			if (!c->twobyte)
 			if (!c->twobyte)
 				return EMULATE_FAIL;
 				return EMULATE_FAIL;
@@ -3880,11 +4030,11 @@ int emulate_instruction(struct kvm_vcpu *vcpu,
 
 
 		++vcpu->stat.insn_emulation;
 		++vcpu->stat.insn_emulation;
 		if (r)  {
 		if (r)  {
-			++vcpu->stat.insn_emulation_fail;
-			trace_kvm_emulate_insn_failed(vcpu);
-			if (kvm_mmu_unprotect_page_virt(vcpu, cr2))
+			if (reexecute_instruction(vcpu, cr2))
 				return EMULATE_DONE;
 				return EMULATE_DONE;
-			return EMULATE_FAIL;
+			if (emulation_type & EMULTYPE_SKIP)
+				return EMULATE_FAIL;
+			return handle_emulation_failure(vcpu);
 		}
 		}
 	}
 	}
 
 
@@ -3893,48 +4043,42 @@ int emulate_instruction(struct kvm_vcpu *vcpu,
 		return EMULATE_DONE;
 		return EMULATE_DONE;
 	}
 	}
 
 
+	/* This is needed for the VMware backdoor interface to work since
+	   it changes register values during the I/O operation */
+	memcpy(c->regs, vcpu->arch.regs, sizeof c->regs);
+
 restart:
 restart:
 	r = x86_emulate_insn(&vcpu->arch.emulate_ctxt, &emulate_ops);
 	r = x86_emulate_insn(&vcpu->arch.emulate_ctxt, &emulate_ops);
-	shadow_mask = vcpu->arch.emulate_ctxt.interruptibility;
 
 
-	if (r == 0)
-		kvm_x86_ops->set_interrupt_shadow(vcpu, shadow_mask);
+	if (r) { /* emulation failed */
+		if (reexecute_instruction(vcpu, cr2))
+			return EMULATE_DONE;
 
 
-	if (vcpu->arch.pio.count) {
-		if (!vcpu->arch.pio.in)
-			vcpu->arch.pio.count = 0;
-		return EMULATE_DO_MMIO;
+		return handle_emulation_failure(vcpu);
 	}
 	}
 
 
-	if (r || vcpu->mmio_is_write) {
-		run->exit_reason = KVM_EXIT_MMIO;
-		run->mmio.phys_addr = vcpu->mmio_phys_addr;
-		memcpy(run->mmio.data, vcpu->mmio_data, 8);
-		run->mmio.len = vcpu->mmio_size;
-		run->mmio.is_write = vcpu->mmio_is_write;
+	toggle_interruptibility(vcpu, vcpu->arch.emulate_ctxt.interruptibility);
+	kvm_x86_ops->set_rflags(vcpu, vcpu->arch.emulate_ctxt.eflags);
+	memcpy(vcpu->arch.regs, c->regs, sizeof c->regs);
+	kvm_rip_write(vcpu, vcpu->arch.emulate_ctxt.eip);
+
+	if (vcpu->arch.emulate_ctxt.exception >= 0) {
+		inject_emulated_exception(vcpu);
+		return EMULATE_DONE;
 	}
 	}
 
 
-	if (r) {
-		if (kvm_mmu_unprotect_page_virt(vcpu, cr2))
-			goto done;
-		if (!vcpu->mmio_needed) {
-			++vcpu->stat.insn_emulation_fail;
-			trace_kvm_emulate_insn_failed(vcpu);
-			kvm_report_emulation_failure(vcpu, "mmio");
-			return EMULATE_FAIL;
-		}
+	if (vcpu->arch.pio.count) {
+		if (!vcpu->arch.pio.in)
+			vcpu->arch.pio.count = 0;
 		return EMULATE_DO_MMIO;
 		return EMULATE_DO_MMIO;
 	}
 	}
 
 
-	if (vcpu->mmio_is_write) {
-		vcpu->mmio_needed = 0;
+	if (vcpu->mmio_needed) {
+		if (vcpu->mmio_is_write)
+			vcpu->mmio_needed = 0;
 		return EMULATE_DO_MMIO;
 		return EMULATE_DO_MMIO;
 	}
 	}
 
 
-done:
-	if (vcpu->arch.exception.pending)
-		vcpu->arch.emulate_ctxt.restart = false;
-
 	if (vcpu->arch.emulate_ctxt.restart)
 	if (vcpu->arch.emulate_ctxt.restart)
 		goto restart;
 		goto restart;
 
 
@@ -4108,6 +4252,9 @@ int kvm_arch_init(void *opaque)
 
 
 	perf_register_guest_info_callbacks(&kvm_guest_cbs);
 	perf_register_guest_info_callbacks(&kvm_guest_cbs);
 
 
+	if (cpu_has_xsave)
+		host_xcr0 = xgetbv(XCR_XFEATURE_ENABLED_MASK);
+
 	return 0;
 	return 0;
 
 
 out:
 out:
@@ -4270,7 +4417,7 @@ int kvm_fix_hypercall(struct kvm_vcpu *vcpu)
 
 
 	kvm_x86_ops->patch_hypercall(vcpu, instruction);
 	kvm_x86_ops->patch_hypercall(vcpu, instruction);
 
 
-	return emulator_write_emulated(rip, instruction, 3, vcpu);
+	return emulator_write_emulated(rip, instruction, 3, NULL, vcpu);
 }
 }
 
 
 void realmode_lgdt(struct kvm_vcpu *vcpu, u16 limit, unsigned long base)
 void realmode_lgdt(struct kvm_vcpu *vcpu, u16 limit, unsigned long base)
@@ -4506,59 +4653,78 @@ static void inject_pending_event(struct kvm_vcpu *vcpu)
 	}
 	}
 }
 }
 
 
+static void kvm_load_guest_xcr0(struct kvm_vcpu *vcpu)
+{
+	if (kvm_read_cr4_bits(vcpu, X86_CR4_OSXSAVE) &&
+			!vcpu->guest_xcr0_loaded) {
+		/* kvm_set_xcr() also depends on this */
+		xsetbv(XCR_XFEATURE_ENABLED_MASK, vcpu->arch.xcr0);
+		vcpu->guest_xcr0_loaded = 1;
+	}
+}
+
+static void kvm_put_guest_xcr0(struct kvm_vcpu *vcpu)
+{
+	if (vcpu->guest_xcr0_loaded) {
+		if (vcpu->arch.xcr0 != host_xcr0)
+			xsetbv(XCR_XFEATURE_ENABLED_MASK, host_xcr0);
+		vcpu->guest_xcr0_loaded = 0;
+	}
+}
+
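/*
 * Illustration only, not part of this patch: kvm_load_guest_xcr0() and
 * kvm_put_guest_xcr0() above rely on the kernel's xgetbv()/xsetbv()
 * helpers.  A sketch of what such helpers typically look like (kernel
 * u32/u64 types assumed; raw opcode bytes stand in for assemblers that
 * lack the xgetbv/xsetbv mnemonics):
 */
static inline u64 xgetbv_sketch(u32 index)
{
	u32 eax, edx;

	/* xgetbv: read the XCR selected by %ecx into %edx:%eax */
	asm volatile(".byte 0x0f,0x01,0xd0"
		     : "=a" (eax), "=d" (edx) : "c" (index));
	return eax + ((u64)edx << 32);
}

static inline void xsetbv_sketch(u32 index, u64 value)
{
	u32 eax = value, edx = value >> 32;

	/* xsetbv: write %edx:%eax into the XCR selected by %ecx */
	asm volatile(".byte 0x0f,0x01,0xd1"
		     : : "a" (eax), "d" (edx), "c" (index));
}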
 static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
 static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
 {
 {
 	int r;
 	int r;
 	bool req_int_win = !irqchip_in_kernel(vcpu->kvm) &&
 	bool req_int_win = !irqchip_in_kernel(vcpu->kvm) &&
 		vcpu->run->request_interrupt_window;
 		vcpu->run->request_interrupt_window;
 
 
-	if (vcpu->requests)
-		if (test_and_clear_bit(KVM_REQ_MMU_RELOAD, &vcpu->requests))
-			kvm_mmu_unload(vcpu);
-
-	r = kvm_mmu_reload(vcpu);
-	if (unlikely(r))
-		goto out;
-
 	if (vcpu->requests) {
 	if (vcpu->requests) {
-		if (test_and_clear_bit(KVM_REQ_MIGRATE_TIMER, &vcpu->requests))
+		if (kvm_check_request(KVM_REQ_MMU_RELOAD, vcpu))
+			kvm_mmu_unload(vcpu);
+		if (kvm_check_request(KVM_REQ_MIGRATE_TIMER, vcpu))
 			__kvm_migrate_timers(vcpu);
 			__kvm_migrate_timers(vcpu);
-		if (test_and_clear_bit(KVM_REQ_KVMCLOCK_UPDATE, &vcpu->requests))
+		if (kvm_check_request(KVM_REQ_KVMCLOCK_UPDATE, vcpu))
 			kvm_write_guest_time(vcpu);
 			kvm_write_guest_time(vcpu);
-		if (test_and_clear_bit(KVM_REQ_MMU_SYNC, &vcpu->requests))
+		if (kvm_check_request(KVM_REQ_MMU_SYNC, vcpu))
 			kvm_mmu_sync_roots(vcpu);
 			kvm_mmu_sync_roots(vcpu);
-		if (test_and_clear_bit(KVM_REQ_TLB_FLUSH, &vcpu->requests))
+		if (kvm_check_request(KVM_REQ_TLB_FLUSH, vcpu))
 			kvm_x86_ops->tlb_flush(vcpu);
 			kvm_x86_ops->tlb_flush(vcpu);
-		if (test_and_clear_bit(KVM_REQ_REPORT_TPR_ACCESS,
-				       &vcpu->requests)) {
+		if (kvm_check_request(KVM_REQ_REPORT_TPR_ACCESS, vcpu)) {
 			vcpu->run->exit_reason = KVM_EXIT_TPR_ACCESS;
 			vcpu->run->exit_reason = KVM_EXIT_TPR_ACCESS;
 			r = 0;
 			r = 0;
 			goto out;
 			goto out;
 		}
 		}
-		if (test_and_clear_bit(KVM_REQ_TRIPLE_FAULT, &vcpu->requests)) {
+		if (kvm_check_request(KVM_REQ_TRIPLE_FAULT, vcpu)) {
 			vcpu->run->exit_reason = KVM_EXIT_SHUTDOWN;
 			vcpu->run->exit_reason = KVM_EXIT_SHUTDOWN;
 			r = 0;
 			r = 0;
 			goto out;
 			goto out;
 		}
 		}
-		if (test_and_clear_bit(KVM_REQ_DEACTIVATE_FPU, &vcpu->requests)) {
+		if (kvm_check_request(KVM_REQ_DEACTIVATE_FPU, vcpu)) {
 			vcpu->fpu_active = 0;
 			vcpu->fpu_active = 0;
 			kvm_x86_ops->fpu_deactivate(vcpu);
 			kvm_x86_ops->fpu_deactivate(vcpu);
 		}
 		}
 	}
 	}
 
 
+	r = kvm_mmu_reload(vcpu);
+	if (unlikely(r))
+		goto out;
+
 	preempt_disable();
 	preempt_disable();
 
 
 	kvm_x86_ops->prepare_guest_switch(vcpu);
 	kvm_x86_ops->prepare_guest_switch(vcpu);
 	if (vcpu->fpu_active)
 	if (vcpu->fpu_active)
 		kvm_load_guest_fpu(vcpu);
 		kvm_load_guest_fpu(vcpu);
+	kvm_load_guest_xcr0(vcpu);
 
 
-	local_irq_disable();
+	atomic_set(&vcpu->guest_mode, 1);
+	smp_wmb();
 
 
-	clear_bit(KVM_REQ_KICK, &vcpu->requests);
-	smp_mb__after_clear_bit();
+	local_irq_disable();
 
 
-	if (vcpu->requests || need_resched() || signal_pending(current)) {
-		set_bit(KVM_REQ_KICK, &vcpu->requests);
+	if (!atomic_read(&vcpu->guest_mode) || vcpu->requests
+	    || need_resched() || signal_pending(current)) {
+		atomic_set(&vcpu->guest_mode, 0);
+		smp_wmb();
 		local_irq_enable();
 		local_irq_enable();
 		preempt_enable();
 		preempt_enable();
 		r = 1;
 		r = 1;
@@ -4603,7 +4769,8 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
 	if (hw_breakpoint_active())
 	if (hw_breakpoint_active())
 		hw_breakpoint_restore();
 		hw_breakpoint_restore();
 
 
-	set_bit(KVM_REQ_KICK, &vcpu->requests);
+	atomic_set(&vcpu->guest_mode, 0);
+	smp_wmb();
 	local_irq_enable();
 	local_irq_enable();
 
 
 	++vcpu->stat.exits;
 	++vcpu->stat.exits;
@@ -4665,7 +4832,7 @@ static int __vcpu_run(struct kvm_vcpu *vcpu)
 			srcu_read_unlock(&kvm->srcu, vcpu->srcu_idx);
 			srcu_read_unlock(&kvm->srcu, vcpu->srcu_idx);
 			kvm_vcpu_block(vcpu);
 			kvm_vcpu_block(vcpu);
 			vcpu->srcu_idx = srcu_read_lock(&kvm->srcu);
 			vcpu->srcu_idx = srcu_read_lock(&kvm->srcu);
-			if (test_and_clear_bit(KVM_REQ_UNHALT, &vcpu->requests))
+			if (kvm_check_request(KVM_REQ_UNHALT, vcpu))
 			{
 			{
 				switch(vcpu->arch.mp_state) {
 				switch(vcpu->arch.mp_state) {
 				case KVM_MP_STATE_HALTED:
 				case KVM_MP_STATE_HALTED:
@@ -4717,8 +4884,6 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
 	int r;
 	int r;
 	sigset_t sigsaved;
 	sigset_t sigsaved;
 
 
-	vcpu_load(vcpu);
-
 	if (vcpu->sigset_active)
 	if (vcpu->sigset_active)
 		sigprocmask(SIG_SETMASK, &vcpu->sigset, &sigsaved);
 		sigprocmask(SIG_SETMASK, &vcpu->sigset, &sigsaved);
 
 
@@ -4743,7 +4908,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
 		vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu);
 		vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu);
 		r = emulate_instruction(vcpu, 0, 0, EMULTYPE_NO_DECODE);
 		r = emulate_instruction(vcpu, 0, 0, EMULTYPE_NO_DECODE);
 		srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx);
 		srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx);
-		if (r == EMULATE_DO_MMIO) {
+		if (r != EMULATE_DONE) {
 			r = 0;
 			r = 0;
 			goto out;
 			goto out;
 		}
 		}
@@ -4759,14 +4924,11 @@ out:
 	if (vcpu->sigset_active)
 	if (vcpu->sigset_active)
 		sigprocmask(SIG_SETMASK, &sigsaved, NULL);
 		sigprocmask(SIG_SETMASK, &sigsaved, NULL);
 
 
-	vcpu_put(vcpu);
 	return r;
 	return r;
 }
 }
 
 
 int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
 int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
 {
 {
-	vcpu_load(vcpu);
-
 	regs->rax = kvm_register_read(vcpu, VCPU_REGS_RAX);
 	regs->rax = kvm_register_read(vcpu, VCPU_REGS_RAX);
 	regs->rbx = kvm_register_read(vcpu, VCPU_REGS_RBX);
 	regs->rbx = kvm_register_read(vcpu, VCPU_REGS_RBX);
 	regs->rcx = kvm_register_read(vcpu, VCPU_REGS_RCX);
 	regs->rcx = kvm_register_read(vcpu, VCPU_REGS_RCX);
@@ -4789,15 +4951,11 @@ int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
 	regs->rip = kvm_rip_read(vcpu);
 	regs->rip = kvm_rip_read(vcpu);
 	regs->rflags = kvm_get_rflags(vcpu);
 	regs->rflags = kvm_get_rflags(vcpu);
 
 
-	vcpu_put(vcpu);
-
 	return 0;
 	return 0;
 }
 }
 
 
 int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
 int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
 {
 {
-	vcpu_load(vcpu);
-
 	kvm_register_write(vcpu, VCPU_REGS_RAX, regs->rax);
 	kvm_register_write(vcpu, VCPU_REGS_RAX, regs->rax);
 	kvm_register_write(vcpu, VCPU_REGS_RBX, regs->rbx);
 	kvm_register_write(vcpu, VCPU_REGS_RBX, regs->rbx);
 	kvm_register_write(vcpu, VCPU_REGS_RCX, regs->rcx);
 	kvm_register_write(vcpu, VCPU_REGS_RCX, regs->rcx);
@@ -4822,8 +4980,6 @@ int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
 
 
 	vcpu->arch.exception.pending = false;
 	vcpu->arch.exception.pending = false;
 
 
-	vcpu_put(vcpu);
-
 	return 0;
 	return 0;
 }
 }
 
 
@@ -4842,8 +4998,6 @@ int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu,
 {
 {
 	struct desc_ptr dt;
 	struct desc_ptr dt;
 
 
-	vcpu_load(vcpu);
-
 	kvm_get_segment(vcpu, &sregs->cs, VCPU_SREG_CS);
 	kvm_get_segment(vcpu, &sregs->cs, VCPU_SREG_CS);
 	kvm_get_segment(vcpu, &sregs->ds, VCPU_SREG_DS);
 	kvm_get_segment(vcpu, &sregs->ds, VCPU_SREG_DS);
 	kvm_get_segment(vcpu, &sregs->es, VCPU_SREG_ES);
 	kvm_get_segment(vcpu, &sregs->es, VCPU_SREG_ES);
@@ -4875,32 +5029,27 @@ int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu,
 		set_bit(vcpu->arch.interrupt.nr,
 		set_bit(vcpu->arch.interrupt.nr,
 			(unsigned long *)sregs->interrupt_bitmap);
 			(unsigned long *)sregs->interrupt_bitmap);
 
 
-	vcpu_put(vcpu);
-
 	return 0;
 	return 0;
 }
 }
 
 
 int kvm_arch_vcpu_ioctl_get_mpstate(struct kvm_vcpu *vcpu,
 int kvm_arch_vcpu_ioctl_get_mpstate(struct kvm_vcpu *vcpu,
 				    struct kvm_mp_state *mp_state)
 				    struct kvm_mp_state *mp_state)
 {
 {
-	vcpu_load(vcpu);
 	mp_state->mp_state = vcpu->arch.mp_state;
 	mp_state->mp_state = vcpu->arch.mp_state;
-	vcpu_put(vcpu);
 	return 0;
 	return 0;
 }
 }
 
 
 int kvm_arch_vcpu_ioctl_set_mpstate(struct kvm_vcpu *vcpu,
 int kvm_arch_vcpu_ioctl_set_mpstate(struct kvm_vcpu *vcpu,
 				    struct kvm_mp_state *mp_state)
 				    struct kvm_mp_state *mp_state)
 {
 {
-	vcpu_load(vcpu);
 	vcpu->arch.mp_state = mp_state->mp_state;
 	vcpu->arch.mp_state = mp_state->mp_state;
-	vcpu_put(vcpu);
 	return 0;
 	return 0;
 }
 }
 
 
 int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int reason,
 int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int reason,
 		    bool has_error_code, u32 error_code)
 		    bool has_error_code, u32 error_code)
 {
 {
+	struct decode_cache *c = &vcpu->arch.emulate_ctxt.decode;
 	int cs_db, cs_l, ret;
 	int cs_db, cs_l, ret;
 	cache_all_regs(vcpu);
 	cache_all_regs(vcpu);
 
 
@@ -4915,6 +5064,8 @@ int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int reason,
 		? X86EMUL_MODE_VM86 : cs_l
 		? X86EMUL_MODE_VM86 : cs_l
 		? X86EMUL_MODE_PROT64 :	cs_db
 		? X86EMUL_MODE_PROT64 :	cs_db
 		? X86EMUL_MODE_PROT32 : X86EMUL_MODE_PROT16;
 		? X86EMUL_MODE_PROT32 : X86EMUL_MODE_PROT16;
+	memset(c, 0, sizeof(struct decode_cache));
+	memcpy(c->regs, vcpu->arch.regs, sizeof c->regs);
 
 
 	ret = emulator_task_switch(&vcpu->arch.emulate_ctxt, &emulate_ops,
 	ret = emulator_task_switch(&vcpu->arch.emulate_ctxt, &emulate_ops,
 				   tss_selector, reason, has_error_code,
 				   tss_selector, reason, has_error_code,
@@ -4923,6 +5074,8 @@ int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int reason,
 	if (ret)
 	if (ret)
 		return EMULATE_FAIL;
 		return EMULATE_FAIL;
 
 
+	memcpy(vcpu->arch.regs, c->regs, sizeof c->regs);
+	kvm_rip_write(vcpu, vcpu->arch.emulate_ctxt.eip);
 	kvm_x86_ops->set_rflags(vcpu, vcpu->arch.emulate_ctxt.eflags);
 	kvm_x86_ops->set_rflags(vcpu, vcpu->arch.emulate_ctxt.eflags);
 	return EMULATE_DONE;
 	return EMULATE_DONE;
 }
 }
@@ -4935,8 +5088,6 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
 	int pending_vec, max_bits;
 	int pending_vec, max_bits;
 	struct desc_ptr dt;
 	struct desc_ptr dt;
 
 
-	vcpu_load(vcpu);
-
 	dt.size = sregs->idt.limit;
 	dt.size = sregs->idt.limit;
 	dt.address = sregs->idt.base;
 	dt.address = sregs->idt.base;
 	kvm_x86_ops->set_idt(vcpu, &dt);
 	kvm_x86_ops->set_idt(vcpu, &dt);
@@ -4996,8 +5147,6 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
 	    !is_protmode(vcpu))
 	    !is_protmode(vcpu))
 		vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
 		vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
 
 
-	vcpu_put(vcpu);
-
 	return 0;
 	return 0;
 }
 }
 
 
@@ -5007,12 +5156,10 @@ int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu,
 	unsigned long rflags;
 	unsigned long rflags;
 	int i, r;
 	int i, r;
 
 
-	vcpu_load(vcpu);
-
 	if (dbg->control & (KVM_GUESTDBG_INJECT_DB | KVM_GUESTDBG_INJECT_BP)) {
 	if (dbg->control & (KVM_GUESTDBG_INJECT_DB | KVM_GUESTDBG_INJECT_BP)) {
 		r = -EBUSY;
 		r = -EBUSY;
 		if (vcpu->arch.exception.pending)
 		if (vcpu->arch.exception.pending)
-			goto unlock_out;
+			goto out;
 		if (dbg->control & KVM_GUESTDBG_INJECT_DB)
 		if (dbg->control & KVM_GUESTDBG_INJECT_DB)
 			kvm_queue_exception(vcpu, DB_VECTOR);
 			kvm_queue_exception(vcpu, DB_VECTOR);
 		else
 		else
@@ -5054,33 +5201,11 @@ int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu,
 
 
 	r = 0;
 	r = 0;
 
 
-unlock_out:
-	vcpu_put(vcpu);
+out:
 
 
 	return r;
 	return r;
 }
 }
 
 
-/*
- * fxsave fpu state.  Taken from x86_64/processor.h.  To be killed when
- * we have asm/x86/processor.h
- */
-struct fxsave {
-	u16	cwd;
-	u16	swd;
-	u16	twd;
-	u16	fop;
-	u64	rip;
-	u64	rdp;
-	u32	mxcsr;
-	u32	mxcsr_mask;
-	u32	st_space[32];	/* 8*16 bytes for each FP-reg = 128 bytes */
-#ifdef CONFIG_X86_64
-	u32	xmm_space[64];	/* 16*16 bytes for each XMM-reg = 256 bytes */
-#else
-	u32	xmm_space[32];	/* 8*16 bytes for each XMM-reg = 128 bytes */
-#endif
-};
-
 /*
 /*
  * Translate a guest virtual address to a guest physical address.
  * Translate a guest virtual address to a guest physical address.
  */
  */
@@ -5091,7 +5216,6 @@ int kvm_arch_vcpu_ioctl_translate(struct kvm_vcpu *vcpu,
 	gpa_t gpa;
 	gpa_t gpa;
 	int idx;
 	int idx;
 
 
-	vcpu_load(vcpu);
 	idx = srcu_read_lock(&vcpu->kvm->srcu);
 	idx = srcu_read_lock(&vcpu->kvm->srcu);
 	gpa = kvm_mmu_gva_to_gpa_system(vcpu, vaddr, NULL);
 	gpa = kvm_mmu_gva_to_gpa_system(vcpu, vaddr, NULL);
 	srcu_read_unlock(&vcpu->kvm->srcu, idx);
 	srcu_read_unlock(&vcpu->kvm->srcu, idx);
@@ -5099,16 +5223,14 @@ int kvm_arch_vcpu_ioctl_translate(struct kvm_vcpu *vcpu,
 	tr->valid = gpa != UNMAPPED_GVA;
 	tr->valid = gpa != UNMAPPED_GVA;
 	tr->writeable = 1;
 	tr->writeable = 1;
 	tr->usermode = 0;
 	tr->usermode = 0;
-	vcpu_put(vcpu);
 
 
 	return 0;
 	return 0;
 }
 }
 
 
 int kvm_arch_vcpu_ioctl_get_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu)
 int kvm_arch_vcpu_ioctl_get_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu)
 {
 {
-	struct fxsave *fxsave = (struct fxsave *)&vcpu->arch.guest_fx_image;
-
-	vcpu_load(vcpu);
+	struct i387_fxsave_struct *fxsave =
+			&vcpu->arch.guest_fpu.state->fxsave;
 
 
 	memcpy(fpu->fpr, fxsave->st_space, 128);
 	memcpy(fpu->fpr, fxsave->st_space, 128);
 	fpu->fcw = fxsave->cwd;
 	fpu->fcw = fxsave->cwd;
@@ -5119,16 +5241,13 @@ int kvm_arch_vcpu_ioctl_get_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu)
 	fpu->last_dp = fxsave->rdp;
 	fpu->last_dp = fxsave->rdp;
 	memcpy(fpu->xmm, fxsave->xmm_space, sizeof fxsave->xmm_space);
 	memcpy(fpu->xmm, fxsave->xmm_space, sizeof fxsave->xmm_space);
 
 
-	vcpu_put(vcpu);
-
 	return 0;
 	return 0;
 }
 }
 
 
 int kvm_arch_vcpu_ioctl_set_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu)
 int kvm_arch_vcpu_ioctl_set_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu)
 {
 {
-	struct fxsave *fxsave = (struct fxsave *)&vcpu->arch.guest_fx_image;
-
-	vcpu_load(vcpu);
+	struct i387_fxsave_struct *fxsave =
+			&vcpu->arch.guest_fpu.state->fxsave;
 
 
 	memcpy(fxsave->st_space, fpu->fpr, 128);
 	memcpy(fxsave->st_space, fpu->fpr, 128);
 	fxsave->cwd = fpu->fcw;
 	fxsave->cwd = fpu->fcw;
@@ -5139,61 +5258,63 @@ int kvm_arch_vcpu_ioctl_set_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu)
 	fxsave->rdp = fpu->last_dp;
 	memcpy(fxsave->xmm_space, fpu->xmm, sizeof fxsave->xmm_space);
 
-	vcpu_put(vcpu);
-
 	return 0;
 }
 
-void fx_init(struct kvm_vcpu *vcpu)
+int fx_init(struct kvm_vcpu *vcpu)
 {
-	unsigned after_mxcsr_mask;
+	int err;
+
+	err = fpu_alloc(&vcpu->arch.guest_fpu);
+	if (err)
+		return err;
+
+	fpu_finit(&vcpu->arch.guest_fpu);
 
 	/*
-	 * Touch the fpu the first time in non atomic context as if
-	 * this is the first fpu instruction the exception handler
-	 * will fire before the instruction returns and it'll have to
-	 * allocate ram with GFP_KERNEL.
+	 * Ensure guest xcr0 is valid for loading
 	 */
-	if (!used_math())
-		kvm_fx_save(&vcpu->arch.host_fx_image);
-
-	/* Initialize guest FPU by resetting ours and saving into guest's */
-	preempt_disable();
-	kvm_fx_save(&vcpu->arch.host_fx_image);
-	kvm_fx_finit();
-	kvm_fx_save(&vcpu->arch.guest_fx_image);
-	kvm_fx_restore(&vcpu->arch.host_fx_image);
-	preempt_enable();
+	vcpu->arch.xcr0 = XSTATE_FP;
 
 	vcpu->arch.cr0 |= X86_CR0_ET;
-	after_mxcsr_mask = offsetof(struct i387_fxsave_struct, st_space);
-	vcpu->arch.guest_fx_image.mxcsr = 0x1f80;
-	memset((void *)&vcpu->arch.guest_fx_image + after_mxcsr_mask,
-	       0, sizeof(struct i387_fxsave_struct) - after_mxcsr_mask);
+
+	return 0;
 }
 EXPORT_SYMBOL_GPL(fx_init);
 
+static void fx_free(struct kvm_vcpu *vcpu)
+{
+	fpu_free(&vcpu->arch.guest_fpu);
+}
+
 void kvm_load_guest_fpu(struct kvm_vcpu *vcpu)
 {
 	if (vcpu->guest_fpu_loaded)
 		return;
 
+	/*
+	 * Restore all possible states in the guest,
+	 * and assume host would use all available bits.
+	 * Guest xcr0 would be loaded later.
+	 */
+	kvm_put_guest_xcr0(vcpu);
 	vcpu->guest_fpu_loaded = 1;
-	kvm_fx_save(&vcpu->arch.host_fx_image);
-	kvm_fx_restore(&vcpu->arch.guest_fx_image);
+	unlazy_fpu(current);
+	fpu_restore_checking(&vcpu->arch.guest_fpu);
 	trace_kvm_fpu(1);
 }
 
 void kvm_put_guest_fpu(struct kvm_vcpu *vcpu)
 {
+	kvm_put_guest_xcr0(vcpu);
+
 	if (!vcpu->guest_fpu_loaded)
 		return;
 
 	vcpu->guest_fpu_loaded = 0;
-	kvm_fx_save(&vcpu->arch.guest_fx_image);
-	kvm_fx_restore(&vcpu->arch.host_fx_image);
+	fpu_save_init(&vcpu->arch.guest_fpu);
 	++vcpu->stat.fpu_reload;
-	set_bit(KVM_REQ_DEACTIVATE_FPU, &vcpu->requests);
+	kvm_make_request(KVM_REQ_DEACTIVATE_FPU, vcpu);
 	trace_kvm_fpu(0);
 }
 
@@ -5204,6 +5325,8 @@ void kvm_arch_vcpu_free(struct kvm_vcpu *vcpu)
 		vcpu->arch.time_page = NULL;
 	}
 
+	free_cpumask_var(vcpu->arch.wbinvd_dirty_mask);
+	fx_free(vcpu);
 	kvm_x86_ops->vcpu_free(vcpu);
 }
 
@@ -5217,9 +5340,6 @@ int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu)
 {
 	int r;
 
-	/* We do fxsave: this must be aligned. */
-	BUG_ON((unsigned long)&vcpu->arch.host_fx_image & 0xF);
-
 	vcpu->arch.mtrr_state.have_fixed = 1;
 	vcpu_load(vcpu);
 	r = kvm_arch_vcpu_reset(vcpu);
@@ -5241,6 +5361,7 @@ void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu)
 	kvm_mmu_unload(vcpu);
 	vcpu_put(vcpu);
 
+	fx_free(vcpu);
 	kvm_x86_ops->vcpu_free(vcpu);
 }
 
@@ -5334,7 +5455,12 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu)
 	}
 	vcpu->arch.mcg_cap = KVM_MAX_MCE_BANKS;
 
+	if (!zalloc_cpumask_var(&vcpu->arch.wbinvd_dirty_mask, GFP_KERNEL))
+		goto fail_free_mce_banks;
+
 	return 0;
+fail_free_mce_banks:
+	kfree(vcpu->arch.mce_banks);
 fail_free_lapic:
 	kvm_free_lapic(vcpu);
 fail_mmu_destroy:
@@ -5364,12 +5490,6 @@ struct  kvm *kvm_arch_create_vm(void)
 	if (!kvm)
 		return ERR_PTR(-ENOMEM);
 
-	kvm->arch.aliases = kzalloc(sizeof(struct kvm_mem_aliases), GFP_KERNEL);
-	if (!kvm->arch.aliases) {
-		kfree(kvm);
-		return ERR_PTR(-ENOMEM);
-	}
-
 	INIT_LIST_HEAD(&kvm->arch.active_mmu_pages);
 	INIT_LIST_HEAD(&kvm->arch.assigned_dev_head);
 
@@ -5412,12 +5532,12 @@ static void kvm_free_vcpus(struct kvm *kvm)
 void kvm_arch_sync_events(struct kvm *kvm)
 {
 	kvm_free_all_assigned_devices(kvm);
+	kvm_free_pit(kvm);
 }
 
 void kvm_arch_destroy_vm(struct kvm *kvm)
 {
 	kvm_iommu_unmap_guest(kvm);
-	kvm_free_pit(kvm);
 	kfree(kvm->arch.vpic);
 	kfree(kvm->arch.vioapic);
 	kvm_free_vcpus(kvm);
@@ -5427,7 +5547,6 @@ void kvm_arch_destroy_vm(struct kvm *kvm)
 	if (kvm->arch.ept_identity_pagetable)
 		put_page(kvm->arch.ept_identity_pagetable);
 	cleanup_srcu_struct(&kvm->srcu);
-	kfree(kvm->arch.aliases);
 	kfree(kvm);
 }
 
@@ -5438,6 +5557,11 @@ int kvm_arch_prepare_memory_region(struct kvm *kvm,
 				int user_alloc)
 {
 	int npages = memslot->npages;
+	int map_flags = MAP_PRIVATE | MAP_ANONYMOUS;
+
+	/* Prevent internal slot pages from being moved by fork()/COW. */
+	if (memslot->id >= KVM_MEMORY_SLOTS)
+		map_flags = MAP_SHARED | MAP_ANONYMOUS;
 
 	/*To keep backward compatibility with older userspace,
 	 *x86 needs to hanlde !user_alloc case.
@@ -5450,7 +5574,7 @@ int kvm_arch_prepare_memory_region(struct kvm *kvm,
 			userspace_addr = do_mmap(NULL, 0,
 						 npages * PAGE_SIZE,
 						 PROT_READ | PROT_WRITE,
-						 MAP_PRIVATE | MAP_ANONYMOUS,
+						 map_flags,
 						 0);
 			up_write(&current->mm->mmap_sem);
 
@@ -5523,7 +5647,7 @@ void kvm_vcpu_kick(struct kvm_vcpu *vcpu)
 
 	me = get_cpu();
 	if (cpu != me && (unsigned)cpu < nr_cpu_ids && cpu_online(cpu))
-		if (!test_and_set_bit(KVM_REQ_KICK, &vcpu->requests))
+		if (atomic_xchg(&vcpu->guest_mode, 0))
 			smp_send_reschedule(cpu);
 	put_cpu();
 }

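Editor's note, not part of the patch: a quick orientation for the arch/x86/kvm/x86.c hunks above, which drop the private kvm_fx_* save/restore helpers in favour of the generic i387 FPU API.

/*
 * Rough mapping, taken only from the hunks above:
 *
 *   fx_init()            -> fpu_alloc() + fpu_finit(), guest xcr0 reset to XSTATE_FP
 *   kvm_load_guest_fpu() -> unlazy_fpu(current) + fpu_restore_checking()
 *   kvm_put_guest_fpu()  -> fpu_save_init() + kvm_make_request(KVM_REQ_DEACTIVATE_FPU, vcpu)
 *   fx_free()            -> fpu_free()
 */
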
+ 0 - 7
arch/x86/kvm/x86.h

@@ -65,13 +65,6 @@ static inline int is_paging(struct kvm_vcpu *vcpu)
 	return kvm_read_cr0_bits(vcpu, X86_CR0_PG);
 }
 
-static inline struct kvm_mem_aliases *kvm_aliases(struct kvm *kvm)
-{
-	return rcu_dereference_check(kvm->arch.aliases,
-			srcu_read_lock_held(&kvm->srcu)
-			|| lockdep_is_held(&kvm->slots_lock));
-}
-
 void kvm_before_handle_nmi(struct kvm_vcpu *vcpu);
 void kvm_after_handle_nmi(struct kvm_vcpu *vcpu);
 

+ 13 - 0
include/linux/kvm.h

@@ -524,6 +524,12 @@ struct kvm_enable_cap {
 #define KVM_CAP_PPC_OSI 52
 #define KVM_CAP_PPC_UNSET_IRQ 53
 #define KVM_CAP_ENABLE_CAP 54
+#ifdef __KVM_HAVE_XSAVE
+#define KVM_CAP_XSAVE 55
+#endif
+#ifdef __KVM_HAVE_XCRS
+#define KVM_CAP_XCRS 56
+#endif
 
 #ifdef KVM_CAP_IRQ_ROUTING
 
@@ -613,6 +619,7 @@ struct kvm_clock_data {
  */
 #define KVM_CREATE_VCPU           _IO(KVMIO,   0x41)
 #define KVM_GET_DIRTY_LOG         _IOW(KVMIO,  0x42, struct kvm_dirty_log)
+/* KVM_SET_MEMORY_ALIAS is obsolete: */
 #define KVM_SET_MEMORY_ALIAS      _IOW(KVMIO,  0x43, struct kvm_memory_alias)
 #define KVM_SET_NR_MMU_PAGES      _IO(KVMIO,   0x44)
 #define KVM_GET_NR_MMU_PAGES      _IO(KVMIO,   0x45)
@@ -714,6 +721,12 @@ struct kvm_clock_data {
 #define KVM_GET_DEBUGREGS         _IOR(KVMIO,  0xa1, struct kvm_debugregs)
 #define KVM_SET_DEBUGREGS         _IOW(KVMIO,  0xa2, struct kvm_debugregs)
 #define KVM_ENABLE_CAP            _IOW(KVMIO,  0xa3, struct kvm_enable_cap)
+/* Available with KVM_CAP_XSAVE */
+#define KVM_GET_XSAVE		  _IOR(KVMIO,  0xa4, struct kvm_xsave)
+#define KVM_SET_XSAVE		  _IOW(KVMIO,  0xa5, struct kvm_xsave)
+/* Available with KVM_CAP_XCRS */
+#define KVM_GET_XCRS		  _IOR(KVMIO,  0xa6, struct kvm_xcrs)
+#define KVM_SET_XCRS		  _IOW(KVMIO,  0xa7, struct kvm_xcrs)
 
 #define KVM_DEV_ASSIGN_ENABLE_IOMMU	(1 << 0)
 

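Not part of the patch: a minimal userspace sketch of the new save-area ioctls. kvm_fd (the /dev/kvm descriptor) and vcpu_fd are assumed to already exist, error handling is elided, and struct kvm_xsave is assumed to be defined by the x86 header changes elsewhere in this series.

#include <linux/kvm.h>
#include <sys/ioctl.h>

/* Hypothetical helper: round-trip a vcpu's extended register state. */
static int roundtrip_xsave(int kvm_fd, int vcpu_fd)
{
	struct kvm_xsave xs;

	if (ioctl(kvm_fd, KVM_CHECK_EXTENSION, KVM_CAP_XSAVE) <= 0)
		return -1;	/* kernel or CPU without XSAVE support */
	if (ioctl(vcpu_fd, KVM_GET_XSAVE, &xs) < 0)
		return -1;
	/* ... inspect or migrate the opaque xsave region ... */
	return ioctl(vcpu_fd, KVM_SET_XSAVE, &xs);
}
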
+ 27 - 8
include/linux/kvm_host.h

@@ -81,13 +81,14 @@ struct kvm_vcpu {
 	int vcpu_id;
 	struct mutex mutex;
 	int   cpu;
+	atomic_t guest_mode;
 	struct kvm_run *run;
 	unsigned long requests;
 	unsigned long guest_debug;
 	int srcu_idx;
 
 	int fpu_active;
-	int guest_fpu_loaded;
+	int guest_fpu_loaded, guest_xcr0_loaded;
 	wait_queue_head_t wq;
 	int sigset_active;
 	sigset_t sigset;
@@ -123,6 +124,7 @@ struct kvm_memory_slot {
 	} *lpage_info[KVM_NR_PAGE_SIZES - 1];
 	unsigned long userspace_addr;
 	int user_alloc;
+	int id;
 };
 
 static inline unsigned long kvm_dirty_bitmap_bytes(struct kvm_memory_slot *memslot)
@@ -266,6 +268,8 @@ extern pfn_t bad_pfn;
 
 int is_error_page(struct page *page);
 int is_error_pfn(pfn_t pfn);
+int is_hwpoison_pfn(pfn_t pfn);
+int is_fault_pfn(pfn_t pfn);
 int kvm_is_error_hva(unsigned long addr);
 int kvm_set_memory_region(struct kvm *kvm,
 			  struct kvm_userspace_memory_region *mem,
@@ -284,8 +288,6 @@ void kvm_arch_commit_memory_region(struct kvm *kvm,
 				int user_alloc);
 void kvm_disable_largepages(void);
 void kvm_arch_flush_shadow(struct kvm *kvm);
-gfn_t unalias_gfn(struct kvm *kvm, gfn_t gfn);
-gfn_t unalias_gfn_instantiation(struct kvm *kvm, gfn_t gfn);
 
 struct page *gfn_to_page(struct kvm *kvm, gfn_t gfn);
 unsigned long gfn_to_hva(struct kvm *kvm, gfn_t gfn);
@@ -445,7 +447,8 @@ void kvm_register_irq_mask_notifier(struct kvm *kvm, int irq,
 				    struct kvm_irq_mask_notifier *kimn);
 void kvm_unregister_irq_mask_notifier(struct kvm *kvm, int irq,
 				      struct kvm_irq_mask_notifier *kimn);
-void kvm_fire_mask_notifiers(struct kvm *kvm, int irq, bool mask);
+void kvm_fire_mask_notifiers(struct kvm *kvm, unsigned irqchip, unsigned pin,
+			     bool mask);
 
 #ifdef __KVM_HAVE_IOAPIC
 void kvm_get_intr_delivery_bitmask(struct kvm_ioapic *ioapic,
@@ -562,10 +565,6 @@ static inline int mmu_notifier_retry(struct kvm_vcpu *vcpu, unsigned long mmu_se
 }
 #endif
 
-#ifndef KVM_ARCH_HAS_UNALIAS_INSTANTIATION
-#define unalias_gfn_instantiation unalias_gfn
-#endif
-
 #ifdef CONFIG_HAVE_KVM_IRQCHIP
 
 #define KVM_MAX_IRQ_ROUTES 1024
@@ -628,5 +627,25 @@ static inline long kvm_vm_ioctl_assigned_device(struct kvm *kvm, unsigned ioctl,
 
 #endif
 
+static inline void kvm_make_request(int req, struct kvm_vcpu *vcpu)
+{
+	set_bit(req, &vcpu->requests);
+}
+
+static inline bool kvm_make_check_request(int req, struct kvm_vcpu *vcpu)
+{
+	return test_and_set_bit(req, &vcpu->requests);
+}
+
+static inline bool kvm_check_request(int req, struct kvm_vcpu *vcpu)
+{
+	if (test_bit(req, &vcpu->requests)) {
+		clear_bit(req, &vcpu->requests);
+		return true;
+	} else {
+		return false;
+	}
+}
+
 #endif
 

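Not part of the patch: how the new request helpers are meant to pair up, sketched from the uses elsewhere in this series (KVM_REQ_DEACTIVATE_FPU and KVM_REQ_UNHALT are existing request bits).

/* Producer side: flag work for a vcpu, then kick it out of guest mode. */
static void ask_vcpu_to_drop_fpu(struct kvm_vcpu *vcpu)
{
	kvm_make_request(KVM_REQ_DEACTIVATE_FPU, vcpu);
	kvm_vcpu_kick(vcpu);
}

/* Consumer side: test-and-clear in the vcpu loop before re-entering the guest. */
static bool vcpu_was_unhalted(struct kvm_vcpu *vcpu)
{
	return kvm_check_request(KVM_REQ_UNHALT, vcpu);
}

kvm_make_check_request() is the set-and-test form, used by make_all_cpus_request() in the virt/kvm/kvm_main.c hunks below to skip vcpus that already have the bit pending.
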
+ 2 - 2
include/linux/kvm_types.h

@@ -32,11 +32,11 @@
 
 typedef unsigned long  gva_t;
 typedef u64            gpa_t;
-typedef unsigned long  gfn_t;
+typedef u64            gfn_t;
 
 typedef unsigned long  hva_t;
 typedef u64            hpa_t;
-typedef unsigned long  hfn_t;
+typedef u64            hfn_t;
 
 typedef hfn_t pfn_t;
 

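Side note, not in the diff: with gfn_t and hfn_t (and therefore pfn_t) widened to u64, any format string that printed them with %lx needs updating; the virt/kvm/iommu.c hunk below is exactly such a fix.

/* pfn_t is now a u64, so print it as a 64-bit value. */
printk(KERN_ERR "iommu failed to map pfn=%llx\n", pfn);
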
+ 8 - 0
include/linux/mm.h

@@ -1465,6 +1465,14 @@ extern int sysctl_memory_failure_recovery;
 extern void shake_page(struct page *p, int access);
 extern atomic_long_t mce_bad_pages;
 extern int soft_offline_page(struct page *page, int flags);
+#ifdef CONFIG_MEMORY_FAILURE
+int is_hwpoison_address(unsigned long addr);
+#else
+static inline int is_hwpoison_address(unsigned long addr)
+{
+	return 0;
+}
+#endif
 
 extern void dump_page(struct page *page);
 

+ 33 - 0
mm/memory-failure.c

@@ -45,6 +45,7 @@
 #include <linux/page-isolation.h>
 #include <linux/suspend.h>
 #include <linux/slab.h>
+#include <linux/swapops.h>
 #include "internal.h"
 
 int sysctl_memory_failure_early_kill __read_mostly = 0;
@@ -1296,3 +1297,35 @@ done:
 	/* keep elevated page count for bad page */
 	return ret;
 }
+
+/*
+ * The caller must hold current->mm->mmap_sem in read mode.
+ */
+int is_hwpoison_address(unsigned long addr)
+{
+	pgd_t *pgdp;
+	pud_t pud, *pudp;
+	pmd_t pmd, *pmdp;
+	pte_t pte, *ptep;
+	swp_entry_t entry;
+
+	pgdp = pgd_offset(current->mm, addr);
+	if (!pgd_present(*pgdp))
+		return 0;
+	pudp = pud_offset(pgdp, addr);
+	pud = *pudp;
+	if (!pud_present(pud) || pud_large(pud))
+		return 0;
+	pmdp = pmd_offset(pudp, addr);
+	pmd = *pmdp;
+	if (!pmd_present(pmd) || pmd_large(pmd))
+		return 0;
+	ptep = pte_offset_map(pmdp, addr);
+	pte = *ptep;
+	pte_unmap(ptep);
+	if (!is_swap_pte(pte))
+		return 0;
+	entry = pte_to_swp_entry(pte);
+	return is_hwpoison_entry(entry);
+}
+EXPORT_SYMBOL_GPL(is_hwpoison_address);

+ 1 - 6
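Not part of the patch: a minimal caller sketch for the new helper. As the comment above it states, it must run with current->mm->mmap_sem held for read, which is how the hva_to_pfn() hunk later in this series uses it.

/* Hypothetical wrapper that supplies the required locking. */
static int addr_is_hwpoisoned(unsigned long addr)
{
	int ret;

	down_read(&current->mm->mmap_sem);
	ret = is_hwpoison_address(addr);
	up_read(&current->mm->mmap_sem);

	return ret;
}
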
virt/kvm/assigned-dev.c

@@ -1,7 +1,7 @@
 /*
 /*
  * Kernel-based Virtual Machine - device assignment support
  *
- * Copyright (C) 2006-9 Red Hat, Inc
+ * Copyright (C) 2010 Red Hat, Inc. and/or its affiliates.
  *
  * This work is licensed under the terms of the GNU GPL, version 2.  See
  * the COPYING file in the top-level directory.
@@ -58,12 +58,10 @@ static int find_index_from_host_irq(struct kvm_assigned_dev_kernel
 static void kvm_assigned_dev_interrupt_work_handler(struct work_struct *work)
 {
 	struct kvm_assigned_dev_kernel *assigned_dev;
-	struct kvm *kvm;
 	int i;
 
 	assigned_dev = container_of(work, struct kvm_assigned_dev_kernel,
 				    interrupt_work);
-	kvm = assigned_dev->kvm;
 
 	spin_lock_irq(&assigned_dev->assigned_dev_lock);
 	if (assigned_dev->irq_requested_type & KVM_DEV_IRQ_HOST_MSIX) {
@@ -448,9 +446,6 @@ static int kvm_vm_ioctl_assign_irq(struct kvm *kvm,
 	struct kvm_assigned_dev_kernel *match;
 	unsigned long host_irq_type, guest_irq_type;
 
-	if (!capable(CAP_SYS_RAWIO))
-		return -EPERM;
-
 	if (!irqchip_in_kernel(kvm))
 		return r;
 
+ 1 - 0
virt/kvm/coalesced_mmio.c

@@ -2,6 +2,7 @@
  * KVM coalesced MMIO
  * KVM coalesced MMIO
  *
  * Copyright (c) 2008 Bull S.A.S.
+ * Copyright 2009 Red Hat, Inc. and/or its affiliates.
  *
  *  Author: Laurent Vivier <Laurent.Vivier@bull.net>
  *
+ 1 - 0
virt/kvm/eventfd.c

@@ -2,6 +2,7 @@
  * kvm eventfd support - use eventfd objects to signal various KVM events
  * kvm eventfd support - use eventfd objects to signal various KVM events
  *
  * Copyright 2009 Novell.  All Rights Reserved.
+ * Copyright 2010 Red Hat, Inc. and/or its affiliates.
  *
  * Author:
  *	Gregory Haskins <ghaskins@novell.com>
+ 2 - 1
virt/kvm/ioapic.c

@@ -1,5 +1,6 @@
 /*
 /*
  *  Copyright (C) 2001  MandrakeSoft S.A.
+ *  Copyright 2010 Red Hat, Inc. and/or its affiliates.
  *
  *    MandrakeSoft S.A.
  *    43, rue d'Aboukir
@@ -151,7 +152,7 @@ static void ioapic_write_indirect(struct kvm_ioapic *ioapic, u32 val)
 		update_handled_vectors(ioapic);
 		mask_after = e->fields.mask;
 		if (mask_before != mask_after)
-			kvm_fire_mask_notifiers(ioapic->kvm, index, mask_after);
+			kvm_fire_mask_notifiers(ioapic->kvm, KVM_IRQCHIP_IOAPIC, index, mask_after);
 		if (e->fields.trig_mode == IOAPIC_LEVEL_TRIG
 		    && ioapic->irr & (1 << index))
 			ioapic_service(ioapic, index);
+ 9 - 3
virt/kvm/iommu.c

@@ -16,6 +16,8 @@
  *
  *
  * Copyright (C) 2006-2008 Intel Corporation
  * Copyright IBM Corporation, 2008
+ * Copyright 2010 Red Hat, Inc. and/or its affiliates.
+ *
  * Author: Allen M. Kay <allen.m.kay@intel.com>
  * Author: Weidong Han <weidong.han@intel.com>
  * Author: Ben-Ami Yassour <benami@il.ibm.com>
@@ -106,7 +108,7 @@ int kvm_iommu_map_pages(struct kvm *kvm, struct kvm_memory_slot *slot)
 			      get_order(page_size), flags);
 		if (r) {
 			printk(KERN_ERR "kvm_iommu_map_address:"
-			       "iommu failed to map pfn=%lx\n", pfn);
+			       "iommu failed to map pfn=%llx\n", pfn);
 			goto unmap_pages;
 		}
 
@@ -124,9 +126,10 @@ unmap_pages:
 
 static int kvm_iommu_map_memslots(struct kvm *kvm)
 {
-	int i, r = 0;
+	int i, idx, r = 0;
 	struct kvm_memslots *slots;
 
+	idx = srcu_read_lock(&kvm->srcu);
 	slots = kvm_memslots(kvm);
 
 	for (i = 0; i < slots->nmemslots; i++) {
@@ -134,6 +137,7 @@ static int kvm_iommu_map_memslots(struct kvm *kvm)
 		if (r)
 			break;
 	}
+	srcu_read_unlock(&kvm->srcu, idx);
 
 	return r;
 }
@@ -283,15 +287,17 @@ static void kvm_iommu_put_pages(struct kvm *kvm,
 
 static int kvm_iommu_unmap_memslots(struct kvm *kvm)
 {
-	int i;
+	int i, idx;
 	struct kvm_memslots *slots;
 
+	idx = srcu_read_lock(&kvm->srcu);
 	slots = kvm_memslots(kvm);
 
 	for (i = 0; i < slots->nmemslots; i++) {
 		kvm_iommu_put_pages(kvm, slots->memslots[i].base_gfn,
 				    slots->memslots[i].npages);
 	}
+	srcu_read_unlock(&kvm->srcu, idx);
 
 	return 0;
 }
+ 10 - 5
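Side note, not in the diff: both hunks above apply the same rule, that kvm_memslots() may only be dereferenced inside an SRCU read-side section on kvm->srcu. A minimal sketch of the pattern as used here:

static void walk_memslots(struct kvm *kvm)
{
	int idx = srcu_read_lock(&kvm->srcu);
	struct kvm_memslots *slots = kvm_memslots(kvm);
	int i;

	for (i = 0; i < slots->nmemslots; i++)
		; /* operate on &slots->memslots[i] */

	srcu_read_unlock(&kvm->srcu, idx);
}
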
virt/kvm/irq_comm.c

@@ -17,6 +17,7 @@
  * Authors:
  * Authors:
  *   Yaozu (Eddie) Dong <Eddie.dong@intel.com>
  *
+ * Copyright 2010 Red Hat, Inc. and/or its affilates.
  */
 
 #include <linux/kvm_host.h>
@@ -99,7 +100,7 @@ int kvm_irq_delivery_to_apic(struct kvm *kvm, struct kvm_lapic *src,
 			if (r < 0)
 				r = 0;
 			r += kvm_apic_set_irq(vcpu, irq);
-		} else {
+		} else if (kvm_lapic_enabled(vcpu)) {
 			if (!lowest)
 				lowest = vcpu;
 			else if (kvm_apic_compare_prio(vcpu, lowest) < 0)
@@ -278,15 +279,19 @@ void kvm_unregister_irq_mask_notifier(struct kvm *kvm, int irq,
 	synchronize_rcu();
 }
 
-void kvm_fire_mask_notifiers(struct kvm *kvm, int irq, bool mask)
+void kvm_fire_mask_notifiers(struct kvm *kvm, unsigned irqchip, unsigned pin,
+			     bool mask)
 {
 	struct kvm_irq_mask_notifier *kimn;
 	struct hlist_node *n;
+	int gsi;
 
 	rcu_read_lock();
-	hlist_for_each_entry_rcu(kimn, n, &kvm->mask_notifier_list, link)
-		if (kimn->irq == irq)
-			kimn->func(kimn, mask);
+	gsi = rcu_dereference(kvm->irq_routing)->chip[irqchip][pin];
+	if (gsi != -1)
+		hlist_for_each_entry_rcu(kimn, n, &kvm->mask_notifier_list, link)
+			if (kimn->irq == gsi)
+				kimn->func(kimn, mask);
 	rcu_read_unlock();
 }
 
+ 79 - 27
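Not part of the patch: consumers of the mask-notifier API are unchanged. They still register against a GSI, while the (irqchip, pin) pair passed to the new kvm_fire_mask_notifiers() is resolved to a GSI through kvm->irq_routing before notifiers are matched. A hedged registration sketch, using only declarations visible in the include/linux/kvm_host.h hunk above:

static void my_mask_notify(struct kvm_irq_mask_notifier *kimn, bool masked)
{
	/* react to the guest masking or unmasking this GSI */
}

static struct kvm_irq_mask_notifier my_kimn = {
	.func = my_mask_notify,
};

static void register_my_notifier(struct kvm *kvm, int gsi)
{
	kvm_register_irq_mask_notifier(kvm, gsi, &my_kimn);
}
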
virt/kvm/kvm_main.c

@@ -5,6 +5,7 @@
  * machines without emulation or binary translation.
  * machines without emulation or binary translation.
  *
  * Copyright (C) 2006 Qumranet, Inc.
+ * Copyright 2010 Red Hat, Inc. and/or its affilates.
  *
  * Authors:
  *   Avi Kivity   <avi@qumranet.com>
@@ -92,6 +93,12 @@ static bool kvm_rebooting;
 
 static bool largepages_enabled = true;
 
+static struct page *hwpoison_page;
+static pfn_t hwpoison_pfn;
+
+static struct page *fault_page;
+static pfn_t fault_pfn;
+
 inline int kvm_is_mmio_pfn(pfn_t pfn)
 {
 	if (pfn_valid(pfn)) {
@@ -141,7 +148,7 @@ static bool make_all_cpus_request(struct kvm *kvm, unsigned int req)
 	raw_spin_lock(&kvm->requests_lock);
 	me = smp_processor_id();
 	kvm_for_each_vcpu(i, vcpu, kvm) {
-		if (test_and_set_bit(req, &vcpu->requests))
+		if (kvm_make_check_request(req, vcpu))
 			continue;
 		cpu = vcpu->cpu;
 		if (cpus != NULL && cpu != -1 && cpu != me)
@@ -566,6 +573,7 @@ int __kvm_set_memory_region(struct kvm *kvm,
 
 	new = old = *memslot;
 
+	new.id = mem->slot;
 	new.base_gfn = base_gfn;
 	new.npages = npages;
 	new.flags = mem->flags;
@@ -596,7 +604,7 @@ int __kvm_set_memory_region(struct kvm *kvm,
 	/* Allocate if a slot is being created */
#ifndef CONFIG_S390
 	if (npages && !new.rmap) {
-		new.rmap = vmalloc(npages * sizeof(struct page *));
+		new.rmap = vmalloc(npages * sizeof(*new.rmap));
 
 		if (!new.rmap)
 			goto out_free;
@@ -621,9 +629,9 @@ int __kvm_set_memory_region(struct kvm *kvm,
 		if (new.lpage_info[i])
 			continue;
 
-		lpages = 1 + (base_gfn + npages - 1) /
-			     KVM_PAGES_PER_HPAGE(level);
-		lpages -= base_gfn / KVM_PAGES_PER_HPAGE(level);
+		lpages = 1 + ((base_gfn + npages - 1)
+			     >> KVM_HPAGE_GFN_SHIFT(level));
+		lpages -= base_gfn >> KVM_HPAGE_GFN_SHIFT(level);
 
 		new.lpage_info[i] = vmalloc(lpages * sizeof(*new.lpage_info[i]));
 
@@ -633,9 +641,9 @@ int __kvm_set_memory_region(struct kvm *kvm,
 		memset(new.lpage_info[i], 0,
 		       lpages * sizeof(*new.lpage_info[i]));
 
-		if (base_gfn % KVM_PAGES_PER_HPAGE(level))
+		if (base_gfn & (KVM_PAGES_PER_HPAGE(level) - 1))
 			new.lpage_info[i][0].write_count = 1;
-		if ((base_gfn+npages) % KVM_PAGES_PER_HPAGE(level))
+		if ((base_gfn+npages) & (KVM_PAGES_PER_HPAGE(level) - 1))
 			new.lpage_info[i][lpages - 1].write_count = 1;
 		ugfn = new.userspace_addr >> PAGE_SHIFT;
 		/*
 
 
 int is_error_page(struct page *page)
 int is_error_page(struct page *page)
 {
 {
-	return page == bad_page;
+	return page == bad_page || page == hwpoison_page || page == fault_page;
 }
 }
 EXPORT_SYMBOL_GPL(is_error_page);
 EXPORT_SYMBOL_GPL(is_error_page);
 
 
 int is_error_pfn(pfn_t pfn)
 int is_error_pfn(pfn_t pfn)
 {
 {
-	return pfn == bad_pfn;
+	return pfn == bad_pfn || pfn == hwpoison_pfn || pfn == fault_pfn;
 }
 }
 EXPORT_SYMBOL_GPL(is_error_pfn);
 EXPORT_SYMBOL_GPL(is_error_pfn);
 
 
+int is_hwpoison_pfn(pfn_t pfn)
+{
+	return pfn == hwpoison_pfn;
+}
+EXPORT_SYMBOL_GPL(is_hwpoison_pfn);
+
+int is_fault_pfn(pfn_t pfn)
+{
+	return pfn == fault_pfn;
+}
+EXPORT_SYMBOL_GPL(is_fault_pfn);
+
 static inline unsigned long bad_hva(void)
 static inline unsigned long bad_hva(void)
 {
 {
 	return PAGE_OFFSET;
 	return PAGE_OFFSET;
@@ -831,7 +851,7 @@ int kvm_is_error_hva(unsigned long addr)
 }
 }
 EXPORT_SYMBOL_GPL(kvm_is_error_hva);
 EXPORT_SYMBOL_GPL(kvm_is_error_hva);
 
 
-struct kvm_memory_slot *gfn_to_memslot_unaliased(struct kvm *kvm, gfn_t gfn)
+struct kvm_memory_slot *gfn_to_memslot(struct kvm *kvm, gfn_t gfn)
 {
 {
 	int i;
 	int i;
 	struct kvm_memslots *slots = kvm_memslots(kvm);
 	struct kvm_memslots *slots = kvm_memslots(kvm);
@@ -845,20 +865,13 @@ struct kvm_memory_slot *gfn_to_memslot_unaliased(struct kvm *kvm, gfn_t gfn)
 	}
 	}
 	return NULL;
 	return NULL;
 }
 }
-EXPORT_SYMBOL_GPL(gfn_to_memslot_unaliased);
-
-struct kvm_memory_slot *gfn_to_memslot(struct kvm *kvm, gfn_t gfn)
-{
-	gfn = unalias_gfn(kvm, gfn);
-	return gfn_to_memslot_unaliased(kvm, gfn);
-}
+EXPORT_SYMBOL_GPL(gfn_to_memslot);
 
 
 int kvm_is_visible_gfn(struct kvm *kvm, gfn_t gfn)
 int kvm_is_visible_gfn(struct kvm *kvm, gfn_t gfn)
 {
 {
 	int i;
 	int i;
 	struct kvm_memslots *slots = kvm_memslots(kvm);
 	struct kvm_memslots *slots = kvm_memslots(kvm);
 
 
-	gfn = unalias_gfn_instantiation(kvm, gfn);
 	for (i = 0; i < KVM_MEMORY_SLOTS; ++i) {
 	for (i = 0; i < KVM_MEMORY_SLOTS; ++i) {
 		struct kvm_memory_slot *memslot = &slots->memslots[i];
 		struct kvm_memory_slot *memslot = &slots->memslots[i];
 
 
@@ -903,7 +916,6 @@ int memslot_id(struct kvm *kvm, gfn_t gfn)
 	struct kvm_memslots *slots = kvm_memslots(kvm);
 	struct kvm_memslots *slots = kvm_memslots(kvm);
 	struct kvm_memory_slot *memslot = NULL;
 	struct kvm_memory_slot *memslot = NULL;
 
 
-	gfn = unalias_gfn(kvm, gfn);
 	for (i = 0; i < slots->nmemslots; ++i) {
 	for (i = 0; i < slots->nmemslots; ++i) {
 		memslot = &slots->memslots[i];
 		memslot = &slots->memslots[i];
 
 
@@ -924,8 +936,7 @@ unsigned long gfn_to_hva(struct kvm *kvm, gfn_t gfn)
 {
 {
 	struct kvm_memory_slot *slot;
 	struct kvm_memory_slot *slot;
 
 
-	gfn = unalias_gfn_instantiation(kvm, gfn);
-	slot = gfn_to_memslot_unaliased(kvm, gfn);
+	slot = gfn_to_memslot(kvm, gfn);
 	if (!slot || slot->flags & KVM_MEMSLOT_INVALID)
 	if (!slot || slot->flags & KVM_MEMSLOT_INVALID)
 		return bad_hva();
 		return bad_hva();
 	return gfn_to_hva_memslot(slot, gfn);
 	return gfn_to_hva_memslot(slot, gfn);
@@ -946,13 +957,19 @@ static pfn_t hva_to_pfn(struct kvm *kvm, unsigned long addr)
 		struct vm_area_struct *vma;
 		struct vm_area_struct *vma;
 
 
 		down_read(&current->mm->mmap_sem);
 		down_read(&current->mm->mmap_sem);
+		if (is_hwpoison_address(addr)) {
+			up_read(&current->mm->mmap_sem);
+			get_page(hwpoison_page);
+			return page_to_pfn(hwpoison_page);
+		}
+
 		vma = find_vma(current->mm, addr);
 		vma = find_vma(current->mm, addr);
 
 
 		if (vma == NULL || addr < vma->vm_start ||
 		if (vma == NULL || addr < vma->vm_start ||
 		    !(vma->vm_flags & VM_PFNMAP)) {
 		    !(vma->vm_flags & VM_PFNMAP)) {
 			up_read(&current->mm->mmap_sem);
 			up_read(&current->mm->mmap_sem);
-			get_page(bad_page);
-			return page_to_pfn(bad_page);
+			get_page(fault_page);
+			return page_to_pfn(fault_page);
 		}
 		}
 
 
 		pfn = ((addr - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
 		pfn = ((addr - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
@@ -1187,8 +1204,7 @@ void mark_page_dirty(struct kvm *kvm, gfn_t gfn)
 {
 {
 	struct kvm_memory_slot *memslot;
 	struct kvm_memory_slot *memslot;
 
 
-	gfn = unalias_gfn(kvm, gfn);
-	memslot = gfn_to_memslot_unaliased(kvm, gfn);
+	memslot = gfn_to_memslot(kvm, gfn);
 	if (memslot && memslot->dirty_bitmap) {
 	if (memslot && memslot->dirty_bitmap) {
 		unsigned long rel_gfn = gfn - memslot->base_gfn;
 		unsigned long rel_gfn = gfn - memslot->base_gfn;
 
 
@@ -1207,7 +1223,7 @@ void kvm_vcpu_block(struct kvm_vcpu *vcpu)
 		prepare_to_wait(&vcpu->wq, &wait, TASK_INTERRUPTIBLE);
 		prepare_to_wait(&vcpu->wq, &wait, TASK_INTERRUPTIBLE);
 
 
 		if (kvm_arch_vcpu_runnable(vcpu)) {
 		if (kvm_arch_vcpu_runnable(vcpu)) {
-			set_bit(KVM_REQ_UNHALT, &vcpu->requests);
+			kvm_make_request(KVM_REQ_UNHALT, vcpu);
 			break;
 			break;
 		}
 		}
 		if (kvm_cpu_has_pending_timer(vcpu))
 		if (kvm_cpu_has_pending_timer(vcpu))
@@ -1378,6 +1394,18 @@ static long kvm_vcpu_ioctl(struct file *filp,
 
 
 	if (vcpu->kvm->mm != current->mm)
 	if (vcpu->kvm->mm != current->mm)
 		return -EIO;
 		return -EIO;
+
+#if defined(CONFIG_S390) || defined(CONFIG_PPC)
+	/*
+	 * Special cases: vcpu ioctls that are asynchronous to vcpu execution,
+	 * so vcpu_load() would break it.
+	 */
+	if (ioctl == KVM_S390_INTERRUPT || ioctl == KVM_INTERRUPT)
+		return kvm_arch_vcpu_ioctl(filp, ioctl, arg);
+#endif
+
+
+	vcpu_load(vcpu);
 	switch (ioctl) {
 	switch (ioctl) {
 	case KVM_RUN:
 	case KVM_RUN:
 		r = -EINVAL;
 		r = -EINVAL;
@@ -1520,7 +1548,7 @@ out_free2:
 				goto out;
 				goto out;
 			p = &sigset;
 			p = &sigset;
 		}
 		}
-		r = kvm_vcpu_ioctl_set_sigmask(vcpu, &sigset);
+		r = kvm_vcpu_ioctl_set_sigmask(vcpu, p);
 		break;
 		break;
 	}
 	}
 	case KVM_GET_FPU: {
 	case KVM_GET_FPU: {
@@ -1555,6 +1583,7 @@ out_free2:
 		r = kvm_arch_vcpu_ioctl(filp, ioctl, arg);
 		r = kvm_arch_vcpu_ioctl(filp, ioctl, arg);
 	}
 	}
 out:
 out:
+	vcpu_put(vcpu);
 	kfree(fpu);
 	kfree(fpu);
 	kfree(kvm_sregs);
 	kfree(kvm_sregs);
 	return r;
 	return r;
@@ -2197,6 +2226,24 @@ int kvm_init(void *opaque, unsigned vcpu_size, unsigned vcpu_align,
 
 
 	bad_pfn = page_to_pfn(bad_page);
 	bad_pfn = page_to_pfn(bad_page);
 
 
+	hwpoison_page = alloc_page(GFP_KERNEL | __GFP_ZERO);
+
+	if (hwpoison_page == NULL) {
+		r = -ENOMEM;
+		goto out_free_0;
+	}
+
+	hwpoison_pfn = page_to_pfn(hwpoison_page);
+
+	fault_page = alloc_page(GFP_KERNEL | __GFP_ZERO);
+
+	if (fault_page == NULL) {
+		r = -ENOMEM;
+		goto out_free_0;
+	}
+
+	fault_pfn = page_to_pfn(fault_page);
+
 	if (!zalloc_cpumask_var(&cpus_hardware_enabled, GFP_KERNEL)) {
 	if (!zalloc_cpumask_var(&cpus_hardware_enabled, GFP_KERNEL)) {
 		r = -ENOMEM;
 		r = -ENOMEM;
 		goto out_free_0;
 		goto out_free_0;
@@ -2269,6 +2316,10 @@ out_free_1:
 out_free_0a:
 out_free_0a:
 	free_cpumask_var(cpus_hardware_enabled);
 	free_cpumask_var(cpus_hardware_enabled);
 out_free_0:
 out_free_0:
+	if (fault_page)
+		__free_page(fault_page);
+	if (hwpoison_page)
+		__free_page(hwpoison_page);
 	__free_page(bad_page);
 	__free_page(bad_page);
 out:
 out:
 	kvm_arch_exit();
 	kvm_arch_exit();
@@ -2290,6 +2341,7 @@ void kvm_exit(void)
 	kvm_arch_hardware_unsetup();
 	kvm_arch_hardware_unsetup();
 	kvm_arch_exit();
 	kvm_arch_exit();
 	free_cpumask_var(cpus_hardware_enabled);
 	free_cpumask_var(cpus_hardware_enabled);
+	__free_page(hwpoison_page);
 	__free_page(bad_page);
 	__free_page(bad_page);
 }
 }
 EXPORT_SYMBOL_GPL(kvm_exit);
 EXPORT_SYMBOL_GPL(kvm_exit);