
Merge tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm

Pull first set of KVM updates from Paolo Bonzini:
 "PPC:
   - minor code cleanups

  x86:
   - PCID emulation and CR3 caching for shadow page tables
   - nested VMX live migration
   - nested VMCS shadowing
   - optimized IPI hypercall
   - some optimizations

  ARM will come next week"

* tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm: (85 commits)
  kvm: x86: Set highest physical address bits in non-present/reserved SPTEs
  KVM/x86: Use CC_SET()/CC_OUT in arch/x86/kvm/vmx.c
  KVM: X86: Implement PV IPIs in linux guest
  KVM: X86: Add kvm hypervisor init time platform setup callback
  KVM: X86: Implement "send IPI" hypercall
  KVM/x86: Move X86_CR4_OSXSAVE check into kvm_valid_sregs()
  KVM: x86: Skip pae_root shadow allocation if tdp enabled
  KVM/MMU: Combine flushing remote tlb in mmu_set_spte()
  KVM: vmx: skip VMWRITE of HOST_{FS,GS}_BASE when possible
  KVM: vmx: skip VMWRITE of HOST_{FS,GS}_SEL when possible
  KVM: vmx: always initialize HOST_{FS,GS}_BASE to zero during setup
  KVM: vmx: move struct host_state usage to struct loaded_vmcs
  KVM: vmx: compute need to reload FS/GS/LDT on demand
  KVM: nVMX: remove a misleading comment regarding vmcs02 fields
  KVM: vmx: rename __vmx_load_host_state() and vmx_save_host_state()
  KVM: vmx: add dedicated utility to access guest's kernel_gs_base
  KVM: vmx: track host_state.loaded using a loaded_vmcs pointer
  KVM: vmx: refactor segmentation code in vmx_save_host_state()
  kvm: nVMX: Fix fault priority for VMX operations
  kvm: nVMX: Fix fault vector for VMX operation at CPL > 0
  ...
Linus Torvalds committed 7 years ago · commit e61cf2e3a5
53 changed files with 3110 additions and 726 deletions
  1. Documentation/virtual/kvm/api.txt (+56, -0)
  2. Documentation/virtual/kvm/cpuid.txt (+4, -0)
  3. Documentation/virtual/kvm/hypercalls.txt (+20, -0)
  4. arch/powerpc/include/asm/kvm_book3s.h (+47, -0)
  5. arch/powerpc/include/asm/kvm_host.h (+16, -10)
  6. arch/powerpc/include/asm/reg.h (+1, -1)
  7. arch/powerpc/kvm/book3s_64_vio.c (+2, -3)
  8. arch/powerpc/kvm/book3s_hv.c (+29, -13)
  9. arch/powerpc/kvm/book3s_xive.c (+12, -7)
  10. arch/powerpc/kvm/emulate_loadstore.c (+3, -4)
  11. arch/powerpc/kvm/powerpc.c (+15, -15)
  12. arch/s390/include/asm/kvm_host.h (+4, -7)
  13. arch/s390/include/uapi/asm/kvm.h (+4, -1)
  14. arch/s390/kvm/kvm-s390.c (+180, -121)
  15. arch/s390/kvm/priv.c (+24, -16)
  16. arch/s390/kvm/vsie.c (+9, -2)
  17. arch/s390/tools/gen_facilities.c (+2, -1)
  18. arch/x86/hyperv/Makefile (+1, -1)
  19. arch/x86/hyperv/nested.c (+56, -0)
  20. arch/x86/include/asm/hyperv-tlfs.h (+8, -0)
  21. arch/x86/include/asm/kvm_host.h (+52, -4)
  22. arch/x86/include/asm/mshyperv.h (+2, -0)
  23. arch/x86/include/asm/trace/hyperv.h (+14, -0)
  24. arch/x86/include/uapi/asm/kvm.h (+37, -0)
  25. arch/x86/include/uapi/asm/kvm_para.h (+1, -0)
  26. arch/x86/kernel/kvm.c (+111, -1)
  27. arch/x86/kvm/cpuid.c (+2, -1)
  28. arch/x86/kvm/emulate.c (+1, -1)
  29. arch/x86/kvm/hyperv.c (+20, -7)
  30. arch/x86/kvm/hyperv.h (+1, -1)
  31. arch/x86/kvm/lapic.c (+40, -0)
  32. arch/x86/kvm/mmu.c (+442, -89)
  33. arch/x86/kvm/mmu.h (+23, -1)
  34. arch/x86/kvm/paging_tmpl.h (+16, -12)
  35. arch/x86/kvm/svm.c (+8, -4)
  36. arch/x86/kvm/vmx.c (+911, -209)
  37. arch/x86/kvm/x86.c (+91, -17)
  38. include/linux/kvm_host.h (+19, -5)
  39. include/uapi/linux/kvm.h (+4, -0)
  40. include/uapi/linux/kvm_para.h (+2, -0)
  41. tools/testing/selftests/kvm/.gitignore (+2, -0)
  42. tools/testing/selftests/kvm/Makefile (+2, -0)
  43. tools/testing/selftests/kvm/cr4_cpuid_sync_test.c (+129, -0)
  44. tools/testing/selftests/kvm/include/kvm_util.h (+3, -1)
  45. tools/testing/selftests/kvm/include/vmx.h (+62, -4)
  46. tools/testing/selftests/kvm/include/x86.h (+6, -2)
  47. tools/testing/selftests/kvm/lib/kvm_util.c (+73, -21)
  48. tools/testing/selftests/kvm/lib/kvm_util_internal.h (+6, -1)
  49. tools/testing/selftests/kvm/lib/vmx.c (+72, -32)
  50. tools/testing/selftests/kvm/lib/x86.c (+215, -41)
  51. tools/testing/selftests/kvm/state_test.c (+218, -0)
  52. tools/testing/selftests/kvm/vmx_tsc_adjust_test.c (+13, -56)
  53. virt/kvm/kvm_main.c (+19, -14)

+ 56 - 0
Documentation/virtual/kvm/api.txt

@@ -3561,6 +3561,62 @@ Returns: 0 on success,
 	-ENOENT on deassign if the conn_id isn't registered
 	-EEXIST on assign if the conn_id is already registered
 
+4.114 KVM_GET_NESTED_STATE
+
+Capability: KVM_CAP_NESTED_STATE
+Architectures: x86
+Type: vcpu ioctl
+Parameters: struct kvm_nested_state (in/out)
+Returns: 0 on success, -1 on error
+Errors:
+  E2BIG:     the total state size (including the fixed-size part of struct
+             kvm_nested_state) exceeds the value of 'size' specified by
+             the user; the size required will be written into size.
+
+struct kvm_nested_state {
+	__u16 flags;
+	__u16 format;
+	__u32 size;
+	union {
+		struct kvm_vmx_nested_state vmx;
+		struct kvm_svm_nested_state svm;
+		__u8 pad[120];
+	};
+	__u8 data[0];
+};
+
+#define KVM_STATE_NESTED_GUEST_MODE	0x00000001
+#define KVM_STATE_NESTED_RUN_PENDING	0x00000002
+
+#define KVM_STATE_NESTED_SMM_GUEST_MODE	0x00000001
+#define KVM_STATE_NESTED_SMM_VMXON	0x00000002
+
+struct kvm_vmx_nested_state {
+	__u64 vmxon_pa;
+	__u64 vmcs_pa;
+
+	struct {
+		__u16 flags;
+	} smm;
+};
+
+This ioctl copies the vcpu's nested virtualization state from the kernel to
+userspace.
+
+The maximum size of the state, including the fixed-size part of struct
+kvm_nested_state, can be retrieved by passing KVM_CAP_NESTED_STATE to
+the KVM_CHECK_EXTENSION ioctl().
+
+4.115 KVM_SET_NESTED_STATE
+
+Capability: KVM_CAP_NESTED_STATE
+Architectures: x86
+Type: vcpu ioctl
+Parameters: struct kvm_nested_state (in)
+Returns: 0 on success, -1 on error
+
+This copies the vcpu's kvm_nested_state struct from userspace to the kernel.  For
+the definition of struct kvm_nested_state, see KVM_GET_NESTED_STATE.
 
 5. The kvm_run structure
 ------------------------
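
As a rough illustration of how the two ioctls above fit together during live
migration, the sketch below sizes the buffer with
KVM_CHECK_EXTENSION(KVM_CAP_NESTED_STATE), reads the state on the source vCPU
and feeds it back on the destination.  The kvm_fd/vcpu_fd plumbing and the
abbreviated error handling are assumptions for the example, not part of the
patch itself.

/*
 * Hedged userspace sketch: save nested state from one vCPU and restore it
 * on another.  Error handling is intentionally minimal.
 */
#include <stdlib.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>

static struct kvm_nested_state *save_nested_state(int kvm_fd, int vcpu_fd)
{
	/* Maximum size (fixed header + data), as reported by the capability. */
	int size = ioctl(kvm_fd, KVM_CHECK_EXTENSION, KVM_CAP_NESTED_STATE);
	struct kvm_nested_state *state;

	if (size <= 0)
		return NULL;			/* nested state not supported */

	state = calloc(1, size);
	if (!state)
		return NULL;
	state->size = size;			/* tell KVM how big the buffer is */
	if (ioctl(vcpu_fd, KVM_GET_NESTED_STATE, state) < 0) {
		free(state);			/* on E2BIG, size holds the needed length */
		return NULL;
	}
	return state;
}

static int restore_nested_state(int vcpu_fd, struct kvm_nested_state *state)
{
	/* The same buffer is fed back on the destination vCPU. */
	return ioctl(vcpu_fd, KVM_SET_NESTED_STATE, state);
}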

+ 4 - 0
Documentation/virtual/kvm/cpuid.txt

@@ -62,6 +62,10 @@ KVM_FEATURE_ASYNC_PF_VMEXIT        ||    10 || paravirtualized async PF VM exit
                                    ||       || can be enabled by setting bit 2
                                    ||       || when writing to msr 0x4b564d02
 ------------------------------------------------------------------------------
+KVM_FEATURE_PV_SEND_IPI            ||    11 || guest checks this feature bit
+                                   ||       || before using paravirtualized
+                                   ||       || send IPIs.
+------------------------------------------------------------------------------
 KVM_FEATURE_CLOCKSOURCE_STABLE_BIT ||    24 || host will warn if no guest-side
                                    ||       || per-cpu warps are expected in
                                    ||       || kvmclock.
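
For reference, a guest can read this bit from the KVM features leaf (function
0x40000001) before using the paravirtualized IPI path; in-kernel guests would
normally use kvm_para_has_feature() instead.  A minimal userspace-style sketch,
with the constants spelled out and the KVM signature check assumed to have
already happened:

#include <cpuid.h>			/* __cpuid() helper from GCC/Clang */
#include <stdbool.h>

#define KVM_CPUID_FEATURES	0x40000001
#define KVM_FEATURE_PV_SEND_IPI	11

static bool guest_has_pv_send_ipi(void)
{
	unsigned int eax, ebx, ecx, edx;

	/* Assumes the KVM signature leaf (0x40000000) was already verified. */
	__cpuid(KVM_CPUID_FEATURES, eax, ebx, ecx, edx);
	return eax & (1u << KVM_FEATURE_PV_SEND_IPI);
}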

+ 20 - 0
Documentation/virtual/kvm/hypercalls.txt

@@ -121,3 +121,23 @@ compute the CLOCK_REALTIME for its clock, at the same instant.
 
 Returns KVM_EOPNOTSUPP if the host does not use TSC clocksource,
 or if clock type is different than KVM_CLOCK_PAIRING_WALLCLOCK.
+
+6. KVM_HC_SEND_IPI
+------------------------
+Architecture: x86
+Status: active
+Purpose: Send IPIs to multiple vCPUs.
+
+a0: lower part of the bitmap of destination APIC IDs
+a1: higher part of the bitmap of destination APIC IDs
+a2: the lowest APIC ID in bitmap
+a3: APIC ICR
+
+The hypercall lets a guest send multicast IPIs, with at most 128
+destinations per hypercall in 64-bit mode and 64 vCPUs per
+hypercall in 32-bit mode.  The destinations are represented by a
+bitmap contained in the first two arguments (a0 and a1). Bit 0 of
+a0 corresponds to the APIC ID in the third argument (a2), bit 1
+corresponds to the APIC ID a2+1, and so on.
+
+Returns the number of CPUs to which the IPIs were delivered successfully.
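
To make the argument layout concrete, here is a hedged guest-side sketch that
packs a set of destination APIC IDs into a0-a3 as described above.  It assumes
a 64-bit guest, uses kvm_hypercall4() from asm/kvm_para.h, and leaves the
fallback for out-of-range IDs to the caller; it is not the kernel's actual
implementation.

#include <linux/kvm_para.h>	/* KVM_HC_SEND_IPI, kvm_hypercall4() */

static long pv_send_ipi(const unsigned int *apic_ids, int n, unsigned int icr)
{
	unsigned long bitmap_lo = 0, bitmap_hi = 0;
	unsigned int min_id = apic_ids[0];
	int i;

	/* a2 is the lowest APIC ID covered by the bitmap. */
	for (i = 1; i < n; i++)
		if (apic_ids[i] < min_id)
			min_id = apic_ids[i];

	/* Bit k of a0 (or of a1 for k >= 64) stands for APIC ID min_id + k. */
	for (i = 0; i < n; i++) {
		unsigned int off = apic_ids[i] - min_id;

		if (off < 64)
			bitmap_lo |= 1UL << off;
		else if (off < 128)
			bitmap_hi |= 1UL << (off - 64);
		else
			return -1;	/* out of range for a single hypercall */
	}

	/* Returns the number of CPUs the IPI was delivered to. */
	return kvm_hypercall4(KVM_HC_SEND_IPI, bitmap_lo, bitmap_hi,
			      min_id, icr);
}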

+ 47 - 0
arch/powerpc/include/asm/kvm_book3s.h

@@ -390,4 +390,51 @@ extern int kvmppc_h_logical_ci_store(struct kvm_vcpu *vcpu);
 #define SPLIT_HACK_MASK			0xff000000
 #define SPLIT_HACK_OFFS			0xfb000000
 
+/*
+ * This packs a VCPU ID from the [0..KVM_MAX_VCPU_ID) space down to the
+ * [0..KVM_MAX_VCPUS) space, using knowledge of the guest's core stride
+ * (but not its actual threading mode, which is not available) to avoid
+ * collisions.
+ *
+ * The implementation leaves VCPU IDs from the range [0..KVM_MAX_VCPUS) (block
+ * 0) unchanged: if the guest is filling each VCORE completely then it will be
+ * using consecutive IDs and it will fill the space without any packing.
+ *
+ * For higher VCPU IDs, the packed ID is based on the VCPU ID modulo
+ * KVM_MAX_VCPUS (effectively masking off the top bits) and then an offset is
+ * added to avoid collisions.
+ *
+ * VCPU IDs in the range [KVM_MAX_VCPUS..(KVM_MAX_VCPUS*2)) (block 1) are only
+ * possible if the guest is leaving at least 1/2 of each VCORE empty, so IDs
+ * can be safely packed into the second half of each VCORE by adding an offset
+ * of (stride / 2).
+ *
+ * Similarly, if VCPU IDs in the range [(KVM_MAX_VCPUS*2)..(KVM_MAX_VCPUS*4))
+ * (blocks 2 and 3) are seen, the guest must be leaving at least 3/4 of each
+ * VCORE empty so packed IDs can be offset by (stride / 4) and (stride * 3 / 4).
+ *
+ * Finally, VCPU IDs from blocks 5..7 will only be seen if the guest is using a
+ * stride of 8 and 1 thread per core so the remaining offsets of 1, 5, 3 and 7
+ * must be free to use.
+ *
+ * (The offsets for each block are stored in block_offsets[], indexed by the
+ * block number if the stride is 8. For cases where the guest's stride is less
+ * than 8, we can re-use the block_offsets array by multiplying the block
+ * number by (MAX_SMT_THREADS / stride) to reach the correct entry.)
+ */
+static inline u32 kvmppc_pack_vcpu_id(struct kvm *kvm, u32 id)
+{
+	const int block_offsets[MAX_SMT_THREADS] = {0, 4, 2, 6, 1, 5, 3, 7};
+	int stride = kvm->arch.emul_smt_mode;
+	int block = (id / KVM_MAX_VCPUS) * (MAX_SMT_THREADS / stride);
+	u32 packed_id;
+
+	if (WARN_ONCE(block >= MAX_SMT_THREADS, "VCPU ID too large to pack"))
+		return 0;
+	packed_id = (id % KVM_MAX_VCPUS) + block_offsets[block];
+	if (WARN_ONCE(packed_id >= KVM_MAX_VCPUS, "VCPU ID packing failed"))
+		return 0;
+	return packed_id;
+}
+
 #endif /* __ASM_KVM_BOOK3S_H__ */
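
A small standalone illustration of the packing arithmetic above; MAX_SMT_THREADS
is 8 as on these machines, while the KVM_MAX_VCPUS value below is only a
stand-in for the real (configuration-dependent) constant.

#include <stdio.h>

#define MAX_SMT_THREADS	8
#define KVM_MAX_VCPUS	2048		/* illustrative value only */

static unsigned int pack_vcpu_id(unsigned int id, unsigned int stride)
{
	const int block_offsets[MAX_SMT_THREADS] = {0, 4, 2, 6, 1, 5, 3, 7};
	int block = (id / KVM_MAX_VCPUS) * (MAX_SMT_THREADS / stride);

	return (id % KVM_MAX_VCPUS) + block_offsets[block];
}

int main(void)
{
	/* Block 0 is left unchanged. */
	printf("%u\n", pack_vcpu_id(7, 8));			/* -> 7 */
	/* Block 1, stride 8: offset block_offsets[1] = 4 = stride / 2. */
	printf("%u\n", pack_vcpu_id(KVM_MAX_VCPUS, 8));		/* -> 4 */
	/* Block 2, stride 4: offset block_offsets[4] = 1 = stride / 4. */
	printf("%u\n", pack_vcpu_id(2 * KVM_MAX_VCPUS + 8, 4));	/* -> 9 */
	return 0;
}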

+ 16 - 10
arch/powerpc/include/asm/kvm_host.h

@@ -42,7 +42,14 @@
 #define KVM_USER_MEM_SLOTS	512
 
 #include <asm/cputhreads.h>
-#define KVM_MAX_VCPU_ID                (threads_per_subcore * KVM_MAX_VCORES)
+
+#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
+#include <asm/kvm_book3s_asm.h>		/* for MAX_SMT_THREADS */
+#define KVM_MAX_VCPU_ID		(MAX_SMT_THREADS * KVM_MAX_VCORES)
+
+#else
+#define KVM_MAX_VCPU_ID		KVM_MAX_VCPUS
+#endif /* CONFIG_KVM_BOOK3S_HV_POSSIBLE */
 
 #define __KVM_HAVE_ARCH_INTC_INITIALIZED
 
@@ -672,7 +679,7 @@ struct kvm_vcpu_arch {
 	gva_t vaddr_accessed;
 	pgd_t *pgdir;
 
-	u8 io_gpr; /* GPR used as IO source/target */
+	u16 io_gpr; /* GPR used as IO source/target */
 	u8 mmio_host_swabbed;
 	u8 mmio_sign_extend;
 	/* conversion between single and double precision */
@@ -688,7 +695,6 @@ struct kvm_vcpu_arch {
 	 */
 	u8 mmio_vsx_copy_nums;
 	u8 mmio_vsx_offset;
-	u8 mmio_vsx_tx_sx_enabled;
 	u8 mmio_vmx_copy_nums;
 	u8 mmio_vmx_offset;
 	u8 mmio_copy_type;
@@ -801,14 +807,14 @@ struct kvm_vcpu_arch {
 #define KVMPPC_VCPU_BUSY_IN_HOST	2
 
 /* Values for vcpu->arch.io_gpr */
-#define KVM_MMIO_REG_MASK	0x001f
-#define KVM_MMIO_REG_EXT_MASK	0xffe0
+#define KVM_MMIO_REG_MASK	0x003f
+#define KVM_MMIO_REG_EXT_MASK	0xffc0
 #define KVM_MMIO_REG_GPR	0x0000
-#define KVM_MMIO_REG_FPR	0x0020
-#define KVM_MMIO_REG_QPR	0x0040
-#define KVM_MMIO_REG_FQPR	0x0060
-#define KVM_MMIO_REG_VSX	0x0080
-#define KVM_MMIO_REG_VMX	0x00c0
+#define KVM_MMIO_REG_FPR	0x0040
+#define KVM_MMIO_REG_QPR	0x0080
+#define KVM_MMIO_REG_FQPR	0x00c0
+#define KVM_MMIO_REG_VSX	0x0100
+#define KVM_MMIO_REG_VMX	0x0180
 
 #define __KVM_HAVE_ARCH_WQP
 #define __KVM_HAVE_CREATE_DEVICE

+ 1 - 1
arch/powerpc/include/asm/reg.h

@@ -163,7 +163,7 @@
 #define PSSCR_ESL		0x00200000 /* Enable State Loss */
 #define PSSCR_SD		0x00400000 /* Status Disable */
 #define PSSCR_PLS	0xf000000000000000 /* Power-saving Level Status */
-#define PSSCR_GUEST_VIS	0xf0000000000003ff /* Guest-visible PSSCR fields */
+#define PSSCR_GUEST_VIS	0xf0000000000003ffUL /* Guest-visible PSSCR fields */
 #define PSSCR_FAKE_SUSPEND	0x00000400 /* Fake-suspend bit (P9 DD2.2) */
 #define PSSCR_FAKE_SUSPEND_LG	10	   /* Fake-suspend bit position */
 

+ 2 - 3
arch/powerpc/kvm/book3s_64_vio.c

@@ -179,7 +179,7 @@ extern long kvm_spapr_tce_attach_iommu_group(struct kvm *kvm, int tablefd,
 		if ((tbltmp->it_page_shift <= stt->page_shift) &&
 				(tbltmp->it_offset << tbltmp->it_page_shift ==
 				 stt->offset << stt->page_shift) &&
-				(tbltmp->it_size << tbltmp->it_page_shift ==
+				(tbltmp->it_size << tbltmp->it_page_shift >=
 				 stt->size << stt->page_shift)) {
 			/*
 			 * Reference the table to avoid races with
@@ -295,7 +295,7 @@ long kvm_vm_ioctl_create_spapr_tce(struct kvm *kvm,
 {
 	struct kvmppc_spapr_tce_table *stt = NULL;
 	struct kvmppc_spapr_tce_table *siter;
-	unsigned long npages, size;
+	unsigned long npages, size = args->size;
 	int ret = -ENOMEM;
 	int i;
 
@@ -303,7 +303,6 @@ long kvm_vm_ioctl_create_spapr_tce(struct kvm *kvm,
 		(args->offset + args->size > (ULLONG_MAX >> args->page_shift)))
 		return -EINVAL;
 
-	size = _ALIGN_UP(args->size, PAGE_SIZE >> 3);
 	npages = kvmppc_tce_pages(size);
 	ret = kvmppc_account_memlimit(kvmppc_stt_pages(npages), true);
 	if (ret)

+ 29 - 13
arch/powerpc/kvm/book3s_hv.c

@@ -127,14 +127,14 @@ static int kvmppc_hv_setup_htab_rma(struct kvm_vcpu *vcpu);
  * and SPURR count and should be set according to the number of
  * online threads in the vcore being run.
  */
-#define RWMR_RPA_P8_1THREAD	0x164520C62609AECA
-#define RWMR_RPA_P8_2THREAD	0x7FFF2908450D8DA9
-#define RWMR_RPA_P8_3THREAD	0x164520C62609AECA
-#define RWMR_RPA_P8_4THREAD	0x199A421245058DA9
-#define RWMR_RPA_P8_5THREAD	0x164520C62609AECA
-#define RWMR_RPA_P8_6THREAD	0x164520C62609AECA
-#define RWMR_RPA_P8_7THREAD	0x164520C62609AECA
-#define RWMR_RPA_P8_8THREAD	0x164520C62609AECA
+#define RWMR_RPA_P8_1THREAD	0x164520C62609AECAUL
+#define RWMR_RPA_P8_2THREAD	0x7FFF2908450D8DA9UL
+#define RWMR_RPA_P8_3THREAD	0x164520C62609AECAUL
+#define RWMR_RPA_P8_4THREAD	0x199A421245058DA9UL
+#define RWMR_RPA_P8_5THREAD	0x164520C62609AECAUL
+#define RWMR_RPA_P8_6THREAD	0x164520C62609AECAUL
+#define RWMR_RPA_P8_7THREAD	0x164520C62609AECAUL
+#define RWMR_RPA_P8_8THREAD	0x164520C62609AECAUL
 
 static unsigned long p8_rwmr_values[MAX_SMT_THREADS + 1] = {
 	RWMR_RPA_P8_1THREAD,
@@ -1807,7 +1807,7 @@ static int threads_per_vcore(struct kvm *kvm)
 	return threads_per_subcore;
 }
 
-static struct kvmppc_vcore *kvmppc_vcore_create(struct kvm *kvm, int core)
+static struct kvmppc_vcore *kvmppc_vcore_create(struct kvm *kvm, int id)
 {
 	struct kvmppc_vcore *vcore;
 
@@ -1821,7 +1821,7 @@ static struct kvmppc_vcore *kvmppc_vcore_create(struct kvm *kvm, int core)
 	init_swait_queue_head(&vcore->wq);
 	vcore->preempt_tb = TB_NIL;
 	vcore->lpcr = kvm->arch.lpcr;
-	vcore->first_vcpuid = core * kvm->arch.smt_mode;
+	vcore->first_vcpuid = id;
 	vcore->kvm = kvm;
 	INIT_LIST_HEAD(&vcore->preempt_list);
 
@@ -2037,12 +2037,26 @@ static struct kvm_vcpu *kvmppc_core_vcpu_create_hv(struct kvm *kvm,
 	mutex_lock(&kvm->lock);
 	vcore = NULL;
 	err = -EINVAL;
-	core = id / kvm->arch.smt_mode;
+	if (cpu_has_feature(CPU_FTR_ARCH_300)) {
+		if (id >= (KVM_MAX_VCPUS * kvm->arch.emul_smt_mode)) {
+			pr_devel("KVM: VCPU ID too high\n");
+			core = KVM_MAX_VCORES;
+		} else {
+			BUG_ON(kvm->arch.smt_mode != 1);
+			core = kvmppc_pack_vcpu_id(kvm, id);
+		}
+	} else {
+		core = id / kvm->arch.smt_mode;
+	}
 	if (core < KVM_MAX_VCORES) {
 		vcore = kvm->arch.vcores[core];
-		if (!vcore) {
+		if (vcore && cpu_has_feature(CPU_FTR_ARCH_300)) {
+			pr_devel("KVM: collision on id %u", id);
+			vcore = NULL;
+		} else if (!vcore) {
 			err = -ENOMEM;
-			vcore = kvmppc_vcore_create(kvm, core);
+			vcore = kvmppc_vcore_create(kvm,
+					id & ~(kvm->arch.smt_mode - 1));
 			kvm->arch.vcores[core] = vcore;
 			kvm->arch.online_vcores++;
 		}
@@ -4550,6 +4564,8 @@ static int kvmppc_book3s_init_hv(void)
 			pr_err("KVM-HV: Cannot determine method for accessing XICS\n");
 			return -ENODEV;
 		}
+		/* presence of intc confirmed - node can be dropped again */
+		of_node_put(np);
 	}
 #endif
 

+ 12 - 7
arch/powerpc/kvm/book3s_xive.c

@@ -317,6 +317,11 @@ static int xive_select_target(struct kvm *kvm, u32 *server, u8 prio)
 	return -EBUSY;
 }
 
+static u32 xive_vp(struct kvmppc_xive *xive, u32 server)
+{
+	return xive->vp_base + kvmppc_pack_vcpu_id(xive->kvm, server);
+}
+
 static u8 xive_lock_and_mask(struct kvmppc_xive *xive,
 			     struct kvmppc_xive_src_block *sb,
 			     struct kvmppc_xive_irq_state *state)
@@ -362,7 +367,7 @@ static u8 xive_lock_and_mask(struct kvmppc_xive *xive,
 	 */
 	if (xd->flags & OPAL_XIVE_IRQ_MASK_VIA_FW) {
 		xive_native_configure_irq(hw_num,
-					  xive->vp_base + state->act_server,
+					  xive_vp(xive, state->act_server),
 					  MASKED, state->number);
 		/* set old_p so we can track if an H_EOI was done */
 		state->old_p = true;
@@ -418,7 +423,7 @@ static void xive_finish_unmask(struct kvmppc_xive *xive,
 	 */
 	if (xd->flags & OPAL_XIVE_IRQ_MASK_VIA_FW) {
 		xive_native_configure_irq(hw_num,
-					  xive->vp_base + state->act_server,
+					  xive_vp(xive, state->act_server),
 					  state->act_priority, state->number);
 		/* If an EOI is needed, do it here */
 		if (!state->old_p)
@@ -495,7 +500,7 @@ static int xive_target_interrupt(struct kvm *kvm,
 	kvmppc_xive_select_irq(state, &hw_num, NULL);
 
 	return xive_native_configure_irq(hw_num,
-					 xive->vp_base + server,
+					 xive_vp(xive, server),
 					 prio, state->number);
 }
 
@@ -883,7 +888,7 @@ int kvmppc_xive_set_mapped(struct kvm *kvm, unsigned long guest_irq,
 	 * which is fine for a never started interrupt.
 	 */
 	xive_native_configure_irq(hw_irq,
-				  xive->vp_base + state->act_server,
+				  xive_vp(xive, state->act_server),
 				  state->act_priority, state->number);
 
 	/*
@@ -959,7 +964,7 @@ int kvmppc_xive_clr_mapped(struct kvm *kvm, unsigned long guest_irq,
 
 	/* Reconfigure the IPI */
 	xive_native_configure_irq(state->ipi_number,
-				  xive->vp_base + state->act_server,
+				  xive_vp(xive, state->act_server),
 				  state->act_priority, state->number);
 
 	/*
@@ -1084,7 +1089,7 @@ int kvmppc_xive_connect_vcpu(struct kvm_device *dev,
 		pr_devel("Duplicate !\n");
 		return -EEXIST;
 	}
-	if (cpu >= KVM_MAX_VCPUS) {
+	if (cpu >= (KVM_MAX_VCPUS * vcpu->kvm->arch.emul_smt_mode)) {
 		pr_devel("Out of bounds !\n");
 		return -EINVAL;
 	}
@@ -1098,7 +1103,7 @@ int kvmppc_xive_connect_vcpu(struct kvm_device *dev,
 	xc->xive = xive;
 	xc->vcpu = vcpu;
 	xc->server_num = cpu;
-	xc->vp_id = xive->vp_base + cpu;
+	xc->vp_id = xive_vp(xive, cpu);
 	xc->mfrr = 0xff;
 	xc->valid = true;
 

+ 3 - 4
arch/powerpc/kvm/emulate_loadstore.c

@@ -106,7 +106,6 @@ int kvmppc_emulate_loadstore(struct kvm_vcpu *vcpu)
 	 * if mmio_vsx_tx_sx_enabled == 1, copy data between
 	 * VSR[32..63] and memory
 	 */
-	vcpu->arch.mmio_vsx_tx_sx_enabled = get_tx_or_sx(inst);
 	vcpu->arch.mmio_vsx_copy_nums = 0;
 	vcpu->arch.mmio_vsx_offset = 0;
 	vcpu->arch.mmio_copy_type = KVMPPC_VSX_COPY_NONE;
@@ -242,8 +241,8 @@ int kvmppc_emulate_loadstore(struct kvm_vcpu *vcpu)
 			}
 
 			emulated = kvmppc_handle_vsx_load(run, vcpu,
-					KVM_MMIO_REG_VSX | (op.reg & 0x1f),
-					io_size_each, 1, op.type & SIGNEXT);
+					KVM_MMIO_REG_VSX|op.reg, io_size_each,
+					1, op.type & SIGNEXT);
 			break;
 		}
 #endif
@@ -363,7 +362,7 @@ int kvmppc_emulate_loadstore(struct kvm_vcpu *vcpu)
 			}
 
 			emulated = kvmppc_handle_vsx_store(run, vcpu,
-					op.reg & 0x1f, io_size_each, 1);
+					op.reg, io_size_each, 1);
 			break;
 		}
 #endif

+ 15 - 15
arch/powerpc/kvm/powerpc.c

@@ -879,10 +879,10 @@ static inline void kvmppc_set_vsr_dword(struct kvm_vcpu *vcpu,
 	if (offset == -1)
 		return;
 
-	if (vcpu->arch.mmio_vsx_tx_sx_enabled) {
-		val.vval = VCPU_VSX_VR(vcpu, index);
+	if (index >= 32) {
+		val.vval = VCPU_VSX_VR(vcpu, index - 32);
 		val.vsxval[offset] = gpr;
-		VCPU_VSX_VR(vcpu, index) = val.vval;
+		VCPU_VSX_VR(vcpu, index - 32) = val.vval;
 	} else {
 		VCPU_VSX_FPR(vcpu, index, offset) = gpr;
 	}
@@ -894,11 +894,11 @@ static inline void kvmppc_set_vsr_dword_dump(struct kvm_vcpu *vcpu,
 	union kvmppc_one_reg val;
 	int index = vcpu->arch.io_gpr & KVM_MMIO_REG_MASK;
 
-	if (vcpu->arch.mmio_vsx_tx_sx_enabled) {
-		val.vval = VCPU_VSX_VR(vcpu, index);
+	if (index >= 32) {
+		val.vval = VCPU_VSX_VR(vcpu, index - 32);
 		val.vsxval[0] = gpr;
 		val.vsxval[1] = gpr;
-		VCPU_VSX_VR(vcpu, index) = val.vval;
+		VCPU_VSX_VR(vcpu, index - 32) = val.vval;
 	} else {
 		VCPU_VSX_FPR(vcpu, index, 0) = gpr;
 		VCPU_VSX_FPR(vcpu, index, 1) = gpr;
@@ -911,12 +911,12 @@ static inline void kvmppc_set_vsr_word_dump(struct kvm_vcpu *vcpu,
 	union kvmppc_one_reg val;
 	int index = vcpu->arch.io_gpr & KVM_MMIO_REG_MASK;
 
-	if (vcpu->arch.mmio_vsx_tx_sx_enabled) {
+	if (index >= 32) {
 		val.vsx32val[0] = gpr;
 		val.vsx32val[1] = gpr;
 		val.vsx32val[2] = gpr;
 		val.vsx32val[3] = gpr;
-		VCPU_VSX_VR(vcpu, index) = val.vval;
+		VCPU_VSX_VR(vcpu, index - 32) = val.vval;
 	} else {
 		val.vsx32val[0] = gpr;
 		val.vsx32val[1] = gpr;
@@ -936,10 +936,10 @@ static inline void kvmppc_set_vsr_word(struct kvm_vcpu *vcpu,
 	if (offset == -1)
 		return;
 
-	if (vcpu->arch.mmio_vsx_tx_sx_enabled) {
-		val.vval = VCPU_VSX_VR(vcpu, index);
+	if (index >= 32) {
+		val.vval = VCPU_VSX_VR(vcpu, index - 32);
 		val.vsx32val[offset] = gpr32;
-		VCPU_VSX_VR(vcpu, index) = val.vval;
+		VCPU_VSX_VR(vcpu, index - 32) = val.vval;
 	} else {
 		dword_offset = offset / 2;
 		word_offset = offset % 2;
@@ -1360,10 +1360,10 @@ static inline int kvmppc_get_vsr_data(struct kvm_vcpu *vcpu, int rs, u64 *val)
 			break;
 		}
 
-		if (!vcpu->arch.mmio_vsx_tx_sx_enabled) {
+		if (rs < 32) {
 			*val = VCPU_VSX_FPR(vcpu, rs, vsx_offset);
 		} else {
-			reg.vval = VCPU_VSX_VR(vcpu, rs);
+			reg.vval = VCPU_VSX_VR(vcpu, rs - 32);
 			*val = reg.vsxval[vsx_offset];
 		}
 		break;
@@ -1377,13 +1377,13 @@ static inline int kvmppc_get_vsr_data(struct kvm_vcpu *vcpu, int rs, u64 *val)
 			break;
 		}
 
-		if (!vcpu->arch.mmio_vsx_tx_sx_enabled) {
+		if (rs < 32) {
 			dword_offset = vsx_offset / 2;
 			word_offset = vsx_offset % 2;
 			reg.vsxval[0] = VCPU_VSX_FPR(vcpu, rs, dword_offset);
 			*val = reg.vsx32val[word_offset];
 		} else {
-			reg.vval = VCPU_VSX_VR(vcpu, rs);
+			reg.vval = VCPU_VSX_VR(vcpu, rs - 32);
 			*val = reg.vsx32val[vsx_offset];
 		}
 		break;

+ 4 - 7
arch/s390/include/asm/kvm_host.h

@@ -269,6 +269,7 @@ struct kvm_s390_sie_block {
 	__u8	reserved1c0[8];		/* 0x01c0 */
 #define ECD_HOSTREGMGMT	0x20000000
 #define ECD_MEF		0x08000000
+#define ECD_ETOKENF	0x02000000
 	__u32	ecd;			/* 0x01c8 */
 	__u8	reserved1cc[18];	/* 0x01cc */
 	__u64	pp;			/* 0x01de */
@@ -655,6 +656,7 @@ struct kvm_vcpu_arch {
 	seqcount_t cputm_seqcount;
 	__u64 cputm_start;
 	bool gs_enabled;
+	bool skey_enabled;
 };
 
 struct kvm_vm_stat {
@@ -793,12 +795,6 @@ struct kvm_s390_vsie {
 	struct page *pages[KVM_MAX_VCPUS];
 };
 
-struct kvm_s390_migration_state {
-	unsigned long bitmap_size;	/* in bits (number of guest pages) */
-	atomic64_t dirty_pages;		/* number of dirty pages */
-	unsigned long *pgste_bitmap;
-};
-
 struct kvm_arch{
 	void *sca;
 	int use_esca;
@@ -828,7 +824,8 @@ struct kvm_arch{
 	struct kvm_s390_vsie vsie;
 	u8 epdx;
 	u64 epoch;
-	struct kvm_s390_migration_state *migration_state;
+	int migration_mode;
+	atomic64_t cmma_dirty_pages;
 	/* subset of available cpu features enabled by user space */
 	DECLARE_BITMAP(cpu_feat, KVM_S390_VM_CPU_FEAT_NR_BITS);
 	struct kvm_s390_gisa *gisa;

+ 4 - 1
arch/s390/include/uapi/asm/kvm.h

@@ -4,7 +4,7 @@
 /*
  * KVM s390 specific structures and definitions
  *
- * Copyright IBM Corp. 2008
+ * Copyright IBM Corp. 2008, 2018
  *
  *    Author(s): Carsten Otte <cotte@de.ibm.com>
  *               Christian Borntraeger <borntraeger@de.ibm.com>
@@ -225,6 +225,7 @@ struct kvm_guest_debug_arch {
 #define KVM_SYNC_FPRS   (1UL << 8)
 #define KVM_SYNC_GSCB   (1UL << 9)
 #define KVM_SYNC_BPBC   (1UL << 10)
+#define KVM_SYNC_ETOKEN (1UL << 11)
 /* length and alignment of the sdnx as a power of two */
 #define SDNXC 8
 #define SDNXL (1UL << SDNXC)
@@ -258,6 +259,8 @@ struct kvm_sync_regs {
 		struct {
 			__u64 reserved1[2];
 			__u64 gscb[4];
+			__u64 etoken;
+			__u64 etoken_extension;
 		};
 	};
 };

+ 180 - 121
arch/s390/kvm/kvm-s390.c

@@ -906,54 +906,37 @@ static void kvm_s390_sync_request_broadcast(struct kvm *kvm, int req)
  */
 static int kvm_s390_vm_start_migration(struct kvm *kvm)
 {
-	struct kvm_s390_migration_state *mgs;
 	struct kvm_memory_slot *ms;
-	/* should be the only one */
 	struct kvm_memslots *slots;
-	unsigned long ram_pages;
+	unsigned long ram_pages = 0;
 	int slotnr;
 
 	/* migration mode already enabled */
-	if (kvm->arch.migration_state)
+	if (kvm->arch.migration_mode)
 		return 0;
-
 	slots = kvm_memslots(kvm);
 	if (!slots || !slots->used_slots)
 		return -EINVAL;
 
-	mgs = kzalloc(sizeof(*mgs), GFP_KERNEL);
-	if (!mgs)
-		return -ENOMEM;
-	kvm->arch.migration_state = mgs;
-
-	if (kvm->arch.use_cmma) {
+	if (!kvm->arch.use_cmma) {
+		kvm->arch.migration_mode = 1;
+		return 0;
+	}
+	/* mark all the pages in active slots as dirty */
+	for (slotnr = 0; slotnr < slots->used_slots; slotnr++) {
+		ms = slots->memslots + slotnr;
 		/*
-		 * Get the first slot. They are reverse sorted by base_gfn, so
-		 * the first slot is also the one at the end of the address
-		 * space. We have verified above that at least one slot is
-		 * present.
+		 * The second half of the bitmap is only used on x86,
+		 * and would be wasted otherwise, so we put it to good
+		 * use here to keep track of the state of the storage
+		 * attributes.
 		 */
-		ms = slots->memslots;
-		/* round up so we only use full longs */
-		ram_pages = roundup(ms->base_gfn + ms->npages, BITS_PER_LONG);
-		/* allocate enough bytes to store all the bits */
-		mgs->pgste_bitmap = vmalloc(ram_pages / 8);
-		if (!mgs->pgste_bitmap) {
-			kfree(mgs);
-			kvm->arch.migration_state = NULL;
-			return -ENOMEM;
-		}
-
-		mgs->bitmap_size = ram_pages;
-		atomic64_set(&mgs->dirty_pages, ram_pages);
-		/* mark all the pages in active slots as dirty */
-		for (slotnr = 0; slotnr < slots->used_slots; slotnr++) {
-			ms = slots->memslots + slotnr;
-			bitmap_set(mgs->pgste_bitmap, ms->base_gfn, ms->npages);
-		}
-
-		kvm_s390_sync_request_broadcast(kvm, KVM_REQ_START_MIGRATION);
+		memset(kvm_second_dirty_bitmap(ms), 0xff, kvm_dirty_bitmap_bytes(ms));
+		ram_pages += ms->npages;
 	}
+	atomic64_set(&kvm->arch.cmma_dirty_pages, ram_pages);
+	kvm->arch.migration_mode = 1;
+	kvm_s390_sync_request_broadcast(kvm, KVM_REQ_START_MIGRATION);
 	return 0;
 }
 
@@ -963,21 +946,12 @@ static int kvm_s390_vm_start_migration(struct kvm *kvm)
  */
 static int kvm_s390_vm_stop_migration(struct kvm *kvm)
 {
-	struct kvm_s390_migration_state *mgs;
-
 	/* migration mode already disabled */
-	if (!kvm->arch.migration_state)
+	if (!kvm->arch.migration_mode)
 		return 0;
-	mgs = kvm->arch.migration_state;
-	kvm->arch.migration_state = NULL;
-
-	if (kvm->arch.use_cmma) {
+	kvm->arch.migration_mode = 0;
+	if (kvm->arch.use_cmma)
 		kvm_s390_sync_request_broadcast(kvm, KVM_REQ_STOP_MIGRATION);
-		/* We have to wait for the essa emulation to finish */
-		synchronize_srcu(&kvm->srcu);
-		vfree(mgs->pgste_bitmap);
-	}
-	kfree(mgs);
 	return 0;
 }
 
@@ -1005,7 +979,7 @@ static int kvm_s390_vm_set_migration(struct kvm *kvm,
 static int kvm_s390_vm_get_migration(struct kvm *kvm,
 				     struct kvm_device_attr *attr)
 {
-	u64 mig = (kvm->arch.migration_state != NULL);
+	u64 mig = kvm->arch.migration_mode;
 
 	if (attr->attr != KVM_S390_VM_MIGRATION_STATUS)
 		return -ENXIO;
@@ -1652,6 +1626,134 @@ out:
 /* for consistency */
 #define KVM_S390_CMMA_SIZE_MAX ((u32)KVM_S390_SKEYS_MAX)
 
+/*
+ * Similar to gfn_to_memslot, but returns the index of a memslot also when the
+ * address falls in a hole. In that case the index of one of the memslots
+ * bordering the hole is returned.
+ */
+static int gfn_to_memslot_approx(struct kvm_memslots *slots, gfn_t gfn)
+{
+	int start = 0, end = slots->used_slots;
+	int slot = atomic_read(&slots->lru_slot);
+	struct kvm_memory_slot *memslots = slots->memslots;
+
+	if (gfn >= memslots[slot].base_gfn &&
+	    gfn < memslots[slot].base_gfn + memslots[slot].npages)
+		return slot;
+
+	while (start < end) {
+		slot = start + (end - start) / 2;
+
+		if (gfn >= memslots[slot].base_gfn)
+			end = slot;
+		else
+			start = slot + 1;
+	}
+
+	if (gfn >= memslots[start].base_gfn &&
+	    gfn < memslots[start].base_gfn + memslots[start].npages) {
+		atomic_set(&slots->lru_slot, start);
+	}
+
+	return start;
+}
+
+static int kvm_s390_peek_cmma(struct kvm *kvm, struct kvm_s390_cmma_log *args,
+			      u8 *res, unsigned long bufsize)
+{
+	unsigned long pgstev, hva, cur_gfn = args->start_gfn;
+
+	args->count = 0;
+	while (args->count < bufsize) {
+		hva = gfn_to_hva(kvm, cur_gfn);
+		/*
+		 * We return an error if the first value was invalid, but we
+		 * return successfully if at least one value was copied.
+		 */
+		if (kvm_is_error_hva(hva))
+			return args->count ? 0 : -EFAULT;
+		if (get_pgste(kvm->mm, hva, &pgstev) < 0)
+			pgstev = 0;
+		res[args->count++] = (pgstev >> 24) & 0x43;
+		cur_gfn++;
+	}
+
+	return 0;
+}
+
+static unsigned long kvm_s390_next_dirty_cmma(struct kvm_memslots *slots,
+					      unsigned long cur_gfn)
+{
+	int slotidx = gfn_to_memslot_approx(slots, cur_gfn);
+	struct kvm_memory_slot *ms = slots->memslots + slotidx;
+	unsigned long ofs = cur_gfn - ms->base_gfn;
+
+	if (ms->base_gfn + ms->npages <= cur_gfn) {
+		slotidx--;
+		/* If we are above the highest slot, wrap around */
+		if (slotidx < 0)
+			slotidx = slots->used_slots - 1;
+
+		ms = slots->memslots + slotidx;
+		ofs = 0;
+	}
+	ofs = find_next_bit(kvm_second_dirty_bitmap(ms), ms->npages, ofs);
+	while ((slotidx > 0) && (ofs >= ms->npages)) {
+		slotidx--;
+		ms = slots->memslots + slotidx;
+		ofs = find_next_bit(kvm_second_dirty_bitmap(ms), ms->npages, 0);
+	}
+	return ms->base_gfn + ofs;
+}
+
+static int kvm_s390_get_cmma(struct kvm *kvm, struct kvm_s390_cmma_log *args,
+			     u8 *res, unsigned long bufsize)
+{
+	unsigned long mem_end, cur_gfn, next_gfn, hva, pgstev;
+	struct kvm_memslots *slots = kvm_memslots(kvm);
+	struct kvm_memory_slot *ms;
+
+	cur_gfn = kvm_s390_next_dirty_cmma(slots, args->start_gfn);
+	ms = gfn_to_memslot(kvm, cur_gfn);
+	args->count = 0;
+	args->start_gfn = cur_gfn;
+	if (!ms)
+		return 0;
+	next_gfn = kvm_s390_next_dirty_cmma(slots, cur_gfn + 1);
+	mem_end = slots->memslots[0].base_gfn + slots->memslots[0].npages;
+
+	while (args->count < bufsize) {
+		hva = gfn_to_hva(kvm, cur_gfn);
+		if (kvm_is_error_hva(hva))
+			return 0;
+		/* Decrement only if we actually flipped the bit to 0 */
+		if (test_and_clear_bit(cur_gfn - ms->base_gfn, kvm_second_dirty_bitmap(ms)))
+			atomic64_dec(&kvm->arch.cmma_dirty_pages);
+		if (get_pgste(kvm->mm, hva, &pgstev) < 0)
+			pgstev = 0;
+		/* Save the value */
+		res[args->count++] = (pgstev >> 24) & 0x43;
+		/* If the next bit is too far away, stop. */
+		if (next_gfn > cur_gfn + KVM_S390_MAX_BIT_DISTANCE)
+			return 0;
+		/* If we reached the previous "next", find the next one */
+		if (cur_gfn == next_gfn)
+			next_gfn = kvm_s390_next_dirty_cmma(slots, cur_gfn + 1);
+		/* Reached the end of memory or of the buffer, stop */
+		if ((next_gfn >= mem_end) ||
+		    (next_gfn - args->start_gfn >= bufsize))
+			return 0;
+		cur_gfn++;
+		/* Reached the end of the current memslot, take the next one. */
+		if (cur_gfn - ms->base_gfn >= ms->npages) {
+			ms = gfn_to_memslot(kvm, cur_gfn);
+			if (!ms)
+				return 0;
+		}
+	}
+	return 0;
+}
+
 /*
  * This function searches for the next page with dirty CMMA attributes, and
  * saves the attributes in the buffer up to either the end of the buffer or
@@ -1663,22 +1765,18 @@ out:
 static int kvm_s390_get_cmma_bits(struct kvm *kvm,
 				  struct kvm_s390_cmma_log *args)
 {
-	struct kvm_s390_migration_state *s = kvm->arch.migration_state;
-	unsigned long bufsize, hva, pgstev, i, next, cur;
-	int srcu_idx, peek, r = 0, rr;
-	u8 *res;
-
-	cur = args->start_gfn;
-	i = next = pgstev = 0;
+	unsigned long bufsize;
+	int srcu_idx, peek, ret;
+	u8 *values;
 
-	if (unlikely(!kvm->arch.use_cmma))
+	if (!kvm->arch.use_cmma)
 		return -ENXIO;
 	/* Invalid/unsupported flags were specified */
 	if (args->flags & ~KVM_S390_CMMA_PEEK)
 		return -EINVAL;
 	/* Migration mode query, and we are not doing a migration */
 	peek = !!(args->flags & KVM_S390_CMMA_PEEK);
-	if (!peek && !s)
+	if (!peek && !kvm->arch.migration_mode)
 		return -EINVAL;
 	/* CMMA is disabled or was not used, or the buffer has length zero */
 	bufsize = min(args->count, KVM_S390_CMMA_SIZE_MAX);
@@ -1686,74 +1784,35 @@ static int kvm_s390_get_cmma_bits(struct kvm *kvm,
 		memset(args, 0, sizeof(*args));
 		return 0;
 	}
-
-	if (!peek) {
-		/* We are not peeking, and there are no dirty pages */
-		if (!atomic64_read(&s->dirty_pages)) {
-			memset(args, 0, sizeof(*args));
-			return 0;
-		}
-		cur = find_next_bit(s->pgste_bitmap, s->bitmap_size,
-				    args->start_gfn);
-		if (cur >= s->bitmap_size)	/* nothing found, loop back */
-			cur = find_next_bit(s->pgste_bitmap, s->bitmap_size, 0);
-		if (cur >= s->bitmap_size) {	/* again! (very unlikely) */
-			memset(args, 0, sizeof(*args));
-			return 0;
-		}
-		next = find_next_bit(s->pgste_bitmap, s->bitmap_size, cur + 1);
+	/* We are not peeking, and there are no dirty pages */
+	if (!peek && !atomic64_read(&kvm->arch.cmma_dirty_pages)) {
+		memset(args, 0, sizeof(*args));
+		return 0;
 	}
 
-	res = vmalloc(bufsize);
-	if (!res)
+	values = vmalloc(bufsize);
+	if (!values)
 		return -ENOMEM;
 
-	args->start_gfn = cur;
-
 	down_read(&kvm->mm->mmap_sem);
 	srcu_idx = srcu_read_lock(&kvm->srcu);
-	while (i < bufsize) {
-		hva = gfn_to_hva(kvm, cur);
-		if (kvm_is_error_hva(hva)) {
-			r = -EFAULT;
-			break;
-		}
-		/* decrement only if we actually flipped the bit to 0 */
-		if (!peek && test_and_clear_bit(cur, s->pgste_bitmap))
-			atomic64_dec(&s->dirty_pages);
-		r = get_pgste(kvm->mm, hva, &pgstev);
-		if (r < 0)
-			pgstev = 0;
-		/* save the value */
-		res[i++] = (pgstev >> 24) & 0x43;
-		/*
-		 * if the next bit is too far away, stop.
-		 * if we reached the previous "next", find the next one
-		 */
-		if (!peek) {
-			if (next > cur + KVM_S390_MAX_BIT_DISTANCE)
-				break;
-			if (cur == next)
-				next = find_next_bit(s->pgste_bitmap,
-						     s->bitmap_size, cur + 1);
-		/* reached the end of the bitmap or of the buffer, stop */
-			if ((next >= s->bitmap_size) ||
-			    (next >= args->start_gfn + bufsize))
-				break;
-		}
-		cur++;
-	}
+	if (peek)
+		ret = kvm_s390_peek_cmma(kvm, args, values, bufsize);
+	else
+		ret = kvm_s390_get_cmma(kvm, args, values, bufsize);
 	srcu_read_unlock(&kvm->srcu, srcu_idx);
 	up_read(&kvm->mm->mmap_sem);
-	args->count = i;
-	args->remaining = s ? atomic64_read(&s->dirty_pages) : 0;
 
-	rr = copy_to_user((void __user *)args->values, res, args->count);
-	if (rr)
-		r = -EFAULT;
+	if (kvm->arch.migration_mode)
+		args->remaining = atomic64_read(&kvm->arch.cmma_dirty_pages);
+	else
+		args->remaining = 0;
 
-	vfree(res);
-	return r;
+	if (copy_to_user((void __user *)args->values, values, args->count))
+		ret = -EFAULT;
+
+	vfree(values);
+	return ret;
 }
 
 /*
@@ -2192,10 +2251,6 @@ void kvm_arch_destroy_vm(struct kvm *kvm)
 	kvm_s390_destroy_adapters(kvm);
 	kvm_s390_clear_float_irqs(kvm);
 	kvm_s390_vsie_destroy(kvm);
-	if (kvm->arch.migration_state) {
-		vfree(kvm->arch.migration_state->pgste_bitmap);
-		kfree(kvm->arch.migration_state);
-	}
 	KVM_EVENT(3, "vm 0x%pK destroyed", kvm);
 }
 
@@ -2353,6 +2408,8 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu)
 		vcpu->run->kvm_valid_regs |= KVM_SYNC_BPBC;
 	if (test_kvm_facility(vcpu->kvm, 133))
 		vcpu->run->kvm_valid_regs |= KVM_SYNC_GSCB;
+	if (test_kvm_facility(vcpu->kvm, 156))
+		vcpu->run->kvm_valid_regs |= KVM_SYNC_ETOKEN;
 	/* fprs can be synchronized via vrs, even if the guest has no vx. With
 	 * MACHINE_HAS_VX, (load|store)_fpu_regs() will work with vrs format.
 	 */
@@ -2602,7 +2659,8 @@ int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu)
 	}
 	if (test_kvm_facility(vcpu->kvm, 139))
 		vcpu->arch.sie_block->ecd |= ECD_MEF;
-
+	if (test_kvm_facility(vcpu->kvm, 156))
+		vcpu->arch.sie_block->ecd |= ECD_ETOKENF;
 	if (vcpu->arch.sie_block->gd) {
 		vcpu->arch.sie_block->eca |= ECA_AIV;
 		VCPU_EVENT(vcpu, 3, "AIV gisa format-%u enabled for cpu %03u",
@@ -3520,6 +3578,7 @@ static void sync_regs(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
 		}
 		preempt_enable();
 	}
+	/* SIE will load etoken directly from SDNX and therefore kvm_run */
 
 	kvm_run->kvm_dirty_regs = 0;
 }
@@ -3559,7 +3618,7 @@ static void store_regs(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
 			__ctl_clear_bit(2, 4);
 		vcpu->arch.host_gscb = NULL;
 	}
-
+	/* SIE will save etoken directly into SDNX and therefore kvm_run */
 }
 
 int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)

+ 24 - 16
arch/s390/kvm/priv.c

@@ -205,13 +205,10 @@ static int handle_store_cpu_address(struct kvm_vcpu *vcpu)
 int kvm_s390_skey_check_enable(struct kvm_vcpu *vcpu)
 {
 	int rc;
-	struct kvm_s390_sie_block *sie_block = vcpu->arch.sie_block;
 
 	trace_kvm_s390_skey_related_inst(vcpu);
 	/* Already enabled? */
-	if (vcpu->kvm->arch.use_skf &&
-	    !(sie_block->ictl & (ICTL_ISKE | ICTL_SSKE | ICTL_RRBE)) &&
-	    !kvm_s390_test_cpuflags(vcpu, CPUSTAT_KSS))
+	if (vcpu->arch.skey_enabled)
 		return 0;
 
 	rc = s390_enable_skey();
@@ -222,9 +219,10 @@ int kvm_s390_skey_check_enable(struct kvm_vcpu *vcpu)
 	if (kvm_s390_test_cpuflags(vcpu, CPUSTAT_KSS))
 		kvm_s390_clear_cpuflags(vcpu, CPUSTAT_KSS);
 	if (!vcpu->kvm->arch.use_skf)
-		sie_block->ictl |= ICTL_ISKE | ICTL_SSKE | ICTL_RRBE;
+		vcpu->arch.sie_block->ictl |= ICTL_ISKE | ICTL_SSKE | ICTL_RRBE;
 	else
-		sie_block->ictl &= ~(ICTL_ISKE | ICTL_SSKE | ICTL_RRBE);
+		vcpu->arch.sie_block->ictl &= ~(ICTL_ISKE | ICTL_SSKE | ICTL_RRBE);
+	vcpu->arch.skey_enabled = true;
 	return 0;
 }
 
@@ -987,7 +985,7 @@ static int handle_pfmf(struct kvm_vcpu *vcpu)
 			return kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING);
 
 		if (vcpu->run->s.regs.gprs[reg1] & PFMF_CF) {
-			if (clear_user((void __user *)vmaddr, PAGE_SIZE))
+			if (kvm_clear_guest(vcpu->kvm, start, PAGE_SIZE))
 				return kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING);
 		}
 
@@ -1024,9 +1022,11 @@ static int handle_pfmf(struct kvm_vcpu *vcpu)
 	return 0;
 }
 
-static inline int do_essa(struct kvm_vcpu *vcpu, const int orc)
+/*
+ * Must be called with relevant read locks held (kvm->mm->mmap_sem, kvm->srcu)
+ */
+static inline int __do_essa(struct kvm_vcpu *vcpu, const int orc)
 {
-	struct kvm_s390_migration_state *ms = vcpu->kvm->arch.migration_state;
 	int r1, r2, nappended, entries;
 	unsigned long gfn, hva, res, pgstev, ptev;
 	unsigned long *cbrlo;
@@ -1076,10 +1076,12 @@ static inline int do_essa(struct kvm_vcpu *vcpu, const int orc)
 		cbrlo[entries] = gfn << PAGE_SHIFT;
 	}
 
-	if (orc && gfn < ms->bitmap_size) {
-		/* increment only if we are really flipping the bit to 1 */
-		if (!test_and_set_bit(gfn, ms->pgste_bitmap))
-			atomic64_inc(&ms->dirty_pages);
+	if (orc) {
+		struct kvm_memory_slot *ms = gfn_to_memslot(vcpu->kvm, gfn);
+
+		/* Increment only if we are really flipping the bit */
+		if (ms && !test_and_set_bit(gfn - ms->base_gfn, kvm_second_dirty_bitmap(ms)))
+			atomic64_inc(&vcpu->kvm->arch.cmma_dirty_pages);
 	}
 
 	return nappended;
@@ -1108,7 +1110,7 @@ static int handle_essa(struct kvm_vcpu *vcpu)
 						: ESSA_SET_STABLE_IF_RESIDENT))
 		return kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION);
 
-	if (likely(!vcpu->kvm->arch.migration_state)) {
+	if (!vcpu->kvm->arch.migration_mode) {
 		/*
 		 * CMMA is enabled in the KVM settings, but is disabled in
 		 * the SIE block and in the mm_context, and we are not doing
@@ -1136,10 +1138,16 @@ static int handle_essa(struct kvm_vcpu *vcpu)
 		/* Retry the ESSA instruction */
 		kvm_s390_retry_instr(vcpu);
 	} else {
-		/* Account for the possible extra cbrl entry */
-		i = do_essa(vcpu, orc);
+		int srcu_idx;
+
+		down_read(&vcpu->kvm->mm->mmap_sem);
+		srcu_idx = srcu_read_lock(&vcpu->kvm->srcu);
+		i = __do_essa(vcpu, orc);
+		srcu_read_unlock(&vcpu->kvm->srcu, srcu_idx);
+		up_read(&vcpu->kvm->mm->mmap_sem);
 		if (i < 0)
 			return i;
+		/* Account for the possible extra cbrl entry */
 		entries += i;
 	}
 	vcpu->arch.sie_block->cbrlo &= PAGE_MASK;	/* reset nceo */

+ 9 - 2
arch/s390/kvm/vsie.c

@@ -2,7 +2,7 @@
 /*
  * kvm nested virtualization support for s390x
  *
- * Copyright IBM Corp. 2016
+ * Copyright IBM Corp. 2016, 2018
  *
  *    Author(s): David Hildenbrand <dahi@linux.vnet.ibm.com>
  */
@@ -378,6 +378,10 @@ static int shadow_scb(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page)
 	if (test_kvm_facility(vcpu->kvm, 139))
 		scb_s->ecd |= scb_o->ecd & ECD_MEF;
 
+	/* etoken */
+	if (test_kvm_facility(vcpu->kvm, 156))
+		scb_s->ecd |= scb_o->ecd & ECD_ETOKENF;
+
 	prepare_ibc(vcpu, vsie_page);
 	rc = shadow_crycb(vcpu, vsie_page);
 out:
@@ -627,7 +631,8 @@ static int pin_blocks(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page)
 		vsie_page->riccbd_gpa = gpa;
 		scb_s->riccbd = hpa;
 	}
-	if ((scb_s->ecb & ECB_GS) && !(scb_s->ecd & ECD_HOSTREGMGMT)) {
+	if (((scb_s->ecb & ECB_GS) && !(scb_s->ecd & ECD_HOSTREGMGMT)) ||
+	    (scb_s->ecd & ECD_ETOKENF)) {
 		unsigned long sdnxc;
 
 		gpa = READ_ONCE(scb_o->sdnxo) & ~0xfUL;
@@ -818,6 +823,8 @@ static int handle_stfle(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page)
  *          - < 0 if an error occurred
  */
 static int do_vsie_run(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page)
+	__releases(vcpu->kvm->srcu)
+	__acquires(vcpu->kvm->srcu)
 {
 	struct kvm_s390_sie_block *scb_s = &vsie_page->scb_s;
 	struct kvm_s390_sie_block *scb_o = vsie_page->scb_o;

+ 2 - 1
arch/s390/tools/gen_facilities.c

@@ -4,7 +4,7 @@
  * numbering scheme from the Princples of Operations: most significant bit
  * has bit number 0.
  *
- *    Copyright IBM Corp. 2015
+ *    Copyright IBM Corp. 2015, 2018
  *
  */
 
@@ -106,6 +106,7 @@ static struct facility_def facility_defs[] = {
 
 		.name = "FACILITIES_KVM_CPUMODEL",
 		.bits = (int[]){
+			156, /* etoken facility */
 			-1  /* END */
 		}
 	},

+ 1 - 1
arch/x86/hyperv/Makefile

@@ -1,2 +1,2 @@
-obj-y			:= hv_init.o mmu.o
+obj-y			:= hv_init.o mmu.o nested.o
 obj-$(CONFIG_X86_64)	+= hv_apic.o

+ 56 - 0
arch/x86/hyperv/nested.c

@@ -0,0 +1,56 @@
+// SPDX-License-Identifier: GPL-2.0
+
+/*
+ * Hyper-V nested virtualization code.
+ *
+ * Copyright (C) 2018, Microsoft, Inc.
+ *
+ * Author : Lan Tianyu <Tianyu.Lan@microsoft.com>
+ */
+
+
+#include <linux/types.h>
+#include <asm/hyperv-tlfs.h>
+#include <asm/mshyperv.h>
+#include <asm/tlbflush.h>
+
+#include <asm/trace/hyperv.h>
+
+int hyperv_flush_guest_mapping(u64 as)
+{
+	struct hv_guest_mapping_flush **flush_pcpu;
+	struct hv_guest_mapping_flush *flush;
+	u64 status;
+	unsigned long flags;
+	int ret = -ENOTSUPP;
+
+	if (!hv_hypercall_pg)
+		goto fault;
+
+	local_irq_save(flags);
+
+	flush_pcpu = (struct hv_guest_mapping_flush **)
+		this_cpu_ptr(hyperv_pcpu_input_arg);
+
+	flush = *flush_pcpu;
+
+	if (unlikely(!flush)) {
+		local_irq_restore(flags);
+		goto fault;
+	}
+
+	flush->address_space = as;
+	flush->flags = 0;
+
+	status = hv_do_hypercall(HVCALL_FLUSH_GUEST_PHYSICAL_ADDRESS_SPACE,
+				 flush, NULL);
+	local_irq_restore(flags);
+
+	if (!(status & HV_HYPERCALL_RESULT_MASK))
+		ret = 0;
+
+fault:
+	trace_hyperv_nested_flush_guest_mapping(as, ret);
+	return ret;
+}
+EXPORT_SYMBOL_GPL(hyperv_flush_guest_mapping);

+ 8 - 0
arch/x86/include/asm/hyperv-tlfs.h

@@ -310,6 +310,7 @@ struct ms_hyperv_tsc_page {
 #define HV_X64_MSR_REENLIGHTENMENT_CONTROL	0x40000106
 
 /* Nested features (CPUID 0x4000000A) EAX */
+#define HV_X64_NESTED_GUEST_MAPPING_FLUSH	BIT(18)
 #define HV_X64_NESTED_MSR_BITMAP		BIT(19)
 
 struct hv_reenlightenment_control {
@@ -351,6 +352,7 @@ struct hv_tsc_emulation_status {
 #define HVCALL_SEND_IPI_EX			0x0015
 #define HVCALL_POST_MESSAGE			0x005c
 #define HVCALL_SIGNAL_EVENT			0x005d
+#define HVCALL_FLUSH_GUEST_PHYSICAL_ADDRESS_SPACE 0x00af
 
 #define HV_X64_MSR_VP_ASSIST_PAGE_ENABLE	0x00000001
 #define HV_X64_MSR_VP_ASSIST_PAGE_ADDRESS_SHIFT	12
@@ -742,6 +744,12 @@ struct ipi_arg_ex {
 	struct hv_vpset vp_set;
 };
 
+/* HvFlushGuestPhysicalAddressSpace hypercalls */
+struct hv_guest_mapping_flush {
+	u64 address_space;
+	u64 flags;
+};
+
 /* HvFlushVirtualAddressSpace, HvFlushVirtualAddressList hypercalls */
 struct hv_tlb_flush {
 	u64 address_space;

+ 52 - 4
arch/x86/include/asm/kvm_host.h

@@ -55,6 +55,7 @@
 #define KVM_REQ_TRIPLE_FAULT		KVM_ARCH_REQ(2)
 #define KVM_REQ_MMU_SYNC		KVM_ARCH_REQ(3)
 #define KVM_REQ_CLOCK_UPDATE		KVM_ARCH_REQ(4)
+#define KVM_REQ_LOAD_CR3		KVM_ARCH_REQ(5)
 #define KVM_REQ_EVENT			KVM_ARCH_REQ(6)
 #define KVM_REQ_APF_HALT		KVM_ARCH_REQ(7)
 #define KVM_REQ_STEAL_UPDATE		KVM_ARCH_REQ(8)
@@ -76,13 +77,13 @@
 #define KVM_REQ_HV_EXIT			KVM_ARCH_REQ(21)
 #define KVM_REQ_HV_STIMER		KVM_ARCH_REQ(22)
 #define KVM_REQ_LOAD_EOI_EXITMAP	KVM_ARCH_REQ(23)
+#define KVM_REQ_GET_VMCS12_PAGES	KVM_ARCH_REQ(24)
 
 #define CR0_RESERVED_BITS                                               \
 	(~(unsigned long)(X86_CR0_PE | X86_CR0_MP | X86_CR0_EM | X86_CR0_TS \
 			  | X86_CR0_ET | X86_CR0_NE | X86_CR0_WP | X86_CR0_AM \
 			  | X86_CR0_NW | X86_CR0_CD | X86_CR0_PG))
 
-#define CR3_PCID_INVD		 BIT_64(63)
 #define CR4_RESERVED_BITS                                               \
 	(~(unsigned long)(X86_CR4_VME | X86_CR4_PVI | X86_CR4_TSD | X86_CR4_DE\
 			  | X86_CR4_PSE | X86_CR4_PAE | X86_CR4_MCE     \
@@ -326,6 +327,16 @@ struct rsvd_bits_validate {
 	u64 bad_mt_xwr;
 };
 
+struct kvm_mmu_root_info {
+	gpa_t cr3;
+	hpa_t hpa;
+};
+
+#define KVM_MMU_ROOT_INFO_INVALID \
+	((struct kvm_mmu_root_info) { .cr3 = INVALID_PAGE, .hpa = INVALID_PAGE })
+
+#define KVM_MMU_NUM_PREV_ROOTS 3
+
 /*
  * x86 supports 4 paging modes (5-level 64-bit, 4-level 64-bit, 3-level 32-bit,
  * and 2-level 32-bit).  The kvm_mmu structure abstracts the details of the
@@ -345,7 +356,7 @@ struct kvm_mmu {
 			       struct x86_exception *exception);
 	int (*sync_page)(struct kvm_vcpu *vcpu,
 			 struct kvm_mmu_page *sp);
-	void (*invlpg)(struct kvm_vcpu *vcpu, gva_t gva);
+	void (*invlpg)(struct kvm_vcpu *vcpu, gva_t gva, hpa_t root_hpa);
 	void (*update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
 			   u64 *spte, const void *pte);
 	hpa_t root_hpa;
@@ -354,6 +365,7 @@ struct kvm_mmu {
 	u8 shadow_root_level;
 	u8 ept_ad;
 	bool direct_map;
+	struct kvm_mmu_root_info prev_roots[KVM_MMU_NUM_PREV_ROOTS];
 
 	/*
 	 * Bitmap; bit set = permission fault
@@ -978,6 +990,15 @@ struct kvm_x86_ops {
 	void (*set_rflags)(struct kvm_vcpu *vcpu, unsigned long rflags);
 
 	void (*tlb_flush)(struct kvm_vcpu *vcpu, bool invalidate_gpa);
+	int  (*tlb_remote_flush)(struct kvm *kvm);
+
+	/*
+	 * Flush any TLB entries associated with the given GVA.
+	 * Does not need to flush GPA->HPA mappings.
+	 * Can potentially get non-canonical addresses through INVLPGs, which
+	 * the implementation may choose to ignore if appropriate.
+	 */
+	void (*tlb_flush_gva)(struct kvm_vcpu *vcpu, gva_t addr);
 
 	void (*run)(struct kvm_vcpu *vcpu);
 	int (*handle_exit)(struct kvm_vcpu *vcpu);
@@ -1090,6 +1111,14 @@ struct kvm_x86_ops {
 
 	void (*setup_mce)(struct kvm_vcpu *vcpu);
 
+	int (*get_nested_state)(struct kvm_vcpu *vcpu,
+				struct kvm_nested_state __user *user_kvm_nested_state,
+				unsigned user_data_size);
+	int (*set_nested_state)(struct kvm_vcpu *vcpu,
+				struct kvm_nested_state __user *user_kvm_nested_state,
+				struct kvm_nested_state *kvm_state);
+	void (*get_vmcs12_pages)(struct kvm_vcpu *vcpu);
+
 	int (*smi_allowed)(struct kvm_vcpu *vcpu);
 	int (*pre_enter_smm)(struct kvm_vcpu *vcpu, char *smstate);
 	int (*pre_leave_smm)(struct kvm_vcpu *vcpu, u64 smbase);
@@ -1122,6 +1151,16 @@ static inline void kvm_arch_free_vm(struct kvm *kvm)
 	return kvm_x86_ops->vm_free(kvm);
 }
 
+#define __KVM_HAVE_ARCH_FLUSH_REMOTE_TLB
+static inline int kvm_arch_flush_remote_tlb(struct kvm *kvm)
+{
+	if (kvm_x86_ops->tlb_remote_flush &&
+	    !kvm_x86_ops->tlb_remote_flush(kvm))
+		return 0;
+	else
+		return -ENOTSUPP;
+}
+
 int kvm_mmu_module_init(void);
 void kvm_mmu_module_exit(void);
 
@@ -1273,6 +1312,10 @@ static inline int __kvm_irq_line_state(unsigned long *irq_state,
 	return !!(*irq_state);
 }
 
+#define KVM_MMU_ROOT_CURRENT		BIT(0)
+#define KVM_MMU_ROOT_PREVIOUS(i)	BIT(1+i)
+#define KVM_MMU_ROOTS_ALL		(~0UL)
+
 int kvm_pic_set_irq(struct kvm_pic *pic, int irq, int irq_source_id, int level);
 void kvm_pic_clear_all(struct kvm_pic *pic, int irq_source_id);
 
@@ -1284,7 +1327,7 @@ void __kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu);
 int kvm_mmu_load(struct kvm_vcpu *vcpu);
 void kvm_mmu_unload(struct kvm_vcpu *vcpu);
 void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu);
-void kvm_mmu_free_roots(struct kvm_vcpu *vcpu);
+void kvm_mmu_free_roots(struct kvm_vcpu *vcpu, ulong roots_to_free);
 gpa_t translate_nested_gpa(struct kvm_vcpu *vcpu, gpa_t gpa, u32 access,
 			   struct x86_exception *exception);
 gpa_t kvm_mmu_gva_to_gpa_read(struct kvm_vcpu *vcpu, gva_t gva,
@@ -1303,7 +1346,8 @@ int kvm_emulate_hypercall(struct kvm_vcpu *vcpu);
 int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t gva, u64 error_code,
 		       void *insn, int insn_len);
 void kvm_mmu_invlpg(struct kvm_vcpu *vcpu, gva_t gva);
-void kvm_mmu_new_cr3(struct kvm_vcpu *vcpu);
+void kvm_mmu_invpcid_gva(struct kvm_vcpu *vcpu, gva_t gva, unsigned long pcid);
+void kvm_mmu_new_cr3(struct kvm_vcpu *vcpu, gpa_t new_cr3, bool skip_tlb_flush);
 
 void kvm_enable_tdp(void);
 void kvm_disable_tdp(void);
@@ -1418,6 +1462,10 @@ int kvm_cpu_get_interrupt(struct kvm_vcpu *v);
 void kvm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event);
 void kvm_vcpu_reload_apic_access_page(struct kvm_vcpu *vcpu);
 
+int kvm_pv_send_ipi(struct kvm *kvm, unsigned long ipi_bitmap_low,
+		    unsigned long ipi_bitmap_high, int min,
+		    unsigned long icr, int op_64_bit);
+
 u64 kvm_get_arch_capabilities(void);
 void kvm_define_shared_msr(unsigned index, u32 msr);
 int kvm_set_shared_msr(unsigned index, u64 val, u64 mask);
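Aside: kvm_mmu_free_roots() now takes a roots_to_free mask built from the KVM_MMU_ROOT_* bits defined above, so callers can drop only the cached previous roots while keeping the active one. A small illustrative sketch, not taken from this diff:

/* Illustration only: free just the cached previous roots. */
static void drop_prev_roots_only(struct kvm_vcpu *vcpu)
{
	ulong roots_to_free = 0;
	uint i;

	for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++)
		roots_to_free |= KVM_MMU_ROOT_PREVIOUS(i);

	kvm_mmu_free_roots(vcpu, roots_to_free);
	/* kvm_mmu_free_roots(vcpu, KVM_MMU_ROOTS_ALL) frees every root. */
}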

+ 2 - 0
arch/x86/include/asm/mshyperv.h

@@ -347,6 +347,7 @@ void hyperv_reenlightenment_intr(struct pt_regs *regs);
 void set_hv_tscchange_cb(void (*cb)(void));
 void clear_hv_tscchange_cb(void);
 void hyperv_stop_tsc_emulation(void);
+int hyperv_flush_guest_mapping(u64 as);
 
 #ifdef CONFIG_X86_64
 void hv_apic_init(void);
@@ -366,6 +367,7 @@ static inline struct hv_vp_assist_page *hv_get_vp_assist_page(unsigned int cpu)
 {
 	return NULL;
 }
+static inline int hyperv_flush_guest_mapping(u64 as) { return -1; }
 #endif /* CONFIG_HYPERV */
 
 #ifdef CONFIG_HYPERV_TSCPAGE

+ 14 - 0
arch/x86/include/asm/trace/hyperv.h

@@ -28,6 +28,20 @@ TRACE_EVENT(hyperv_mmu_flush_tlb_others,
 		      __entry->addr, __entry->end)
 	);
 
+TRACE_EVENT(hyperv_nested_flush_guest_mapping,
+	    TP_PROTO(u64 as, int ret),
+	    TP_ARGS(as, ret),
+
+	    TP_STRUCT__entry(
+		    __field(u64, as)
+		    __field(int, ret)
+		    ),
+	    TP_fast_assign(__entry->as = as;
+			   __entry->ret = ret;
+		    ),
+	    TP_printk("address space %llx ret %d", __entry->as, __entry->ret)
+	);
+
 TRACE_EVENT(hyperv_send_ipi_mask,
 	    TP_PROTO(const struct cpumask *cpus,
 		     int vector),

+ 37 - 0
arch/x86/include/uapi/asm/kvm.h

@@ -378,4 +378,41 @@ struct kvm_sync_regs {
 #define KVM_X86_QUIRK_LINT0_REENABLED	(1 << 0)
 #define KVM_X86_QUIRK_CD_NW_CLEARED	(1 << 1)
 
+#define KVM_STATE_NESTED_GUEST_MODE	0x00000001
+#define KVM_STATE_NESTED_RUN_PENDING	0x00000002
+
+#define KVM_STATE_NESTED_SMM_GUEST_MODE	0x00000001
+#define KVM_STATE_NESTED_SMM_VMXON	0x00000002
+
+struct kvm_vmx_nested_state {
+	__u64 vmxon_pa;
+	__u64 vmcs_pa;
+
+	struct {
+		__u16 flags;
+	} smm;
+};
+
+/* for KVM_CAP_NESTED_STATE */
+struct kvm_nested_state {
+	/* KVM_STATE_* flags */
+	__u16 flags;
+
+	/* 0 for VMX, 1 for SVM.  */
+	__u16 format;
+
+	/* 128 for SVM, 128 + VMCS size for VMX.  */
+	__u32 size;
+
+	union {
+		/* VMXON, VMCS */
+		struct kvm_vmx_nested_state vmx;
+
+		/* Pad the header to 128 bytes.  */
+		__u8 pad[120];
+	};
+
+	__u8 data[0];
+};
+
 #endif /* _ASM_X86_KVM_H */
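Aside: struct kvm_nested_state is a 128-byte header followed by a variable-length data[] blob (the cached VMCS for VMX), with the caller-supplied total length in size. A userspace sketch, assuming the KVM_GET_NESTED_STATE vcpu ioctl introduced alongside this header and an already-open vcpu_fd:

#include <linux/kvm.h>
#include <stdlib.h>
#include <sys/ioctl.h>

/* Illustration only: fetch nested state into a caller-sized buffer. */
static struct kvm_nested_state *fetch_nested_state(int vcpu_fd, size_t data_size)
{
	struct kvm_nested_state *state = calloc(1, sizeof(*state) + data_size);

	if (!state)
		return NULL;

	state->size = sizeof(*state) + data_size;	/* header + room for data[] */
	if (ioctl(vcpu_fd, KVM_GET_NESTED_STATE, state) < 0) {
		free(state);				/* e.g. E2BIG: buffer too small */
		return NULL;
	}
	return state;	/* flags, format and vmx/data[] are now valid */
}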

+ 1 - 0
arch/x86/include/uapi/asm/kvm_para.h

@@ -28,6 +28,7 @@
 #define KVM_FEATURE_PV_UNHALT		7
 #define KVM_FEATURE_PV_TLB_FLUSH	9
 #define KVM_FEATURE_ASYNC_PF_VMEXIT	10
+#define KVM_FEATURE_PV_SEND_IPI	11
 
 #define KVM_HINTS_REALTIME      0
 

+ 111 - 1
arch/x86/kernel/kvm.c

@@ -444,6 +444,98 @@ static void __init sev_map_percpu_data(void)
 }
 
 #ifdef CONFIG_SMP
+#define KVM_IPI_CLUSTER_SIZE	(2 * BITS_PER_LONG)
+
+static void __send_ipi_mask(const struct cpumask *mask, int vector)
+{
+	unsigned long flags;
+	int cpu, apic_id, icr;
+	int min = 0, max = 0;
+#ifdef CONFIG_X86_64
+	__uint128_t ipi_bitmap = 0;
+#else
+	u64 ipi_bitmap = 0;
+#endif
+
+	if (cpumask_empty(mask))
+		return;
+
+	local_irq_save(flags);
+
+	switch (vector) {
+	default:
+		icr = APIC_DM_FIXED | vector;
+		break;
+	case NMI_VECTOR:
+		icr = APIC_DM_NMI;
+		break;
+	}
+
+	for_each_cpu(cpu, mask) {
+		apic_id = per_cpu(x86_cpu_to_apicid, cpu);
+		if (!ipi_bitmap) {
+			min = max = apic_id;
+		} else if (apic_id < min && max - apic_id < KVM_IPI_CLUSTER_SIZE) {
+			ipi_bitmap <<= min - apic_id;
+			min = apic_id;
+		} else if (apic_id < min + KVM_IPI_CLUSTER_SIZE) {
+			max = apic_id < max ? max : apic_id;
+		} else {
+			kvm_hypercall4(KVM_HC_SEND_IPI, (unsigned long)ipi_bitmap,
+				(unsigned long)(ipi_bitmap >> BITS_PER_LONG), min, icr);
+			min = max = apic_id;
+			ipi_bitmap = 0;
+		}
+		__set_bit(apic_id - min, (unsigned long *)&ipi_bitmap);
+	}
+
+	if (ipi_bitmap) {
+		kvm_hypercall4(KVM_HC_SEND_IPI, (unsigned long)ipi_bitmap,
+			(unsigned long)(ipi_bitmap >> BITS_PER_LONG), min, icr);
+	}
+
+	local_irq_restore(flags);
+}
+
+static void kvm_send_ipi_mask(const struct cpumask *mask, int vector)
+{
+	__send_ipi_mask(mask, vector);
+}
+
+static void kvm_send_ipi_mask_allbutself(const struct cpumask *mask, int vector)
+{
+	unsigned int this_cpu = smp_processor_id();
+	struct cpumask new_mask;
+	const struct cpumask *local_mask;
+
+	cpumask_copy(&new_mask, mask);
+	cpumask_clear_cpu(this_cpu, &new_mask);
+	local_mask = &new_mask;
+	__send_ipi_mask(local_mask, vector);
+}
+
+static void kvm_send_ipi_allbutself(int vector)
+{
+	kvm_send_ipi_mask_allbutself(cpu_online_mask, vector);
+}
+
+static void kvm_send_ipi_all(int vector)
+{
+	__send_ipi_mask(cpu_online_mask, vector);
+}
+
+/*
+ * Set the IPI entry points
+ */
+static void kvm_setup_pv_ipi(void)
+{
+	apic->send_IPI_mask = kvm_send_ipi_mask;
+	apic->send_IPI_mask_allbutself = kvm_send_ipi_mask_allbutself;
+	apic->send_IPI_allbutself = kvm_send_ipi_allbutself;
+	apic->send_IPI_all = kvm_send_ipi_all;
+	pr_info("KVM setup pv IPIs\n");
+}
+
 static void __init kvm_smp_prepare_cpus(unsigned int max_cpus)
 {
 	native_smp_prepare_cpus(max_cpus);
@@ -611,13 +703,27 @@ static uint32_t __init kvm_detect(void)
 	return kvm_cpuid_base();
 }
 
+static void __init kvm_apic_init(void)
+{
+#if defined(CONFIG_SMP)
+	if (kvm_para_has_feature(KVM_FEATURE_PV_SEND_IPI))
+		kvm_setup_pv_ipi();
+#endif
+}
+
+static void __init kvm_init_platform(void)
+{
+	kvmclock_init();
+	x86_platform.apic_post_init = kvm_apic_init;
+}
+
 const __initconst struct hypervisor_x86 x86_hyper_kvm = {
 	.name			= "KVM",
 	.detect			= kvm_detect,
 	.type			= X86_HYPER_KVM,
-	.init.init_platform	= kvmclock_init,
 	.init.guest_late_init	= kvm_guest_init,
 	.init.x2apic_available	= kvm_para_available,
+	.init.init_platform	= kvm_init_platform,
 };
 
 static __init int activate_jump_labels(void)
@@ -736,6 +842,10 @@ void __init kvm_spinlock_init(void)
 	if (kvm_para_has_hint(KVM_HINTS_REALTIME))
 		return;
 
+	/* Don't use the pvqspinlock code if there is only 1 vCPU. */
+	if (num_possible_cpus() == 1)
+		return;
+
 	__pv_init_lock_hash();
 	pv_lock_ops.queued_spin_lock_slowpath = __pv_queued_spin_lock_slowpath;
 	pv_lock_ops.queued_spin_unlock = PV_CALLEE_SAVE(__pv_queued_spin_unlock);
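Aside on the hypercall ABI used above: __send_ipi_mask() packs destinations into a 128-bit bitmap (KVM_IPI_CLUSTER_SIZE on 64-bit) relative to the lowest APIC ID seen, then issues one KVM_HC_SEND_IPI hypercall per cluster. A worked example with hypothetical APIC IDs and vector, illustration only:

/*
 * Worked example: vCPUs with APIC IDs 1, 5 and 70 and a fixed vector 0xfd
 * all fit in one 128-bit cluster starting at min = 1, so the loop above
 * emits a single hypercall:
 *
 *   kvm_hypercall4(KVM_HC_SEND_IPI,
 *                  0x11,                     low word:  bits 0 and 4 set
 *                  0x20,                     high word: bit 69 - 64 = 5 set
 *                  1,                        min APIC ID
 *                  APIC_DM_FIXED | 0xfd);    ICR value
 */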

+ 2 - 1
arch/x86/kvm/cpuid.c

@@ -621,7 +621,8 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
 			     (1 << KVM_FEATURE_CLOCKSOURCE_STABLE_BIT) |
 			     (1 << KVM_FEATURE_PV_UNHALT) |
 			     (1 << KVM_FEATURE_PV_TLB_FLUSH) |
-			     (1 << KVM_FEATURE_ASYNC_PF_VMEXIT);
+			     (1 << KVM_FEATURE_ASYNC_PF_VMEXIT) |
+			     (1 << KVM_FEATURE_PV_SEND_IPI);
 
 		if (sched_info_on())
 			entry->eax |= (1 << KVM_FEATURE_STEAL_TIME);

+ 1 - 1
arch/x86/kvm/emulate.c

@@ -4191,7 +4191,7 @@ static int check_cr_write(struct x86_emulate_ctxt *ctxt)
 				maxphyaddr = 36;
 			rsvd = rsvd_bits(maxphyaddr, 63);
 			if (ctxt->ops->get_cr(ctxt, 4) & X86_CR4_PCIDE)
-				rsvd &= ~CR3_PCID_INVD;
+				rsvd &= ~X86_CR3_PCID_NOFLUSH;
 		}
 
 		if (new_val & rsvd)
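Aside: the rename does not change behaviour; with CR4.PCIDE set, bit 63 of a MOV-to-CR3 value only suppresses the TLB flush and must not be treated as reserved. A simplified restatement, illustration only and not the emulator's exact code:

/* Illustration only: CR3 reserved-bit check with PCIDs enabled. */
static bool cr3_has_reserved_bits(u64 new_val, bool pcide, u8 maxphyaddr)
{
	u64 rsvd = rsvd_bits(maxphyaddr, 63);	/* bits above MAXPHYADDR */

	if (pcide)
		rsvd &= ~X86_CR3_PCID_NOFLUSH;	/* bit 63 is legal with PCIDE */

	return (new_val & rsvd) != 0;
}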

+ 20 - 7
arch/x86/kvm/hyperv.c

@@ -235,7 +235,7 @@ static int synic_set_msr(struct kvm_vcpu_hv_synic *synic,
 	struct kvm_vcpu *vcpu = synic_to_vcpu(synic);
 	int ret;
 
-	if (!synic->active)
+	if (!synic->active && !host)
 		return 1;
 
 	trace_kvm_hv_synic_set_msr(vcpu->vcpu_id, msr, data, host);
@@ -295,11 +295,12 @@ static int synic_set_msr(struct kvm_vcpu_hv_synic *synic,
 	return ret;
 }
 
-static int synic_get_msr(struct kvm_vcpu_hv_synic *synic, u32 msr, u64 *pdata)
+static int synic_get_msr(struct kvm_vcpu_hv_synic *synic, u32 msr, u64 *pdata,
+			 bool host)
 {
 	int ret;
 
-	if (!synic->active)
+	if (!synic->active && !host)
 		return 1;
 
 	ret = 0;
@@ -1014,6 +1015,11 @@ static int kvm_hv_set_msr_pw(struct kvm_vcpu *vcpu, u32 msr, u64 data,
 	case HV_X64_MSR_TSC_EMULATION_STATUS:
 		hv->hv_tsc_emulation_status = data;
 		break;
+	case HV_X64_MSR_TIME_REF_COUNT:
+		/* read-only, but still ignore it if host-initiated */
+		if (!host)
+			return 1;
+		break;
 	default:
 		vcpu_unimpl(vcpu, "Hyper-V uhandled wrmsr: 0x%x data 0x%llx\n",
 			    msr, data);
@@ -1101,6 +1107,12 @@ static int kvm_hv_set_msr(struct kvm_vcpu *vcpu, u32 msr, u64 data, bool host)
 		return stimer_set_count(vcpu_to_stimer(vcpu, timer_index),
 					data, host);
 	}
+	case HV_X64_MSR_TSC_FREQUENCY:
+	case HV_X64_MSR_APIC_FREQUENCY:
+		/* read-only, but still ignore it if host-initiated */
+		if (!host)
+			return 1;
+		break;
 	default:
 		vcpu_unimpl(vcpu, "Hyper-V uhandled wrmsr: 0x%x data 0x%llx\n",
 			    msr, data);
@@ -1156,7 +1168,8 @@ static int kvm_hv_get_msr_pw(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
 	return 0;
 }
 
-static int kvm_hv_get_msr(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
+static int kvm_hv_get_msr(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata,
+			  bool host)
 {
 	u64 data = 0;
 	struct kvm_vcpu_hv *hv = &vcpu->arch.hyperv;
@@ -1183,7 +1196,7 @@ static int kvm_hv_get_msr(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
 	case HV_X64_MSR_SIMP:
 	case HV_X64_MSR_EOM:
 	case HV_X64_MSR_SINT0 ... HV_X64_MSR_SINT15:
-		return synic_get_msr(vcpu_to_synic(vcpu), msr, pdata);
+		return synic_get_msr(vcpu_to_synic(vcpu), msr, pdata, host);
 	case HV_X64_MSR_STIMER0_CONFIG:
 	case HV_X64_MSR_STIMER1_CONFIG:
 	case HV_X64_MSR_STIMER2_CONFIG:
@@ -1229,7 +1242,7 @@ int kvm_hv_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data, bool host)
 		return kvm_hv_set_msr(vcpu, msr, data, host);
 }
 
-int kvm_hv_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
+int kvm_hv_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata, bool host)
 {
 	if (kvm_hv_msr_partition_wide(msr)) {
 		int r;
@@ -1239,7 +1252,7 @@ int kvm_hv_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
 		mutex_unlock(&vcpu->kvm->arch.hyperv.hv_lock);
 		return r;
 	} else
-		return kvm_hv_get_msr(vcpu, msr, pdata);
+		return kvm_hv_get_msr(vcpu, msr, pdata, host);
 }
 
 static __always_inline int get_sparse_bank_no(u64 valid_bank_mask, int bank_no)

+ 1 - 1
arch/x86/kvm/hyperv.h

@@ -48,7 +48,7 @@ static inline struct kvm_vcpu *synic_to_vcpu(struct kvm_vcpu_hv_synic *synic)
 }
 
 int kvm_hv_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data, bool host);
-int kvm_hv_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata);
+int kvm_hv_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata, bool host);
 
 bool kvm_hv_hypercall_enabled(struct kvm *kvm);
 int kvm_hv_hypercall(struct kvm_vcpu *vcpu);

+ 40 - 0
arch/x86/kvm/lapic.c

@@ -547,6 +547,46 @@ int kvm_apic_set_irq(struct kvm_vcpu *vcpu, struct kvm_lapic_irq *irq,
 			irq->level, irq->trig_mode, dest_map);
 }
 
+int kvm_pv_send_ipi(struct kvm *kvm, unsigned long ipi_bitmap_low,
+    		    unsigned long ipi_bitmap_high, int min,
+		    unsigned long icr, int op_64_bit)
+{
+	int i;
+	struct kvm_apic_map *map;
+	struct kvm_vcpu *vcpu;
+	struct kvm_lapic_irq irq = {0};
+	int cluster_size = op_64_bit ? 64 : 32;
+	int count = 0;
+
+	irq.vector = icr & APIC_VECTOR_MASK;
+	irq.delivery_mode = icr & APIC_MODE_MASK;
+	irq.level = (icr & APIC_INT_ASSERT) != 0;
+	irq.trig_mode = icr & APIC_INT_LEVELTRIG;
+
+	if (icr & APIC_DEST_MASK)
+		return -KVM_EINVAL;
+	if (icr & APIC_SHORT_MASK)
+		return -KVM_EINVAL;
+
+	rcu_read_lock();
+	map = rcu_dereference(kvm->arch.apic_map);
+
+	/* Bits above cluster_size are masked in the caller.  */
+	for_each_set_bit(i, &ipi_bitmap_low, BITS_PER_LONG) {
+		vcpu = map->phys_map[min + i]->vcpu;
+		count += kvm_apic_set_irq(vcpu, &irq, NULL);
+	}
+
+	min += cluster_size;
+	for_each_set_bit(i, &ipi_bitmap_high, BITS_PER_LONG) {
+		vcpu = map->phys_map[min + i]->vcpu;
+		count += kvm_apic_set_irq(vcpu, &irq, NULL);
+	}
+
+	rcu_read_unlock();
+	return count;
+}
+
 static int pv_eoi_put_user(struct kvm_vcpu *vcpu, u8 val)
 {
 

+ 442 - 89
arch/x86/kvm/mmu.c

@@ -178,7 +178,24 @@ struct kvm_shadow_walk_iterator {
 	unsigned index;
 };
 
-#define for_each_shadow_entry(_vcpu, _addr, _walker)    \
+static const union kvm_mmu_page_role mmu_base_role_mask = {
+	.cr0_wp = 1,
+	.cr4_pae = 1,
+	.nxe = 1,
+	.smep_andnot_wp = 1,
+	.smap_andnot_wp = 1,
+	.smm = 1,
+	.guest_mode = 1,
+	.ad_disabled = 1,
+};
+
+#define for_each_shadow_entry_using_root(_vcpu, _root, _addr, _walker)     \
+	for (shadow_walk_init_using_root(&(_walker), (_vcpu),              \
+					 (_root), (_addr));                \
+	     shadow_walk_okay(&(_walker));			           \
+	     shadow_walk_next(&(_walker)))
+
+#define for_each_shadow_entry(_vcpu, _addr, _walker)            \
 	for (shadow_walk_init(&(_walker), _vcpu, _addr);	\
 	     shadow_walk_okay(&(_walker));			\
 	     shadow_walk_next(&(_walker)))
@@ -221,7 +238,20 @@ static const u64 shadow_acc_track_saved_bits_mask = PT64_EPT_READABLE_MASK |
 						    PT64_EPT_EXECUTABLE_MASK;
 static const u64 shadow_acc_track_saved_bits_shift = PT64_SECOND_AVAIL_BITS_SHIFT;
 
+/*
+ * This mask must be set on all non-zero Non-Present or Reserved SPTEs in order
+ * to guard against L1TF attacks.
+ */
+static u64 __read_mostly shadow_nonpresent_or_rsvd_mask;
+
+/*
+ * The number of high-order 1 bits to use in the mask above.
+ */
+static const u64 shadow_nonpresent_or_rsvd_mask_len = 5;
+
 static void mmu_spte_set(u64 *sptep, u64 spte);
+static union kvm_mmu_page_role
+kvm_mmu_calc_root_page_role(struct kvm_vcpu *vcpu);
 
 void kvm_mmu_set_mmio_spte_mask(u64 mmio_mask, u64 mmio_value)
 {
@@ -308,9 +338,13 @@ static void mark_mmio_spte(struct kvm_vcpu *vcpu, u64 *sptep, u64 gfn,
 {
 	unsigned int gen = kvm_current_mmio_generation(vcpu);
 	u64 mask = generation_mmio_spte_mask(gen);
+	u64 gpa = gfn << PAGE_SHIFT;
 
 	access &= ACC_WRITE_MASK | ACC_USER_MASK;
-	mask |= shadow_mmio_value | access | gfn << PAGE_SHIFT;
+	mask |= shadow_mmio_value | access;
+	mask |= gpa | shadow_nonpresent_or_rsvd_mask;
+	mask |= (gpa & shadow_nonpresent_or_rsvd_mask)
+		<< shadow_nonpresent_or_rsvd_mask_len;
 
 	trace_mark_mmio_spte(sptep, gfn, access, gen);
 	mmu_spte_set(sptep, mask);
@@ -323,8 +357,14 @@ static bool is_mmio_spte(u64 spte)
 
 static gfn_t get_mmio_spte_gfn(u64 spte)
 {
-	u64 mask = generation_mmio_spte_mask(MMIO_GEN_MASK) | shadow_mmio_mask;
-	return (spte & ~mask) >> PAGE_SHIFT;
+	u64 mask = generation_mmio_spte_mask(MMIO_GEN_MASK) | shadow_mmio_mask |
+		   shadow_nonpresent_or_rsvd_mask;
+	u64 gpa = spte & ~mask;
+
+	gpa |= (spte >> shadow_nonpresent_or_rsvd_mask_len)
+	       & shadow_nonpresent_or_rsvd_mask;
+
+	return gpa >> PAGE_SHIFT;
 }
 
 static unsigned get_mmio_spte_access(u64 spte)
@@ -381,7 +421,7 @@ void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask,
 }
 EXPORT_SYMBOL_GPL(kvm_mmu_set_mask_ptes);
 
-static void kvm_mmu_clear_all_pte_masks(void)
+static void kvm_mmu_reset_all_pte_masks(void)
 {
 	shadow_user_mask = 0;
 	shadow_accessed_mask = 0;
@@ -391,6 +431,18 @@ static void kvm_mmu_clear_all_pte_masks(void)
 	shadow_mmio_mask = 0;
 	shadow_present_mask = 0;
 	shadow_acc_track_mask = 0;
+
+	/*
+	 * If the CPU has 46 or less physical address bits, then set an
+	 * appropriate mask to guard against L1TF attacks. Otherwise, it is
+	 * assumed that the CPU is not vulnerable to L1TF.
+	 */
+	if (boot_cpu_data.x86_phys_bits <
+	    52 - shadow_nonpresent_or_rsvd_mask_len)
+		shadow_nonpresent_or_rsvd_mask =
+			rsvd_bits(boot_cpu_data.x86_phys_bits -
+				  shadow_nonpresent_or_rsvd_mask_len,
+				  boot_cpu_data.x86_phys_bits - 1);
 }
 
 static int is_cpuid_PSE36(void)
@@ -1986,7 +2038,7 @@ static int nonpaging_sync_page(struct kvm_vcpu *vcpu,
 	return 0;
 }
 
-static void nonpaging_invlpg(struct kvm_vcpu *vcpu, gva_t gva)
+static void nonpaging_invlpg(struct kvm_vcpu *vcpu, gva_t gva, hpa_t root)
 {
 }
 
@@ -2117,12 +2169,8 @@ static void kvm_mmu_commit_zap_page(struct kvm *kvm,
 static bool __kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
 			    struct list_head *invalid_list)
 {
-	if (sp->role.cr4_pae != !!is_pae(vcpu)) {
-		kvm_mmu_prepare_zap_page(vcpu->kvm, sp, invalid_list);
-		return false;
-	}
-
-	if (vcpu->arch.mmu.sync_page(vcpu, sp) == 0) {
+	if (sp->role.cr4_pae != !!is_pae(vcpu)
+	    || vcpu->arch.mmu.sync_page(vcpu, sp) == 0) {
 		kvm_mmu_prepare_zap_page(vcpu->kvm, sp, invalid_list);
 		return false;
 	}
@@ -2392,11 +2440,12 @@ out:
 	return sp;
 }
 
-static void shadow_walk_init(struct kvm_shadow_walk_iterator *iterator,
-			     struct kvm_vcpu *vcpu, u64 addr)
+static void shadow_walk_init_using_root(struct kvm_shadow_walk_iterator *iterator,
+					struct kvm_vcpu *vcpu, hpa_t root,
+					u64 addr)
 {
 	iterator->addr = addr;
-	iterator->shadow_addr = vcpu->arch.mmu.root_hpa;
+	iterator->shadow_addr = root;
 	iterator->level = vcpu->arch.mmu.shadow_root_level;
 
 	if (iterator->level == PT64_ROOT_4LEVEL &&
@@ -2405,6 +2454,12 @@ static void shadow_walk_init(struct kvm_shadow_walk_iterator *iterator,
 		--iterator->level;
 
 	if (iterator->level == PT32E_ROOT_LEVEL) {
+		/*
+		 * prev_root is currently only used for 64-bit hosts. So only
+		 * the active root_hpa is valid here.
+		 */
+		BUG_ON(root != vcpu->arch.mmu.root_hpa);
+
 		iterator->shadow_addr
 			= vcpu->arch.mmu.pae_root[(addr >> 30) & 3];
 		iterator->shadow_addr &= PT64_BASE_ADDR_MASK;
@@ -2414,6 +2469,13 @@ static void shadow_walk_init(struct kvm_shadow_walk_iterator *iterator,
 	}
 }
 
+static void shadow_walk_init(struct kvm_shadow_walk_iterator *iterator,
+			     struct kvm_vcpu *vcpu, u64 addr)
+{
+	shadow_walk_init_using_root(iterator, vcpu, vcpu->arch.mmu.root_hpa,
+				    addr);
+}
+
 static bool shadow_walk_okay(struct kvm_shadow_walk_iterator *iterator)
 {
 	if (iterator->level < PT_PAGE_TABLE_LEVEL)
@@ -2702,6 +2764,45 @@ static bool mmu_need_write_protect(struct kvm_vcpu *vcpu, gfn_t gfn,
 		kvm_unsync_page(vcpu, sp);
 	}
 
+	/*
+	 * We need to ensure that the marking of unsync pages is visible
+	 * before the SPTE is updated to allow writes because
+	 * kvm_mmu_sync_roots() checks the unsync flags without holding
+	 * the MMU lock and so can race with this. If the SPTE was updated
+	 * before the page had been marked as unsync-ed, something like the
+	 * following could happen:
+	 *
+	 * CPU 1                    CPU 2
+	 * ---------------------------------------------------------------------
+	 * 1.2 Host updates SPTE
+	 *     to be writable
+	 *                      2.1 Guest writes a GPTE for GVA X.
+	 *                          (GPTE being in the guest page table shadowed
+	 *                           by the SP from CPU 1.)
+	 *                          This reads SPTE during the page table walk.
+	 *                          Since SPTE.W is read as 1, there is no
+	 *                          fault.
+	 *
+	 *                      2.2 Guest issues TLB flush.
+	 *                          That causes a VM Exit.
+	 *
+	 *                      2.3 kvm_mmu_sync_pages() reads sp->unsync.
+	 *                          Since it is false, so it just returns.
+	 *
+	 *                      2.4 Guest accesses GVA X.
+	 *                          Since the mapping in the SP was not updated,
+	 *                          so the old mapping for GVA X incorrectly
+	 *                          gets used.
+	 * 1.1 Host marks SP
+	 *     as unsync
+	 *     (sp->unsync = true)
+	 *
+	 * The write barrier below ensures that 1.1 happens before 1.2 and thus
+	 * the situation in 2.4 does not arise. The implicit barrier in 2.2
+	 * pairs with this write barrier.
+	 */
+	smp_wmb();
+
 	return false;
 }
 
@@ -2724,6 +2825,10 @@ static bool kvm_is_mmio_pfn(kvm_pfn_t pfn)
 	return true;
 }
 
+/* Bits which may be returned by set_spte() */
+#define SET_SPTE_WRITE_PROTECTED_PT	BIT(0)
+#define SET_SPTE_NEED_REMOTE_TLB_FLUSH	BIT(1)
+
 static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
 		    unsigned pte_access, int level,
 		    gfn_t gfn, kvm_pfn_t pfn, bool speculative,
@@ -2800,7 +2905,7 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
 		if (mmu_need_write_protect(vcpu, gfn, can_unsync)) {
 			pgprintk("%s: found shadow page for %llx, marking ro\n",
 				 __func__, gfn);
-			ret = 1;
+			ret |= SET_SPTE_WRITE_PROTECTED_PT;
 			pte_access &= ~ACC_WRITE_MASK;
 			spte &= ~(PT_WRITABLE_MASK | SPTE_MMU_WRITEABLE);
 		}
@@ -2816,7 +2921,7 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
 
 set_pte:
 	if (mmu_spte_update(sptep, spte))
-		kvm_flush_remote_tlbs(vcpu->kvm);
+		ret |= SET_SPTE_NEED_REMOTE_TLB_FLUSH;
 done:
 	return ret;
 }
@@ -2827,7 +2932,9 @@ static int mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep, unsigned pte_access,
 {
 	int was_rmapped = 0;
 	int rmap_count;
+	int set_spte_ret;
 	int ret = RET_PF_RETRY;
+	bool flush = false;
 
 	pgprintk("%s: spte %llx write_fault %d gfn %llx\n", __func__,
 		 *sptep, write_fault, gfn);
@@ -2844,22 +2951,25 @@ static int mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep, unsigned pte_access,
 
 			child = page_header(pte & PT64_BASE_ADDR_MASK);
 			drop_parent_pte(child, sptep);
-			kvm_flush_remote_tlbs(vcpu->kvm);
+			flush = true;
 		} else if (pfn != spte_to_pfn(*sptep)) {
 			pgprintk("hfn old %llx new %llx\n",
 				 spte_to_pfn(*sptep), pfn);
 			drop_spte(vcpu->kvm, sptep);
-			kvm_flush_remote_tlbs(vcpu->kvm);
+			flush = true;
 		} else
 			was_rmapped = 1;
 	}
 
-	if (set_spte(vcpu, sptep, pte_access, level, gfn, pfn, speculative,
-	      true, host_writable)) {
+	set_spte_ret = set_spte(vcpu, sptep, pte_access, level, gfn, pfn,
+				speculative, true, host_writable);
+	if (set_spte_ret & SET_SPTE_WRITE_PROTECTED_PT) {
 		if (write_fault)
 			ret = RET_PF_EMULATE;
 		kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
 	}
+	if (set_spte_ret & SET_SPTE_NEED_REMOTE_TLB_FLUSH || flush)
+		kvm_flush_remote_tlbs(vcpu->kvm);
 
 	if (unlikely(is_mmio_spte(*sptep)))
 		ret = RET_PF_EMULATE;
@@ -3358,26 +3468,47 @@ static void mmu_free_root_page(struct kvm *kvm, hpa_t *root_hpa,
 	*root_hpa = INVALID_PAGE;
 }
 
-void kvm_mmu_free_roots(struct kvm_vcpu *vcpu)
+/* roots_to_free must be some combination of the KVM_MMU_ROOT_* flags */
+void kvm_mmu_free_roots(struct kvm_vcpu *vcpu, ulong roots_to_free)
 {
 	int i;
 	LIST_HEAD(invalid_list);
 	struct kvm_mmu *mmu = &vcpu->arch.mmu;
+	bool free_active_root = roots_to_free & KVM_MMU_ROOT_CURRENT;
 
-	if (!VALID_PAGE(mmu->root_hpa))
-		return;
+	BUILD_BUG_ON(KVM_MMU_NUM_PREV_ROOTS >= BITS_PER_LONG);
+
+	/* Before acquiring the MMU lock, see if we need to do any real work. */
+	if (!(free_active_root && VALID_PAGE(mmu->root_hpa))) {
+		for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++)
+			if ((roots_to_free & KVM_MMU_ROOT_PREVIOUS(i)) &&
+			    VALID_PAGE(mmu->prev_roots[i].hpa))
+				break;
+
+		if (i == KVM_MMU_NUM_PREV_ROOTS)
+			return;
+	}
 
 	spin_lock(&vcpu->kvm->mmu_lock);
 
-	if (mmu->shadow_root_level >= PT64_ROOT_4LEVEL &&
-	    (mmu->root_level >= PT64_ROOT_4LEVEL || mmu->direct_map)) {
-		mmu_free_root_page(vcpu->kvm, &mmu->root_hpa, &invalid_list);
-	} else {
-		for (i = 0; i < 4; ++i)
-			if (mmu->pae_root[i] != 0)
-				mmu_free_root_page(vcpu->kvm, &mmu->pae_root[i],
-						   &invalid_list);
-		mmu->root_hpa = INVALID_PAGE;
+	for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++)
+		if (roots_to_free & KVM_MMU_ROOT_PREVIOUS(i))
+			mmu_free_root_page(vcpu->kvm, &mmu->prev_roots[i].hpa,
+					   &invalid_list);
+
+	if (free_active_root) {
+		if (mmu->shadow_root_level >= PT64_ROOT_4LEVEL &&
+		    (mmu->root_level >= PT64_ROOT_4LEVEL || mmu->direct_map)) {
+			mmu_free_root_page(vcpu->kvm, &mmu->root_hpa,
+					   &invalid_list);
+		} else {
+			for (i = 0; i < 4; ++i)
+				if (mmu->pae_root[i] != 0)
+					mmu_free_root_page(vcpu->kvm,
+							   &mmu->pae_root[i],
+							   &invalid_list);
+			mmu->root_hpa = INVALID_PAGE;
+		}
 	}
 
 	kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list);
@@ -3546,7 +3677,7 @@ static int mmu_alloc_roots(struct kvm_vcpu *vcpu)
 		return mmu_alloc_shadow_roots(vcpu);
 }
 
-static void mmu_sync_roots(struct kvm_vcpu *vcpu)
+void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu)
 {
 	int i;
 	struct kvm_mmu_page *sp;
@@ -3558,14 +3689,39 @@ static void mmu_sync_roots(struct kvm_vcpu *vcpu)
 		return;
 
 	vcpu_clear_mmio_info(vcpu, MMIO_GVA_ANY);
-	kvm_mmu_audit(vcpu, AUDIT_PRE_SYNC);
+
 	if (vcpu->arch.mmu.root_level >= PT64_ROOT_4LEVEL) {
 		hpa_t root = vcpu->arch.mmu.root_hpa;
+
 		sp = page_header(root);
+
+		/*
+		 * Even if another CPU was marking the SP as unsync-ed
+		 * simultaneously, any guest page table changes are not
+		 * guaranteed to be visible anyway until this VCPU issues a TLB
+		 * flush strictly after those changes are made. We only need to
+		 * ensure that the other CPU sets these flags before any actual
+		 * changes to the page tables are made. The comments in
+		 * mmu_need_write_protect() describe what could go wrong if this
+		 * requirement isn't satisfied.
+		 */
+		if (!smp_load_acquire(&sp->unsync) &&
+		    !smp_load_acquire(&sp->unsync_children))
+			return;
+
+		spin_lock(&vcpu->kvm->mmu_lock);
+		kvm_mmu_audit(vcpu, AUDIT_PRE_SYNC);
+
 		mmu_sync_children(vcpu, sp);
 		mmu_sync_children(vcpu, sp);
+
 		kvm_mmu_audit(vcpu, AUDIT_POST_SYNC);
 		kvm_mmu_audit(vcpu, AUDIT_POST_SYNC);
+		spin_unlock(&vcpu->kvm->mmu_lock);
 		return;
 	}
+
+	spin_lock(&vcpu->kvm->mmu_lock);
+	kvm_mmu_audit(vcpu, AUDIT_PRE_SYNC);
+
 	for (i = 0; i < 4; ++i) {
 		hpa_t root = vcpu->arch.mmu.pae_root[i];
 
@@ -3575,13 +3731,8 @@ static void mmu_sync_roots(struct kvm_vcpu *vcpu)
 			mmu_sync_children(vcpu, sp);
 		}
 	}
-	kvm_mmu_audit(vcpu, AUDIT_POST_SYNC);
-}
 
 
-void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu)
-{
-	spin_lock(&vcpu->kvm->mmu_lock);
-	mmu_sync_roots(vcpu);
+	kvm_mmu_audit(vcpu, AUDIT_POST_SYNC);
 	spin_unlock(&vcpu->kvm->mmu_lock);
 }
 EXPORT_SYMBOL_GPL(kvm_mmu_sync_roots);
@@ -3948,16 +4099,107 @@ static void nonpaging_init_context(struct kvm_vcpu *vcpu,
 	context->update_pte = nonpaging_update_pte;
 	context->root_level = 0;
 	context->shadow_root_level = PT32E_ROOT_LEVEL;
-	context->root_hpa = INVALID_PAGE;
 	context->direct_map = true;
 	context->nx = false;
 }
 
-void kvm_mmu_new_cr3(struct kvm_vcpu *vcpu)
+/*
+ * Find out if a previously cached root matching the new CR3/role is available.
+ * The current root is also inserted into the cache.
+ * If a matching root was found, it is assigned to kvm_mmu->root_hpa and true is
+ * returned.
+ * Otherwise, the LRU root from the cache is assigned to kvm_mmu->root_hpa and
+ * false is returned. This root should now be freed by the caller.
+ */
+static bool cached_root_available(struct kvm_vcpu *vcpu, gpa_t new_cr3,
+				  union kvm_mmu_page_role new_role)
+{
+	uint i;
+	struct kvm_mmu_root_info root;
+	struct kvm_mmu *mmu = &vcpu->arch.mmu;
+
+	root.cr3 = mmu->get_cr3(vcpu);
+	root.hpa = mmu->root_hpa;
+
+	for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++) {
+		swap(root, mmu->prev_roots[i]);
+
+		if (new_cr3 == root.cr3 && VALID_PAGE(root.hpa) &&
+		    page_header(root.hpa) != NULL &&
+		    new_role.word == page_header(root.hpa)->role.word)
+			break;
+	}
+
+	mmu->root_hpa = root.hpa;
+
+	return i < KVM_MMU_NUM_PREV_ROOTS;
+}
+
+static bool fast_cr3_switch(struct kvm_vcpu *vcpu, gpa_t new_cr3,
+			    union kvm_mmu_page_role new_role,
+			    bool skip_tlb_flush)
 {
-	kvm_mmu_free_roots(vcpu);
+	struct kvm_mmu *mmu = &vcpu->arch.mmu;
+
+	/*
+	 * For now, limit the fast switch to 64-bit hosts+VMs in order to avoid
+	 * having to deal with PDPTEs. We may add support for 32-bit hosts/VMs
+	 * later if necessary.
+	 */
+	if (mmu->shadow_root_level >= PT64_ROOT_4LEVEL &&
+	    mmu->root_level >= PT64_ROOT_4LEVEL) {
+		if (mmu_check_root(vcpu, new_cr3 >> PAGE_SHIFT))
+			return false;
+
+		if (cached_root_available(vcpu, new_cr3, new_role)) {
+			/*
+			 * It is possible that the cached previous root page is
+			 * obsolete because of a change in the MMU
+			 * generation number. However, that is accompanied by
+			 * KVM_REQ_MMU_RELOAD, which will free the root that we
+			 * have set here and allocate a new one.
+			 */
+
+			kvm_make_request(KVM_REQ_LOAD_CR3, vcpu);
+			if (!skip_tlb_flush) {
+				kvm_make_request(KVM_REQ_MMU_SYNC, vcpu);
+				kvm_x86_ops->tlb_flush(vcpu, true);
+			}
+
+			/*
+			 * The last MMIO access's GVA and GPA are cached in the
+			 * VCPU. When switching to a new CR3, that GVA->GPA
+			 * mapping may no longer be valid. So clear any cached
+			 * MMIO info even when we don't need to sync the shadow
+			 * page tables.
+			 */
+			vcpu_clear_mmio_info(vcpu, MMIO_GVA_ANY);
+
+			__clear_sp_write_flooding_count(
+				page_header(mmu->root_hpa));
+
+			return true;
+		}
+	}
+
+	return false;
 }
 }
 
+			      union kvm_mmu_page_role new_role,
+			      bool skip_tlb_flush)
+{
+	if (!fast_cr3_switch(vcpu, new_cr3, new_role, skip_tlb_flush))
+		kvm_mmu_free_roots(vcpu, KVM_MMU_ROOT_CURRENT);
+}
+
+void kvm_mmu_new_cr3(struct kvm_vcpu *vcpu, gpa_t new_cr3, bool skip_tlb_flush)
+{
+	__kvm_mmu_new_cr3(vcpu, new_cr3, kvm_mmu_calc_root_page_role(vcpu),
+			  skip_tlb_flush);
+}
+EXPORT_SYMBOL_GPL(kvm_mmu_new_cr3);
+
 static unsigned long get_cr3(struct kvm_vcpu *vcpu)
 {
 	return kvm_read_cr3(vcpu);
@@ -4432,7 +4674,6 @@ static void paging64_init_context_common(struct kvm_vcpu *vcpu,
 	context->invlpg = paging64_invlpg;
 	context->update_pte = paging64_update_pte;
 	context->shadow_root_level = level;
-	context->root_hpa = INVALID_PAGE;
 	context->direct_map = false;
 }
 
@@ -4462,7 +4703,6 @@ static void paging32_init_context(struct kvm_vcpu *vcpu,
 	context->invlpg = paging32_invlpg;
 	context->update_pte = paging32_update_pte;
 	context->shadow_root_level = PT32E_ROOT_LEVEL;
-	context->root_hpa = INVALID_PAGE;
 	context->direct_map = false;
 }
 
@@ -4472,20 +4712,32 @@ static void paging32E_init_context(struct kvm_vcpu *vcpu,
 	paging64_init_context_common(vcpu, context, PT32E_ROOT_LEVEL);
 }
 
+static union kvm_mmu_page_role
+kvm_calc_tdp_mmu_root_page_role(struct kvm_vcpu *vcpu)
+{
+	union kvm_mmu_page_role role = {0};
+
+	role.guest_mode = is_guest_mode(vcpu);
+	role.smm = is_smm(vcpu);
+	role.ad_disabled = (shadow_accessed_mask == 0);
+	role.level = kvm_x86_ops->get_tdp_level(vcpu);
+	role.direct = true;
+	role.access = ACC_ALL;
+
+	return role;
+}
+
 static void init_kvm_tdp_mmu(struct kvm_vcpu *vcpu)
 {
 	struct kvm_mmu *context = &vcpu->arch.mmu;
 
-	context->base_role.word = 0;
-	context->base_role.guest_mode = is_guest_mode(vcpu);
-	context->base_role.smm = is_smm(vcpu);
-	context->base_role.ad_disabled = (shadow_accessed_mask == 0);
+	context->base_role.word = mmu_base_role_mask.word &
+				  kvm_calc_tdp_mmu_root_page_role(vcpu).word;
 	context->page_fault = tdp_page_fault;
 	context->sync_page = nonpaging_sync_page;
 	context->invlpg = nonpaging_invlpg;
 	context->update_pte = nonpaging_update_pte;
 	context->shadow_root_level = kvm_x86_ops->get_tdp_level(vcpu);
-	context->root_hpa = INVALID_PAGE;
 	context->direct_map = true;
 	context->set_cr3 = kvm_x86_ops->set_tdp_cr3;
 	context->get_cr3 = get_cr3;
@@ -4520,13 +4772,36 @@ static void init_kvm_tdp_mmu(struct kvm_vcpu *vcpu)
 	reset_tdp_shadow_zero_bits_mask(vcpu, context);
 }
 
-void kvm_init_shadow_mmu(struct kvm_vcpu *vcpu)
+static union kvm_mmu_page_role
+kvm_calc_shadow_mmu_root_page_role(struct kvm_vcpu *vcpu)
 {
 {
+	union kvm_mmu_page_role role = {0};
 	bool smep = kvm_read_cr4_bits(vcpu, X86_CR4_SMEP);
 	bool smap = kvm_read_cr4_bits(vcpu, X86_CR4_SMAP);
-	struct kvm_mmu *context = &vcpu->arch.mmu;
 
-	MMU_WARN_ON(VALID_PAGE(context->root_hpa));
+	role.nxe = is_nx(vcpu);
+	role.cr4_pae = !!is_pae(vcpu);
+	role.cr0_wp  = is_write_protection(vcpu);
+	role.smep_andnot_wp = smep && !is_write_protection(vcpu);
+	role.smap_andnot_wp = smap && !is_write_protection(vcpu);
+	role.guest_mode = is_guest_mode(vcpu);
+	role.smm = is_smm(vcpu);
+	role.direct = !is_paging(vcpu);
+	role.access = ACC_ALL;
+
+	if (!is_long_mode(vcpu))
+		role.level = PT32E_ROOT_LEVEL;
+	else if (is_la57_mode(vcpu))
+		role.level = PT64_ROOT_5LEVEL;
+	else
+		role.level = PT64_ROOT_4LEVEL;
+
+	return role;
+}
+
+void kvm_init_shadow_mmu(struct kvm_vcpu *vcpu)
+{
+	struct kvm_mmu *context = &vcpu->arch.mmu;
 
 	if (!is_paging(vcpu))
 		nonpaging_init_context(vcpu, context);
@@ -4537,26 +4812,34 @@ void kvm_init_shadow_mmu(struct kvm_vcpu *vcpu)
 	else
 		paging32_init_context(vcpu, context);
 
-	context->base_role.nxe = is_nx(vcpu);
-	context->base_role.cr4_pae = !!is_pae(vcpu);
-	context->base_role.cr0_wp  = is_write_protection(vcpu);
-	context->base_role.smep_andnot_wp
-		= smep && !is_write_protection(vcpu);
-	context->base_role.smap_andnot_wp
-		= smap && !is_write_protection(vcpu);
-	context->base_role.guest_mode = is_guest_mode(vcpu);
-	context->base_role.smm = is_smm(vcpu);
+	context->base_role.word = mmu_base_role_mask.word &
+				  kvm_calc_shadow_mmu_root_page_role(vcpu).word;
 	reset_shadow_zero_bits_mask(vcpu, context);
 }
 EXPORT_SYMBOL_GPL(kvm_init_shadow_mmu);
 
+static union kvm_mmu_page_role
+kvm_calc_shadow_ept_root_page_role(struct kvm_vcpu *vcpu, bool accessed_dirty)
+{
+	union kvm_mmu_page_role role = vcpu->arch.mmu.base_role;
+
+	role.level = PT64_ROOT_4LEVEL;
+	role.direct = false;
+	role.ad_disabled = !accessed_dirty;
+	role.guest_mode = true;
+	role.access = ACC_ALL;
+
+	return role;
+}
+
 void kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, bool execonly,
-			     bool accessed_dirty)
+			     bool accessed_dirty, gpa_t new_eptp)
 {
 	struct kvm_mmu *context = &vcpu->arch.mmu;
+	union kvm_mmu_page_role root_page_role =
+		kvm_calc_shadow_ept_root_page_role(vcpu, accessed_dirty);
 
-	MMU_WARN_ON(VALID_PAGE(context->root_hpa));
-
+	__kvm_mmu_new_cr3(vcpu, new_eptp, root_page_role, false);
 	context->shadow_root_level = PT64_ROOT_4LEVEL;
 
 	context->nx = true;
@@ -4567,10 +4850,8 @@ void kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, bool execonly,
 	context->invlpg = ept_invlpg;
 	context->update_pte = ept_update_pte;
 	context->root_level = PT64_ROOT_4LEVEL;
-	context->root_hpa = INVALID_PAGE;
 	context->direct_map = false;
-	context->base_role.ad_disabled = !accessed_dirty;
-	context->base_role.guest_mode = 1;
+	context->base_role.word = root_page_role.word & mmu_base_role_mask.word;
 	update_permission_bitmask(vcpu, context, true);
 	update_pkru_bitmask(vcpu, context, true);
 	update_last_nonleaf_level(vcpu, context);
@@ -4633,8 +4914,17 @@ static void init_kvm_nested_mmu(struct kvm_vcpu *vcpu)
 	update_last_nonleaf_level(vcpu, g_context);
 }
 
-static void init_kvm_mmu(struct kvm_vcpu *vcpu)
+void kvm_init_mmu(struct kvm_vcpu *vcpu, bool reset_roots)
 {
+	if (reset_roots) {
+		uint i;
+
+		vcpu->arch.mmu.root_hpa = INVALID_PAGE;
+
+		for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++)
+			vcpu->arch.mmu.prev_roots[i] = KVM_MMU_ROOT_INFO_INVALID;
+	}
+
 	if (mmu_is_nested(vcpu))
 		init_kvm_nested_mmu(vcpu);
 	else if (tdp_enabled)
@@ -4642,11 +4932,21 @@ static void init_kvm_mmu(struct kvm_vcpu *vcpu)
 	else
 		init_kvm_softmmu(vcpu);
 }
+EXPORT_SYMBOL_GPL(kvm_init_mmu);
+
+static union kvm_mmu_page_role
+kvm_mmu_calc_root_page_role(struct kvm_vcpu *vcpu)
+{
+	if (tdp_enabled)
+		return kvm_calc_tdp_mmu_root_page_role(vcpu);
+	else
+		return kvm_calc_shadow_mmu_root_page_role(vcpu);
+}
 
 void kvm_mmu_reset_context(struct kvm_vcpu *vcpu)
 {
 	kvm_mmu_unload(vcpu);
-	init_kvm_mmu(vcpu);
+	kvm_init_mmu(vcpu, true);
 }
 EXPORT_SYMBOL_GPL(kvm_mmu_reset_context);
 
@@ -4661,8 +4961,8 @@ int kvm_mmu_load(struct kvm_vcpu *vcpu)
 	kvm_mmu_sync_roots(vcpu);
 	if (r)
 		goto out;
-	/* set_cr3() should ensure TLB has been flushed */
-	vcpu->arch.mmu.set_cr3(vcpu, vcpu->arch.mmu.root_hpa);
+	kvm_mmu_load_cr3(vcpu);
+	kvm_x86_ops->tlb_flush(vcpu, true);
 out:
 	return r;
 }
@@ -4670,7 +4970,7 @@ EXPORT_SYMBOL_GPL(kvm_mmu_load);
 
 void kvm_mmu_unload(struct kvm_vcpu *vcpu)
 {
-	kvm_mmu_free_roots(vcpu);
+	kvm_mmu_free_roots(vcpu, KVM_MMU_ROOTS_ALL);
 	WARN_ON(VALID_PAGE(vcpu->arch.mmu.root_hpa));
 }
 EXPORT_SYMBOL_GPL(kvm_mmu_unload);
@@ -4823,16 +5123,6 @@ static void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
 	u64 entry, gentry, *spte;
 	int npte;
 	bool remote_flush, local_flush;
-	union kvm_mmu_page_role mask = { };
-
-	mask.cr0_wp = 1;
-	mask.cr4_pae = 1;
-	mask.nxe = 1;
-	mask.smep_andnot_wp = 1;
-	mask.smap_andnot_wp = 1;
-	mask.smm = 1;
-	mask.guest_mode = 1;
-	mask.ad_disabled = 1;
 
 	/*
 	 * If we don't have indirect shadow pages, it means no page is
@@ -4876,7 +5166,7 @@ static void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
 			mmu_page_zap_pte(vcpu->kvm, sp, spte);
 			if (gentry &&
 			      !((sp->role.word ^ vcpu->arch.mmu.base_role.word)
-			      & mask.word) && rmap_can_add(vcpu))
+			      & mmu_base_role_mask.word) && rmap_can_add(vcpu))
 				mmu_pte_write_new_pte(vcpu, sp, spte, &gentry);
 			if (need_remote_flush(entry, *spte))
 				remote_flush = true;
@@ -5001,12 +5291,67 @@ EXPORT_SYMBOL_GPL(kvm_mmu_page_fault);
 
 void kvm_mmu_invlpg(struct kvm_vcpu *vcpu, gva_t gva)
 {
-	vcpu->arch.mmu.invlpg(vcpu, gva);
-	kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
+	struct kvm_mmu *mmu = &vcpu->arch.mmu;
+	int i;
+
+	/* INVLPG on a * non-canonical address is a NOP according to the SDM.  */
+	if (is_noncanonical_address(gva, vcpu))
+		return;
+
+	mmu->invlpg(vcpu, gva, mmu->root_hpa);
+
+	/*
+	 * INVLPG is required to invalidate any global mappings for the VA,
+	 * irrespective of PCID. Since it would take us roughly similar amount
+	 * of work to determine whether any of the prev_root mappings of the VA
+	 * is marked global, or to just sync it blindly, so we might as well
+	 * just always sync it.
+	 *
+	 * Mappings not reachable via the current cr3 or the prev_roots will be
+	 * synced when switching to that cr3, so nothing needs to be done here
+	 * for them.
+	 */
+	for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++)
+		if (VALID_PAGE(mmu->prev_roots[i].hpa))
+			mmu->invlpg(vcpu, gva, mmu->prev_roots[i].hpa);
+
+	kvm_x86_ops->tlb_flush_gva(vcpu, gva);
 	++vcpu->stat.invlpg;
 }
 EXPORT_SYMBOL_GPL(kvm_mmu_invlpg);
 
+void kvm_mmu_invpcid_gva(struct kvm_vcpu *vcpu, gva_t gva, unsigned long pcid)
+{
+	struct kvm_mmu *mmu = &vcpu->arch.mmu;
+	bool tlb_flush = false;
+	uint i;
+
+	if (pcid == kvm_get_active_pcid(vcpu)) {
+		mmu->invlpg(vcpu, gva, mmu->root_hpa);
+		tlb_flush = true;
+	}
+
+	for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++) {
+		if (VALID_PAGE(mmu->prev_roots[i].hpa) &&
+		    pcid == kvm_get_pcid(vcpu, mmu->prev_roots[i].cr3)) {
+			mmu->invlpg(vcpu, gva, mmu->prev_roots[i].hpa);
+			tlb_flush = true;
+		}
+	}
+
+	if (tlb_flush)
+		kvm_x86_ops->tlb_flush_gva(vcpu, gva);
+
+	++vcpu->stat.invlpg;
+
+	/*
+	 * Mappings not reachable via the current cr3 or the prev_roots will be
+	 * synced when switching to that cr3, so nothing needs to be done here
+	 * for them.
+	 */
+}
+EXPORT_SYMBOL_GPL(kvm_mmu_invpcid_gva);
+
 void kvm_enable_tdp(void)
 {
 	tdp_enabled = true;
@@ -5030,6 +5375,9 @@ static int alloc_mmu_pages(struct kvm_vcpu *vcpu)
 	struct page *page;
 	struct page *page;
 	int i;
 	int i;
 
 
+	if (tdp_enabled)
+		return 0;
+
 	/*
 	/*
 	 * When emulating 32-bit mode, cr3 is only 32 bits even on x86_64.
 	 * When emulating 32-bit mode, cr3 is only 32 bits even on x86_64.
 	 * Therefore we need to allocate shadow page tables in the first
 	 * Therefore we need to allocate shadow page tables in the first
@@ -5048,11 +5396,16 @@ static int alloc_mmu_pages(struct kvm_vcpu *vcpu)
 
 
 int kvm_mmu_create(struct kvm_vcpu *vcpu)
 int kvm_mmu_create(struct kvm_vcpu *vcpu)
 {
 {
+	uint i;
+
 	vcpu->arch.walk_mmu = &vcpu->arch.mmu;
 	vcpu->arch.walk_mmu = &vcpu->arch.mmu;
 	vcpu->arch.mmu.root_hpa = INVALID_PAGE;
 	vcpu->arch.mmu.root_hpa = INVALID_PAGE;
 	vcpu->arch.mmu.translate_gpa = translate_gpa;
 	vcpu->arch.mmu.translate_gpa = translate_gpa;
 	vcpu->arch.nested_mmu.translate_gpa = translate_nested_gpa;
 	vcpu->arch.nested_mmu.translate_gpa = translate_nested_gpa;
 
 
+	for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++)
+		vcpu->arch.mmu.prev_roots[i] = KVM_MMU_ROOT_INFO_INVALID;
+
 	return alloc_mmu_pages(vcpu);
 	return alloc_mmu_pages(vcpu);
 }
 }
 
 
@@ -5060,7 +5413,7 @@ void kvm_mmu_setup(struct kvm_vcpu *vcpu)
 {
 {
 	MMU_WARN_ON(VALID_PAGE(vcpu->arch.mmu.root_hpa));
 	MMU_WARN_ON(VALID_PAGE(vcpu->arch.mmu.root_hpa));
 
 
-	init_kvm_mmu(vcpu);
+	kvm_init_mmu(vcpu, true);
 }
 }
 
 
 static void kvm_mmu_invalidate_zap_pages_in_memslot(struct kvm *kvm,
 static void kvm_mmu_invalidate_zap_pages_in_memslot(struct kvm *kvm,
@@ -5500,7 +5853,7 @@ int kvm_mmu_module_init(void)
 {
 {
 	int ret = -ENOMEM;
 	int ret = -ENOMEM;
 
 
-	kvm_mmu_clear_all_pte_masks();
+	kvm_mmu_reset_all_pte_masks();
 
 
 	pte_list_desc_cache = kmem_cache_create("pte_list_desc",
 	pte_list_desc_cache = kmem_cache_create("pte_list_desc",
 					    sizeof(struct pte_list_desc),
 					    sizeof(struct pte_list_desc),

+ 23 - 1
arch/x86/kvm/mmu.h

@@ -61,9 +61,10 @@ void kvm_mmu_set_mmio_spte_mask(u64 mmio_mask, u64 mmio_value);
 void
 void
 reset_shadow_zero_bits_mask(struct kvm_vcpu *vcpu, struct kvm_mmu *context);
 reset_shadow_zero_bits_mask(struct kvm_vcpu *vcpu, struct kvm_mmu *context);
 
 
+void kvm_init_mmu(struct kvm_vcpu *vcpu, bool reset_roots);
 void kvm_init_shadow_mmu(struct kvm_vcpu *vcpu);
 void kvm_init_shadow_mmu(struct kvm_vcpu *vcpu);
 void kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, bool execonly,
 void kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, bool execonly,
-			     bool accessed_dirty);
+			     bool accessed_dirty, gpa_t new_eptp);
 bool kvm_can_do_async_pf(struct kvm_vcpu *vcpu);
 bool kvm_can_do_async_pf(struct kvm_vcpu *vcpu);
 int kvm_handle_page_fault(struct kvm_vcpu *vcpu, u64 error_code,
 int kvm_handle_page_fault(struct kvm_vcpu *vcpu, u64 error_code,
 				u64 fault_address, char *insn, int insn_len);
 				u64 fault_address, char *insn, int insn_len);
@@ -85,6 +86,27 @@ static inline int kvm_mmu_reload(struct kvm_vcpu *vcpu)
 	return kvm_mmu_load(vcpu);
 	return kvm_mmu_load(vcpu);
 }
 }
 
 
+static inline unsigned long kvm_get_pcid(struct kvm_vcpu *vcpu, gpa_t cr3)
+{
+	BUILD_BUG_ON((X86_CR3_PCID_MASK & PAGE_MASK) != 0);
+
+	return kvm_read_cr4_bits(vcpu, X86_CR4_PCIDE)
+	       ? cr3 & X86_CR3_PCID_MASK
+	       : 0;
+}
+
+static inline unsigned long kvm_get_active_pcid(struct kvm_vcpu *vcpu)
+{
+	return kvm_get_pcid(vcpu, kvm_read_cr3(vcpu));
+}
+
+static inline void kvm_mmu_load_cr3(struct kvm_vcpu *vcpu)
+{
+	if (VALID_PAGE(vcpu->arch.mmu.root_hpa))
+		vcpu->arch.mmu.set_cr3(vcpu, vcpu->arch.mmu.root_hpa |
+					     kvm_get_active_pcid(vcpu));
+}
+
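A side note on the helpers added here: kvm_get_pcid() extracts the PCID from CR3 only when CR4.PCIDE is set, and kvm_mmu_load_cr3() then ORs the active PCID back into the root HPA it hands to set_cr3(). A minimal sketch with a hypothetical CR3 value (illustrative only, not part of the patch; X86_CR3_PCID_MASK covers CR3 bits 11:0):

/* Illustrative only: what kvm_get_pcid() computes for a given guest CR3. */
static unsigned long example_pcid_from_cr3(unsigned long cr3, bool pcide)
{
	return pcide ? (cr3 & 0xFFFUL) : 0;	/* X86_CR3_PCID_MASK */
}

/*
 * example_pcid_from_cr3(0x12345007, true)  == 0x007
 * example_pcid_from_cr3(0x12345007, false) == 0
 */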
 /*
 /*
  * Currently, we have two sorts of write-protection, a) the first one
  * Currently, we have two sorts of write-protection, a) the first one
  * write-protects guest page to sync the guest modification, b) another one is
  * write-protects guest page to sync the guest modification, b) another one is

+ 16 - 12
arch/x86/kvm/paging_tmpl.h

@@ -181,7 +181,7 @@ no_present:
  * set bit 0 if execute only is supported. Here, we repurpose ACC_USER_MASK
  * set bit 0 if execute only is supported. Here, we repurpose ACC_USER_MASK
  * to signify readability since it isn't used in the EPT case
  * to signify readability since it isn't used in the EPT case
  */
  */
-static inline unsigned FNAME(gpte_access)(struct kvm_vcpu *vcpu, u64 gpte)
+static inline unsigned FNAME(gpte_access)(u64 gpte)
 {
 {
 	unsigned access;
 	unsigned access;
 #if PTTYPE == PTTYPE_EPT
 #if PTTYPE == PTTYPE_EPT
@@ -394,8 +394,8 @@ retry_walk:
 	accessed_dirty = have_ad ? pte_access & PT_GUEST_ACCESSED_MASK : 0;
 	accessed_dirty = have_ad ? pte_access & PT_GUEST_ACCESSED_MASK : 0;
 
 
 	/* Convert to ACC_*_MASK flags for struct guest_walker.  */
 	/* Convert to ACC_*_MASK flags for struct guest_walker.  */
-	walker->pt_access = FNAME(gpte_access)(vcpu, pt_access ^ walk_nx_mask);
-	walker->pte_access = FNAME(gpte_access)(vcpu, pte_access ^ walk_nx_mask);
+	walker->pt_access = FNAME(gpte_access)(pt_access ^ walk_nx_mask);
+	walker->pte_access = FNAME(gpte_access)(pte_access ^ walk_nx_mask);
 	errcode = permission_fault(vcpu, mmu, walker->pte_access, pte_pkey, access);
 	errcode = permission_fault(vcpu, mmu, walker->pte_access, pte_pkey, access);
 	if (unlikely(errcode))
 	if (unlikely(errcode))
 		goto error;
 		goto error;
@@ -508,7 +508,7 @@ FNAME(prefetch_gpte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
 	pgprintk("%s: gpte %llx spte %p\n", __func__, (u64)gpte, spte);
 	pgprintk("%s: gpte %llx spte %p\n", __func__, (u64)gpte, spte);
 
 
 	gfn = gpte_to_gfn(gpte);
 	gfn = gpte_to_gfn(gpte);
-	pte_access = sp->role.access & FNAME(gpte_access)(vcpu, gpte);
+	pte_access = sp->role.access & FNAME(gpte_access)(gpte);
 	FNAME(protect_clean_gpte)(&vcpu->arch.mmu, &pte_access, gpte);
 	FNAME(protect_clean_gpte)(&vcpu->arch.mmu, &pte_access, gpte);
 	pfn = pte_prefetch_gfn_to_pfn(vcpu, gfn,
 	pfn = pte_prefetch_gfn_to_pfn(vcpu, gfn,
 			no_dirty_log && (pte_access & ACC_WRITE_MASK));
 			no_dirty_log && (pte_access & ACC_WRITE_MASK));
@@ -856,7 +856,7 @@ static gpa_t FNAME(get_level1_sp_gpa)(struct kvm_mmu_page *sp)
 	return gfn_to_gpa(sp->gfn) + offset * sizeof(pt_element_t);
 	return gfn_to_gpa(sp->gfn) + offset * sizeof(pt_element_t);
 }
 }
 
 
-static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva)
+static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva, hpa_t root_hpa)
 {
 {
 	struct kvm_shadow_walk_iterator iterator;
 	struct kvm_shadow_walk_iterator iterator;
 	struct kvm_mmu_page *sp;
 	struct kvm_mmu_page *sp;
@@ -871,13 +871,13 @@ static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva)
 	 */
 	 */
 	mmu_topup_memory_caches(vcpu);
 	mmu_topup_memory_caches(vcpu);
 
 
-	if (!VALID_PAGE(vcpu->arch.mmu.root_hpa)) {
+	if (!VALID_PAGE(root_hpa)) {
 		WARN_ON(1);
 		WARN_ON(1);
 		return;
 		return;
 	}
 	}
 
 
 	spin_lock(&vcpu->kvm->mmu_lock);
 	spin_lock(&vcpu->kvm->mmu_lock);
-	for_each_shadow_entry(vcpu, gva, iterator) {
+	for_each_shadow_entry_using_root(vcpu, root_hpa, gva, iterator) {
 		level = iterator.level;
 		level = iterator.level;
 		sptep = iterator.sptep;
 		sptep = iterator.sptep;
 
 
@@ -968,6 +968,7 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
 	int i, nr_present = 0;
 	int i, nr_present = 0;
 	bool host_writable;
 	bool host_writable;
 	gpa_t first_pte_gpa;
 	gpa_t first_pte_gpa;
+	int set_spte_ret = 0;
 
 
 	/* direct kvm_mmu_page can not be unsync. */
 	/* direct kvm_mmu_page can not be unsync. */
 	BUG_ON(sp->role.direct);
 	BUG_ON(sp->role.direct);
@@ -1002,7 +1003,7 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
 
 
 		gfn = gpte_to_gfn(gpte);
 		gfn = gpte_to_gfn(gpte);
 		pte_access = sp->role.access;
 		pte_access = sp->role.access;
-		pte_access &= FNAME(gpte_access)(vcpu, gpte);
+		pte_access &= FNAME(gpte_access)(gpte);
 		FNAME(protect_clean_gpte)(&vcpu->arch.mmu, &pte_access, gpte);
 		FNAME(protect_clean_gpte)(&vcpu->arch.mmu, &pte_access, gpte);
 
 
 		if (sync_mmio_spte(vcpu, &sp->spt[i], gfn, pte_access,
 		if (sync_mmio_spte(vcpu, &sp->spt[i], gfn, pte_access,
@@ -1024,12 +1025,15 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
 
 
 		host_writable = sp->spt[i] & SPTE_HOST_WRITEABLE;
 		host_writable = sp->spt[i] & SPTE_HOST_WRITEABLE;
 
 
-		set_spte(vcpu, &sp->spt[i], pte_access,
-			 PT_PAGE_TABLE_LEVEL, gfn,
-			 spte_to_pfn(sp->spt[i]), true, false,
-			 host_writable);
+		set_spte_ret |= set_spte(vcpu, &sp->spt[i],
+					 pte_access, PT_PAGE_TABLE_LEVEL,
+					 gfn, spte_to_pfn(sp->spt[i]),
+					 true, false, host_writable);
 	}
 	}
 
 
+	if (set_spte_ret & SET_SPTE_NEED_REMOTE_TLB_FLUSH)
+		kvm_flush_remote_tlbs(vcpu->kvm);
+
 	return nr_present;
 	return nr_present;
 }
 }
 
 

+ 8 - 4
arch/x86/kvm/svm.c

@@ -2884,7 +2884,6 @@ static void nested_svm_set_tdp_cr3(struct kvm_vcpu *vcpu,
 
 
 	svm->vmcb->control.nested_cr3 = __sme_set(root);
 	svm->vmcb->control.nested_cr3 = __sme_set(root);
 	mark_dirty(svm->vmcb, VMCB_NPT);
 	mark_dirty(svm->vmcb, VMCB_NPT);
-	svm_flush_tlb(vcpu, true);
 }
 }
 
 
 static void nested_svm_inject_npf_exit(struct kvm_vcpu *vcpu,
 static void nested_svm_inject_npf_exit(struct kvm_vcpu *vcpu,
@@ -5435,6 +5434,13 @@ static void svm_flush_tlb(struct kvm_vcpu *vcpu, bool invalidate_gpa)
 		svm->asid_generation--;
 		svm->asid_generation--;
 }
 }
 
 
+static void svm_flush_tlb_gva(struct kvm_vcpu *vcpu, gva_t gva)
+{
+	struct vcpu_svm *svm = to_svm(vcpu);
+
+	invlpga(gva, svm->vmcb->control.asid);
+}
+
 static void svm_prepare_guest_switch(struct kvm_vcpu *vcpu)
 static void svm_prepare_guest_switch(struct kvm_vcpu *vcpu)
 {
 {
 }
 }
@@ -5766,7 +5772,6 @@ static void svm_set_cr3(struct kvm_vcpu *vcpu, unsigned long root)
 
 
 	svm->vmcb->save.cr3 = __sme_set(root);
 	svm->vmcb->save.cr3 = __sme_set(root);
 	mark_dirty(svm->vmcb, VMCB_CR);
 	mark_dirty(svm->vmcb, VMCB_CR);
-	svm_flush_tlb(vcpu, true);
 }
 }
 
 
 static void set_tdp_cr3(struct kvm_vcpu *vcpu, unsigned long root)
 static void set_tdp_cr3(struct kvm_vcpu *vcpu, unsigned long root)
@@ -5779,8 +5784,6 @@ static void set_tdp_cr3(struct kvm_vcpu *vcpu, unsigned long root)
 	/* Also sync guest cr3 here in case we live migrate */
 	/* Also sync guest cr3 here in case we live migrate */
 	svm->vmcb->save.cr3 = kvm_read_cr3(vcpu);
 	svm->vmcb->save.cr3 = kvm_read_cr3(vcpu);
 	mark_dirty(svm->vmcb, VMCB_CR);
 	mark_dirty(svm->vmcb, VMCB_CR);
-
-	svm_flush_tlb(vcpu, true);
 }
 }
 
 
 static int is_disabled(void)
 static int is_disabled(void)
@@ -7090,6 +7093,7 @@ static struct kvm_x86_ops svm_x86_ops __ro_after_init = {
 	.set_rflags = svm_set_rflags,
 	.set_rflags = svm_set_rflags,
 
 
 	.tlb_flush = svm_flush_tlb,
 	.tlb_flush = svm_flush_tlb,
+	.tlb_flush_gva = svm_flush_tlb_gva,
 
 
 	.run = svm_vcpu_run,
 	.run = svm_vcpu_run,
 	.handle_exit = handle_exit,
 	.handle_exit = handle_exit,

+ 911 - 209
arch/x86/kvm/vmx.c

@@ -38,6 +38,7 @@
 #include "kvm_cache_regs.h"
 #include "kvm_cache_regs.h"
 #include "x86.h"
 #include "x86.h"
 
 
+#include <asm/asm.h>
 #include <asm/cpu.h>
 #include <asm/cpu.h>
 #include <asm/io.h>
 #include <asm/io.h>
 #include <asm/desc.h>
 #include <asm/desc.h>
@@ -332,22 +333,53 @@ static const struct kernel_param_ops vmentry_l1d_flush_ops = {
 };
 };
 module_param_cb(vmentry_l1d_flush, &vmentry_l1d_flush_ops, NULL, 0644);
 module_param_cb(vmentry_l1d_flush, &vmentry_l1d_flush_ops, NULL, 0644);
 
 
+enum ept_pointers_status {
+	EPT_POINTERS_CHECK = 0,
+	EPT_POINTERS_MATCH = 1,
+	EPT_POINTERS_MISMATCH = 2
+};
+
 struct kvm_vmx {
 struct kvm_vmx {
 	struct kvm kvm;
 	struct kvm kvm;
 
 
 	unsigned int tss_addr;
 	unsigned int tss_addr;
 	bool ept_identity_pagetable_done;
 	bool ept_identity_pagetable_done;
 	gpa_t ept_identity_map_addr;
 	gpa_t ept_identity_map_addr;
+
+	enum ept_pointers_status ept_pointers_match;
+	spinlock_t ept_pointer_lock;
 };
 };
 
 
 #define NR_AUTOLOAD_MSRS 8
 #define NR_AUTOLOAD_MSRS 8
 
 
+struct vmcs_hdr {
+	u32 revision_id:31;
+	u32 shadow_vmcs:1;
+};
+
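For reference, the new header bitfield places the shadow-VMCS indicator in bit 31 of the first dword, matching the open-coded revision_id |= (1u << 31) that this patch removes from enter_vmx_operation(). A small sketch, assuming the usual x86-64/GCC bitfield layout (illustrative, not part of the patch):

/* hdr.revision_id occupies bits 30:0, hdr.shadow_vmcs is bit 31. */
static inline u32 vmcs_hdr_to_u32_example(struct vmcs_hdr hdr)
{
	union {
		struct vmcs_hdr hdr;
		u32 word;
	} u = { .hdr = hdr };

	return u.word;	/* shadow_vmcs == 1  ->  bit 31 set */
}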
 struct vmcs {
 struct vmcs {
-	u32 revision_id;
+	struct vmcs_hdr hdr;
 	u32 abort;
 	u32 abort;
 	char data[0];
 	char data[0];
 };
 };
 
 
+/*
+ * vmcs_host_state tracks registers that are loaded from the VMCS on VMEXIT
+ * and whose values change infrequently, but are not constant.  I.e. this is
+ * used as a write-through cache of the corresponding VMCS fields.
+ */
+struct vmcs_host_state {
+	unsigned long cr3;	/* May not match real cr3 */
+	unsigned long cr4;	/* May not match real cr4 */
+	unsigned long gs_base;
+	unsigned long fs_base;
+
+	u16           fs_sel, gs_sel, ldt_sel;
+#ifdef CONFIG_X86_64
+	u16           ds_sel, es_sel;
+#endif
+};
+
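The write-through use shows up below in vmx_prepare_switch_to_guest(), where each field is compared against this cached copy and the VMCS is only written on a change. Distilled to a single field as a sketch (illustrative, not part of the patch):

/* Only issue the VMWRITE when the value changed, then refresh the cache. */
static inline void host_state_set_fs_base(struct vmcs_host_state *host_state,
					  unsigned long fs_base)
{
	if (unlikely(fs_base != host_state->fs_base)) {
		vmcs_writel(HOST_FS_BASE, fs_base);
		host_state->fs_base = fs_base;
	}
}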
 /*
 /*
  * Track a VMCS that may be loaded on a certain CPU. If it is (cpu!=-1), also
  * Track a VMCS that may be loaded on a certain CPU. If it is (cpu!=-1), also
  * remember whether it was VMLAUNCHed, and maintain a linked list of all VMCSs
  * remember whether it was VMLAUNCHed, and maintain a linked list of all VMCSs
@@ -359,14 +391,13 @@ struct loaded_vmcs {
 	int cpu;
 	int cpu;
 	bool launched;
 	bool launched;
 	bool nmi_known_unmasked;
 	bool nmi_known_unmasked;
-	unsigned long vmcs_host_cr3;	/* May not match real cr3 */
-	unsigned long vmcs_host_cr4;	/* May not match real cr4 */
 	/* Support for vnmi-less CPUs */
 	/* Support for vnmi-less CPUs */
 	int soft_vnmi_blocked;
 	int soft_vnmi_blocked;
 	ktime_t entry_time;
 	ktime_t entry_time;
 	s64 vnmi_blocked_time;
 	s64 vnmi_blocked_time;
 	unsigned long *msr_bitmap;
 	unsigned long *msr_bitmap;
 	struct list_head loaded_vmcss_on_cpu_link;
 	struct list_head loaded_vmcss_on_cpu_link;
+	struct vmcs_host_state host_state;
 };
 };
 
 
 struct shared_msr_entry {
 struct shared_msr_entry {
@@ -397,7 +428,7 @@ struct __packed vmcs12 {
 	/* According to the Intel spec, a VMCS region must start with the
 	/* According to the Intel spec, a VMCS region must start with the
 	 * following two fields. Then follow implementation-specific data.
 	 * following two fields. Then follow implementation-specific data.
 	 */
 	 */
-	u32 revision_id;
+	struct vmcs_hdr hdr;
 	u32 abort;
 	u32 abort;
 
 
 	u32 launch_state; /* set to 0 by VMCLEAR, to 1 by VMLAUNCH */
 	u32 launch_state; /* set to 0 by VMCLEAR, to 1 by VMLAUNCH */
@@ -565,7 +596,7 @@ struct __packed vmcs12 {
 		"Offset of " #field " in struct vmcs12 has changed.")
 		"Offset of " #field " in struct vmcs12 has changed.")
 
 
 static inline void vmx_check_vmcs12_offsets(void) {
 static inline void vmx_check_vmcs12_offsets(void) {
-	CHECK_OFFSET(revision_id, 0);
+	CHECK_OFFSET(hdr, 0);
 	CHECK_OFFSET(abort, 4);
 	CHECK_OFFSET(abort, 4);
 	CHECK_OFFSET(launch_state, 8);
 	CHECK_OFFSET(launch_state, 8);
 	CHECK_OFFSET(io_bitmap_a, 40);
 	CHECK_OFFSET(io_bitmap_a, 40);
@@ -783,6 +814,12 @@ struct nested_vmx {
 	 * memory during VMCLEAR and VMPTRLD.
 	 * memory during VMCLEAR and VMPTRLD.
 	 */
 	 */
 	struct vmcs12 *cached_vmcs12;
 	struct vmcs12 *cached_vmcs12;
+	/*
+	 * Cache of the guest's shadow VMCS, existing outside of guest
+	 * memory. Loaded from guest memory during VM entry. Flushed
+	 * to guest memory during VM exit.
+	 */
+	struct vmcs12 *cached_shadow_vmcs12;
 	/*
 	/*
 	 * Indicates if the shadow vmcs must be updated with the
 	 * Indicates if the shadow vmcs must be updated with the
 	 * data hold by vmcs12
 	 * data hold by vmcs12
@@ -933,25 +970,20 @@ struct vcpu_vmx {
 	/*
 	/*
 	 * loaded_vmcs points to the VMCS currently used in this vcpu. For a
 	 * loaded_vmcs points to the VMCS currently used in this vcpu. For a
 	 * non-nested (L1) guest, it always points to vmcs01. For a nested
 	 * non-nested (L1) guest, it always points to vmcs01. For a nested
-	 * guest (L2), it points to a different VMCS.
+	 * guest (L2), it points to a different VMCS.  loaded_cpu_state points
+	 * to the VMCS whose state is loaded into the CPU registers that only
+	 * need to be switched when transitioning to/from the kernel; a NULL
+	 * value indicates that host state is loaded.
 	 */
 	 */
 	struct loaded_vmcs    vmcs01;
 	struct loaded_vmcs    vmcs01;
 	struct loaded_vmcs   *loaded_vmcs;
 	struct loaded_vmcs   *loaded_vmcs;
+	struct loaded_vmcs   *loaded_cpu_state;
 	bool                  __launched; /* temporary, used in vmx_vcpu_run */
 	bool                  __launched; /* temporary, used in vmx_vcpu_run */
 	struct msr_autoload {
 	struct msr_autoload {
 		struct vmx_msrs guest;
 		struct vmx_msrs guest;
 		struct vmx_msrs host;
 		struct vmx_msrs host;
 	} msr_autoload;
 	} msr_autoload;
-	struct {
-		int           loaded;
-		u16           fs_sel, gs_sel, ldt_sel;
-#ifdef CONFIG_X86_64
-		u16           ds_sel, es_sel;
-#endif
-		int           gs_ldt_reload_needed;
-		int           fs_reload_needed;
-		u64           msr_host_bndcfgs;
-	} host_state;
+
 	struct {
 	struct {
 		int vm86_active;
 		int vm86_active;
 		ulong save_rflags;
 		ulong save_rflags;
@@ -1001,6 +1033,7 @@ struct vcpu_vmx {
 	 */
 	 */
 	u64 msr_ia32_feature_control;
 	u64 msr_ia32_feature_control;
 	u64 msr_ia32_feature_control_valid_bits;
 	u64 msr_ia32_feature_control_valid_bits;
+	u64 ept_pointer;
 };
 };
 
 
 enum segment_cache_field {
 enum segment_cache_field {
@@ -1220,6 +1253,11 @@ static inline struct vmcs12 *get_vmcs12(struct kvm_vcpu *vcpu)
 	return to_vmx(vcpu)->nested.cached_vmcs12;
 	return to_vmx(vcpu)->nested.cached_vmcs12;
 }
 }
 
 
+static inline struct vmcs12 *get_shadow_vmcs12(struct kvm_vcpu *vcpu)
+{
+	return to_vmx(vcpu)->nested.cached_shadow_vmcs12;
+}
+
 static bool nested_ept_ad_enabled(struct kvm_vcpu *vcpu);
 static bool nested_ept_ad_enabled(struct kvm_vcpu *vcpu);
 static unsigned long nested_ept_get_cr3(struct kvm_vcpu *vcpu);
 static unsigned long nested_ept_get_cr3(struct kvm_vcpu *vcpu);
 static u64 construct_eptp(struct kvm_vcpu *vcpu, unsigned long root_hpa);
 static u64 construct_eptp(struct kvm_vcpu *vcpu, unsigned long root_hpa);
@@ -1490,6 +1528,48 @@ static void evmcs_sanitize_exec_ctrls(struct vmcs_config *vmcs_conf)
 	 *	GUEST_IA32_RTIT_CTL		= 0x00002814,
 	 *	GUEST_IA32_RTIT_CTL		= 0x00002814,
 	 */
 	 */
 }
 }
+
+/* check_ept_pointer() should be under protection of ept_pointer_lock. */
+static void check_ept_pointer_match(struct kvm *kvm)
+{
+	struct kvm_vcpu *vcpu;
+	u64 tmp_eptp = INVALID_PAGE;
+	int i;
+
+	kvm_for_each_vcpu(i, vcpu, kvm) {
+		if (!VALID_PAGE(tmp_eptp)) {
+			tmp_eptp = to_vmx(vcpu)->ept_pointer;
+		} else if (tmp_eptp != to_vmx(vcpu)->ept_pointer) {
+			to_kvm_vmx(kvm)->ept_pointers_match
+				= EPT_POINTERS_MISMATCH;
+			return;
+		}
+	}
+
+	to_kvm_vmx(kvm)->ept_pointers_match = EPT_POINTERS_MATCH;
+}
+
+static int vmx_hv_remote_flush_tlb(struct kvm *kvm)
+{
+	int ret;
+
+	spin_lock(&to_kvm_vmx(kvm)->ept_pointer_lock);
+
+	if (to_kvm_vmx(kvm)->ept_pointers_match == EPT_POINTERS_CHECK)
+		check_ept_pointer_match(kvm);
+
+	if (to_kvm_vmx(kvm)->ept_pointers_match != EPT_POINTERS_MATCH) {
+		ret = -ENOTSUPP;
+		goto out;
+	}
+
+	ret = hyperv_flush_guest_mapping(
+			to_vmx(kvm_get_vcpu(kvm, 0))->ept_pointer);
+
+out:
+	spin_unlock(&to_kvm_vmx(kvm)->ept_pointer_lock);
+	return ret;
+}
 #else /* !IS_ENABLED(CONFIG_HYPERV) */
 #else /* !IS_ENABLED(CONFIG_HYPERV) */
 static inline void evmcs_write64(unsigned long field, u64 value) {}
 static inline void evmcs_write64(unsigned long field, u64 value) {}
 static inline void evmcs_write32(unsigned long field, u32 value) {}
 static inline void evmcs_write32(unsigned long field, u32 value) {}
@@ -1864,6 +1944,12 @@ static inline bool nested_cpu_supports_monitor_trap_flag(struct kvm_vcpu *vcpu)
 			CPU_BASED_MONITOR_TRAP_FLAG;
 			CPU_BASED_MONITOR_TRAP_FLAG;
 }
 }
 
 
+static inline bool nested_cpu_has_vmx_shadow_vmcs(struct kvm_vcpu *vcpu)
+{
+	return to_vmx(vcpu)->nested.msrs.secondary_ctls_high &
+		SECONDARY_EXEC_SHADOW_VMCS;
+}
+
 static inline bool nested_cpu_has(struct vmcs12 *vmcs12, u32 bit)
 static inline bool nested_cpu_has(struct vmcs12 *vmcs12, u32 bit)
 {
 {
 	return vmcs12->cpu_based_vm_exec_control & bit;
 	return vmcs12->cpu_based_vm_exec_control & bit;
@@ -1944,6 +2030,11 @@ static inline bool nested_cpu_has_eptp_switching(struct vmcs12 *vmcs12)
 		 VMX_VMFUNC_EPTP_SWITCHING);
 		 VMX_VMFUNC_EPTP_SWITCHING);
 }
 }
 
 
+static inline bool nested_cpu_has_shadow_vmcs(struct vmcs12 *vmcs12)
+{
+	return nested_cpu_has2(vmcs12, SECONDARY_EXEC_SHADOW_VMCS);
+}
+
 static inline bool is_nmi(u32 intr_info)
 static inline bool is_nmi(u32 intr_info)
 {
 {
 	return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VALID_MASK))
 	return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VALID_MASK))
@@ -1974,11 +2065,12 @@ static inline void __invvpid(int ext, u16 vpid, gva_t gva)
 	u64 rsvd : 48;
 	u64 rsvd : 48;
 	u64 gva;
 	u64 gva;
     } operand = { vpid, 0, gva };
     } operand = { vpid, 0, gva };
+    bool error;
 
 
-    asm volatile (__ex(ASM_VMX_INVVPID)
-		  /* CF==1 or ZF==1 --> rc = -1 */
-		  "; ja 1f ; ud2 ; 1:"
-		  : : "a"(&operand), "c"(ext) : "cc", "memory");
+    asm volatile (__ex(ASM_VMX_INVVPID) CC_SET(na)
+		  : CC_OUT(na) (error) : "a"(&operand), "c"(ext)
+		  : "memory");
+    BUG_ON(error);
 }
 }
 
 
 static inline void __invept(int ext, u64 eptp, gpa_t gpa)
 static inline void __invept(int ext, u64 eptp, gpa_t gpa)
@@ -1986,11 +2078,12 @@ static inline void __invept(int ext, u64 eptp, gpa_t gpa)
 	struct {
 	struct {
 		u64 eptp, gpa;
 		u64 eptp, gpa;
 	} operand = {eptp, gpa};
 	} operand = {eptp, gpa};
+	bool error;
 
 
-	asm volatile (__ex(ASM_VMX_INVEPT)
-			/* CF==1 or ZF==1 --> rc = -1 */
-			"; ja 1f ; ud2 ; 1:\n"
-			: : "a" (&operand), "c" (ext) : "cc", "memory");
+	asm volatile (__ex(ASM_VMX_INVEPT) CC_SET(na)
+		      : CC_OUT(na) (error) : "a" (&operand), "c" (ext)
+		      : "memory");
+	BUG_ON(error);
 }
 }
 
 
 static struct shared_msr_entry *find_msr_entry(struct vcpu_vmx *vmx, u32 msr)
 static struct shared_msr_entry *find_msr_entry(struct vcpu_vmx *vmx, u32 msr)
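CC_SET()/CC_OUT() come from asm/asm.h (the include added at the top of this patch): with compiler support for asm flag outputs they expand to a "=@cc<cond>" output constraint, otherwise to a setcc into an ordinary output, so a VMX instruction's CF=1/ZF=1 failure condition is read directly from EFLAGS rather than via the old setna/ja-ud2 sequences. A simplified stand-alone sketch, assuming flag-output support (not the kernel macros themselves):

/*
 * "na" (not above) is true when CF=1 or ZF=1, i.e. when the VMX instruction
 * reported VMfailInvalid or VMfailValid.
 */
static inline bool vmclear_failed_example(u64 vmcs_pa)
{
	bool error;

	asm volatile("vmclear %1"
		     : "=@ccna" (error)
		     : "m" (vmcs_pa)
		     : "memory");
	return error;
}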
@@ -2006,12 +2099,12 @@ static struct shared_msr_entry *find_msr_entry(struct vcpu_vmx *vmx, u32 msr)
 static void vmcs_clear(struct vmcs *vmcs)
 static void vmcs_clear(struct vmcs *vmcs)
 {
 {
 	u64 phys_addr = __pa(vmcs);
 	u64 phys_addr = __pa(vmcs);
-	u8 error;
+	bool error;
 
 
-	asm volatile (__ex(ASM_VMX_VMCLEAR_RAX) "; setna %0"
-		      : "=qm"(error) : "a"(&phys_addr), "m"(phys_addr)
-		      : "cc", "memory");
-	if (error)
+	asm volatile (__ex(ASM_VMX_VMCLEAR_RAX) CC_SET(na)
+		      : CC_OUT(na) (error) : "a"(&phys_addr), "m"(phys_addr)
+		      : "memory");
+	if (unlikely(error))
 		printk(KERN_ERR "kvm: vmclear fail: %p/%llx\n",
 		printk(KERN_ERR "kvm: vmclear fail: %p/%llx\n",
 		       vmcs, phys_addr);
 		       vmcs, phys_addr);
 }
 }
@@ -2028,15 +2121,15 @@ static inline void loaded_vmcs_init(struct loaded_vmcs *loaded_vmcs)
 static void vmcs_load(struct vmcs *vmcs)
 static void vmcs_load(struct vmcs *vmcs)
 {
 {
 	u64 phys_addr = __pa(vmcs);
 	u64 phys_addr = __pa(vmcs);
-	u8 error;
+	bool error;
 
 
 	if (static_branch_unlikely(&enable_evmcs))
 	if (static_branch_unlikely(&enable_evmcs))
 		return evmcs_load(phys_addr);
 		return evmcs_load(phys_addr);
 
 
-	asm volatile (__ex(ASM_VMX_VMPTRLD_RAX) "; setna %0"
-			: "=qm"(error) : "a"(&phys_addr), "m"(phys_addr)
-			: "cc", "memory");
-	if (error)
+	asm volatile (__ex(ASM_VMX_VMPTRLD_RAX) CC_SET(na)
+		      : CC_OUT(na) (error) : "a"(&phys_addr), "m"(phys_addr)
+		      : "memory");
+	if (unlikely(error))
 		printk(KERN_ERR "kvm: vmptrld %p/%llx failed\n",
 		printk(KERN_ERR "kvm: vmptrld %p/%llx failed\n",
 		       vmcs, phys_addr);
 		       vmcs, phys_addr);
 }
 }
@@ -2114,6 +2207,19 @@ static void loaded_vmcs_clear(struct loaded_vmcs *loaded_vmcs)
 			 __loaded_vmcs_clear, loaded_vmcs, 1);
 			 __loaded_vmcs_clear, loaded_vmcs, 1);
 }
 }
 
 
+static inline bool vpid_sync_vcpu_addr(int vpid, gva_t addr)
+{
+	if (vpid == 0)
+		return true;
+
+	if (cpu_has_vmx_invvpid_individual_addr()) {
+		__invvpid(VMX_VPID_EXTENT_INDIVIDUAL_ADDR, vpid, addr);
+		return true;
+	}
+
+	return false;
+}
+
 static inline void vpid_sync_vcpu_single(int vpid)
 static inline void vpid_sync_vcpu_single(int vpid)
 {
 {
 	if (vpid == 0)
 	if (vpid == 0)
@@ -2248,10 +2354,10 @@ static noinline void vmwrite_error(unsigned long field, unsigned long value)
 
 
 static __always_inline void __vmcs_writel(unsigned long field, unsigned long value)
 static __always_inline void __vmcs_writel(unsigned long field, unsigned long value)
 {
 {
-	u8 error;
+	bool error;
 
 
-	asm volatile (__ex(ASM_VMX_VMWRITE_RAX_RDX) "; setna %0"
-		       : "=q"(error) : "a"(value), "d"(field) : "cc");
+	asm volatile (__ex(ASM_VMX_VMWRITE_RAX_RDX) CC_SET(na)
+		      : CC_OUT(na) (error) : "a"(value), "d"(field));
 	if (unlikely(error))
 	if (unlikely(error))
 		vmwrite_error(field, value);
 		vmwrite_error(field, value);
 }
 }
@@ -2735,121 +2841,150 @@ static unsigned long segment_base(u16 selector)
 }
 }
 #endif
 #endif
 
 
-static void vmx_save_host_state(struct kvm_vcpu *vcpu)
+static void vmx_prepare_switch_to_guest(struct kvm_vcpu *vcpu)
 {
 {
 	struct vcpu_vmx *vmx = to_vmx(vcpu);
 	struct vcpu_vmx *vmx = to_vmx(vcpu);
+	struct vmcs_host_state *host_state;
 #ifdef CONFIG_X86_64
 #ifdef CONFIG_X86_64
 	int cpu = raw_smp_processor_id();
 	int cpu = raw_smp_processor_id();
-	unsigned long fs_base, kernel_gs_base;
 #endif
 #endif
+	unsigned long fs_base, gs_base;
+	u16 fs_sel, gs_sel;
 	int i;
 	int i;
 
 
-	if (vmx->host_state.loaded)
+	if (vmx->loaded_cpu_state)
 		return;
 		return;
 
 
-	vmx->host_state.loaded = 1;
+	vmx->loaded_cpu_state = vmx->loaded_vmcs;
+	host_state = &vmx->loaded_cpu_state->host_state;
+
 	/*
 	/*
 	 * Set host fs and gs selectors.  Unfortunately, 22.2.3 does not
 	 * Set host fs and gs selectors.  Unfortunately, 22.2.3 does not
 	 * allow segment selectors with cpl > 0 or ti == 1.
 	 * allow segment selectors with cpl > 0 or ti == 1.
 	 */
 	 */
-	vmx->host_state.ldt_sel = kvm_read_ldt();
-	vmx->host_state.gs_ldt_reload_needed = vmx->host_state.ldt_sel;
+	host_state->ldt_sel = kvm_read_ldt();
 
 
 #ifdef CONFIG_X86_64
 #ifdef CONFIG_X86_64
+	savesegment(ds, host_state->ds_sel);
+	savesegment(es, host_state->es_sel);
+
+	gs_base = cpu_kernelmode_gs_base(cpu);
 	if (likely(is_64bit_mm(current->mm))) {
 	if (likely(is_64bit_mm(current->mm))) {
 		save_fsgs_for_kvm();
 		save_fsgs_for_kvm();
-		vmx->host_state.fs_sel = current->thread.fsindex;
-		vmx->host_state.gs_sel = current->thread.gsindex;
+		fs_sel = current->thread.fsindex;
+		gs_sel = current->thread.gsindex;
 		fs_base = current->thread.fsbase;
 		fs_base = current->thread.fsbase;
-		kernel_gs_base = current->thread.gsbase;
+		vmx->msr_host_kernel_gs_base = current->thread.gsbase;
 	} else {
 	} else {
-#endif
-		savesegment(fs, vmx->host_state.fs_sel);
-		savesegment(gs, vmx->host_state.gs_sel);
-#ifdef CONFIG_X86_64
+		savesegment(fs, fs_sel);
+		savesegment(gs, gs_sel);
 		fs_base = read_msr(MSR_FS_BASE);
 		fs_base = read_msr(MSR_FS_BASE);
-		kernel_gs_base = read_msr(MSR_KERNEL_GS_BASE);
-	}
-#endif
-	if (!(vmx->host_state.fs_sel & 7)) {
-		vmcs_write16(HOST_FS_SELECTOR, vmx->host_state.fs_sel);
-		vmx->host_state.fs_reload_needed = 0;
-	} else {
-		vmcs_write16(HOST_FS_SELECTOR, 0);
-		vmx->host_state.fs_reload_needed = 1;
+		vmx->msr_host_kernel_gs_base = read_msr(MSR_KERNEL_GS_BASE);
 	}
 	}
-	if (!(vmx->host_state.gs_sel & 7))
-		vmcs_write16(HOST_GS_SELECTOR, vmx->host_state.gs_sel);
-	else {
-		vmcs_write16(HOST_GS_SELECTOR, 0);
-		vmx->host_state.gs_ldt_reload_needed = 1;
-	}
-
-#ifdef CONFIG_X86_64
-	savesegment(ds, vmx->host_state.ds_sel);
-	savesegment(es, vmx->host_state.es_sel);
 
 
-	vmcs_writel(HOST_FS_BASE, fs_base);
-	vmcs_writel(HOST_GS_BASE, cpu_kernelmode_gs_base(cpu));
-
-	vmx->msr_host_kernel_gs_base = kernel_gs_base;
 	if (is_long_mode(&vmx->vcpu))
 	if (is_long_mode(&vmx->vcpu))
 		wrmsrl(MSR_KERNEL_GS_BASE, vmx->msr_guest_kernel_gs_base);
 		wrmsrl(MSR_KERNEL_GS_BASE, vmx->msr_guest_kernel_gs_base);
 #else
 #else
-	vmcs_writel(HOST_FS_BASE, segment_base(vmx->host_state.fs_sel));
-	vmcs_writel(HOST_GS_BASE, segment_base(vmx->host_state.gs_sel));
+	savesegment(fs, fs_sel);
+	savesegment(gs, gs_sel);
+	fs_base = segment_base(fs_sel);
+	gs_base = segment_base(gs_sel);
 #endif
 #endif
-	if (boot_cpu_has(X86_FEATURE_MPX))
-		rdmsrl(MSR_IA32_BNDCFGS, vmx->host_state.msr_host_bndcfgs);
+
+	if (unlikely(fs_sel != host_state->fs_sel)) {
+		if (!(fs_sel & 7))
+			vmcs_write16(HOST_FS_SELECTOR, fs_sel);
+		else
+			vmcs_write16(HOST_FS_SELECTOR, 0);
+		host_state->fs_sel = fs_sel;
+	}
+	if (unlikely(gs_sel != host_state->gs_sel)) {
+		if (!(gs_sel & 7))
+			vmcs_write16(HOST_GS_SELECTOR, gs_sel);
+		else
+			vmcs_write16(HOST_GS_SELECTOR, 0);
+		host_state->gs_sel = gs_sel;
+	}
+	if (unlikely(fs_base != host_state->fs_base)) {
+		vmcs_writel(HOST_FS_BASE, fs_base);
+		host_state->fs_base = fs_base;
+	}
+	if (unlikely(gs_base != host_state->gs_base)) {
+		vmcs_writel(HOST_GS_BASE, gs_base);
+		host_state->gs_base = gs_base;
+	}
+
 	for (i = 0; i < vmx->save_nmsrs; ++i)
 	for (i = 0; i < vmx->save_nmsrs; ++i)
 		kvm_set_shared_msr(vmx->guest_msrs[i].index,
 		kvm_set_shared_msr(vmx->guest_msrs[i].index,
 				   vmx->guest_msrs[i].data,
 				   vmx->guest_msrs[i].data,
 				   vmx->guest_msrs[i].mask);
 				   vmx->guest_msrs[i].mask);
 }
 }
 
 
-static void __vmx_load_host_state(struct vcpu_vmx *vmx)
+static void vmx_prepare_switch_to_host(struct vcpu_vmx *vmx)
 {
 {
-	if (!vmx->host_state.loaded)
+	struct vmcs_host_state *host_state;
+
+	if (!vmx->loaded_cpu_state)
 		return;
 		return;
 
 
+	WARN_ON_ONCE(vmx->loaded_cpu_state != vmx->loaded_vmcs);
+	host_state = &vmx->loaded_cpu_state->host_state;
+
 	++vmx->vcpu.stat.host_state_reload;
 	++vmx->vcpu.stat.host_state_reload;
-	vmx->host_state.loaded = 0;
+	vmx->loaded_cpu_state = NULL;
+
 #ifdef CONFIG_X86_64
 #ifdef CONFIG_X86_64
 	if (is_long_mode(&vmx->vcpu))
 	if (is_long_mode(&vmx->vcpu))
 		rdmsrl(MSR_KERNEL_GS_BASE, vmx->msr_guest_kernel_gs_base);
 		rdmsrl(MSR_KERNEL_GS_BASE, vmx->msr_guest_kernel_gs_base);
 #endif
 #endif
-	if (vmx->host_state.gs_ldt_reload_needed) {
-		kvm_load_ldt(vmx->host_state.ldt_sel);
+	if (host_state->ldt_sel || (host_state->gs_sel & 7)) {
+		kvm_load_ldt(host_state->ldt_sel);
 #ifdef CONFIG_X86_64
 #ifdef CONFIG_X86_64
-		load_gs_index(vmx->host_state.gs_sel);
+		load_gs_index(host_state->gs_sel);
 #else
 #else
-		loadsegment(gs, vmx->host_state.gs_sel);
+		loadsegment(gs, host_state->gs_sel);
 #endif
 #endif
 	}
 	}
-	if (vmx->host_state.fs_reload_needed)
-		loadsegment(fs, vmx->host_state.fs_sel);
+	if (host_state->fs_sel & 7)
+		loadsegment(fs, host_state->fs_sel);
 #ifdef CONFIG_X86_64
 #ifdef CONFIG_X86_64
-	if (unlikely(vmx->host_state.ds_sel | vmx->host_state.es_sel)) {
-		loadsegment(ds, vmx->host_state.ds_sel);
-		loadsegment(es, vmx->host_state.es_sel);
+	if (unlikely(host_state->ds_sel | host_state->es_sel)) {
+		loadsegment(ds, host_state->ds_sel);
+		loadsegment(es, host_state->es_sel);
 	}
 	}
 #endif
 #endif
 	invalidate_tss_limit();
 	invalidate_tss_limit();
 #ifdef CONFIG_X86_64
 #ifdef CONFIG_X86_64
 	wrmsrl(MSR_KERNEL_GS_BASE, vmx->msr_host_kernel_gs_base);
 	wrmsrl(MSR_KERNEL_GS_BASE, vmx->msr_host_kernel_gs_base);
 #endif
 #endif
-	if (vmx->host_state.msr_host_bndcfgs)
-		wrmsrl(MSR_IA32_BNDCFGS, vmx->host_state.msr_host_bndcfgs);
 	load_fixmap_gdt(raw_smp_processor_id());
 	load_fixmap_gdt(raw_smp_processor_id());
 }
 }
 
 
-static void vmx_load_host_state(struct vcpu_vmx *vmx)
+#ifdef CONFIG_X86_64
+static u64 vmx_read_guest_kernel_gs_base(struct vcpu_vmx *vmx)
 {
 {
-	preempt_disable();
-	__vmx_load_host_state(vmx);
-	preempt_enable();
+	if (is_long_mode(&vmx->vcpu)) {
+		preempt_disable();
+		if (vmx->loaded_cpu_state)
+			rdmsrl(MSR_KERNEL_GS_BASE,
+			       vmx->msr_guest_kernel_gs_base);
+		preempt_enable();
+	}
+	return vmx->msr_guest_kernel_gs_base;
 }
 }
 
 
+static void vmx_write_guest_kernel_gs_base(struct vcpu_vmx *vmx, u64 data)
+{
+	if (is_long_mode(&vmx->vcpu)) {
+		preempt_disable();
+		if (vmx->loaded_cpu_state)
+			wrmsrl(MSR_KERNEL_GS_BASE, data);
+		preempt_enable();
+	}
+	vmx->msr_guest_kernel_gs_base = data;
+}
+#endif
+
 static void vmx_vcpu_pi_load(struct kvm_vcpu *vcpu, int cpu)
 static void vmx_vcpu_pi_load(struct kvm_vcpu *vcpu, int cpu)
 {
 {
 	struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu);
 	struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu);
@@ -2991,7 +3126,7 @@ static void vmx_vcpu_put(struct kvm_vcpu *vcpu)
 {
 {
 	vmx_vcpu_pi_put(vcpu);
 	vmx_vcpu_pi_put(vcpu);
 
 
-	__vmx_load_host_state(to_vmx(vcpu));
+	vmx_prepare_switch_to_host(to_vmx(vcpu));
 }
 }
 
 
 static bool emulation_required(struct kvm_vcpu *vcpu)
 static bool emulation_required(struct kvm_vcpu *vcpu)
@@ -3212,7 +3347,7 @@ static bool vmx_rdtscp_supported(void)
 
 
 static bool vmx_invpcid_supported(void)
 static bool vmx_invpcid_supported(void)
 {
 {
-	return cpu_has_vmx_invpcid() && enable_ept;
+	return cpu_has_vmx_invpcid();
 }
 }
 
 
 /*
 /*
@@ -3455,6 +3590,12 @@ static void nested_vmx_setup_ctls_msrs(struct nested_vmx_msrs *msrs, bool apicv)
 		SECONDARY_EXEC_APIC_REGISTER_VIRT |
 		SECONDARY_EXEC_APIC_REGISTER_VIRT |
 		SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY |
 		SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY |
 		SECONDARY_EXEC_WBINVD_EXITING;
 		SECONDARY_EXEC_WBINVD_EXITING;
+	/*
+	 * We can emulate "VMCS shadowing," even if the hardware
+	 * doesn't support it.
+	 */
+	msrs->secondary_ctls_high |=
+		SECONDARY_EXEC_SHADOW_VMCS;
 
 
 	if (enable_ept) {
 	if (enable_ept) {
 		/* nested EPT: emulate EPT also to L1 */
 		/* nested EPT: emulate EPT also to L1 */
@@ -3922,8 +4063,7 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
 		msr_info->data = vmcs_readl(GUEST_GS_BASE);
 		msr_info->data = vmcs_readl(GUEST_GS_BASE);
 		break;
 		break;
 	case MSR_KERNEL_GS_BASE:
 	case MSR_KERNEL_GS_BASE:
-		vmx_load_host_state(vmx);
-		msr_info->data = vmx->msr_guest_kernel_gs_base;
+		msr_info->data = vmx_read_guest_kernel_gs_base(vmx);
 		break;
 		break;
 #endif
 #endif
 	case MSR_EFER:
 	case MSR_EFER:
@@ -4023,8 +4163,7 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
 		vmcs_writel(GUEST_GS_BASE, data);
 		vmcs_writel(GUEST_GS_BASE, data);
 		break;
 		break;
 	case MSR_KERNEL_GS_BASE:
 	case MSR_KERNEL_GS_BASE:
-		vmx_load_host_state(vmx);
-		vmx->msr_guest_kernel_gs_base = data;
+		vmx_write_guest_kernel_gs_base(vmx, data);
 		break;
 		break;
 #endif
 #endif
 	case MSR_IA32_SYSENTER_CS:
 	case MSR_IA32_SYSENTER_CS:
@@ -4559,7 +4698,7 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf)
 	return 0;
 	return 0;
 }
 }
 
 
-static struct vmcs *alloc_vmcs_cpu(int cpu)
+static struct vmcs *alloc_vmcs_cpu(bool shadow, int cpu)
 {
 {
 	int node = cpu_to_node(cpu);
 	int node = cpu_to_node(cpu);
 	struct page *pages;
 	struct page *pages;
@@ -4573,10 +4712,12 @@ static struct vmcs *alloc_vmcs_cpu(int cpu)
 
 
 	/* KVM supports Enlightened VMCS v1 only */
 	/* KVM supports Enlightened VMCS v1 only */
 	if (static_branch_unlikely(&enable_evmcs))
 	if (static_branch_unlikely(&enable_evmcs))
-		vmcs->revision_id = KVM_EVMCS_VERSION;
+		vmcs->hdr.revision_id = KVM_EVMCS_VERSION;
 	else
 	else
-		vmcs->revision_id = vmcs_config.revision_id;
+		vmcs->hdr.revision_id = vmcs_config.revision_id;
 
 
+	if (shadow)
+		vmcs->hdr.shadow_vmcs = 1;
 	return vmcs;
 	return vmcs;
 }
 }
 
 
@@ -4600,14 +4741,14 @@ static void free_loaded_vmcs(struct loaded_vmcs *loaded_vmcs)
 	WARN_ON(loaded_vmcs->shadow_vmcs != NULL);
 	WARN_ON(loaded_vmcs->shadow_vmcs != NULL);
 }
 }
 
 
-static struct vmcs *alloc_vmcs(void)
+static struct vmcs *alloc_vmcs(bool shadow)
 {
 {
-	return alloc_vmcs_cpu(raw_smp_processor_id());
+	return alloc_vmcs_cpu(shadow, raw_smp_processor_id());
 }
 }
 
 
 static int alloc_loaded_vmcs(struct loaded_vmcs *loaded_vmcs)
 static int alloc_loaded_vmcs(struct loaded_vmcs *loaded_vmcs)
 {
 {
-	loaded_vmcs->vmcs = alloc_vmcs();
+	loaded_vmcs->vmcs = alloc_vmcs(false);
 	if (!loaded_vmcs->vmcs)
 	if (!loaded_vmcs->vmcs)
 		return -ENOMEM;
 		return -ENOMEM;
 
 
@@ -4629,6 +4770,9 @@ static int alloc_loaded_vmcs(struct loaded_vmcs *loaded_vmcs)
 			evmcs->hv_enlightenments_control.msr_bitmap = 1;
 			evmcs->hv_enlightenments_control.msr_bitmap = 1;
 		}
 		}
 	}
 	}
+
+	memset(&loaded_vmcs->host_state, 0, sizeof(struct vmcs_host_state));
+
 	return 0;
 	return 0;
 
 
 out_vmcs:
 out_vmcs:
@@ -4738,7 +4882,7 @@ static __init int alloc_kvm_area(void)
 	for_each_possible_cpu(cpu) {
 	for_each_possible_cpu(cpu) {
 		struct vmcs *vmcs;
 		struct vmcs *vmcs;
 
 
-		vmcs = alloc_vmcs_cpu(cpu);
+		vmcs = alloc_vmcs_cpu(false, cpu);
 		if (!vmcs) {
 		if (!vmcs) {
 			free_kvm_area();
 			free_kvm_area();
 			return -ENOMEM;
 			return -ENOMEM;
@@ -4755,7 +4899,7 @@ static __init int alloc_kvm_area(void)
 		 * physical CPU.
 		 * physical CPU.
 		 */
 		 */
 		if (static_branch_unlikely(&enable_evmcs))
 		if (static_branch_unlikely(&enable_evmcs))
-			vmcs->revision_id = vmcs_config.revision_id;
+			vmcs->hdr.revision_id = vmcs_config.revision_id;
 
 
 		per_cpu(vmxarea, cpu) = vmcs;
 		per_cpu(vmxarea, cpu) = vmcs;
 	}
 	}
@@ -4912,10 +5056,18 @@ static void vmx_set_efer(struct kvm_vcpu *vcpu, u64 efer)
 		return;
 		return;
 
 
 	/*
 	/*
-	 * Force kernel_gs_base reloading before EFER changes, as control
-	 * of this msr depends on is_long_mode().
+	 * MSR_KERNEL_GS_BASE is not intercepted when the guest is in
+	 * 64-bit mode as a 64-bit kernel may frequently access the
+	 * MSR.  This means we need to manually save/restore the MSR
+	 * when switching between guest and host state, but only if
+	 * the guest is in 64-bit mode.  Sync our cached value if the
+	 * guest is transitioning to 32-bit mode and the CPU contains
+	 * guest state, i.e. the cache is stale.
 	 */
 	 */
-	vmx_load_host_state(to_vmx(vcpu));
+#ifdef CONFIG_X86_64
+	if (!(efer & EFER_LMA))
+		(void)vmx_read_guest_kernel_gs_base(vmx);
+#endif
 	vcpu->arch.efer = efer;
 	vcpu->arch.efer = efer;
 	if (efer & EFER_LMA) {
 	if (efer & EFER_LMA) {
 		vm_entry_controls_setbit(to_vmx(vcpu), VM_ENTRY_IA32E_MODE);
 		vm_entry_controls_setbit(to_vmx(vcpu), VM_ENTRY_IA32E_MODE);
@@ -4972,6 +5124,20 @@ static void vmx_flush_tlb(struct kvm_vcpu *vcpu, bool invalidate_gpa)
 	__vmx_flush_tlb(vcpu, to_vmx(vcpu)->vpid, invalidate_gpa);
 	__vmx_flush_tlb(vcpu, to_vmx(vcpu)->vpid, invalidate_gpa);
 }
 }
 
 
+static void vmx_flush_tlb_gva(struct kvm_vcpu *vcpu, gva_t addr)
+{
+	int vpid = to_vmx(vcpu)->vpid;
+
+	if (!vpid_sync_vcpu_addr(vpid, addr))
+		vpid_sync_context(vpid);
+
+	/*
+	 * If VPIDs are not supported or enabled, then the above is a no-op.
+	 * But we don't really need a TLB flush in that case anyway, because
+	 * each VM entry/exit includes an implicit flush when VPID is 0.
+	 */
+}
+
 static void vmx_decache_cr0_guest_bits(struct kvm_vcpu *vcpu)
 static void vmx_decache_cr0_guest_bits(struct kvm_vcpu *vcpu)
 {
 {
 	ulong cr0_guest_owned_bits = vcpu->arch.cr0_guest_owned_bits;
 	ulong cr0_guest_owned_bits = vcpu->arch.cr0_guest_owned_bits;
@@ -5153,6 +5319,7 @@ static u64 construct_eptp(struct kvm_vcpu *vcpu, unsigned long root_hpa)
 
 
 static void vmx_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
 static void vmx_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
 {
 {
+	struct kvm *kvm = vcpu->kvm;
 	unsigned long guest_cr3;
 	unsigned long guest_cr3;
 	u64 eptp;
 	u64 eptp;
 
 
@@ -5160,15 +5327,23 @@ static void vmx_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
 	if (enable_ept) {
 	if (enable_ept) {
 		eptp = construct_eptp(vcpu, cr3);
 		eptp = construct_eptp(vcpu, cr3);
 		vmcs_write64(EPT_POINTER, eptp);
 		vmcs_write64(EPT_POINTER, eptp);
+
+		if (kvm_x86_ops->tlb_remote_flush) {
+			spin_lock(&to_kvm_vmx(kvm)->ept_pointer_lock);
+			to_vmx(vcpu)->ept_pointer = eptp;
+			to_kvm_vmx(kvm)->ept_pointers_match
+				= EPT_POINTERS_CHECK;
+			spin_unlock(&to_kvm_vmx(kvm)->ept_pointer_lock);
+		}
+
 		if (enable_unrestricted_guest || is_paging(vcpu) ||
 		if (enable_unrestricted_guest || is_paging(vcpu) ||
 		    is_guest_mode(vcpu))
 		    is_guest_mode(vcpu))
 			guest_cr3 = kvm_read_cr3(vcpu);
 			guest_cr3 = kvm_read_cr3(vcpu);
 		else
 		else
-			guest_cr3 = to_kvm_vmx(vcpu->kvm)->ept_identity_map_addr;
+			guest_cr3 = to_kvm_vmx(kvm)->ept_identity_map_addr;
 		ept_load_pdptrs(vcpu);
 		ept_load_pdptrs(vcpu);
 	}
 	}
 
 
-	vmx_flush_tlb(vcpu, true);
 	vmcs_writel(GUEST_CR3, guest_cr3);
 	vmcs_writel(GUEST_CR3, guest_cr3);
 }
 }
 
 
@@ -6104,19 +6279,19 @@ static void vmx_set_constant_host_state(struct vcpu_vmx *vmx)
 	 */
 	 */
 	cr3 = __read_cr3();
 	cr3 = __read_cr3();
 	vmcs_writel(HOST_CR3, cr3);		/* 22.2.3  FIXME: shadow tables */
 	vmcs_writel(HOST_CR3, cr3);		/* 22.2.3  FIXME: shadow tables */
-	vmx->loaded_vmcs->vmcs_host_cr3 = cr3;
+	vmx->loaded_vmcs->host_state.cr3 = cr3;
 
 
 	/* Save the most likely value for this task's CR4 in the VMCS. */
 	/* Save the most likely value for this task's CR4 in the VMCS. */
 	cr4 = cr4_read_shadow();
 	cr4 = cr4_read_shadow();
 	vmcs_writel(HOST_CR4, cr4);			/* 22.2.3, 22.2.5 */
 	vmcs_writel(HOST_CR4, cr4);			/* 22.2.3, 22.2.5 */
-	vmx->loaded_vmcs->vmcs_host_cr4 = cr4;
+	vmx->loaded_vmcs->host_state.cr4 = cr4;
 
 
 	vmcs_write16(HOST_CS_SELECTOR, __KERNEL_CS);  /* 22.2.4 */
 	vmcs_write16(HOST_CS_SELECTOR, __KERNEL_CS);  /* 22.2.4 */
 #ifdef CONFIG_X86_64
 #ifdef CONFIG_X86_64
 	/*
 	/*
 	 * Load null selectors, so we can avoid reloading them in
 	 * Load null selectors, so we can avoid reloading them in
-	 * __vmx_load_host_state(), in case userspace uses the null selectors
-	 * too (the expected case).
+	 * vmx_prepare_switch_to_host(), in case userspace uses
+	 * the null selectors too (the expected case).
 	 */
 	 */
 	vmcs_write16(HOST_DS_SELECTOR, 0);
 	vmcs_write16(HOST_DS_SELECTOR, 0);
 	vmcs_write16(HOST_ES_SELECTOR, 0);
 	vmcs_write16(HOST_ES_SELECTOR, 0);
@@ -6241,8 +6416,6 @@ static void vmx_compute_secondary_exec_control(struct vcpu_vmx *vmx)
 	if (!enable_ept) {
 	if (!enable_ept) {
 		exec_control &= ~SECONDARY_EXEC_ENABLE_EPT;
 		exec_control &= ~SECONDARY_EXEC_ENABLE_EPT;
 		enable_unrestricted_guest = 0;
 		enable_unrestricted_guest = 0;
-		/* Enable INVPCID for non-ept guests may cause performance regression. */
-		exec_control &= ~SECONDARY_EXEC_ENABLE_INVPCID;
 	}
 	}
 	if (!enable_unrestricted_guest)
 	if (!enable_unrestricted_guest)
 		exec_control &= ~SECONDARY_EXEC_UNRESTRICTED_GUEST;
 		exec_control &= ~SECONDARY_EXEC_UNRESTRICTED_GUEST;
@@ -6371,9 +6544,6 @@ static void ept_set_mmio_spte_mask(void)
  */
  */
 static void vmx_vcpu_setup(struct vcpu_vmx *vmx)
 static void vmx_vcpu_setup(struct vcpu_vmx *vmx)
 {
 {
-#ifdef CONFIG_X86_64
-	unsigned long a;
-#endif
 	int i;
 	int i;
 
 
 	if (enable_shadow_vmcs) {
 	if (enable_shadow_vmcs) {
@@ -6428,15 +6598,8 @@ static void vmx_vcpu_setup(struct vcpu_vmx *vmx)
 	vmcs_write16(HOST_FS_SELECTOR, 0);            /* 22.2.4 */
 	vmcs_write16(HOST_FS_SELECTOR, 0);            /* 22.2.4 */
 	vmcs_write16(HOST_GS_SELECTOR, 0);            /* 22.2.4 */
 	vmcs_write16(HOST_GS_SELECTOR, 0);            /* 22.2.4 */
 	vmx_set_constant_host_state(vmx);
 	vmx_set_constant_host_state(vmx);
-#ifdef CONFIG_X86_64
-	rdmsrl(MSR_FS_BASE, a);
-	vmcs_writel(HOST_FS_BASE, a); /* 22.2.4 */
-	rdmsrl(MSR_GS_BASE, a);
-	vmcs_writel(HOST_GS_BASE, a); /* 22.2.4 */
-#else
 	vmcs_writel(HOST_FS_BASE, 0); /* 22.2.4 */
 	vmcs_writel(HOST_FS_BASE, 0); /* 22.2.4 */
 	vmcs_writel(HOST_GS_BASE, 0); /* 22.2.4 */
 	vmcs_writel(HOST_GS_BASE, 0); /* 22.2.4 */
-#endif
 
 
 	if (cpu_has_vmx_vmfunc())
 	if (cpu_has_vmx_vmfunc())
 		vmcs_write64(VM_FUNCTION_CONTROL, 0);
 		vmcs_write64(VM_FUNCTION_CONTROL, 0);
@@ -7670,6 +7833,7 @@ static void vmx_enable_tdp(void)
 
 
 static __init int hardware_setup(void)
 static __init int hardware_setup(void)
 {
 {
+	unsigned long host_bndcfgs;
 	int r = -ENOMEM, i;
 	int r = -ENOMEM, i;
 
 
 	rdmsrl_safe(MSR_EFER, &host_efer);
 	rdmsrl_safe(MSR_EFER, &host_efer);
@@ -7694,6 +7858,11 @@ static __init int hardware_setup(void)
 	if (boot_cpu_has(X86_FEATURE_NX))
 	if (boot_cpu_has(X86_FEATURE_NX))
 		kvm_enable_efer_bits(EFER_NX);
 		kvm_enable_efer_bits(EFER_NX);
 
 
+	if (boot_cpu_has(X86_FEATURE_MPX)) {
+		rdmsrl(MSR_IA32_BNDCFGS, host_bndcfgs);
+		WARN_ONCE(host_bndcfgs, "KVM: BNDCFGS in host will be lost");
+	}
+
 	if (!cpu_has_vmx_vpid() || !cpu_has_vmx_invvpid() ||
 	if (!cpu_has_vmx_vpid() || !cpu_has_vmx_invvpid() ||
 		!(cpu_has_vmx_invvpid_single() || cpu_has_vmx_invvpid_global()))
 		!(cpu_has_vmx_invvpid_single() || cpu_has_vmx_invvpid_global()))
 		enable_vpid = 0;
 		enable_vpid = 0;
@@ -7730,6 +7899,12 @@ static __init int hardware_setup(void)
 	if (enable_ept && !cpu_has_vmx_ept_2m_page())
 	if (enable_ept && !cpu_has_vmx_ept_2m_page())
 		kvm_disable_largepages();
 		kvm_disable_largepages();
 
 
+#if IS_ENABLED(CONFIG_HYPERV)
+	if (ms_hyperv.nested_features & HV_X64_NESTED_GUEST_MAPPING_FLUSH
+	    && enable_ept)
+		kvm_x86_ops->tlb_remote_flush = vmx_hv_remote_flush_tlb;
+#endif
+
 	if (!cpu_has_vmx_ple()) {
 	if (!cpu_has_vmx_ple()) {
 		ple_gap = 0;
 		ple_gap = 0;
 		ple_window = 0;
 		ple_window = 0;
@@ -7756,6 +7931,11 @@ static __init int hardware_setup(void)
 	else
 	else
 		kvm_disable_tdp();
 		kvm_disable_tdp();
 
 
+	if (!nested) {
+		kvm_x86_ops->get_nested_state = NULL;
+		kvm_x86_ops->set_nested_state = NULL;
+	}
+
 	/*
 	/*
 	 * Only enable PML when hardware supports PML feature, and both EPT
 	 * Only enable PML when hardware supports PML feature, and both EPT
 	 * and EPT A/D bit features are enabled -- PML depends on them to work.
 	 * and EPT A/D bit features are enabled -- PML depends on them to work.
@@ -8032,10 +8212,35 @@ static int nested_vmx_get_vmptr(struct kvm_vcpu *vcpu, gpa_t *vmpointer)
 	return 0;
 	return 0;
 }
 }
 
 
+/*
+ * Allocate a shadow VMCS and associate it with the currently loaded
+ * VMCS, unless such a shadow VMCS already exists. The newly allocated
+ * VMCS is also VMCLEARed, so that it is ready for use.
+ */
+static struct vmcs *alloc_shadow_vmcs(struct kvm_vcpu *vcpu)
+{
+	struct vcpu_vmx *vmx = to_vmx(vcpu);
+	struct loaded_vmcs *loaded_vmcs = vmx->loaded_vmcs;
+
+	/*
+	 * We should allocate a shadow vmcs for vmcs01 only when L1
+	 * executes VMXON and free it when L1 executes VMXOFF.
+	 * As it is invalid to execute VMXON twice, we shouldn't reach
+	 * here when vmcs01 already has an allocated shadow vmcs.
+	 */
+	WARN_ON(loaded_vmcs == &vmx->vmcs01 && loaded_vmcs->shadow_vmcs);
+
+	if (!loaded_vmcs->shadow_vmcs) {
+		loaded_vmcs->shadow_vmcs = alloc_vmcs(true);
+		if (loaded_vmcs->shadow_vmcs)
+			vmcs_clear(loaded_vmcs->shadow_vmcs);
+	}
+	return loaded_vmcs->shadow_vmcs;
+}
+
 static int enter_vmx_operation(struct kvm_vcpu *vcpu)
 static int enter_vmx_operation(struct kvm_vcpu *vcpu)
 {
 {
 	struct vcpu_vmx *vmx = to_vmx(vcpu);
 	struct vcpu_vmx *vmx = to_vmx(vcpu);
-	struct vmcs *shadow_vmcs;
 	int r;
 	int r;
 
 
 	r = alloc_loaded_vmcs(&vmx->nested.vmcs02);
 	r = alloc_loaded_vmcs(&vmx->nested.vmcs02);
@@ -8046,16 +8251,12 @@ static int enter_vmx_operation(struct kvm_vcpu *vcpu)
 	if (!vmx->nested.cached_vmcs12)
 	if (!vmx->nested.cached_vmcs12)
 		goto out_cached_vmcs12;
 		goto out_cached_vmcs12;
 
 
-	if (enable_shadow_vmcs) {
-		shadow_vmcs = alloc_vmcs();
-		if (!shadow_vmcs)
-			goto out_shadow_vmcs;
-		/* mark vmcs as shadow */
-		shadow_vmcs->revision_id |= (1u << 31);
-		/* init shadow vmcs */
-		vmcs_clear(shadow_vmcs);
-		vmx->vmcs01.shadow_vmcs = shadow_vmcs;
-	}
+	vmx->nested.cached_shadow_vmcs12 = kmalloc(VMCS12_SIZE, GFP_KERNEL);
+	if (!vmx->nested.cached_shadow_vmcs12)
+		goto out_cached_shadow_vmcs12;
+
+	if (enable_shadow_vmcs && !alloc_shadow_vmcs(vcpu))
+		goto out_shadow_vmcs;
 
 
 	hrtimer_init(&vmx->nested.preemption_timer, CLOCK_MONOTONIC,
 	hrtimer_init(&vmx->nested.preemption_timer, CLOCK_MONOTONIC,
 		     HRTIMER_MODE_REL_PINNED);
 		     HRTIMER_MODE_REL_PINNED);
@@ -8067,6 +8268,9 @@ static int enter_vmx_operation(struct kvm_vcpu *vcpu)
 	return 0;
 	return 0;
 
 
 out_shadow_vmcs:
 out_shadow_vmcs:
+	kfree(vmx->nested.cached_shadow_vmcs12);
+
+out_cached_shadow_vmcs12:
 	kfree(vmx->nested.cached_vmcs12);
 	kfree(vmx->nested.cached_vmcs12);
 
 
 out_cached_vmcs12:
 out_cached_vmcs12:
@@ -8109,7 +8313,7 @@ static int handle_vmon(struct kvm_vcpu *vcpu)
 
 
 	/* CPL=0 must be checked manually. */
 	/* CPL=0 must be checked manually. */
 	if (vmx_get_cpl(vcpu)) {
 	if (vmx_get_cpl(vcpu)) {
-		kvm_queue_exception(vcpu, UD_VECTOR);
+		kvm_inject_gp(vcpu, 0);
 		return 1;
 		return 1;
 	}
 	}
 
 
@@ -8172,15 +8376,16 @@ static int handle_vmon(struct kvm_vcpu *vcpu)
  */
  */
 static int nested_vmx_check_permission(struct kvm_vcpu *vcpu)
 static int nested_vmx_check_permission(struct kvm_vcpu *vcpu)
 {
 {
-	if (vmx_get_cpl(vcpu)) {
+	if (!to_vmx(vcpu)->nested.vmxon) {
 		kvm_queue_exception(vcpu, UD_VECTOR);
 		kvm_queue_exception(vcpu, UD_VECTOR);
 		return 0;
 		return 0;
 	}
 	}
 
 
-	if (!to_vmx(vcpu)->nested.vmxon) {
-		kvm_queue_exception(vcpu, UD_VECTOR);
+	if (vmx_get_cpl(vcpu)) {
+		kvm_inject_gp(vcpu, 0);
 		return 0;
 		return 0;
 	}
 	}
+
 	return 1;
 	return 1;
 }
 }
 
 
@@ -8233,6 +8438,7 @@ static void free_nested(struct vcpu_vmx *vmx)
 		vmx->vmcs01.shadow_vmcs = NULL;
 		vmx->vmcs01.shadow_vmcs = NULL;
 	}
 	}
 	kfree(vmx->nested.cached_vmcs12);
 	kfree(vmx->nested.cached_vmcs12);
+	kfree(vmx->nested.cached_shadow_vmcs12);
 	/* Unpin physical memory we referred to in the vmcs02 */
 	/* Unpin physical memory we referred to in the vmcs02 */
 	if (vmx->nested.apic_access_page) {
 	if (vmx->nested.apic_access_page) {
 		kvm_release_page_dirty(vmx->nested.apic_access_page);
 		kvm_release_page_dirty(vmx->nested.apic_access_page);
@@ -8318,7 +8524,7 @@ static int handle_vmresume(struct kvm_vcpu *vcpu)
  * some of the bits we return here (e.g., on 32-bit guests, only 32 bits of
  * some of the bits we return here (e.g., on 32-bit guests, only 32 bits of
  * 64-bit fields are to be returned).
  * 64-bit fields are to be returned).
  */
  */
-static inline int vmcs12_read_any(struct kvm_vcpu *vcpu,
+static inline int vmcs12_read_any(struct vmcs12 *vmcs12,
 				  unsigned long field, u64 *ret)
 				  unsigned long field, u64 *ret)
 {
 {
 	short offset = vmcs_field_to_offset(field);
 	short offset = vmcs_field_to_offset(field);
@@ -8327,7 +8533,7 @@ static inline int vmcs12_read_any(struct kvm_vcpu *vcpu,
 	if (offset < 0)
 	if (offset < 0)
 		return offset;
 		return offset;
 
 
-	p = ((char *)(get_vmcs12(vcpu))) + offset;
+	p = (char *)vmcs12 + offset;
 
 
 	switch (vmcs_field_width(field)) {
 	switch (vmcs_field_width(field)) {
 	case VMCS_FIELD_WIDTH_NATURAL_WIDTH:
 	case VMCS_FIELD_WIDTH_NATURAL_WIDTH:
@@ -8349,10 +8555,10 @@ static inline int vmcs12_read_any(struct kvm_vcpu *vcpu,
 }
 }
 
 
 
 
-static inline int vmcs12_write_any(struct kvm_vcpu *vcpu,
+static inline int vmcs12_write_any(struct vmcs12 *vmcs12,
 				   unsigned long field, u64 field_value){
 				   unsigned long field, u64 field_value){
 	short offset = vmcs_field_to_offset(field);
 	short offset = vmcs_field_to_offset(field);
-	char *p = ((char *) get_vmcs12(vcpu)) + offset;
+	char *p = (char *)vmcs12 + offset;
 	if (offset < 0)
 	if (offset < 0)
 		return offset;
 		return offset;
 
 
@@ -8405,7 +8611,7 @@ static void copy_shadow_to_vmcs12(struct vcpu_vmx *vmx)
 		for (i = 0; i < max_fields[q]; i++) {
 		for (i = 0; i < max_fields[q]; i++) {
 			field = fields[q][i];
 			field = fields[q][i];
 			field_value = __vmcs_readl(field);
 			field_value = __vmcs_readl(field);
-			vmcs12_write_any(&vmx->vcpu, field, field_value);
+			vmcs12_write_any(get_vmcs12(&vmx->vcpu), field, field_value);
 		}
 		}
 		/*
 		/*
 		 * Skip the VM-exit information fields if they are read-only.
 		 * Skip the VM-exit information fields if they are read-only.
@@ -8440,7 +8646,7 @@ static void copy_vmcs12_to_shadow(struct vcpu_vmx *vmx)
 	for (q = 0; q < ARRAY_SIZE(fields); q++) {
 	for (q = 0; q < ARRAY_SIZE(fields); q++) {
 		for (i = 0; i < max_fields[q]; i++) {
 		for (i = 0; i < max_fields[q]; i++) {
 			field = fields[q][i];
 			field = fields[q][i];
-			vmcs12_read_any(&vmx->vcpu, field, &field_value);
+			vmcs12_read_any(get_vmcs12(&vmx->vcpu), field, &field_value);
 			__vmcs_writel(field, field_value);
 			__vmcs_writel(field, field_value);
 		}
 		}
 	}
 	}
@@ -8470,6 +8676,7 @@ static int handle_vmread(struct kvm_vcpu *vcpu)
 	unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
 	unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
 	u32 vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO);
 	u32 vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO);
 	gva_t gva = 0;
 	gva_t gva = 0;
+	struct vmcs12 *vmcs12;
 
 
 	if (!nested_vmx_check_permission(vcpu))
 	if (!nested_vmx_check_permission(vcpu))
 		return 1;
 		return 1;
@@ -8477,10 +8684,24 @@ static int handle_vmread(struct kvm_vcpu *vcpu)
 	if (!nested_vmx_check_vmcs12(vcpu))
 	if (!nested_vmx_check_vmcs12(vcpu))
 		return kvm_skip_emulated_instruction(vcpu);
 		return kvm_skip_emulated_instruction(vcpu);
 
 
+	if (!is_guest_mode(vcpu))
+		vmcs12 = get_vmcs12(vcpu);
+	else {
+		/*
+		 * When vmcs->vmcs_link_pointer is -1ull, any VMREAD
+		 * to a shadowed field sets the ALU flags for VMfailInvalid.
+		 */
+		if (get_vmcs12(vcpu)->vmcs_link_pointer == -1ull) {
+			nested_vmx_failInvalid(vcpu);
+			return kvm_skip_emulated_instruction(vcpu);
+		}
+		vmcs12 = get_shadow_vmcs12(vcpu);
+	}
+
 	/* Decode instruction info and find the field to read */
 	/* Decode instruction info and find the field to read */
 	field = kvm_register_readl(vcpu, (((vmx_instruction_info) >> 28) & 0xf));
 	field = kvm_register_readl(vcpu, (((vmx_instruction_info) >> 28) & 0xf));
 	/* Read the field, zero-extended to a u64 field_value */
 	/* Read the field, zero-extended to a u64 field_value */
-	if (vmcs12_read_any(vcpu, field, &field_value) < 0) {
+	if (vmcs12_read_any(vmcs12, field, &field_value) < 0) {
 		nested_vmx_failValid(vcpu, VMXERR_UNSUPPORTED_VMCS_COMPONENT);
 		nested_vmx_failValid(vcpu, VMXERR_UNSUPPORTED_VMCS_COMPONENT);
 		return kvm_skip_emulated_instruction(vcpu);
 		return kvm_skip_emulated_instruction(vcpu);
 	}
 	}
@@ -8522,6 +8743,7 @@ static int handle_vmwrite(struct kvm_vcpu *vcpu)
 	 */
 	 */
 	u64 field_value = 0;
 	u64 field_value = 0;
 	struct x86_exception e;
 	struct x86_exception e;
+	struct vmcs12 *vmcs12;
 
 
 	if (!nested_vmx_check_permission(vcpu))
 	if (!nested_vmx_check_permission(vcpu))
 		return 1;
 		return 1;
@@ -8556,23 +8778,44 @@ static int handle_vmwrite(struct kvm_vcpu *vcpu)
 		return kvm_skip_emulated_instruction(vcpu);
 		return kvm_skip_emulated_instruction(vcpu);
 	}
 	}
 
 
-	if (vmcs12_write_any(vcpu, field, field_value) < 0) {
+	if (!is_guest_mode(vcpu))
+		vmcs12 = get_vmcs12(vcpu);
+	else {
+		/*
+		 * When vmcs->vmcs_link_pointer is -1ull, any VMWRITE
+		 * to a shadowed field sets the ALU flags for VMfailInvalid.
+		 */
+		if (get_vmcs12(vcpu)->vmcs_link_pointer == -1ull) {
+			nested_vmx_failInvalid(vcpu);
+			return kvm_skip_emulated_instruction(vcpu);
+		}
+		vmcs12 = get_shadow_vmcs12(vcpu);
+
+	}
+
+	if (vmcs12_write_any(vmcs12, field, field_value) < 0) {
 		nested_vmx_failValid(vcpu, VMXERR_UNSUPPORTED_VMCS_COMPONENT);
 		nested_vmx_failValid(vcpu, VMXERR_UNSUPPORTED_VMCS_COMPONENT);
 		return kvm_skip_emulated_instruction(vcpu);
 		return kvm_skip_emulated_instruction(vcpu);
 	}
 	}
 
 
-	switch (field) {
+	/*
+	 * Do not track vmcs12 dirty-state if in guest-mode
+	 * as we actually dirty shadow vmcs12 instead of vmcs12.
+	 */
+	if (!is_guest_mode(vcpu)) {
+		switch (field) {
 #define SHADOW_FIELD_RW(x) case x:
 #define SHADOW_FIELD_RW(x) case x:
 #include "vmx_shadow_fields.h"
 #include "vmx_shadow_fields.h"
-		/*
-		 * The fields that can be updated by L1 without a vmexit are
-		 * always updated in the vmcs02, the others go down the slow
-		 * path of prepare_vmcs02.
-		 */
-		break;
-	default:
-		vmx->nested.dirty_vmcs12 = true;
-		break;
+			/*
+			 * The fields that can be updated by L1 without a vmexit are
+			 * always updated in the vmcs02, the others go down the slow
+			 * path of prepare_vmcs02.
+			 */
+			break;
+		default:
+			vmx->nested.dirty_vmcs12 = true;
+			break;
+		}
 	}
 
 	nested_vmx_succeed(vcpu);
@@ -8623,7 +8866,9 @@ static int handle_vmptrld(struct kvm_vcpu *vcpu)
 			return kvm_skip_emulated_instruction(vcpu);
 		}
 		new_vmcs12 = kmap(page);
-		if (new_vmcs12->revision_id != VMCS12_REVISION) {
+		if (new_vmcs12->hdr.revision_id != VMCS12_REVISION ||
+		    (new_vmcs12->hdr.shadow_vmcs &&
+		     !nested_cpu_has_vmx_shadow_vmcs(vcpu))) {
 			kunmap(page);
 			kvm_release_page_clean(page);
 			nested_vmx_failValid(vcpu,
@@ -8821,6 +9066,105 @@ static int handle_invvpid(struct kvm_vcpu *vcpu)
 	return kvm_skip_emulated_instruction(vcpu);
 }
 
+static int handle_invpcid(struct kvm_vcpu *vcpu)
+{
+	u32 vmx_instruction_info;
+	unsigned long type;
+	bool pcid_enabled;
+	gva_t gva;
+	struct x86_exception e;
+	unsigned i;
+	unsigned long roots_to_free = 0;
+	struct {
+		u64 pcid;
+		u64 gla;
+	} operand;
+
+	if (!guest_cpuid_has(vcpu, X86_FEATURE_INVPCID)) {
+		kvm_queue_exception(vcpu, UD_VECTOR);
+		return 1;
+	}
+
+	vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO);
+	type = kvm_register_readl(vcpu, (vmx_instruction_info >> 28) & 0xf);
+
+	if (type > 3) {
+		kvm_inject_gp(vcpu, 0);
+		return 1;
+	}
+
+	/* According to the Intel instruction reference, the memory operand
+	 * is read even if it isn't needed (e.g., for type==all)
+	 */
+	if (get_vmx_mem_address(vcpu, vmcs_readl(EXIT_QUALIFICATION),
+				vmx_instruction_info, false, &gva))
+		return 1;
+
+	if (kvm_read_guest_virt(vcpu, gva, &operand, sizeof(operand), &e)) {
+		kvm_inject_page_fault(vcpu, &e);
+		return 1;
+	}
+
+	if (operand.pcid >> 12 != 0) {
+		kvm_inject_gp(vcpu, 0);
+		return 1;
+	}
+
+	pcid_enabled = kvm_read_cr4_bits(vcpu, X86_CR4_PCIDE);
+
+	switch (type) {
+	case INVPCID_TYPE_INDIV_ADDR:
+		if ((!pcid_enabled && (operand.pcid != 0)) ||
+		    is_noncanonical_address(operand.gla, vcpu)) {
+			kvm_inject_gp(vcpu, 0);
+			return 1;
+		}
+		kvm_mmu_invpcid_gva(vcpu, operand.gla, operand.pcid);
+		return kvm_skip_emulated_instruction(vcpu);
+
+	case INVPCID_TYPE_SINGLE_CTXT:
+		if (!pcid_enabled && (operand.pcid != 0)) {
+			kvm_inject_gp(vcpu, 0);
+			return 1;
+		}
+
+		if (kvm_get_active_pcid(vcpu) == operand.pcid) {
+			kvm_mmu_sync_roots(vcpu);
+			kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
+		}
+
+		for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++)
+			if (kvm_get_pcid(vcpu, vcpu->arch.mmu.prev_roots[i].cr3)
+			    == operand.pcid)
+				roots_to_free |= KVM_MMU_ROOT_PREVIOUS(i);
+
+		kvm_mmu_free_roots(vcpu, roots_to_free);
+		/*
+		 * If neither the current cr3 nor any of the prev_roots use the
+		 * given PCID, then nothing needs to be done here because a
+		 * resync will happen anyway before switching to any other CR3.
+		 */
+
+		return kvm_skip_emulated_instruction(vcpu);
+
+	case INVPCID_TYPE_ALL_NON_GLOBAL:
+		/*
+		 * Currently, KVM doesn't mark global entries in the shadow
+		 * page tables, so a non-global flush just degenerates to a
+		 * global flush. If needed, we could optimize this later by
+		 * keeping track of global entries in shadow page tables.
+		 */
+
+		/* fall-through */
+	case INVPCID_TYPE_ALL_INCL_GLOBAL:
+		kvm_mmu_unload(vcpu);
+		return kvm_skip_emulated_instruction(vcpu);
+
+	default:
+		BUG(); /* We have already checked above that type <= 3 */
+	}
+}
+
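An aside on what the new handle_invpcid() above has to emulate (editor's note, not part of the diff): the guest passes the INVPCID type in a register and a 16-byte descriptor in memory, which is exactly the struct { u64 pcid; u64 gla; } operand the handler copies in with kvm_read_guest_virt(). A minimal guest-side sketch, modelled on the kernel's own __invpcid() helper; the wrapper name and comments are illustrative, not taken from this commit:

	struct invpcid_desc {
		u64 pcid;	/* bits 63:12 must be zero or the handler injects #GP */
		u64 gla;	/* linear address, consulted only for type 0 */
	};

	/* type 0..3 corresponds to the INVPCID_TYPE_* cases handled above */
	static inline void guest_invpcid(unsigned long type, struct invpcid_desc *desc)
	{
		asm volatile("invpcid %0, %1"
			     : : "m" (*desc), "r" (type) : "memory");
	}

With EXIT_REASON_INVPCID wired into kvm_vmx_exit_handlers below, such an instruction in a guest traps to L0 and is emulated using the PCID-aware root handling (kvm_mmu_invpcid_gva(), prev_roots) added in this series.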
 static int handle_pml_full(struct kvm_vcpu *vcpu)
 {
 	unsigned long exit_qualification;
@@ -9024,6 +9368,7 @@ static int (*const kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = {
 	[EXIT_REASON_XSAVES]                  = handle_xsaves,
 	[EXIT_REASON_XRSTORS]                 = handle_xrstors,
 	[EXIT_REASON_PML_FULL]		      = handle_pml_full,
+	[EXIT_REASON_INVPCID]                 = handle_invpcid,
 	[EXIT_REASON_VMFUNC]                  = handle_vmfunc,
 	[EXIT_REASON_PREEMPTION_TIMER]	      = handle_preemption_timer,
 };
@@ -9196,6 +9541,30 @@ static bool nested_vmx_exit_handled_cr(struct kvm_vcpu *vcpu,
 	return false;
 }
 
+static bool nested_vmx_exit_handled_vmcs_access(struct kvm_vcpu *vcpu,
+	struct vmcs12 *vmcs12, gpa_t bitmap)
+{
+	u32 vmx_instruction_info;
+	unsigned long field;
+	u8 b;
+
+	if (!nested_cpu_has_shadow_vmcs(vmcs12))
+		return true;
+
+	/* Decode instruction info and find the field to access */
+	vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO);
+	field = kvm_register_read(vcpu, (((vmx_instruction_info) >> 28) & 0xf));
+
+	/* Out-of-range fields always cause a VM exit from L2 to L1 */
+	if (field >> 15)
+		return true;
+
+	if (kvm_vcpu_read_guest(vcpu, bitmap + field/8, &b, 1))
+		return true;
+
+	return 1 & (b >> (field & 7));
+}
+
 /*
  * Return 1 if we should exit from L2 to L1 to handle an exit, or 0 if we
  * should handle it ourselves in L0 (and then continue L2). Only call this
@@ -9280,10 +9649,15 @@ static bool nested_vmx_exit_reflected(struct kvm_vcpu *vcpu, u32 exit_reason)
 		return nested_cpu_has2(vmcs12, SECONDARY_EXEC_RDSEED_EXITING);
 	case EXIT_REASON_RDTSC: case EXIT_REASON_RDTSCP:
 		return nested_cpu_has(vmcs12, CPU_BASED_RDTSC_EXITING);
+	case EXIT_REASON_VMREAD:
+		return nested_vmx_exit_handled_vmcs_access(vcpu, vmcs12,
+			vmcs12->vmread_bitmap);
+	case EXIT_REASON_VMWRITE:
+		return nested_vmx_exit_handled_vmcs_access(vcpu, vmcs12,
+			vmcs12->vmwrite_bitmap);
 	case EXIT_REASON_VMCALL: case EXIT_REASON_VMCLEAR:
 	case EXIT_REASON_VMLAUNCH: case EXIT_REASON_VMPTRLD:
-	case EXIT_REASON_VMPTRST: case EXIT_REASON_VMREAD:
-	case EXIT_REASON_VMRESUME: case EXIT_REASON_VMWRITE:
+	case EXIT_REASON_VMPTRST: case EXIT_REASON_VMRESUME:
 	case EXIT_REASON_VMOFF: case EXIT_REASON_VMON:
 	case EXIT_REASON_INVEPT: case EXIT_REASON_INVVPID:
 		/*
@@ -10244,15 +10618,15 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
 		vmcs_writel(GUEST_RIP, vcpu->arch.regs[VCPU_REGS_RIP]);
 
 	cr3 = __get_current_cr3_fast();
-	if (unlikely(cr3 != vmx->loaded_vmcs->vmcs_host_cr3)) {
+	if (unlikely(cr3 != vmx->loaded_vmcs->host_state.cr3)) {
 		vmcs_writel(HOST_CR3, cr3);
-		vmx->loaded_vmcs->vmcs_host_cr3 = cr3;
+		vmx->loaded_vmcs->host_state.cr3 = cr3;
 	}
 
 	cr4 = cr4_read_shadow();
-	if (unlikely(cr4 != vmx->loaded_vmcs->vmcs_host_cr4)) {
+	if (unlikely(cr4 != vmx->loaded_vmcs->host_state.cr4)) {
 		vmcs_writel(HOST_CR4, cr4);
-		vmx->loaded_vmcs->vmcs_host_cr4 = cr4;
+		vmx->loaded_vmcs->host_state.cr4 = cr4;
 	}
 
 	/* When single-stepping over STI and MOV SS, we must clear the
@@ -10448,9 +10822,9 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
 	 * The sysexit path does not restore ds/es, so we must set them to
 	 * a reasonable value ourselves.
 	 *
-	 * We can't defer this to vmx_load_host_state() since that function
-	 * may be executed in interrupt context, which saves and restore segments
-	 * around it, nullifying its effect.
+	 * We can't defer this to vmx_prepare_switch_to_host() since that
+	 * function may be executed in interrupt context, which saves and
+	 * restore segments around it, nullifying its effect.
 	 */
 	loadsegment(ds, __USER_DS);
 	loadsegment(es, __USER_DS);
@@ -10511,8 +10885,8 @@ static void vmx_switch_vmcs(struct kvm_vcpu *vcpu, struct loaded_vmcs *vmcs)
 		return;
 
 	cpu = get_cpu();
-	vmx->loaded_vmcs = vmcs;
 	vmx_vcpu_put(vcpu);
+	vmx->loaded_vmcs = vmcs;
 	vmx_vcpu_load(vcpu, cpu);
 	put_cpu();
 }
@@ -10652,6 +11026,8 @@ free_vcpu:
 
 static int vmx_vm_init(struct kvm *kvm)
 {
+	spin_lock_init(&to_kvm_vmx(kvm)->ept_pointer_lock);
+
 	if (!ple_gap)
 		kvm->arch.pause_in_guest = true;
 
@@ -10876,11 +11252,11 @@ static int nested_ept_init_mmu_context(struct kvm_vcpu *vcpu)
 	if (!valid_ept_address(vcpu, nested_ept_get_cr3(vcpu)))
 		return 1;
 
-	kvm_mmu_unload(vcpu);
 	kvm_init_shadow_ept_mmu(vcpu,
 			to_vmx(vcpu)->nested.msrs.ept_caps &
 			VMX_EPT_EXECUTE_ONLY_BIT,
-			nested_ept_ad_enabled(vcpu));
+			nested_ept_ad_enabled(vcpu),
+			nested_ept_get_cr3(vcpu));
 	vcpu->arch.mmu.set_cr3           = vmx_set_cr3;
 	vcpu->arch.mmu.get_cr3           = nested_ept_get_cr3;
 	vcpu->arch.mmu.inject_page_fault = nested_ept_inject_page_fault;
@@ -10928,9 +11304,9 @@ static void vmx_inject_page_fault_nested(struct kvm_vcpu *vcpu,
 static inline bool nested_vmx_prepare_msr_bitmap(struct kvm_vcpu *vcpu,
 						 struct vmcs12 *vmcs12);
 
-static void nested_get_vmcs12_pages(struct kvm_vcpu *vcpu,
-					struct vmcs12 *vmcs12)
+static void nested_get_vmcs12_pages(struct kvm_vcpu *vcpu)
 {
+	struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
 	struct vcpu_vmx *vmx = to_vmx(vcpu);
 	struct page *page;
 	u64 hpa;
@@ -11171,6 +11547,38 @@ static inline bool nested_vmx_prepare_msr_bitmap(struct kvm_vcpu *vcpu,
 	return true;
 }
 
+static void nested_cache_shadow_vmcs12(struct kvm_vcpu *vcpu,
+				       struct vmcs12 *vmcs12)
+{
+	struct vmcs12 *shadow;
+	struct page *page;
+
+	if (!nested_cpu_has_shadow_vmcs(vmcs12) ||
+	    vmcs12->vmcs_link_pointer == -1ull)
+		return;
+
+	shadow = get_shadow_vmcs12(vcpu);
+	page = kvm_vcpu_gpa_to_page(vcpu, vmcs12->vmcs_link_pointer);
+
+	memcpy(shadow, kmap(page), VMCS12_SIZE);
+
+	kunmap(page);
+	kvm_release_page_clean(page);
+}
+
+static void nested_flush_cached_shadow_vmcs12(struct kvm_vcpu *vcpu,
+					      struct vmcs12 *vmcs12)
+{
+	struct vcpu_vmx *vmx = to_vmx(vcpu);
+
+	if (!nested_cpu_has_shadow_vmcs(vmcs12) ||
+	    vmcs12->vmcs_link_pointer == -1ull)
+		return;
+
+	kvm_write_guest(vmx->vcpu.kvm, vmcs12->vmcs_link_pointer,
+			get_shadow_vmcs12(vcpu), VMCS12_SIZE);
+}
+
 static int nested_vmx_check_apic_access_controls(struct kvm_vcpu *vcpu,
 					  struct vmcs12 *vmcs12)
 {
@@ -11228,11 +11636,12 @@ static int nested_vmx_check_msr_switch(struct kvm_vcpu *vcpu,
 				       unsigned long count_field,
 				       unsigned long addr_field)
 {
+	struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
 	int maxphyaddr;
 	u64 count, addr;
 
-	if (vmcs12_read_any(vcpu, count_field, &count) ||
-	    vmcs12_read_any(vcpu, addr_field, &addr)) {
+	if (vmcs12_read_any(vmcs12, count_field, &count) ||
+	    vmcs12_read_any(vmcs12, addr_field, &addr)) {
 		WARN_ON(1);
 		return -EINVAL;
 	}
@@ -11282,6 +11691,19 @@ static int nested_vmx_check_pml_controls(struct kvm_vcpu *vcpu,
 	return 0;
 }
 
+static int nested_vmx_check_shadow_vmcs_controls(struct kvm_vcpu *vcpu,
+						 struct vmcs12 *vmcs12)
+{
+	if (!nested_cpu_has_shadow_vmcs(vmcs12))
+		return 0;
+
+	if (!page_address_valid(vcpu, vmcs12->vmread_bitmap) ||
+	    !page_address_valid(vcpu, vmcs12->vmwrite_bitmap))
+		return -EINVAL;
+
+	return 0;
+}
+
 static int nested_vmx_msr_check_common(struct kvm_vcpu *vcpu,
 				       struct vmx_msr_entry *e)
 {
@@ -11431,12 +11853,16 @@ static int nested_vmx_load_cr3(struct kvm_vcpu *vcpu, unsigned long cr3, bool ne
 				return 1;
 			}
 		}
-
-		vcpu->arch.cr3 = cr3;
-		__set_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail);
 	}
 
-	kvm_mmu_reset_context(vcpu);
+	if (!nested_ept)
+		kvm_mmu_new_cr3(vcpu, cr3, false);
+
+	vcpu->arch.cr3 = cr3;
+	__set_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail);
+
+	kvm_init_mmu(vcpu, false);
+
 	return 0;
 }
 
@@ -11523,7 +11949,8 @@ static void prepare_vmcs02_full(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
 	 * Set host-state according to L0's settings (vmcs12 is irrelevant here)
 	 * Some constant fields are set here by vmx_set_constant_host_state().
 	 * Other fields are different per CPU, and will be set later when
-	 * vmx_vcpu_load() is called, and when vmx_save_host_state() is called.
+	 * vmx_vcpu_load() is called, and when vmx_prepare_switch_to_guest()
+	 * is called.
 	 */
 	vmx_set_constant_host_state(vmx);
 
@@ -11595,11 +12022,6 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
 	vmcs_writel(GUEST_ES_BASE, vmcs12->guest_es_base);
 	vmcs_writel(GUEST_CS_BASE, vmcs12->guest_cs_base);
 
-	/*
-	 * Not in vmcs02: GUEST_PML_INDEX, HOST_FS_SELECTOR, HOST_GS_SELECTOR,
-	 * HOST_FS_BASE, HOST_GS_BASE.
-	 */
-
 	if (vmx->nested.nested_run_pending &&
 	    (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS)) {
 		kvm_set_dr(vcpu, 7, vmcs12->guest_dr7);
@@ -11664,6 +12086,9 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
 			exec_control |= vmcs12_exec_ctrl;
 		}
 
+		/* VMCS shadowing for L2 is emulated for now */
+		exec_control &= ~SECONDARY_EXEC_SHADOW_VMCS;
+
 		if (exec_control & SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY)
 			vmcs_write16(GUEST_INTR_STATUS,
 				vmcs12->guest_intr_status);
@@ -11883,6 +12308,9 @@ static int check_vmentry_prereqs(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
 	if (nested_vmx_check_pml_controls(vcpu, vmcs12))
 		return VMXERR_ENTRY_INVALID_CONTROL_FIELD;
 
+	if (nested_vmx_check_shadow_vmcs_controls(vcpu, vmcs12))
+		return VMXERR_ENTRY_INVALID_CONTROL_FIELD;
+
 	if (!vmx_control_verify(vmcs12->cpu_based_vm_exec_control,
 				vmx->nested.msrs.procbased_ctls_low,
 				vmx->nested.msrs.procbased_ctls_high) ||
@@ -11983,6 +12411,33 @@ static int check_vmentry_prereqs(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
 	return 0;
 }
 
+static int nested_vmx_check_vmcs_link_ptr(struct kvm_vcpu *vcpu,
+					  struct vmcs12 *vmcs12)
+{
+	int r;
+	struct page *page;
+	struct vmcs12 *shadow;
+
+	if (vmcs12->vmcs_link_pointer == -1ull)
+		return 0;
+
+	if (!page_address_valid(vcpu, vmcs12->vmcs_link_pointer))
+		return -EINVAL;
+
+	page = kvm_vcpu_gpa_to_page(vcpu, vmcs12->vmcs_link_pointer);
+	if (is_error_page(page))
+		return -EINVAL;
+
+	r = 0;
+	shadow = kmap(page);
+	if (shadow->hdr.revision_id != VMCS12_REVISION ||
+	    shadow->hdr.shadow_vmcs != nested_cpu_has_shadow_vmcs(vmcs12))
+		r = -EINVAL;
+	kunmap(page);
+	kvm_release_page_clean(page);
+	return r;
+}
+
 static int check_vmentry_postreqs(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
 				  u32 *exit_qual)
 {
@@ -11994,8 +12449,7 @@ static int check_vmentry_postreqs(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
 	    !nested_guest_cr4_valid(vcpu, vmcs12->guest_cr4))
 		return 1;
 
-	if (!nested_cpu_has2(vmcs12, SECONDARY_EXEC_SHADOW_VMCS) &&
-	    vmcs12->vmcs_link_pointer != -1ull) {
+	if (nested_vmx_check_vmcs_link_ptr(vcpu, vmcs12)) {
 		*exit_qual = ENTRY_FAIL_VMCS_LINK_PTR;
 		return 1;
 	}
@@ -12042,12 +12496,17 @@ static int check_vmentry_postreqs(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
 	return 0;
 }
 
-static int enter_vmx_non_root_mode(struct kvm_vcpu *vcpu)
+/*
+ * If exit_qual is NULL, this is being called from state restore (either RSM
+ * or KVM_SET_NESTED_STATE).  Otherwise it's called from vmlaunch/vmresume.
+ */
+static int enter_vmx_non_root_mode(struct kvm_vcpu *vcpu, u32 *exit_qual)
 {
 	struct vcpu_vmx *vmx = to_vmx(vcpu);
 	struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
-	u32 exit_qual;
-	int r;
+	bool from_vmentry = !!exit_qual;
+	u32 dummy_exit_qual;
+	int r = 0;
 
 	enter_guest_mode(vcpu);
 
@@ -12061,17 +12520,28 @@ static int enter_vmx_non_root_mode(struct kvm_vcpu *vcpu)
 		vcpu->arch.tsc_offset += vmcs12->tsc_offset;
 
 	r = EXIT_REASON_INVALID_STATE;
-	if (prepare_vmcs02(vcpu, vmcs12, &exit_qual))
+	if (prepare_vmcs02(vcpu, vmcs12, from_vmentry ? exit_qual : &dummy_exit_qual))
 		goto fail;
 
-	nested_get_vmcs12_pages(vcpu, vmcs12);
+	if (from_vmentry) {
+		nested_get_vmcs12_pages(vcpu);
 
-	r = EXIT_REASON_MSR_LOAD_FAIL;
-	exit_qual = nested_vmx_load_msr(vcpu,
-					vmcs12->vm_entry_msr_load_addr,
-					vmcs12->vm_entry_msr_load_count);
-	if (exit_qual)
-		goto fail;
+		r = EXIT_REASON_MSR_LOAD_FAIL;
+		*exit_qual = nested_vmx_load_msr(vcpu,
+	     					 vmcs12->vm_entry_msr_load_addr,
+					      	 vmcs12->vm_entry_msr_load_count);
+		if (*exit_qual)
+			goto fail;
+	} else {
+		/*
+		 * The MMU is not initialized to point at the right entities yet and
+		 * "get pages" would need to read data from the guest (i.e. we will
+		 * need to perform gpa to hpa translation). Request a call
+		 * to nested_get_vmcs12_pages before the next VM-entry.  The MSRs
+		 * have already been set at vmentry time and should not be reset.
+		 */
+		kvm_make_request(KVM_REQ_GET_VMCS12_PAGES, vcpu);
+	}
 
 
 	/*
 	 * Note no nested_vmx_succeed or nested_vmx_fail here. At this point
@@ -12086,8 +12556,7 @@ fail:
 		vcpu->arch.tsc_offset -= vmcs12->tsc_offset;
 	leave_guest_mode(vcpu);
 	vmx_switch_vmcs(vcpu, &vmx->vmcs01);
-	nested_vmx_entry_failure(vcpu, vmcs12, r, exit_qual);
-	return 1;
+	return r;
 }
 
 /*
@@ -12110,6 +12579,17 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch)
 
 	vmcs12 = get_vmcs12(vcpu);
 
+	/*
+	 * Can't VMLAUNCH or VMRESUME a shadow VMCS. Despite the fact
+	 * that there *is* a valid VMCS pointer, RFLAGS.CF is set
+	 * rather than RFLAGS.ZF, and no error number is stored to the
+	 * VM-instruction error field.
+	 */
+	if (vmcs12->hdr.shadow_vmcs) {
+		nested_vmx_failInvalid(vcpu);
+		goto out;
+	}
+
 	if (enable_shadow_vmcs)
 		copy_shadow_to_vmcs12(vmx);
 
@@ -12164,15 +12644,28 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch)
 	 */
 
 	vmx->nested.nested_run_pending = 1;
-	ret = enter_vmx_non_root_mode(vcpu);
+	ret = enter_vmx_non_root_mode(vcpu, &exit_qual);
 	if (ret) {
+		nested_vmx_entry_failure(vcpu, vmcs12, ret, exit_qual);
 		vmx->nested.nested_run_pending = 0;
-		return ret;
+		return 1;
 	}
 
 	/* Hide L1D cache contents from the nested guest.  */
 	vmx->vcpu.arch.l1tf_flush_l1d = true;
 
+	/*
+	 * Must happen outside of enter_vmx_non_root_mode() as it will
+	 * also be used as part of restoring nVMX state for
+	 * snapshot restore (migration).
+	 *
+	 * In this flow, it is assumed that vmcs12 cache was
+	 * transferred as part of captured nVMX state and should
+	 * therefore not be read from guest memory (which may not
+	 * exist on destination host yet).
+	 */
+	nested_cache_shadow_vmcs12(vcpu, vmcs12);
+
 	/*
 	 * If we're entering a halted L2 vcpu and the L2 vcpu won't be woken
 	 * by event injection, halt vcpu.
@@ -12682,6 +13175,17 @@ static void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason,
 			prepare_vmcs12(vcpu, vmcs12, exit_reason, exit_intr_info,
 				       exit_qualification);
 
+		/*
+		 * Must happen outside of sync_vmcs12() as it will
+		 * also be used to capture vmcs12 cache as part of
+		 * capturing nVMX state for snapshot (migration).
+		 *
+		 * Otherwise, this flush will dirty guest memory at a
+		 * point it is already assumed by user-space to be
+		 * immutable.
+		 */
+		nested_flush_cached_shadow_vmcs12(vcpu, vmcs12);
+
 		if (nested_vmx_store_msr(vcpu, vmcs12->vm_exit_msr_store_addr,
 					 vmcs12->vm_exit_msr_store_count))
 			nested_vmx_abort(vcpu, VMX_ABORT_SAVE_GUEST_MSR_FAIL);
@@ -13256,7 +13760,7 @@ static int vmx_pre_leave_smm(struct kvm_vcpu *vcpu, u64 smbase)
 
 	if (vmx->nested.smm.guest_mode) {
 		vcpu->arch.hflags &= ~HF_SMM_MASK;
-		ret = enter_vmx_non_root_mode(vcpu);
+		ret = enter_vmx_non_root_mode(vcpu, NULL);
 		vcpu->arch.hflags |= HF_SMM_MASK;
 		if (ret)
 			return ret;
@@ -13271,6 +13775,199 @@ static int enable_smi_window(struct kvm_vcpu *vcpu)
 	return 0;
 }
 
+static int vmx_get_nested_state(struct kvm_vcpu *vcpu,
+				struct kvm_nested_state __user *user_kvm_nested_state,
+				u32 user_data_size)
+{
+	struct vcpu_vmx *vmx;
+	struct vmcs12 *vmcs12;
+	struct kvm_nested_state kvm_state = {
+		.flags = 0,
+		.format = 0,
+		.size = sizeof(kvm_state),
+		.vmx.vmxon_pa = -1ull,
+		.vmx.vmcs_pa = -1ull,
+	};
+
+	if (!vcpu)
+		return kvm_state.size + 2 * VMCS12_SIZE;
+
+	vmx = to_vmx(vcpu);
+	vmcs12 = get_vmcs12(vcpu);
+	if (nested_vmx_allowed(vcpu) &&
+	    (vmx->nested.vmxon || vmx->nested.smm.vmxon)) {
+		kvm_state.vmx.vmxon_pa = vmx->nested.vmxon_ptr;
+		kvm_state.vmx.vmcs_pa = vmx->nested.current_vmptr;
+
+		if (vmx->nested.current_vmptr != -1ull) {
+			kvm_state.size += VMCS12_SIZE;
+
+			if (is_guest_mode(vcpu) &&
+			    nested_cpu_has_shadow_vmcs(vmcs12) &&
+			    vmcs12->vmcs_link_pointer != -1ull)
+				kvm_state.size += VMCS12_SIZE;
+		}
+
+		if (vmx->nested.smm.vmxon)
+			kvm_state.vmx.smm.flags |= KVM_STATE_NESTED_SMM_VMXON;
+
+		if (vmx->nested.smm.guest_mode)
+			kvm_state.vmx.smm.flags |= KVM_STATE_NESTED_SMM_GUEST_MODE;
+
+		if (is_guest_mode(vcpu)) {
+			kvm_state.flags |= KVM_STATE_NESTED_GUEST_MODE;
+
+			if (vmx->nested.nested_run_pending)
+				kvm_state.flags |= KVM_STATE_NESTED_RUN_PENDING;
+		}
+	}
+
+	if (user_data_size < kvm_state.size)
+		goto out;
+
+	if (copy_to_user(user_kvm_nested_state, &kvm_state, sizeof(kvm_state)))
+		return -EFAULT;
+
+	if (vmx->nested.current_vmptr == -1ull)
+		goto out;
+
+	/*
+	 * When running L2, the authoritative vmcs12 state is in the
+	 * vmcs02. When running L1, the authoritative vmcs12 state is
+	 * in the shadow vmcs linked to vmcs01, unless
+	 * sync_shadow_vmcs is set, in which case, the authoritative
+	 * vmcs12 state is in the vmcs12 already.
+	 */
+	if (is_guest_mode(vcpu))
+		sync_vmcs12(vcpu, vmcs12);
+	else if (enable_shadow_vmcs && !vmx->nested.sync_shadow_vmcs)
+		copy_shadow_to_vmcs12(vmx);
+
+	if (copy_to_user(user_kvm_nested_state->data, vmcs12, sizeof(*vmcs12)))
+		return -EFAULT;
+
+	if (nested_cpu_has_shadow_vmcs(vmcs12) &&
+	    vmcs12->vmcs_link_pointer != -1ull) {
+		if (copy_to_user(user_kvm_nested_state->data + VMCS12_SIZE,
+				 get_shadow_vmcs12(vcpu), sizeof(*vmcs12)))
+			return -EFAULT;
+	}
+
+out:
+	return kvm_state.size;
+}
+
+static int vmx_set_nested_state(struct kvm_vcpu *vcpu,
+				struct kvm_nested_state __user *user_kvm_nested_state,
+				struct kvm_nested_state *kvm_state)
+{
+	struct vcpu_vmx *vmx = to_vmx(vcpu);
+	struct vmcs12 *vmcs12;
+	u32 exit_qual;
+	int ret;
+
+	if (kvm_state->format != 0)
+		return -EINVAL;
+
+	if (!nested_vmx_allowed(vcpu))
+		return kvm_state->vmx.vmxon_pa == -1ull ? 0 : -EINVAL;
+
+	if (kvm_state->vmx.vmxon_pa == -1ull) {
+		if (kvm_state->vmx.smm.flags)
+			return -EINVAL;
+
+		if (kvm_state->vmx.vmcs_pa != -1ull)
+			return -EINVAL;
+
+		vmx_leave_nested(vcpu);
+		return 0;
+	}
+
+	if (!page_address_valid(vcpu, kvm_state->vmx.vmxon_pa))
+		return -EINVAL;
+
+	if (kvm_state->size < sizeof(kvm_state) + sizeof(*vmcs12))
+		return -EINVAL;
+
+	if (kvm_state->vmx.vmcs_pa == kvm_state->vmx.vmxon_pa ||
+	    !page_address_valid(vcpu, kvm_state->vmx.vmcs_pa))
+		return -EINVAL;
+
+	if ((kvm_state->vmx.smm.flags & KVM_STATE_NESTED_SMM_GUEST_MODE) &&
+	    (kvm_state->flags & KVM_STATE_NESTED_GUEST_MODE))
+		return -EINVAL;
+
+	if (kvm_state->vmx.smm.flags &
+	    ~(KVM_STATE_NESTED_SMM_GUEST_MODE | KVM_STATE_NESTED_SMM_VMXON))
+		return -EINVAL;
+
+	if ((kvm_state->vmx.smm.flags & KVM_STATE_NESTED_SMM_GUEST_MODE) &&
+	    !(kvm_state->vmx.smm.flags & KVM_STATE_NESTED_SMM_VMXON))
+		return -EINVAL;
+
+	vmx_leave_nested(vcpu);
+	if (kvm_state->vmx.vmxon_pa == -1ull)
+		return 0;
+
+	vmx->nested.vmxon_ptr = kvm_state->vmx.vmxon_pa;
+	ret = enter_vmx_operation(vcpu);
+	if (ret)
+		return ret;
+
+	set_current_vmptr(vmx, kvm_state->vmx.vmcs_pa);
+
+	if (kvm_state->vmx.smm.flags & KVM_STATE_NESTED_SMM_VMXON) {
+		vmx->nested.smm.vmxon = true;
+		vmx->nested.vmxon = false;
+
+		if (kvm_state->vmx.smm.flags & KVM_STATE_NESTED_SMM_GUEST_MODE)
+			vmx->nested.smm.guest_mode = true;
+	}
+
+	vmcs12 = get_vmcs12(vcpu);
+	if (copy_from_user(vmcs12, user_kvm_nested_state->data, sizeof(*vmcs12)))
+		return -EFAULT;
+
+	if (vmcs12->hdr.revision_id != VMCS12_REVISION)
+		return -EINVAL;
+
+	if (!(kvm_state->flags & KVM_STATE_NESTED_GUEST_MODE))
+		return 0;
+
+	vmx->nested.nested_run_pending =
+		!!(kvm_state->flags & KVM_STATE_NESTED_RUN_PENDING);
+
+	if (nested_cpu_has_shadow_vmcs(vmcs12) &&
+	    vmcs12->vmcs_link_pointer != -1ull) {
+		struct vmcs12 *shadow_vmcs12 = get_shadow_vmcs12(vcpu);
+		if (kvm_state->size < sizeof(kvm_state) + 2 * sizeof(*vmcs12))
+			return -EINVAL;
+
+		if (copy_from_user(shadow_vmcs12,
+				   user_kvm_nested_state->data + VMCS12_SIZE,
+				   sizeof(*vmcs12)))
+			return -EFAULT;
+
+		if (shadow_vmcs12->hdr.revision_id != VMCS12_REVISION ||
+		    !shadow_vmcs12->hdr.shadow_vmcs)
+			return -EINVAL;
+	}
+
+	if (check_vmentry_prereqs(vcpu, vmcs12) ||
+	    check_vmentry_postreqs(vcpu, vmcs12, &exit_qual))
+		return -EINVAL;
+
+	if (kvm_state->flags & KVM_STATE_NESTED_RUN_PENDING)
+		vmx->nested.nested_run_pending = 1;
+
+	vmx->nested.dirty_vmcs12 = true;
+	ret = enter_vmx_non_root_mode(vcpu, NULL);
+	if (ret)
+		return -EINVAL;
+
+	return 0;
+}
+
 static struct kvm_x86_ops vmx_x86_ops __ro_after_init = {
 	.cpu_has_kvm_support = cpu_has_kvm_support,
 	.disabled_by_bios = vmx_disabled_by_bios,
@@ -13290,7 +13987,7 @@ static struct kvm_x86_ops vmx_x86_ops __ro_after_init = {
 	.vcpu_free = vmx_free_vcpu,
 	.vcpu_reset = vmx_vcpu_reset,
 
-	.prepare_guest_switch = vmx_save_host_state,
+	.prepare_guest_switch = vmx_prepare_switch_to_guest,
 	.vcpu_load = vmx_vcpu_load,
 	.vcpu_put = vmx_vcpu_put,
 
@@ -13323,6 +14020,7 @@ static struct kvm_x86_ops vmx_x86_ops __ro_after_init = {
 	.set_rflags = vmx_set_rflags,
 
 	.tlb_flush = vmx_flush_tlb,
+	.tlb_flush_gva = vmx_flush_tlb_gva,
 
 	.run = vmx_vcpu_run,
 	.handle_exit = vmx_handle_exit,
@@ -13405,6 +14103,10 @@ static struct kvm_x86_ops vmx_x86_ops __ro_after_init = {
 
 	.setup_mce = vmx_setup_mce,
 
+	.get_nested_state = vmx_get_nested_state,
+	.set_nested_state = vmx_set_nested_state,
+	.get_vmcs12_pages = nested_get_vmcs12_pages,
+
 	.smi_allowed = vmx_smi_allowed,
 	.pre_enter_smm = vmx_pre_enter_smm,
 	.pre_leave_smm = vmx_pre_leave_smm,

+ 91 - 17
arch/x86/kvm/x86.c
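Before the arch/x86/kvm/x86.c hunks, a sketch of the blob format implied by vmx_get_nested_state()/vmx_set_nested_state() above. The offsets are read off the copy_to_user()/copy_from_user() calls; the summary itself is an editor's aside, not text from the commit:

	/*
	 * KVM_GET_NESTED_STATE buffer, as filled by vmx_get_nested_state():
	 *
	 *   [0, sizeof(struct kvm_nested_state))          header: flags, format, size,
	 *                                                 vmx.vmxon_pa, vmx.vmcs_pa,
	 *                                                 vmx.smm.flags
	 *   [data, data + VMCS12_SIZE)                    vmcs12 image
	 *   [data + VMCS12_SIZE, data + 2 * VMCS12_SIZE)  shadow vmcs12 image, present
	 *                                                 only in guest mode with VMCS
	 *                                                 shadowing and a valid
	 *                                                 vmcs_link_pointer
	 */

The worst case, sizeof(header) + 2 * VMCS12_SIZE, is what get_nested_state(NULL, 0, 0) reports through KVM_CHECK_EXTENSION(KVM_CAP_NESTED_STATE) in the x86.c changes below.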

@@ -848,16 +848,21 @@ EXPORT_SYMBOL_GPL(kvm_set_cr4);
 
 int kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
 {
+	bool skip_tlb_flush = false;
 #ifdef CONFIG_X86_64
 	bool pcid_enabled = kvm_read_cr4_bits(vcpu, X86_CR4_PCIDE);
 
-	if (pcid_enabled)
-		cr3 &= ~CR3_PCID_INVD;
+	if (pcid_enabled) {
+		skip_tlb_flush = cr3 & X86_CR3_PCID_NOFLUSH;
+		cr3 &= ~X86_CR3_PCID_NOFLUSH;
+	}
 #endif
 
 	if (cr3 == kvm_read_cr3(vcpu) && !pdptrs_changed(vcpu)) {
-		kvm_mmu_sync_roots(vcpu);
-		kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
+		if (!skip_tlb_flush) {
+			kvm_mmu_sync_roots(vcpu);
+			kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
+		}
 		return 0;
 	}
 
@@ -868,9 +873,10 @@ int kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
 		   !load_pdptrs(vcpu, vcpu->arch.walk_mmu, cr3))
 		return 1;
 
+	kvm_mmu_new_cr3(vcpu, cr3, skip_tlb_flush);
 	vcpu->arch.cr3 = cr3;
 	__set_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail);
-	kvm_mmu_new_cr3(vcpu);
+
 	return 0;
 }
 EXPORT_SYMBOL_GPL(kvm_set_cr3);
@@ -2185,10 +2191,11 @@ static int set_msr_mce(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
 		vcpu->arch.mcg_status = data;
 		break;
 	case MSR_IA32_MCG_CTL:
-		if (!(mcg_cap & MCG_CTL_P))
+		if (!(mcg_cap & MCG_CTL_P) &&
+		    (data || !msr_info->host_initiated))
 			return 1;
 		if (data != 0 && data != ~(u64)0)
-			return -1;
+			return 1;
 		vcpu->arch.mcg_ctl = data;
 		break;
 	default:
@@ -2576,7 +2583,7 @@ int kvm_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr)
 }
 EXPORT_SYMBOL_GPL(kvm_get_msr);
 
-static int get_msr_mce(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
+static int get_msr_mce(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata, bool host)
 {
 	u64 data;
 	u64 mcg_cap = vcpu->arch.mcg_cap;
@@ -2591,7 +2598,7 @@ static int get_msr_mce(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
 		data = vcpu->arch.mcg_cap;
 		break;
 	case MSR_IA32_MCG_CTL:
-		if (!(mcg_cap & MCG_CTL_P))
+		if (!(mcg_cap & MCG_CTL_P) && !host)
 			return 1;
 		data = vcpu->arch.mcg_ctl;
 		break;
@@ -2724,7 +2731,8 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
 	case MSR_IA32_MCG_CTL:
 	case MSR_IA32_MCG_STATUS:
 	case MSR_IA32_MC0_CTL ... MSR_IA32_MCx_CTL(KVM_MAX_MCE_BANKS) - 1:
-		return get_msr_mce(vcpu, msr_info->index, &msr_info->data);
+		return get_msr_mce(vcpu, msr_info->index, &msr_info->data,
+				   msr_info->host_initiated);
 	case MSR_K7_CLK_CTL:
 		/*
 		 * Provide expected ramp-up count for K7. All other
@@ -2745,7 +2753,8 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
 	case HV_X64_MSR_TSC_EMULATION_CONTROL:
 	case HV_X64_MSR_TSC_EMULATION_STATUS:
 		return kvm_hv_get_msr_common(vcpu,
-					     msr_info->index, &msr_info->data);
+					     msr_info->index, &msr_info->data,
+					     msr_info->host_initiated);
 		break;
 	case MSR_IA32_BBL_CR_CTL3:
 		/* This legacy MSR exists but isn't fully documented in current
@@ -2969,6 +2978,10 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
 	case KVM_CAP_X2APIC_API:
 		r = KVM_X2APIC_API_VALID_FLAGS;
 		break;
+	case KVM_CAP_NESTED_STATE:
+		r = kvm_x86_ops->get_nested_state ?
+			kvm_x86_ops->get_nested_state(NULL, 0, 0) : 0;
+		break;
 	default:
 		break;
 	}
@@ -3985,6 +3998,56 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
 		r = kvm_vcpu_ioctl_enable_cap(vcpu, &cap);
 		break;
 	}
+	case KVM_GET_NESTED_STATE: {
+		struct kvm_nested_state __user *user_kvm_nested_state = argp;
+		u32 user_data_size;
+
+		r = -EINVAL;
+		if (!kvm_x86_ops->get_nested_state)
+			break;
+
+		BUILD_BUG_ON(sizeof(user_data_size) != sizeof(user_kvm_nested_state->size));
+		if (get_user(user_data_size, &user_kvm_nested_state->size))
+			return -EFAULT;
+
+		r = kvm_x86_ops->get_nested_state(vcpu, user_kvm_nested_state,
+						  user_data_size);
+		if (r < 0)
+			return r;
+
+		if (r > user_data_size) {
+			if (put_user(r, &user_kvm_nested_state->size))
+				return -EFAULT;
+			return -E2BIG;
+		}
+		r = 0;
+		break;
+	}
+	case KVM_SET_NESTED_STATE: {
+		struct kvm_nested_state __user *user_kvm_nested_state = argp;
+		struct kvm_nested_state kvm_state;
+
+		r = -EINVAL;
+		if (!kvm_x86_ops->set_nested_state)
+			break;
+
+		if (copy_from_user(&kvm_state, user_kvm_nested_state, sizeof(kvm_state)))
+			return -EFAULT;
+
+		if (kvm_state.size < sizeof(kvm_state))
+			return -EINVAL;
+
+		if (kvm_state.flags &
+		    ~(KVM_STATE_NESTED_RUN_PENDING | KVM_STATE_NESTED_GUEST_MODE))
+			return -EINVAL;
+
+		/* nested_run_pending implies guest_mode.  */
+		if (kvm_state.flags == KVM_STATE_NESTED_RUN_PENDING)
+			return -EINVAL;
+
+		r = kvm_x86_ops->set_nested_state(vcpu, user_kvm_nested_state, &kvm_state);
+		break;
+	}
 	default:
 		r = -EINVAL;
 	}
@@ -6503,8 +6566,12 @@ static void kvm_set_mmio_spte_mask(void)
 	 * Set the reserved bits and the present bit of an paging-structure
 	 * entry to generate page fault with PFER.RSV = 1.
 	 */
-	 /* Mask the reserved physical address bits. */
-	mask = rsvd_bits(maxphyaddr, 51);
+
+	/*
+	 * Mask the uppermost physical address bit, which would be reserved as
+	 * long as the supported physical address width is less than 52.
+	 */
+	mask = 1ull << 51;
 
 	/* Set the present bit. */
 	mask |= 1ull;
@@ -6769,6 +6836,9 @@ int kvm_emulate_hypercall(struct kvm_vcpu *vcpu)
 	case KVM_HC_CLOCK_PAIRING:
 		ret = kvm_pv_clock_pairing(vcpu, a0, a1);
 		break;
+	case KVM_HC_SEND_IPI:
+		ret = kvm_pv_send_ipi(vcpu->kvm, a0, a1, a2, a3, op_64_bit);
+		break;
 #endif
 	default:
 		ret = -KVM_ENOSYS;
@@ -7287,6 +7357,8 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
 	bool req_immediate_exit = false;
 
 	if (kvm_request_pending(vcpu)) {
+		if (kvm_check_request(KVM_REQ_GET_VMCS12_PAGES, vcpu))
+			kvm_x86_ops->get_vmcs12_pages(vcpu);
 		if (kvm_check_request(KVM_REQ_MMU_RELOAD, vcpu))
 			kvm_mmu_unload(vcpu);
 		if (kvm_check_request(KVM_REQ_MIGRATE_TIMER, vcpu))
@@ -7302,6 +7374,8 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
 		}
 		if (kvm_check_request(KVM_REQ_MMU_SYNC, vcpu))
 			kvm_mmu_sync_roots(vcpu);
+		if (kvm_check_request(KVM_REQ_LOAD_CR3, vcpu))
+			kvm_mmu_load_cr3(vcpu);
 		if (kvm_check_request(KVM_REQ_TLB_FLUSH, vcpu))
 			kvm_vcpu_flush_tlb(vcpu, true);
 		if (kvm_check_request(KVM_REQ_REPORT_TPR_ACCESS, vcpu)) {
@@ -8013,6 +8087,10 @@ EXPORT_SYMBOL_GPL(kvm_task_switch);
 
 static int kvm_valid_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs)
 {
+	if (!guest_cpuid_has(vcpu, X86_FEATURE_XSAVE) &&
+			(sregs->cr4 & X86_CR4_OSXSAVE))
+		return  -EINVAL;
+
 	if ((sregs->efer & EFER_LME) && (sregs->cr0 & X86_CR0_PG)) {
 		/*
 		 * When EFER.LME and CR0.PG are set, the processor is in
@@ -8043,10 +8121,6 @@ static int __set_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs)
 	struct desc_ptr dt;
 	int ret = -EINVAL;
 
-	if (!guest_cpuid_has(vcpu, X86_FEATURE_XSAVE) &&
-			(sregs->cr4 & X86_CR4_OSXSAVE))
-		goto out;
-
 	if (kvm_valid_sregs(vcpu, sregs))
 		goto out;
 

+ 19 - 5
include/linux/kvm_host.h

@@ -130,7 +130,7 @@ static inline bool is_error_page(struct page *page)
 #define KVM_REQUEST_ARCH_BASE     8
 
 #define KVM_ARCH_REQ_FLAGS(nr, flags) ({ \
-	BUILD_BUG_ON((unsigned)(nr) >= 32 - KVM_REQUEST_ARCH_BASE); \
+	BUILD_BUG_ON((unsigned)(nr) >= (FIELD_SIZEOF(struct kvm_vcpu, requests) * 8) - KVM_REQUEST_ARCH_BASE); \
 	(unsigned)(((nr) + KVM_REQUEST_ARCH_BASE) | (flags)); \
 })
 #define KVM_ARCH_REQ(nr)           KVM_ARCH_REQ_FLAGS(nr, 0)
@@ -224,7 +224,7 @@ struct kvm_vcpu {
 	int vcpu_id;
 	int srcu_idx;
 	int mode;
-	unsigned long requests;
+	u64 requests;
 	unsigned long guest_debug;
 
 	int pre_pcpu;
@@ -309,6 +309,13 @@ static inline unsigned long kvm_dirty_bitmap_bytes(struct kvm_memory_slot *memsl
 	return ALIGN(memslot->npages, BITS_PER_LONG) / 8;
 }
 
+static inline unsigned long *kvm_second_dirty_bitmap(struct kvm_memory_slot *memslot)
+{
+	unsigned long len = kvm_dirty_bitmap_bytes(memslot);
+
+	return memslot->dirty_bitmap + len / sizeof(*memslot->dirty_bitmap);
+}
+
 struct kvm_s390_adapter_int {
 	u64 ind_addr;
 	u64 summary_addr;
@@ -827,6 +834,13 @@ static inline void kvm_arch_free_vm(struct kvm *kvm)
 }
 #endif
 
+#ifndef __KVM_HAVE_ARCH_FLUSH_REMOTE_TLB
+static inline int kvm_arch_flush_remote_tlb(struct kvm *kvm)
+{
+	return -ENOTSUPP;
+}
+#endif
+
 #ifdef __KVM_HAVE_ARCH_NONCOHERENT_DMA
 void kvm_arch_register_noncoherent_dma(struct kvm *kvm);
 void kvm_arch_unregister_noncoherent_dma(struct kvm *kvm);
@@ -1124,7 +1138,7 @@ static inline void kvm_make_request(int req, struct kvm_vcpu *vcpu)
 	 * caller.  Paired with the smp_mb__after_atomic in kvm_check_request.
 	 */
 	smp_wmb();
-	set_bit(req & KVM_REQUEST_MASK, &vcpu->requests);
+	set_bit(req & KVM_REQUEST_MASK, (void *)&vcpu->requests);
 }
 
 static inline bool kvm_request_pending(struct kvm_vcpu *vcpu)
@@ -1134,12 +1148,12 @@ static inline bool kvm_request_pending(struct kvm_vcpu *vcpu)
 
 static inline bool kvm_test_request(int req, struct kvm_vcpu *vcpu)
 {
-	return test_bit(req & KVM_REQUEST_MASK, &vcpu->requests);
+	return test_bit(req & KVM_REQUEST_MASK, (void *)&vcpu->requests);
 }
 
 static inline void kvm_clear_request(int req, struct kvm_vcpu *vcpu)
 {
-	clear_bit(req & KVM_REQUEST_MASK, &vcpu->requests);
+	clear_bit(req & KVM_REQUEST_MASK, (void *)&vcpu->requests);
 }
 
 static inline bool kvm_check_request(int req, struct kvm_vcpu *vcpu)

+ 4 - 0
include/uapi/linux/kvm.h

@@ -950,6 +950,7 @@ struct kvm_ppc_resize_hpt {
 #define KVM_CAP_HYPERV_EVENTFD 154
 #define KVM_CAP_HYPERV_TLBFLUSH 155
 #define KVM_CAP_S390_HPAGE_1M 156
+#define KVM_CAP_NESTED_STATE 157
 
 #ifdef KVM_CAP_IRQ_ROUTING
 
@@ -1392,6 +1393,9 @@ struct kvm_enc_region {
 /* Available with KVM_CAP_HYPERV_EVENTFD */
 #define KVM_HYPERV_EVENTFD        _IOW(KVMIO,  0xbd, struct kvm_hyperv_eventfd)
 
+/* Available with KVM_CAP_NESTED_STATE */
+#define KVM_GET_NESTED_STATE         _IOWR(KVMIO, 0xbe, struct kvm_nested_state)
+#define KVM_SET_NESTED_STATE         _IOW(KVMIO,  0xbf, struct kvm_nested_state)
 
 /* Secure Encrypted Virtualization command */
 enum sev_cmd_id {

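The two new ioctls pair up for migrating a nested guest: userspace pulls the opaque blob from the source vCPU and feeds it to the destination before running it. A hedged sketch of the save side, using only behaviour visible in the x86.c plumbing above (error handling trimmed, helper name illustrative; assumes <linux/kvm.h>, <sys/ioctl.h>, <stdlib.h>):

	/* Sketch: fetch a vCPU's nested state; returns NULL if unsupported. */
	struct kvm_nested_state *save_nested_state(int vm_fd, int vcpu_fd)
	{
		struct kvm_nested_state *state;
		int max_size;

		/* KVM_CHECK_EXTENSION reports the maximum blob size (0 if absent). */
		max_size = ioctl(vm_fd, KVM_CHECK_EXTENSION, KVM_CAP_NESTED_STATE);
		if (max_size <= 0)
			return NULL;

		state = calloc(1, max_size);
		state->size = max_size;	/* the kernel reads ->size before filling */

		if (ioctl(vcpu_fd, KVM_GET_NESTED_STATE, state) < 0) {
			free(state);
			return NULL;
		}
		return state;	/* restore: ioctl(dst_vcpu_fd, KVM_SET_NESTED_STATE, state) */
	}

If the buffer is too small, the kernel writes the required size back into ->size and fails with E2BIG, so a caller can also size the buffer on demand.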
+ 2 - 0
include/uapi/linux/kvm_para.h

@@ -13,6 +13,7 @@
 /* Return values for hypercalls */
 #define KVM_ENOSYS		1000
 #define KVM_EFAULT		EFAULT
+#define KVM_EINVAL		EINVAL
 #define KVM_E2BIG		E2BIG
 #define KVM_EPERM		EPERM
 #define KVM_EOPNOTSUPP		95
@@ -26,6 +27,7 @@
 #define KVM_HC_MIPS_EXIT_VM		7
 #define KVM_HC_MIPS_CONSOLE_OUTPUT	8
 #define KVM_HC_CLOCK_PAIRING		9
+#define KVM_HC_SEND_IPI		10
 
 /*
  * hypercalls use architecture specific

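KVM_HC_SEND_IPI is the hypercall behind the new PV IPI support. A guest-side sketch of issuing it; the register convention (nr in RAX, arguments in RBX/RCX/RDX/RSI) is the standard KVM x86 hypercall ABI, and the argument meanings below follow the hypercalls.txt addition in this pull — treat both as assumptions here rather than text from these hunks. Real guests also pick vmcall vs. vmmcall by CPU vendor:

	/*
	 * a0/a1: bitmap of destination APIC IDs (low/high 64 bits) -- assumed layout
	 * a2:    the APIC ID the bitmap starts at
	 * a3:    the interrupt vector to deliver
	 */
	static inline long kvm_send_ipi(unsigned long bitmap_low, unsigned long bitmap_high,
					unsigned long min_apic_id, unsigned long vector)
	{
		long ret;

		asm volatile("vmcall"
			     : "=a" (ret)
			     : "a" (KVM_HC_SEND_IPI), "b" (bitmap_low), "c" (bitmap_high),
			       "d" (min_apic_id), "S" (vector)
			     : "memory");
		return ret;
	}

On the host side this lands in kvm_emulate_hypercall() (see the x86.c hunk above), which forwards the four arguments to kvm_pv_send_ipi().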
+ 2 - 0
tools/testing/selftests/kvm/.gitignore

@@ -1,3 +1,5 @@
+cr4_cpuid_sync_test
 set_sregs_test
 sync_regs_test
 vmx_tsc_adjust_test
+state_test

+ 2 - 0
tools/testing/selftests/kvm/Makefile

@@ -9,6 +9,8 @@ LIBKVM_x86_64 = lib/x86.c lib/vmx.c
 TEST_GEN_PROGS_x86_64 = set_sregs_test
 TEST_GEN_PROGS_x86_64 += sync_regs_test
 TEST_GEN_PROGS_x86_64 += vmx_tsc_adjust_test
+TEST_GEN_PROGS_x86_64 += cr4_cpuid_sync_test
+TEST_GEN_PROGS_x86_64 += state_test
 
 TEST_GEN_PROGS += $(TEST_GEN_PROGS_$(UNAME_M))
 LIBKVM += $(LIBKVM_$(UNAME_M))

+ 129 - 0
tools/testing/selftests/kvm/cr4_cpuid_sync_test.c

@@ -0,0 +1,129 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * CR4 and CPUID sync test
+ *
+ * Copyright 2018, Red Hat, Inc. and/or its affiliates.
+ *
+ * Author:
+ *   Wei Huang <wei@redhat.com>
+ */
+
+#include <fcntl.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/ioctl.h>
+
+#include "test_util.h"
+
+#include "kvm_util.h"
+#include "x86.h"
+
+#define X86_FEATURE_XSAVE	(1<<26)
+#define X86_FEATURE_OSXSAVE	(1<<27)
+#define VCPU_ID			1
+
+enum {
+	GUEST_UPDATE_CR4 = 0x1000,
+	GUEST_FAILED,
+	GUEST_DONE,
+};
+
+static void exit_to_hv(uint16_t port)
+{
+	__asm__ __volatile__("in %[port], %%al"
+			     :
+			     : [port]"d"(port)
+			     : "rax");
+}
+
+static inline bool cr4_cpuid_is_sync(void)
+{
+	int func, subfunc;
+	uint32_t eax, ebx, ecx, edx;
+	uint64_t cr4;
+
+	func = 0x1;
+	subfunc = 0x0;
+	__asm__ __volatile__("cpuid"
+			     : "=a"(eax), "=b"(ebx), "=c"(ecx), "=d"(edx)
+			     : "a"(func), "c"(subfunc));
+
+	cr4 = get_cr4();
+
+	return (!!(ecx & X86_FEATURE_OSXSAVE)) == (!!(cr4 & X86_CR4_OSXSAVE));
+}
+
+static void guest_code(void)
+{
+	uint64_t cr4;
+
+	/* turn on CR4.OSXSAVE */
+	cr4 = get_cr4();
+	cr4 |= X86_CR4_OSXSAVE;
+	set_cr4(cr4);
+
+	/* verify CR4.OSXSAVE == CPUID.OSXSAVE */
+	if (!cr4_cpuid_is_sync())
+		exit_to_hv(GUEST_FAILED);
+
+	/* notify hypervisor to change CR4 */
+	exit_to_hv(GUEST_UPDATE_CR4);
+
+	/* check again */
+	if (!cr4_cpuid_is_sync())
+		exit_to_hv(GUEST_FAILED);
+
+	exit_to_hv(GUEST_DONE);
+}
+
+int main(int argc, char *argv[])
+{
+	struct kvm_run *run;
+	struct kvm_vm *vm;
+	struct kvm_sregs sregs;
+	struct kvm_cpuid_entry2 *entry;
+	int rc;
+
+	entry = kvm_get_supported_cpuid_entry(1);
+	if (!(entry->ecx & X86_FEATURE_XSAVE)) {
+		printf("XSAVE feature not supported, skipping test\n");
+		return 0;
+	}
+
+	/* Tell stdout not to buffer its content */
+	setbuf(stdout, NULL);
+
+	/* Create VM */
+	vm = vm_create_default(VCPU_ID, guest_code);
+	vcpu_set_cpuid(vm, VCPU_ID, kvm_get_supported_cpuid());
+	run = vcpu_state(vm, VCPU_ID);
+
+	while (1) {
+		rc = _vcpu_run(vm, VCPU_ID);
+
+		if (run->exit_reason == KVM_EXIT_IO) {
+			switch (run->io.port) {
+			case GUEST_UPDATE_CR4:
+				/* emulate hypervisor clearing CR4.OSXSAVE */
+				vcpu_sregs_get(vm, VCPU_ID, &sregs);
+				sregs.cr4 &= ~X86_CR4_OSXSAVE;
+				vcpu_sregs_set(vm, VCPU_ID, &sregs);
+				break;
+			case GUEST_FAILED:
+				TEST_ASSERT(false, "Guest CR4 bit (OSXSAVE) unsynchronized with CPUID bit.");
+				break;
+			case GUEST_DONE:
+				goto done;
+			default:
+				TEST_ASSERT(false, "Unknown port 0x%x.",
+					    run->io.port);
+			}
+		}
+	}
+
+	kvm_vm_free(vm);
+
+done:
+	return 0;
+}

+ 3 - 1
tools/testing/selftests/kvm/include/kvm_util.h

@@ -53,6 +53,8 @@ int kvm_check_cap(long cap);
 
 struct kvm_vm *vm_create(enum vm_guest_mode mode, uint64_t phy_pages, int perm);
 void kvm_vm_free(struct kvm_vm *vmp);
+void kvm_vm_restart(struct kvm_vm *vmp, int perm);
+void kvm_vm_release(struct kvm_vm *vmp);
 
 int kvm_memcmp_hva_gva(void *hva,
 	struct kvm_vm *vm, const vm_vaddr_t gva, size_t len);
@@ -75,7 +77,7 @@ void vcpu_ioctl(struct kvm_vm *vm,
 	uint32_t vcpuid, unsigned long ioctl, void *arg);
 void vm_ioctl(struct kvm_vm *vm, unsigned long ioctl, void *arg);
 void vm_mem_region_set_flags(struct kvm_vm *vm, uint32_t slot, uint32_t flags);
-void vm_vcpu_add(struct kvm_vm *vm, uint32_t vcpuid);
+void vm_vcpu_add(struct kvm_vm *vm, uint32_t vcpuid, int pgd_memslot, int gdt_memslot);
 vm_vaddr_t vm_vaddr_alloc(struct kvm_vm *vm, size_t sz, vm_vaddr_t vaddr_min,
 	uint32_t data_memslot, uint32_t pgd_memslot);
 void *addr_gpa2hva(struct kvm_vm *vm, vm_paddr_t gpa);

+ 62 - 4
tools/testing/selftests/kvm/include/vmx.h

@@ -380,6 +380,30 @@ static inline int vmptrld(uint64_t vmcs_pa)
 	return ret;
 }
 
+static inline int vmptrst(uint64_t *value)
+{
+	uint64_t tmp;
+	uint8_t ret;
+
+	__asm__ __volatile__("vmptrst %[value]; setna %[ret]"
+		: [value]"=m"(tmp), [ret]"=rm"(ret)
+		: : "cc", "memory");
+
+	*value = tmp;
+	return ret;
+}
+
+/*
+ * A wrapper around vmptrst that ignores errors and returns zero if the
+ * vmptrst instruction fails.
+ */
+static inline uint64_t vmptrstz(void)
+{
+	uint64_t value = 0;
+	vmptrst(&value);
+	return value;
+}
+
 /*
  * No guest state (e.g. GPRs) is established by this vmlaunch.
  */
@@ -444,6 +468,15 @@ static inline int vmresume(void)
 	return ret;
 }
 
+static inline void vmcall(void)
+{
+	/* Currently, L1 destroys our GPRs during vmexits.  */
+	__asm__ __volatile__("push %%rbp; vmcall; pop %%rbp" : : :
+			     "rax", "rbx", "rcx", "rdx",
+			     "rsi", "rdi", "r8", "r9", "r10", "r11", "r12",
+			     "r13", "r14", "r15");
+}
+
 static inline int vmread(uint64_t encoding, uint64_t *value)
 {
 	uint64_t tmp;
@@ -486,9 +519,34 @@ static inline uint32_t vmcs_revision(void)
 	return rdmsr(MSR_IA32_VMX_BASIC);
 }
 
-void prepare_for_vmx_operation(void);
-void prepare_vmcs(void *guest_rip, void *guest_rsp);
-struct kvm_vm *vm_create_default_vmx(uint32_t vcpuid,
-				     vmx_guest_code_t guest_code);
+struct vmx_pages {
+	void *vmxon_hva;
+	uint64_t vmxon_gpa;
+	void *vmxon;
+
+	void *vmcs_hva;
+	uint64_t vmcs_gpa;
+	void *vmcs;
+
+	void *msr_hva;
+	uint64_t msr_gpa;
+	void *msr;
+
+	void *shadow_vmcs_hva;
+	uint64_t shadow_vmcs_gpa;
+	void *shadow_vmcs;
+
+	void *vmread_hva;
+	uint64_t vmread_gpa;
+	void *vmread;
+
+	void *vmwrite_hva;
+	uint64_t vmwrite_gpa;
+	void *vmwrite;
+};
+
+struct vmx_pages *vcpu_alloc_vmx(struct kvm_vm *vm, vm_vaddr_t *p_vmx_gva);
+bool prepare_for_vmx_operation(struct vmx_pages *vmx);
+void prepare_vmcs(struct vmx_pages *vmx, void *guest_rip, void *guest_rsp);
 
 
 #endif /* !SELFTEST_KVM_VMX_H */

+ 6 - 2
tools/testing/selftests/kvm/include/x86.h

@@ -59,8 +59,8 @@ enum x86_register {
 struct desc64 {
 	uint16_t limit0;
 	uint16_t base0;
-	unsigned base1:8, type:5, dpl:2, p:1;
-	unsigned limit1:4, zero0:3, g:1, base2:8;
+	unsigned base1:8, s:1, type:4, dpl:2, p:1;
+	unsigned limit1:4, avl:1, l:1, db:1, g:1, base2:8;
 	uint32_t base3;
 	uint32_t zero1;
 } __attribute__((packed));
@@ -303,6 +303,10 @@ static inline unsigned long get_xmm(int n)
 	return 0;
 }
 
+struct kvm_x86_state;
+struct kvm_x86_state *vcpu_save_state(struct kvm_vm *vm, uint32_t vcpuid);
+void vcpu_load_state(struct kvm_vm *vm, uint32_t vcpuid, struct kvm_x86_state *state);
+
 /*
  * Basic CPU control in CR0
 */

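vcpu_save_state()/vcpu_load_state() are the x86 halves of the new state_test flow: together with kvm_vm_release()/kvm_vm_restart() from kvm_util.h they let a test snapshot a vCPU, drop every VM file descriptor, and resume on freshly opened ones. A condensed sketch of that sequence (VCPU_ID, the memslot arguments and the CPUID restore are illustrative choices, not mandated by the headers; assumes <fcntl.h> and <stdlib.h>):

	#define VCPU_ID 5

	static void checkpoint_and_resume(struct kvm_vm *vm)
	{
		struct kvm_x86_state *state;

		/* capture regs, sregs, MSRs and (if enabled) nested state */
		state = vcpu_save_state(vm, VCPU_ID);

		/* close the vCPU/VM fds while keeping the memslot description */
		kvm_vm_release(vm);

		/* reopen /dev/kvm, recreate the VM and replay the memory regions */
		kvm_vm_restart(vm, O_RDWR);
		vm_vcpu_add(vm, VCPU_ID, 0, 0);
		vcpu_set_cpuid(vm, VCPU_ID, kvm_get_supported_cpuid());
		vcpu_load_state(vm, VCPU_ID, state);

		vcpu_run(vm, VCPU_ID);
		free(state);
	}

This mirrors how the new state_test in this series exercises KVM_GET_NESTED_STATE/KVM_SET_NESTED_STATE end to end.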
+ 73 - 21
tools/testing/selftests/kvm/lib/kvm_util.c

@@ -62,6 +62,18 @@ int kvm_check_cap(long cap)
 	return ret;
 }
 
+static void vm_open(struct kvm_vm *vm, int perm)
+{
+	vm->kvm_fd = open(KVM_DEV_PATH, perm);
+	if (vm->kvm_fd < 0)
+		exit(KSFT_SKIP);
+
+	/* Create VM. */
+	vm->fd = ioctl(vm->kvm_fd, KVM_CREATE_VM, NULL);
+	TEST_ASSERT(vm->fd >= 0, "KVM_CREATE_VM ioctl failed, "
+		"rc: %i errno: %i", vm->fd, errno);
+}
+
 /* VM Create
  *
  * Input Args:
@@ -90,16 +102,7 @@ struct kvm_vm *vm_create(enum vm_guest_mode mode, uint64_t phy_pages, int perm)
 	TEST_ASSERT(vm != NULL, "Insufficent Memory");
 
 	vm->mode = mode;
-	kvm_fd = open(KVM_DEV_PATH, perm);
-	if (kvm_fd < 0)
-		exit(KSFT_SKIP);
-
-	/* Create VM. */
-	vm->fd = ioctl(kvm_fd, KVM_CREATE_VM, NULL);
-	TEST_ASSERT(vm->fd >= 0, "KVM_CREATE_VM ioctl failed, "
-		"rc: %i errno: %i", vm->fd, errno);
-
-	close(kvm_fd);
+	vm_open(vm, perm);
 
 
 	/* Setup mode specific traits. */
 	switch (vm->mode) {
@@ -132,6 +135,39 @@ struct kvm_vm *vm_create(enum vm_guest_mode mode, uint64_t phy_pages, int perm)
 	return vm;
 }
 
+/* VM Restart
+ *
+ * Input Args:
+ *   vm - VM that has been released before
+ *   perm - permission
+ *
+ * Output Args: None
+ *
+ * Reopens the file descriptors associated to the VM and reinstates the
+ * global state, such as the irqchip and the memory regions that are mapped
+ * into the guest.
+ */
+void kvm_vm_restart(struct kvm_vm *vmp, int perm)
+{
+	struct userspace_mem_region *region;
+
+	vm_open(vmp, perm);
+	if (vmp->has_irqchip)
+		vm_create_irqchip(vmp);
+
+	for (region = vmp->userspace_mem_region_head; region;
+		region = region->next) {
+		int ret = ioctl(vmp->fd, KVM_SET_USER_MEMORY_REGION, &region->region);
+		TEST_ASSERT(ret == 0, "KVM_SET_USER_MEMORY_REGION IOCTL failed,\n"
+			    "  rc: %i errno: %i\n"
+			    "  slot: %u flags: 0x%x\n"
+			    "  guest_phys_addr: 0x%lx size: 0x%lx",
+			    ret, errno, region->region.slot, region->region.flags,
+			    region->region.guest_phys_addr,
+			    region->region.memory_size);
+	}
+}
+
 /* Userspace Memory Region Find
  *
  * Input Args:
@@ -238,8 +274,12 @@ struct vcpu *vcpu_find(struct kvm_vm *vm,
 static void vm_vcpu_rm(struct kvm_vm *vm, uint32_t vcpuid)
 {
 	struct vcpu *vcpu = vcpu_find(vm, vcpuid);
+	int ret;
 
-	int ret = close(vcpu->fd);
+	ret = munmap(vcpu->state, sizeof(*vcpu->state));
+	TEST_ASSERT(ret == 0, "munmap of VCPU fd failed, rc: %i "
+		"errno: %i", ret, errno);
+	close(vcpu->fd);
 	TEST_ASSERT(ret == 0, "Close of VCPU fd failed, rc: %i "
 		"errno: %i", ret, errno);
 
@@ -252,6 +292,23 @@ static void vm_vcpu_rm(struct kvm_vm *vm, uint32_t vcpuid)
 	free(vcpu);
 }
 
+void kvm_vm_release(struct kvm_vm *vmp)
+{
+	int ret;
+
+	/* Free VCPUs. */
+	while (vmp->vcpu_head)
+		vm_vcpu_rm(vmp, vmp->vcpu_head->id);
+
+	/* Close file descriptor for the VM. */
+	ret = close(vmp->fd);
+	TEST_ASSERT(ret == 0, "Close of vm fd failed,\n"
+		"  vmp->fd: %i rc: %i errno: %i", vmp->fd, ret, errno);
+
+	close(vmp->kvm_fd);
+	TEST_ASSERT(ret == 0, "Close of /dev/kvm fd failed,\n"
+		"  vmp->kvm_fd: %i rc: %i errno: %i", vmp->kvm_fd, ret, errno);
+}
 
 
 /* Destroys and frees the VM pointed to by vmp.
  */
@@ -282,18 +339,11 @@ void kvm_vm_free(struct kvm_vm *vmp)
 		free(region);
 	}
 
-	/* Free VCPUs. */
-	while (vmp->vcpu_head)
-		vm_vcpu_rm(vmp, vmp->vcpu_head->id);
-
 	/* Free sparsebit arrays. */
 	sparsebit_free(&vmp->vpages_valid);
 	sparsebit_free(&vmp->vpages_mapped);
 
-	/* Close file descriptor for the VM. */
-	ret = close(vmp->fd);
-	TEST_ASSERT(ret == 0, "Close of vm fd failed,\n"
-		"  vmp->fd: %i rc: %i errno: %i", vmp->fd, ret, errno);
+	kvm_vm_release(vmp);
 
 
 	/* Free the structure describing the VM. */
 	free(vmp);
@@ -701,7 +751,7 @@ static int vcpu_mmap_sz(void)
  * Creates and adds to the VM specified by vm and virtual CPU with
  * the ID given by vcpuid.
  */
-void vm_vcpu_add(struct kvm_vm *vm, uint32_t vcpuid)
+void vm_vcpu_add(struct kvm_vm *vm, uint32_t vcpuid, int pgd_memslot, int gdt_memslot)
 {
 	struct vcpu *vcpu;
 
@@ -736,7 +786,7 @@ void vm_vcpu_add(struct kvm_vm *vm, uint32_t vcpuid)
 	vcpu->next = vm->vcpu_head;
 	vm->vcpu_head = vcpu;
 
-	vcpu_setup(vm, vcpuid);
+	vcpu_setup(vm, vcpuid, pgd_memslot, gdt_memslot);
 }
 
 /* VM Virtual Address Unused Gap
@@ -957,6 +1007,8 @@ void vm_create_irqchip(struct kvm_vm *vm)
 	ret = ioctl(vm->fd, KVM_CREATE_IRQCHIP, 0);
 	TEST_ASSERT(ret == 0, "KVM_CREATE_IRQCHIP IOCTL failed, "
 		"rc: %i errno: %i", ret, errno);
+
+	vm->has_irqchip = true;
 }
 
 /* VM VCPU State
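
For reference, a minimal sketch of how the new life-cycle helpers are meant to be used (not part of the commit itself; "vm" comes from the calling test): kvm_vm_release() closes the vCPU and VM file descriptors while keeping the struct kvm_vm bookkeeping alive, and kvm_vm_restart() reopens /dev/kvm, re-creates the irqchip if one had been created and replays KVM_SET_USER_MEMORY_REGION for every registered memslot. vCPUs are not recreated automatically and must be added back by the caller:

#include <fcntl.h>
#include "kvm_util.h"

static void recycle_vm_fds(struct kvm_vm *vm, uint32_t vcpuid)
{
	kvm_vm_release(vm);              /* close the vCPU and VM fds        */
	kvm_vm_restart(vm, O_RDWR);      /* reopen fds, irqchip and memslots */
	vm_vcpu_add(vm, vcpuid, 0, 0);   /* vCPUs must be re-added by hand   */
}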

+ 6 - 1
tools/testing/selftests/kvm/lib/kvm_util_internal.h

@@ -43,6 +43,7 @@ struct vcpu {
 
 struct kvm_vm {
 	int mode;
+	int kvm_fd;
 	int fd;
 	unsigned int page_size;
 	unsigned int page_shift;
@@ -51,13 +52,17 @@ struct kvm_vm {
 	struct userspace_mem_region *userspace_mem_region_head;
 	struct sparsebit *vpages_valid;
 	struct sparsebit *vpages_mapped;
+
+	bool has_irqchip;
 	bool pgd_created;
 	vm_paddr_t pgd;
+	vm_vaddr_t gdt;
+	vm_vaddr_t tss;
 };
 
 struct vcpu *vcpu_find(struct kvm_vm *vm,
 	uint32_t vcpuid);
-void vcpu_setup(struct kvm_vm *vm, int vcpuid);
+void vcpu_setup(struct kvm_vm *vm, int vcpuid, int pgd_memslot, int gdt_memslot);
 void virt_dump(FILE *stream, struct kvm_vm *vm, uint8_t indent);
 void regs_dump(FILE *stream, struct kvm_regs *regs,
 	uint8_t indent);

+ 72 - 32
tools/testing/selftests/kvm/lib/vmx.c

@@ -13,47 +13,60 @@
 #include "x86.h"
 #include "vmx.h"
 
-/* Create a default VM for VMX tests.
+/* Allocate memory regions for nested VMX tests.
  *
  * Input Args:
- *   vcpuid - The id of the single VCPU to add to the VM.
- *   guest_code - The vCPU's entry point
+ *   vm - The VM to allocate guest-virtual addresses in.
  *
- * Output Args: None
+ * Output Args:
+ *   p_vmx_gva - The guest virtual address for the struct vmx_pages.
  *
  * Return:
- *   Pointer to opaque structure that describes the created VM.
+ *   Pointer to structure with the addresses of the VMX areas.
  */
-struct kvm_vm *
-vm_create_default_vmx(uint32_t vcpuid, vmx_guest_code_t guest_code)
+struct vmx_pages *
+vcpu_alloc_vmx(struct kvm_vm *vm, vm_vaddr_t *p_vmx_gva)
 {
-	struct kvm_cpuid2 *cpuid;
-	struct kvm_vm *vm;
-	vm_vaddr_t vmxon_vaddr;
-	vm_paddr_t vmxon_paddr;
-	vm_vaddr_t vmcs_vaddr;
-	vm_paddr_t vmcs_paddr;
-
-	vm = vm_create_default(vcpuid, (void *) guest_code);
-
-	/* Enable nesting in CPUID */
-	vcpu_set_cpuid(vm, vcpuid, kvm_get_supported_cpuid());
+	vm_vaddr_t vmx_gva = vm_vaddr_alloc(vm, getpagesize(), 0x10000, 0, 0);
+	struct vmx_pages *vmx = addr_gva2hva(vm, vmx_gva);
 
 
 	/* Setup of a region of guest memory for the vmxon region. */
-	vmxon_vaddr = vm_vaddr_alloc(vm, getpagesize(), 0, 0, 0);
-	vmxon_paddr = addr_gva2gpa(vm, vmxon_vaddr);
+	vmx->vmxon = (void *)vm_vaddr_alloc(vm, getpagesize(), 0x10000, 0, 0);
+	vmx->vmxon_hva = addr_gva2hva(vm, (uintptr_t)vmx->vmxon);
+	vmx->vmxon_gpa = addr_gva2gpa(vm, (uintptr_t)vmx->vmxon);
 
 
 	/* Setup of a region of guest memory for a vmcs. */
-	vmcs_vaddr = vm_vaddr_alloc(vm, getpagesize(), 0, 0, 0);
-	vmcs_paddr = addr_gva2gpa(vm, vmcs_vaddr);
+	vmx->vmcs = (void *)vm_vaddr_alloc(vm, getpagesize(), 0x10000, 0, 0);
+	vmx->vmcs_hva = addr_gva2hva(vm, (uintptr_t)vmx->vmcs);
+	vmx->vmcs_gpa = addr_gva2gpa(vm, (uintptr_t)vmx->vmcs);
+
+	/* Setup of a region of guest memory for the MSR bitmap. */
+	vmx->msr = (void *)vm_vaddr_alloc(vm, getpagesize(), 0x10000, 0, 0);
+	vmx->msr_hva = addr_gva2hva(vm, (uintptr_t)vmx->msr);
+	vmx->msr_gpa = addr_gva2gpa(vm, (uintptr_t)vmx->msr);
+	memset(vmx->msr_hva, 0, getpagesize());
 
 
-	vcpu_args_set(vm, vcpuid, 4, vmxon_vaddr, vmxon_paddr, vmcs_vaddr,
-		      vmcs_paddr);
+	/* Setup of a region of guest memory for the shadow VMCS. */
+	vmx->shadow_vmcs = (void *)vm_vaddr_alloc(vm, getpagesize(), 0x10000, 0, 0);
+	vmx->shadow_vmcs_hva = addr_gva2hva(vm, (uintptr_t)vmx->shadow_vmcs);
+	vmx->shadow_vmcs_gpa = addr_gva2gpa(vm, (uintptr_t)vmx->shadow_vmcs);
 
 
-	return vm;
+	/* Setup of a region of guest memory for the VMREAD and VMWRITE bitmaps. */
+	vmx->vmread = (void *)vm_vaddr_alloc(vm, getpagesize(), 0x10000, 0, 0);
+	vmx->vmread_hva = addr_gva2hva(vm, (uintptr_t)vmx->vmread);
+	vmx->vmread_gpa = addr_gva2gpa(vm, (uintptr_t)vmx->vmread);
+	memset(vmx->vmread_hva, 0, getpagesize());
+
+	vmx->vmwrite = (void *)vm_vaddr_alloc(vm, getpagesize(), 0x10000, 0, 0);
+	vmx->vmwrite_hva = addr_gva2hva(vm, (uintptr_t)vmx->vmwrite);
+	vmx->vmwrite_gpa = addr_gva2gpa(vm, (uintptr_t)vmx->vmwrite);
+	memset(vmx->vmwrite_hva, 0, getpagesize());
+
+	*p_vmx_gva = vmx_gva;
+	return vmx;
 }
 
-void prepare_for_vmx_operation(void)
+bool prepare_for_vmx_operation(struct vmx_pages *vmx)
 {
 	uint64_t feature_control;
 	uint64_t required;
@@ -88,18 +101,42 @@ void prepare_for_vmx_operation(void)
 	feature_control = rdmsr(MSR_IA32_FEATURE_CONTROL);
 	if ((feature_control & required) != required)
 		wrmsr(MSR_IA32_FEATURE_CONTROL, feature_control | required);
+
+	/* Enter VMX root operation. */
+	*(uint32_t *)(vmx->vmxon) = vmcs_revision();
+	if (vmxon(vmx->vmxon_gpa))
+		return false;
+
+	/* Load a VMCS. */
+	*(uint32_t *)(vmx->vmcs) = vmcs_revision();
+	if (vmclear(vmx->vmcs_gpa))
+		return false;
+
+	if (vmptrld(vmx->vmcs_gpa))
+		return false;
+
+	/* Setup shadow VMCS, do not load it yet. */
+	*(uint32_t *)(vmx->shadow_vmcs) = vmcs_revision() | 0x80000000ul;
+	if (vmclear(vmx->shadow_vmcs_gpa))
+		return false;
+
+	return true;
 }
 
 /*
  * Initialize the control fields to the most basic settings possible.
  */
-static inline void init_vmcs_control_fields(void)
+static inline void init_vmcs_control_fields(struct vmx_pages *vmx)
 {
 	vmwrite(VIRTUAL_PROCESSOR_ID, 0);
 	vmwrite(POSTED_INTR_NV, 0);
 
-	vmwrite(PIN_BASED_VM_EXEC_CONTROL, rdmsr(MSR_IA32_VMX_PINBASED_CTLS));
-	vmwrite(CPU_BASED_VM_EXEC_CONTROL, rdmsr(MSR_IA32_VMX_PROCBASED_CTLS));
+	vmwrite(PIN_BASED_VM_EXEC_CONTROL, rdmsr(MSR_IA32_VMX_TRUE_PINBASED_CTLS));
+	if (!vmwrite(SECONDARY_VM_EXEC_CONTROL, 0))
+		vmwrite(CPU_BASED_VM_EXEC_CONTROL,
+			rdmsr(MSR_IA32_VMX_TRUE_PROCBASED_CTLS) | CPU_BASED_ACTIVATE_SECONDARY_CONTROLS);
+	else
+		vmwrite(CPU_BASED_VM_EXEC_CONTROL, rdmsr(MSR_IA32_VMX_TRUE_PROCBASED_CTLS));
 	vmwrite(EXCEPTION_BITMAP, 0);
 	vmwrite(PAGE_FAULT_ERROR_CODE_MASK, 0);
 	vmwrite(PAGE_FAULT_ERROR_CODE_MATCH, -1); /* Never match */
@@ -113,12 +150,15 @@ static inline void init_vmcs_control_fields(void)
 	vmwrite(VM_ENTRY_MSR_LOAD_COUNT, 0);
 	vmwrite(VM_ENTRY_INTR_INFO_FIELD, 0);
 	vmwrite(TPR_THRESHOLD, 0);
-	vmwrite(SECONDARY_VM_EXEC_CONTROL, 0);
 
 	vmwrite(CR0_GUEST_HOST_MASK, 0);
 	vmwrite(CR4_GUEST_HOST_MASK, 0);
 	vmwrite(CR0_READ_SHADOW, get_cr0());
 	vmwrite(CR4_READ_SHADOW, get_cr4());
+
+	vmwrite(MSR_BITMAP, vmx->msr_gpa);
+	vmwrite(VMREAD_BITMAP, vmx->vmread_gpa);
+	vmwrite(VMWRITE_BITMAP, vmx->vmwrite_gpa);
 }
 
 /*
@@ -235,9 +275,9 @@ static inline void init_vmcs_guest_state(void *rip, void *rsp)
 	vmwrite(GUEST_SYSENTER_EIP, vmreadz(HOST_IA32_SYSENTER_EIP));
 }
 
-void prepare_vmcs(void *guest_rip, void *guest_rsp)
+void prepare_vmcs(struct vmx_pages *vmx, void *guest_rip, void *guest_rsp)
 {
-	init_vmcs_control_fields();
+	init_vmcs_control_fields(vmx);
 	init_vmcs_host_state();
 	init_vmcs_guest_state(guest_rip, guest_rsp);
 }
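
With the shared struct vmx_pages, a nested test now allocates its VMX pages on the host with vcpu_alloc_vmx() and passes the returned guest virtual address to L1 via vcpu_args_set(); the guest then needs only two calls before launching L2. A sketch of the L1 flow (mirroring the updated callers below; GUEST_ASSERT and l2_guest_code are the individual test's own exit-to-L0 assert macro and L2 entry point):

void l1_guest_code(struct vmx_pages *vmx_pages)
{
#define L2_GUEST_STACK_SIZE 64
	unsigned long l2_guest_stack[L2_GUEST_STACK_SIZE];

	/* VMXON, VMCLEAR+VMPTRLD of the vmcs, VMCLEAR of the shadow vmcs. */
	GUEST_ASSERT(prepare_for_vmx_operation(vmx_pages));

	/* Control/host/guest fields; also points MSR_BITMAP and the
	 * VMREAD/VMWRITE bitmaps at the pages allocated above. */
	prepare_vmcs(vmx_pages, l2_guest_code,
		     &l2_guest_stack[L2_GUEST_STACK_SIZE]);

	GUEST_ASSERT(!vmlaunch());
	GUEST_ASSERT(vmreadz(VM_EXIT_REASON) == EXIT_REASON_VMCALL);
}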

+ 215 - 41
tools/testing/selftests/kvm/lib/x86.c

@@ -239,25 +239,6 @@ void virt_pgd_alloc(struct kvm_vm *vm, uint32_t pgd_memslot)
 		vm_paddr_t paddr = vm_phy_page_alloc(vm,
 			KVM_GUEST_PAGE_TABLE_MIN_PADDR, pgd_memslot);
 		vm->pgd = paddr;
-
-		/* Set pointer to pgd tables in all the VCPUs that
-		 * have already been created.  Future VCPUs will have
-		 * the value set as each one is created.
-		 */
-		for (struct vcpu *vcpu = vm->vcpu_head; vcpu;
-			vcpu = vcpu->next) {
-			struct kvm_sregs sregs;
-
-			/* Obtain the current system register settings */
-			vcpu_sregs_get(vm, vcpu->id, &sregs);
-
-			/* Set and store the pointer to the start of the
-			 * pgd tables.
-			 */
-			sregs.cr3 = vm->pgd;
-			vcpu_sregs_set(vm, vcpu->id, &sregs);
-		}
-
 		vm->pgd_created = true;
 	}
 }
@@ -460,9 +441,32 @@ static void kvm_seg_set_unusable(struct kvm_segment *segp)
 	segp->unusable = true;
 }
 
+static void kvm_seg_fill_gdt_64bit(struct kvm_vm *vm, struct kvm_segment *segp)
+{
+	void *gdt = addr_gva2hva(vm, vm->gdt);
+	struct desc64 *desc = gdt + (segp->selector >> 3) * 8;
+
+	desc->limit0 = segp->limit & 0xFFFF;
+	desc->base0 = segp->base & 0xFFFF;
+	desc->base1 = segp->base >> 16;
+	desc->s = segp->s;
+	desc->type = segp->type;
+	desc->dpl = segp->dpl;
+	desc->p = segp->present;
+	desc->limit1 = segp->limit >> 16;
+	desc->l = segp->l;
+	desc->db = segp->db;
+	desc->g = segp->g;
+	desc->base2 = segp->base >> 24;
+	if (!segp->s)
+		desc->base3 = segp->base >> 32;
+}
+
+
 /* Set Long Mode Flat Kernel Code Segment
  *
  * Input Args:
+ *   vm - VM whose GDT is being filled, or NULL to only write segp
  *   selector - selector value
  *
  * Output Args:
@@ -473,7 +477,7 @@ static void kvm_seg_set_unusable(struct kvm_segment *segp)
  * Sets up the KVM segment pointed to by segp, to be a code segment
  * with the selector value given by selector.
  */
-static void kvm_seg_set_kernel_code_64bit(uint16_t selector,
+static void kvm_seg_set_kernel_code_64bit(struct kvm_vm *vm, uint16_t selector,
 	struct kvm_segment *segp)
 {
 	memset(segp, 0, sizeof(*segp));
@@ -486,11 +490,14 @@ static void kvm_seg_set_kernel_code_64bit(uint16_t selector,
 	segp->g = true;
 	segp->l = true;
 	segp->present = 1;
+	if (vm)
+		kvm_seg_fill_gdt_64bit(vm, segp);
 }
 
 /* Set Long Mode Flat Kernel Data Segment
  *
  * Input Args:
+ *   vm - VM whose GDT is being filled, or NULL to only write segp
  *   selector - selector value
  *
  * Output Args:
@@ -501,7 +508,7 @@ static void kvm_seg_set_kernel_code_64bit(uint16_t selector,
  * Sets up the KVM segment pointed to by segp, to be a data segment
  * with the selector value given by selector.
  */
-static void kvm_seg_set_kernel_data_64bit(uint16_t selector,
+static void kvm_seg_set_kernel_data_64bit(struct kvm_vm *vm, uint16_t selector,
 	struct kvm_segment *segp)
 {
 	memset(segp, 0, sizeof(*segp));
@@ -513,6 +520,8 @@ static void kvm_seg_set_kernel_data_64bit(uint16_t selector,
 					  */
 	segp->g = true;
 	segp->present = true;
+	if (vm)
+		kvm_seg_fill_gdt_64bit(vm, segp);
 }
 
 /* Address Guest Virtual to Guest Physical
@@ -575,13 +584,45 @@ unmapped_gva:
 		    "gva: 0x%lx", gva);
 }
 
-void vcpu_setup(struct kvm_vm *vm, int vcpuid)
+static void kvm_setup_gdt(struct kvm_vm *vm, struct kvm_dtable *dt, int gdt_memslot,
+			  int pgd_memslot)
+{
+	if (!vm->gdt)
+		vm->gdt = vm_vaddr_alloc(vm, getpagesize(),
+			KVM_UTIL_MIN_VADDR, gdt_memslot, pgd_memslot);
+
+	dt->base = vm->gdt;
+	dt->limit = getpagesize();
+}
+
+static void kvm_setup_tss_64bit(struct kvm_vm *vm, struct kvm_segment *segp,
+				int selector, int gdt_memslot,
+				int pgd_memslot)
+{
+	if (!vm->tss)
+		vm->tss = vm_vaddr_alloc(vm, getpagesize(),
+			KVM_UTIL_MIN_VADDR, gdt_memslot, pgd_memslot);
+
+	memset(segp, 0, sizeof(*segp));
+	segp->base = vm->tss;
+	segp->limit = 0x67;
+	segp->selector = selector;
+	segp->type = 0xb;
+	segp->present = 1;
+	kvm_seg_fill_gdt_64bit(vm, segp);
+}
+
+void vcpu_setup(struct kvm_vm *vm, int vcpuid, int pgd_memslot, int gdt_memslot)
 {
 	struct kvm_sregs sregs;
 
 	/* Set mode specific system register values. */
 	vcpu_sregs_get(vm, vcpuid, &sregs);
 
+	sregs.idt.limit = 0;
+
+	kvm_setup_gdt(vm, &sregs.gdt, gdt_memslot, pgd_memslot);
+
 	switch (vm->mode) {
 	case VM_MODE_FLAT48PG:
 		sregs.cr0 = X86_CR0_PE | X86_CR0_NE | X86_CR0_PG;
@@ -589,30 +630,18 @@ void vcpu_setup(struct kvm_vm *vm, int vcpuid)
 		sregs.efer |= (EFER_LME | EFER_LMA | EFER_NX);
 
 		kvm_seg_set_unusable(&sregs.ldt);
-		kvm_seg_set_kernel_code_64bit(0x8, &sregs.cs);
-		kvm_seg_set_kernel_data_64bit(0x10, &sregs.ds);
-		kvm_seg_set_kernel_data_64bit(0x10, &sregs.es);
+		kvm_seg_set_kernel_code_64bit(vm, 0x8, &sregs.cs);
+		kvm_seg_set_kernel_data_64bit(vm, 0x10, &sregs.ds);
+		kvm_seg_set_kernel_data_64bit(vm, 0x10, &sregs.es);
+		kvm_setup_tss_64bit(vm, &sregs.tr, 0x18, gdt_memslot, pgd_memslot);
 		break;
 
 	default:
 		TEST_ASSERT(false, "Unknown guest mode, mode: 0x%x", vm->mode);
 	}
-	vcpu_sregs_set(vm, vcpuid, &sregs);
 
 
-	/* If virtual translation table have been setup, set system register
-	 * to point to the tables.  It's okay if they haven't been setup yet,
-	 * in that the code that sets up the virtual translation tables, will
-	 * go back through any VCPUs that have already been created and set
-	 * their values.
-	 */
-	if (vm->pgd_created) {
-		struct kvm_sregs sregs;
-
-		vcpu_sregs_get(vm, vcpuid, &sregs);
-
-		sregs.cr3 = vm->pgd;
-		vcpu_sregs_set(vm, vcpuid, &sregs);
-	}
+	sregs.cr3 = vm->pgd;
+	vcpu_sregs_set(vm, vcpuid, &sregs);
 }
 /* Adds a vCPU with reasonable defaults (i.e., a stack)
  *
@@ -629,7 +658,7 @@ void vm_vcpu_add_default(struct kvm_vm *vm, uint32_t vcpuid, void *guest_code)
 				     DEFAULT_GUEST_STACK_VADDR_MIN, 0, 0);
 
 	/* Create VCPU */
-	vm_vcpu_add(vm, vcpuid);
+	vm_vcpu_add(vm, vcpuid, 0, 0);
 
 
 	/* Setup guest general purpose registers */
 	vcpu_regs_get(vm, vcpuid, &regs);
@@ -698,3 +727,148 @@ struct kvm_vm *vm_create_default(uint32_t vcpuid, void *guest_code)
 
 
 	return vm;
 }
+
+struct kvm_x86_state {
+	struct kvm_vcpu_events events;
+	struct kvm_mp_state mp_state;
+	struct kvm_regs regs;
+	struct kvm_xsave xsave;
+	struct kvm_xcrs xcrs;
+	struct kvm_sregs sregs;
+	struct kvm_debugregs debugregs;
+	union {
+		struct kvm_nested_state nested;
+		char nested_[16384];
+	};
+	struct kvm_msrs msrs;
+};
+
+static int kvm_get_num_msrs(struct kvm_vm *vm)
+{
+	struct kvm_msr_list nmsrs;
+	int r;
+
+	nmsrs.nmsrs = 0;
+	r = ioctl(vm->kvm_fd, KVM_GET_MSR_INDEX_LIST, &nmsrs);
+	TEST_ASSERT(r == -1 && errno == E2BIG, "Unexpected result from KVM_GET_MSR_INDEX_LIST probe, r: %i",
+		r);
+
+	return nmsrs.nmsrs;
+}
+
+struct kvm_x86_state *vcpu_save_state(struct kvm_vm *vm, uint32_t vcpuid)
+{
+	struct vcpu *vcpu = vcpu_find(vm, vcpuid);
+	struct kvm_msr_list *list;
+	struct kvm_x86_state *state;
+	int nmsrs, r, i;
+	static int nested_size = -1;
+
+	if (nested_size == -1) {
+		nested_size = kvm_check_cap(KVM_CAP_NESTED_STATE);
+		TEST_ASSERT(nested_size <= sizeof(state->nested_),
+			    "Nested state size too big, %i > %zi",
+			    nested_size, sizeof(state->nested_));
+	}
+
+	nmsrs = kvm_get_num_msrs(vm);
+	list = malloc(sizeof(*list) + nmsrs * sizeof(list->indices[0]));
+	list->nmsrs = nmsrs;
+	r = ioctl(vm->kvm_fd, KVM_GET_MSR_INDEX_LIST, list);
+        TEST_ASSERT(r == 0, "Unexpected result from KVM_GET_MSR_INDEX_LIST, r: %i",
+                r);
+
+	state = malloc(sizeof(*state) + nmsrs * sizeof(state->msrs.entries[0]));
+	r = ioctl(vcpu->fd, KVM_GET_VCPU_EVENTS, &state->events);
+        TEST_ASSERT(r == 0, "Unexpected result from KVM_GET_VCPU_EVENTS, r: %i",
+                r);
+
+	r = ioctl(vcpu->fd, KVM_GET_MP_STATE, &state->mp_state);
+        TEST_ASSERT(r == 0, "Unexpected result from KVM_GET_MP_STATE, r: %i",
+                r);
+
+	r = ioctl(vcpu->fd, KVM_GET_REGS, &state->regs);
+        TEST_ASSERT(r == 0, "Unexpected result from KVM_GET_REGS, r: %i",
+                r);
+
+	r = ioctl(vcpu->fd, KVM_GET_XSAVE, &state->xsave);
+        TEST_ASSERT(r == 0, "Unexpected result from KVM_GET_XSAVE, r: %i",
+                r);
+
+	r = ioctl(vcpu->fd, KVM_GET_XCRS, &state->xcrs);
+        TEST_ASSERT(r == 0, "Unexpected result from KVM_GET_XCRS, r: %i",
+                r);
+
+	r = ioctl(vcpu->fd, KVM_GET_SREGS, &state->sregs);
+        TEST_ASSERT(r == 0, "Unexpected result from KVM_GET_SREGS, r: %i",
+                r);
+
+	if (nested_size) {
+		state->nested.size = sizeof(state->nested_);
+		r = ioctl(vcpu->fd, KVM_GET_NESTED_STATE, &state->nested);
+		TEST_ASSERT(r == 0, "Unexpected result from KVM_GET_NESTED_STATE, r: %i",
+			r);
+		TEST_ASSERT(state->nested.size <= nested_size,
+			"Nested state size too big, %i (KVM_CHECK_CAP gave %i)",
+			state->nested.size, nested_size);
+	} else
+		state->nested.size = 0;
+
+	state->msrs.nmsrs = nmsrs;
+	for (i = 0; i < nmsrs; i++)
+		state->msrs.entries[i].index = list->indices[i];
+	r = ioctl(vcpu->fd, KVM_GET_MSRS, &state->msrs);
+        TEST_ASSERT(r == nmsrs, "Unexpected result from KVM_GET_MSRS, r: %i (failed at %x)",
+                r, r == nmsrs ? -1 : list->indices[r]);
+
+	r = ioctl(vcpu->fd, KVM_GET_DEBUGREGS, &state->debugregs);
+        TEST_ASSERT(r == 0, "Unexpected result from KVM_GET_DEBUGREGS, r: %i",
+                r);
+
+	free(list);
+	return state;
+}
+
+void vcpu_load_state(struct kvm_vm *vm, uint32_t vcpuid, struct kvm_x86_state *state)
+{
+	struct vcpu *vcpu = vcpu_find(vm, vcpuid);
+	int r;
+
+	if (state->nested.size) {
+		r = ioctl(vcpu->fd, KVM_SET_NESTED_STATE, &state->nested);
+		TEST_ASSERT(r == 0, "Unexpected result from KVM_SET_NESTED_STATE, r: %i",
+			r);
+	}
+
+	r = ioctl(vcpu->fd, KVM_SET_XSAVE, &state->xsave);
+        TEST_ASSERT(r == 0, "Unexpected result from KVM_SET_XSAVE, r: %i",
+                r);
+
+	r = ioctl(vcpu->fd, KVM_SET_XCRS, &state->xcrs);
+        TEST_ASSERT(r == 0, "Unexpected result from KVM_SET_XCRS, r: %i",
+                r);
+
+	r = ioctl(vcpu->fd, KVM_SET_SREGS, &state->sregs);
+        TEST_ASSERT(r == 0, "Unexpected result from KVM_SET_SREGS, r: %i",
+                r);
+
+	r = ioctl(vcpu->fd, KVM_SET_MSRS, &state->msrs);
+        TEST_ASSERT(r == state->msrs.nmsrs, "Unexpected result from KVM_SET_MSRS, r: %i (failed at %x)",
+                r, r == state->msrs.nmsrs ? -1 : state->msrs.entries[r].index);
+
+	r = ioctl(vcpu->fd, KVM_SET_VCPU_EVENTS, &state->events);
+        TEST_ASSERT(r == 0, "Unexpected result from KVM_SET_VCPU_EVENTS, r: %i",
+                r);
+
+	r = ioctl(vcpu->fd, KVM_SET_MP_STATE, &state->mp_state);
+        TEST_ASSERT(r == 0, "Unexpected result from KVM_SET_MP_STATE, r: %i",
+                r);
+
+	r = ioctl(vcpu->fd, KVM_SET_DEBUGREGS, &state->debugregs);
+        TEST_ASSERT(r == 0, "Unexpected result from KVM_SET_DEBUGREGS, r: %i",
+                r);
+
+	r = ioctl(vcpu->fd, KVM_SET_REGS, &state->regs);
+        TEST_ASSERT(r == 0, "Unexpected result from KVM_SET_REGS, r: %i",
+                r);
+}
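
Together with the kvm_util.c helpers above, this gives the save/restore round trip that the new state_test.c below drives (sketch only; CPUID is re-applied to the fresh vCPU before loading, and vcpu_load_state() restores nested state first and general-purpose registers last):

	struct kvm_x86_state *state = vcpu_save_state(vm, VCPU_ID);

	kvm_vm_release(vm);                   /* drop the old VM and vCPU fds */
	kvm_vm_restart(vm, O_RDWR);           /* reopen, replay memslots      */
	vm_vcpu_add(vm, VCPU_ID, 0, 0);
	vcpu_set_cpuid(vm, VCPU_ID, kvm_get_supported_cpuid());
	vcpu_load_state(vm, VCPU_ID, state);  /* nested first, GP regs last   */
	free(state);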

+ 218 - 0
tools/testing/selftests/kvm/state_test.c

@@ -0,0 +1,218 @@
+/*
+ * KVM_GET/SET_* tests
+ *
+ * Copyright (C) 2018, Red Hat, Inc.
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2.
+ *
+ * Tests for vCPU state save/restore, including nested guest state.
+ */
+#define _GNU_SOURCE /* for program_invocation_short_name */
+#include <fcntl.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/ioctl.h>
+
+#include "test_util.h"
+
+#include "kvm_util.h"
+#include "x86.h"
+#include "vmx.h"
+
+#define VCPU_ID		5
+#define PORT_SYNC	0x1000
+#define PORT_ABORT	0x1001
+#define PORT_DONE	0x1002
+
+static inline void __exit_to_l0(uint16_t port, uint64_t arg0, uint64_t arg1)
+{
+	__asm__ __volatile__("in %[port], %%al"
+			     :
+			     : [port]"d"(port), "D"(arg0), "S"(arg1)
+			     : "rax");
+}
+
+#define exit_to_l0(_port, _arg0, _arg1) \
+	__exit_to_l0(_port, (uint64_t) (_arg0), (uint64_t) (_arg1))
+
+#define GUEST_ASSERT(_condition) do { \
+	if (!(_condition)) \
+		exit_to_l0(PORT_ABORT, "Failed guest assert: " #_condition, __LINE__);\
+} while (0)
+
+#define GUEST_SYNC(stage) \
+	exit_to_l0(PORT_SYNC, "hello", stage);
+
+static bool have_nested_state;
+
+void l2_guest_code(void)
+{
+	GUEST_SYNC(5);
+
+        /* Exit to L1 */
+	vmcall();
+
+	/* L1 has now set up a shadow VMCS for us.  */
+	GUEST_ASSERT(vmreadz(GUEST_RIP) == 0xc0ffee);
+	GUEST_SYNC(9);
+	GUEST_ASSERT(vmreadz(GUEST_RIP) == 0xc0ffee);
+	GUEST_ASSERT(!vmwrite(GUEST_RIP, 0xc0fffee));
+	GUEST_SYNC(10);
+	GUEST_ASSERT(vmreadz(GUEST_RIP) == 0xc0fffee);
+	GUEST_ASSERT(!vmwrite(GUEST_RIP, 0xc0ffffee));
+	GUEST_SYNC(11);
+
+	/* Done, exit to L1 and never come back.  */
+	vmcall();
+}
+
+void l1_guest_code(struct vmx_pages *vmx_pages)
+{
+#define L2_GUEST_STACK_SIZE 64
+        unsigned long l2_guest_stack[L2_GUEST_STACK_SIZE];
+
+	GUEST_ASSERT(vmx_pages->vmcs_gpa);
+	GUEST_ASSERT(prepare_for_vmx_operation(vmx_pages));
+	GUEST_ASSERT(vmptrstz() == vmx_pages->vmcs_gpa);
+
+	GUEST_SYNC(3);
+	GUEST_ASSERT(vmptrstz() == vmx_pages->vmcs_gpa);
+
+	prepare_vmcs(vmx_pages, l2_guest_code,
+		     &l2_guest_stack[L2_GUEST_STACK_SIZE]);
+
+	GUEST_SYNC(4);
+	GUEST_ASSERT(vmptrstz() == vmx_pages->vmcs_gpa);
+	GUEST_ASSERT(!vmlaunch());
+	GUEST_ASSERT(vmptrstz() == vmx_pages->vmcs_gpa);
+	GUEST_ASSERT(vmreadz(VM_EXIT_REASON) == EXIT_REASON_VMCALL);
+
+	/* Check that the launched state is preserved.  */
+	GUEST_ASSERT(vmlaunch());
+
+	GUEST_ASSERT(!vmresume());
+	GUEST_ASSERT(vmreadz(VM_EXIT_REASON) == EXIT_REASON_VMCALL);
+
+	GUEST_SYNC(6);
+	GUEST_ASSERT(vmreadz(VM_EXIT_REASON) == EXIT_REASON_VMCALL);
+
+	GUEST_ASSERT(!vmresume());
+	GUEST_ASSERT(vmreadz(VM_EXIT_REASON) == EXIT_REASON_VMCALL);
+
+	vmwrite(GUEST_RIP, vmreadz(GUEST_RIP) + 3);
+
+	vmwrite(SECONDARY_VM_EXEC_CONTROL, SECONDARY_EXEC_SHADOW_VMCS);
+	vmwrite(VMCS_LINK_POINTER, vmx_pages->shadow_vmcs_gpa);
+
+	GUEST_ASSERT(!vmptrld(vmx_pages->shadow_vmcs_gpa));
+	GUEST_ASSERT(vmlaunch());
+	GUEST_SYNC(7);
+	GUEST_ASSERT(vmlaunch());
+	GUEST_ASSERT(vmresume());
+
+	vmwrite(GUEST_RIP, 0xc0ffee);
+	GUEST_SYNC(8);
+	GUEST_ASSERT(vmreadz(GUEST_RIP) == 0xc0ffee);
+
+	GUEST_ASSERT(!vmptrld(vmx_pages->vmcs_gpa));
+	GUEST_ASSERT(!vmresume());
+	GUEST_ASSERT(vmreadz(VM_EXIT_REASON) == EXIT_REASON_VMCALL);
+
+	GUEST_ASSERT(!vmptrld(vmx_pages->shadow_vmcs_gpa));
+	GUEST_ASSERT(vmreadz(GUEST_RIP) == 0xc0ffffee);
+	GUEST_ASSERT(vmlaunch());
+	GUEST_ASSERT(vmresume());
+	GUEST_SYNC(12);
+	GUEST_ASSERT(vmreadz(GUEST_RIP) == 0xc0ffffee);
+	GUEST_ASSERT(vmlaunch());
+	GUEST_ASSERT(vmresume());
+}
+
+void guest_code(struct vmx_pages *vmx_pages)
+{
+	GUEST_SYNC(1);
+	GUEST_SYNC(2);
+
+	if (vmx_pages)
+		l1_guest_code(vmx_pages);
+
+	exit_to_l0(PORT_DONE, 0, 0);
+}
+
+int main(int argc, char *argv[])
+{
+	struct vmx_pages *vmx_pages = NULL;
+	vm_vaddr_t vmx_pages_gva = 0;
+
+	struct kvm_regs regs1, regs2;
+	struct kvm_vm *vm;
+	struct kvm_run *run;
+	struct kvm_x86_state *state;
+	int stage;
+
+	struct kvm_cpuid_entry2 *entry = kvm_get_supported_cpuid_entry(1);
+
+	/* Create VM */
+	vm = vm_create_default(VCPU_ID, guest_code);
+	vcpu_set_cpuid(vm, VCPU_ID, kvm_get_supported_cpuid());
+	run = vcpu_state(vm, VCPU_ID);
+
+	vcpu_regs_get(vm, VCPU_ID, &regs1);
+
+	if (kvm_check_cap(KVM_CAP_NESTED_STATE)) {
+		vmx_pages = vcpu_alloc_vmx(vm, &vmx_pages_gva);
+		vcpu_args_set(vm, VCPU_ID, 1, vmx_pages_gva);
+	} else {
+		printf("will skip nested state checks\n");
+		vcpu_args_set(vm, VCPU_ID, 1, 0);
+	}
+
+	for (stage = 1;; stage++) {
+		_vcpu_run(vm, VCPU_ID);
+		TEST_ASSERT(run->exit_reason == KVM_EXIT_IO,
+			    "Unexpected exit reason: %u (%s),\n",
+			    run->exit_reason,
+			    exit_reason_str(run->exit_reason));
+
+		memset(&regs1, 0, sizeof(regs1));
+		vcpu_regs_get(vm, VCPU_ID, &regs1);
+		switch (run->io.port) {
+		case PORT_ABORT:
+			TEST_ASSERT(false, "%s at %s:%d", (const char *) regs1.rdi,
+				    __FILE__, regs1.rsi);
+			/* NOT REACHED */
+		case PORT_SYNC:
+			break;
+		case PORT_DONE:
+			goto done;
+		default:
+			TEST_ASSERT(false, "Unknown port 0x%x.", run->io.port);
+		}
+
+		/* PORT_SYNC is handled here.  */
+		TEST_ASSERT(!strcmp((const char *)regs1.rdi, "hello") &&
+			    regs1.rsi == stage, "Unexpected register values vmexit #%lx, got %lx",
+			    stage, (ulong) regs1.rsi);
+
+		state = vcpu_save_state(vm, VCPU_ID);
+		kvm_vm_release(vm);
+
+		/* Restore state in a new VM.  */
+		kvm_vm_restart(vm, O_RDWR);
+		vm_vcpu_add(vm, VCPU_ID, 0, 0);
+		vcpu_set_cpuid(vm, VCPU_ID, kvm_get_supported_cpuid());
+		vcpu_load_state(vm, VCPU_ID, state);
+		run = vcpu_state(vm, VCPU_ID);
+		free(state);
+
+		memset(&regs2, 0, sizeof(regs2));
+		vcpu_regs_get(vm, VCPU_ID, &regs2);
+		TEST_ASSERT(!memcmp(&regs1, &regs2, sizeof(regs2)),
+			    "Unexpected register values after vcpu_load_state; rdi: %lx rsi: %lx",
+			    (ulong) regs2.rdi, (ulong) regs2.rsi);
+	}
+
+done:
+	kvm_vm_free(vm);
+}

+ 13 - 56
tools/testing/selftests/kvm/vmx_tsc_adjust_test.c

@@ -46,11 +46,6 @@ enum {
 	PORT_DONE,
 };
 
-struct vmx_page {
-	vm_vaddr_t virt;
-	vm_paddr_t phys;
-};
-
 enum {
 	VMXON_PAGE = 0,
 	VMCS_PAGE,
@@ -67,9 +62,6 @@ struct kvm_single_msr {
 /* The virtual machine object. */
 static struct kvm_vm *vm;
 
-/* Array of vmx_page descriptors that is shared with the guest. */
-struct vmx_page *vmx_pages;
-
 #define exit_to_l0(_port, _arg) do_exit_to_l0(_port, (unsigned long) (_arg))
 static void do_exit_to_l0(uint16_t port, unsigned long arg)
 {
@@ -105,7 +97,7 @@ static void l2_guest_code(void)
 	__asm__ __volatile__("vmcall");
 }
 
-static void l1_guest_code(struct vmx_page *vmx_pages)
+static void l1_guest_code(struct vmx_pages *vmx_pages)
 {
 #define L2_GUEST_STACK_SIZE 64
 	unsigned long l2_guest_stack[L2_GUEST_STACK_SIZE];
@@ -116,23 +108,14 @@ static void l1_guest_code(struct vmx_page *vmx_pages)
 	wrmsr(MSR_IA32_TSC, rdtsc() - TSC_ADJUST_VALUE);
 	check_ia32_tsc_adjust(-1 * TSC_ADJUST_VALUE);
 
-	prepare_for_vmx_operation();
-
-	/* Enter VMX root operation. */
-	*(uint32_t *)vmx_pages[VMXON_PAGE].virt = vmcs_revision();
-	GUEST_ASSERT(!vmxon(vmx_pages[VMXON_PAGE].phys));
-
-	/* Load a VMCS. */
-	*(uint32_t *)vmx_pages[VMCS_PAGE].virt = vmcs_revision();
-	GUEST_ASSERT(!vmclear(vmx_pages[VMCS_PAGE].phys));
-	GUEST_ASSERT(!vmptrld(vmx_pages[VMCS_PAGE].phys));
+	GUEST_ASSERT(prepare_for_vmx_operation(vmx_pages));
 
 
 	/* Prepare the VMCS for L2 execution. */
-	prepare_vmcs(l2_guest_code, &l2_guest_stack[L2_GUEST_STACK_SIZE]);
+	prepare_vmcs(vmx_pages, l2_guest_code,
+		     &l2_guest_stack[L2_GUEST_STACK_SIZE]);
 	control = vmreadz(CPU_BASED_VM_EXEC_CONTROL);
 	control |= CPU_BASED_USE_MSR_BITMAPS | CPU_BASED_USE_TSC_OFFSETING;
 	vmwrite(CPU_BASED_VM_EXEC_CONTROL, control);
-	vmwrite(MSR_BITMAP, vmx_pages[MSR_BITMAP_PAGE].phys);
 	vmwrite(TSC_OFFSET, TSC_OFFSET_VALUE);
 
 	/* Jump into L2.  First, test failure to load guest CR3.  */
@@ -152,33 +135,6 @@ static void l1_guest_code(struct vmx_page *vmx_pages)
 	exit_to_l0(PORT_DONE, 0);
 }
 
-static void allocate_vmx_page(struct vmx_page *page)
-{
-	vm_vaddr_t virt;
-
-	virt = vm_vaddr_alloc(vm, PAGE_SIZE, 0, 0, 0);
-	memset(addr_gva2hva(vm, virt), 0, PAGE_SIZE);
-
-	page->virt = virt;
-	page->phys = addr_gva2gpa(vm, virt);
-}
-
-static vm_vaddr_t allocate_vmx_pages(void)
-{
-	vm_vaddr_t vmx_pages_vaddr;
-	int i;
-
-	vmx_pages_vaddr = vm_vaddr_alloc(
-		vm, sizeof(struct vmx_page) * NUM_VMX_PAGES, 0, 0, 0);
-
-	vmx_pages = (void *) addr_gva2hva(vm, vmx_pages_vaddr);
-
-	for (i = 0; i < NUM_VMX_PAGES; i++)
-		allocate_vmx_page(&vmx_pages[i]);
-
-	return vmx_pages_vaddr;
-}
-
 void report(int64_t val)
 {
 	printf("IA32_TSC_ADJUST is %ld (%lld * TSC_ADJUST_VALUE + %lld).\n",
@@ -187,7 +143,8 @@ void report(int64_t val)
 
 
 int main(int argc, char *argv[])
 {
-	vm_vaddr_t vmx_pages_vaddr;
+	struct vmx_pages *vmx_pages;
+	vm_vaddr_t vmx_pages_gva;
 	struct kvm_cpuid_entry2 *entry = kvm_get_supported_cpuid_entry(1);
 
 	if (!(entry->ecx & CPUID_VMX)) {
@@ -195,23 +152,23 @@ int main(int argc, char *argv[])
 		exit(KSFT_SKIP);
 	}
 
-	vm = vm_create_default_vmx(VCPU_ID, (void *) l1_guest_code);
+	vm = vm_create_default(VCPU_ID, (void *) l1_guest_code);
+	vcpu_set_cpuid(vm, VCPU_ID, kvm_get_supported_cpuid());
 
 
 	/* Allocate VMX pages and shared descriptors (vmx_pages). */
-	vmx_pages_vaddr = allocate_vmx_pages();
-	vcpu_args_set(vm, VCPU_ID, 1, vmx_pages_vaddr);
+	vmx_pages = vcpu_alloc_vmx(vm, &vmx_pages_gva);
+	vcpu_args_set(vm, VCPU_ID, 1, vmx_pages_gva);
 
 
 	for (;;) {
 		volatile struct kvm_run *run = vcpu_state(vm, VCPU_ID);
 		struct kvm_regs regs;
 
 		vcpu_run(vm, VCPU_ID);
+		vcpu_regs_get(vm, VCPU_ID, &regs);
 		TEST_ASSERT(run->exit_reason == KVM_EXIT_IO,
-			    "Got exit_reason other than KVM_EXIT_IO: %u (%s),\n",
+			    "Got exit_reason other than KVM_EXIT_IO: %u (%s), rip=%lx\n",
 			    run->exit_reason,
-			    exit_reason_str(run->exit_reason));
-
-		vcpu_regs_get(vm, VCPU_ID, &regs);
+			    exit_reason_str(run->exit_reason), regs.rip);
 
 
 		switch (run->io.port) {
 		case PORT_ABORT:

+ 19 - 14
virt/kvm/kvm_main.c

@@ -273,7 +273,8 @@ void kvm_flush_remote_tlbs(struct kvm *kvm)
 	 * kvm_make_all_cpus_request() reads vcpu->mode. We reuse that
 	 * barrier here.
 	 */
-	if (kvm_make_all_cpus_request(kvm, KVM_REQ_TLB_FLUSH))
+	if (!kvm_arch_flush_remote_tlb(kvm)
+	    || kvm_make_all_cpus_request(kvm, KVM_REQ_TLB_FLUSH))
 		++kvm->stat.remote_tlb_flush;
 	cmpxchg(&kvm->tlbs_dirty, dirty_count, 0);
 }
@@ -1169,7 +1170,7 @@ int kvm_get_dirty_log_protect(struct kvm *kvm,
 
 
 	n = kvm_dirty_bitmap_bytes(memslot);
 
-	dirty_bitmap_buffer = dirty_bitmap + n / sizeof(long);
+	dirty_bitmap_buffer = kvm_second_dirty_bitmap(memslot);
 	memset(dirty_bitmap_buffer, 0, n);
 
 	spin_lock(&kvm->mmu_lock);
@@ -1342,18 +1343,16 @@ static inline int check_user_page_hwpoison(unsigned long addr)
 }
 
 /*
- * The atomic path to get the writable pfn which will be stored in @pfn,
- * true indicates success, otherwise false is returned.
+ * The fast path to get the writable pfn which will be stored in @pfn,
+ * true indicates success, otherwise false is returned.  It's also the
+ * only part that runs if we can are in atomic context.
  */
-static bool hva_to_pfn_fast(unsigned long addr, bool atomic, bool *async,
-			    bool write_fault, bool *writable, kvm_pfn_t *pfn)
+static bool hva_to_pfn_fast(unsigned long addr, bool write_fault,
+			    bool *writable, kvm_pfn_t *pfn)
 {
 	struct page *page[1];
 	int npages;
 
-	if (!(async || atomic))
-		return false;
-
 	/*
 	 * Fast pin a writable pfn only if it is a write fault request
 	 * or the caller allows to map a writable pfn for a read fault
@@ -1497,7 +1496,7 @@ static kvm_pfn_t hva_to_pfn(unsigned long addr, bool atomic, bool *async,
 	/* we can do it either atomically or asynchronously, not both */
 	BUG_ON(atomic && async);
 
-	if (hva_to_pfn_fast(addr, atomic, async, write_fault, writable, &pfn))
+	if (hva_to_pfn_fast(addr, write_fault, writable, &pfn))
 		return pfn;
 
 	if (atomic)
@@ -2127,16 +2126,22 @@ static void shrink_halt_poll_ns(struct kvm_vcpu *vcpu)
 
 
 static int kvm_vcpu_check_block(struct kvm_vcpu *vcpu)
 {
+	int ret = -EINTR;
+	int idx = srcu_read_lock(&vcpu->kvm->srcu);
+
 	if (kvm_arch_vcpu_runnable(vcpu)) {
 		kvm_make_request(KVM_REQ_UNHALT, vcpu);
-		return -EINTR;
+		goto out;
 	}
 	if (kvm_cpu_has_pending_timer(vcpu))
-		return -EINTR;
+		goto out;
 	if (signal_pending(current))
-		return -EINTR;
+		goto out;
 
 
-	return 0;
+	ret = 0;
+out:
+	srcu_read_unlock(&vcpu->kvm->srcu, idx);
+	return ret;
 }
 
 /*