|
@@ -34,6 +34,7 @@
|
|
|
#include <linux/tboot.h>
|
|
|
#include <linux/hrtimer.h>
|
|
|
#include <linux/frame.h>
|
|
|
+#include <linux/nospec.h>
|
|
|
#include "kvm_cache_regs.h"
|
|
|
#include "x86.h"
|
|
|
|
|
@@ -111,6 +112,14 @@ static u64 __read_mostly host_xss;
|
|
|
static bool __read_mostly enable_pml = 1;
|
|
|
module_param_named(pml, enable_pml, bool, S_IRUGO);
|
|
|
|
|
|
+/* Access-type flags telling the MSR intercept helpers whether to touch the read bitmap, the write bitmap, or both. */
+#define MSR_TYPE_R 1
|
|
|
+#define MSR_TYPE_W 2
|
|
|
+#define MSR_TYPE_RW 3
|
|
|
+
|
|
|
+/* Bit flags assembled by vmx_msr_bitmap_mode() and cached in vcpu_vmx.msr_bitmap_mode. */
+#define MSR_BITMAP_MODE_X2APIC 1
|
|
|
+#define MSR_BITMAP_MODE_X2APIC_APICV 2
|
|
|
+#define MSR_BITMAP_MODE_LM 4
|
|
|
+
|
|
|
#define KVM_VMX_TSC_MULTIPLIER_MAX 0xffffffffffffffffULL
|
|
|
|
|
|
/* Guest_tsc -> host_tsc conversion requires 64-bit division. */
|
|
@@ -185,7 +194,6 @@ module_param(ple_window_max, int, S_IRUGO);
|
|
|
extern const ulong vmx_return;
|
|
|
|
|
|
#define NR_AUTOLOAD_MSRS 8
|
|
|
-#define VMCS02_POOL_SIZE 1
|
|
|
|
|
|
struct vmcs {
|
|
|
u32 revision_id;
|
|
@@ -210,6 +218,7 @@ struct loaded_vmcs {
|
|
|
int soft_vnmi_blocked;
|
|
|
ktime_t entry_time;
|
|
|
s64 vnmi_blocked_time;
|
|
|
+ unsigned long *msr_bitmap;
|
|
|
struct list_head loaded_vmcss_on_cpu_link;
|
|
|
};
|
|
|
|
|
@@ -226,7 +235,7 @@ struct shared_msr_entry {
|
|
|
* stored in guest memory specified by VMPTRLD, but is opaque to the guest,
|
|
|
* which must access it using VMREAD/VMWRITE/VMCLEAR instructions.
|
|
|
* More than one of these structures may exist, if L1 runs multiple L2 guests.
|
|
|
- * nested_vmx_run() will use the data here to build a vmcs02: a VMCS for the
|
|
|
+ * nested_vmx_run() will use the data here to build the vmcs02: a VMCS for the
|
|
|
* underlying hardware which will be used to run L2.
|
|
|
* This structure is packed to ensure that its layout is identical across
|
|
|
* machines (necessary for live migration).
|
|
@@ -409,13 +418,6 @@ struct __packed vmcs12 {
|
|
|
*/
|
|
|
#define VMCS12_SIZE 0x1000
|
|
|
|
|
|
-/* Used to remember the last vmcs02 used for some recently used vmcs12s */
|
|
|
-struct vmcs02_list {
|
|
|
- struct list_head list;
|
|
|
- gpa_t vmptr;
|
|
|
- struct loaded_vmcs vmcs02;
|
|
|
-};
|
|
|
-
|
|
|
/*
|
|
|
* The nested_vmx structure is part of vcpu_vmx, and holds information we need
|
|
|
* for correct emulation of VMX (i.e., nested VMX) on this vcpu.
|
|
@@ -440,15 +442,15 @@ struct nested_vmx {
|
|
|
*/
|
|
|
bool sync_shadow_vmcs;
|
|
|
|
|
|
- /* vmcs02_list cache of VMCSs recently used to run L2 guests */
|
|
|
- struct list_head vmcs02_pool;
|
|
|
- int vmcs02_num;
|
|
|
bool change_vmcs01_virtual_x2apic_mode;
|
|
|
/* L2 must run next, and mustn't decide to exit to L1. */
|
|
|
bool nested_run_pending;
|
|
|
+
|
|
|
+ struct loaded_vmcs vmcs02;
|
|
|
+
|
|
|
/*
|
|
|
- * Guest pages referred to in vmcs02 with host-physical pointers, so
|
|
|
- * we must keep them pinned while L2 runs.
|
|
|
+ * Guest pages referred to in the vmcs02 with host-physical
|
|
|
+ * pointers, so we must keep them pinned while L2 runs.
|
|
|
*/
|
|
|
struct page *apic_access_page;
|
|
|
struct page *virtual_apic_page;
|
|
@@ -457,8 +459,6 @@ struct nested_vmx {
|
|
|
bool pi_pending;
|
|
|
u16 posted_intr_nv;
|
|
|
|
|
|
- unsigned long *msr_bitmap;
|
|
|
-
|
|
|
struct hrtimer preemption_timer;
|
|
|
bool preemption_timer_expired;
|
|
|
|
|
@@ -581,6 +581,7 @@ struct vcpu_vmx {
|
|
|
struct kvm_vcpu vcpu;
|
|
|
unsigned long host_rsp;
|
|
|
u8 fail;
|
|
|
+ u8 msr_bitmap_mode;
|
|
|
u32 exit_intr_info;
|
|
|
u32 idt_vectoring_info;
|
|
|
ulong rflags;
|
|
@@ -592,6 +593,10 @@ struct vcpu_vmx {
|
|
|
u64 msr_host_kernel_gs_base;
|
|
|
u64 msr_guest_kernel_gs_base;
|
|
|
#endif
|
|
|
+
|
|
|
+ u64 arch_capabilities;
|
|
|
+ u64 spec_ctrl;
|
|
|
+
|
|
|
u32 vm_entry_controls_shadow;
|
|
|
u32 vm_exit_controls_shadow;
|
|
|
u32 secondary_exec_control;
|
|
@@ -898,21 +903,18 @@ static const unsigned short vmcs_field_to_offset_table[] = {
|
|
|
|
|
|
+/*
+ * Map a VMCS field encoding to its entry in vmcs_field_to_offset_table.
+ * Returns -ENOENT when the encoding is beyond the table or the table
+ * entry is 0.
+ */
 static inline short vmcs_field_to_offset(unsigned long field)
 {
-	BUILD_BUG_ON(ARRAY_SIZE(vmcs_field_to_offset_table) > SHRT_MAX);
+	const size_t nr_fields = ARRAY_SIZE(vmcs_field_to_offset_table);
+	unsigned short off;

-	if (field >= ARRAY_SIZE(vmcs_field_to_offset_table))
+	BUILD_BUG_ON(nr_fields > SHRT_MAX);
+	if (field >= nr_fields)
 		return -ENOENT;

-	/*
-	 * FIXME: Mitigation for CVE-2017-5753.  To be replaced with a
-	 * generic mechanism.
-	 */
-	asm("lfence");
-
-	if (vmcs_field_to_offset_table[field] == 0)
+	/*
+	 * Sanitize the index before the table lookup; this replaces the
+	 * lfence-based CVE-2017-5753 mitigation removed above.
+	 */
+	off = vmcs_field_to_offset_table[array_index_nospec(field, nr_fields)];
+	if (!off)
 		return -ENOENT;
-
-	return vmcs_field_to_offset_table[field];
+	return off;
 }
|
|
|
|
|
|
static inline struct vmcs12 *get_vmcs12(struct kvm_vcpu *vcpu)
|
|
@@ -935,6 +937,9 @@ static bool vmx_get_nmi_mask(struct kvm_vcpu *vcpu);
|
|
|
static void vmx_set_nmi_mask(struct kvm_vcpu *vcpu, bool masked);
|
|
|
static bool nested_vmx_is_page_fault_vmexit(struct vmcs12 *vmcs12,
|
|
|
u16 error_code);
|
|
|
+static void vmx_update_msr_bitmap(struct kvm_vcpu *vcpu);
|
|
|
+static void __always_inline vmx_disable_intercept_for_msr(unsigned long *msr_bitmap,
|
|
|
+ u32 msr, int type);
|
|
|
|
|
|
static DEFINE_PER_CPU(struct vmcs *, vmxarea);
|
|
|
static DEFINE_PER_CPU(struct vmcs *, current_vmcs);
|
|
@@ -954,12 +959,6 @@ static DEFINE_PER_CPU(spinlock_t, blocked_vcpu_on_cpu_lock);
|
|
|
enum {
|
|
|
VMX_IO_BITMAP_A,
|
|
|
VMX_IO_BITMAP_B,
|
|
|
- VMX_MSR_BITMAP_LEGACY,
|
|
|
- VMX_MSR_BITMAP_LONGMODE,
|
|
|
- VMX_MSR_BITMAP_LEGACY_X2APIC_APICV,
|
|
|
- VMX_MSR_BITMAP_LONGMODE_X2APIC_APICV,
|
|
|
- VMX_MSR_BITMAP_LEGACY_X2APIC,
|
|
|
- VMX_MSR_BITMAP_LONGMODE_X2APIC,
|
|
|
VMX_VMREAD_BITMAP,
|
|
|
VMX_VMWRITE_BITMAP,
|
|
|
VMX_BITMAP_NR
|
|
@@ -969,12 +968,6 @@ static unsigned long *vmx_bitmap[VMX_BITMAP_NR];
|
|
|
|
|
|
#define vmx_io_bitmap_a (vmx_bitmap[VMX_IO_BITMAP_A])
|
|
|
#define vmx_io_bitmap_b (vmx_bitmap[VMX_IO_BITMAP_B])
|
|
|
-#define vmx_msr_bitmap_legacy (vmx_bitmap[VMX_MSR_BITMAP_LEGACY])
|
|
|
-#define vmx_msr_bitmap_longmode (vmx_bitmap[VMX_MSR_BITMAP_LONGMODE])
|
|
|
-#define vmx_msr_bitmap_legacy_x2apic_apicv (vmx_bitmap[VMX_MSR_BITMAP_LEGACY_X2APIC_APICV])
|
|
|
-#define vmx_msr_bitmap_longmode_x2apic_apicv (vmx_bitmap[VMX_MSR_BITMAP_LONGMODE_X2APIC_APICV])
|
|
|
-#define vmx_msr_bitmap_legacy_x2apic (vmx_bitmap[VMX_MSR_BITMAP_LEGACY_X2APIC])
|
|
|
-#define vmx_msr_bitmap_longmode_x2apic (vmx_bitmap[VMX_MSR_BITMAP_LONGMODE_X2APIC])
|
|
|
#define vmx_vmread_bitmap (vmx_bitmap[VMX_VMREAD_BITMAP])
|
|
|
#define vmx_vmwrite_bitmap (vmx_bitmap[VMX_VMWRITE_BITMAP])
|
|
|
|
|
@@ -1918,6 +1911,52 @@ static void update_exception_bitmap(struct kvm_vcpu *vcpu)
|
|
|
vmcs_write32(EXCEPTION_BITMAP, eb);
|
|
|
}
|
|
|
|
|
|
+/*
+ * Tell whether guest writes to @msr are intercepted according to the MSR
+ * bitmap of the currently loaded VMCS.  Without MSR-bitmap support, or for
+ * an MSR outside the two ranges the bitmap covers, the answer is "yes".
+ */
+static bool msr_write_intercepted(struct kvm_vcpu *vcpu, u32 msr)
+{
+	const int word_size = sizeof(unsigned long);
+	unsigned long *bitmap;
+	bool low, high;
+
+	if (!cpu_has_vmx_msr_bitmap())
+		return true;
+
+	bitmap = to_vmx(vcpu)->loaded_vmcs->msr_bitmap;
+
+	low = msr <= 0x1fff;
+	high = (msr >= 0xc0000000) && (msr <= 0xc0001fff);
+	if (!low && !high)
+		return true;
+
+	/* Write-low bits start at byte offset 0x800, write-high at 0xc00. */
+	if (low)
+		return !!test_bit(msr, bitmap + 0x800 / word_size);
+
+	return !!test_bit(msr & 0x1fff, bitmap + 0xc00 / word_size);
+}
|
|
|
+
|
|
|
+/*
+ * Same check as msr_write_intercepted(), but always against the vmcs01
+ * MSR bitmap rather than the bitmap of the currently loaded VMCS.
+ */
+static bool msr_write_intercepted_l01(struct kvm_vcpu *vcpu, u32 msr)
+{
+	unsigned long *write_low, *write_high;
+	unsigned long *bitmap;
+
+	if (!cpu_has_vmx_msr_bitmap())
+		return true;
+
+	bitmap = to_vmx(vcpu)->vmcs01.msr_bitmap;
+	write_low = bitmap + 0x800 / sizeof(unsigned long);
+	write_high = bitmap + 0xc00 / sizeof(unsigned long);
+
+	if (msr <= 0x1fff)
+		return !!test_bit(msr, write_low);
+
+	if ((msr >= 0xc0000000) && (msr <= 0xc0001fff))
+		return !!test_bit(msr & 0x1fff, write_high);
+
+	/* MSRs outside both bitmap ranges are reported as intercepted. */
+	return true;
+}
|
|
|
+
|
|
|
static void clear_atomic_switch_msr_special(struct vcpu_vmx *vmx,
|
|
|
unsigned long entry, unsigned long exit)
|
|
|
{
|
|
@@ -2296,6 +2335,7 @@ static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
|
|
|
if (per_cpu(current_vmcs, cpu) != vmx->loaded_vmcs->vmcs) {
|
|
|
per_cpu(current_vmcs, cpu) = vmx->loaded_vmcs->vmcs;
|
|
|
vmcs_load(vmx->loaded_vmcs->vmcs);
|
|
|
+ indirect_branch_prediction_barrier();
|
|
|
}
|
|
|
|
|
|
if (!already_loaded) {
|
|
@@ -2572,36 +2612,6 @@ static void move_msr_up(struct vcpu_vmx *vmx, int from, int to)
|
|
|
vmx->guest_msrs[from] = tmp;
|
|
|
}
|
|
|
|
|
|
-static void vmx_set_msr_bitmap(struct kvm_vcpu *vcpu)
|
|
|
-{
|
|
|
- unsigned long *msr_bitmap;
|
|
|
-
|
|
|
- if (is_guest_mode(vcpu))
|
|
|
- msr_bitmap = to_vmx(vcpu)->nested.msr_bitmap;
|
|
|
- else if (cpu_has_secondary_exec_ctrls() &&
|
|
|
- (vmcs_read32(SECONDARY_VM_EXEC_CONTROL) &
|
|
|
- SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE)) {
|
|
|
- if (enable_apicv && kvm_vcpu_apicv_active(vcpu)) {
|
|
|
- if (is_long_mode(vcpu))
|
|
|
- msr_bitmap = vmx_msr_bitmap_longmode_x2apic_apicv;
|
|
|
- else
|
|
|
- msr_bitmap = vmx_msr_bitmap_legacy_x2apic_apicv;
|
|
|
- } else {
|
|
|
- if (is_long_mode(vcpu))
|
|
|
- msr_bitmap = vmx_msr_bitmap_longmode_x2apic;
|
|
|
- else
|
|
|
- msr_bitmap = vmx_msr_bitmap_legacy_x2apic;
|
|
|
- }
|
|
|
- } else {
|
|
|
- if (is_long_mode(vcpu))
|
|
|
- msr_bitmap = vmx_msr_bitmap_longmode;
|
|
|
- else
|
|
|
- msr_bitmap = vmx_msr_bitmap_legacy;
|
|
|
- }
|
|
|
-
|
|
|
- vmcs_write64(MSR_BITMAP, __pa(msr_bitmap));
|
|
|
-}
|
|
|
-
|
|
|
/*
|
|
|
* Set up the vmcs to automatically save and restore system
|
|
|
* msrs. Don't touch the 64-bit msrs if the guest is in legacy
|
|
@@ -2642,7 +2652,7 @@ static void setup_msrs(struct vcpu_vmx *vmx)
|
|
|
vmx->save_nmsrs = save_nmsrs;
|
|
|
|
|
|
if (cpu_has_vmx_msr_bitmap())
|
|
|
- vmx_set_msr_bitmap(&vmx->vcpu);
|
|
|
+ vmx_update_msr_bitmap(&vmx->vcpu);
|
|
|
}
|
|
|
|
|
|
/*
|
|
@@ -3276,6 +3286,20 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
|
|
|
case MSR_IA32_TSC:
|
|
|
msr_info->data = guest_read_tsc(vcpu);
|
|
|
break;
|
|
|
+ case MSR_IA32_SPEC_CTRL:
|
|
|
+ if (!msr_info->host_initiated &&
|
|
|
+ !guest_cpuid_has(vcpu, X86_FEATURE_IBRS) &&
|
|
|
+ !guest_cpuid_has(vcpu, X86_FEATURE_SPEC_CTRL))
|
|
|
+ return 1;
|
|
|
+
|
|
|
+ msr_info->data = to_vmx(vcpu)->spec_ctrl;
|
|
|
+ break;
|
|
|
+ case MSR_IA32_ARCH_CAPABILITIES:
|
|
|
+ if (!msr_info->host_initiated &&
|
|
|
+ !guest_cpuid_has(vcpu, X86_FEATURE_ARCH_CAPABILITIES))
|
|
|
+ return 1;
|
|
|
+ msr_info->data = to_vmx(vcpu)->arch_capabilities;
|
|
|
+ break;
|
|
|
case MSR_IA32_SYSENTER_CS:
|
|
|
msr_info->data = vmcs_read32(GUEST_SYSENTER_CS);
|
|
|
break;
|
|
@@ -3383,6 +3407,70 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
|
|
|
case MSR_IA32_TSC:
|
|
|
kvm_write_tsc(vcpu, msr_info);
|
|
|
break;
|
|
|
+ case MSR_IA32_SPEC_CTRL:
|
|
|
+ if (!msr_info->host_initiated &&
|
|
|
+ !guest_cpuid_has(vcpu, X86_FEATURE_IBRS) &&
|
|
|
+ !guest_cpuid_has(vcpu, X86_FEATURE_SPEC_CTRL))
|
|
|
+ return 1;
|
|
|
+
|
|
|
+ /* The STIBP bit doesn't fault even if it's not advertised */
|
|
|
+ if (data & ~(SPEC_CTRL_IBRS | SPEC_CTRL_STIBP))
|
|
|
+ return 1;
|
|
|
+
|
|
|
+ vmx->spec_ctrl = data;
|
|
|
+
|
|
|
+ if (!data)
|
|
|
+ break;
|
|
|
+
|
|
|
+ /*
|
|
|
+ * For non-nested:
|
|
|
+ * When it's written (to non-zero) for the first time, pass
|
|
|
+ * it through.
|
|
|
+ *
|
|
|
+ * For nested:
|
|
|
+ * The handling of the MSR bitmap for L2 guests is done in
|
|
|
+ * nested_vmx_merge_msr_bitmap. We should not touch the
|
|
|
+ * vmcs02.msr_bitmap here since it gets completely overwritten
|
|
|
+ * in the merging. We update the vmcs01 here for L1 as well
|
|
|
+ * since it will end up touching the MSR anyway now.
|
|
|
+ */
|
|
|
+ vmx_disable_intercept_for_msr(vmx->vmcs01.msr_bitmap,
|
|
|
+ MSR_IA32_SPEC_CTRL,
|
|
|
+ MSR_TYPE_RW);
|
|
|
+ break;
|
|
|
+ case MSR_IA32_PRED_CMD:
|
|
|
+ if (!msr_info->host_initiated &&
|
|
|
+ !guest_cpuid_has(vcpu, X86_FEATURE_IBPB) &&
|
|
|
+ !guest_cpuid_has(vcpu, X86_FEATURE_SPEC_CTRL))
|
|
|
+ return 1;
|
|
|
+
|
|
|
+ if (data & ~PRED_CMD_IBPB)
|
|
|
+ return 1;
|
|
|
+
|
|
|
+ if (!data)
|
|
|
+ break;
|
|
|
+
|
|
|
+ wrmsrl(MSR_IA32_PRED_CMD, PRED_CMD_IBPB);
|
|
|
+
|
|
|
+ /*
|
|
|
+ * For non-nested:
|
|
|
+ * When it's written (to non-zero) for the first time, pass
|
|
|
+ * it through.
|
|
|
+ *
|
|
|
+ * For nested:
|
|
|
+ * The handling of the MSR bitmap for L2 guests is done in
|
|
|
+ * nested_vmx_merge_msr_bitmap. We should not touch the
|
|
|
+ * vmcs02.msr_bitmap here since it gets completely overwritten
|
|
|
+ * in the merging.
|
|
|
+ */
|
|
|
+ vmx_disable_intercept_for_msr(vmx->vmcs01.msr_bitmap, MSR_IA32_PRED_CMD,
|
|
|
+ MSR_TYPE_W);
|
|
|
+ break;
|
|
|
+ case MSR_IA32_ARCH_CAPABILITIES:
|
|
|
+ if (!msr_info->host_initiated)
|
|
|
+ return 1;
|
|
|
+ vmx->arch_capabilities = data;
|
|
|
+ break;
|
|
|
case MSR_IA32_CR_PAT:
|
|
|
if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT) {
|
|
|
if (!kvm_mtrr_valid(vcpu, MSR_IA32_CR_PAT, data))
|
|
@@ -3837,11 +3925,6 @@ static struct vmcs *alloc_vmcs_cpu(int cpu)
|
|
|
return vmcs;
|
|
|
}
|
|
|
|
|
|
-static struct vmcs *alloc_vmcs(void)
|
|
|
-{
|
|
|
- return alloc_vmcs_cpu(raw_smp_processor_id());
|
|
|
-}
|
|
|
-
|
|
|
static void free_vmcs(struct vmcs *vmcs)
|
|
|
{
|
|
|
free_pages((unsigned long)vmcs, vmcs_config.order);
|
|
@@ -3857,9 +3940,38 @@ static void free_loaded_vmcs(struct loaded_vmcs *loaded_vmcs)
|
|
|
loaded_vmcs_clear(loaded_vmcs);
|
|
|
free_vmcs(loaded_vmcs->vmcs);
|
|
|
loaded_vmcs->vmcs = NULL;
|
|
|
+ if (loaded_vmcs->msr_bitmap)
|
|
|
+ free_page((unsigned long)loaded_vmcs->msr_bitmap);
|
|
|
WARN_ON(loaded_vmcs->shadow_vmcs != NULL);
|
|
|
}
|
|
|
|
|
|
+/* Allocate a VMCS through alloc_vmcs_cpu() for the CPU we are running on. */
+static struct vmcs *alloc_vmcs(void)
+{
+	int cpu = raw_smp_processor_id();
+
+	return alloc_vmcs_cpu(cpu);
+}
|
|
|
+
|
|
|
+/*
+ * Set up @loaded_vmcs: allocate and initialise its VMCS and, when the CPU
+ * supports MSR bitmaps, a bitmap page filled with 0xff.
+ *
+ * Returns 0 on success or -ENOMEM; on a bitmap allocation failure the
+ * partially built state is released with free_loaded_vmcs().
+ */
+static int alloc_loaded_vmcs(struct loaded_vmcs *loaded_vmcs)
+{
+	unsigned long *bitmap;
+
+	loaded_vmcs->vmcs = alloc_vmcs();
+	if (!loaded_vmcs->vmcs)
+		return -ENOMEM;
+
+	loaded_vmcs->shadow_vmcs = NULL;
+	loaded_vmcs_init(loaded_vmcs);
+
+	if (!cpu_has_vmx_msr_bitmap())
+		return 0;
+
+	bitmap = (unsigned long *)__get_free_page(GFP_KERNEL);
+	/* Store the result first so the cleanup below sees NULL on failure. */
+	loaded_vmcs->msr_bitmap = bitmap;
+	if (!bitmap) {
+		free_loaded_vmcs(loaded_vmcs);
+		return -ENOMEM;
+	}
+
+	memset(bitmap, 0xff, PAGE_SIZE);
+	return 0;
+}
|
|
|
+
|
|
|
static void free_kvm_area(void)
|
|
|
{
|
|
|
int cpu;
|
|
@@ -4918,10 +5030,8 @@ static void free_vpid(int vpid)
|
|
|
spin_unlock(&vmx_vpid_lock);
|
|
|
}
|
|
|
|
|
|
-#define MSR_TYPE_R 1
|
|
|
-#define MSR_TYPE_W 2
|
|
|
-static void __vmx_disable_intercept_for_msr(unsigned long *msr_bitmap,
|
|
|
- u32 msr, int type)
|
|
|
+static void __always_inline vmx_disable_intercept_for_msr(unsigned long *msr_bitmap,
|
|
|
+ u32 msr, int type)
|
|
|
{
|
|
|
int f = sizeof(unsigned long);
|
|
|
|
|
@@ -4955,6 +5065,50 @@ static void __vmx_disable_intercept_for_msr(unsigned long *msr_bitmap,
|
|
|
}
|
|
|
}
|
|
|
|
|
|
+/*
+ * Set the intercept bit(s) for @msr in @msr_bitmap.  @type is a mask of
+ * MSR_TYPE_R / MSR_TYPE_W choosing the read and/or write bitmap.  Does
+ * nothing without MSR-bitmap support or for an MSR outside the covered
+ * ranges.
+ */
+static void __always_inline vmx_enable_intercept_for_msr(unsigned long *msr_bitmap,
+							  u32 msr, int type)
+{
+	int f = sizeof(unsigned long);
+	unsigned long *read_map, *write_map;
+
+	if (!cpu_has_vmx_msr_bitmap())
+		return;
+
+	/*
+	 * See Intel PRM Vol. 3, 20.6.9 (MSR-Bitmap Address). Early manuals
+	 * have the write-low and read-high bitmap offsets the wrong way round.
+	 * We can control MSRs 0x00000000-0x00001fff and 0xc0000000-0xc0001fff.
+	 */
+	if (msr <= 0x1fff) {
+		read_map = msr_bitmap + 0x000 / f;	/* read-low */
+		write_map = msr_bitmap + 0x800 / f;	/* write-low */
+	} else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff)) {
+		msr &= 0x1fff;
+		read_map = msr_bitmap + 0x400 / f;	/* read-high */
+		write_map = msr_bitmap + 0xc00 / f;	/* write-high */
+	} else {
+		return;
+	}
+
+	if (type & MSR_TYPE_R)
+		__set_bit(msr, read_map);
+
+	if (type & MSR_TYPE_W)
+		__set_bit(msr, write_map);
+}
|
|
|
+
|
|
|
+/*
+ * Enable (@value true) or disable (@value false) interception of @msr for
+ * the access types in @type by dispatching to the matching helper.
+ */
+static void __always_inline vmx_set_intercept_for_msr(unsigned long *msr_bitmap,
+						       u32 msr, int type, bool value)
+{
+	if (!value) {
+		vmx_disable_intercept_for_msr(msr_bitmap, msr, type);
+		return;
+	}
+
+	vmx_enable_intercept_for_msr(msr_bitmap, msr, type);
+}
|
|
|
+
|
|
|
/*
|
|
|
* If a msr is allowed by L0, we should check whether it is allowed by L1.
|
|
|
* The corresponding bit will be cleared unless both of L0 and L1 allow it.
|
|
@@ -5001,30 +5155,70 @@ static void nested_vmx_disable_intercept_for_msr(unsigned long *msr_bitmap_l1,
|
|
|
}
|
|
|
}
|
|
|
|
|
|
-static void vmx_disable_intercept_for_msr(u32 msr, bool longmode_only)
+/*
+ * Build the MSR_BITMAP_MODE_* flag set for @vcpu from the x2APIC
+ * virtualization control in the current VMCS, the APICv state and whether
+ * the vCPU is in long mode.
+ */
+static u8 vmx_msr_bitmap_mode(struct kvm_vcpu *vcpu)
 {
-	if (!longmode_only)
-		__vmx_disable_intercept_for_msr(vmx_msr_bitmap_legacy,
-						msr, MSR_TYPE_R | MSR_TYPE_W);
-	__vmx_disable_intercept_for_msr(vmx_msr_bitmap_longmode,
-						msr, MSR_TYPE_R | MSR_TYPE_W);
+	u8 flags = 0;
+	bool virt_x2apic = cpu_has_secondary_exec_ctrls() &&
+			   (vmcs_read32(SECONDARY_VM_EXEC_CONTROL) &
+			    SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE);
+
+	if (virt_x2apic) {
+		flags = MSR_BITMAP_MODE_X2APIC;
+		/* The APICV flag is only ever reported together with X2APIC. */
+		if (enable_apicv && kvm_vcpu_apicv_active(vcpu))
+			flags |= MSR_BITMAP_MODE_X2APIC_APICV;
+	}
+
+	if (is_long_mode(vcpu))
+		flags |= MSR_BITMAP_MODE_LM;
+
+	return flags;
 }
|
|
|
|
|
|
-static void vmx_disable_intercept_msr_x2apic(u32 msr, int type, bool apicv_active)
+/* MSR index of the x2APIC register whose xAPIC register offset is @r. */
+#define X2APIC_MSR(r) (APIC_BASE_MSR + ((r) >> 4))
+
+/*
+ * Rewrite the x2APIC MSR range (0x800-0x8ff) of @msr_bitmap according to
+ * the MSR_BITMAP_MODE_* flags in @mode.
+ */
+static void vmx_update_msr_bitmap_x2apic(unsigned long *msr_bitmap,
+					 u8 mode)
 {
-	if (apicv_active) {
-		__vmx_disable_intercept_for_msr(vmx_msr_bitmap_legacy_x2apic_apicv,
-				msr, type);
-		__vmx_disable_intercept_for_msr(vmx_msr_bitmap_longmode_x2apic_apicv,
-				msr, type);
-	} else {
-		__vmx_disable_intercept_for_msr(vmx_msr_bitmap_legacy_x2apic,
-				msr, type);
-		__vmx_disable_intercept_for_msr(vmx_msr_bitmap_longmode_x2apic,
-				msr, type);
+	const bool apicv = mode & MSR_BITMAP_MODE_X2APIC_APICV;
+	const unsigned long read_fill = apicv ? 0 : ~0UL;
+	unsigned int word;
+
+	/*
+	 * Reset the whole range first: reads are passed through only when
+	 * APICv is in use, writes are always intercepted.
+	 */
+	for (word = 0x800 / BITS_PER_LONG; word <= 0x8ff / BITS_PER_LONG; word++) {
+		msr_bitmap[word] = read_fill;
+		msr_bitmap[word + (0x800 / sizeof(long))] = ~0UL;
+	}
+
+	if (mode & MSR_BITMAP_MODE_X2APIC) {
+		/*
+		 * TPR reads and writes can be virtualized even if virtual interrupt
+		 * delivery is not in use.
+		 */
+		vmx_disable_intercept_for_msr(msr_bitmap, X2APIC_MSR(APIC_TASKPRI), MSR_TYPE_RW);
+		if (apicv) {
+			/* With APICv: keep TMCCT reads intercepted, let EOI and SELF-IPI writes through. */
+			vmx_enable_intercept_for_msr(msr_bitmap, X2APIC_MSR(APIC_TMCCT), MSR_TYPE_R);
+			vmx_disable_intercept_for_msr(msr_bitmap, X2APIC_MSR(APIC_EOI), MSR_TYPE_W);
+			vmx_disable_intercept_for_msr(msr_bitmap, X2APIC_MSR(APIC_SELF_IPI), MSR_TYPE_W);
+		}
 	}
 }
|
|
|
|
|
|
+/*
+ * Bring the vmcs01 MSR bitmap in line with the vCPU's current bitmap mode.
+ * Nothing is touched when the mode equals the cached msr_bitmap_mode.
+ */
+static void vmx_update_msr_bitmap(struct kvm_vcpu *vcpu)
+{
+	const u8 x2apic_bits = MSR_BITMAP_MODE_X2APIC |
+			       MSR_BITMAP_MODE_X2APIC_APICV;
+	struct vcpu_vmx *vmx = to_vmx(vcpu);
+	unsigned long *bitmap = vmx->vmcs01.msr_bitmap;
+	u8 new_mode = vmx_msr_bitmap_mode(vcpu);
+	u8 diff = new_mode ^ vmx->msr_bitmap_mode;
+	bool intercept_kernel_gs;
+
+	if (diff == 0)
+		return;
+
+	/* MSR_KERNEL_GS_BASE is passed through only while in long mode. */
+	intercept_kernel_gs = !(new_mode & MSR_BITMAP_MODE_LM);
+	vmx_set_intercept_for_msr(bitmap, MSR_KERNEL_GS_BASE, MSR_TYPE_RW,
+				  intercept_kernel_gs);
+
+	if (diff & x2apic_bits)
+		vmx_update_msr_bitmap_x2apic(bitmap, new_mode);
+
+	vmx->msr_bitmap_mode = new_mode;
+}
|
|
|
+
|
|
|
static bool vmx_get_enable_apicv(struct kvm_vcpu *vcpu)
|
|
|
{
|
|
|
return enable_apicv;
|
|
@@ -5274,7 +5468,7 @@ static void vmx_refresh_apicv_exec_ctrl(struct kvm_vcpu *vcpu)
|
|
|
}
|
|
|
|
|
|
if (cpu_has_vmx_msr_bitmap())
|
|
|
- vmx_set_msr_bitmap(vcpu);
|
|
|
+ vmx_update_msr_bitmap(vcpu);
|
|
|
}
|
|
|
|
|
|
static u32 vmx_exec_control(struct vcpu_vmx *vmx)
|
|
@@ -5461,7 +5655,7 @@ static void vmx_vcpu_setup(struct vcpu_vmx *vmx)
|
|
|
vmcs_write64(VMWRITE_BITMAP, __pa(vmx_vmwrite_bitmap));
|
|
|
}
|
|
|
if (cpu_has_vmx_msr_bitmap())
|
|
|
- vmcs_write64(MSR_BITMAP, __pa(vmx_msr_bitmap_legacy));
|
|
|
+ vmcs_write64(MSR_BITMAP, __pa(vmx->vmcs01.msr_bitmap));
|
|
|
|
|
|
vmcs_write64(VMCS_LINK_POINTER, -1ull); /* 22.3.1.5 */
|
|
|
|
|
@@ -5539,6 +5733,8 @@ static void vmx_vcpu_setup(struct vcpu_vmx *vmx)
|
|
|
++vmx->nmsrs;
|
|
|
}
|
|
|
|
|
|
+ if (boot_cpu_has(X86_FEATURE_ARCH_CAPABILITIES))
|
|
|
+ rdmsrl(MSR_IA32_ARCH_CAPABILITIES, vmx->arch_capabilities);
|
|
|
|
|
|
vm_exit_controls_init(vmx, vmcs_config.vmexit_ctrl);
|
|
|
|
|
@@ -5567,6 +5763,7 @@ static void vmx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
|
|
|
u64 cr0;
|
|
|
|
|
|
vmx->rmode.vm86_active = 0;
|
|
|
+ vmx->spec_ctrl = 0;
|
|
|
|
|
|
vmx->vcpu.arch.regs[VCPU_REGS_RDX] = get_rdx_init_val();
|
|
|
kvm_set_cr8(vcpu, 0);
|
|
@@ -6744,7 +6941,7 @@ void vmx_enable_tdp(void)
|
|
|
|
|
|
static __init int hardware_setup(void)
|
|
|
{
|
|
|
- int r = -ENOMEM, i, msr;
|
|
|
+ int r = -ENOMEM, i;
|
|
|
|
|
|
rdmsrl_safe(MSR_EFER, &host_efer);
|
|
|
|
|
@@ -6764,9 +6961,6 @@ static __init int hardware_setup(void)
|
|
|
|
|
|
memset(vmx_io_bitmap_b, 0xff, PAGE_SIZE);
|
|
|
|
|
|
- memset(vmx_msr_bitmap_legacy, 0xff, PAGE_SIZE);
|
|
|
- memset(vmx_msr_bitmap_longmode, 0xff, PAGE_SIZE);
|
|
|
-
|
|
|
if (setup_vmcs_config(&vmcs_config) < 0) {
|
|
|
r = -EIO;
|
|
|
goto out;
|
|
@@ -6835,42 +7029,8 @@ static __init int hardware_setup(void)
|
|
|
kvm_tsc_scaling_ratio_frac_bits = 48;
|
|
|
}
|
|
|
|
|
|
- vmx_disable_intercept_for_msr(MSR_FS_BASE, false);
|
|
|
- vmx_disable_intercept_for_msr(MSR_GS_BASE, false);
|
|
|
- vmx_disable_intercept_for_msr(MSR_KERNEL_GS_BASE, true);
|
|
|
- vmx_disable_intercept_for_msr(MSR_IA32_SYSENTER_CS, false);
|
|
|
- vmx_disable_intercept_for_msr(MSR_IA32_SYSENTER_ESP, false);
|
|
|
- vmx_disable_intercept_for_msr(MSR_IA32_SYSENTER_EIP, false);
|
|
|
-
|
|
|
- memcpy(vmx_msr_bitmap_legacy_x2apic_apicv,
|
|
|
- vmx_msr_bitmap_legacy, PAGE_SIZE);
|
|
|
- memcpy(vmx_msr_bitmap_longmode_x2apic_apicv,
|
|
|
- vmx_msr_bitmap_longmode, PAGE_SIZE);
|
|
|
- memcpy(vmx_msr_bitmap_legacy_x2apic,
|
|
|
- vmx_msr_bitmap_legacy, PAGE_SIZE);
|
|
|
- memcpy(vmx_msr_bitmap_longmode_x2apic,
|
|
|
- vmx_msr_bitmap_longmode, PAGE_SIZE);
|
|
|
-
|
|
|
set_bit(0, vmx_vpid_bitmap); /* 0 is reserved for host */
|
|
|
|
|
|
- for (msr = 0x800; msr <= 0x8ff; msr++) {
|
|
|
- if (msr == 0x839 /* TMCCT */)
|
|
|
- continue;
|
|
|
- vmx_disable_intercept_msr_x2apic(msr, MSR_TYPE_R, true);
|
|
|
- }
|
|
|
-
|
|
|
- /*
|
|
|
- * TPR reads and writes can be virtualized even if virtual interrupt
|
|
|
- * delivery is not in use.
|
|
|
- */
|
|
|
- vmx_disable_intercept_msr_x2apic(0x808, MSR_TYPE_W, true);
|
|
|
- vmx_disable_intercept_msr_x2apic(0x808, MSR_TYPE_R | MSR_TYPE_W, false);
|
|
|
-
|
|
|
- /* EOI */
|
|
|
- vmx_disable_intercept_msr_x2apic(0x80b, MSR_TYPE_W, true);
|
|
|
- /* SELF-IPI */
|
|
|
- vmx_disable_intercept_msr_x2apic(0x83f, MSR_TYPE_W, true);
|
|
|
-
|
|
|
if (enable_ept)
|
|
|
vmx_enable_tdp();
|
|
|
else
|
|
@@ -6973,94 +7133,6 @@ static int handle_monitor(struct kvm_vcpu *vcpu)
|
|
|
return handle_nop(vcpu);
|
|
|
}
|
|
|
|
|
|
-/*
|
|
|
- * To run an L2 guest, we need a vmcs02 based on the L1-specified vmcs12.
|
|
|
- * We could reuse a single VMCS for all the L2 guests, but we also want the
|
|
|
- * option to allocate a separate vmcs02 for each separate loaded vmcs12 - this
|
|
|
- * allows keeping them loaded on the processor, and in the future will allow
|
|
|
- * optimizations where prepare_vmcs02 doesn't need to set all the fields on
|
|
|
- * every entry if they never change.
|
|
|
- * So we keep, in vmx->nested.vmcs02_pool, a cache of size VMCS02_POOL_SIZE
|
|
|
- * (>=0) with a vmcs02 for each recently loaded vmcs12s, most recent first.
|
|
|
- *
|
|
|
- * The following functions allocate and free a vmcs02 in this pool.
|
|
|
- */
|
|
|
-
|
|
|
-/* Get a VMCS from the pool to use as vmcs02 for the current vmcs12. */
|
|
|
-static struct loaded_vmcs *nested_get_current_vmcs02(struct vcpu_vmx *vmx)
|
|
|
-{
|
|
|
- struct vmcs02_list *item;
|
|
|
- list_for_each_entry(item, &vmx->nested.vmcs02_pool, list)
|
|
|
- if (item->vmptr == vmx->nested.current_vmptr) {
|
|
|
- list_move(&item->list, &vmx->nested.vmcs02_pool);
|
|
|
- return &item->vmcs02;
|
|
|
- }
|
|
|
-
|
|
|
- if (vmx->nested.vmcs02_num >= max(VMCS02_POOL_SIZE, 1)) {
|
|
|
- /* Recycle the least recently used VMCS. */
|
|
|
- item = list_last_entry(&vmx->nested.vmcs02_pool,
|
|
|
- struct vmcs02_list, list);
|
|
|
- item->vmptr = vmx->nested.current_vmptr;
|
|
|
- list_move(&item->list, &vmx->nested.vmcs02_pool);
|
|
|
- return &item->vmcs02;
|
|
|
- }
|
|
|
-
|
|
|
- /* Create a new VMCS */
|
|
|
- item = kzalloc(sizeof(struct vmcs02_list), GFP_KERNEL);
|
|
|
- if (!item)
|
|
|
- return NULL;
|
|
|
- item->vmcs02.vmcs = alloc_vmcs();
|
|
|
- item->vmcs02.shadow_vmcs = NULL;
|
|
|
- if (!item->vmcs02.vmcs) {
|
|
|
- kfree(item);
|
|
|
- return NULL;
|
|
|
- }
|
|
|
- loaded_vmcs_init(&item->vmcs02);
|
|
|
- item->vmptr = vmx->nested.current_vmptr;
|
|
|
- list_add(&(item->list), &(vmx->nested.vmcs02_pool));
|
|
|
- vmx->nested.vmcs02_num++;
|
|
|
- return &item->vmcs02;
|
|
|
-}
|
|
|
-
|
|
|
-/* Free and remove from pool a vmcs02 saved for a vmcs12 (if there is one) */
|
|
|
-static void nested_free_vmcs02(struct vcpu_vmx *vmx, gpa_t vmptr)
|
|
|
-{
|
|
|
- struct vmcs02_list *item;
|
|
|
- list_for_each_entry(item, &vmx->nested.vmcs02_pool, list)
|
|
|
- if (item->vmptr == vmptr) {
|
|
|
- free_loaded_vmcs(&item->vmcs02);
|
|
|
- list_del(&item->list);
|
|
|
- kfree(item);
|
|
|
- vmx->nested.vmcs02_num--;
|
|
|
- return;
|
|
|
- }
|
|
|
-}
|
|
|
-
|
|
|
-/*
|
|
|
- * Free all VMCSs saved for this vcpu, except the one pointed by
|
|
|
- * vmx->loaded_vmcs. We must be running L1, so vmx->loaded_vmcs
|
|
|
- * must be &vmx->vmcs01.
|
|
|
- */
|
|
|
-static void nested_free_all_saved_vmcss(struct vcpu_vmx *vmx)
|
|
|
-{
|
|
|
- struct vmcs02_list *item, *n;
|
|
|
-
|
|
|
- WARN_ON(vmx->loaded_vmcs != &vmx->vmcs01);
|
|
|
- list_for_each_entry_safe(item, n, &vmx->nested.vmcs02_pool, list) {
|
|
|
- /*
|
|
|
- * Something will leak if the above WARN triggers. Better than
|
|
|
- * a use-after-free.
|
|
|
- */
|
|
|
- if (vmx->loaded_vmcs == &item->vmcs02)
|
|
|
- continue;
|
|
|
-
|
|
|
- free_loaded_vmcs(&item->vmcs02);
|
|
|
- list_del(&item->list);
|
|
|
- kfree(item);
|
|
|
- vmx->nested.vmcs02_num--;
|
|
|
- }
|
|
|
-}
|
|
|
-
|
|
|
/*
|
|
|
* The following 3 functions, nested_vmx_succeed()/failValid()/failInvalid(),
|
|
|
* set the success or error code of an emulated VMX instruction, as specified
|
|
@@ -7241,13 +7313,11 @@ static int enter_vmx_operation(struct kvm_vcpu *vcpu)
|
|
|
{
|
|
|
struct vcpu_vmx *vmx = to_vmx(vcpu);
|
|
|
struct vmcs *shadow_vmcs;
|
|
|
+ int r;
|
|
|
|
|
|
- if (cpu_has_vmx_msr_bitmap()) {
|
|
|
- vmx->nested.msr_bitmap =
|
|
|
- (unsigned long *)__get_free_page(GFP_KERNEL);
|
|
|
- if (!vmx->nested.msr_bitmap)
|
|
|
- goto out_msr_bitmap;
|
|
|
- }
|
|
|
+ r = alloc_loaded_vmcs(&vmx->nested.vmcs02);
|
|
|
+ if (r < 0)
|
|
|
+ goto out_vmcs02;
|
|
|
|
|
|
vmx->nested.cached_vmcs12 = kmalloc(VMCS12_SIZE, GFP_KERNEL);
|
|
|
if (!vmx->nested.cached_vmcs12)
|
|
@@ -7264,9 +7334,6 @@ static int enter_vmx_operation(struct kvm_vcpu *vcpu)
|
|
|
vmx->vmcs01.shadow_vmcs = shadow_vmcs;
|
|
|
}
|
|
|
|
|
|
- INIT_LIST_HEAD(&(vmx->nested.vmcs02_pool));
|
|
|
- vmx->nested.vmcs02_num = 0;
|
|
|
-
|
|
|
hrtimer_init(&vmx->nested.preemption_timer, CLOCK_MONOTONIC,
|
|
|
HRTIMER_MODE_REL_PINNED);
|
|
|
vmx->nested.preemption_timer.function = vmx_preemption_timer_fn;
|
|
@@ -7278,9 +7345,9 @@ out_shadow_vmcs:
|
|
|
kfree(vmx->nested.cached_vmcs12);
|
|
|
|
|
|
out_cached_vmcs12:
|
|
|
- free_page((unsigned long)vmx->nested.msr_bitmap);
|
|
|
+ free_loaded_vmcs(&vmx->nested.vmcs02);
|
|
|
|
|
|
-out_msr_bitmap:
|
|
|
+out_vmcs02:
|
|
|
return -ENOMEM;
|
|
|
}
|
|
|
|
|
@@ -7423,10 +7490,6 @@ static void free_nested(struct vcpu_vmx *vmx)
|
|
|
free_vpid(vmx->nested.vpid02);
|
|
|
vmx->nested.posted_intr_nv = -1;
|
|
|
vmx->nested.current_vmptr = -1ull;
|
|
|
- if (vmx->nested.msr_bitmap) {
|
|
|
- free_page((unsigned long)vmx->nested.msr_bitmap);
|
|
|
- vmx->nested.msr_bitmap = NULL;
|
|
|
- }
|
|
|
if (enable_shadow_vmcs) {
|
|
|
vmx_disable_shadow_vmcs(vmx);
|
|
|
vmcs_clear(vmx->vmcs01.shadow_vmcs);
|
|
@@ -7434,7 +7497,7 @@ static void free_nested(struct vcpu_vmx *vmx)
|
|
|
vmx->vmcs01.shadow_vmcs = NULL;
|
|
|
}
|
|
|
kfree(vmx->nested.cached_vmcs12);
|
|
|
- /* Unpin physical memory we referred to in current vmcs02 */
|
|
|
+ /* Unpin physical memory we referred to in the vmcs02 */
|
|
|
if (vmx->nested.apic_access_page) {
|
|
|
kvm_release_page_dirty(vmx->nested.apic_access_page);
|
|
|
vmx->nested.apic_access_page = NULL;
|
|
@@ -7450,7 +7513,7 @@ static void free_nested(struct vcpu_vmx *vmx)
|
|
|
vmx->nested.pi_desc = NULL;
|
|
|
}
|
|
|
|
|
|
- nested_free_all_saved_vmcss(vmx);
|
|
|
+ free_loaded_vmcs(&vmx->nested.vmcs02);
|
|
|
}
|
|
|
|
|
|
/* Emulate the VMXOFF instruction */
|
|
@@ -7493,8 +7556,6 @@ static int handle_vmclear(struct kvm_vcpu *vcpu)
|
|
|
vmptr + offsetof(struct vmcs12, launch_state),
|
|
|
&zero, sizeof(zero));
|
|
|
|
|
|
- nested_free_vmcs02(vmx, vmptr);
|
|
|
-
|
|
|
nested_vmx_succeed(vcpu);
|
|
|
return kvm_skip_emulated_instruction(vcpu);
|
|
|
}
|
|
@@ -8406,10 +8467,11 @@ static bool nested_vmx_exit_reflected(struct kvm_vcpu *vcpu, u32 exit_reason)
|
|
|
|
|
|
/*
|
|
|
* The host physical addresses of some pages of guest memory
|
|
|
- * are loaded into VMCS02 (e.g. L1's Virtual APIC Page). The CPU
|
|
|
- * may write to these pages via their host physical address while
|
|
|
- * L2 is running, bypassing any address-translation-based dirty
|
|
|
- * tracking (e.g. EPT write protection).
|
|
|
+ * are loaded into the vmcs02 (e.g. vmcs12's Virtual APIC
|
|
|
+ * Page). The CPU may write to these pages via their host
|
|
|
+ * physical address while L2 is running, bypassing any
|
|
|
+ * address-translation-based dirty tracking (e.g. EPT write
|
|
|
+ * protection).
|
|
|
*
|
|
|
* Mark them dirty on every exit from L2 to prevent them from
|
|
|
* getting out of sync with dirty tracking.
|
|
@@ -8943,7 +9005,7 @@ static void vmx_set_virtual_x2apic_mode(struct kvm_vcpu *vcpu, bool set)
|
|
|
}
|
|
|
vmcs_write32(SECONDARY_VM_EXEC_CONTROL, sec_exec_control);
|
|
|
|
|
|
- vmx_set_msr_bitmap(vcpu);
|
|
|
+ vmx_update_msr_bitmap(vcpu);
|
|
|
}
|
|
|
|
|
|
static void vmx_set_apic_access_page_addr(struct kvm_vcpu *vcpu, hpa_t hpa)
|
|
@@ -9373,6 +9435,15 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
|
|
|
|
|
|
vmx_arm_hv_timer(vcpu);
|
|
|
|
|
|
+ /*
|
|
|
+ * If this vCPU has touched SPEC_CTRL, restore the guest's value if
|
|
|
+ * it's non-zero. Since vmentry is serialising on affected CPUs, there
|
|
|
+ * is no need to worry about the conditional branch over the wrmsr
|
|
|
+ * being speculatively taken.
|
|
|
+ */
|
|
|
+ if (vmx->spec_ctrl)
|
|
|
+ wrmsrl(MSR_IA32_SPEC_CTRL, vmx->spec_ctrl);
|
|
|
+
|
|
|
vmx->__launched = vmx->loaded_vmcs->launched;
|
|
|
asm(
|
|
|
/* Store host registers */
|
|
@@ -9491,6 +9562,27 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
|
|
|
#endif
|
|
|
);
|
|
|
|
|
|
+ /*
|
|
|
+ * We do not use IBRS in the kernel. If this vCPU has used the
|
|
|
+ * SPEC_CTRL MSR it may have left it on; save the value and
|
|
|
+ * turn it off. This is much more efficient than blindly adding
|
|
|
+ * it to the atomic save/restore list. Especially as the former
|
|
|
+ * (Saving guest MSRs on vmexit) doesn't even exist in KVM.
|
|
|
+ *
|
|
|
+ * For non-nested case:
|
|
|
+ * If the L01 MSR bitmap does not intercept the MSR, then we need to
|
|
|
+ * save it.
|
|
|
+ *
|
|
|
+ * For nested case:
|
|
|
+ * If the L02 MSR bitmap does not intercept the MSR, then we need to
|
|
|
+ * save it.
|
|
|
+ */
|
|
|
+ if (!msr_write_intercepted(vcpu, MSR_IA32_SPEC_CTRL))
|
|
|
+ rdmsrl(MSR_IA32_SPEC_CTRL, vmx->spec_ctrl);
|
|
|
+
|
|
|
+ if (vmx->spec_ctrl)
|
|
|
+ wrmsrl(MSR_IA32_SPEC_CTRL, 0);
|
|
|
+
|
|
|
/* Eliminate branch target predictions from guest mode */
|
|
|
vmexit_fill_RSB();
|
|
|
|
|
@@ -9604,6 +9696,7 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id)
|
|
|
{
|
|
|
int err;
|
|
|
struct vcpu_vmx *vmx = kmem_cache_zalloc(kvm_vcpu_cache, GFP_KERNEL);
|
|
|
+ unsigned long *msr_bitmap;
|
|
|
int cpu;
|
|
|
|
|
|
if (!vmx)
|
|
@@ -9636,13 +9729,20 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id)
|
|
|
if (!vmx->guest_msrs)
|
|
|
goto free_pml;
|
|
|
|
|
|
- vmx->loaded_vmcs = &vmx->vmcs01;
|
|
|
- vmx->loaded_vmcs->vmcs = alloc_vmcs();
|
|
|
- vmx->loaded_vmcs->shadow_vmcs = NULL;
|
|
|
- if (!vmx->loaded_vmcs->vmcs)
|
|
|
+ err = alloc_loaded_vmcs(&vmx->vmcs01);
|
|
|
+ if (err < 0)
|
|
|
goto free_msrs;
|
|
|
- loaded_vmcs_init(vmx->loaded_vmcs);
|
|
|
|
|
|
+ msr_bitmap = vmx->vmcs01.msr_bitmap;
|
|
|
+ vmx_disable_intercept_for_msr(msr_bitmap, MSR_FS_BASE, MSR_TYPE_RW);
|
|
|
+ vmx_disable_intercept_for_msr(msr_bitmap, MSR_GS_BASE, MSR_TYPE_RW);
|
|
|
+ vmx_disable_intercept_for_msr(msr_bitmap, MSR_KERNEL_GS_BASE, MSR_TYPE_RW);
|
|
|
+ vmx_disable_intercept_for_msr(msr_bitmap, MSR_IA32_SYSENTER_CS, MSR_TYPE_RW);
|
|
|
+ vmx_disable_intercept_for_msr(msr_bitmap, MSR_IA32_SYSENTER_ESP, MSR_TYPE_RW);
|
|
|
+ vmx_disable_intercept_for_msr(msr_bitmap, MSR_IA32_SYSENTER_EIP, MSR_TYPE_RW);
|
|
|
+ vmx->msr_bitmap_mode = 0;
|
|
|
+
|
|
|
+ vmx->loaded_vmcs = &vmx->vmcs01;
|
|
|
cpu = get_cpu();
|
|
|
vmx_vcpu_load(&vmx->vcpu, cpu);
|
|
|
vmx->vcpu.cpu = cpu;
|
|
@@ -10105,10 +10205,25 @@ static inline bool nested_vmx_merge_msr_bitmap(struct kvm_vcpu *vcpu,
|
|
|
int msr;
|
|
|
struct page *page;
|
|
|
unsigned long *msr_bitmap_l1;
|
|
|
- unsigned long *msr_bitmap_l0 = to_vmx(vcpu)->nested.msr_bitmap;
|
|
|
+ unsigned long *msr_bitmap_l0 = to_vmx(vcpu)->nested.vmcs02.msr_bitmap;
|
|
|
+ /*
|
|
|
+ * pred_cmd & spec_ctrl are trying to verify two things:
|
|
|
+ *
|
|
|
+ * 1. L0 gave permission to L1 to actually pass through the MSR. This
|
|
|
+ * ensures that we do not accidentally generate an L02 MSR bitmap
|
|
|
+ * from the L12 MSR bitmap that is too permissive.
|
|
|
+ * 2. That L1 or L2s have actually used the MSR. This avoids
|
|
|
+ * unnecessary merging of the bitmap if the MSR is unused. This
|
|
|
+ * works properly because we only update the L01 MSR bitmap lazily.
|
|
|
+ * So even if L0 should pass L1 these MSRs, the L01 bitmap is only
|
|
|
+ * updated to reflect this when L1 (or its L2s) actually write to
|
|
|
+ * the MSR.
|
|
|
+ */
|
|
|
+ bool pred_cmd = msr_write_intercepted_l01(vcpu, MSR_IA32_PRED_CMD);
|
|
|
+ bool spec_ctrl = msr_write_intercepted_l01(vcpu, MSR_IA32_SPEC_CTRL);
|
|
|
|
|
|
- /* This shortcut is ok because we support only x2APIC MSRs so far. */
|
|
|
- if (!nested_cpu_has_virt_x2apic_mode(vmcs12))
|
|
|
+ if (!nested_cpu_has_virt_x2apic_mode(vmcs12) &&
|
|
|
+ !pred_cmd && !spec_ctrl)
|
|
|
return false;
|
|
|
|
|
|
page = kvm_vcpu_gpa_to_page(vcpu, vmcs12->msr_bitmap);
|
|
@@ -10141,6 +10256,19 @@ static inline bool nested_vmx_merge_msr_bitmap(struct kvm_vcpu *vcpu,
|
|
|
MSR_TYPE_W);
|
|
|
}
|
|
|
}
|
|
|
+
|
|
|
+ if (spec_ctrl)
|
|
|
+ nested_vmx_disable_intercept_for_msr(
|
|
|
+ msr_bitmap_l1, msr_bitmap_l0,
|
|
|
+ MSR_IA32_SPEC_CTRL,
|
|
|
+ MSR_TYPE_R | MSR_TYPE_W);
|
|
|
+
|
|
|
+ if (pred_cmd)
|
|
|
+ nested_vmx_disable_intercept_for_msr(
|
|
|
+ msr_bitmap_l1, msr_bitmap_l0,
|
|
|
+ MSR_IA32_PRED_CMD,
|
|
|
+ MSR_TYPE_W);
|
|
|
+
|
|
|
kunmap(page);
|
|
|
kvm_release_page_clean(page);
|
|
|
|
|
@@ -10682,6 +10810,9 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
|
|
|
if (kvm_has_tsc_control)
|
|
|
decache_tsc_multiplier(vmx);
|
|
|
|
|
|
+ if (cpu_has_vmx_msr_bitmap())
|
|
|
+ vmcs_write64(MSR_BITMAP, __pa(vmx->nested.vmcs02.msr_bitmap));
|
|
|
+
|
|
|
if (enable_vpid) {
|
|
|
/*
|
|
|
* There is no direct mapping between vpid02 and vpid12, the
|
|
@@ -10903,20 +11034,15 @@ static int enter_vmx_non_root_mode(struct kvm_vcpu *vcpu, bool from_vmentry)
|
|
|
{
|
|
|
struct vcpu_vmx *vmx = to_vmx(vcpu);
|
|
|
struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
|
|
|
- struct loaded_vmcs *vmcs02;
|
|
|
u32 msr_entry_idx;
|
|
|
u32 exit_qual;
|
|
|
|
|
|
- vmcs02 = nested_get_current_vmcs02(vmx);
|
|
|
- if (!vmcs02)
|
|
|
- return -ENOMEM;
|
|
|
-
|
|
|
enter_guest_mode(vcpu);
|
|
|
|
|
|
if (!(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS))
|
|
|
vmx->nested.vmcs01_debugctl = vmcs_read64(GUEST_IA32_DEBUGCTL);
|
|
|
|
|
|
- vmx_switch_vmcs(vcpu, vmcs02);
|
|
|
+ vmx_switch_vmcs(vcpu, &vmx->nested.vmcs02);
|
|
|
vmx_segment_cache_clear(vmx);
|
|
|
|
|
|
if (prepare_vmcs02(vcpu, vmcs12, from_vmentry, &exit_qual)) {
|
|
@@ -11485,7 +11611,7 @@ static void load_vmcs12_host_state(struct kvm_vcpu *vcpu,
|
|
|
vmcs_write64(GUEST_IA32_DEBUGCTL, 0);
|
|
|
|
|
|
if (cpu_has_vmx_msr_bitmap())
|
|
|
- vmx_set_msr_bitmap(vcpu);
|
|
|
+ vmx_update_msr_bitmap(vcpu);
|
|
|
|
|
|
if (nested_vmx_load_msr(vcpu, vmcs12->vm_exit_msr_load_addr,
|
|
|
vmcs12->vm_exit_msr_load_count))
|
|
@@ -11534,10 +11660,6 @@ static void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason,
|
|
|
vm_exit_controls_reset_shadow(vmx);
|
|
|
vmx_segment_cache_clear(vmx);
|
|
|
|
|
|
- /* if no vmcs02 cache requested, remove the one we used */
|
|
|
- if (VMCS02_POOL_SIZE == 0)
|
|
|
- nested_free_vmcs02(vmx, vmx->nested.current_vmptr);
|
|
|
-
|
|
|
/* Update any VMCS fields that might have changed while L2 ran */
|
|
|
vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, vmx->msr_autoload.nr);
|
|
|
vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, vmx->msr_autoload.nr);
|