7 years ago · 9d67121a4f
--- a/Documentation/virtual/kvm/api.txt
+++ b/Documentation/virtual/kvm/api.txt
@@ -1922,6 +1922,7 @@ registers, find a list below:
 
				   PPC   | KVM_REG_PPC_TIDR              | 64
			
 
				   PPC   | KVM_REG_PPC_PSSCR             | 64
			
 
				   PPC   | KVM_REG_PPC_DEC_EXPIRY        | 64
			
 
				+  PPC   | KVM_REG_PPC_PTCR              | 64
			
 
				   PPC   | KVM_REG_PPC_TM_GPR0           | 64
			
 
				           ...
			
 
				   PPC   | KVM_REG_PPC_TM_GPR31          | 64
			
--- a/arch/powerpc/include/asm/asm-prototypes.h
+++ b/arch/powerpc/include/asm/asm-prototypes.h
@@ -150,4 +150,25 @@ extern s32 patch__memset_nocache, patch__memcpy_nocache;
 
				 
			
 
				 extern long flush_count_cache;
			
 
				 
			
 
				+#ifdef CONFIG_PPC_TRANSACTIONAL_MEM /* prototypes only; no definitions in this header */
			
 
				+void kvmppc_save_tm_hv(struct kvm_vcpu *vcpu, u64 msr, bool preserve_nv);
			
 
				+void kvmppc_restore_tm_hv(struct kvm_vcpu *vcpu, u64 msr, bool preserve_nv);
			
 
				+#else /* !CONFIG_PPC_TRANSACTIONAL_MEM: empty inline stubs, so calls do nothing */
			
 
				+static inline void kvmppc_save_tm_hv(struct kvm_vcpu *vcpu, u64 msr,
			
 
				+				     bool preserve_nv) { }
			
 
				+static inline void kvmppc_restore_tm_hv(struct kvm_vcpu *vcpu, u64 msr,
			
 
				+					bool preserve_nv) { }
			
 
				+#endif /* CONFIG_PPC_TRANSACTIONAL_MEM */
			
 
				+
			
 
				+void kvmhv_save_host_pmu(void);
			
 
				+void kvmhv_load_host_pmu(void);
			
 
				+void kvmhv_save_guest_pmu(struct kvm_vcpu *vcpu, bool pmu_in_use);
			
 
				+void kvmhv_load_guest_pmu(struct kvm_vcpu *vcpu);
			
 
				+
			
 
				+int __kvmhv_vcpu_entry_p9(struct kvm_vcpu *vcpu);
			
 
				+
			
 
				+long kvmppc_h_set_dabr(struct kvm_vcpu *vcpu, unsigned long dabr);
			
 
				+long kvmppc_h_set_xdabr(struct kvm_vcpu *vcpu, unsigned long dabr,
			
 
				+			unsigned long dabrx);
			
 
				+
			
 
				 #endif /* _ASM_POWERPC_ASM_PROTOTYPES_H */
			
--- a/arch/powerpc/include/asm/book3s/64/mmu-hash.h
+++ b/arch/powerpc/include/asm/book3s/64/mmu-hash.h
@@ -203,6 +203,18 @@ static inline unsigned int mmu_psize_to_shift(unsigned int mmu_psize)
 
				 	BUG();
			
 
				 }
			
 
				 
			
 
				+static inline unsigned int ap_to_shift(unsigned long ap)
			
 
				+{
			
 
				+	int psize;
			
 
				+	/*
			
 
				+	 * Return the shift of the first mmu_psize_defs[] entry whose AP
			
 
				+	 * encoding equals @ap.  Entries 0 .. MMU_PAGE_COUNT - 1 are
			
 
				+	 * scanned in index order.
			
 
				+	 *
			
 
				+	 * NOTE(review): the no-match path returns -1 from a function
			
 
				+	 * declared unsigned int, so callers see (unsigned int)-1 and
			
 
				+	 * must test for that value (or assign the result to an int).
			
 
				+	 */
			
 
				+	for (psize = 0; psize < MMU_PAGE_COUNT; psize++) {
			
 
				+		if (mmu_psize_defs[psize].ap == ap)
			
 
				+			return mmu_psize_defs[psize].shift;
			
 
				+	}
			
 
				+
			
 
				+	return -1;
			
 
				+}
			
 
				+
			
 
				 static inline unsigned long get_sllp_encoding(int psize)
			
 
				 {
			
 
				 	unsigned long sllp;
			
--- a/arch/powerpc/include/asm/book3s/64/tlbflush-radix.h
+++ b/arch/powerpc/include/asm/book3s/64/tlbflush-radix.h
@@ -53,6 +53,7 @@ extern void radix__flush_tlb_lpid_page(unsigned int lpid,
 
				 					unsigned long addr,
			
 
				 					unsigned long page_size);
			
 
				 extern void radix__flush_pwc_lpid(unsigned int lpid);
			
 
				+extern void radix__flush_tlb_lpid(unsigned int lpid);
			
 
				 extern void radix__local_flush_tlb_lpid(unsigned int lpid);
			
 
				 extern void radix__local_flush_tlb_lpid_guest(unsigned int lpid);
			
 
				 
			
--- a/arch/powerpc/include/asm/hvcall.h
+++ b/arch/powerpc/include/asm/hvcall.h
@@ -322,6 +322,11 @@
 
				 #define H_GET_24X7_DATA		0xF07C
			
 
				 #define H_GET_PERF_COUNTER_INFO	0xF080
			
 
				 
			
 
				+/* Platform-specific hcalls used for nested HV KVM */
			
 
				+#define H_SET_PARTITION_TABLE	0xF800
			
 
				+#define H_ENTER_NESTED		0xF804
			
 
				+#define H_TLB_INVALIDATE	0xF808
			
 
				+
			
 
				 /* Values for 2nd argument to H_SET_MODE */
			
 
				 #define H_SET_MODE_RESOURCE_SET_CIABR		1
			
 
				 #define H_SET_MODE_RESOURCE_SET_DAWR		2
			
@@ -461,6 +466,42 @@ struct h_cpu_char_result {
 
				 	u64 behaviour;
			
 
				 };
			
 
				 
			
 
				+/*
			
 
				+ * Register state for entering a nested guest with H_ENTER_NESTED.
			
 
				+ *
			
 
				+ * The version field identifies the layout of this structure; the
			
 
				+ * newest layout is HV_GUEST_STATE_VERSION.  Every member has a fixed
			
 
				+ * width (u32, u64 or s64).
			
 
				+ *
			
 
				+ * NOTE(review): presumably this layout is shared with the hypervisor
			
 
				+ * that services H_ENTER_NESTED, so treat it as ABI -- confirm before
			
 
				+ * reordering or resizing fields.
			
 
				+ */
			
 
				+struct hv_guest_state {
			
 
				+	u64 version;		/* version of this structure layout */
			
 
				+	u32 lpid;
			
 
				+	u32 vcpu_token;
			
 
				+	/* These registers are hypervisor privileged (at least for writing) */
			
 
				+	u64 lpcr;
			
 
				+	u64 pcr;
			
 
				+	u64 amor;
			
 
				+	u64 dpdes;
			
 
				+	u64 hfscr;
			
 
				+	s64 tb_offset;		/* the only signed member */
			
 
				+	u64 dawr0;
			
 
				+	u64 dawrx0;
			
 
				+	u64 ciabr;
			
 
				+	u64 hdec_expiry;
			
 
				+	u64 purr;
			
 
				+	u64 spurr;
			
 
				+	u64 ic;
			
 
				+	u64 vtb;
			
 
				+	u64 hdar;
			
 
				+	u64 hdsisr;
			
 
				+	u64 heir;
			
 
				+	u64 asdr;
			
 
				+	/* These are OS privileged but need to be set late in guest entry */
			
 
				+	u64 srr0;
			
 
				+	u64 srr1;
			
 
				+	u64 sprg[4];
			
 
				+	u64 pidr;
			
 
				+	u64 cfar;
			
 
				+	u64 ppr;
			
 
				+};
			
 
				+
			
 
				+/* Latest version of hv_guest_state structure */
			
 
				+#define HV_GUEST_STATE_VERSION	1
			
 
				+
			
 
				 #endif /* __ASSEMBLY__ */
			
 
				 #endif /* __KERNEL__ */
			
 
				 #endif /* _ASM_POWERPC_HVCALL_H */
			
--- a/arch/powerpc/include/asm/kvm_asm.h
+++ b/arch/powerpc/include/asm/kvm_asm.h
@@ -84,7 +84,6 @@
 
				 #define BOOK3S_INTERRUPT_INST_STORAGE	0x400
			
 
				 #define BOOK3S_INTERRUPT_INST_SEGMENT	0x480
			
 
				 #define BOOK3S_INTERRUPT_EXTERNAL	0x500
			
 
				-#define BOOK3S_INTERRUPT_EXTERNAL_LEVEL	0x501
			
 
				 #define BOOK3S_INTERRUPT_EXTERNAL_HV	0x502
			
 
				 #define BOOK3S_INTERRUPT_ALIGNMENT	0x600
			
 
				 #define BOOK3S_INTERRUPT_PROGRAM	0x700
			
@@ -134,8 +133,7 @@
 
				 #define BOOK3S_IRQPRIO_EXTERNAL			14
			
 
				 #define BOOK3S_IRQPRIO_DECREMENTER		15
			
 
				 #define BOOK3S_IRQPRIO_PERFORMANCE_MONITOR	16
			
 
				-#define BOOK3S_IRQPRIO_EXTERNAL_LEVEL		17
			
 
				-#define BOOK3S_IRQPRIO_MAX			18
			
 
				+#define BOOK3S_IRQPRIO_MAX			17
			
 
				 
			
 
				 #define BOOK3S_HFLAG_DCBZ32			0x1
			
 
				 #define BOOK3S_HFLAG_SLB			0x2
			
--- a/arch/powerpc/include/asm/kvm_book3s.h
+++ b/arch/powerpc/include/asm/kvm_book3s.h
@@ -188,14 +188,37 @@ extern int kvmppc_book3s_hcall_implemented(struct kvm *kvm, unsigned long hc);
 
				 extern int kvmppc_book3s_radix_page_fault(struct kvm_run *run,
			
 
				 			struct kvm_vcpu *vcpu,
			
 
				 			unsigned long ea, unsigned long dsisr);
			
 
				+extern int kvmppc_mmu_walk_radix_tree(struct kvm_vcpu *vcpu, gva_t eaddr,
			
 
				+				      struct kvmppc_pte *gpte, u64 root,
			
 
				+				      u64 *pte_ret_p);
			
 
				+extern int kvmppc_mmu_radix_translate_table(struct kvm_vcpu *vcpu, gva_t eaddr,
			
 
				+			struct kvmppc_pte *gpte, u64 table,
			
 
				+			int table_index, u64 *pte_ret_p);
			
 
				 extern int kvmppc_mmu_radix_xlate(struct kvm_vcpu *vcpu, gva_t eaddr,
			
 
				 			struct kvmppc_pte *gpte, bool data, bool iswrite);
			
 
				+extern void kvmppc_unmap_pte(struct kvm *kvm, pte_t *pte, unsigned long gpa,
			
 
				+			unsigned int shift, struct kvm_memory_slot *memslot,
			
 
				+			unsigned int lpid);
			
 
				+extern bool kvmppc_hv_handle_set_rc(struct kvm *kvm, pgd_t *pgtable,
			
 
				+				    bool writing, unsigned long gpa,
			
 
				+				    unsigned int lpid);
			
 
				+extern int kvmppc_book3s_instantiate_page(struct kvm_vcpu *vcpu,
			
 
				+				unsigned long gpa,
			
 
				+				struct kvm_memory_slot *memslot,
			
 
				+				bool writing, bool kvm_ro,
			
 
				+				pte_t *inserted_pte, unsigned int *levelp);
			
 
				 extern int kvmppc_init_vm_radix(struct kvm *kvm);
			
 
				 extern void kvmppc_free_radix(struct kvm *kvm);
			
 
				+extern void kvmppc_free_pgtable_radix(struct kvm *kvm, pgd_t *pgd,
			
 
				+				      unsigned int lpid);
			
 
				 extern int kvmppc_radix_init(void);
			
 
				 extern void kvmppc_radix_exit(void);
			
 
				 extern int kvm_unmap_radix(struct kvm *kvm, struct kvm_memory_slot *memslot,
			
 
				 			unsigned long gfn);
			
 
				+extern void kvmppc_unmap_pte(struct kvm *kvm, pte_t *pte,
			
 
				+			     unsigned long gpa, unsigned int shift,
			
 
				+			     struct kvm_memory_slot *memslot,
			
 
				+			     unsigned int lpid); /* NOTE(review): repeats the kvmppc_unmap_pte() prototype added earlier in this header; one copy is redundant */
			
 
				 extern int kvm_age_radix(struct kvm *kvm, struct kvm_memory_slot *memslot,
			
 
				 			unsigned long gfn);
			
 
				 extern int kvm_test_age_radix(struct kvm *kvm, struct kvm_memory_slot *memslot,
			
@@ -271,6 +294,21 @@ static inline void kvmppc_save_tm_sprs(struct kvm_vcpu *vcpu) {}
 
				 static inline void kvmppc_restore_tm_sprs(struct kvm_vcpu *vcpu) {}
			
 
				 #endif
			
 
				 
			
 
				+long kvmhv_nested_init(void);
			
 
				+void kvmhv_nested_exit(void);
			
 
				+void kvmhv_vm_nested_init(struct kvm *kvm);
			
 
				+long kvmhv_set_partition_table(struct kvm_vcpu *vcpu);
			
 
				+void kvmhv_set_ptbl_entry(unsigned int lpid, u64 dw0, u64 dw1);
			
 
				+void kvmhv_release_all_nested(struct kvm *kvm);
			
 
				+long kvmhv_enter_nested_guest(struct kvm_vcpu *vcpu);
			
 
				+long kvmhv_do_nested_tlbie(struct kvm_vcpu *vcpu);
			
 
				+int kvmhv_run_single_vcpu(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu,
			
 
				+			  u64 time_limit, unsigned long lpcr);
			
 
				+void kvmhv_save_hv_regs(struct kvm_vcpu *vcpu, struct hv_guest_state *hr);
			
 
				+void kvmhv_restore_hv_return_state(struct kvm_vcpu *vcpu,
			
 
				+				   struct hv_guest_state *hr);
			
 
				+long int kvmhv_nested_page_fault(struct kvm_vcpu *vcpu);
			
 
				+
			
 
				 void kvmppc_giveup_fac(struct kvm_vcpu *vcpu, ulong fac);
			
 
				 
			
 
				 extern int kvm_irq_bypass;
			
@@ -301,12 +339,12 @@ static inline ulong kvmppc_get_gpr(struct kvm_vcpu *vcpu, int num)
 
				 
			
 
				 static inline void kvmppc_set_cr(struct kvm_vcpu *vcpu, u32 val)
			
 
				 {
			
 
				-	vcpu->arch.cr = val;
			
 
				+	vcpu->arch.regs.ccr = val;
			
 
				 }
			
 
				 
			
 
				 static inline u32 kvmppc_get_cr(struct kvm_vcpu *vcpu)
			
 
				 {
			
 
				-	return vcpu->arch.cr;
			
 
				+	return vcpu->arch.regs.ccr;
			
 
				 }
			
 
				 
			
 
				 static inline void kvmppc_set_xer(struct kvm_vcpu *vcpu, ulong val)
			
@@ -384,9 +422,6 @@ extern int kvmppc_h_logical_ci_store(struct kvm_vcpu *vcpu);
 
				 /* TO = 31 for unconditional trap */
			
 
				 #define INS_TW				0x7fe00008
			
 
				 
			
 
				-/* LPIDs we support with this build -- runtime limit may be lower */
			
 
				-#define KVMPPC_NR_LPIDS			(LPID_RSVD + 1)
			
 
				-
			
 
				 #define SPLIT_HACK_MASK			0xff000000
			
 
				 #define SPLIT_HACK_OFFS			0xfb000000
			
 
				 
			
--- a/arch/powerpc/include/asm/kvm_book3s_64.h
+++ b/arch/powerpc/include/asm/kvm_book3s_64.h
@@ -23,6 +23,108 @@
 
				 #include <linux/string.h>
			
 
				 #include <asm/bitops.h>
			
 
				 #include <asm/book3s/64/mmu-hash.h>
			
 
				+#include <asm/cpu_has_feature.h>
			
 
				+#include <asm/ppc-opcode.h>
			
 
				+
			
 
				+#ifdef CONFIG_PPC_PSERIES /* pseries support built in: decide from the CPU feature bits */
			
 
				+static inline bool kvmhv_on_pseries(void)
			
 
				+{
			
 
				+	return !cpu_has_feature(CPU_FTR_HVMODE); /* true when CPU_FTR_HVMODE is not set */
			
 
				+}
			
 
				+#else /* !CONFIG_PPC_PSERIES: constant false */
			
 
				+static inline bool kvmhv_on_pseries(void)
			
 
				+{
			
 
				+	return false;
			
 
				+}
			
 
				+#endif /* CONFIG_PPC_PSERIES */
			
 
				+
			
 
				+/*
			
 
				+ * Structure for a nested guest, that is, for a guest that is managed by
			
 
				+ * one of our guests.
			
 
				+ */
			
 
				+struct kvm_nested_guest {
			
 
				+	struct kvm *l1_host;		/* L1 VM that owns this nested guest */
			
 
				+	int l1_lpid;			/* lpid L1 guest thinks this guest is */
			
 
				+	int shadow_lpid;		/* real lpid of this nested guest */
			
 
				+	pgd_t *shadow_pgtable;		/* our page table for this guest */
			
 
				+	u64 l1_gr_to_hr;		/* L1's addr of part'n-scoped table */
			
 
				+	u64 process_table;		/* process table entry for this guest */
			
 
				+	long refcnt;			/* number of pointers to this struct */
			
 
				+	struct mutex tlb_lock;		/* serialize page faults and tlbies */
			
 
				+	struct kvm_nested_guest *next;	/* link to another nested guest; list owner not shown here */
			
 
				+	cpumask_t need_tlb_flush;	/* NOTE(review): presumably CPUs that must flush before running this guest -- confirm */
			
 
				+	cpumask_t cpu_in_guest;		/* NOTE(review): presumably CPUs currently running this guest -- confirm */
			
 
				+	short prev_cpu[NR_CPUS];	/* NR_CPUS entries; NOTE(review): index and value meaning not visible here */
			
 
				+};
			
 
				+
			
 
				+/*
			
 
				+ * We define a nested rmap entry as a single 64-bit quantity
			
 
				+ * 0xFFF0000000000000	12-bit lpid field
			
 
				+ * 0x000FFFFFFFFFF000	40-bit guest 4k page frame number
			
 
				+ * 0x0000000000000001	1-bit  single entry flag
			
 
				+ */
			
 
				+#define RMAP_NESTED_LPID_MASK		0xFFF0000000000000UL
			
 
				+#define RMAP_NESTED_LPID_SHIFT		(52)
			
 
				+#define RMAP_NESTED_GPA_MASK		0x000FFFFFFFFFF000UL
			
 
				+#define RMAP_NESTED_IS_SINGLE_ENTRY	0x0000000000000001UL
			
 
				+
			
 
				+/*
			
 
				+ * Structure for a nested guest rmap entry: one llist node carrying a
			
 
				+ * single 64-bit rmap value (fields are given by the RMAP_NESTED_* masks).
			
 
				+ */
			
 
				+struct rmap_nested {
			
 
				+	struct llist_node list;	/* linkage followed by for_each_nest_rmap_safe() */
			
 
				+	u64 rmap;		/* encoded entry, see RMAP_NESTED_* */
			
 
				+};
			
 
				+
			
 
				+/*
			
 
				+ * for_each_nest_rmap_safe - iterate over the list of nested rmap entries
			
 
				+ *			     safe against removal of the list entry or NULL list
			
 
				+ * @pos:	a (struct rmap_nested *) to use as a loop cursor
			
 
				+ *		NOTE: this is NULL while the body runs for a "single entry"
			
 
				+ * @node:	pointer to the first entry
			
 
				+ *		NOTE: this can be NULL
			
 
				+ *		NOTE: this must be a modifiable lvalue; the macro assigns to it,
			
 
				+ *		moving it on to the next entry before the loop body runs
			
 
				+ * @rmapp:	an (unsigned long *) in which to return the rmap entries on each
			
 
				+ *		iteration
			
 
				+ *		NOTE: this must point to already allocated memory
			
 
				+ *
			
 
				+ * The nested_rmap is a llist of (struct rmap_nested) entries pointed to by the
			
 
				+ * rmap entry in the memslot. The list is always terminated by a "single entry"
			
 
				+ * stored in the list element of the final entry of the llist. If there is ONLY
			
 
				+ * a single entry then this is itself in the rmap entry of the memslot, not a
			
 
				+ * llist head pointer.
			
 
				+ *
			
 
				+ * A "single entry" is recognised by RMAP_NESTED_IS_SINGLE_ENTRY being set in
			
 
				+ * the value of @node itself; in that case @node is the rmap value and the
			
 
				+ * iteration ends after it.
			
 
				+ *
			
 
				+ * Note that the iterator below assumes that a nested rmap entry is always
			
 
				+ * non-zero.  This is true for our usage because the LPID field is always
			
 
				+ * non-zero (zero is reserved for the host).
			
 
				+ *
			
 
				+ * This should be used to iterate over the list of rmap_nested entries with
			
 
				+ * processing done on the u64 rmap value given by each iteration. This is safe
			
 
				+ * against removal of list entries and it is always safe to call free on (pos).
			
 
				+ *
			
 
				+ * e.g.
			
 
				+ * struct rmap_nested *cursor;
			
 
				+ * struct llist_node *first;
			
 
				+ * unsigned long rmap;
			
 
				+ * for_each_nest_rmap_safe(cursor, first, &rmap) {
			
 
				+ *	do_something(rmap);
			
 
				+ *	free(cursor);
			
 
				+ * }
			
 
				+ */
			
 
				+#define for_each_nest_rmap_safe(pos, node, rmapp)			       \
			
 
				+	for ((pos) = llist_entry((node), typeof(*(pos)), list);		       \
			
 
				+	     (node) &&							       \
			
 
				+	     (*(rmapp) = ((RMAP_NESTED_IS_SINGLE_ENTRY & ((u64) (node))) ?     \
			
 
				+			  ((u64) (node)) : ((pos)->rmap))) &&		       \
			
 
				+	     (((node) = ((RMAP_NESTED_IS_SINGLE_ENTRY & ((u64) (node))) ?      \
			
 
				+			 ((struct llist_node *) ((pos) = NULL)) :	       \
			
 
				+			 (pos)->list.next)), true);			       \
			
 
				+	     (pos) = llist_entry((node), typeof(*(pos)), list))
			
 
				+
			
 
				+struct kvm_nested_guest *kvmhv_get_nested(struct kvm *kvm, int l1_lpid,
			
 
				+					  bool create);
			
 
				+void kvmhv_put_nested(struct kvm_nested_guest *gp);
			
 
				+int kvmhv_nested_next_lpid(struct kvm *kvm, int lpid);
			
 
				+
			
 
				+/* Encoding of first parameter for H_TLB_INVALIDATE */
			
 
				+#define H_TLBIE_P1_ENC(ric, prs, r)	(___PPC_RIC(ric) | ___PPC_PRS(prs) | \
			
 
				+					 ___PPC_R(r))
			
 
				 
			
 
				 /* Power architecture requires HPT is at least 256kiB, at most 64TiB */
			
 
				 #define PPC_MIN_HPT_ORDER	18
			
@@ -435,6 +537,7 @@ static inline struct kvm_memslots *kvm_memslots_raw(struct kvm *kvm)
 
				 }
			
 
				 
			
 
				 extern void kvmppc_mmu_debugfs_init(struct kvm *kvm);
			
 
				+extern void kvmhv_radix_debugfs_init(struct kvm *kvm);
			
 
				 
			
 
				 extern void kvmhv_rm_send_ipi(int cpu);
			
 
				 
			
@@ -482,7 +585,7 @@ static inline u64 sanitize_msr(u64 msr)
 
				 #ifdef CONFIG_PPC_TRANSACTIONAL_MEM
			
 
				 static inline void copy_from_checkpoint(struct kvm_vcpu *vcpu)
			
 
				 {
			
 
				-	vcpu->arch.cr  = vcpu->arch.cr_tm;
			
 
				+	vcpu->arch.regs.ccr  = vcpu->arch.cr_tm;
			
 
				 	vcpu->arch.regs.xer = vcpu->arch.xer_tm;
			
 
				 	vcpu->arch.regs.link  = vcpu->arch.lr_tm;
			
 
				 	vcpu->arch.regs.ctr = vcpu->arch.ctr_tm;
			
@@ -499,7 +602,7 @@ static inline void copy_from_checkpoint(struct kvm_vcpu *vcpu)
 
				 
			
 
				 static inline void copy_to_checkpoint(struct kvm_vcpu *vcpu)
			
 
				 {
			
 
				-	vcpu->arch.cr_tm  = vcpu->arch.cr;
			
 
				+	vcpu->arch.cr_tm  = vcpu->arch.regs.ccr;
			
 
				 	vcpu->arch.xer_tm = vcpu->arch.regs.xer;
			
 
				 	vcpu->arch.lr_tm  = vcpu->arch.regs.link;
			
 
				 	vcpu->arch.ctr_tm = vcpu->arch.regs.ctr;
			
@@ -515,6 +618,17 @@ static inline void copy_to_checkpoint(struct kvm_vcpu *vcpu)
 
				 }
			
 
				 #endif /* CONFIG_PPC_TRANSACTIONAL_MEM */
			
 
				 
			
 
				+extern int kvmppc_create_pte(struct kvm *kvm, pgd_t *pgtable, pte_t pte,
			
 
				+			     unsigned long gpa, unsigned int level,
			
 
				+			     unsigned long mmu_seq, unsigned int lpid,
			
 
				+			     unsigned long *rmapp, struct rmap_nested **n_rmap);
			
 
				+extern void kvmhv_insert_nest_rmap(struct kvm *kvm, unsigned long *rmapp,
			
 
				+				   struct rmap_nested **n_rmap);
			
 
				+extern void kvmhv_remove_nest_rmap_range(struct kvm *kvm,
			
 
				+				struct kvm_memory_slot *memslot,
			
 
				+				unsigned long gpa, unsigned long hpa,
			
 
				+				unsigned long nbytes);
			
 
				+
			
 
				 #endif /* CONFIG_KVM_BOOK3S_HV_POSSIBLE */
			
 
				 
			
 
				 #endif /* __ASM_KVM_BOOK3S_64_H__ */
			
--- a/arch/powerpc/include/asm/kvm_book3s_asm.h
+++ b/arch/powerpc/include/asm/kvm_book3s_asm.h
@@ -25,6 +25,9 @@
 
				 #define XICS_MFRR		0xc
			
 
				 #define XICS_IPI		2	/* interrupt source # for IPIs */
			
 
				 
			
 
				+/* LPIDs we support with this build -- runtime limit may be lower */
			
 
				+#define KVMPPC_NR_LPIDS			(LPID_RSVD + 1)
			
 
				+
			
 
				 /* Maximum number of threads per physical core */
			
 
				 #define MAX_SMT_THREADS		8
			
 
				 
			
--- a/arch/powerpc/include/asm/kvm_booke.h
+++ b/arch/powerpc/include/asm/kvm_booke.h
@@ -46,12 +46,12 @@ static inline ulong kvmppc_get_gpr(struct kvm_vcpu *vcpu, int num)
 
				 
			
 
				 static inline void kvmppc_set_cr(struct kvm_vcpu *vcpu, u32 val)
			
 
				 {
			
 
				-	vcpu->arch.cr = val;
			
 
				+	vcpu->arch.regs.ccr = val;
			
 
				 }
			
 
				 
			
 
				 static inline u32 kvmppc_get_cr(struct kvm_vcpu *vcpu)
			
 
				 {
			
 
				-	return vcpu->arch.cr;
			
 
				+	return vcpu->arch.regs.ccr;
			
 
				 }
			
 
				 
			
 
				 static inline void kvmppc_set_xer(struct kvm_vcpu *vcpu, ulong val)
			
--- a/arch/powerpc/include/asm/kvm_host.h
+++ b/arch/powerpc/include/asm/kvm_host.h
@@ -46,6 +46,7 @@
 
				 #ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
			
 
				 #include <asm/kvm_book3s_asm.h>		/* for MAX_SMT_THREADS */
			
 
				 #define KVM_MAX_VCPU_ID		(MAX_SMT_THREADS * KVM_MAX_VCORES)
			
 
				+#define KVM_MAX_NESTED_GUESTS	KVMPPC_NR_LPIDS
			
 
				 
			
 
				 #else
			
 
				 #define KVM_MAX_VCPU_ID		KVM_MAX_VCPUS
			
@@ -94,6 +95,7 @@ struct dtl_entry;
 
				 
			
 
				 struct kvmppc_vcpu_book3s;
			
 
				 struct kvmppc_book3s_shadow_vcpu;
			
 
				+struct kvm_nested_guest;
			
 
				 
			
 
				 struct kvm_vm_stat {
			
 
				 	ulong remote_tlb_flush;
			
@@ -287,10 +289,12 @@ struct kvm_arch {
 
				 	u8 radix;
			
 
				 	u8 fwnmi_enabled;
			
 
				 	bool threads_indep;
			
 
				+	bool nested_enable;
			
 
				 	pgd_t *pgtable;
			
 
				 	u64 process_table;
			
 
				 	struct dentry *debugfs_dir;
			
 
				 	struct dentry *htab_dentry;
			
 
				+	struct dentry *radix_dentry;
			
 
				 	struct kvm_resize_hpt *resize_hpt; /* protected by kvm->lock */
			
 
				 #endif /* CONFIG_KVM_BOOK3S_HV_POSSIBLE */
			
 
				 #ifdef CONFIG_KVM_BOOK3S_PR_POSSIBLE
			
@@ -311,6 +315,9 @@ struct kvm_arch {
 
				 #endif
			
 
				 	struct kvmppc_ops *kvm_ops;
			
 
				 #ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
			
 
				+	u64 l1_ptcr;
			
 
				+	int max_nested_lpid;
			
 
				+	struct kvm_nested_guest *nested_guests[KVM_MAX_NESTED_GUESTS];
			
 
				 	/* This array can grow quite large, keep it at the end */
			
 
				 	struct kvmppc_vcore *vcores[KVM_MAX_VCORES];
			
 
				 #endif
			
@@ -360,7 +367,9 @@ struct kvmppc_pte {
 
				 	bool may_write		: 1;
			
 
				 	bool may_execute	: 1;
			
 
				 	unsigned long wimg;
			
 
				+	unsigned long rc;
			
 
				 	u8 page_size;		/* MMU_PAGE_xxx */
			
 
				+	u8 page_shift;
			
 
				 };
			
 
				 
			
 
				 struct kvmppc_mmu {
			
@@ -537,8 +546,6 @@ struct kvm_vcpu_arch {
 
				 	ulong tar;
			
 
				 #endif
			
 
				 
			
 
				-	u32 cr;
			
 
				-
			
 
				 #ifdef CONFIG_PPC_BOOK3S
			
 
				 	ulong hflags;
			
 
				 	ulong guest_owned_ext;
			
@@ -707,6 +714,7 @@ struct kvm_vcpu_arch {
 
				 	u8 hcall_needed;
			
 
				 	u8 epr_flags; /* KVMPPC_EPR_xxx */
			
 
				 	u8 epr_needed;
			
 
				+	u8 external_oneshot;	/* clear external irq after delivery */
			
 
				 
			
 
				 	u32 cpr0_cfgaddr; /* holds the last set cpr0_cfgaddr */
			
 
				 
			
@@ -781,6 +789,10 @@ struct kvm_vcpu_arch {
 
				 	u32 emul_inst;
			
 
				 
			
 
				 	u32 online;
			
 
				+
			
 
				+	/* For support of nested guests */
			
 
				+	struct kvm_nested_guest *nested;
			
 
				+	u32 nested_vcpu_id;
			
 
				 #endif
			
 
				 
			
 
				 #ifdef CONFIG_KVM_BOOK3S_HV_EXIT_TIMING
			
--- a/arch/powerpc/include/asm/kvm_ppc.h
+++ b/arch/powerpc/include/asm/kvm_ppc.h
@@ -194,9 +194,7 @@ extern struct kvmppc_spapr_tce_table *kvmppc_find_table(
 
				 		(iommu_tce_check_ioba((stt)->page_shift, (stt)->offset, \
			
 
				 				(stt)->size, (ioba), (npages)) ?        \
			
 
				 				H_PARAMETER : H_SUCCESS)
			
 
				-extern long kvmppc_tce_validate(struct kvmppc_spapr_tce_table *tt,
			
 
				-		unsigned long tce);
			
 
				-extern long kvmppc_gpa_to_ua(struct kvm *kvm, unsigned long gpa,
			
 
				+extern long kvmppc_tce_to_ua(struct kvm *kvm, unsigned long tce,
			
 
				 		unsigned long *ua, unsigned long **prmap);
			
 
				 extern void kvmppc_tce_put(struct kvmppc_spapr_tce_table *tt,
			
 
				 		unsigned long idx, unsigned long tce);
			
@@ -585,6 +583,7 @@ extern int kvmppc_xive_set_icp(struct kvm_vcpu *vcpu, u64 icpval);
 
				 
			
 
				 extern int kvmppc_xive_set_irq(struct kvm *kvm, int irq_source_id, u32 irq,
			
 
				 			       int level, bool line_status);
			
 
				+extern void kvmppc_xive_push_vcpu(struct kvm_vcpu *vcpu);
			
 
				 #else
			
 
				 static inline int kvmppc_xive_set_xive(struct kvm *kvm, u32 irq, u32 server,
			
 
				 				       u32 priority) { return -1; }
			
@@ -607,6 +606,7 @@ static inline int kvmppc_xive_set_icp(struct kvm_vcpu *vcpu, u64 icpval) { retur
 
				 
			
 
				 static inline int kvmppc_xive_set_irq(struct kvm *kvm, int irq_source_id, u32 irq,
			
 
				 				      int level, bool line_status) { return -ENODEV; }
			
 
				+static inline void kvmppc_xive_push_vcpu(struct kvm_vcpu *vcpu) { }
			
 
				 #endif /* CONFIG_KVM_XIVE */
			
 
				 
			
 
				 /*
			
@@ -652,6 +652,7 @@ int kvmppc_rm_h_ipi(struct kvm_vcpu *vcpu, unsigned long server,
 
				                     unsigned long mfrr);
			
 
				 int kvmppc_rm_h_cppr(struct kvm_vcpu *vcpu, unsigned long cppr);
			
 
				 int kvmppc_rm_h_eoi(struct kvm_vcpu *vcpu, unsigned long xirr);
			
 
				+void kvmppc_guest_entry_inject_int(struct kvm_vcpu *vcpu);
			
 
				 
			
 
				 /*
			
 
				  * Host-side operations we want to set up while running in real
			
--- a/arch/powerpc/include/asm/ppc-opcode.h
+++ b/arch/powerpc/include/asm/ppc-opcode.h
@@ -104,6 +104,7 @@
 
				 #define OP_31_XOP_LHZUX     311
			
 
				 #define OP_31_XOP_MSGSNDP   142
			
 
				 #define OP_31_XOP_MSGCLRP   174
			
 
				+#define OP_31_XOP_TLBIE     306
			
 
				 #define OP_31_XOP_MFSPR     339
			
 
				 #define OP_31_XOP_LWAX      341
			
 
				 #define OP_31_XOP_LHAX      343
			
--- a/arch/powerpc/include/asm/reg.h
+++ b/arch/powerpc/include/asm/reg.h
@@ -415,6 +415,7 @@
 
				 #define   HFSCR_DSCR	__MASK(FSCR_DSCR_LG)
			
 
				 #define   HFSCR_VECVSX	__MASK(FSCR_VECVSX_LG)
			
 
				 #define   HFSCR_FP	__MASK(FSCR_FP_LG)
			
 
				+#define   HFSCR_INTR_CAUSE (ASM_CONST(0xFF) << 56)	/* interrupt cause */
			
 
				 #define SPRN_TAR	0x32f	/* Target Address Register */
			
 
				 #define SPRN_LPCR	0x13E	/* LPAR Control Register */
			
 
				 #define   LPCR_VPM0		ASM_CONST(0x8000000000000000)
			
@@ -766,6 +767,7 @@
 
				 #define SPRN_HSRR0	0x13A	/* Save/Restore Register 0 */
			
 
				 #define SPRN_HSRR1	0x13B	/* Save/Restore Register 1 */
			
 
				 #define   HSRR1_DENORM		0x00100000 /* Denorm exception */
			
 
				+#define   HSRR1_HISI_WRITE	0x00010000 /* HISI bcs couldn't update mem */
			
 
				 
			
 
				 #define SPRN_TBCTL	0x35f	/* PA6T Timebase control register */
			
 
				 #define   TBCTL_FREEZE		0x0000000000000000ull /* Freeze all tbs */
			
--- a/arch/powerpc/include/uapi/asm/kvm.h
+++ b/arch/powerpc/include/uapi/asm/kvm.h
@@ -634,6 +634,7 @@ struct kvm_ppc_cpu_char {
 
				 
			
 
				 #define KVM_REG_PPC_DEC_EXPIRY	(KVM_REG_PPC | KVM_REG_SIZE_U64 | 0xbe)
			
 
				 #define KVM_REG_PPC_ONLINE	(KVM_REG_PPC | KVM_REG_SIZE_U32 | 0xbf)
			
 
				+#define KVM_REG_PPC_PTCR	(KVM_REG_PPC | KVM_REG_SIZE_U64 | 0xc0)
			
 
				 
			
 
				 /* Transactional Memory checkpointed state:
			
 
				  * This is all GPRs, all VSX regs and a subset of SPRs
			
--- a/arch/powerpc/kernel/asm-offsets.c
+++ b/arch/powerpc/kernel/asm-offsets.c
@@ -438,7 +438,7 @@ int main(void)
 
				 #ifdef CONFIG_PPC_BOOK3S
			
 
				 	OFFSET(VCPU_TAR, kvm_vcpu, arch.tar);
			
 
				 #endif
			
 
				-	OFFSET(VCPU_CR, kvm_vcpu, arch.cr);
			
 
				+	OFFSET(VCPU_CR, kvm_vcpu, arch.regs.ccr);
			
 
				 	OFFSET(VCPU_PC, kvm_vcpu, arch.regs.nip);
			
 
				 #ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
			
 
				 	OFFSET(VCPU_MSR, kvm_vcpu, arch.shregs.msr);
			
@@ -503,6 +503,7 @@ int main(void)
 
				 	OFFSET(VCPU_VPA, kvm_vcpu, arch.vpa.pinned_addr);
			
 
				 	OFFSET(VCPU_VPA_DIRTY, kvm_vcpu, arch.vpa.dirty);
			
 
				 	OFFSET(VCPU_HEIR, kvm_vcpu, arch.emul_inst);
			
 
				+	OFFSET(VCPU_NESTED, kvm_vcpu, arch.nested);
			
 
				 	OFFSET(VCPU_CPU, kvm_vcpu, cpu);
			
 
				 	OFFSET(VCPU_THREAD_CPU, kvm_vcpu, arch.thread_cpu);
			
 
				 #endif
			
@@ -695,7 +696,7 @@ int main(void)
 
				 #endif /* CONFIG_PPC_BOOK3S_64 */
			
 
				 
			
 
				 #else /* CONFIG_PPC_BOOK3S */
			
 
				-	OFFSET(VCPU_CR, kvm_vcpu, arch.cr);
			
 
				+	OFFSET(VCPU_CR, kvm_vcpu, arch.regs.ccr);
			
 
				 	OFFSET(VCPU_XER, kvm_vcpu, arch.regs.xer);
			
 
				 	OFFSET(VCPU_LR, kvm_vcpu, arch.regs.link);
			
 
				 	OFFSET(VCPU_CTR, kvm_vcpu, arch.regs.ctr);
			
--- a/arch/powerpc/kernel/cpu_setup_power.S
+++ b/arch/powerpc/kernel/cpu_setup_power.S
@@ -147,8 +147,8 @@ __init_hvmode_206:
 
				 	rldicl.	r0,r3,4,63
			
 
				 	bnelr
			
 
				 	ld	r5,CPU_SPEC_FEATURES(r4)
			
 
				-	LOAD_REG_IMMEDIATE(r6,CPU_FTR_HVMODE)
			
 
				-	xor	r5,r5,r6
			
 
				+	LOAD_REG_IMMEDIATE(r6,CPU_FTR_HVMODE | CPU_FTR_P9_TM_HV_ASSIST)
			
 
				+	andc	r5,r5,r6
			
 
				 	std	r5,CPU_SPEC_FEATURES(r4)
			
 
				 	blr
			
 
				 
			
--- a/arch/powerpc/kvm/Makefile
+++ b/arch/powerpc/kvm/Makefile
@@ -75,7 +75,8 @@ kvm-hv-y += \
 
				 	book3s_hv.o \
			
 
				 	book3s_hv_interrupts.o \
			
 
				 	book3s_64_mmu_hv.o \
			
 
				-	book3s_64_mmu_radix.o
			
 
				+	book3s_64_mmu_radix.o \
			
 
				+	book3s_hv_nested.o
			
 
				 
			
 
				 kvm-hv-$(CONFIG_PPC_TRANSACTIONAL_MEM) += \
			
 
				 	book3s_hv_tm.o
			
--- a/arch/powerpc/kvm/book3s.c
+++ b/arch/powerpc/kvm/book3s.c
@@ -153,7 +153,6 @@ static int kvmppc_book3s_vec2irqprio(unsigned int vec)
 
				 	case 0x400: prio = BOOK3S_IRQPRIO_INST_STORAGE;		break;
			
 
				 	case 0x480: prio = BOOK3S_IRQPRIO_INST_SEGMENT;		break;
			
 
				 	case 0x500: prio = BOOK3S_IRQPRIO_EXTERNAL;		break;
			
 
				-	case 0x501: prio = BOOK3S_IRQPRIO_EXTERNAL_LEVEL;	break;
			
 
				 	case 0x600: prio = BOOK3S_IRQPRIO_ALIGNMENT;		break;
			
 
				 	case 0x700: prio = BOOK3S_IRQPRIO_PROGRAM;		break;
			
 
				 	case 0x800: prio = BOOK3S_IRQPRIO_FP_UNAVAIL;		break;
			
@@ -239,18 +238,35 @@ EXPORT_SYMBOL_GPL(kvmppc_core_dequeue_dec);
 
				 void kvmppc_core_queue_external(struct kvm_vcpu *vcpu,
			
 
				                                 struct kvm_interrupt *irq)
			
 
				 {
			
 
				-	unsigned int vec = BOOK3S_INTERRUPT_EXTERNAL;
			
 
				-
			
 
				-	if (irq->irq == KVM_INTERRUPT_SET_LEVEL)
			
 
				-		vec = BOOK3S_INTERRUPT_EXTERNAL_LEVEL;
			
 
				+	/*
			
 
				+	 * This case (KVM_INTERRUPT_SET) should never actually arise for
			
 
				+	 * a pseries guest (because pseries guests expect their interrupt
			
 
				+	 * controllers to continue asserting an external interrupt request
			
 
				+	 * until it is acknowledged at the interrupt controller), but is
			
 
				+	 * included to avoid ABI breakage and potentially for other
			
 
				+	 * sorts of guest.
			
 
				+	 *
			
 
				+	 * There is a subtlety here: HV KVM does not test the
			
 
				+	 * external_oneshot flag in the code that synthesizes
			
 
				+	 * external interrupts for the guest just before entering
			
 
				+	 * the guest.  That is OK even if userspace did do a
			
 
				+	 * KVM_INTERRUPT_SET on a pseries guest vcpu, because the
			
 
				+	 * caller (kvm_vcpu_ioctl_interrupt) does a kvm_vcpu_kick()
			
 
				+	 * which ends up doing a smp_send_reschedule(), which will
			
 
				+	 * pull the guest all the way out to the host, meaning that
			
 
				+	 * we will call kvmppc_core_prepare_to_enter() before entering
			
 
				+	 * the guest again, and that will handle the external_oneshot
			
 
				+	 * flag correctly.
			
 
				+	 */
			
 
				+	if (irq->irq == KVM_INTERRUPT_SET)
			
 
				+		vcpu->arch.external_oneshot = 1;
			
 
				 
			
 
				-	kvmppc_book3s_queue_irqprio(vcpu, vec);
			
 
				+	kvmppc_book3s_queue_irqprio(vcpu, BOOK3S_INTERRUPT_EXTERNAL);
			
 
				 }
			
 
				 
			
 
				 void kvmppc_core_dequeue_external(struct kvm_vcpu *vcpu)
			
 
				 {
			
 
				 	kvmppc_book3s_dequeue_irqprio(vcpu, BOOK3S_INTERRUPT_EXTERNAL);
			
 
				-	kvmppc_book3s_dequeue_irqprio(vcpu, BOOK3S_INTERRUPT_EXTERNAL_LEVEL);
			
 
				 }
			
 
				 
			
 
				 void kvmppc_core_queue_data_storage(struct kvm_vcpu *vcpu, ulong dar,
			
@@ -281,7 +297,6 @@ static int kvmppc_book3s_irqprio_deliver(struct kvm_vcpu *vcpu,
 
				 		vec = BOOK3S_INTERRUPT_DECREMENTER;
			
 
				 		break;
			
 
				 	case BOOK3S_IRQPRIO_EXTERNAL:
			
 
				-	case BOOK3S_IRQPRIO_EXTERNAL_LEVEL:
			
 
				 		deliver = (kvmppc_get_msr(vcpu) & MSR_EE) && !crit;
			
 
				 		vec = BOOK3S_INTERRUPT_EXTERNAL;
			
 
				 		break;
			
@@ -355,8 +370,16 @@ static bool clear_irqprio(struct kvm_vcpu *vcpu, unsigned int priority)
 
				 		case BOOK3S_IRQPRIO_DECREMENTER:
			
 
				 			/* DEC interrupts get cleared by mtdec */
			
 
				 			return false;
			
 
				-		case BOOK3S_IRQPRIO_EXTERNAL_LEVEL:
			
 
				-			/* External interrupts get cleared by userspace */
			
 
				+		case BOOK3S_IRQPRIO_EXTERNAL:
			
 
				+			/*
			
 
				+			 * External interrupts get cleared by userspace
			
 
				+			 * except when set by the KVM_INTERRUPT ioctl with
			
 
				+			 * KVM_INTERRUPT_SET (not KVM_INTERRUPT_SET_LEVEL).
			
 
				+			 */
			
 
				+			if (vcpu->arch.external_oneshot) {
			
 
				+				vcpu->arch.external_oneshot = 0;
			
 
				+				return true;
			
 
				+			}
			
 
				 			return false;
			
 
				 	}
			
 
				 
			
--- a/arch/powerpc/kvm/book3s_64_mmu_hv.c
+++ b/arch/powerpc/kvm/book3s_64_mmu_hv.c
@@ -268,14 +268,13 @@ int kvmppc_mmu_hv_init(void)
 
				 {
			
 
				 	unsigned long host_lpid, rsvd_lpid;
			
 
				 
			
 
				-	if (!cpu_has_feature(CPU_FTR_HVMODE))
			
 
				-		return -EINVAL;
			
 
				-
			
 
				 	if (!mmu_has_feature(MMU_FTR_LOCKLESS_TLBIE))
			
 
				 		return -EINVAL;
			
 
				 
			
 
				 	/* POWER7 has 10-bit LPIDs (12-bit in POWER8) */
			
 
				-	host_lpid = mfspr(SPRN_LPID);
			
 
				+	host_lpid = 0;
			
 
				+	if (cpu_has_feature(CPU_FTR_HVMODE))
			
 
				+		host_lpid = mfspr(SPRN_LPID);
			
 
				 	rsvd_lpid = LPID_RSVD;
			
 
				 
			
 
				 	kvmppc_init_lpid(rsvd_lpid + 1);
			
--- a/arch/powerpc/kvm/book3s_64_mmu_radix.c
+++ b/arch/powerpc/kvm/book3s_64_mmu_radix.c
@@ -10,6 +10,9 @@
 
				 #include <linux/string.h>
			
 
				 #include <linux/kvm.h>
			
 
				 #include <linux/kvm_host.h>
			
 
				+#include <linux/anon_inodes.h>
			
 
				+#include <linux/file.h>
			
 
				+#include <linux/debugfs.h>
			
 
				 
			
 
				 #include <asm/kvm_ppc.h>
			
 
				 #include <asm/kvm_book3s.h>
			
@@ -26,87 +29,74 @@
 
				  */
			
 
				 static int p9_supported_radix_bits[4] = { 5, 9, 9, 13 };
			
 
				 
			
 
				-int kvmppc_mmu_radix_xlate(struct kvm_vcpu *vcpu, gva_t eaddr,
			
 
				-			   struct kvmppc_pte *gpte, bool data, bool iswrite)
			
 
				+int kvmppc_mmu_walk_radix_tree(struct kvm_vcpu *vcpu, gva_t eaddr,
			
 
				+			       struct kvmppc_pte *gpte, u64 root,
			
 
				+			       u64 *pte_ret_p)
			
 
				 {
			
 
				 	struct kvm *kvm = vcpu->kvm;
			
 
				-	u32 pid;
			
 
				 	int ret, level, ps;
			
 
				-	__be64 prte, rpte;
			
 
				-	unsigned long ptbl;
			
 
				-	unsigned long root, pte, index;
			
 
				-	unsigned long rts, bits, offset;
			
 
				-	unsigned long gpa;
			
 
				-	unsigned long proc_tbl_size;
			
 
				-
			
 
				-	/* Work out effective PID */
			
 
				-	switch (eaddr >> 62) {
			
 
				-	case 0:
			
 
				-		pid = vcpu->arch.pid;
			
 
				-		break;
			
 
				-	case 3:
			
 
				-		pid = 0;
			
 
				-		break;
			
 
				-	default:
			
 
				-		return -EINVAL;
			
 
				-	}
			
 
				-	proc_tbl_size = 1 << ((kvm->arch.process_table & PRTS_MASK) + 12);
			
 
				-	if (pid * 16 >= proc_tbl_size)
			
 
				-		return -EINVAL;
			
 
				-
			
 
				-	/* Read partition table to find root of tree for effective PID */
			
 
				-	ptbl = (kvm->arch.process_table & PRTB_MASK) + (pid * 16);
			
 
				-	ret = kvm_read_guest(kvm, ptbl, &prte, sizeof(prte));
			
 
				-	if (ret)
			
 
				-		return ret;
			
 
				+	unsigned long rts, bits, offset, index;
			
 
				+	u64 pte, base, gpa;
			
 
				+	__be64 rpte;
			
 
				 
			
 
				-	root = be64_to_cpu(prte);
			
 
				 	rts = ((root & RTS1_MASK) >> (RTS1_SHIFT - 3)) |
			
 
				 		((root & RTS2_MASK) >> RTS2_SHIFT);
			
 
				 	bits = root & RPDS_MASK;
			
 
				-	root = root & RPDB_MASK;
			
 
				+	base = root & RPDB_MASK;
			
 
				 
			
 
				 	offset = rts + 31;
			
 
				 
			
 
				-	/* current implementations only support 52-bit space */
			
 
				+	/* Current implementations only support 52-bit space */
			
 
				 	if (offset != 52)
			
 
				 		return -EINVAL;
			
 
				 
			
 
				+	/* Walk each level of the radix tree */
			
 
				 	for (level = 3; level >= 0; --level) {
			
 
				+		u64 addr;
			
 
				+		/* Check a valid size */
			
 
				 		if (level && bits != p9_supported_radix_bits[level])
			
 
				 			return -EINVAL;
			
 
				 		if (level == 0 && !(bits == 5 || bits == 9))
			
 
				 			return -EINVAL;
			
 
				 		offset -= bits;
			
 
				 		index = (eaddr >> offset) & ((1UL << bits) - 1);
			
 
				-		/* check that low bits of page table base are zero */
			
 
				-		if (root & ((1UL << (bits + 3)) - 1))
			
 
				+		/* Check that low bits of page table base are zero */
			
 
				+		if (base & ((1UL << (bits + 3)) - 1))
			
 
				 			return -EINVAL;
			
 
				-		ret = kvm_read_guest(kvm, root + index * 8,
			
 
				-				     &rpte, sizeof(rpte));
			
 
				-		if (ret)
			
 
				+		/* Read the entry from guest memory */
			
 
				+		addr = base + (index * sizeof(rpte));
			
 
				+		ret = kvm_read_guest(kvm, addr, &rpte, sizeof(rpte));
			
 
				+		if (ret) {
			
 
				+			if (pte_ret_p)
			
 
				+				*pte_ret_p = addr;
			
 
				 			return ret;
			
 
				+		}
			
 
				 		pte = __be64_to_cpu(rpte);
			
 
				 		if (!(pte & _PAGE_PRESENT))
			
 
				 			return -ENOENT;
			
 
				+		/* Check if a leaf entry */
			
 
				 		if (pte & _PAGE_PTE)
			
 
				 			break;
			
 
				-		bits = pte & 0x1f;
			
 
				-		root = pte & 0x0fffffffffffff00ul;
			
 
				+		/* Get ready to walk the next level */
			
 
				+		base = pte & RPDB_MASK;
			
 
				+		bits = pte & RPDS_MASK;
			
 
				 	}
			
 
				-	/* need a leaf at lowest level; 512GB pages not supported */
			
 
				+
			
 
				+	/* Need a leaf at lowest level; 512GB pages not supported */
			
 
				 	if (level < 0 || level == 3)
			
 
				 		return -EINVAL;
			
 
				 
			
 
				-	/* offset is now log base 2 of the page size */
			
 
				+	/* We found a valid leaf PTE */
			
 
				+	/* Offset is now log base 2 of the page size */
			
 
				 	gpa = pte & 0x01fffffffffff000ul;
			
 
				 	if (gpa & ((1ul << offset) - 1))
			
 
				 		return -EINVAL;
			
 
				-	gpa += eaddr & ((1ul << offset) - 1);
			
 
				+	gpa |= eaddr & ((1ul << offset) - 1);
			
 
				 	for (ps = MMU_PAGE_4K; ps < MMU_PAGE_COUNT; ++ps)
			
 
				 		if (offset == mmu_psize_defs[ps].shift)
			
 
				 			break;
			
 
				 	gpte->page_size = ps;
			
 
				+	gpte->page_shift = offset;
			
 
				 
			
 
				 	gpte->eaddr = eaddr;
			
 
				 	gpte->raddr = gpa;
			
@@ -115,6 +105,77 @@ int kvmppc_mmu_radix_xlate(struct kvm_vcpu *vcpu, gva_t eaddr,
 
				 	gpte->may_read = !!(pte & _PAGE_READ);
			
 
				 	gpte->may_write = !!(pte & _PAGE_WRITE);
			
 
				 	gpte->may_execute = !!(pte & _PAGE_EXEC);
			
 
				+
			
 
				+	gpte->rc = pte & (_PAGE_ACCESSED | _PAGE_DIRTY);
			
 
				+
			
 
				+	if (pte_ret_p)
			
 
				+		*pte_ret_p = pte;
			
 
				+
			
 
				+	return 0;
			
 
				+}
			
 
				+
			
 
				+/*
			
 
				+ * Used to walk a partition or process table radix tree in guest memory
			
 
				+ * Note: We exploit the fact that a partition table and a process
			
 
				+ * table have the same layout, a partition-scoped page table and a
			
 
				+ * process-scoped page table have the same layout, and the 2nd
			
 
				+ * doubleword of a partition table entry has the same layout as
			
 
				+ * the PTCR register.
			
 
				+ */
			
 
				+int kvmppc_mmu_radix_translate_table(struct kvm_vcpu *vcpu, gva_t eaddr,
			
 
				+				     struct kvmppc_pte *gpte, u64 table,
			
 
				+				     int table_index, u64 *pte_ret_p)
			
 
				+{
			
 
				+	struct kvm *kvm = vcpu->kvm;
			
 
				+	int ret;
			
 
				+	unsigned long size, ptbl, root;
			
 
				+	struct prtb_entry entry;
			
 
				+
			
 
				+	if ((table & PRTS_MASK) > 24)
			
 
				+		return -EINVAL;
			
 
				+	size = 1ul << ((table & PRTS_MASK) + 12);
			
 
				+
			
 
				+	/* Is the table big enough to contain this entry? */
			
 
				+	if ((table_index * sizeof(entry)) >= size)
			
 
				+		return -EINVAL;
			
 
				+
			
 
				+	/* Read the table to find the root of the radix tree */
			
 
				+	ptbl = (table & PRTB_MASK) + (table_index * sizeof(entry));
			
 
				+	ret = kvm_read_guest(kvm, ptbl, &entry, sizeof(entry));
			
 
				+	if (ret)
			
 
				+		return ret;
			
 
				+
			
 
				+	/* Root is stored in the first double word */
			
 
				+	root = be64_to_cpu(entry.prtb0);
			
 
				+
			
 
				+	return kvmppc_mmu_walk_radix_tree(vcpu, eaddr, gpte, root, pte_ret_p);
			
 
				+}
			
 
				+
			
 
				+int kvmppc_mmu_radix_xlate(struct kvm_vcpu *vcpu, gva_t eaddr,
			
 
				+			   struct kvmppc_pte *gpte, bool data, bool iswrite)
			
 
				+{
			
 
				+	u32 pid;
			
 
				+	u64 pte;
			
 
				+	int ret;
			
 
				+
			
 
				+	/* Work out effective PID */
			
 
				+	switch (eaddr >> 62) {
			
 
				+	case 0:
			
 
				+		pid = vcpu->arch.pid;
			
 
				+		break;
			
 
				+	case 3:
			
 
				+		pid = 0;
			
 
				+		break;
			
 
				+	default:
			
 
				+		return -EINVAL;
			
 
				+	}
			
 
				+
			
 
				+	ret = kvmppc_mmu_radix_translate_table(vcpu, eaddr, gpte,
			
 
				+				vcpu->kvm->arch.process_table, pid, &pte);
			
 
				+	if (ret)
			
 
				+		return ret;
			
 
				+
			
 
				+	/* Check privilege (applies only to process scoped translations) */
			
 
				 	if (kvmppc_get_msr(vcpu) & MSR_PR) {
			
 
				 		if (pte & _PAGE_PRIVILEGED) {
			
 
				 			gpte->may_read = 0;
			
@@ -137,20 +198,46 @@ int kvmppc_mmu_radix_xlate(struct kvm_vcpu *vcpu, gva_t eaddr,
 
				 }
			
 
				 
			
 
				 static void kvmppc_radix_tlbie_page(struct kvm *kvm, unsigned long addr,
			
 
				-				    unsigned int pshift)
			
 
				+				    unsigned int pshift, unsigned int lpid)
			
 
				 {
			
 
				 	unsigned long psize = PAGE_SIZE;
			
 
				+	int psi;
			
 
				+	long rc;
			
 
				+	unsigned long rb;
			
 
				 
			
 
				 	if (pshift)
			
 
				 		psize = 1UL << pshift;
			
 
				+	else
			
 
				+		pshift = PAGE_SHIFT;
			
 
				 
			
 
				 	addr &= ~(psize - 1);
			
 
				-	radix__flush_tlb_lpid_page(kvm->arch.lpid, addr, psize);
			
 
				+
			
 
				+	if (!kvmhv_on_pseries()) {
			
 
				+		radix__flush_tlb_lpid_page(lpid, addr, psize);
			
 
				+		return;
			
 
				+	}
			
 
				+
			
 
				+	psi = shift_to_mmu_psize(pshift);
			
 
				+	rb = addr | (mmu_get_ap(psi) << PPC_BITLSHIFT(58));
			
 
				+	rc = plpar_hcall_norets(H_TLB_INVALIDATE, H_TLBIE_P1_ENC(0, 0, 1),
			
 
				+				lpid, rb);
			
 
				+	if (rc)
			
 
				+		pr_err("KVM: TLB page invalidation hcall failed, rc=%ld\n", rc);
			
 
				 }
			
 
				 
			
 
				-static void kvmppc_radix_flush_pwc(struct kvm *kvm)
			
 
				+static void kvmppc_radix_flush_pwc(struct kvm *kvm, unsigned int lpid)
			
 
				 {
			
 
				-	radix__flush_pwc_lpid(kvm->arch.lpid);
			
 
				+	long rc;
			
 
				+
			
 
				+	if (!kvmhv_on_pseries()) {
			
 
				+		radix__flush_pwc_lpid(lpid);
			
 
				+		return;
			
 
				+	}
			
 
				+
			
 
				+	rc = plpar_hcall_norets(H_TLB_INVALIDATE, H_TLBIE_P1_ENC(1, 0, 1),
			
 
				+				lpid, TLBIEL_INVAL_SET_LPID);
			
 
				+	if (rc)
			
 
				+		pr_err("KVM: TLB PWC invalidation hcall failed, rc=%ld\n", rc);
			
 
				 }
			
 
				 
			
 
				 static unsigned long kvmppc_radix_update_pte(struct kvm *kvm, pte_t *ptep,
			
@@ -195,23 +282,38 @@ static void kvmppc_pmd_free(pmd_t *pmdp)
 
				 	kmem_cache_free(kvm_pmd_cache, pmdp);
			
 
				 }
			
 
				 
			
 
				-static void kvmppc_unmap_pte(struct kvm *kvm, pte_t *pte,
			
 
				-			     unsigned long gpa, unsigned int shift)
			
 
				+/* Called with kvm->mmu_lock held */
			
 
				+void kvmppc_unmap_pte(struct kvm *kvm, pte_t *pte, unsigned long gpa,
			
 
				+		      unsigned int shift, struct kvm_memory_slot *memslot,
			
 
				+		      unsigned int lpid)
			
 
				 
			
 
				 {
			
 
				-	unsigned long page_size = 1ul << shift;
			
 
				 	unsigned long old;
			
 
				+	unsigned long gfn = gpa >> PAGE_SHIFT;
			
 
				+	unsigned long page_size = PAGE_SIZE;
			
 
				+	unsigned long hpa;
			
 
				 
			
 
				 	old = kvmppc_radix_update_pte(kvm, pte, ~0UL, 0, gpa, shift);
			
 
				-	kvmppc_radix_tlbie_page(kvm, gpa, shift);
			
 
				-	if (old & _PAGE_DIRTY) {
			
 
				-		unsigned long gfn = gpa >> PAGE_SHIFT;
			
 
				-		struct kvm_memory_slot *memslot;
			
 
				+	kvmppc_radix_tlbie_page(kvm, gpa, shift, lpid);
			
 
				+
			
 
				+	/* The following only applies to L1 entries */
			
 
				+	if (lpid != kvm->arch.lpid)
			
 
				+		return;
			
 
				 
			
 
				+	if (!memslot) {
			
 
				 		memslot = gfn_to_memslot(kvm, gfn);
			
 
				-		if (memslot && memslot->dirty_bitmap)
			
 
				-			kvmppc_update_dirty_map(memslot, gfn, page_size);
			
 
				+		if (!memslot)
			
 
				+			return;
			
 
				 	}
			
 
				+	if (shift)
			
 
				+		page_size = 1ul << shift;
			
 
				+
			
 
				+	gpa &= ~(page_size - 1);
			
 
				+	hpa = old & PTE_RPN_MASK;
			
 
				+	kvmhv_remove_nest_rmap_range(kvm, memslot, gpa, hpa, page_size);
			
 
				+
			
 
				+	if ((old & _PAGE_DIRTY) && memslot->dirty_bitmap)
			
 
				+		kvmppc_update_dirty_map(memslot, gfn, page_size);
			
 
				 }
			
 
				 
			
 
				 /*
			
@@ -224,7 +326,8 @@ static void kvmppc_unmap_pte(struct kvm *kvm, pte_t *pte,
 
				  * and emit a warning if encountered, but there may already be data
			
 
				  * corruption due to the unexpected mappings.
			
 
				  */
			
 
				-static void kvmppc_unmap_free_pte(struct kvm *kvm, pte_t *pte, bool full)
			
 
				+static void kvmppc_unmap_free_pte(struct kvm *kvm, pte_t *pte, bool full,
			
 
				+				  unsigned int lpid)
			
 
				 {
			
 
				 	if (full) {
			
 
				 		memset(pte, 0, sizeof(long) << PTE_INDEX_SIZE);
			
@@ -238,14 +341,15 @@ static void kvmppc_unmap_free_pte(struct kvm *kvm, pte_t *pte, bool full)
 
				 			WARN_ON_ONCE(1);
			
 
				 			kvmppc_unmap_pte(kvm, p,
			
 
				 					 pte_pfn(*p) << PAGE_SHIFT,
			
 
				-					 PAGE_SHIFT);
			
 
				+					 PAGE_SHIFT, NULL, lpid);
			
 
				 		}
			
 
				 	}
			
 
				 
			
 
				 	kvmppc_pte_free(pte);
			
 
				 }
			
 
				 
			
 
				-static void kvmppc_unmap_free_pmd(struct kvm *kvm, pmd_t *pmd, bool full)
			
 
				+static void kvmppc_unmap_free_pmd(struct kvm *kvm, pmd_t *pmd, bool full,
			
 
				+				  unsigned int lpid)
			
 
				 {
			
 
				 	unsigned long im;
			
 
				 	pmd_t *p = pmd;
			
@@ -260,20 +364,21 @@ static void kvmppc_unmap_free_pmd(struct kvm *kvm, pmd_t *pmd, bool full)
 
				 				WARN_ON_ONCE(1);
			
 
				 				kvmppc_unmap_pte(kvm, (pte_t *)p,
			
 
				 					 pte_pfn(*(pte_t *)p) << PAGE_SHIFT,
			
 
				-					 PMD_SHIFT);
			
 
				+					 PMD_SHIFT, NULL, lpid);
			
 
				 			}
			
 
				 		} else {
			
 
				 			pte_t *pte;
			
 
				 
			
 
				 			pte = pte_offset_map(p, 0);
			
 
				-			kvmppc_unmap_free_pte(kvm, pte, full);
			
 
				+			kvmppc_unmap_free_pte(kvm, pte, full, lpid);
			
 
				 			pmd_clear(p);
			
 
				 		}
			
 
				 	}
			
 
				 	kvmppc_pmd_free(pmd);
			
 
				 }
			
 
				 
			
 
				-static void kvmppc_unmap_free_pud(struct kvm *kvm, pud_t *pud)
			
 
				+static void kvmppc_unmap_free_pud(struct kvm *kvm, pud_t *pud,
			
 
				+				  unsigned int lpid)
			
 
				 {
			
 
				 	unsigned long iu;
			
 
				 	pud_t *p = pud;
			
@@ -287,36 +392,40 @@ static void kvmppc_unmap_free_pud(struct kvm *kvm, pud_t *pud)
 
				 			pmd_t *pmd;
			
 
				 
			
 
				 			pmd = pmd_offset(p, 0);
			
 
				-			kvmppc_unmap_free_pmd(kvm, pmd, true);
			
 
				+			kvmppc_unmap_free_pmd(kvm, pmd, true, lpid);
			
 
				 			pud_clear(p);
			
 
				 		}
			
 
				 	}
			
 
				 	pud_free(kvm->mm, pud);
			
 
				 }
			
 
				 
			
 
				-void kvmppc_free_radix(struct kvm *kvm)
			
 
				+void kvmppc_free_pgtable_radix(struct kvm *kvm, pgd_t *pgd, unsigned int lpid)
			
 
				 {
			
 
				 	unsigned long ig;
			
 
				-	pgd_t *pgd;
			
 
				 
			
 
				-	if (!kvm->arch.pgtable)
			
 
				-		return;
			
 
				-	pgd = kvm->arch.pgtable;
			
 
				 	for (ig = 0; ig < PTRS_PER_PGD; ++ig, ++pgd) {
			
 
				 		pud_t *pud;
			
 
				 
			
 
				 		if (!pgd_present(*pgd))
			
 
				 			continue;
			
 
				 		pud = pud_offset(pgd, 0);
			
 
				-		kvmppc_unmap_free_pud(kvm, pud);
			
 
				+		kvmppc_unmap_free_pud(kvm, pud, lpid);
			
 
				 		pgd_clear(pgd);
			
 
				 	}
			
 
				-	pgd_free(kvm->mm, kvm->arch.pgtable);
			
 
				-	kvm->arch.pgtable = NULL;
			
 
				+}
			
 
				+
			
 
				+void kvmppc_free_radix(struct kvm *kvm)
			
 
				+{
			
 
				+	if (kvm->arch.pgtable) {
			
 
				+		kvmppc_free_pgtable_radix(kvm, kvm->arch.pgtable,
			
 
				+					  kvm->arch.lpid);
			
 
				+		pgd_free(kvm->mm, kvm->arch.pgtable);
			
 
				+		kvm->arch.pgtable = NULL;
			
 
				+	}
			
 
				 }
			
 
				 
			
 
				 static void kvmppc_unmap_free_pmd_entry_table(struct kvm *kvm, pmd_t *pmd,
			
 
				-					      unsigned long gpa)
			
 
				+					unsigned long gpa, unsigned int lpid)
			
 
				 {
			
 
				 	pte_t *pte = pte_offset_kernel(pmd, 0);
			
 
				 
			
@@ -326,13 +435,13 @@ static void kvmppc_unmap_free_pmd_entry_table(struct kvm *kvm, pmd_t *pmd,
 
				 	 * flushing the PWC again.
			
 
				 	 */
			
 
				 	pmd_clear(pmd);
			
 
				-	kvmppc_radix_flush_pwc(kvm);
			
 
				+	kvmppc_radix_flush_pwc(kvm, lpid);
			
 
				 
			
 
				-	kvmppc_unmap_free_pte(kvm, pte, false);
			
 
				+	kvmppc_unmap_free_pte(kvm, pte, false, lpid);
			
 
				 }
			
 
				 
			
 
				 static void kvmppc_unmap_free_pud_entry_table(struct kvm *kvm, pud_t *pud,
			
 
				-					unsigned long gpa)
			
 
				+					unsigned long gpa, unsigned int lpid)
			
 
				 {
			
 
				 	pmd_t *pmd = pmd_offset(pud, 0);
			
 
				 
			
@@ -342,9 +451,9 @@ static void kvmppc_unmap_free_pud_entry_table(struct kvm *kvm, pud_t *pud,
 
				 	 * so can be freed without flushing the PWC again.
			
 
				 	 */
			
 
				 	pud_clear(pud);
			
 
				-	kvmppc_radix_flush_pwc(kvm);
			
 
				+	kvmppc_radix_flush_pwc(kvm, lpid);
			
 
				 
			
 
				-	kvmppc_unmap_free_pmd(kvm, pmd, false);
			
 
				+	kvmppc_unmap_free_pmd(kvm, pmd, false, lpid);
			
 
				 }
			
 
				 
			
 
				 /*
			
@@ -356,8 +465,10 @@ static void kvmppc_unmap_free_pud_entry_table(struct kvm *kvm, pud_t *pud,
 
				  */
			
 
				 #define PTE_BITS_MUST_MATCH (~(_PAGE_WRITE | _PAGE_DIRTY | _PAGE_ACCESSED))
			
 
				 
			
 
				-static int kvmppc_create_pte(struct kvm *kvm, pte_t pte, unsigned long gpa,
			
 
				-			     unsigned int level, unsigned long mmu_seq)
			
 
				+int kvmppc_create_pte(struct kvm *kvm, pgd_t *pgtable, pte_t pte,
			
 
				+		      unsigned long gpa, unsigned int level,
			
 
				+		      unsigned long mmu_seq, unsigned int lpid,
			
 
				+		      unsigned long *rmapp, struct rmap_nested **n_rmap)
			
 
				 {
			
 
				 	pgd_t *pgd;
			
 
				 	pud_t *pud, *new_pud = NULL;
			
@@ -366,7 +477,7 @@ static int kvmppc_create_pte(struct kvm *kvm, pte_t pte, unsigned long gpa,
 
				 	int ret;
			
 
				 
			
 
				 	/* Traverse the guest's 2nd-level tree, allocate new levels needed */
			
 
				-	pgd = kvm->arch.pgtable + pgd_index(gpa);
			
 
				+	pgd = pgtable + pgd_index(gpa);
			
 
				 	pud = NULL;
			
 
				 	if (pgd_present(*pgd))
			
 
				 		pud = pud_offset(pgd, gpa);
			
@@ -423,7 +534,8 @@ static int kvmppc_create_pte(struct kvm *kvm, pte_t pte, unsigned long gpa,
 
				 			goto out_unlock;
			
 
				 		}
			
 
				 		/* Valid 1GB page here already, remove it */
			
 
				-		kvmppc_unmap_pte(kvm, (pte_t *)pud, hgpa, PUD_SHIFT);
			
 
				+		kvmppc_unmap_pte(kvm, (pte_t *)pud, hgpa, PUD_SHIFT, NULL,
			
 
				+				 lpid);
			
 
				 	}
			
 
				 	if (level == 2) {
			
 
				 		if (!pud_none(*pud)) {
			
@@ -432,9 +544,11 @@ static int kvmppc_create_pte(struct kvm *kvm, pte_t pte, unsigned long gpa,
 
				 			 * install a large page, so remove and free the page
			
 
				 			 * table page.
			
 
				 			 */
			
 
				-			kvmppc_unmap_free_pud_entry_table(kvm, pud, gpa);
			
 
				+			kvmppc_unmap_free_pud_entry_table(kvm, pud, gpa, lpid);
			
 
				 		}
			
 
				 		kvmppc_radix_set_pte_at(kvm, gpa, (pte_t *)pud, pte);
			
 
				+		if (rmapp && n_rmap)
			
 
				+			kvmhv_insert_nest_rmap(kvm, rmapp, n_rmap);
			
 
				 		ret = 0;
			
 
				 		goto out_unlock;
			
 
				 	}
			
@@ -458,7 +572,7 @@ static int kvmppc_create_pte(struct kvm *kvm, pte_t pte, unsigned long gpa,
 
				 			WARN_ON_ONCE((pmd_val(*pmd) ^ pte_val(pte)) &
			
 
				 							PTE_BITS_MUST_MATCH);
			
 
				 			kvmppc_radix_update_pte(kvm, pmdp_ptep(pmd),
			
 
				-					      0, pte_val(pte), lgpa, PMD_SHIFT);
			
 
				+					0, pte_val(pte), lgpa, PMD_SHIFT);
			
 
				 			ret = 0;
			
 
				 			goto out_unlock;
			
 
				 		}
			
@@ -472,7 +586,8 @@ static int kvmppc_create_pte(struct kvm *kvm, pte_t pte, unsigned long gpa,
 
				 			goto out_unlock;
			
 
				 		}
			
 
				 		/* Valid 2MB page here already, remove it */
			
 
				-		kvmppc_unmap_pte(kvm, pmdp_ptep(pmd), lgpa, PMD_SHIFT);
			
 
				+		kvmppc_unmap_pte(kvm, pmdp_ptep(pmd), lgpa, PMD_SHIFT, NULL,
			
 
				+				 lpid);
			
 
				 	}
			
 
				 	if (level == 1) {
			
 
				 		if (!pmd_none(*pmd)) {
			
@@ -481,9 +596,11 @@ static int kvmppc_create_pte(struct kvm *kvm, pte_t pte, unsigned long gpa,
 
				 			 * install a large page, so remove and free the page
			
 
				 			 * table page.
			
 
				 			 */
			
 
				-			kvmppc_unmap_free_pmd_entry_table(kvm, pmd, gpa);
			
 
				+			kvmppc_unmap_free_pmd_entry_table(kvm, pmd, gpa, lpid);
			
 
				 		}
			
 
				 		kvmppc_radix_set_pte_at(kvm, gpa, pmdp_ptep(pmd), pte);
			
 
				+		if (rmapp && n_rmap)
			
 
				+			kvmhv_insert_nest_rmap(kvm, rmapp, n_rmap);
			
 
				 		ret = 0;
			
 
				 		goto out_unlock;
			
 
				 	}
			
@@ -508,6 +625,8 @@ static int kvmppc_create_pte(struct kvm *kvm, pte_t pte, unsigned long gpa,
 
				 		goto out_unlock;
			
 
				 	}
			
 
				 	kvmppc_radix_set_pte_at(kvm, gpa, ptep, pte);
			
 
				+	if (rmapp && n_rmap)
			
 
				+		kvmhv_insert_nest_rmap(kvm, rmapp, n_rmap);
			
 
				 	ret = 0;
			
 
				 
			
 
				  out_unlock:
			
@@ -521,95 +640,49 @@ static int kvmppc_create_pte(struct kvm *kvm, pte_t pte, unsigned long gpa,
 
				 	return ret;
			
 
				 }
			
 
				 
			
 
				-int kvmppc_book3s_radix_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu,
			
 
				-				   unsigned long ea, unsigned long dsisr)
			
 
				+bool kvmppc_hv_handle_set_rc(struct kvm *kvm, pgd_t *pgtable, bool writing,
			
 
				+			     unsigned long gpa, unsigned int lpid)
			
 
				+{
			
 
				+	unsigned long pgflags;
			
 
				+	unsigned int shift;
			
 
				+	pte_t *ptep;
			
 
				+
			
 
				+	/*
			
 
				+	 * Need to set an R or C bit in the 2nd-level tables;
			
 
				+	 * since we are just helping out the hardware here,
			
 
				+	 * it is sufficient to do what the hardware does.
			
 
				+	 */
			
 
				+	pgflags = _PAGE_ACCESSED;
			
 
				+	if (writing)
			
 
				+		pgflags |= _PAGE_DIRTY;
			
 
				+	/*
			
 
				+	 * We are walking the secondary (partition-scoped) page table here.
			
 
				+	 * We can do this without disabling irq because the Linux MM
			
 
				+	 * subsystem doesn't do THP splits and collapses on this tree.
			
 
				+	 */
			
 
				+	ptep = __find_linux_pte(pgtable, gpa, NULL, &shift);
			
 
				+	if (ptep && pte_present(*ptep) && (!writing || pte_write(*ptep))) {
			
 
				+		kvmppc_radix_update_pte(kvm, ptep, 0, pgflags, gpa, shift);
			
 
				+		return true;
			
 
				+	}
			
 
				+	return false;
			
 
				+}
			
 
				+
			
 
				+int kvmppc_book3s_instantiate_page(struct kvm_vcpu *vcpu,
			
 
				+				   unsigned long gpa,
			
 
				+				   struct kvm_memory_slot *memslot,
			
 
				+				   bool writing, bool kvm_ro,
			
 
				+				   pte_t *inserted_pte, unsigned int *levelp)
			
 
				 {
			
 
				 	struct kvm *kvm = vcpu->kvm;
			
 
				-	unsigned long mmu_seq;
			
 
				-	unsigned long gpa, gfn, hva;
			
 
				-	struct kvm_memory_slot *memslot;
			
 
				 	struct page *page = NULL;
			
 
				-	long ret;
			
 
				-	bool writing;
			
 
				+	unsigned long mmu_seq;
			
 
				+	unsigned long hva, gfn = gpa >> PAGE_SHIFT;
			
 
				 	bool upgrade_write = false;
			
 
				 	bool *upgrade_p = &upgrade_write;
			
 
				 	pte_t pte, *ptep;
			
 
				-	unsigned long pgflags;
			
 
				 	unsigned int shift, level;
			
 
				-
			
 
				-	/* Check for unusual errors */
			
 
				-	if (dsisr & DSISR_UNSUPP_MMU) {
			
 
				-		pr_err("KVM: Got unsupported MMU fault\n");
			
 
				-		return -EFAULT;
			
 
				-	}
			
 
				-	if (dsisr & DSISR_BADACCESS) {
			
 
				-		/* Reflect to the guest as DSI */
			
 
				-		pr_err("KVM: Got radix HV page fault with DSISR=%lx\n", dsisr);
			
 
				-		kvmppc_core_queue_data_storage(vcpu, ea, dsisr);
			
 
				-		return RESUME_GUEST;
			
 
				-	}
			
 
				-
			
 
				-	/* Translate the logical address and get the page */
			
 
				-	gpa = vcpu->arch.fault_gpa & ~0xfffUL;
			
 
				-	gpa &= ~0xF000000000000000ul;
			
 
				-	gfn = gpa >> PAGE_SHIFT;
			
 
				-	if (!(dsisr & DSISR_PRTABLE_FAULT))
			
 
				-		gpa |= ea & 0xfff;
			
 
				-	memslot = gfn_to_memslot(kvm, gfn);
			
 
				-
			
 
				-	/* No memslot means it's an emulated MMIO region */
			
 
				-	if (!memslot || (memslot->flags & KVM_MEMSLOT_INVALID)) {
			
 
				-		if (dsisr & (DSISR_PRTABLE_FAULT | DSISR_BADACCESS |
			
 
				-			     DSISR_SET_RC)) {
			
 
				-			/*
			
 
				-			 * Bad address in guest page table tree, or other
			
 
				-			 * unusual error - reflect it to the guest as DSI.
			
 
				-			 */
			
 
				-			kvmppc_core_queue_data_storage(vcpu, ea, dsisr);
			
 
				-			return RESUME_GUEST;
			
 
				-		}
			
 
				-		return kvmppc_hv_emulate_mmio(run, vcpu, gpa, ea,
			
 
				-					      dsisr & DSISR_ISSTORE);
			
 
				-	}
			
 
				-
			
 
				-	writing = (dsisr & DSISR_ISSTORE) != 0;
			
 
				-	if (memslot->flags & KVM_MEM_READONLY) {
			
 
				-		if (writing) {
			
 
				-			/* give the guest a DSI */
			
 
				-			dsisr = DSISR_ISSTORE | DSISR_PROTFAULT;
			
 
				-			kvmppc_core_queue_data_storage(vcpu, ea, dsisr);
			
 
				-			return RESUME_GUEST;
			
 
				-		}
			
 
				-		upgrade_p = NULL;
			
 
				-	}
			
 
				-
			
 
				-	if (dsisr & DSISR_SET_RC) {
			
 
				-		/*
			
 
				-		 * Need to set an R or C bit in the 2nd-level tables;
			
 
				-		 * since we are just helping out the hardware here,
			
 
				-		 * it is sufficient to do what the hardware does.
			
 
				-		 */
			
 
				-		pgflags = _PAGE_ACCESSED;
			
 
				-		if (writing)
			
 
				-			pgflags |= _PAGE_DIRTY;
			
 
				-		/*
			
 
				-		 * We are walking the secondary page table here. We can do this
			
 
				-		 * without disabling irq.
			
 
				-		 */
			
 
				-		spin_lock(&kvm->mmu_lock);
			
 
				-		ptep = __find_linux_pte(kvm->arch.pgtable,
			
 
				-					gpa, NULL, &shift);
			
 
				-		if (ptep && pte_present(*ptep) &&
			
 
				-		    (!writing || pte_write(*ptep))) {
			
 
				-			kvmppc_radix_update_pte(kvm, ptep, 0, pgflags,
			
 
				-						gpa, shift);
			
 
				-			dsisr &= ~DSISR_SET_RC;
			
 
				-		}
			
 
				-		spin_unlock(&kvm->mmu_lock);
			
 
				-		if (!(dsisr & (DSISR_BAD_FAULT_64S | DSISR_NOHPTE |
			
 
				-			       DSISR_PROTFAULT | DSISR_SET_RC)))
			
 
				-			return RESUME_GUEST;
			
 
				-	}
			
 
				+	int ret;
			
 
				 
			
 
				 	/* used to check for invalidations in progress */
			
 
				 	mmu_seq = kvm->mmu_notifier_seq;
			
@@ -622,7 +695,7 @@ int kvmppc_book3s_radix_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu,
 
				 	 * is that the page is writable.
			
 
				 	 */
			
 
				 	hva = gfn_to_hva_memslot(memslot, gfn);
			
 
				-	if (upgrade_p && __get_user_pages_fast(hva, 1, 1, &page) == 1) {
			
 
				+	if (!kvm_ro && __get_user_pages_fast(hva, 1, 1, &page) == 1) {
			
 
				 		upgrade_write = true;
			
 
				 	} else {
			
 
				 		unsigned long pfn;
			
@@ -680,7 +753,12 @@ int kvmppc_book3s_radix_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu,
 
				 	}
			
 
				 
			
 
				 	/* Allocate space in the tree and write the PTE */
			
 
				-	ret = kvmppc_create_pte(kvm, pte, gpa, level, mmu_seq);
			
 
				+	ret = kvmppc_create_pte(kvm, kvm->arch.pgtable, pte, gpa, level,
			
 
				+				mmu_seq, kvm->arch.lpid, NULL, NULL);
			
 
				+	if (inserted_pte)
			
 
				+		*inserted_pte = pte;
			
 
				+	if (levelp)
			
 
				+		*levelp = level;
			
 
				 
			
 
				 	if (page) {
			
 
				 		if (!ret && (pte_val(pte) & _PAGE_WRITE))
			
@@ -688,6 +766,82 @@ int kvmppc_book3s_radix_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu,
 
				 		put_page(page);
			
 
				 	}
			
 
				 
			
 
				+	return ret;
			
 
				+}
			
 
				+
			
 
				+int kvmppc_book3s_radix_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu,
			
 
				+				   unsigned long ea, unsigned long dsisr)
			
 
				+{
			
 
				+	struct kvm *kvm = vcpu->kvm;
			
 
				+	unsigned long gpa, gfn;
			
 
				+	struct kvm_memory_slot *memslot;
			
 
				+	long ret;
			
 
				+	bool writing = !!(dsisr & DSISR_ISSTORE);
			
 
				+	bool kvm_ro = false;
			
 
				+
			
 
				+	/* Check for unusual errors */
			
 
				+	if (dsisr & DSISR_UNSUPP_MMU) {
			
 
				+		pr_err("KVM: Got unsupported MMU fault\n");
			
 
				+		return -EFAULT;
			
 
				+	}
			
 
				+	if (dsisr & DSISR_BADACCESS) {
			
 
				+		/* Reflect to the guest as DSI */
			
 
				+		pr_err("KVM: Got radix HV page fault with DSISR=%lx\n", dsisr);
			
 
				+		kvmppc_core_queue_data_storage(vcpu, ea, dsisr);
			
 
				+		return RESUME_GUEST;
			
 
				+	}
			
 
				+
			
 
				+	/* Translate the logical address */
			
 
				+	gpa = vcpu->arch.fault_gpa & ~0xfffUL;
			
 
				+	gpa &= ~0xF000000000000000ul;
			
 
				+	gfn = gpa >> PAGE_SHIFT;
			
 
				+	if (!(dsisr & DSISR_PRTABLE_FAULT))
			
 
				+		gpa |= ea & 0xfff;
			
 
				+
			
 
				+	/* Get the corresponding memslot */
			
 
				+	memslot = gfn_to_memslot(kvm, gfn);
			
 
				+
			
 
				+	/* No memslot means it's an emulated MMIO region */
			
 
				+	if (!memslot || (memslot->flags & KVM_MEMSLOT_INVALID)) {
			
 
				+		if (dsisr & (DSISR_PRTABLE_FAULT | DSISR_BADACCESS |
			
 
				+			     DSISR_SET_RC)) {
			
 
				+			/*
			
 
				+			 * Bad address in guest page table tree, or other
			
 
				+			 * unusual error - reflect it to the guest as DSI.
			
 
				+			 */
			
 
				+			kvmppc_core_queue_data_storage(vcpu, ea, dsisr);
			
 
				+			return RESUME_GUEST;
			
 
				+		}
			
 
				+		return kvmppc_hv_emulate_mmio(run, vcpu, gpa, ea, writing);
			
 
				+	}
			
 
				+
			
 
				+	if (memslot->flags & KVM_MEM_READONLY) {
			
 
				+		if (writing) {
			
 
				+			/* give the guest a DSI */
			
 
				+			kvmppc_core_queue_data_storage(vcpu, ea, DSISR_ISSTORE |
			
 
				+						       DSISR_PROTFAULT);
			
 
				+			return RESUME_GUEST;
			
 
				+		}
			
 
				+		kvm_ro = true;
			
 
				+	}
			
 
				+
			
 
				+	/* Failed to set the reference/change bits */
			
 
				+	if (dsisr & DSISR_SET_RC) {
			
 
				+		spin_lock(&kvm->mmu_lock);
			
 
				+		if (kvmppc_hv_handle_set_rc(kvm, kvm->arch.pgtable,
			
 
				+					    writing, gpa, kvm->arch.lpid))
			
 
				+			dsisr &= ~DSISR_SET_RC;
			
 
				+		spin_unlock(&kvm->mmu_lock);
			
 
				+
			
 
				+		if (!(dsisr & (DSISR_BAD_FAULT_64S | DSISR_NOHPTE |
			
 
				+			       DSISR_PROTFAULT | DSISR_SET_RC)))
			
 
				+			return RESUME_GUEST;
			
 
				+	}
			
 
				+
			
 
				+	/* Try to insert a pte */
			
 
				+	ret = kvmppc_book3s_instantiate_page(vcpu, gpa, memslot, writing,
			
 
				+					     kvm_ro, NULL, NULL);
			
 
				+
			
 
				 	if (ret == 0 || ret == -EAGAIN)
			
 
				 		ret = RESUME_GUEST;
			
 
				 	return ret;
			
@@ -700,20 +854,11 @@ int kvm_unmap_radix(struct kvm *kvm, struct kvm_memory_slot *memslot,
 
				 	pte_t *ptep;
			
 
				 	unsigned long gpa = gfn << PAGE_SHIFT;
			
 
				 	unsigned int shift;
			
 
				-	unsigned long old;
			
 
				 
			
 
				 	ptep = __find_linux_pte(kvm->arch.pgtable, gpa, NULL, &shift);
			
 
				-	if (ptep && pte_present(*ptep)) {
			
 
				-		old = kvmppc_radix_update_pte(kvm, ptep, ~0UL, 0,
			
 
				-					      gpa, shift);
			
 
				-		kvmppc_radix_tlbie_page(kvm, gpa, shift);
			
 
				-		if ((old & _PAGE_DIRTY) && memslot->dirty_bitmap) {
			
 
				-			unsigned long psize = PAGE_SIZE;
			
 
				-			if (shift)
			
 
				-				psize = 1ul << shift;
			
 
				-			kvmppc_update_dirty_map(memslot, gfn, psize);
			
 
				-		}
			
 
				-	}
			
 
				+	if (ptep && pte_present(*ptep))
			
 
				+		kvmppc_unmap_pte(kvm, ptep, gpa, shift, memslot,
			
 
				+				 kvm->arch.lpid);
			
 
				 	return 0;				
			
 
				 }
			
 
				 
			
@@ -768,7 +913,7 @@ static int kvm_radix_test_clear_dirty(struct kvm *kvm,
 
				 			ret = 1 << (shift - PAGE_SHIFT);
			
 
				 		kvmppc_radix_update_pte(kvm, ptep, _PAGE_DIRTY, 0,
			
 
				 					gpa, shift);
			
 
				-		kvmppc_radix_tlbie_page(kvm, gpa, shift);
			
 
				+		kvmppc_radix_tlbie_page(kvm, gpa, shift, kvm->arch.lpid);
			
 
				 	}
			
 
				 	return ret;
			
 
				 }
			
@@ -853,6 +998,215 @@ static void pmd_ctor(void *addr)
 
				 	memset(addr, 0, RADIX_PMD_TABLE_SIZE);
			
 
				 }
			
 
				 
			
 
				+struct debugfs_radix_state {
			
 
				+	struct kvm	*kvm;
			
 
				+	struct mutex	mutex;
			
 
				+	unsigned long	gpa;
			
 
				+	int		lpid;
			
 
				+	int		chars_left;
			
 
				+	int		buf_index;
			
 
				+	char		buf[128];
			
 
				+	u8		hdr;
			
 
				+};
			
 
				+
			
 
				+static int debugfs_radix_open(struct inode *inode, struct file *file)
			
 
				+{
			
 
				+	struct kvm *kvm = inode->i_private;
			
 
				+	struct debugfs_radix_state *p;
			
 
				+
			
 
				+	p = kzalloc(sizeof(*p), GFP_KERNEL);
			
 
				+	if (!p)
			
 
				+		return -ENOMEM;
			
 
				+
			
 
				+	kvm_get_kvm(kvm);
			
 
				+	p->kvm = kvm;
			
 
				+	mutex_init(&p->mutex);
			
 
				+	file->private_data = p;
			
 
				+
			
 
				+	return nonseekable_open(inode, file);
			
 
				+}
			
 
				+
			
 
				+static int debugfs_radix_release(struct inode *inode, struct file *file)
			
 
				+{
			
 
				+	struct debugfs_radix_state *p = file->private_data;
			
 
				+
			
 
				+	kvm_put_kvm(p->kvm);
			
 
				+	kfree(p);
			
 
				+	return 0;
			
 
				+}
			
 
				+
			
 
				+static ssize_t debugfs_radix_read(struct file *file, char __user *buf,
			
 
				+				 size_t len, loff_t *ppos)
			
 
				+{
			
 
				+	struct debugfs_radix_state *p = file->private_data;
			
 
				+	ssize_t ret, r;
			
 
				+	unsigned long n;
			
 
				+	struct kvm *kvm;
			
 
				+	unsigned long gpa;
			
 
				+	pgd_t *pgt;
			
 
				+	struct kvm_nested_guest *nested;
			
 
				+	pgd_t pgd, *pgdp;
			
 
				+	pud_t pud, *pudp;
			
 
				+	pmd_t pmd, *pmdp;
			
 
				+	pte_t *ptep;
			
 
				+	int shift;
			
 
				+	unsigned long pte;
			
 
				+
			
 
				+	kvm = p->kvm;
			
 
				+	if (!kvm_is_radix(kvm))
			
 
				+		return 0;
			
 
				+
			
 
				+	ret = mutex_lock_interruptible(&p->mutex);
			
 
				+	if (ret)
			
 
				+		return ret;
			
 
				+
			
 
				+	if (p->chars_left) {
			
 
				+		n = p->chars_left;
			
 
				+		if (n > len)
			
 
				+			n = len;
			
 
				+		r = copy_to_user(buf, p->buf + p->buf_index, n);
			
 
				+		n -= r;
			
 
				+		p->chars_left -= n;
			
 
				+		p->buf_index += n;
			
 
				+		buf += n;
			
 
				+		len -= n;
			
 
				+		ret = n;
			
 
				+		if (r) {
			
 
				+			if (!n)
			
 
				+				ret = -EFAULT;
			
 
				+			goto out;
			
 
				+		}
			
 
				+	}
			
 
				+
			
 
				+	gpa = p->gpa;
			
 
				+	nested = NULL;
			
 
				+	pgt = NULL;
			
 
				+	while (len != 0 && p->lpid >= 0) {
			
 
				+		if (gpa >= RADIX_PGTABLE_RANGE) {
			
 
				+			gpa = 0;
			
 
				+			pgt = NULL;
			
 
				+			if (nested) {
			
 
				+				kvmhv_put_nested(nested);
			
 
				+				nested = NULL;
			
 
				+			}
			
 
				+			p->lpid = kvmhv_nested_next_lpid(kvm, p->lpid);
			
 
				+			p->hdr = 0;
			
 
				+			if (p->lpid < 0)
			
 
				+				break;
			
 
				+		}
			
 
				+		if (!pgt) {
			
 
				+			if (p->lpid == 0) {
			
 
				+				pgt = kvm->arch.pgtable;
			
 
				+			} else {
			
 
				+				nested = kvmhv_get_nested(kvm, p->lpid, false);
			
 
				+				if (!nested) {
			
 
				+					gpa = RADIX_PGTABLE_RANGE;
			
 
				+					continue;
			
 
				+				}
			
 
				+				pgt = nested->shadow_pgtable;
			
 
				+			}
			
 
				+		}
			
 
				+		n = 0;
			
 
				+		if (!p->hdr) {
			
 
				+			if (p->lpid > 0)
			
 
				+				n = scnprintf(p->buf, sizeof(p->buf),
			
 
				+					      "\nNested LPID %d: ", p->lpid);
			
 
				+			n += scnprintf(p->buf + n, sizeof(p->buf) - n,
			
 
				+				      "pgdir: %lx\n", (unsigned long)pgt);
			
 
				+			p->hdr = 1;
			
 
				+			goto copy;
			
 
				+		}
			
 
				+
			
 
				+		pgdp = pgt + pgd_index(gpa);
			
 
				+		pgd = READ_ONCE(*pgdp);
			
 
				+		if (!(pgd_val(pgd) & _PAGE_PRESENT)) {
			
 
				+			gpa = (gpa & PGDIR_MASK) + PGDIR_SIZE;
			
 
				+			continue;
			
 
				+		}
			
 
				+
			
 
				+		pudp = pud_offset(&pgd, gpa);
			
 
				+		pud = READ_ONCE(*pudp);
			
 
				+		if (!(pud_val(pud) & _PAGE_PRESENT)) {
			
 
				+			gpa = (gpa & PUD_MASK) + PUD_SIZE;
			
 
				+			continue;
			
 
				+		}
			
 
				+		if (pud_val(pud) & _PAGE_PTE) {
			
 
				+			pte = pud_val(pud);
			
 
				+			shift = PUD_SHIFT;
			
 
				+			goto leaf;
			
 
				+		}
			
 
				+
			
 
				+		pmdp = pmd_offset(&pud, gpa);
			
 
				+		pmd = READ_ONCE(*pmdp);
			
 
				+		if (!(pmd_val(pmd) & _PAGE_PRESENT)) {
			
 
				+			gpa = (gpa & PMD_MASK) + PMD_SIZE;
			
 
				+			continue;
			
 
				+		}
			
 
				+		if (pmd_val(pmd) & _PAGE_PTE) {
			
 
				+			pte = pmd_val(pmd);
			
 
				+			shift = PMD_SHIFT;
			
 
				+			goto leaf;
			
 
				+		}
			
 
				+
			
 
				+		ptep = pte_offset_kernel(&pmd, gpa);
			
 
				+		pte = pte_val(READ_ONCE(*ptep));
			
 
				+		if (!(pte & _PAGE_PRESENT)) {
			
 
				+			gpa += PAGE_SIZE;
			
 
				+			continue;
			
 
				+		}
			
 
				+		shift = PAGE_SHIFT;
			
 
				+	leaf:
			
 
				+		n = scnprintf(p->buf, sizeof(p->buf),
			
 
				+			      " %lx: %lx %d\n", gpa, pte, shift);
			
 
				+		gpa += 1ul << shift;
			
 
				+	copy:
			
 
				+		p->chars_left = n;
			
 
				+		if (n > len)
			
 
				+			n = len;
			
 
				+		r = copy_to_user(buf, p->buf, n);
			
 
				+		n -= r;
			
 
				+		p->chars_left -= n;
			
 
				+		p->buf_index = n;
			
 
				+		buf += n;
			
 
				+		len -= n;
			
 
				+		ret += n;
			
 
				+		if (r) {
			
 
				+			if (!ret)
			
 
				+				ret = -EFAULT;
			
 
				+			break;
			
 
				+		}
			
 
				+	}
			
 
				+	p->gpa = gpa;
			
 
				+	if (nested)
			
 
				+		kvmhv_put_nested(nested);
			
 
				+
			
 
				+ out:
			
 
				+	mutex_unlock(&p->mutex);
			
 
				+	return ret;
			
 
				+}
			
 
				+
			
 
				+static ssize_t debugfs_radix_write(struct file *file, const char __user *buf,
			
 
				+			   size_t len, loff_t *ppos)
			
 
				+{
			
 
				+	return -EACCES;
			
 
				+}
			
 
				+
			
 
				+static const struct file_operations debugfs_radix_fops = {
			
 
				+	.owner	 = THIS_MODULE,
			
 
				+	.open	 = debugfs_radix_open,
			
 
				+	.release = debugfs_radix_release,
			
 
				+	.read	 = debugfs_radix_read,
			
 
				+	.write	 = debugfs_radix_write,
			
 
				+	.llseek	 = generic_file_llseek,
			
 
				+};
			
 
				+
			
 
				+void kvmhv_radix_debugfs_init(struct kvm *kvm)
			
 
				+{
			
 
				+	kvm->arch.radix_dentry = debugfs_create_file("radix", 0400,
			
 
				+						     kvm->arch.debugfs_dir, kvm,
			
 
				+						     &debugfs_radix_fops);
			
 
				+}
			
 
				+
			
 
				 int kvmppc_radix_init(void)
			
 
				 {
			
 
				 	unsigned long size = sizeof(void *) << RADIX_PTE_INDEX_SIZE;
			
--- a/arch/powerpc/kvm/book3s_64_vio.c
+++ b/arch/powerpc/kvm/book3s_64_vio.c
@@ -363,6 +363,40 @@ long kvm_vm_ioctl_create_spapr_tce(struct kvm *kvm,
 
				 	return ret;
			
 
				 }
			
 
				 
			
 
				+static long kvmppc_tce_validate(struct kvmppc_spapr_tce_table *stt,
			
 
				+		unsigned long tce)
			
 
				+{
			
 
				+	unsigned long gpa = tce & ~(TCE_PCI_READ | TCE_PCI_WRITE);
			
 
				+	enum dma_data_direction dir = iommu_tce_direction(tce);
			
 
				+	struct kvmppc_spapr_tce_iommu_table *stit;
			
 
				+	unsigned long ua = 0;
			
 
				+
			
 
				+	/* Allow userspace to poison TCE table */
			
 
				+	if (dir == DMA_NONE)
			
 
				+		return H_SUCCESS;
			
 
				+
			
 
				+	if (iommu_tce_check_gpa(stt->page_shift, gpa))
			
 
				+		return H_TOO_HARD;
			
 
				+
			
 
				+	if (kvmppc_tce_to_ua(stt->kvm, tce, &ua, NULL))
			
 
				+		return H_TOO_HARD;
			
 
				+
			
 
				+	list_for_each_entry_rcu(stit, &stt->iommu_tables, next) {
			
 
				+		unsigned long hpa = 0;
			
 
				+		struct mm_iommu_table_group_mem_t *mem;
			
 
				+		long shift = stit->tbl->it_page_shift;
			
 
				+
			
 
				+		mem = mm_iommu_lookup(stt->kvm->mm, ua, 1ULL << shift);
			
 
				+		if (!mem)
			
 
				+			return H_TOO_HARD;
			
 
				+
			
 
				+		if (mm_iommu_ua_to_hpa(mem, ua, shift, &hpa))
			
 
				+			return H_TOO_HARD;
			
 
				+	}
			
 
				+
			
 
				+	return H_SUCCESS;
			
 
				+}
			
 
				+
			
 
				 static void kvmppc_clear_tce(struct iommu_table *tbl, unsigned long entry)
			
 
				 {
			
 
				 	unsigned long hpa = 0;
			
@@ -401,7 +435,7 @@ static long kvmppc_tce_iommu_do_unmap(struct kvm *kvm,
 
				 	long ret;
			
 
				 
			
 
				 	if (WARN_ON_ONCE(iommu_tce_xchg(tbl, entry, &hpa, &dir)))
			
 
				-		return H_HARDWARE;
			
 
				+		return H_TOO_HARD;
			
 
				 
			
 
				 	if (dir == DMA_NONE)
			
 
				 		return H_SUCCESS;
			
@@ -449,15 +483,15 @@ long kvmppc_tce_iommu_do_map(struct kvm *kvm, struct iommu_table *tbl,
 
				 		return H_TOO_HARD;
			
 
				 
			
 
				 	if (WARN_ON_ONCE(mm_iommu_ua_to_hpa(mem, ua, tbl->it_page_shift, &hpa)))
			
 
				-		return H_HARDWARE;
			
 
				+		return H_TOO_HARD;
			
 
				 
			
 
				 	if (mm_iommu_mapped_inc(mem))
			
 
				-		return H_CLOSED;
			
 
				+		return H_TOO_HARD;
			
 
				 
			
 
				 	ret = iommu_tce_xchg(tbl, entry, &hpa, &dir);
			
 
				 	if (WARN_ON_ONCE(ret)) {
			
 
				 		mm_iommu_mapped_dec(mem);
			
 
				-		return H_HARDWARE;
			
 
				+		return H_TOO_HARD;
			
 
				 	}
			
 
				 
			
 
				 	if (dir != DMA_NONE)
			
@@ -517,8 +551,7 @@ long kvmppc_h_put_tce(struct kvm_vcpu *vcpu, unsigned long liobn,
 
				 
			
 
				 	idx = srcu_read_lock(&vcpu->kvm->srcu);
			
 
				 
			
 
				-	if ((dir != DMA_NONE) && kvmppc_gpa_to_ua(vcpu->kvm,
			
 
				-			tce & ~(TCE_PCI_READ | TCE_PCI_WRITE), &ua, NULL)) {
			
 
				+	if ((dir != DMA_NONE) && kvmppc_tce_to_ua(vcpu->kvm, tce, &ua, NULL)) {
			
 
				 		ret = H_PARAMETER;
			
 
				 		goto unlock_exit;
			
 
				 	}
			
@@ -533,14 +566,10 @@ long kvmppc_h_put_tce(struct kvm_vcpu *vcpu, unsigned long liobn,
 
				 			ret = kvmppc_tce_iommu_map(vcpu->kvm, stt, stit->tbl,
			
 
				 					entry, ua, dir);
			
 
				 
			
 
				-		if (ret == H_SUCCESS)
			
 
				-			continue;
			
 
				-
			
 
				-		if (ret == H_TOO_HARD)
			
 
				+		if (ret != H_SUCCESS) {
			
 
				+			kvmppc_clear_tce(stit->tbl, entry);
			
 
				 			goto unlock_exit;
			
 
				-
			
 
				-		WARN_ON_ONCE(1);
			
 
				-		kvmppc_clear_tce(stit->tbl, entry);
			
 
				+		}
			
 
				 	}
			
 
				 
			
 
				 	kvmppc_tce_put(stt, entry, tce);
			
@@ -583,7 +612,7 @@ long kvmppc_h_put_tce_indirect(struct kvm_vcpu *vcpu,
 
				 		return ret;
			
 
				 
			
 
				 	idx = srcu_read_lock(&vcpu->kvm->srcu);
			
 
				-	if (kvmppc_gpa_to_ua(vcpu->kvm, tce_list, &ua, NULL)) {
			
 
				+	if (kvmppc_tce_to_ua(vcpu->kvm, tce_list, &ua, NULL)) {
			
 
				 		ret = H_TOO_HARD;
			
 
				 		goto unlock_exit;
			
 
				 	}
			
@@ -599,10 +628,26 @@ long kvmppc_h_put_tce_indirect(struct kvm_vcpu *vcpu,
 
				 		ret = kvmppc_tce_validate(stt, tce);
			
 
				 		if (ret != H_SUCCESS)
			
 
				 			goto unlock_exit;
			
 
				+	}
			
 
				+
			
 
				+	for (i = 0; i < npages; ++i) {
			
 
				+		/*
			
 
				+		 * This looks unsafe, because we validate, then regrab
			
 
				+		 * the TCE from userspace which could have been changed by
			
 
				+		 * another thread.
			
 
				+		 *
			
 
				+		 * But it actually is safe, because the relevant checks will be
			
 
				+		 * re-executed in the following code.  If userspace tries to
			
 
				+		 * change this dodgily it will result in a messier failure mode
			
 
				+		 * but won't threaten the host.
			
 
				+		 */
			
 
				+		if (get_user(tce, tces + i)) {
			
 
				+			ret = H_TOO_HARD;
			
 
				+			goto unlock_exit;
			
 
				+		}
			
 
				+		tce = be64_to_cpu(tce);
			
 
				 
			
 
				-		if (kvmppc_gpa_to_ua(vcpu->kvm,
			
 
				-				tce & ~(TCE_PCI_READ | TCE_PCI_WRITE),
			
 
				-				&ua, NULL))
			
 
				+		if (kvmppc_tce_to_ua(vcpu->kvm, tce, &ua, NULL))
			
 
				 			return H_PARAMETER;
			
 
				 
			
 
				 		list_for_each_entry_lockless(stit, &stt->iommu_tables, next) {
			
@@ -610,14 +655,10 @@ long kvmppc_h_put_tce_indirect(struct kvm_vcpu *vcpu,
 
				 					stit->tbl, entry + i, ua,
			
 
				 					iommu_tce_direction(tce));
			
 
				 
			
 
				-			if (ret == H_SUCCESS)
			
 
				-				continue;
			
 
				-
			
 
				-			if (ret == H_TOO_HARD)
			
 
				+			if (ret != H_SUCCESS) {
			
 
				+				kvmppc_clear_tce(stit->tbl, entry);
			
 
				 				goto unlock_exit;
			
 
				-
			
 
				-			WARN_ON_ONCE(1);
			
 
				-			kvmppc_clear_tce(stit->tbl, entry);
			
 
				+			}
			
 
				 		}
			
 
				 
			
 
				 		kvmppc_tce_put(stt, entry + i, tce);
			
--- a/arch/powerpc/kvm/book3s_64_vio_hv.c
+++ b/arch/powerpc/kvm/book3s_64_vio_hv.c
@@ -87,6 +87,7 @@ struct kvmppc_spapr_tce_table *kvmppc_find_table(struct kvm *kvm,
 
				 }
			
 
				 EXPORT_SYMBOL_GPL(kvmppc_find_table);
			
 
				 
			
 
				+#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
			
 
				 /*
			
 
				  * Validates TCE address.
			
 
				  * At the moment flags and page mask are validated.
			
@@ -94,14 +95,14 @@ EXPORT_SYMBOL_GPL(kvmppc_find_table);
 
				  * to the table and user space is supposed to process them), we can skip
			
 
				  * checking other things (such as TCE is a guest RAM address or the page
			
 
				  * was actually allocated).
			
 
				- *
			
 
				- * WARNING: This will be called in real-mode on HV KVM and virtual
			
 
				- *          mode on PR KVM
			
 
				  */
			
 
				-long kvmppc_tce_validate(struct kvmppc_spapr_tce_table *stt, unsigned long tce)
			
 
				+static long kvmppc_rm_tce_validate(struct kvmppc_spapr_tce_table *stt,
			
 
				+		unsigned long tce)
			
 
				 {
			
 
				 	unsigned long gpa = tce & ~(TCE_PCI_READ | TCE_PCI_WRITE);
			
 
				 	enum dma_data_direction dir = iommu_tce_direction(tce);
			
 
				+	struct kvmppc_spapr_tce_iommu_table *stit;
			
 
				+	unsigned long ua = 0;
			
 
				 
			
 
				 	/* Allow userspace to poison TCE table */
			
 
				 	if (dir == DMA_NONE)
			
@@ -110,9 +111,25 @@ long kvmppc_tce_validate(struct kvmppc_spapr_tce_table *stt, unsigned long tce)
 
				 	if (iommu_tce_check_gpa(stt->page_shift, gpa))
			
 
				 		return H_PARAMETER;
			
 
				 
			
 
				+	if (kvmppc_tce_to_ua(stt->kvm, tce, &ua, NULL))
			
 
				+		return H_TOO_HARD;
			
 
				+
			
 
				+	list_for_each_entry_lockless(stit, &stt->iommu_tables, next) {
			
 
				+		unsigned long hpa = 0;
			
 
				+		struct mm_iommu_table_group_mem_t *mem;
			
 
				+		long shift = stit->tbl->it_page_shift;
			
 
				+
			
 
				+		mem = mm_iommu_lookup_rm(stt->kvm->mm, ua, 1ULL << shift);
			
 
				+		if (!mem)
			
 
				+			return H_TOO_HARD;
			
 
				+
			
 
				+		if (mm_iommu_ua_to_hpa_rm(mem, ua, shift, &hpa))
			
 
				+			return H_TOO_HARD;
			
 
				+	}
			
 
				+
			
 
				 	return H_SUCCESS;
			
 
				 }
			
 
				-EXPORT_SYMBOL_GPL(kvmppc_tce_validate);
			
 
				+#endif /* CONFIG_KVM_BOOK3S_HV_POSSIBLE */
			
 
				 
			
 
				 /* Note on the use of page_address() in real mode,
			
 
				  *
			
@@ -164,10 +181,10 @@ void kvmppc_tce_put(struct kvmppc_spapr_tce_table *stt,
 
				 }
			
 
				 EXPORT_SYMBOL_GPL(kvmppc_tce_put);
			
 
				 
			
 
				-long kvmppc_gpa_to_ua(struct kvm *kvm, unsigned long gpa,
			
 
				+long kvmppc_tce_to_ua(struct kvm *kvm, unsigned long tce,
			
 
				 		unsigned long *ua, unsigned long **prmap)
			
 
				 {
			
 
				-	unsigned long gfn = gpa >> PAGE_SHIFT;
			
 
				+	unsigned long gfn = tce >> PAGE_SHIFT;
			
 
				 	struct kvm_memory_slot *memslot;
			
 
				 
			
 
				 	memslot = search_memslots(kvm_memslots(kvm), gfn);
			
@@ -175,7 +192,7 @@ long kvmppc_gpa_to_ua(struct kvm *kvm, unsigned long gpa,
 
				 		return -EINVAL;
			
 
				 
			
 
				 	*ua = __gfn_to_hva_memslot(memslot, gfn) |
			
 
				-		(gpa & ~(PAGE_MASK | TCE_PCI_READ | TCE_PCI_WRITE));
			
 
				+		(tce & ~(PAGE_MASK | TCE_PCI_READ | TCE_PCI_WRITE));
			
 
				 
			
 
				 #ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
			
 
				 	if (prmap)
			
@@ -184,7 +201,7 @@ long kvmppc_gpa_to_ua(struct kvm *kvm, unsigned long gpa,
 
				 
			
 
				 	return 0;
			
 
				 }
			
 
				-EXPORT_SYMBOL_GPL(kvmppc_gpa_to_ua);
			
 
				+EXPORT_SYMBOL_GPL(kvmppc_tce_to_ua);
			
 
				 
			
 
				 #ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
			
 
				 static long iommu_tce_xchg_rm(struct mm_struct *mm, struct iommu_table *tbl,
			
@@ -300,10 +317,10 @@ static long kvmppc_rm_tce_iommu_do_map(struct kvm *kvm, struct iommu_table *tbl,
 
				 
			
 
				 	if (WARN_ON_ONCE_RM(mm_iommu_ua_to_hpa_rm(mem, ua, tbl->it_page_shift,
			
 
				 			&hpa)))
			
 
				-		return H_HARDWARE;
			
 
				+		return H_TOO_HARD;
			
 
				 
			
 
				 	if (WARN_ON_ONCE_RM(mm_iommu_mapped_inc(mem)))
			
 
				-		return H_CLOSED;
			
 
				+		return H_TOO_HARD;
			
 
				 
			
 
				 	ret = iommu_tce_xchg_rm(kvm->mm, tbl, entry, &hpa, &dir);
			
 
				 	if (ret) {
			
@@ -368,13 +385,12 @@ long kvmppc_rm_h_put_tce(struct kvm_vcpu *vcpu, unsigned long liobn,
 
				 	if (ret != H_SUCCESS)
			
 
				 		return ret;
			
 
				 
			
 
				-	ret = kvmppc_tce_validate(stt, tce);
			
 
				+	ret = kvmppc_rm_tce_validate(stt, tce);
			
 
				 	if (ret != H_SUCCESS)
			
 
				 		return ret;
			
 
				 
			
 
				 	dir = iommu_tce_direction(tce);
			
 
				-	if ((dir != DMA_NONE) && kvmppc_gpa_to_ua(vcpu->kvm,
			
 
				-			tce & ~(TCE_PCI_READ | TCE_PCI_WRITE), &ua, NULL))
			
 
				+	if ((dir != DMA_NONE) && kvmppc_tce_to_ua(vcpu->kvm, tce, &ua, NULL))
			
 
				 		return H_PARAMETER;
			
 
				 
			
 
				 	entry = ioba >> stt->page_shift;
			
@@ -387,14 +403,10 @@ long kvmppc_rm_h_put_tce(struct kvm_vcpu *vcpu, unsigned long liobn,
 
				 			ret = kvmppc_rm_tce_iommu_map(vcpu->kvm, stt,
			
 
				 					stit->tbl, entry, ua, dir);
			
 
				 
			
 
				-		if (ret == H_SUCCESS)
			
 
				-			continue;
			
 
				-
			
 
				-		if (ret == H_TOO_HARD)
			
 
				+		if (ret != H_SUCCESS) {
			
 
				+			kvmppc_rm_clear_tce(vcpu->kvm, stit->tbl, entry);
			
 
				 			return ret;
			
 
				-
			
 
				-		WARN_ON_ONCE_RM(1);
			
 
				-		kvmppc_rm_clear_tce(vcpu->kvm, stit->tbl, entry);
			
 
				+		}
			
 
				 	}
			
 
				 
			
 
				 	kvmppc_tce_put(stt, entry, tce);
			
@@ -480,7 +492,7 @@ long kvmppc_rm_h_put_tce_indirect(struct kvm_vcpu *vcpu,
 
				 		 */
			
 
				 		struct mm_iommu_table_group_mem_t *mem;
			
 
				 
			
 
				-		if (kvmppc_gpa_to_ua(vcpu->kvm, tce_list, &ua, NULL))
			
 
				+		if (kvmppc_tce_to_ua(vcpu->kvm, tce_list, &ua, NULL))
			
 
				 			return H_TOO_HARD;
			
 
				 
			
 
				 		mem = mm_iommu_lookup_rm(vcpu->kvm->mm, ua, IOMMU_PAGE_SIZE_4K);
			
@@ -496,12 +508,12 @@ long kvmppc_rm_h_put_tce_indirect(struct kvm_vcpu *vcpu,
 
				 		 * We do not require memory to be preregistered in this case
			
 
				 		 * so lock rmap and do __find_linux_pte_or_hugepte().
			
 
				 		 */
			
 
				-		if (kvmppc_gpa_to_ua(vcpu->kvm, tce_list, &ua, &rmap))
			
 
				+		if (kvmppc_tce_to_ua(vcpu->kvm, tce_list, &ua, &rmap))
			
 
				 			return H_TOO_HARD;
			
 
				 
			
 
				 		rmap = (void *) vmalloc_to_phys(rmap);
			
 
				 		if (WARN_ON_ONCE_RM(!rmap))
			
 
				-			return H_HARDWARE;
			
 
				+			return H_TOO_HARD;
			
 
				 
			
 
				 		/*
			
 
				 		 * Synchronize with the MMU notifier callbacks in
			
@@ -521,14 +533,16 @@ long kvmppc_rm_h_put_tce_indirect(struct kvm_vcpu *vcpu,
 
				 	for (i = 0; i < npages; ++i) {
			
 
				 		unsigned long tce = be64_to_cpu(((u64 *)tces)[i]);
			
 
				 
			
 
				-		ret = kvmppc_tce_validate(stt, tce);
			
 
				+		ret = kvmppc_rm_tce_validate(stt, tce);
			
 
				 		if (ret != H_SUCCESS)
			
 
				 			goto unlock_exit;
			
 
				+	}
			
 
				+
			
 
				+	for (i = 0; i < npages; ++i) {
			
 
				+		unsigned long tce = be64_to_cpu(((u64 *)tces)[i]);
			
 
				 
			
 
				 		ua = 0;
			
 
				-		if (kvmppc_gpa_to_ua(vcpu->kvm,
			
 
				-				tce & ~(TCE_PCI_READ | TCE_PCI_WRITE),
			
 
				-				&ua, NULL))
			
 
				+		if (kvmppc_tce_to_ua(vcpu->kvm, tce, &ua, NULL))
			
 
				 			return H_PARAMETER;
			
 
				 
			
 
				 		list_for_each_entry_lockless(stit, &stt->iommu_tables, next) {
			
@@ -536,14 +550,11 @@ long kvmppc_rm_h_put_tce_indirect(struct kvm_vcpu *vcpu,
 
				 					stit->tbl, entry + i, ua,
			
 
				 					iommu_tce_direction(tce));
			
 
				 
			
 
				-			if (ret == H_SUCCESS)
			
 
				-				continue;
			
 
				-
			
 
				-			if (ret == H_TOO_HARD)
			
 
				+			if (ret != H_SUCCESS) {
			
 
				+				kvmppc_rm_clear_tce(vcpu->kvm, stit->tbl,
			
 
				+						entry);
			
 
				 				goto unlock_exit;
			
 
				-
			
 
				-			WARN_ON_ONCE_RM(1);
			
 
				-			kvmppc_rm_clear_tce(vcpu->kvm, stit->tbl, entry);
			
 
				+			}
			
 
				 		}
			
 
				 
			
 
				 		kvmppc_tce_put(stt, entry + i, tce);
			
--- a/arch/powerpc/kvm/book3s_emulate.c
+++ b/arch/powerpc/kvm/book3s_emulate.c
@@ -36,7 +36,6 @@
 
				 #define OP_31_XOP_MTSR		210
			
 
				 #define OP_31_XOP_MTSRIN	242
			
 
				 #define OP_31_XOP_TLBIEL	274
			
 
				-#define OP_31_XOP_TLBIE		306
			
 
				 /* Opcode is officially reserved, reuse it as sc 1 when sc 1 doesn't trap */
			
 
				 #define OP_31_XOP_FAKE_SC1	308
			
 
				 #define OP_31_XOP_SLBMTE	402
			
@@ -110,7 +109,7 @@ static inline void kvmppc_copyto_vcpu_tm(struct kvm_vcpu *vcpu)
 
				 	vcpu->arch.ctr_tm = vcpu->arch.regs.ctr;
			
 
				 	vcpu->arch.tar_tm = vcpu->arch.tar;
			
 
				 	vcpu->arch.lr_tm = vcpu->arch.regs.link;
			
 
				-	vcpu->arch.cr_tm = vcpu->arch.cr;
			
 
				+	vcpu->arch.cr_tm = vcpu->arch.regs.ccr;
			
 
				 	vcpu->arch.xer_tm = vcpu->arch.regs.xer;
			
 
				 	vcpu->arch.vrsave_tm = vcpu->arch.vrsave;
			
 
				 }
			
@@ -129,7 +128,7 @@ static inline void kvmppc_copyfrom_vcpu_tm(struct kvm_vcpu *vcpu)
 
				 	vcpu->arch.regs.ctr = vcpu->arch.ctr_tm;
			
 
				 	vcpu->arch.tar = vcpu->arch.tar_tm;
			
 
				 	vcpu->arch.regs.link = vcpu->arch.lr_tm;
			
 
				-	vcpu->arch.cr = vcpu->arch.cr_tm;
			
 
				+	vcpu->arch.regs.ccr = vcpu->arch.cr_tm;
			
 
				 	vcpu->arch.regs.xer = vcpu->arch.xer_tm;
			
 
				 	vcpu->arch.vrsave = vcpu->arch.vrsave_tm;
			
 
				 }
			
@@ -141,7 +140,7 @@ static void kvmppc_emulate_treclaim(struct kvm_vcpu *vcpu, int ra_val)
 
				 	uint64_t texasr;
			
 
				 
			
 
				 	/* CR0 = 0 | MSR[TS] | 0 */
			
 
				-	vcpu->arch.cr = (vcpu->arch.cr & ~(CR0_MASK << CR0_SHIFT)) |
			
 
				+	vcpu->arch.regs.ccr = (vcpu->arch.regs.ccr & ~(CR0_MASK << CR0_SHIFT)) |
			
 
				 		(((guest_msr & MSR_TS_MASK) >> (MSR_TS_S_LG - 1))
			
 
				 		 << CR0_SHIFT);
			
 
				 
			
@@ -220,7 +219,7 @@ void kvmppc_emulate_tabort(struct kvm_vcpu *vcpu, int ra_val)
 
				 	tm_abort(ra_val);
			
 
				 
			
 
				 	/* CR0 = 0 | MSR[TS] | 0 */
			
 
				-	vcpu->arch.cr = (vcpu->arch.cr & ~(CR0_MASK << CR0_SHIFT)) |
			
 
				+	vcpu->arch.regs.ccr = (vcpu->arch.regs.ccr & ~(CR0_MASK << CR0_SHIFT)) |
			
 
				 		(((guest_msr & MSR_TS_MASK) >> (MSR_TS_S_LG - 1))
			
 
				 		 << CR0_SHIFT);
			
 
				 
			
@@ -494,8 +493,8 @@ int kvmppc_core_emulate_op_pr(struct kvm_run *run, struct kvm_vcpu *vcpu,
 
				 
			
 
				 			if (!(kvmppc_get_msr(vcpu) & MSR_PR)) {
			
 
				 				preempt_disable();
			
 
				-				vcpu->arch.cr = (CR0_TBEGIN_FAILURE |
			
 
				-				  (vcpu->arch.cr & ~(CR0_MASK << CR0_SHIFT)));
			
 
				+				vcpu->arch.regs.ccr = (CR0_TBEGIN_FAILURE |
			
 
				+				  (vcpu->arch.regs.ccr & ~(CR0_MASK << CR0_SHIFT)));
			
 
				 
			
 
				 				vcpu->arch.texasr = (TEXASR_FS | TEXASR_EXACT |
			
 
				 					(((u64)(TM_CAUSE_EMULATE | TM_CAUSE_PERSISTENT))
			
--- a/arch/powerpc/kvm/book3s_hv.c
+++ b/arch/powerpc/kvm/book3s_hv.c
@@ -50,6 +50,7 @@
 
				 #include <asm/reg.h>
			
 
				 #include <asm/ppc-opcode.h>
			
 
				 #include <asm/asm-prototypes.h>
			
 
				+#include <asm/archrandom.h>
			
 
				 #include <asm/debug.h>
			
 
				 #include <asm/disassemble.h>
			
 
				 #include <asm/cputable.h>
			
@@ -177,6 +178,10 @@ static bool kvmppc_ipi_thread(int cpu)
 
				 {
			
 
				 	unsigned long msg = PPC_DBELL_TYPE(PPC_DBELL_SERVER);
			
 
				 
			
 
				+	/* If we're a nested hypervisor, fall back to ordinary IPIs for now */
			
 
				+	if (kvmhv_on_pseries())
			
 
				+		return false;
			
 
				+
			
 
				 	/* On POWER9 we can use msgsnd to IPI any cpu */
			
 
				 	if (cpu_has_feature(CPU_FTR_ARCH_300)) {
			
 
				 		msg |= get_hard_smp_processor_id(cpu);
			
@@ -414,8 +419,8 @@ static void kvmppc_dump_regs(struct kvm_vcpu *vcpu)
 
				 	       vcpu->arch.shregs.sprg0, vcpu->arch.shregs.sprg1);
			
 
				 	pr_err("sprg2 = %.16llx sprg3 = %.16llx\n",
			
 
				 	       vcpu->arch.shregs.sprg2, vcpu->arch.shregs.sprg3);
			
 
				-	pr_err("cr = %.8x  xer = %.16lx  dsisr = %.8x\n",
			
 
				-	       vcpu->arch.cr, vcpu->arch.regs.xer, vcpu->arch.shregs.dsisr);
			
 
				+	pr_err("cr = %.8lx  xer = %.16lx  dsisr = %.8x\n",
			
 
				+	       vcpu->arch.regs.ccr, vcpu->arch.regs.xer, vcpu->arch.shregs.dsisr);
			
 
				 	pr_err("dar = %.16llx\n", vcpu->arch.shregs.dar);
			
 
				 	pr_err("fault dar = %.16lx dsisr = %.8x\n",
			
 
				 	       vcpu->arch.fault_dar, vcpu->arch.fault_dsisr);
			
@@ -734,8 +739,7 @@ static bool kvmppc_doorbell_pending(struct kvm_vcpu *vcpu)
 
				 	/*
			
 
				 	 * Ensure that the read of vcore->dpdes comes after the read
			
 
				 	 * of vcpu->doorbell_request.  This barrier matches the
			
 
				-	 * lwsync in book3s_hv_rmhandlers.S just before the
			
 
				-	 * fast_guest_return label.
			
 
				+	 * smb_wmb() in kvmppc_guest_entry_inject().
			
 
				 	 */
			
 
				 	smp_rmb();
			
 
				 	vc = vcpu->arch.vcore;
			
@@ -916,6 +920,19 @@ int kvmppc_pseries_do_hcall(struct kvm_vcpu *vcpu)
 
				 			break;
			
 
				 		}
			
 
				 		return RESUME_HOST;
			
 
				+	case H_SET_DABR:
			
 
				+		ret = kvmppc_h_set_dabr(vcpu, kvmppc_get_gpr(vcpu, 4));
			
 
				+		break;
			
 
				+	case H_SET_XDABR:
			
 
				+		ret = kvmppc_h_set_xdabr(vcpu, kvmppc_get_gpr(vcpu, 4),
			
 
				+						kvmppc_get_gpr(vcpu, 5));
			
 
				+		break;
			
 
				+	case H_GET_TCE:
			
 
				+		ret = kvmppc_h_get_tce(vcpu, kvmppc_get_gpr(vcpu, 4),
			
 
				+						kvmppc_get_gpr(vcpu, 5));
			
 
				+		if (ret == H_TOO_HARD)
			
 
				+			return RESUME_HOST;
			
 
				+		break;
			
 
				 	case H_PUT_TCE:
			
 
				 		ret = kvmppc_h_put_tce(vcpu, kvmppc_get_gpr(vcpu, 4),
			
 
				 						kvmppc_get_gpr(vcpu, 5),
			
@@ -939,6 +956,33 @@ int kvmppc_pseries_do_hcall(struct kvm_vcpu *vcpu)
 
				 		if (ret == H_TOO_HARD)
			
 
				 			return RESUME_HOST;
			
 
				 		break;
			
 
				+	case H_RANDOM:
			
 
				+		if (!powernv_get_random_long(&vcpu->arch.regs.gpr[4]))
			
 
				+			ret = H_HARDWARE;
			
 
				+		break;
			
 
				+
			
 
				+	case H_SET_PARTITION_TABLE:
			
 
				+		ret = H_FUNCTION;
			
 
				+		if (vcpu->kvm->arch.nested_enable)
			
 
				+			ret = kvmhv_set_partition_table(vcpu);
			
 
				+		break;
			
 
				+	case H_ENTER_NESTED:
			
 
				+		ret = H_FUNCTION;
			
 
				+		if (!vcpu->kvm->arch.nested_enable)
			
 
				+			break;
			
 
				+		ret = kvmhv_enter_nested_guest(vcpu);
			
 
				+		if (ret == H_INTERRUPT) {
			
 
				+			kvmppc_set_gpr(vcpu, 3, 0);
			
 
				+			return -EINTR;
			
 
				+		}
			
 
				+		break;
			
 
				+	case H_TLB_INVALIDATE:
			
 
				+		ret = H_FUNCTION;
			
 
				+		if (!vcpu->kvm->arch.nested_enable)
			
 
				+			break;
			
 
				+		ret = kvmhv_do_nested_tlbie(vcpu);
			
 
				+		break;
			
 
				+
			
 
				 	default:
			
 
				 		return RESUME_HOST;
			
 
				 	}
			
@@ -947,6 +991,24 @@ int kvmppc_pseries_do_hcall(struct kvm_vcpu *vcpu)
 
				 	return RESUME_GUEST;
			
 
				 }
			
 
				 
			
 
				+/*
			
 
				+ * Handle H_CEDE in the nested virtualization case where we haven't
			
 
				+ * called the real-mode hcall handlers in book3s_hv_rmhandlers.S.
			
 
				+ * This has to be done early, not in kvmppc_pseries_do_hcall(), so
			
 
				+ * that the cede logic in kvmppc_run_single_vcpu() works properly.
			
 
				+ */
			
 
				+static void kvmppc_nested_cede(struct kvm_vcpu *vcpu)
			
 
				+{
			
 
				+	vcpu->arch.shregs.msr |= MSR_EE;
			
 
				+	vcpu->arch.ceded = 1;
			
 
				+	smp_mb();
			
 
				+	if (vcpu->arch.prodded) {
			
 
				+		vcpu->arch.prodded = 0;
			
 
				+		smp_mb();
			
 
				+		vcpu->arch.ceded = 0;
			
 
				+	}
			
 
				+}
			
 
				+
			
 
				 static int kvmppc_hcall_impl_hv(unsigned long cmd)
			
 
				 {
			
 
				 	switch (cmd) {
			
@@ -1089,7 +1151,6 @@ static int kvmppc_emulate_doorbell_instr(struct kvm_vcpu *vcpu)
 
				 	return RESUME_GUEST;
			
 
				 }
			
 
				 
			
 
				-/* Called with vcpu->arch.vcore->lock held */
			
 
				 static int kvmppc_handle_exit_hv(struct kvm_run *run, struct kvm_vcpu *vcpu,
			
 
				 				 struct task_struct *tsk)
			
 
				 {
			
@@ -1194,7 +1255,10 @@ static int kvmppc_handle_exit_hv(struct kvm_run *run, struct kvm_vcpu *vcpu,
 
				 		break;
			
 
				 	case BOOK3S_INTERRUPT_H_INST_STORAGE:
			
 
				 		vcpu->arch.fault_dar = kvmppc_get_pc(vcpu);
			
 
				-		vcpu->arch.fault_dsisr = 0;
			
 
				+		vcpu->arch.fault_dsisr = vcpu->arch.shregs.msr &
			
 
				+			DSISR_SRR1_MATCH_64S;
			
 
				+		if (vcpu->arch.shregs.msr & HSRR1_HISI_WRITE)
			
 
				+			vcpu->arch.fault_dsisr |= DSISR_ISSTORE;
			
 
				 		r = RESUME_PAGE_FAULT;
			
 
				 		break;
			
 
				 	/*
			
@@ -1210,10 +1274,7 @@ static int kvmppc_handle_exit_hv(struct kvm_run *run, struct kvm_vcpu *vcpu,
 
				 				swab32(vcpu->arch.emul_inst) :
			
 
				 				vcpu->arch.emul_inst;
			
 
				 		if (vcpu->guest_debug & KVM_GUESTDBG_USE_SW_BP) {
			
 
				-			/* Need vcore unlocked to call kvmppc_get_last_inst */
			
 
				-			spin_unlock(&vcpu->arch.vcore->lock);
			
 
				 			r = kvmppc_emulate_debug_inst(run, vcpu);
			
 
				-			spin_lock(&vcpu->arch.vcore->lock);
			
 
				 		} else {
			
 
				 			kvmppc_core_queue_program(vcpu, SRR1_PROGILL);
			
 
				 			r = RESUME_GUEST;
			
@@ -1229,12 +1290,8 @@ static int kvmppc_handle_exit_hv(struct kvm_run *run, struct kvm_vcpu *vcpu,
 
				 	case BOOK3S_INTERRUPT_H_FAC_UNAVAIL:
			
 
				 		r = EMULATE_FAIL;
			
 
				 		if (((vcpu->arch.hfscr >> 56) == FSCR_MSGP_LG) &&
			
 
				-		    cpu_has_feature(CPU_FTR_ARCH_300)) {
			
 
				-			/* Need vcore unlocked to call kvmppc_get_last_inst */
			
 
				-			spin_unlock(&vcpu->arch.vcore->lock);
			
 
				+		    cpu_has_feature(CPU_FTR_ARCH_300))
			
 
				 			r = kvmppc_emulate_doorbell_instr(vcpu);
			
 
				-			spin_lock(&vcpu->arch.vcore->lock);
			
 
				-		}
			
 
				 		if (r == EMULATE_FAIL) {
			
 
				 			kvmppc_core_queue_program(vcpu, SRR1_PROGILL);
			
 
				 			r = RESUME_GUEST;
			
@@ -1269,6 +1326,104 @@ static int kvmppc_handle_exit_hv(struct kvm_run *run, struct kvm_vcpu *vcpu,
 
				 	return r;
			
 
				 }
			
 
				 
			
 
				+static int kvmppc_handle_nested_exit(struct kvm_vcpu *vcpu)
			
 
				+{
			
 
				+	int r;
			
 
				+	int srcu_idx;
			
 
				+
			
 
				+	vcpu->stat.sum_exits++;
			
 
				+
			
 
				+	/*
			
 
				+	 * This can happen if an interrupt occurs in the last stages
			
 
				+	 * of guest entry or the first stages of guest exit (i.e. after
			
 
				+	 * setting paca->kvm_hstate.in_guest to KVM_GUEST_MODE_GUEST_HV
			
 
				+	 * and before setting it to KVM_GUEST_MODE_HOST_HV).
			
 
				+	 * That can happen due to a bug, or due to a machine check
			
 
				+	 * occurring at just the wrong time.
			
 
				+	 */
			
 
				+	if (vcpu->arch.shregs.msr & MSR_HV) {
			
 
				+		pr_emerg("KVM trap in HV mode while nested!\n");
			
 
				+		pr_emerg("trap=0x%x | pc=0x%lx | msr=0x%llx\n",
			
 
				+			 vcpu->arch.trap, kvmppc_get_pc(vcpu),
			
 
				+			 vcpu->arch.shregs.msr);
			
 
				+		kvmppc_dump_regs(vcpu);
			
 
				+		return RESUME_HOST;
			
 
				+	}
			
 
				+	switch (vcpu->arch.trap) {
			
 
				+	/* We're good on these - the host merely wanted to get our attention */
			
 
				+	case BOOK3S_INTERRUPT_HV_DECREMENTER:
			
 
				+		vcpu->stat.dec_exits++;
			
 
				+		r = RESUME_GUEST;
			
 
				+		break;
			
 
				+	case BOOK3S_INTERRUPT_EXTERNAL:
			
 
				+		vcpu->stat.ext_intr_exits++;
			
 
				+		r = RESUME_HOST;
			
 
				+		break;
			
 
				+	case BOOK3S_INTERRUPT_H_DOORBELL:
			
 
				+	case BOOK3S_INTERRUPT_H_VIRT:
			
 
				+		vcpu->stat.ext_intr_exits++;
			
 
				+		r = RESUME_GUEST;
			
 
				+		break;
			
 
				+	/* SR/HMI/PMI are HV interrupts that the host has handled. Resume guest. */
			
 
				+	case BOOK3S_INTERRUPT_HMI:
			
 
				+	case BOOK3S_INTERRUPT_PERFMON:
			
 
				+	case BOOK3S_INTERRUPT_SYSTEM_RESET:
			
 
				+		r = RESUME_GUEST;
			
 
				+		break;
			
 
				+	case BOOK3S_INTERRUPT_MACHINE_CHECK:
			
 
				+		/* Pass the machine check to the L1 guest */
			
 
				+		r = RESUME_HOST;
			
 
				+		/* Print the MCE event to host console. */
			
 
				+		machine_check_print_event_info(&vcpu->arch.mce_evt, false);
			
 
				+		break;
			
 
				+	/*
			
 
				+	 * We get these next two if the guest accesses a page which it thinks
			
 
				+	 * it has mapped but which is not actually present, either because
			
 
				+	 * it is for an emulated I/O device or because the corresponding
			
 
				+	 * host page has been paged out.
			
 
				+	 */
			
 
				+	case BOOK3S_INTERRUPT_H_DATA_STORAGE:
			
 
				+		srcu_idx = srcu_read_lock(&vcpu->kvm->srcu);
			
 
				+		r = kvmhv_nested_page_fault(vcpu);
			
 
				+		srcu_read_unlock(&vcpu->kvm->srcu, srcu_idx);
			
 
				+		break;
			
 
				+	case BOOK3S_INTERRUPT_H_INST_STORAGE:
			
 
				+		vcpu->arch.fault_dar = kvmppc_get_pc(vcpu);
			
 
				+		vcpu->arch.fault_dsisr = kvmppc_get_msr(vcpu) &
			
 
				+					 DSISR_SRR1_MATCH_64S;
			
 
				+		if (vcpu->arch.shregs.msr & HSRR1_HISI_WRITE)
			
 
				+			vcpu->arch.fault_dsisr |= DSISR_ISSTORE;
			
 
				+		srcu_idx = srcu_read_lock(&vcpu->kvm->srcu);
			
 
				+		r = kvmhv_nested_page_fault(vcpu);
			
 
				+		srcu_read_unlock(&vcpu->kvm->srcu, srcu_idx);
			
 
				+		break;
			
 
				+
			
 
				+#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
			
 
				+	case BOOK3S_INTERRUPT_HV_SOFTPATCH:
			
 
				+		/*
			
 
				+		 * This occurs for various TM-related instructions that
			
 
				+		 * we need to emulate on POWER9 DD2.2.  We have already
			
 
				+		 * handled the cases where the guest was in real-suspend
			
 
				+		 * mode and was transitioning to transactional state.
			
 
				+		 */
			
 
				+		r = kvmhv_p9_tm_emulation(vcpu);
			
 
				+		break;
			
 
				+#endif
			
 
				+
			
 
				+	case BOOK3S_INTERRUPT_HV_RM_HARD:
			
 
				+		vcpu->arch.trap = 0;
			
 
				+		r = RESUME_GUEST;
			
 
				+		if (!xive_enabled())
			
 
				+			kvmppc_xics_rm_complete(vcpu, 0);
			
 
				+		break;
			
 
				+	default:
			
 
				+		r = RESUME_HOST;
			
 
				+		break;
			
 
				+	}
			
 
				+
			
 
				+	return r;
			
 
				+}
			
 
				+
			
 
				 static int kvm_arch_vcpu_ioctl_get_sregs_hv(struct kvm_vcpu *vcpu,
			
 
				 					    struct kvm_sregs *sregs)
			
 
				 {
			
@@ -1559,6 +1714,9 @@ static int kvmppc_get_one_reg_hv(struct kvm_vcpu *vcpu, u64 id,
 
				 	case KVM_REG_PPC_ONLINE:
			
 
				 		*val = get_reg_val(id, vcpu->arch.online);
			
 
				 		break;
			
 
				+	case KVM_REG_PPC_PTCR:
			
 
				+		*val = get_reg_val(id, vcpu->kvm->arch.l1_ptcr);
			
 
				+		break;
			
 
				 	default:
			
 
				 		r = -EINVAL;
			
 
				 		break;
			
@@ -1790,6 +1948,9 @@ static int kvmppc_set_one_reg_hv(struct kvm_vcpu *vcpu, u64 id,
 
				 			atomic_dec(&vcpu->arch.vcore->online_count);
			
 
				 		vcpu->arch.online = i;
			
 
				 		break;
			
 
				+	case KVM_REG_PPC_PTCR:
			
 
				+		vcpu->kvm->arch.l1_ptcr = set_reg_val(id, *val);
			
 
				+		break;
			
 
				 	default:
			
 
				 		r = -EINVAL;
			
 
				 		break;
			
@@ -2023,15 +2184,18 @@ static struct kvm_vcpu *kvmppc_core_vcpu_create_hv(struct kvm *kvm,
 
				 	 * Set the default HFSCR for the guest from the host value.
			
 
				 	 * This value is only used on POWER9.
			
 
				 	 * On POWER9, we want to virtualize the doorbell facility, so we
			
 
				-	 * turn off the HFSCR bit, which causes those instructions to trap.
			
 
				+	 * don't set the HFSCR_MSGP bit, and that causes those instructions
			
 
				+	 * to trap and then we emulate them.
			
 
				 	 */
			
 
				-	vcpu->arch.hfscr = mfspr(SPRN_HFSCR);
			
 
				-	if (cpu_has_feature(CPU_FTR_P9_TM_HV_ASSIST))
			
 
				+	vcpu->arch.hfscr = HFSCR_TAR | HFSCR_EBB | HFSCR_PM | HFSCR_BHRB |
			
 
				+		HFSCR_DSCR | HFSCR_VECVSX | HFSCR_FP;
			
 
				+	if (cpu_has_feature(CPU_FTR_HVMODE)) {
			
 
				+		vcpu->arch.hfscr &= mfspr(SPRN_HFSCR);
			
 
				+		if (cpu_has_feature(CPU_FTR_P9_TM_HV_ASSIST))
			
 
				+			vcpu->arch.hfscr |= HFSCR_TM;
			
 
				+	}
			
 
				+	if (cpu_has_feature(CPU_FTR_TM_COMP))
			
 
				 		vcpu->arch.hfscr |= HFSCR_TM;
			
 
				-	else if (!cpu_has_feature(CPU_FTR_TM_COMP))
			
 
				-		vcpu->arch.hfscr &= ~HFSCR_TM;
			
 
				-	if (cpu_has_feature(CPU_FTR_ARCH_300))
			
 
				-		vcpu->arch.hfscr &= ~HFSCR_MSGP;
			
 
				 
			
 
				 	kvmppc_mmu_book3s_hv_init(vcpu);
			
 
				 
			
@@ -2246,10 +2410,18 @@ static void kvmppc_release_hwthread(int cpu)
 
				 
			
 
				 static void radix_flush_cpu(struct kvm *kvm, int cpu, struct kvm_vcpu *vcpu)
			
 
				 {
			
 
				+	struct kvm_nested_guest *nested = vcpu->arch.nested;
			
 
				+	cpumask_t *cpu_in_guest;
			
 
				 	int i;
			
 
				 
			
 
				 	cpu = cpu_first_thread_sibling(cpu);
			
 
				-	cpumask_set_cpu(cpu, &kvm->arch.need_tlb_flush);
			
 
				+	if (nested) {
			
 
				+		cpumask_set_cpu(cpu, &nested->need_tlb_flush);
			
 
				+		cpu_in_guest = &nested->cpu_in_guest;
			
 
				+	} else {
			
 
				+		cpumask_set_cpu(cpu, &kvm->arch.need_tlb_flush);
			
 
				+		cpu_in_guest = &kvm->arch.cpu_in_guest;
			
 
				+	}
			
 
				 	/*
			
 
				 	 * Make sure setting of bit in need_tlb_flush precedes
			
 
				 	 * testing of cpu_in_guest bits.  The matching barrier on
			
@@ -2257,13 +2429,23 @@ static void radix_flush_cpu(struct kvm *kvm, int cpu, struct kvm_vcpu *vcpu)
 
				 	 */
			
 
				 	smp_mb();
			
 
				 	for (i = 0; i < threads_per_core; ++i)
			
 
				-		if (cpumask_test_cpu(cpu + i, &kvm->arch.cpu_in_guest))
			
 
				+		if (cpumask_test_cpu(cpu + i, cpu_in_guest))
			
 
				 			smp_call_function_single(cpu + i, do_nothing, NULL, 1);
			
 
				 }
			
 
				 
			
 
				 static void kvmppc_prepare_radix_vcpu(struct kvm_vcpu *vcpu, int pcpu)
			
 
				 {
			
 
				+	struct kvm_nested_guest *nested = vcpu->arch.nested;
			
 
				 	struct kvm *kvm = vcpu->kvm;
			
 
				+	int prev_cpu;
			
 
				+
			
 
				+	if (!cpu_has_feature(CPU_FTR_HVMODE))
			
 
				+		return;
			
 
				+
			
 
				+	if (nested)
			
 
				+		prev_cpu = nested->prev_cpu[vcpu->arch.nested_vcpu_id];
			
 
				+	else
			
 
				+		prev_cpu = vcpu->arch.prev_cpu;
			
 
				 
			
 
				 	/*
			
 
				 	 * With radix, the guest can do TLB invalidations itself,
			
@@ -2277,12 +2459,46 @@ static void kvmppc_prepare_radix_vcpu(struct kvm_vcpu *vcpu, int pcpu)
 
				 	 * ran to flush the TLB.  The TLB is shared between threads,
			
 
				 	 * so we use a single bit in .need_tlb_flush for all 4 threads.
			
 
				 	 */
			
 
				-	if (vcpu->arch.prev_cpu != pcpu) {
			
 
				-		if (vcpu->arch.prev_cpu >= 0 &&
			
 
				-		    cpu_first_thread_sibling(vcpu->arch.prev_cpu) !=
			
 
				+	if (prev_cpu != pcpu) {
			
 
				+		if (prev_cpu >= 0 &&
			
 
				+		    cpu_first_thread_sibling(prev_cpu) !=
			
 
				 		    cpu_first_thread_sibling(pcpu))
			
 
				-			radix_flush_cpu(kvm, vcpu->arch.prev_cpu, vcpu);
			
 
				-		vcpu->arch.prev_cpu = pcpu;
			
 
				+			radix_flush_cpu(kvm, prev_cpu, vcpu);
			
 
				+		if (nested)
			
 
				+			nested->prev_cpu[vcpu->arch.nested_vcpu_id] = pcpu;
			
 
				+		else
			
 
				+			vcpu->arch.prev_cpu = pcpu;
			
 
				+	}
			
 
				+}
			
 
				+
			
 
				+static void kvmppc_radix_check_need_tlb_flush(struct kvm *kvm, int pcpu,
			
 
				+					      struct kvm_nested_guest *nested)
			
 
				+{
			
 
				+	cpumask_t *need_tlb_flush;
			
 
				+	int lpid;
			
 
				+
			
 
				+	if (!cpu_has_feature(CPU_FTR_HVMODE))
			
 
				+		return;
			
 
				+
			
 
				+	if (cpu_has_feature(CPU_FTR_ARCH_300))
			
 
				+		pcpu &= ~0x3UL;
			
 
				+
			
 
				+	if (nested) {
			
 
				+		lpid = nested->shadow_lpid;
			
 
				+		need_tlb_flush = &nested->need_tlb_flush;
			
 
				+	} else {
			
 
				+		lpid = kvm->arch.lpid;
			
 
				+		need_tlb_flush = &kvm->arch.need_tlb_flush;
			
 
				+	}
			
 
				+
			
 
				+	mtspr(SPRN_LPID, lpid);
			
 
				+	isync();
			
 
				+	smp_mb();
			
 
				+
			
 
				+	if (cpumask_test_cpu(pcpu, need_tlb_flush)) {
			
 
				+		radix__local_flush_tlb_lpid_guest(lpid);
			
 
				+		/* Clear the bit after the TLB flush */
			
 
				+		cpumask_clear_cpu(pcpu, need_tlb_flush);
			
 
				 	}
			
 
				 }
			
 
				 
			
@@ -2608,6 +2824,14 @@ static void post_guest_process(struct kvmppc_vcore *vc, bool is_master)
 
				 	spin_lock(&vc->lock);
			
 
				 	now = get_tb();
			
 
				 	for_each_runnable_thread(i, vcpu, vc) {
			
 
				+		/*
			
 
				+		 * It's safe to unlock the vcore in the loop here, because
			
 
				+		 * for_each_runnable_thread() is safe against removal of
			
 
				+		 * the vcpu, and the vcore state is VCORE_EXITING here,
			
 
				+		 * so any vcpus becoming runnable will have their arch.trap
			
 
				+		 * set to zero and can't actually run in the guest.
			
 
				+		 */
			
 
				+		spin_unlock(&vc->lock);
			
 
				 		/* cancel pending dec exception if dec is positive */
			
 
				 		if (now < vcpu->arch.dec_expires &&
			
 
				 		    kvmppc_core_pending_dec(vcpu))
			
@@ -2623,6 +2847,7 @@ static void post_guest_process(struct kvmppc_vcore *vc, bool is_master)
 
				 		vcpu->arch.ret = ret;
			
 
				 		vcpu->arch.trap = 0;
			
 
				 
			
 
				+		spin_lock(&vc->lock);
			
 
				 		if (is_kvmppc_resume_guest(vcpu->arch.ret)) {
			
 
				 			if (vcpu->arch.pending_exceptions)
			
 
				 				kvmppc_core_prepare_to_enter(vcpu);
			
@@ -2971,8 +3196,6 @@ static noinline void kvmppc_run_core(struct kvmppc_vcore *vc)
 
				 		spin_unlock(&core_info.vc[sub]->lock);
			
 
				 
			
 
				 	if (kvm_is_radix(vc->kvm)) {
			
 
				-		int tmp = pcpu;
			
 
				-
			
 
				 		/*
			
 
				 		 * Do we need to flush the process scoped TLB for the LPAR?
			
 
				 		 *
			
@@ -2983,17 +3206,7 @@ static noinline void kvmppc_run_core(struct kvmppc_vcore *vc)
 
				 		 *
			
 
				 		 * Hash must be flushed in realmode in order to use tlbiel.
			
 
				 		 */
			
 
				-		mtspr(SPRN_LPID, vc->kvm->arch.lpid);
			
 
				-		isync();
			
 
				-
			
 
				-		if (cpu_has_feature(CPU_FTR_ARCH_300))
			
 
				-			tmp &= ~0x3UL;
			
 
				-
			
 
				-		if (cpumask_test_cpu(tmp, &vc->kvm->arch.need_tlb_flush)) {
			
 
				-			radix__local_flush_tlb_lpid_guest(vc->kvm->arch.lpid);
			
 
				-			/* Clear the bit after the TLB flush */
			
 
				-			cpumask_clear_cpu(tmp, &vc->kvm->arch.need_tlb_flush);
			
 
				-		}
			
 
				+		kvmppc_radix_check_need_tlb_flush(vc->kvm, pcpu, NULL);
			
 
				 	}
			
 
				 
			
 
				 	/*
			
@@ -3087,6 +3300,300 @@ static noinline void kvmppc_run_core(struct kvmppc_vcore *vc)
 
				 	trace_kvmppc_run_core(vc, 1);
			
 
				 }
			
 
				 
			
 
				+/*
			
 
				+ * Load up hypervisor-mode registers on P9.
			
 
				+ */
			
 
				+static int kvmhv_load_hv_regs_and_go(struct kvm_vcpu *vcpu, u64 time_limit,
			
 
				+				     unsigned long lpcr)
			
 
				+{
			
 
				+	struct kvmppc_vcore *vc = vcpu->arch.vcore;
			
 
				+	s64 hdec;
			
 
				+	u64 tb, purr, spurr;
			
 
				+	int trap;
			
 
				+	unsigned long host_hfscr = mfspr(SPRN_HFSCR);
			
 
				+	unsigned long host_ciabr = mfspr(SPRN_CIABR);
			
 
				+	unsigned long host_dawr = mfspr(SPRN_DAWR);
			
 
				+	unsigned long host_dawrx = mfspr(SPRN_DAWRX);
			
 
				+	unsigned long host_psscr = mfspr(SPRN_PSSCR);
			
 
				+	unsigned long host_pidr = mfspr(SPRN_PID);
			
 
				+
			
 
				+	hdec = time_limit - mftb();
			
 
				+	if (hdec < 0)
			
 
				+		return BOOK3S_INTERRUPT_HV_DECREMENTER;
			
 
				+	mtspr(SPRN_HDEC, hdec);
			
 
				+
			
 
				+	if (vc->tb_offset) {
			
 
				+		u64 new_tb = mftb() + vc->tb_offset;
			
 
				+		mtspr(SPRN_TBU40, new_tb);
			
 
				+		tb = mftb();
			
 
				+		if ((tb & 0xffffff) < (new_tb & 0xffffff))
			
 
				+			mtspr(SPRN_TBU40, new_tb + 0x1000000);
			
 
				+		vc->tb_offset_applied = vc->tb_offset;
			
 
				+	}
			
 
				+
			
 
				+	if (vc->pcr)
			
 
				+		mtspr(SPRN_PCR, vc->pcr);
			
 
				+	mtspr(SPRN_DPDES, vc->dpdes);
			
 
				+	mtspr(SPRN_VTB, vc->vtb);
			
 
				+
			
 
				+	local_paca->kvm_hstate.host_purr = mfspr(SPRN_PURR);
			
 
				+	local_paca->kvm_hstate.host_spurr = mfspr(SPRN_SPURR);
			
 
				+	mtspr(SPRN_PURR, vcpu->arch.purr);
			
 
				+	mtspr(SPRN_SPURR, vcpu->arch.spurr);
			
 
				+
			
 
				+	if (cpu_has_feature(CPU_FTR_DAWR)) {
			
 
				+		mtspr(SPRN_DAWR, vcpu->arch.dawr);
			
 
				+		mtspr(SPRN_DAWRX, vcpu->arch.dawrx);
			
 
				+	}
			
 
				+	mtspr(SPRN_CIABR, vcpu->arch.ciabr);
			
 
				+	mtspr(SPRN_IC, vcpu->arch.ic);
			
 
				+	mtspr(SPRN_PID, vcpu->arch.pid);
			
 
				+
			
 
				+	mtspr(SPRN_PSSCR, vcpu->arch.psscr | PSSCR_EC |
			
 
				+	      (local_paca->kvm_hstate.fake_suspend << PSSCR_FAKE_SUSPEND_LG));
			
 
				+
			
 
				+	mtspr(SPRN_HFSCR, vcpu->arch.hfscr);
			
 
				+
			
 
				+	mtspr(SPRN_SPRG0, vcpu->arch.shregs.sprg0);
			
 
				+	mtspr(SPRN_SPRG1, vcpu->arch.shregs.sprg1);
			
 
				+	mtspr(SPRN_SPRG2, vcpu->arch.shregs.sprg2);
			
 
				+	mtspr(SPRN_SPRG3, vcpu->arch.shregs.sprg3);
			
 
				+
			
 
				+	mtspr(SPRN_AMOR, ~0UL);
			
 
				+
			
 
				+	mtspr(SPRN_LPCR, lpcr);
			
 
				+	isync();
			
 
				+
			
 
				+	kvmppc_xive_push_vcpu(vcpu);
			
 
				+
			
 
				+	mtspr(SPRN_SRR0, vcpu->arch.shregs.srr0);
			
 
				+	mtspr(SPRN_SRR1, vcpu->arch.shregs.srr1);
			
 
				+
			
 
				+	trap = __kvmhv_vcpu_entry_p9(vcpu);
			
 
				+
			
 
				+	/* Advance host PURR/SPURR by the amount used by guest */
			
 
				+	purr = mfspr(SPRN_PURR);
			
 
				+	spurr = mfspr(SPRN_SPURR);
			
 
				+	mtspr(SPRN_PURR, local_paca->kvm_hstate.host_purr +
			
 
				+	      purr - vcpu->arch.purr);
			
 
				+	mtspr(SPRN_SPURR, local_paca->kvm_hstate.host_spurr +
			
 
				+	      spurr - vcpu->arch.spurr);
			
 
				+	vcpu->arch.purr = purr;
			
 
				+	vcpu->arch.spurr = spurr;
			
 
				+
			
 
				+	vcpu->arch.ic = mfspr(SPRN_IC);
			
 
				+	vcpu->arch.pid = mfspr(SPRN_PID);
			
 
				+	vcpu->arch.psscr = mfspr(SPRN_PSSCR) & PSSCR_GUEST_VIS;
			
 
				+
			
 
				+	vcpu->arch.shregs.sprg0 = mfspr(SPRN_SPRG0);
			
 
				+	vcpu->arch.shregs.sprg1 = mfspr(SPRN_SPRG1);
			
 
				+	vcpu->arch.shregs.sprg2 = mfspr(SPRN_SPRG2);
			
 
				+	vcpu->arch.shregs.sprg3 = mfspr(SPRN_SPRG3);
			
 
				+
			
 
				+	mtspr(SPRN_PSSCR, host_psscr);
			
 
				+	mtspr(SPRN_HFSCR, host_hfscr);
			
 
				+	mtspr(SPRN_CIABR, host_ciabr);
			
 
				+	mtspr(SPRN_DAWR, host_dawr);
			
 
				+	mtspr(SPRN_DAWRX, host_dawrx);
			
 
				+	mtspr(SPRN_PID, host_pidr);
			
 
				+
			
 
				+	/*
			
 
				+	 * Since this is radix, do an eieio; tlbsync; ptesync sequence in
			
 
				+	 * case we interrupted the guest between a tlbie and a ptesync.
			
 
				+	 */
			
 
				+	asm volatile("eieio; tlbsync; ptesync");
			
 
				+
			
 
				+	mtspr(SPRN_LPID, vcpu->kvm->arch.host_lpid);	/* restore host LPID */
			
 
				+	isync();
			
 
				+
			
 
				+	vc->dpdes = mfspr(SPRN_DPDES);
			
 
				+	vc->vtb = mfspr(SPRN_VTB);
			
 
				+	mtspr(SPRN_DPDES, 0);
			
 
				+	if (vc->pcr)
			
 
				+		mtspr(SPRN_PCR, 0);
			
 
				+
			
 
				+	if (vc->tb_offset_applied) {
			
 
				+		u64 new_tb = mftb() - vc->tb_offset_applied;
			
 
				+		mtspr(SPRN_TBU40, new_tb);
			
 
				+		tb = mftb();
			
 
				+		if ((tb & 0xffffff) < (new_tb & 0xffffff))
			
 
				+			mtspr(SPRN_TBU40, new_tb + 0x1000000);
			
 
				+		vc->tb_offset_applied = 0;
			
 
				+	}
			
 
				+
			
 
				+	mtspr(SPRN_HDEC, 0x7fffffff);
			
 
				+	mtspr(SPRN_LPCR, vcpu->kvm->arch.host_lpcr);
			
 
				+
			
 
				+	return trap;
			
 
				+}
			
 
				+
			
 
				+/*
			
 
				+ * Virtual-mode guest entry for POWER9 and later when the host and
			
 
				+ * guest are both using the radix MMU.  The LPIDR has already been set.
			
 
				+ */
			
 
				+int kvmhv_p9_guest_entry(struct kvm_vcpu *vcpu, u64 time_limit,
			
 
				+			 unsigned long lpcr)
			
 
				+{
			
 
				+	struct kvmppc_vcore *vc = vcpu->arch.vcore;
			
 
				+	unsigned long host_dscr = mfspr(SPRN_DSCR);
			
 
				+	unsigned long host_tidr = mfspr(SPRN_TIDR);
			
 
				+	unsigned long host_iamr = mfspr(SPRN_IAMR);
			
 
				+	s64 dec;
			
 
				+	u64 tb;
			
 
				+	int trap, save_pmu;
			
 
				+
			
 
				+	dec = mfspr(SPRN_DEC);
			
 
				+	tb = mftb();
			
 
				+	if (dec < 512)
			
 
				+		return BOOK3S_INTERRUPT_HV_DECREMENTER;
			
 
				+	local_paca->kvm_hstate.dec_expires = dec + tb;
			
 
				+	if (local_paca->kvm_hstate.dec_expires < time_limit)
			
 
				+		time_limit = local_paca->kvm_hstate.dec_expires;
			
 
				+
			
 
				+	vcpu->arch.ceded = 0;
			
 
				+
			
 
				+	kvmhv_save_host_pmu();		/* saves it to PACA kvm_hstate */
			
 
				+
			
 
				+	kvmppc_subcore_enter_guest();
			
 
				+
			
 
				+	vc->entry_exit_map = 1;
			
 
				+	vc->in_guest = 1;
			
 
				+
			
 
				+	if (vcpu->arch.vpa.pinned_addr) {
			
 
				+		struct lppaca *lp = vcpu->arch.vpa.pinned_addr;
			
 
				+		u32 yield_count = be32_to_cpu(lp->yield_count) + 1;
			
 
				+		lp->yield_count = cpu_to_be32(yield_count);
			
 
				+		vcpu->arch.vpa.dirty = 1;
			
 
				+	}
			
 
				+
			
 
				+	if (cpu_has_feature(CPU_FTR_TM) ||
			
 
				+	    cpu_has_feature(CPU_FTR_P9_TM_HV_ASSIST))
			
 
				+		kvmppc_restore_tm_hv(vcpu, vcpu->arch.shregs.msr, true);
			
 
				+
			
 
				+	kvmhv_load_guest_pmu(vcpu);
			
 
				+
			
 
				+	msr_check_and_set(MSR_FP | MSR_VEC | MSR_VSX);
			
 
				+	load_fp_state(&vcpu->arch.fp);
			
 
				+#ifdef CONFIG_ALTIVEC
			
 
				+	load_vr_state(&vcpu->arch.vr);
			
 
				+#endif
			
 
				+
			
 
				+	mtspr(SPRN_DSCR, vcpu->arch.dscr);
			
 
				+	mtspr(SPRN_IAMR, vcpu->arch.iamr);
			
 
				+	mtspr(SPRN_PSPB, vcpu->arch.pspb);
			
 
				+	mtspr(SPRN_FSCR, vcpu->arch.fscr);
			
 
				+	mtspr(SPRN_TAR, vcpu->arch.tar);
			
 
				+	mtspr(SPRN_EBBHR, vcpu->arch.ebbhr);
			
 
				+	mtspr(SPRN_EBBRR, vcpu->arch.ebbrr);
			
 
				+	mtspr(SPRN_BESCR, vcpu->arch.bescr);
			
 
				+	mtspr(SPRN_WORT, vcpu->arch.wort);
			
 
				+	mtspr(SPRN_TIDR, vcpu->arch.tid);
			
 
				+	mtspr(SPRN_DAR, vcpu->arch.shregs.dar);
			
 
				+	mtspr(SPRN_DSISR, vcpu->arch.shregs.dsisr);
			
 
				+	mtspr(SPRN_AMR, vcpu->arch.amr);
			
 
				+	mtspr(SPRN_UAMOR, vcpu->arch.uamor);
			
 
				+
			
 
				+	if (!(vcpu->arch.ctrl & 1))
			
 
				+		mtspr(SPRN_CTRLT, mfspr(SPRN_CTRLF) & ~1);
			
 
				+
			
 
				+	mtspr(SPRN_DEC, vcpu->arch.dec_expires - mftb());
			
 
				+
			
 
				+	if (kvmhv_on_pseries()) {
			
 
				+		/* call our hypervisor to load up HV regs and go */
			
 
				+		struct hv_guest_state hvregs;
			
 
				+
			
 
				+		kvmhv_save_hv_regs(vcpu, &hvregs);
			
 
				+		hvregs.lpcr = lpcr;
			
 
				+		vcpu->arch.regs.msr = vcpu->arch.shregs.msr;
			
 
				+		hvregs.version = HV_GUEST_STATE_VERSION;
			
 
				+		if (vcpu->arch.nested) {
			
 
				+			hvregs.lpid = vcpu->arch.nested->shadow_lpid;
			
 
				+			hvregs.vcpu_token = vcpu->arch.nested_vcpu_id;
			
 
				+		} else {
			
 
				+			hvregs.lpid = vcpu->kvm->arch.lpid;
			
 
				+			hvregs.vcpu_token = vcpu->vcpu_id;
			
 
				+		}
			
 
				+		hvregs.hdec_expiry = time_limit;
			
 
				+		trap = plpar_hcall_norets(H_ENTER_NESTED, __pa(&hvregs),
			
 
				+					  __pa(&vcpu->arch.regs));
			
 
				+		kvmhv_restore_hv_return_state(vcpu, &hvregs);
			
 
				+		vcpu->arch.shregs.msr = vcpu->arch.regs.msr;
			
 
				+		vcpu->arch.shregs.dar = mfspr(SPRN_DAR);
			
 
				+		vcpu->arch.shregs.dsisr = mfspr(SPRN_DSISR);
			
 
				+
			
 
				+		/* H_CEDE has to be handled now, not later */
			
 
				+		if (trap == BOOK3S_INTERRUPT_SYSCALL && !vcpu->arch.nested &&
			
 
				+		    kvmppc_get_gpr(vcpu, 3) == H_CEDE) {
			
 
				+			kvmppc_nested_cede(vcpu);
			
 
				+			trap = 0;
			
 
				+		}
			
 
				+	} else {
			
 
				+		trap = kvmhv_load_hv_regs_and_go(vcpu, time_limit, lpcr);
			
 
				+	}
			
 
				+
			
 
				+	vcpu->arch.slb_max = 0;
			
 
				+	dec = mfspr(SPRN_DEC);
			
 
				+	tb = mftb();
			
 
				+	vcpu->arch.dec_expires = dec + tb;
			
 
				+	vcpu->cpu = -1;
			
 
				+	vcpu->arch.thread_cpu = -1;
			
 
				+	vcpu->arch.ctrl = mfspr(SPRN_CTRLF);
			
 
				+
			
 
				+	vcpu->arch.iamr = mfspr(SPRN_IAMR);
			
 
				+	vcpu->arch.pspb = mfspr(SPRN_PSPB);
			
 
				+	vcpu->arch.fscr = mfspr(SPRN_FSCR);
			
 
				+	vcpu->arch.tar = mfspr(SPRN_TAR);
			
 
				+	vcpu->arch.ebbhr = mfspr(SPRN_EBBHR);
			
 
				+	vcpu->arch.ebbrr = mfspr(SPRN_EBBRR);
			
 
				+	vcpu->arch.bescr = mfspr(SPRN_BESCR);
			
 
				+	vcpu->arch.wort = mfspr(SPRN_WORT);
			
 
				+	vcpu->arch.tid = mfspr(SPRN_TIDR);
			
 
				+	vcpu->arch.amr = mfspr(SPRN_AMR);
			
 
				+	vcpu->arch.uamor = mfspr(SPRN_UAMOR);
			
 
				+	vcpu->arch.dscr = mfspr(SPRN_DSCR);
			
 
				+
			
 
				+	mtspr(SPRN_PSPB, 0);
			
 
				+	mtspr(SPRN_WORT, 0);
			
 
				+	mtspr(SPRN_AMR, 0);
			
 
				+	mtspr(SPRN_UAMOR, 0);
			
 
				+	mtspr(SPRN_DSCR, host_dscr);
			
 
				+	mtspr(SPRN_TIDR, host_tidr);
			
 
				+	mtspr(SPRN_IAMR, host_iamr);
			
 
				+	mtspr(SPRN_PSPB, 0);
			
 
				+
			
 
				+	msr_check_and_set(MSR_FP | MSR_VEC | MSR_VSX);
			
 
				+	store_fp_state(&vcpu->arch.fp);
			
 
				+#ifdef CONFIG_ALTIVEC
			
 
				+	store_vr_state(&vcpu->arch.vr);
			
 
				+#endif
			
 
				+
			
 
				+	if (cpu_has_feature(CPU_FTR_TM) ||
			
 
				+	    cpu_has_feature(CPU_FTR_P9_TM_HV_ASSIST))
			
 
				+		kvmppc_save_tm_hv(vcpu, vcpu->arch.shregs.msr, true);
			
 
				+
			
 
				+	save_pmu = 1;
			
 
				+	if (vcpu->arch.vpa.pinned_addr) {
			
 
				+		struct lppaca *lp = vcpu->arch.vpa.pinned_addr;
			
 
				+		u32 yield_count = be32_to_cpu(lp->yield_count) + 1;
			
 
				+		lp->yield_count = cpu_to_be32(yield_count);
			
 
				+		vcpu->arch.vpa.dirty = 1;
			
 
				+		save_pmu = lp->pmcregs_in_use;
			
 
				+	}
			
 
				+
			
 
				+	kvmhv_save_guest_pmu(vcpu, save_pmu);
			
 
				+
			
 
				+	vc->entry_exit_map = 0x101;
			
 
				+	vc->in_guest = 0;
			
 
				+
			
 
				+	mtspr(SPRN_DEC, local_paca->kvm_hstate.dec_expires - mftb());
			
 
				+
			
 
				+	kvmhv_load_host_pmu();
			
 
				+
			
 
				+	kvmppc_subcore_exit_guest();
			
 
				+
			
 
				+	return trap;
			
 
				+}
			
 
				+
			
 
				 /*
			
 
				  * Wait for some other vcpu thread to execute us, and
			
 
				  * wake us up when we need to handle something in the host.
			
@@ -3264,6 +3771,11 @@ out:
 
				 	trace_kvmppc_vcore_wakeup(do_sleep, block_ns);
			
 
				 }
			
 
				 
			
 
				+/*
			
 
				+ * This never fails for a radix guest, as none of the operations it does
			
 
				+ * for a radix guest can fail or have a way to report failure.
			
 
				+ * kvmhv_run_single_vcpu() relies on this fact.
			
 
				+ */
			
 
				 static int kvmhv_setup_mmu(struct kvm_vcpu *vcpu)
			
 
				 {
			
 
				 	int r = 0;
			
@@ -3413,6 +3925,171 @@ static int kvmppc_run_vcpu(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
 
				 	return vcpu->arch.ret;
			
 
				 }
			
 
				 
			
 
				+int kvmhv_run_single_vcpu(struct kvm_run *kvm_run,
			
 
				+			  struct kvm_vcpu *vcpu, u64 time_limit,
			
 
				+			  unsigned long lpcr)
			
 
				+{
			
 
				+	int trap, r, pcpu;
			
 
				+	int srcu_idx;
			
 
				+	struct kvmppc_vcore *vc;
			
 
				+	struct kvm *kvm = vcpu->kvm;
			
 
				+	struct kvm_nested_guest *nested = vcpu->arch.nested;
			
 
				+
			
 
				+	trace_kvmppc_run_vcpu_enter(vcpu);
			
 
				+
			
 
				+	kvm_run->exit_reason = 0;
			
 
				+	vcpu->arch.ret = RESUME_GUEST;
			
 
				+	vcpu->arch.trap = 0;
			
 
				+
			
 
				+	vc = vcpu->arch.vcore;
			
 
				+	vcpu->arch.ceded = 0;
			
 
				+	vcpu->arch.run_task = current;
			
 
				+	vcpu->arch.kvm_run = kvm_run;
			
 
				+	vcpu->arch.stolen_logged = vcore_stolen_time(vc, mftb());
			
 
				+	vcpu->arch.state = KVMPPC_VCPU_RUNNABLE;
			
 
				+	vcpu->arch.busy_preempt = TB_NIL;
			
 
				+	vcpu->arch.last_inst = KVM_INST_FETCH_FAILED;
			
 
				+	vc->runnable_threads[0] = vcpu;
			
 
				+	vc->n_runnable = 1;
			
 
				+	vc->runner = vcpu;
			
 
				+
			
 
				+	/* See if the MMU is ready to go */
			
 
				+	if (!kvm->arch.mmu_ready)
			
 
				+		kvmhv_setup_mmu(vcpu);
			
 
				+
			
 
				+	if (need_resched())
			
 
				+		cond_resched();
			
 
				+
			
 
				+	kvmppc_update_vpas(vcpu);
			
 
				+
			
 
				+	init_vcore_to_run(vc);
			
 
				+	vc->preempt_tb = TB_NIL;
			
 
				+
			
 
				+	preempt_disable();
			
 
				+	pcpu = smp_processor_id();
			
 
				+	vc->pcpu = pcpu;
			
 
				+	kvmppc_prepare_radix_vcpu(vcpu, pcpu);
			
 
				+
			
 
				+	local_irq_disable();
			
 
				+	hard_irq_disable();
			
 
				+	if (signal_pending(current))
			
 
				+		goto sigpend;
			
 
				+	if (lazy_irq_pending() || need_resched() || !kvm->arch.mmu_ready)
			
 
				+		goto out;
			
 
				+
			
 
				+	if (!nested) {
			
 
				+		kvmppc_core_prepare_to_enter(vcpu);
			
 
				+		if (vcpu->arch.doorbell_request) {
			
 
				+			vc->dpdes = 1;
			
 
				+			smp_wmb();
			
 
				+			vcpu->arch.doorbell_request = 0;
			
 
				+		}
			
 
				+		if (test_bit(BOOK3S_IRQPRIO_EXTERNAL,
			
 
				+			     &vcpu->arch.pending_exceptions))
			
 
				+			lpcr |= LPCR_MER;
			
 
				+	} else if (vcpu->arch.pending_exceptions ||
			
 
				+		   vcpu->arch.doorbell_request ||
			
 
				+		   xive_interrupt_pending(vcpu)) {
			
 
				+		vcpu->arch.ret = RESUME_HOST;
			
 
				+		goto out;
			
 
				+	}
			
 
				+
			
 
				+	kvmppc_clear_host_core(pcpu);
			
 
				+
			
 
				+	local_paca->kvm_hstate.tid = 0;
			
 
				+	local_paca->kvm_hstate.napping = 0;
			
 
				+	local_paca->kvm_hstate.kvm_split_mode = NULL;
			
 
				+	kvmppc_start_thread(vcpu, vc);
			
 
				+	kvmppc_create_dtl_entry(vcpu, vc);
			
 
				+	trace_kvm_guest_enter(vcpu);
			
 
				+
			
 
				+	vc->vcore_state = VCORE_RUNNING;
			
 
				+	trace_kvmppc_run_core(vc, 0);
			
 
				+
			
 
				+	if (cpu_has_feature(CPU_FTR_HVMODE))
			
 
				+		kvmppc_radix_check_need_tlb_flush(kvm, pcpu, nested);
			
 
				+
			
 
				+	trace_hardirqs_on();
			
 
				+	guest_enter_irqoff();
			
 
				+
			
 
				+	srcu_idx = srcu_read_lock(&kvm->srcu);
			
 
				+
			
 
				+	this_cpu_disable_ftrace();
			
 
				+
			
 
				+	trap = kvmhv_p9_guest_entry(vcpu, time_limit, lpcr);
			
 
				+	vcpu->arch.trap = trap;
			
 
				+
			
 
				+	this_cpu_enable_ftrace();
			
 
				+
			
 
				+	srcu_read_unlock(&kvm->srcu, srcu_idx);
			
 
				+
			
 
				+	if (cpu_has_feature(CPU_FTR_HVMODE)) {
			
 
				+		mtspr(SPRN_LPID, kvm->arch.host_lpid);
			
 
				+		isync();
			
 
				+	}
			
 
				+
			
 
				+	trace_hardirqs_off();
			
 
				+	set_irq_happened(trap);
			
 
				+
			
 
				+	kvmppc_set_host_core(pcpu);
			
 
				+
			
 
				+	local_irq_enable();
			
 
				+	guest_exit();
			
 
				+
			
 
				+	cpumask_clear_cpu(pcpu, &kvm->arch.cpu_in_guest);
			
 
				+
			
 
				+	preempt_enable();
			
 
				+
			
 
				+	/* cancel pending decrementer exception if DEC is now positive */
			
 
				+	if (get_tb() < vcpu->arch.dec_expires && kvmppc_core_pending_dec(vcpu))
			
 
				+		kvmppc_core_dequeue_dec(vcpu);
			
 
				+
			
 
				+	trace_kvm_guest_exit(vcpu);
			
 
				+	r = RESUME_GUEST;
			
 
				+	if (trap) {
			
 
				+		if (!nested)
			
 
				+			r = kvmppc_handle_exit_hv(kvm_run, vcpu, current);
			
 
				+		else
			
 
				+			r = kvmppc_handle_nested_exit(vcpu);
			
 
				+	}
			
 
				+	vcpu->arch.ret = r;
			
 
				+
			
 
				+	if (is_kvmppc_resume_guest(r) && vcpu->arch.ceded &&
			
 
				+	    !kvmppc_vcpu_woken(vcpu)) {
			
 
				+		kvmppc_set_timer(vcpu);
			
 
				+		while (vcpu->arch.ceded && !kvmppc_vcpu_woken(vcpu)) {
			
 
				+			if (signal_pending(current)) {
			
 
				+				vcpu->stat.signal_exits++;
			
 
				+				kvm_run->exit_reason = KVM_EXIT_INTR;
			
 
				+				vcpu->arch.ret = -EINTR;
			
 
				+				break;
			
 
				+			}
			
 
				+			spin_lock(&vc->lock);
			
 
				+			kvmppc_vcore_blocked(vc);
			
 
				+			spin_unlock(&vc->lock);
			
 
				+		}
			
 
				+	}
			
 
				+	vcpu->arch.ceded = 0;
			
 
				+
			
 
				+	vc->vcore_state = VCORE_INACTIVE;
			
 
				+	trace_kvmppc_run_core(vc, 1);
			
 
				+
			
 
				+ done:
			
 
				+	kvmppc_remove_runnable(vc, vcpu);
			
 
				+	trace_kvmppc_run_vcpu_exit(vcpu, kvm_run);
			
 
				+
			
 
				+	return vcpu->arch.ret;
			
 
				+
			
 
				+ sigpend:
			
 
				+	vcpu->stat.signal_exits++;
			
 
				+	kvm_run->exit_reason = KVM_EXIT_INTR;
			
 
				+	vcpu->arch.ret = -EINTR;
			
 
				+ out:
			
 
				+	local_irq_enable();
			
 
				+	preempt_enable();
			
 
				+	goto done;
			
 
				+}
			
 
				+
			
 
				 static int kvmppc_vcpu_run_hv(struct kvm_run *run, struct kvm_vcpu *vcpu)
			
 
				 {
			
 
				 	int r;
			
@@ -3488,7 +4165,11 @@ static int kvmppc_vcpu_run_hv(struct kvm_run *run, struct kvm_vcpu *vcpu)
 
				 	vcpu->arch.state = KVMPPC_VCPU_BUSY_IN_HOST;
			
 
				 
			
 
				 	do {
			
 
				-		r = kvmppc_run_vcpu(run, vcpu);
			
 
				+		if (kvm->arch.threads_indep && kvm_is_radix(kvm))
			
 
				+			r = kvmhv_run_single_vcpu(run, vcpu, ~(u64)0,
			
 
				+						  vcpu->arch.vcore->lpcr);
			
 
				+		else
			
 
				+			r = kvmppc_run_vcpu(run, vcpu);
			
 
				 
			
 
				 		if (run->exit_reason == KVM_EXIT_PAPR_HCALL &&
			
 
				 		    !(vcpu->arch.shregs.msr & MSR_PR)) {
			
@@ -3731,8 +4412,7 @@ void kvmppc_setup_partition_table(struct kvm *kvm)
 
				 			__pa(kvm->arch.pgtable) | RADIX_PGD_INDEX_SIZE;
			
 
				 		dw1 = PATB_GR | kvm->arch.process_table;
			
 
				 	}
			
 
				-
			
 
				-	mmu_partition_table_set_entry(kvm->arch.lpid, dw0, dw1);
			
 
				+	kvmhv_set_ptbl_entry(kvm->arch.lpid, dw0, dw1);
			
 
				 }
			
 
				 
			
 
				 /*
			
@@ -3828,6 +4508,10 @@ static int kvmppc_hv_setup_htab_rma(struct kvm_vcpu *vcpu)
 
				 /* Must be called with kvm->lock held and mmu_ready = 0 and no vcpus running */
			
 
				 int kvmppc_switch_mmu_to_hpt(struct kvm *kvm)
			
 
				 {
			
 
				+	if (kvm->arch.nested_enable) {
			
 
				+		kvm->arch.nested_enable = false;
			
 
				+		kvmhv_release_all_nested(kvm);
			
 
				+	}
			
 
				 	kvmppc_free_radix(kvm);
			
 
				 	kvmppc_update_lpcr(kvm, LPCR_VPM1,
			
 
				 			   LPCR_VPM1 | LPCR_UPRT | LPCR_GTSE | LPCR_HR);
			
@@ -3849,6 +4533,7 @@ int kvmppc_switch_mmu_to_radix(struct kvm *kvm)
 
				 	kvmppc_free_hpt(&kvm->arch.hpt);
			
 
				 	kvmppc_update_lpcr(kvm, LPCR_UPRT | LPCR_GTSE | LPCR_HR,
			
 
				 			   LPCR_VPM1 | LPCR_UPRT | LPCR_GTSE | LPCR_HR);
			
 
				+	kvmppc_rmap_reset(kvm);
			
 
				 	kvm->arch.radix = 1;
			
 
				 	return 0;
			
 
				 }
			
@@ -3948,6 +4633,8 @@ static int kvmppc_core_init_vm_hv(struct kvm *kvm)
 
				 
			
 
				 	kvmppc_alloc_host_rm_ops();
			
 
				 
			
 
				+	kvmhv_vm_nested_init(kvm);
			
 
				+
			
 
				 	/*
			
 
				 	 * Since we don't flush the TLB when tearing down a VM,
			
 
				 	 * and this lpid might have previously been used,
			
@@ -3966,9 +4653,13 @@ static int kvmppc_core_init_vm_hv(struct kvm *kvm)
 
				 		kvm->arch.host_sdr1 = mfspr(SPRN_SDR1);
			
 
				 
			
 
				 	/* Init LPCR for virtual RMA mode */
			
 
				-	kvm->arch.host_lpid = mfspr(SPRN_LPID);
			
 
				-	kvm->arch.host_lpcr = lpcr = mfspr(SPRN_LPCR);
			
 
				-	lpcr &= LPCR_PECE | LPCR_LPES;
			
 
				+	if (cpu_has_feature(CPU_FTR_HVMODE)) {
			
 
				+		kvm->arch.host_lpid = mfspr(SPRN_LPID);
			
 
				+		kvm->arch.host_lpcr = lpcr = mfspr(SPRN_LPCR);
			
 
				+		lpcr &= LPCR_PECE | LPCR_LPES;
			
 
				+	} else {
			
 
				+		lpcr = 0;
			
 
				+	}
			
 
				 	lpcr |= (4UL << LPCR_DPFD_SH) | LPCR_HDICE |
			
 
				 		LPCR_VPM0 | LPCR_VPM1;
			
 
				 	kvm->arch.vrma_slb_v = SLB_VSID_B_1T |
			
@@ -4035,8 +4726,14 @@ static int kvmppc_core_init_vm_hv(struct kvm *kvm)
 
				 	 * On POWER9, we only need to do this if the "indep_threads_mode"
			
 
				 	 * module parameter has been set to N.
			
 
				 	 */
			
 
				-	if (cpu_has_feature(CPU_FTR_ARCH_300))
			
 
				-		kvm->arch.threads_indep = indep_threads_mode;
			
 
				+	if (cpu_has_feature(CPU_FTR_ARCH_300)) {
			
 
				+		if (!indep_threads_mode && !cpu_has_feature(CPU_FTR_HVMODE)) {
			
 
				+			pr_warn("KVM: Ignoring indep_threads_mode=N in nested hypervisor\n");
			
 
				+			kvm->arch.threads_indep = true;
			
 
				+		} else {
			
 
				+			kvm->arch.threads_indep = indep_threads_mode;
			
 
				+		}
			
 
				+	}
			
 
				 	if (!kvm->arch.threads_indep)
			
 
				 		kvm_hv_vm_activated();
			
 
				 
			
@@ -4059,6 +4756,8 @@ static int kvmppc_core_init_vm_hv(struct kvm *kvm)
 
				 	snprintf(buf, sizeof(buf), "vm%d", current->pid);
			
 
				 	kvm->arch.debugfs_dir = debugfs_create_dir(buf, kvm_debugfs_dir);
			
 
				 	kvmppc_mmu_debugfs_init(kvm);
			
 
				+	if (radix_enabled())
			
 
				+		kvmhv_radix_debugfs_init(kvm);
			
 
				 
			
 
				 	return 0;
			
 
				 }
			
@@ -4081,13 +4780,21 @@ static void kvmppc_core_destroy_vm_hv(struct kvm *kvm)
 
				 
			
 
				 	kvmppc_free_vcores(kvm);
			
 
				 
			
 
				-	kvmppc_free_lpid(kvm->arch.lpid);
			
 
				 
			
 
				 	if (kvm_is_radix(kvm))
			
 
				 		kvmppc_free_radix(kvm);
			
 
				 	else
			
 
				 		kvmppc_free_hpt(&kvm->arch.hpt);
			
 
				 
			
 
				+	/* Perform global invalidation and return lpid to the pool */
			
 
				+	if (cpu_has_feature(CPU_FTR_ARCH_300)) {
			
 
				+		if (kvm->arch.nested_enable)
			
 
				+			kvmhv_release_all_nested(kvm);
			
 
				+		kvm->arch.process_table = 0;
			
 
				+		kvmhv_set_ptbl_entry(kvm->arch.lpid, 0, 0);
			
 
				+	}
			
 
				+	kvmppc_free_lpid(kvm->arch.lpid);
			
 
				+
			
 
				 	kvmppc_free_pimap(kvm);
			
 
				 }
			
 
				 
			
@@ -4112,11 +4819,15 @@ static int kvmppc_core_emulate_mfspr_hv(struct kvm_vcpu *vcpu, int sprn,
 
				 
			
 
				 static int kvmppc_core_check_processor_compat_hv(void)
			
 
				 {
			
 
				-	if (!cpu_has_feature(CPU_FTR_HVMODE) ||
			
 
				-	    !cpu_has_feature(CPU_FTR_ARCH_206))
			
 
				-		return -EIO;
			
 
				+	if (cpu_has_feature(CPU_FTR_HVMODE) &&
			
 
				+	    cpu_has_feature(CPU_FTR_ARCH_206))
			
 
				+		return 0;
			
 
				 
			
 
				-	return 0;
			
 
				+	/* POWER9 in radix mode is capable of being a nested hypervisor. */
			
 
				+	if (cpu_has_feature(CPU_FTR_ARCH_300) && radix_enabled())
			
 
				+		return 0;
			
 
				+
			
 
				+	return -EIO;
			
 
				 }
			
 
				 
			
 
				 #ifdef CONFIG_KVM_XICS
			
@@ -4434,6 +5145,10 @@ static int kvmhv_configure_mmu(struct kvm *kvm, struct kvm_ppc_mmuv3_cfg *cfg)
 
				 	if (radix && !radix_enabled())
			
 
				 		return -EINVAL;
			
 
				 
			
 
				+	/* If we're a nested hypervisor, we currently only support radix */
			
 
				+	if (kvmhv_on_pseries() && !radix)
			
 
				+		return -EINVAL;
			
 
				+
			
 
				 	mutex_lock(&kvm->lock);
			
 
				 	if (radix != kvm_is_radix(kvm)) {
			
 
				 		if (kvm->arch.mmu_ready) {
			
@@ -4555,6 +5270,10 @@ static int kvmppc_book3s_init_hv(void)
 
				 	if (r < 0)
			
 
				 		return -ENODEV;
			
 
				 
			
 
				+	r = kvmhv_nested_init();
			
 
				+	if (r)
			
 
				+		return r;
			
 
				+
			
 
				 	r = kvm_init_subcore_bitmap();
			
 
				 	if (r)
			
 
				 		return r;
			
@@ -4565,7 +5284,8 @@ static int kvmppc_book3s_init_hv(void)
 
				 	 * indirectly, via OPAL.
			
 
				 	 */
			
 
				 #ifdef CONFIG_SMP
			
 
				-	if (!xive_enabled() && !local_paca->kvm_hstate.xics_phys) {
			
 
				+	if (!xive_enabled() && !kvmhv_on_pseries() &&
			
 
				+	    !local_paca->kvm_hstate.xics_phys) {
			
 
				 		struct device_node *np;
			
 
				 
			
 
				 		np = of_find_compatible_node(NULL, NULL, "ibm,opal-intc");
			
@@ -4613,6 +5333,7 @@ static void kvmppc_book3s_exit_hv(void)
 
				 	if (kvmppc_radix_possible())
			
 
				 		kvmppc_radix_exit();
			
 
				 	kvmppc_hv_ops = NULL;
			
 
				+	kvmhv_nested_exit();
			
 
				 }
			
 
				 
			
 
				 module_init(kvmppc_book3s_init_hv);
			
--- a/arch/powerpc/kvm/book3s_hv_builtin.c
+++ b/arch/powerpc/kvm/book3s_hv_builtin.c
@@ -231,6 +231,15 @@ void kvmhv_rm_send_ipi(int cpu)
 
				 	void __iomem *xics_phys;
			
 
				 	unsigned long msg = PPC_DBELL_TYPE(PPC_DBELL_SERVER);
			
 
				 
			
 
				+	/* For a nested hypervisor, use the XICS via hcall */
			
 
				+	if (kvmhv_on_pseries()) {
			
 
				+		unsigned long retbuf[PLPAR_HCALL_BUFSIZE];
			
 
				+
			
 
				+		plpar_hcall_raw(H_IPI, retbuf, get_hard_smp_processor_id(cpu),
			
 
				+				IPI_PRIORITY);
			
 
				+		return;
			
 
				+	}
			
 
				+
			
 
				 	/* On POWER9 we can use msgsnd for any destination cpu. */
			
 
				 	if (cpu_has_feature(CPU_FTR_ARCH_300)) {
			
 
				 		msg |= get_hard_smp_processor_id(cpu);
			
@@ -460,12 +469,19 @@ static long kvmppc_read_one_intr(bool *again)
 
				 		return 1;
			
 
				 
			
 
				 	/* Now read the interrupt from the ICP */
			
 
				-	xics_phys = local_paca->kvm_hstate.xics_phys;
			
 
				-	rc = 0;
			
 
				-	if (!xics_phys)
			
 
				-		rc = opal_int_get_xirr(&xirr, false);
			
 
				-	else
			
 
				-		xirr = __raw_rm_readl(xics_phys + XICS_XIRR);
			
 
				+	if (kvmhv_on_pseries()) {
			
 
				+		unsigned long retbuf[PLPAR_HCALL_BUFSIZE];
			
 
				+
			
 
				+		rc = plpar_hcall_raw(H_XIRR, retbuf, 0xFF);
			
 
				+		xirr = cpu_to_be32(retbuf[0]);
			
 
				+	} else {
			
 
				+		xics_phys = local_paca->kvm_hstate.xics_phys;
			
 
				+		rc = 0;
			
 
				+		if (!xics_phys)
			
 
				+			rc = opal_int_get_xirr(&xirr, false);
			
 
				+		else
			
 
				+			xirr = __raw_rm_readl(xics_phys + XICS_XIRR);
			
 
				+	}
			
 
				 	if (rc < 0)
			
 
				 		return 1;
			
 
				 
			
@@ -494,7 +510,13 @@ static long kvmppc_read_one_intr(bool *again)
 
				 	 */
			
 
				 	if (xisr == XICS_IPI) {
			
 
				 		rc = 0;
			
 
				-		if (xics_phys) {
			
 
				+		if (kvmhv_on_pseries()) {
			
 
				+			unsigned long retbuf[PLPAR_HCALL_BUFSIZE];
			
 
				+
			
 
				+			plpar_hcall_raw(H_IPI, retbuf,
			
 
				+					hard_smp_processor_id(), 0xff);
			
 
				+			plpar_hcall_raw(H_EOI, retbuf, h_xirr);
			
 
				+		} else if (xics_phys) {
			
 
				 			__raw_rm_writeb(0xff, xics_phys + XICS_MFRR);
			
 
				 			__raw_rm_writel(xirr, xics_phys + XICS_XIRR);
			
 
				 		} else {
			
@@ -520,7 +542,13 @@ static long kvmppc_read_one_intr(bool *again)
 
				 			/* We raced with the host,
			
 
				 			 * we need to resend that IPI, bummer
			
 
				 			 */
			
 
				-			if (xics_phys)
			
 
				+			if (kvmhv_on_pseries()) {
			
 
				+				unsigned long retbuf[PLPAR_HCALL_BUFSIZE];
			
 
				+
			
 
				+				plpar_hcall_raw(H_IPI, retbuf,
			
 
				+						hard_smp_processor_id(),
			
 
				+						IPI_PRIORITY);
			
 
				+			} else if (xics_phys)
			
 
				 				__raw_rm_writeb(IPI_PRIORITY,
			
 
				 						xics_phys + XICS_MFRR);
			
 
				 			else
			
@@ -729,3 +757,51 @@ void kvmhv_p9_restore_lpcr(struct kvm_split_mode *sip)
 
				 	smp_mb();
			
 
				 	local_paca->kvm_hstate.kvm_split_mode = NULL;
			
 
				 }
			
 
				+
			
 
				+/*
			
 
				+ * Is there a PRIV_DOORBELL pending for the guest (on POWER9)?
			
 
				+ * Can we inject a Decrementer or an External interrupt?
			
 
				+ */
			
 
				+void kvmppc_guest_entry_inject_int(struct kvm_vcpu *vcpu)
			
 
				+{
			
 
				+	int ext;
			
 
				+	unsigned long vec = 0;
			
 
				+	unsigned long lpcr;
			
 
				+
			
 
				+	/* Insert EXTERNAL bit into LPCR at the MER bit position */
			
 
				+	ext = (vcpu->arch.pending_exceptions >> BOOK3S_IRQPRIO_EXTERNAL) & 1;
			
 
				+	lpcr = mfspr(SPRN_LPCR);
			
 
				+	lpcr |= ext << LPCR_MER_SH;
			
 
				+	mtspr(SPRN_LPCR, lpcr);
			
 
				+	isync();
			
 
				+
			
 
				+	if (vcpu->arch.shregs.msr & MSR_EE) {
			
 
				+		if (ext) {
			
 
				+			vec = BOOK3S_INTERRUPT_EXTERNAL;
			
 
				+		} else {
			
 
				+			long int dec = mfspr(SPRN_DEC);
			
 
				+			if (!(lpcr & LPCR_LD))
			
 
				+				dec = (int) dec;
			
 
				+			if (dec < 0)
			
 
				+				vec = BOOK3S_INTERRUPT_DECREMENTER;
			
 
				+		}
			
 
				+	}
			
 
				+	if (vec) {
			
 
				+		unsigned long msr, old_msr = vcpu->arch.shregs.msr;
			
 
				+
			
 
				+		kvmppc_set_srr0(vcpu, kvmppc_get_pc(vcpu));
			
 
				+		kvmppc_set_srr1(vcpu, old_msr);
			
 
				+		kvmppc_set_pc(vcpu, vec);
			
 
				+		msr = vcpu->arch.intr_msr;
			
 
				+		if (MSR_TM_ACTIVE(old_msr))
			
 
				+			msr |= MSR_TS_S;
			
 
				+		vcpu->arch.shregs.msr = msr;
			
 
				+	}
			
 
				+
			
 
				+	if (vcpu->arch.doorbell_request) {
			
 
				+		mtspr(SPRN_DPDES, 1);
			
 
				+		vcpu->arch.vcore->dpdes = 1;
			
 
				+		smp_wmb();
			
 
				+		vcpu->arch.doorbell_request = 0;
			
 
				+	}
			
 
				+}
			
--- a/arch/powerpc/kvm/book3s_hv_interrupts.S
+++ b/arch/powerpc/kvm/book3s_hv_interrupts.S
@@ -64,52 +64,7 @@ BEGIN_FTR_SECTION
 
				 END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_207S)
			
 
				 
			
 
				 	/* Save host PMU registers */
			
 
				-BEGIN_FTR_SECTION
			
 
				-	/* Work around P8 PMAE bug */
			
 
				-	li	r3, -1
			
 
				-	clrrdi	r3, r3, 10
			
 
				-	mfspr	r8, SPRN_MMCR2
			
 
				-	mtspr	SPRN_MMCR2, r3		/* freeze all counters using MMCR2 */
			
 
				-	isync
			
 
				-END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
			
 
				-	li	r3, 1
			
 
				-	sldi	r3, r3, 31		/* MMCR0_FC (freeze counters) bit */
			
 
				-	mfspr	r7, SPRN_MMCR0		/* save MMCR0 */
			
 
				-	mtspr	SPRN_MMCR0, r3		/* freeze all counters, disable interrupts */
			
 
				-	mfspr	r6, SPRN_MMCRA
			
 
				-	/* Clear MMCRA in order to disable SDAR updates */
			
 
				-	li	r5, 0
			
 
				-	mtspr	SPRN_MMCRA, r5
			
 
				-	isync
			
 
				-	lbz	r5, PACA_PMCINUSE(r13)	/* is the host using the PMU? */
			
 
				-	cmpwi	r5, 0
			
 
				-	beq	31f			/* skip if not */
			
 
				-	mfspr	r5, SPRN_MMCR1
			
 
				-	mfspr	r9, SPRN_SIAR
			
 
				-	mfspr	r10, SPRN_SDAR
			
 
				-	std	r7, HSTATE_MMCR0(r13)
			
 
				-	std	r5, HSTATE_MMCR1(r13)
			
 
				-	std	r6, HSTATE_MMCRA(r13)
			
 
				-	std	r9, HSTATE_SIAR(r13)
			
 
				-	std	r10, HSTATE_SDAR(r13)
			
 
				-BEGIN_FTR_SECTION
			
 
				-	mfspr	r9, SPRN_SIER
			
 
				-	std	r8, HSTATE_MMCR2(r13)
			
 
				-	std	r9, HSTATE_SIER(r13)
			
 
				-END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
			
 
				-	mfspr	r3, SPRN_PMC1
			
 
				-	mfspr	r5, SPRN_PMC2
			
 
				-	mfspr	r6, SPRN_PMC3
			
 
				-	mfspr	r7, SPRN_PMC4
			
 
				-	mfspr	r8, SPRN_PMC5
			
 
				-	mfspr	r9, SPRN_PMC6
			
 
				-	stw	r3, HSTATE_PMC1(r13)
			
 
				-	stw	r5, HSTATE_PMC2(r13)
			
 
				-	stw	r6, HSTATE_PMC3(r13)
			
 
				-	stw	r7, HSTATE_PMC4(r13)
			
 
				-	stw	r8, HSTATE_PMC5(r13)
			
 
				-	stw	r9, HSTATE_PMC6(r13)
			
 
				-31:
			
 
				+	bl	kvmhv_save_host_pmu
			
 
				 
			
 
				 	/*
			
 
				 	 * Put whatever is in the decrementer into the
			
@@ -161,3 +116,51 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_300)
 
				 	ld	r0, PPC_LR_STKOFF(r1)
			
 
				 	mtlr	r0
			
 
				 	blr
			
 
				+
			
 
				+_GLOBAL(kvmhv_save_host_pmu)
			
 
				+BEGIN_FTR_SECTION
			
 
				+	/* Work around P8 PMAE bug */
			
 
				+	li	r3, -1
			
 
				+	clrrdi	r3, r3, 10
			
 
				+	mfspr	r8, SPRN_MMCR2
			
 
				+	mtspr	SPRN_MMCR2, r3		/* freeze all counters using MMCR2 */
			
 
				+	isync
			
 
				+END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
			
 
				+	li	r3, 1
			
 
				+	sldi	r3, r3, 31		/* MMCR0_FC (freeze counters) bit */
			
 
				+	mfspr	r7, SPRN_MMCR0		/* save MMCR0 */
			
 
				+	mtspr	SPRN_MMCR0, r3		/* freeze all counters, disable interrupts */
			
 
				+	mfspr	r6, SPRN_MMCRA
			
 
				+	/* Clear MMCRA in order to disable SDAR updates */
			
 
				+	li	r5, 0
			
 
				+	mtspr	SPRN_MMCRA, r5
			
 
				+	isync
			
 
				+	lbz	r5, PACA_PMCINUSE(r13)	/* is the host using the PMU? */
			
 
				+	cmpwi	r5, 0
			
 
				+	beq	31f			/* skip if not */
			
 
				+	mfspr	r5, SPRN_MMCR1
			
 
				+	mfspr	r9, SPRN_SIAR
			
 
				+	mfspr	r10, SPRN_SDAR
			
 
				+	std	r7, HSTATE_MMCR0(r13)
			
 
				+	std	r5, HSTATE_MMCR1(r13)
			
 
				+	std	r6, HSTATE_MMCRA(r13)
			
 
				+	std	r9, HSTATE_SIAR(r13)
			
 
				+	std	r10, HSTATE_SDAR(r13)
			
 
				+BEGIN_FTR_SECTION
			
 
				+	mfspr	r9, SPRN_SIER
			
 
				+	std	r8, HSTATE_MMCR2(r13)
			
 
				+	std	r9, HSTATE_SIER(r13)
			
 
				+END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
			
 
				+	mfspr	r3, SPRN_PMC1
			
 
				+	mfspr	r5, SPRN_PMC2
			
 
				+	mfspr	r6, SPRN_PMC3
			
 
				+	mfspr	r7, SPRN_PMC4
			
 
				+	mfspr	r8, SPRN_PMC5
			
 
				+	mfspr	r9, SPRN_PMC6
			
 
				+	stw	r3, HSTATE_PMC1(r13)
			
 
				+	stw	r5, HSTATE_PMC2(r13)
			
 
				+	stw	r6, HSTATE_PMC3(r13)
			
 
				+	stw	r7, HSTATE_PMC4(r13)
			
 
				+	stw	r8, HSTATE_PMC5(r13)
			
 
				+	stw	r9, HSTATE_PMC6(r13)
			
 
				+31:	blr
			
--- a/arch/powerpc/kvm/book3s_hv_nested.c
+++ b/arch/powerpc/kvm/book3s_hv_nested.c
@@ -0,0 +1,1291 @@
 
				+// SPDX-License-Identifier: GPL-2.0
			
 
				+/*
			
 
				+ * Copyright IBM Corporation, 2018
			
 
				+ * Authors Suraj Jitindar Singh <sjitindarsingh@gmail.com>
			
 
				+ *	   Paul Mackerras <paulus@ozlabs.org>
			
 
				+ *
			
 
				+ * Description: KVM functions specific to running nested KVM-HV guests
			
 
				+ * on Book3S processors (specifically POWER9 and later).
			
 
				+ */
			
 
				+
			
 
				+#include <linux/kernel.h>
			
 
				+#include <linux/kvm_host.h>
			
 
				+#include <linux/llist.h>
			
 
				+
			
 
				+#include <asm/kvm_ppc.h>
			
 
				+#include <asm/kvm_book3s.h>
			
 
				+#include <asm/mmu.h>
			
 
				+#include <asm/pgtable.h>
			
 
				+#include <asm/pgalloc.h>
			
 
				+#include <asm/pte-walk.h>
			
 
				+#include <asm/reg.h>
			
 
				+
			
 
				+static struct patb_entry *pseries_partition_tb;
			
 
				+
			
 
				+static void kvmhv_update_ptbl_cache(struct kvm_nested_guest *gp);
			
 
				+static void kvmhv_free_memslot_nest_rmap(struct kvm_memory_slot *free);
			
 
				+
			
 
				+void kvmhv_save_hv_regs(struct kvm_vcpu *vcpu, struct hv_guest_state *hr)
			
 
				+{
			
 
				+	struct kvmppc_vcore *vc = vcpu->arch.vcore;
			
 
				+
			
 
				+	hr->pcr = vc->pcr;
			
 
				+	hr->dpdes = vc->dpdes;
			
 
				+	hr->hfscr = vcpu->arch.hfscr;
			
 
				+	hr->tb_offset = vc->tb_offset;
			
 
				+	hr->dawr0 = vcpu->arch.dawr;
			
 
				+	hr->dawrx0 = vcpu->arch.dawrx;
			
 
				+	hr->ciabr = vcpu->arch.ciabr;
			
 
				+	hr->purr = vcpu->arch.purr;
			
 
				+	hr->spurr = vcpu->arch.spurr;
			
 
				+	hr->ic = vcpu->arch.ic;
			
 
				+	hr->vtb = vc->vtb;
			
 
				+	hr->srr0 = vcpu->arch.shregs.srr0;
			
 
				+	hr->srr1 = vcpu->arch.shregs.srr1;
			
 
				+	hr->sprg[0] = vcpu->arch.shregs.sprg0;
			
 
				+	hr->sprg[1] = vcpu->arch.shregs.sprg1;
			
 
				+	hr->sprg[2] = vcpu->arch.shregs.sprg2;
			
 
				+	hr->sprg[3] = vcpu->arch.shregs.sprg3;
			
 
				+	hr->pidr = vcpu->arch.pid;
			
 
				+	hr->cfar = vcpu->arch.cfar;
			
 
				+	hr->ppr = vcpu->arch.ppr;
			
 
				+}
			
 
				+
			
 
				+static void byteswap_pt_regs(struct pt_regs *regs)
			
 
				+{
			
 
				+	unsigned long *addr = (unsigned long *) regs;
			
 
				+
			
 
				+	for (; addr < ((unsigned long *) (regs + 1)); addr++)
			
 
				+		*addr = swab64(*addr);
			
 
				+}
			
 
				+
			
 
				+static void byteswap_hv_regs(struct hv_guest_state *hr)
			
 
				+{
			
 
				+	hr->version = swab64(hr->version);
			
 
				+	hr->lpid = swab32(hr->lpid);
			
 
				+	hr->vcpu_token = swab32(hr->vcpu_token);
			
 
				+	hr->lpcr = swab64(hr->lpcr);
			
 
				+	hr->pcr = swab64(hr->pcr);
			
 
				+	hr->amor = swab64(hr->amor);
			
 
				+	hr->dpdes = swab64(hr->dpdes);
			
 
				+	hr->hfscr = swab64(hr->hfscr);
			
 
				+	hr->tb_offset = swab64(hr->tb_offset);
			
 
				+	hr->dawr0 = swab64(hr->dawr0);
			
 
				+	hr->dawrx0 = swab64(hr->dawrx0);
			
 
				+	hr->ciabr = swab64(hr->ciabr);
			
 
				+	hr->hdec_expiry = swab64(hr->hdec_expiry);
			
 
				+	hr->purr = swab64(hr->purr);
			
 
				+	hr->spurr = swab64(hr->spurr);
			
 
				+	hr->ic = swab64(hr->ic);
			
 
				+	hr->vtb = swab64(hr->vtb);
			
 
				+	hr->hdar = swab64(hr->hdar);
			
 
				+	hr->hdsisr = swab64(hr->hdsisr);
			
 
				+	hr->heir = swab64(hr->heir);
			
 
				+	hr->asdr = swab64(hr->asdr);
			
 
				+	hr->srr0 = swab64(hr->srr0);
			
 
				+	hr->srr1 = swab64(hr->srr1);
			
 
				+	hr->sprg[0] = swab64(hr->sprg[0]);
			
 
				+	hr->sprg[1] = swab64(hr->sprg[1]);
			
 
				+	hr->sprg[2] = swab64(hr->sprg[2]);
			
 
				+	hr->sprg[3] = swab64(hr->sprg[3]);
			
 
				+	hr->pidr = swab64(hr->pidr);
			
 
				+	hr->cfar = swab64(hr->cfar);
			
 
				+	hr->ppr = swab64(hr->ppr);
			
 
				+}
			
 
				+
			
 
				+static void save_hv_return_state(struct kvm_vcpu *vcpu, int trap,
			
 
				+				 struct hv_guest_state *hr)
			
 
				+{
			
 
				+	struct kvmppc_vcore *vc = vcpu->arch.vcore;
			
 
				+
			
 
				+	hr->dpdes = vc->dpdes;
			
 
				+	hr->hfscr = vcpu->arch.hfscr;
			
 
				+	hr->purr = vcpu->arch.purr;
			
 
				+	hr->spurr = vcpu->arch.spurr;
			
 
				+	hr->ic = vcpu->arch.ic;
			
 
				+	hr->vtb = vc->vtb;
			
 
				+	hr->srr0 = vcpu->arch.shregs.srr0;
			
 
				+	hr->srr1 = vcpu->arch.shregs.srr1;
			
 
				+	hr->sprg[0] = vcpu->arch.shregs.sprg0;
			
 
				+	hr->sprg[1] = vcpu->arch.shregs.sprg1;
			
 
				+	hr->sprg[2] = vcpu->arch.shregs.sprg2;
			
 
				+	hr->sprg[3] = vcpu->arch.shregs.sprg3;
			
 
				+	hr->pidr = vcpu->arch.pid;
			
 
				+	hr->cfar = vcpu->arch.cfar;
			
 
				+	hr->ppr = vcpu->arch.ppr;
			
 
				+	switch (trap) {
			
 
				+	case BOOK3S_INTERRUPT_H_DATA_STORAGE:
			
 
				+		hr->hdar = vcpu->arch.fault_dar;
			
 
				+		hr->hdsisr = vcpu->arch.fault_dsisr;
			
 
				+		hr->asdr = vcpu->arch.fault_gpa;
			
 
				+		break;
			
 
				+	case BOOK3S_INTERRUPT_H_INST_STORAGE:
			
 
				+		hr->asdr = vcpu->arch.fault_gpa;
			
 
				+		break;
			
 
				+	case BOOK3S_INTERRUPT_H_EMUL_ASSIST:
			
 
				+		hr->heir = vcpu->arch.emul_inst;
			
 
				+		break;
			
 
				+	}
			
 
				+}
			
 
				+
			
 
				+static void sanitise_hv_regs(struct kvm_vcpu *vcpu, struct hv_guest_state *hr)
			
 
				+{
			
 
				+	/*
			
 
				+	 * Don't let L1 enable features for L2 which we've disabled for L1,
			
 
				+	 * but preserve the interrupt cause field.
			
 
				+	 */
			
 
				+	hr->hfscr &= (HFSCR_INTR_CAUSE | vcpu->arch.hfscr);
			
 
				+
			
 
				+	/* Don't let data address watchpoint match in hypervisor state */
			
 
				+	hr->dawrx0 &= ~DAWRX_HYP;
			
 
				+
			
 
				+	/* Don't let completed instruction address breakpt match in HV state */
			
 
				+	if ((hr->ciabr & CIABR_PRIV) == CIABR_PRIV_HYPER)
			
 
				+		hr->ciabr &= ~CIABR_PRIV;
			
 
				+}
			
 
				+
			
 
				+static void restore_hv_regs(struct kvm_vcpu *vcpu, struct hv_guest_state *hr)
			
 
				+{
			
 
				+	struct kvmppc_vcore *vc = vcpu->arch.vcore;
			
 
				+
			
 
				+	vc->pcr = hr->pcr;
			
 
				+	vc->dpdes = hr->dpdes;
			
 
				+	vcpu->arch.hfscr = hr->hfscr;
			
 
				+	vcpu->arch.dawr = hr->dawr0;
			
 
				+	vcpu->arch.dawrx = hr->dawrx0;
			
 
				+	vcpu->arch.ciabr = hr->ciabr;
			
 
				+	vcpu->arch.purr = hr->purr;
			
 
				+	vcpu->arch.spurr = hr->spurr;
			
 
				+	vcpu->arch.ic = hr->ic;
			
 
				+	vc->vtb = hr->vtb;
			
 
				+	vcpu->arch.shregs.srr0 = hr->srr0;
			
 
				+	vcpu->arch.shregs.srr1 = hr->srr1;
			
 
				+	vcpu->arch.shregs.sprg0 = hr->sprg[0];
			
 
				+	vcpu->arch.shregs.sprg1 = hr->sprg[1];
			
 
				+	vcpu->arch.shregs.sprg2 = hr->sprg[2];
			
 
				+	vcpu->arch.shregs.sprg3 = hr->sprg[3];
			
 
				+	vcpu->arch.pid = hr->pidr;
			
 
				+	vcpu->arch.cfar = hr->cfar;
			
 
				+	vcpu->arch.ppr = hr->ppr;
			
 
				+}
			
 
				+
			
 
				+void kvmhv_restore_hv_return_state(struct kvm_vcpu *vcpu,
			
 
				+				   struct hv_guest_state *hr)
			
 
				+{
			
 
				+	struct kvmppc_vcore *vc = vcpu->arch.vcore;
			
 
				+
			
 
				+	vc->dpdes = hr->dpdes;
			
 
				+	vcpu->arch.hfscr = hr->hfscr;
			
 
				+	vcpu->arch.purr = hr->purr;
			
 
				+	vcpu->arch.spurr = hr->spurr;
			
 
				+	vcpu->arch.ic = hr->ic;
			
 
				+	vc->vtb = hr->vtb;
			
 
				+	vcpu->arch.fault_dar = hr->hdar;
			
 
				+	vcpu->arch.fault_dsisr = hr->hdsisr;
			
 
				+	vcpu->arch.fault_gpa = hr->asdr;
			
 
				+	vcpu->arch.emul_inst = hr->heir;
			
 
				+	vcpu->arch.shregs.srr0 = hr->srr0;
			
 
				+	vcpu->arch.shregs.srr1 = hr->srr1;
			
 
				+	vcpu->arch.shregs.sprg0 = hr->sprg[0];
			
 
				+	vcpu->arch.shregs.sprg1 = hr->sprg[1];
			
 
				+	vcpu->arch.shregs.sprg2 = hr->sprg[2];
			
 
				+	vcpu->arch.shregs.sprg3 = hr->sprg[3];
			
 
				+	vcpu->arch.pid = hr->pidr;
			
 
				+	vcpu->arch.cfar = hr->cfar;
			
 
				+	vcpu->arch.ppr = hr->ppr;
			
 
				+}
			
 
				+
			
 
				+long kvmhv_enter_nested_guest(struct kvm_vcpu *vcpu)
			
 
				+{
			
 
				+	long int err, r;
			
 
				+	struct kvm_nested_guest *l2;
			
 
				+	struct pt_regs l2_regs, saved_l1_regs;
			
 
				+	struct hv_guest_state l2_hv, saved_l1_hv;
			
 
				+	struct kvmppc_vcore *vc = vcpu->arch.vcore;
			
 
				+	u64 hv_ptr, regs_ptr;
			
 
				+	u64 hdec_exp;
			
 
				+	s64 delta_purr, delta_spurr, delta_ic, delta_vtb;
			
 
				+	u64 mask;
			
 
				+	unsigned long lpcr;
			
 
				+
			
 
				+	if (vcpu->kvm->arch.l1_ptcr == 0)
			
 
				+		return H_NOT_AVAILABLE;
			
 
				+
			
 
				+	/* copy parameters in */
			
 
				+	hv_ptr = kvmppc_get_gpr(vcpu, 4);
			
 
				+	err = kvm_vcpu_read_guest(vcpu, hv_ptr, &l2_hv,
			
 
				+				  sizeof(struct hv_guest_state));
			
 
				+	if (err)
			
 
				+		return H_PARAMETER;
			
 
				+	if (kvmppc_need_byteswap(vcpu))
			
 
				+		byteswap_hv_regs(&l2_hv);
			
 
				+	if (l2_hv.version != HV_GUEST_STATE_VERSION)
			
 
				+		return H_P2;
			
 
				+
			
 
				+	regs_ptr = kvmppc_get_gpr(vcpu, 5);
			
 
				+	err = kvm_vcpu_read_guest(vcpu, regs_ptr, &l2_regs,
			
 
				+				  sizeof(struct pt_regs));
			
 
				+	if (err)
			
 
				+		return H_PARAMETER;
			
 
				+	if (kvmppc_need_byteswap(vcpu))
			
 
				+		byteswap_pt_regs(&l2_regs);
			
 
				+	if (l2_hv.vcpu_token >= NR_CPUS)
			
 
				+		return H_PARAMETER;
			
 
				+
			
 
				+	/* translate lpid */
			
 
				+	l2 = kvmhv_get_nested(vcpu->kvm, l2_hv.lpid, true);
			
 
				+	if (!l2)
			
 
				+		return H_PARAMETER;
			
 
				+	if (!l2->l1_gr_to_hr) {
			
 
				+		mutex_lock(&l2->tlb_lock);
			
 
				+		kvmhv_update_ptbl_cache(l2);
			
 
				+		mutex_unlock(&l2->tlb_lock);
			
 
				+	}
			
 
				+
			
 
				+	/* save l1 values of things */
			
 
				+	vcpu->arch.regs.msr = vcpu->arch.shregs.msr;
			
 
				+	saved_l1_regs = vcpu->arch.regs;
			
 
				+	kvmhv_save_hv_regs(vcpu, &saved_l1_hv);
			
 
				+
			
 
				+	/* convert TB values/offsets to host (L0) values */
			
 
				+	hdec_exp = l2_hv.hdec_expiry - vc->tb_offset;
			
 
				+	vc->tb_offset += l2_hv.tb_offset;
			
 
				+
			
 
				+	/* set L1 state to L2 state */
			
 
				+	vcpu->arch.nested = l2;
			
 
				+	vcpu->arch.nested_vcpu_id = l2_hv.vcpu_token;
			
 
				+	vcpu->arch.regs = l2_regs;
			
 
				+	vcpu->arch.shregs.msr = vcpu->arch.regs.msr;
			
 
				+	mask = LPCR_DPFD | LPCR_ILE | LPCR_TC | LPCR_AIL | LPCR_LD |
			
 
				+		LPCR_LPES | LPCR_MER;
			
 
				+	lpcr = (vc->lpcr & ~mask) | (l2_hv.lpcr & mask);
			
 
				+	sanitise_hv_regs(vcpu, &l2_hv);
			
 
				+	restore_hv_regs(vcpu, &l2_hv);
			
 
				+
			
 
				+	vcpu->arch.ret = RESUME_GUEST;
			
 
				+	vcpu->arch.trap = 0;
			
 
				+	do {
			
 
				+		if (mftb() >= hdec_exp) {
			
 
				+			vcpu->arch.trap = BOOK3S_INTERRUPT_HV_DECREMENTER;
			
 
				+			r = RESUME_HOST;
			
 
				+			break;
			
 
				+		}
			
 
				+		r = kvmhv_run_single_vcpu(vcpu->arch.kvm_run, vcpu, hdec_exp,
			
 
				+					  lpcr);
			
 
				+	} while (is_kvmppc_resume_guest(r));
			
 
				+
			
 
				+	/* save L2 state for return */
			
 
				+	l2_regs = vcpu->arch.regs;
			
 
				+	l2_regs.msr = vcpu->arch.shregs.msr;
			
 
				+	delta_purr = vcpu->arch.purr - l2_hv.purr;
			
 
				+	delta_spurr = vcpu->arch.spurr - l2_hv.spurr;
			
 
				+	delta_ic = vcpu->arch.ic - l2_hv.ic;
			
 
				+	delta_vtb = vc->vtb - l2_hv.vtb;
			
 
				+	save_hv_return_state(vcpu, vcpu->arch.trap, &l2_hv);
			
 
				+
			
 
				+	/* restore L1 state */
			
 
				+	vcpu->arch.nested = NULL;
			
 
				+	vcpu->arch.regs = saved_l1_regs;
			
 
				+	vcpu->arch.shregs.msr = saved_l1_regs.msr & ~MSR_TS_MASK;
			
 
				+	/* set L1 MSR TS field according to L2 transaction state */
			
 
				+	if (l2_regs.msr & MSR_TS_MASK)
			
 
				+		vcpu->arch.shregs.msr |= MSR_TS_S;
			
 
				+	vc->tb_offset = saved_l1_hv.tb_offset;
			
 
				+	restore_hv_regs(vcpu, &saved_l1_hv);
			
 
				+	vcpu->arch.purr += delta_purr;
			
 
				+	vcpu->arch.spurr += delta_spurr;
			
 
				+	vcpu->arch.ic += delta_ic;
			
 
				+	vc->vtb += delta_vtb;
			
 
				+
			
 
				+	kvmhv_put_nested(l2);
			
 
				+
			
 
				+	/* copy l2_hv_state and regs back to guest */
			
 
				+	if (kvmppc_need_byteswap(vcpu)) {
			
 
				+		byteswap_hv_regs(&l2_hv);
			
 
				+		byteswap_pt_regs(&l2_regs);
			
 
				+	}
			
 
				+	err = kvm_vcpu_write_guest(vcpu, hv_ptr, &l2_hv,
			
 
				+				   sizeof(struct hv_guest_state));
			
 
				+	if (err)
			
 
				+		return H_AUTHORITY;
			
 
				+	err = kvm_vcpu_write_guest(vcpu, regs_ptr, &l2_regs,
			
 
				+				   sizeof(struct pt_regs));
			
 
				+	if (err)
			
 
				+		return H_AUTHORITY;
			
 
				+
			
 
				+	if (r == -EINTR)
			
 
				+		return H_INTERRUPT;
			
 
				+
			
 
				+	return vcpu->arch.trap;
			
 
				+}
			
 
				+
			
 
				+long kvmhv_nested_init(void)
			
 
				+{
			
 
				+	long int ptb_order;
			
 
				+	unsigned long ptcr;
			
 
				+	long rc;
			
 
				+
			
 
				+	if (!kvmhv_on_pseries())
			
 
				+		return 0;
			
 
				+	if (!radix_enabled())
			
 
				+		return -ENODEV;
			
 
				+
			
 
				+	/* find log base 2 of KVMPPC_NR_LPIDS, rounding up */
			
 
				+	ptb_order = __ilog2(KVMPPC_NR_LPIDS - 1) + 1;
			
 
				+	if (ptb_order < 8)
			
 
				+		ptb_order = 8;
			
 
				+	pseries_partition_tb = kmalloc(sizeof(struct patb_entry) << ptb_order,
			
 
				+				       GFP_KERNEL);
			
 
				+	if (!pseries_partition_tb) {
			
 
				+		pr_err("kvm-hv: failed to allocated nested partition table\n");
			
 
				+		return -ENOMEM;
			
 
				+	}
			
 
				+
			
 
				+	ptcr = __pa(pseries_partition_tb) | (ptb_order - 8);
			
 
				+	rc = plpar_hcall_norets(H_SET_PARTITION_TABLE, ptcr);
			
 
				+	if (rc != H_SUCCESS) {
			
 
				+		pr_err("kvm-hv: Parent hypervisor does not support nesting (rc=%ld)\n",
			
 
				+		       rc);
			
 
				+		kfree(pseries_partition_tb);
			
 
				+		pseries_partition_tb = NULL;
			
 
				+		return -ENODEV;
			
 
				+	}
			
 
				+
			
 
				+	return 0;
			
 
				+}
			
 
				+
			
 
				+void kvmhv_nested_exit(void)
			
 
				+{
			
 
				+	/*
			
 
				+	 * N.B. the kvmhv_on_pseries() test is there because it enables
			
 
				+	 * the compiler to remove the call to plpar_hcall_norets()
			
 
				+	 * when CONFIG_PPC_PSERIES=n.
			
 
				+	 */
			
 
				+	if (kvmhv_on_pseries() && pseries_partition_tb) {
			
 
				+		plpar_hcall_norets(H_SET_PARTITION_TABLE, 0);
			
 
				+		kfree(pseries_partition_tb);
			
 
				+		pseries_partition_tb = NULL;
			
 
				+	}
			
 
				+}
			
 
				+
			
 
				+static void kvmhv_flush_lpid(unsigned int lpid)
			
 
				+{
			
 
				+	long rc;
			
 
				+
			
 
				+	if (!kvmhv_on_pseries()) {
			
 
				+		radix__flush_tlb_lpid(lpid);
			
 
				+		return;
			
 
				+	}
			
 
				+
			
 
				+	rc = plpar_hcall_norets(H_TLB_INVALIDATE, H_TLBIE_P1_ENC(2, 0, 1),
			
 
				+				lpid, TLBIEL_INVAL_SET_LPID);
			
 
				+	if (rc)
			
 
				+		pr_err("KVM: TLB LPID invalidation hcall failed, rc=%ld\n", rc);
			
 
				+}
			
 
				+
			
 
				+void kvmhv_set_ptbl_entry(unsigned int lpid, u64 dw0, u64 dw1)
			
 
				+{
			
 
				+	if (!kvmhv_on_pseries()) {
			
 
				+		mmu_partition_table_set_entry(lpid, dw0, dw1);
			
 
				+		return;
			
 
				+	}
			
 
				+
			
 
				+	pseries_partition_tb[lpid].patb0 = cpu_to_be64(dw0);
			
 
				+	pseries_partition_tb[lpid].patb1 = cpu_to_be64(dw1);
			
 
				+	/* L0 will do the necessary barriers */
			
 
				+	kvmhv_flush_lpid(lpid);
			
 
				+}
			
 
				+
			
 
				+static void kvmhv_set_nested_ptbl(struct kvm_nested_guest *gp)
			
 
				+{
			
 
				+	unsigned long dw0;
			
 
				+
			
 
				+	dw0 = PATB_HR | radix__get_tree_size() |
			
 
				+		__pa(gp->shadow_pgtable) | RADIX_PGD_INDEX_SIZE;
			
 
				+	kvmhv_set_ptbl_entry(gp->shadow_lpid, dw0, gp->process_table);
			
 
				+}
			
 
				+
			
 
				+void kvmhv_vm_nested_init(struct kvm *kvm)
			
 
				+{
			
 
				+	kvm->arch.max_nested_lpid = -1;
			
 
				+}
			
 
				+
			
 
				+/*
			
 
				+ * Handle the H_SET_PARTITION_TABLE hcall.
			
 
				+ * r4 = guest real address of partition table + log_2(size) - 12
			
 
				+ * (formatted as for the PTCR).
			
 
				+ */
			
 
				+long kvmhv_set_partition_table(struct kvm_vcpu *vcpu)
			
 
				+{
			
 
				+	struct kvm *kvm = vcpu->kvm;
			
 
				+	unsigned long ptcr = kvmppc_get_gpr(vcpu, 4);
			
 
				+	int srcu_idx;
			
 
				+	long ret = H_SUCCESS;
			
 
				+
			
 
				+	srcu_idx = srcu_read_lock(&kvm->srcu);
			
 
				+	/*
			
 
				+	 * Limit the partition table to 4096 entries (because that's what
			
 
				+	 * hardware supports), and check the base address.
			
 
				+	 */
			
 
				+	if ((ptcr & PRTS_MASK) > 12 - 8 ||
			
 
				+	    !kvm_is_visible_gfn(vcpu->kvm, (ptcr & PRTB_MASK) >> PAGE_SHIFT))
			
 
				+		ret = H_PARAMETER;
			
 
				+	srcu_read_unlock(&kvm->srcu, srcu_idx);
			
 
				+	if (ret == H_SUCCESS)
			
 
				+		kvm->arch.l1_ptcr = ptcr;
			
 
				+	return ret;
			
 
				+}
			
 
				+
			
 
				+/*
			
 
				+ * Reload the partition table entry for a guest.
			
 
				+ * Caller must hold gp->tlb_lock.
			
 
				+ */
			
 
				+static void kvmhv_update_ptbl_cache(struct kvm_nested_guest *gp)
			
 
				+{
			
 
				+	int ret;
			
 
				+	struct patb_entry ptbl_entry;
			
 
				+	unsigned long ptbl_addr;
			
 
				+	struct kvm *kvm = gp->l1_host;
			
 
				+
			
 
				+	ret = -EFAULT;
			
 
				+	ptbl_addr = (kvm->arch.l1_ptcr & PRTB_MASK) + (gp->l1_lpid << 4);
			
 
				+	if (gp->l1_lpid < (1ul << ((kvm->arch.l1_ptcr & PRTS_MASK) + 8)))
			
 
				+		ret = kvm_read_guest(kvm, ptbl_addr,
			
 
				+				     &ptbl_entry, sizeof(ptbl_entry));
			
 
				+	if (ret) {
			
 
				+		gp->l1_gr_to_hr = 0;
			
 
				+		gp->process_table = 0;
			
 
				+	} else {
			
 
				+		gp->l1_gr_to_hr = be64_to_cpu(ptbl_entry.patb0);
			
 
				+		gp->process_table = be64_to_cpu(ptbl_entry.patb1);
			
 
				+	}
			
 
				+	kvmhv_set_nested_ptbl(gp);
			
 
				+}
			
 
				+
			
 
				+struct kvm_nested_guest *kvmhv_alloc_nested(struct kvm *kvm, unsigned int lpid)
			
 
				+{
			
 
				+	struct kvm_nested_guest *gp;
			
 
				+	long shadow_lpid;
			
 
				+
			
 
				+	gp = kzalloc(sizeof(*gp), GFP_KERNEL);
			
 
				+	if (!gp)
			
 
				+		return NULL;
			
 
				+	gp->l1_host = kvm;
			
 
				+	gp->l1_lpid = lpid;
			
 
				+	mutex_init(&gp->tlb_lock);
			
 
				+	gp->shadow_pgtable = pgd_alloc(kvm->mm);
			
 
				+	if (!gp->shadow_pgtable)
			
 
				+		goto out_free;
			
 
				+	shadow_lpid = kvmppc_alloc_lpid();
			
 
				+	if (shadow_lpid < 0)
			
 
				+		goto out_free2;
			
 
				+	gp->shadow_lpid = shadow_lpid;
			
 
				+
			
 
				+	memset(gp->prev_cpu, -1, sizeof(gp->prev_cpu));
			
 
				+
			
 
				+	return gp;
			
 
				+
			
 
				+ out_free2:
			
 
				+	pgd_free(kvm->mm, gp->shadow_pgtable);
			
 
				+ out_free:
			
 
				+	kfree(gp);
			
 
				+	return NULL;
			
 
				+}
			
 
				+
			
 
				+/*
			
 
				+ * Free up any resources allocated for a nested guest.
			
 
				+ */
			
 
				+static void kvmhv_release_nested(struct kvm_nested_guest *gp)
			
 
				+{
			
 
				+	struct kvm *kvm = gp->l1_host;
			
 
				+
			
 
				+	if (gp->shadow_pgtable) {
			
 
				+		/*
			
 
				+		 * No vcpu is using this struct and no call to
			
 
				+		 * kvmhv_get_nested can find this struct,
			
 
				+		 * so we don't need to hold kvm->mmu_lock.
			
 
				+		 */
			
 
				+		kvmppc_free_pgtable_radix(kvm, gp->shadow_pgtable,
			
 
				+					  gp->shadow_lpid);
			
 
				+		pgd_free(kvm->mm, gp->shadow_pgtable);
			
 
				+	}
			
 
				+	kvmhv_set_ptbl_entry(gp->shadow_lpid, 0, 0);
			
 
				+	kvmppc_free_lpid(gp->shadow_lpid);
			
 
				+	kfree(gp);
			
 
				+}
			
 
				+
			
 
				+static void kvmhv_remove_nested(struct kvm_nested_guest *gp)
			
 
				+{
			
 
				+	struct kvm *kvm = gp->l1_host;
			
 
				+	int lpid = gp->l1_lpid;
			
 
				+	long ref;
			
 
				+
			
 
				+	spin_lock(&kvm->mmu_lock);
			
 
				+	if (gp == kvm->arch.nested_guests[lpid]) {
			
 
				+		kvm->arch.nested_guests[lpid] = NULL;
			
 
				+		if (lpid == kvm->arch.max_nested_lpid) {
			
 
				+			while (--lpid >= 0 && !kvm->arch.nested_guests[lpid])
			
 
				+				;
			
 
				+			kvm->arch.max_nested_lpid = lpid;
			
 
				+		}
			
 
				+		--gp->refcnt;
			
 
				+	}
			
 
				+	ref = gp->refcnt;
			
 
				+	spin_unlock(&kvm->mmu_lock);
			
 
				+	if (ref == 0)
			
 
				+		kvmhv_release_nested(gp);
			
 
				+}
			
 
				+
			
 
				+/*
			
 
				+ * Free up all nested resources allocated for this guest.
			
 
				+ * This is called with no vcpus of the guest running, when
			
 
				+ * switching the guest to HPT mode or when destroying the
			
 
				+ * guest.
			
 
				+ */
			
 
				+void kvmhv_release_all_nested(struct kvm *kvm)
			
 
				+{
			
 
				+	int i;
			
 
				+	struct kvm_nested_guest *gp;
			
 
				+	struct kvm_nested_guest *freelist = NULL;
			
 
				+	struct kvm_memory_slot *memslot;
			
 
				+	int srcu_idx;
			
 
				+
			
 
				+	spin_lock(&kvm->mmu_lock);
			
 
				+	for (i = 0; i <= kvm->arch.max_nested_lpid; i++) {
			
 
				+		gp = kvm->arch.nested_guests[i];
			
 
				+		if (!gp)
			
 
				+			continue;
			
 
				+		kvm->arch.nested_guests[i] = NULL;
			
 
				+		if (--gp->refcnt == 0) {
			
 
				+			gp->next = freelist;
			
 
				+			freelist = gp;
			
 
				+		}
			
 
				+	}
			
 
				+	kvm->arch.max_nested_lpid = -1;
			
 
				+	spin_unlock(&kvm->mmu_lock);
			
 
				+	while ((gp = freelist) != NULL) {
			
 
				+		freelist = gp->next;
			
 
				+		kvmhv_release_nested(gp);
			
 
				+	}
			
 
				+
			
 
				+	srcu_idx = srcu_read_lock(&kvm->srcu);
			
 
				+	kvm_for_each_memslot(memslot, kvm_memslots(kvm))
			
 
				+		kvmhv_free_memslot_nest_rmap(memslot);
			
 
				+	srcu_read_unlock(&kvm->srcu, srcu_idx);
			
 
				+}
			
 
				+
			
 
				+/* caller must hold gp->tlb_lock */
			
 
				+static void kvmhv_flush_nested(struct kvm_nested_guest *gp)
			
 
				+{
			
 
				+	struct kvm *kvm = gp->l1_host;
			
 
				+
			
 
				+	spin_lock(&kvm->mmu_lock);
			
 
				+	kvmppc_free_pgtable_radix(kvm, gp->shadow_pgtable, gp->shadow_lpid);
			
 
				+	spin_unlock(&kvm->mmu_lock);
			
 
				+	kvmhv_flush_lpid(gp->shadow_lpid);
			
 
				+	kvmhv_update_ptbl_cache(gp);
			
 
				+	if (gp->l1_gr_to_hr == 0)
			
 
				+		kvmhv_remove_nested(gp);
			
 
				+}
			
 
				+
			
 
				+struct kvm_nested_guest *kvmhv_get_nested(struct kvm *kvm, int l1_lpid,
			
 
				+					  bool create)
			
 
				+{
			
 
				+	struct kvm_nested_guest *gp, *newgp;
			
 
				+
			
 
				+	if (l1_lpid >= KVM_MAX_NESTED_GUESTS ||
			
 
				+	    l1_lpid >= (1ul << ((kvm->arch.l1_ptcr & PRTS_MASK) + 12 - 4)))
			
 
				+		return NULL;
			
 
				+
			
 
				+	spin_lock(&kvm->mmu_lock);
			
 
				+	gp = kvm->arch.nested_guests[l1_lpid];
			
 
				+	if (gp)
			
 
				+		++gp->refcnt;
			
 
				+	spin_unlock(&kvm->mmu_lock);
			
 
				+
			
 
				+	if (gp || !create)
			
 
				+		return gp;
			
 
				+
			
 
				+	newgp = kvmhv_alloc_nested(kvm, l1_lpid);
			
 
				+	if (!newgp)
			
 
				+		return NULL;
			
 
				+	spin_lock(&kvm->mmu_lock);
			
 
				+	if (kvm->arch.nested_guests[l1_lpid]) {
			
 
				+		/* someone else beat us to it */
			
 
				+		gp = kvm->arch.nested_guests[l1_lpid];
			
 
				+	} else {
			
 
				+		kvm->arch.nested_guests[l1_lpid] = newgp;
			
 
				+		++newgp->refcnt;
			
 
				+		gp = newgp;
			
 
				+		newgp = NULL;
			
 
				+		if (l1_lpid > kvm->arch.max_nested_lpid)
			
 
				+			kvm->arch.max_nested_lpid = l1_lpid;
			
 
				+	}
			
 
				+	++gp->refcnt;
			
 
				+	spin_unlock(&kvm->mmu_lock);
			
 
				+
			
 
				+	if (newgp)
			
 
				+		kvmhv_release_nested(newgp);
			
 
				+
			
 
				+	return gp;
			
 
				+}
			
 
				+
			
 
				+void kvmhv_put_nested(struct kvm_nested_guest *gp)
			
 
				+{
			
 
				+	struct kvm *kvm = gp->l1_host;
			
 
				+	long ref;
			
 
				+
			
 
				+	spin_lock(&kvm->mmu_lock);
			
 
				+	ref = --gp->refcnt;
			
 
				+	spin_unlock(&kvm->mmu_lock);
			
 
				+	if (ref == 0)
			
 
				+		kvmhv_release_nested(gp);
			
 
				+}
			
 
				+
			
 
				+static struct kvm_nested_guest *kvmhv_find_nested(struct kvm *kvm, int lpid)
			
 
				+{
			
 
				+	if (lpid > kvm->arch.max_nested_lpid)
			
 
				+		return NULL;
			
 
				+	return kvm->arch.nested_guests[lpid];
			
 
				+}
			
 
				+
			
 
				+static inline bool kvmhv_n_rmap_is_equal(u64 rmap_1, u64 rmap_2)
			
 
				+{
			
 
				+	return !((rmap_1 ^ rmap_2) & (RMAP_NESTED_LPID_MASK |
			
 
				+				       RMAP_NESTED_GPA_MASK));
			
 
				+}
			
 
				+
			
 
				+void kvmhv_insert_nest_rmap(struct kvm *kvm, unsigned long *rmapp,
			
 
				+			    struct rmap_nested **n_rmap)
			
 
				+{
			
 
				+	struct llist_node *entry = ((struct llist_head *) rmapp)->first;
			
 
				+	struct rmap_nested *cursor;
			
 
				+	u64 rmap, new_rmap = (*n_rmap)->rmap;
			
 
				+
			
 
				+	/* Are there any existing entries? */
			
 
				+	if (!(*rmapp)) {
			
 
				+		/* No -> use the rmap as a single entry */
			
 
				+		*rmapp = new_rmap | RMAP_NESTED_IS_SINGLE_ENTRY;
			
 
				+		return;
			
 
				+	}
			
 
				+
			
 
				+	/* Do any entries match what we're trying to insert? */
			
 
				+	for_each_nest_rmap_safe(cursor, entry, &rmap) {
			
 
				+		if (kvmhv_n_rmap_is_equal(rmap, new_rmap))
			
 
				+			return;
			
 
				+	}
			
 
				+
			
 
				+	/* Do we need to create a list or just add the new entry? */
			
 
				+	rmap = *rmapp;
			
 
				+	if (rmap & RMAP_NESTED_IS_SINGLE_ENTRY) /* Not previously a list */
			
 
				+		*rmapp = 0UL;
			
 
				+	llist_add(&((*n_rmap)->list), (struct llist_head *) rmapp);
			
 
				+	if (rmap & RMAP_NESTED_IS_SINGLE_ENTRY) /* Not previously a list */
			
 
				+		(*n_rmap)->list.next = (struct llist_node *) rmap;
			
 
				+
			
 
				+	/* Set NULL so not freed by caller */
			
 
				+	*n_rmap = NULL;
			
 
				+}
			
 
				+
			
 
				+static void kvmhv_remove_nest_rmap(struct kvm *kvm, u64 n_rmap,
			
 
				+				   unsigned long hpa, unsigned long mask)
			
 
				+{
			
 
				+	struct kvm_nested_guest *gp;
			
 
				+	unsigned long gpa;
			
 
				+	unsigned int shift, lpid;
			
 
				+	pte_t *ptep;
			
 
				+
			
 
				+	gpa = n_rmap & RMAP_NESTED_GPA_MASK;
			
 
				+	lpid = (n_rmap & RMAP_NESTED_LPID_MASK) >> RMAP_NESTED_LPID_SHIFT;
			
 
				+	gp = kvmhv_find_nested(kvm, lpid);
			
 
				+	if (!gp)
			
 
				+		return;
			
 
				+
			
 
				+	/* Find and invalidate the pte */
			
 
				+	ptep = __find_linux_pte(gp->shadow_pgtable, gpa, NULL, &shift);
			
 
				+	/* Don't spuriously invalidate ptes if the pfn has changed */
			
 
				+	if (ptep && pte_present(*ptep) && ((pte_val(*ptep) & mask) == hpa))
			
 
				+		kvmppc_unmap_pte(kvm, ptep, gpa, shift, NULL, gp->shadow_lpid);
			
 
				+}
			
 
				+
			
 
				+static void kvmhv_remove_nest_rmap_list(struct kvm *kvm, unsigned long *rmapp,
			
 
				+					unsigned long hpa, unsigned long mask)
			
 
				+{
			
 
				+	struct llist_node *entry = llist_del_all((struct llist_head *) rmapp);
			
 
				+	struct rmap_nested *cursor;
			
 
				+	unsigned long rmap;
			
 
				+
			
 
				+	for_each_nest_rmap_safe(cursor, entry, &rmap) {
			
 
				+		kvmhv_remove_nest_rmap(kvm, rmap, hpa, mask);
			
 
				+		kfree(cursor);
			
 
				+	}
			
 
				+}
			
 
				+
			
 
				+/* called with kvm->mmu_lock held */
			
 
				+void kvmhv_remove_nest_rmap_range(struct kvm *kvm,
			
 
				+				  struct kvm_memory_slot *memslot,
			
 
				+				  unsigned long gpa, unsigned long hpa,
			
 
				+				  unsigned long nbytes)
			
 
				+{
			
 
				+	unsigned long gfn, end_gfn;
			
 
				+	unsigned long addr_mask;
			
 
				+
			
 
				+	if (!memslot)
			
 
				+		return;
			
 
				+	gfn = (gpa >> PAGE_SHIFT) - memslot->base_gfn;
			
 
				+	end_gfn = gfn + (nbytes >> PAGE_SHIFT);
			
 
				+
			
 
				+	addr_mask = PTE_RPN_MASK & ~(nbytes - 1);
			
 
				+	hpa &= addr_mask;
			
 
				+
			
 
				+	for (; gfn < end_gfn; gfn++) {
			
 
				+		unsigned long *rmap = &memslot->arch.rmap[gfn];
			
 
				+		kvmhv_remove_nest_rmap_list(kvm, rmap, hpa, addr_mask);
			
 
				+	}
			
 
				+}
			
 
				+
			
 
				+static void kvmhv_free_memslot_nest_rmap(struct kvm_memory_slot *free)
			
 
				+{
			
 
				+	unsigned long page;
			
 
				+
			
 
				+	for (page = 0; page < free->npages; page++) {
			
 
				+		unsigned long rmap, *rmapp = &free->arch.rmap[page];
			
 
				+		struct rmap_nested *cursor;
			
 
				+		struct llist_node *entry;
			
 
				+
			
 
				+		entry = llist_del_all((struct llist_head *) rmapp);
			
 
				+		for_each_nest_rmap_safe(cursor, entry, &rmap)
			
 
				+			kfree(cursor);
			
 
				+	}
			
 
				+}
			
 
				+
			
 
				+static bool kvmhv_invalidate_shadow_pte(struct kvm_vcpu *vcpu,
			
 
				+					struct kvm_nested_guest *gp,
			
 
				+					long gpa, int *shift_ret)
			
 
				+{
			
 
				+	struct kvm *kvm = vcpu->kvm;
			
 
				+	bool ret = false;
			
 
				+	pte_t *ptep;
			
 
				+	int shift;
			
 
				+
			
 
				+	spin_lock(&kvm->mmu_lock);
			
 
				+	ptep = __find_linux_pte(gp->shadow_pgtable, gpa, NULL, &shift);
			
 
				+	if (!shift)
			
 
				+		shift = PAGE_SHIFT;
			
 
				+	if (ptep && pte_present(*ptep)) {
			
 
				+		kvmppc_unmap_pte(kvm, ptep, gpa, shift, NULL, gp->shadow_lpid);
			
 
				+		ret = true;
			
 
				+	}
			
 
				+	spin_unlock(&kvm->mmu_lock);
			
 
				+
			
 
				+	if (shift_ret)
			
 
				+		*shift_ret = shift;
			
 
				+	return ret;
			
 
				+}
			
 
				+
			
 
				/* Extract the 2-bit RIC field (bits 18-19, LSB = bit 0) of a tlbie instruction word. */
				static inline int get_ric(unsigned int instr)
				{
					const unsigned int field = instr >> 18;

					return field & 0x3;
				}
			
 
				+
			
 
				/* Extract the 1-bit PRS field (bit 17, LSB = bit 0) of a tlbie instruction word. */
				static inline int get_prs(unsigned int instr)
				{
					const unsigned int field = instr >> 17;

					return field & 0x1;
				}
			
 
				+
			
 
				/* Extract the 1-bit R field (bit 16, LSB = bit 0) of a tlbie instruction word. */
				static inline int get_r(unsigned int instr)
				{
					const unsigned int field = instr >> 16;

					return field & 0x1;
				}
			
 
				+
			
 
				/* The lpid is carried in the low 32 bits of the rS value. */
				static inline int get_lpid(unsigned long r_val)
				{
					const unsigned long low_word = r_val & 0xffffffff;

					return low_word;
				}
			
 
				+
			
 
				/* Extract the 2-bit IS field (bits 10-11, LSB = bit 0) of the rB value. */
				static inline int get_is(unsigned long r_val)
				{
					const unsigned long field = r_val >> 10;

					return field & 0x3;
				}
			
 
				+
			
 
				/* Extract the 3-bit AP field (bits 5-7, LSB = bit 0) of the rB value. */
				static inline int get_ap(unsigned long r_val)
				{
					const unsigned long field = r_val >> 5;

					return field & 0x7;
				}
			
 
				+
			
 
				/* The effective page number is everything above the low 12 bits of the rB value. */
				static inline long get_epn(unsigned long r_val)
				{
					const unsigned long page_number = r_val >> 12;

					return page_number;
				}
			
 
				+
			
 
				+static int kvmhv_emulate_tlbie_tlb_addr(struct kvm_vcpu *vcpu, int lpid,
			
 
				+					int ap, long epn)
			
 
				+{
			
 
				+	struct kvm *kvm = vcpu->kvm;
			
 
				+	struct kvm_nested_guest *gp;
			
 
				+	long npages;
			
 
				+	int shift, shadow_shift;
			
 
				+	unsigned long addr;
			
 
				+
			
 
				+	shift = ap_to_shift(ap);
			
 
				+	addr = epn << 12;
			
 
				+	if (shift < 0)
			
 
				+		/* Invalid ap encoding */
			
 
				+		return -EINVAL;
			
 
				+
			
 
				+	addr &= ~((1UL << shift) - 1);
			
 
				+	npages = 1UL << (shift - PAGE_SHIFT);
			
 
				+
			
 
				+	gp = kvmhv_get_nested(kvm, lpid, false);
			
 
				+	if (!gp) /* No such guest -> nothing to do */
			
 
				+		return 0;
			
 
				+	mutex_lock(&gp->tlb_lock);
			
 
				+
			
 
				+	/* There may be more than one host page backing this single guest pte */
			
 
				+	do {
			
 
				+		kvmhv_invalidate_shadow_pte(vcpu, gp, addr, &shadow_shift);
			
 
				+
			
 
				+		npages -= 1UL << (shadow_shift - PAGE_SHIFT);
			
 
				+		addr += 1UL << shadow_shift;
			
 
				+	} while (npages > 0);
			
 
				+
			
 
				+	mutex_unlock(&gp->tlb_lock);
			
 
				+	kvmhv_put_nested(gp);
			
 
				+	return 0;
			
 
				+}
			
 
				+
			
 
				+static void kvmhv_emulate_tlbie_lpid(struct kvm_vcpu *vcpu,
			
 
				+				     struct kvm_nested_guest *gp, int ric)
			
 
				+{
			
 
				+	struct kvm *kvm = vcpu->kvm;
			
 
				+
			
 
				+	mutex_lock(&gp->tlb_lock);
			
 
				+	switch (ric) {
			
 
				+	case 0:
			
 
				+		/* Invalidate TLB */
			
 
				+		spin_lock(&kvm->mmu_lock);
			
 
				+		kvmppc_free_pgtable_radix(kvm, gp->shadow_pgtable,
			
 
				+					  gp->shadow_lpid);
			
 
				+		kvmhv_flush_lpid(gp->shadow_lpid);
			
 
				+		spin_unlock(&kvm->mmu_lock);
			
 
				+		break;
			
 
				+	case 1:
			
 
				+		/*
			
 
				+		 * Invalidate PWC
			
 
				+		 * We don't cache this -> nothing to do
			
 
				+		 */
			
 
				+		break;
			
 
				+	case 2:
			
 
				+		/* Invalidate TLB, PWC and caching of partition table entries */
			
 
				+		kvmhv_flush_nested(gp);
			
 
				+		break;
			
 
				+	default:
			
 
				+		break;
			
 
				+	}
			
 
				+	mutex_unlock(&gp->tlb_lock);
			
 
				+}
			
 
				+
			
 
				+static void kvmhv_emulate_tlbie_all_lpid(struct kvm_vcpu *vcpu, int ric)
			
 
				+{
			
 
				+	struct kvm *kvm = vcpu->kvm;
			
 
				+	struct kvm_nested_guest *gp;
			
 
				+	int i;
			
 
				+
			
 
				+	spin_lock(&kvm->mmu_lock);
			
 
				+	for (i = 0; i <= kvm->arch.max_nested_lpid; i++) {
			
 
				+		gp = kvm->arch.nested_guests[i];
			
 
				+		if (gp) {
			
 
				+			spin_unlock(&kvm->mmu_lock);
			
 
				+			kvmhv_emulate_tlbie_lpid(vcpu, gp, ric);
			
 
				+			spin_lock(&kvm->mmu_lock);
			
 
				+		}
			
 
				+	}
			
 
				+	spin_unlock(&kvm->mmu_lock);
			
 
				+}
			
 
				+
			
 
				+static int kvmhv_emulate_priv_tlbie(struct kvm_vcpu *vcpu, unsigned int instr,
			
 
				+				    unsigned long rsval, unsigned long rbval)
			
 
				+{
			
 
				+	struct kvm *kvm = vcpu->kvm;
			
 
				+	struct kvm_nested_guest *gp;
			
 
				+	int r, ric, prs, is, ap;
			
 
				+	int lpid;
			
 
				+	long epn;
			
 
				+	int ret = 0;
			
 
				+
			
 
				+	ric = get_ric(instr);
			
 
				+	prs = get_prs(instr);
			
 
				+	r = get_r(instr);
			
 
				+	lpid = get_lpid(rsval);
			
 
				+	is = get_is(rbval);
			
 
				+
			
 
				+	/*
			
 
				+	 * These cases are invalid and are not handled:
			
 
				+	 * r   != 1 -> Only radix supported
			
 
				+	 * prs == 1 -> Not HV privileged
			
 
				+	 * ric == 3 -> No cluster bombs for radix
			
 
				+	 * is  == 1 -> Partition scoped translations not associated with pid
			
 
				+	 * (!is) && (ric == 1 || ric == 2) -> Not supported by ISA
			
 
				+	 */
			
 
				+	if ((!r) || (prs) || (ric == 3) || (is == 1) ||
			
 
				+	    ((!is) && (ric == 1 || ric == 2)))
			
 
				+		return -EINVAL;
			
 
				+
			
 
				+	switch (is) {
			
 
				+	case 0:
			
 
				+		/*
			
 
				+		 * We know ric == 0
			
 
				+		 * Invalidate TLB for a given target address
			
 
				+		 */
			
 
				+		epn = get_epn(rbval);
			
 
				+		ap = get_ap(rbval);
			
 
				+		ret = kvmhv_emulate_tlbie_tlb_addr(vcpu, lpid, ap, epn);
			
 
				+		break;
			
 
				+	case 2:
			
 
				+		/* Invalidate matching LPID */
			
 
				+		gp = kvmhv_get_nested(kvm, lpid, false);
			
 
				+		if (gp) {
			
 
				+			kvmhv_emulate_tlbie_lpid(vcpu, gp, ric);
			
 
				+			kvmhv_put_nested(gp);
			
 
				+		}
			
 
				+		break;
			
 
				+	case 3:
			
 
				+		/* Invalidate ALL LPIDs */
			
 
				+		kvmhv_emulate_tlbie_all_lpid(vcpu, ric);
			
 
				+		break;
			
 
				+	default:
			
 
				+		ret = -EINVAL;
			
 
				+		break;
			
 
				+	}
			
 
				+
			
 
				+	return ret;
			
 
				+}
			
 
				+
			
 
				+/*
			
 
				+ * This handles the H_TLB_INVALIDATE hcall.
			
 
				+ * Parameters are (r4) tlbie instruction code, (r5) rS contents,
			
 
				+ * (r6) rB contents.
			
 
				+ */
			
 
				+long kvmhv_do_nested_tlbie(struct kvm_vcpu *vcpu)
			
 
				+{
			
 
				+	int ret;
			
 
				+
			
 
				+	ret = kvmhv_emulate_priv_tlbie(vcpu, kvmppc_get_gpr(vcpu, 4),
			
 
				+			kvmppc_get_gpr(vcpu, 5), kvmppc_get_gpr(vcpu, 6));
			
 
				+	if (ret)
			
 
				+		return H_PARAMETER;
			
 
				+	return H_SUCCESS;
			
 
				+}
			
 
				+
			
 
				+/* Used to convert a nested guest real address to a L1 guest real address */
			
 
				+static int kvmhv_translate_addr_nested(struct kvm_vcpu *vcpu,
			
 
				+				       struct kvm_nested_guest *gp,
			
 
				+				       unsigned long n_gpa, unsigned long dsisr,
			
 
				+				       struct kvmppc_pte *gpte_p)
			
 
				+{
			
 
				+	u64 fault_addr, flags = dsisr & DSISR_ISSTORE;
			
 
				+	int ret;
			
 
				+
			
 
				+	ret = kvmppc_mmu_walk_radix_tree(vcpu, n_gpa, gpte_p, gp->l1_gr_to_hr,
			
 
				+					 &fault_addr);
			
 
				+
			
 
				+	if (ret) {
			
 
				+		/* We didn't find a pte */
			
 
				+		if (ret == -EINVAL) {
			
 
				+			/* Unsupported mmu config */
			
 
				+			flags |= DSISR_UNSUPP_MMU;
			
 
				+		} else if (ret == -ENOENT) {
			
 
				+			/* No translation found */
			
 
				+			flags |= DSISR_NOHPTE;
			
 
				+		} else if (ret == -EFAULT) {
			
 
				+			/* Couldn't access L1 real address */
			
 
				+			flags |= DSISR_PRTABLE_FAULT;
			
 
				+			vcpu->arch.fault_gpa = fault_addr;
			
 
				+		} else {
			
 
				+			/* Unknown error */
			
 
				+			return ret;
			
 
				+		}
			
 
				+		goto forward_to_l1;
			
 
				+	} else {
			
 
				+		/* We found a pte -> check permissions */
			
 
				+		if (dsisr & DSISR_ISSTORE) {
			
 
				+			/* Can we write? */
			
 
				+			if (!gpte_p->may_write) {
			
 
				+				flags |= DSISR_PROTFAULT;
			
 
				+				goto forward_to_l1;
			
 
				+			}
			
 
				+		} else if (vcpu->arch.trap == BOOK3S_INTERRUPT_H_INST_STORAGE) {
			
 
				+			/* Can we execute? */
			
 
				+			if (!gpte_p->may_execute) {
			
 
				+				flags |= SRR1_ISI_N_OR_G;
			
 
				+				goto forward_to_l1;
			
 
				+			}
			
 
				+		} else {
			
 
				+			/* Can we read? */
			
 
				+			if (!gpte_p->may_read && !gpte_p->may_write) {
			
 
				+				flags |= DSISR_PROTFAULT;
			
 
				+				goto forward_to_l1;
			
 
				+			}
			
 
				+		}
			
 
				+	}
			
 
				+
			
 
				+	return 0;
			
 
				+
			
 
				+forward_to_l1:
			
 
				+	vcpu->arch.fault_dsisr = flags;
			
 
				+	if (vcpu->arch.trap == BOOK3S_INTERRUPT_H_INST_STORAGE) {
			
 
				+		vcpu->arch.shregs.msr &= ~0x783f0000ul;
			
 
				+		vcpu->arch.shregs.msr |= flags;
			
 
				+	}
			
 
				+	return RESUME_HOST;
			
 
				+}
			
 
				+
			
 
				+static long kvmhv_handle_nested_set_rc(struct kvm_vcpu *vcpu,
			
 
				+				       struct kvm_nested_guest *gp,
			
 
				+				       unsigned long n_gpa,
			
 
				+				       struct kvmppc_pte gpte,
			
 
				+				       unsigned long dsisr)
			
 
				+{
			
 
				+	struct kvm *kvm = vcpu->kvm;
			
 
				+	bool writing = !!(dsisr & DSISR_ISSTORE);
			
 
				+	u64 pgflags;
			
 
				+	bool ret;
			
 
				+
			
 
				+	/* Are the rc bits set in the L1 partition scoped pte? */
			
 
				+	pgflags = _PAGE_ACCESSED;
			
 
				+	if (writing)
			
 
				+		pgflags |= _PAGE_DIRTY;
			
 
				+	if (pgflags & ~gpte.rc)
			
 
				+		return RESUME_HOST;
			
 
				+
			
 
				+	spin_lock(&kvm->mmu_lock);
			
 
				+	/* Set the rc bit in the pte of our (L0) pgtable for the L1 guest */
			
 
				+	ret = kvmppc_hv_handle_set_rc(kvm, kvm->arch.pgtable, writing,
			
 
				+				     gpte.raddr, kvm->arch.lpid);
			
 
				+	spin_unlock(&kvm->mmu_lock);
			
 
				+	if (!ret)
			
 
				+		return -EINVAL;
			
 
				+
			
 
				+	/* Set the rc bit in the pte of the shadow_pgtable for the nest guest */
			
 
				+	ret = kvmppc_hv_handle_set_rc(kvm, gp->shadow_pgtable, writing, n_gpa,
			
 
				+				      gp->shadow_lpid);
			
 
				+	if (!ret)
			
 
				+		return -EINVAL;
			
 
				+	return 0;
			
 
				+}
			
 
				+
			
 
				+static inline int kvmppc_radix_level_to_shift(int level)
			
 
				+{
			
 
				+	switch (level) {
			
 
				+	case 2:
			
 
				+		return PUD_SHIFT;
			
 
				+	case 1:
			
 
				+		return PMD_SHIFT;
			
 
				+	default:
			
 
				+		return PAGE_SHIFT;
			
 
				+	}
			
 
				+}
			
 
				+
			
 
				+static inline int kvmppc_radix_shift_to_level(int shift)
			
 
				+{
			
 
				+	if (shift == PUD_SHIFT)
			
 
				+		return 2;
			
 
				+	if (shift == PMD_SHIFT)
			
 
				+		return 1;
			
 
				+	if (shift == PAGE_SHIFT)
			
 
				+		return 0;
			
 
				+	WARN_ON_ONCE(1);
			
 
				+	return 0;
			
 
				+}
			
 
				+
			
 
				+/* called with gp->tlb_lock held */
			
 
				+static long int __kvmhv_nested_page_fault(struct kvm_vcpu *vcpu,
			
 
				+					  struct kvm_nested_guest *gp)
			
 
				+{
			
 
				+	struct kvm *kvm = vcpu->kvm;
			
 
				+	struct kvm_memory_slot *memslot;
			
 
				+	struct rmap_nested *n_rmap;
			
 
				+	struct kvmppc_pte gpte;
			
 
				+	pte_t pte, *pte_p;
			
 
				+	unsigned long mmu_seq;
			
 
				+	unsigned long dsisr = vcpu->arch.fault_dsisr;
			
 
				+	unsigned long ea = vcpu->arch.fault_dar;
			
 
				+	unsigned long *rmapp;
			
 
				+	unsigned long n_gpa, gpa, gfn, perm = 0UL;
			
 
				+	unsigned int shift, l1_shift, level;
			
 
				+	bool writing = !!(dsisr & DSISR_ISSTORE);
			
 
				+	bool kvm_ro = false;
			
 
				+	long int ret;
			
 
				+
			
 
				+	if (!gp->l1_gr_to_hr) {
			
 
				+		kvmhv_update_ptbl_cache(gp);
			
 
				+		if (!gp->l1_gr_to_hr)
			
 
				+			return RESUME_HOST;
			
 
				+	}
			
 
				+
			
 
				+	/* Convert the nested guest real address into a L1 guest real address */
			
 
				+
			
 
				+	n_gpa = vcpu->arch.fault_gpa & ~0xF000000000000FFFULL;
			
 
				+	if (!(dsisr & DSISR_PRTABLE_FAULT))
			
 
				+		n_gpa |= ea & 0xFFF;
			
 
				+	ret = kvmhv_translate_addr_nested(vcpu, gp, n_gpa, dsisr, &gpte);
			
 
				+
			
 
				+	/*
			
 
				+	 * If the hardware found a translation but we don't now have a usable
			
 
				+	 * translation in the l1 partition-scoped tree, remove the shadow pte
			
 
				+	 * and let the guest retry.
			
 
				+	 */
			
 
				+	if (ret == RESUME_HOST &&
			
 
				+	    (dsisr & (DSISR_PROTFAULT | DSISR_BADACCESS | DSISR_NOEXEC_OR_G |
			
 
				+		      DSISR_BAD_COPYPASTE)))
			
 
				+		goto inval;
			
 
				+	if (ret)
			
 
				+		return ret;
			
 
				+
			
 
				+	/* Failed to set the reference/change bits */
			
 
				+	if (dsisr & DSISR_SET_RC) {
			
 
				+		ret = kvmhv_handle_nested_set_rc(vcpu, gp, n_gpa, gpte, dsisr);
			
 
				+		if (ret == RESUME_HOST)
			
 
				+			return ret;
			
 
				+		if (ret)
			
 
				+			goto inval;
			
 
				+		dsisr &= ~DSISR_SET_RC;
			
 
				+		if (!(dsisr & (DSISR_BAD_FAULT_64S | DSISR_NOHPTE |
			
 
				+			       DSISR_PROTFAULT)))
			
 
				+			return RESUME_GUEST;
			
 
				+	}
			
 
				+
			
 
				+	/*
			
 
				+	 * We took an HISI or HDSI while we were running a nested guest which
			
 
				+	 * means we have no partition scoped translation for that. This means
			
 
				+	 * we need to insert a pte for the mapping into our shadow_pgtable.
			
 
				+	 */
			
 
				+
			
 
				+	l1_shift = gpte.page_shift;
			
 
				+	if (l1_shift < PAGE_SHIFT) {
			
 
				+		/* We don't support l1 using a page size smaller than our own */
			
 
				+		pr_err("KVM: L1 guest page shift (%d) less than our own (%d)\n",
			
 
				+			l1_shift, PAGE_SHIFT);
			
 
				+		return -EINVAL;
			
 
				+	}
			
 
				+	gpa = gpte.raddr;
			
 
				+	gfn = gpa >> PAGE_SHIFT;
			
 
				+
			
 
				+	/* 1. Get the corresponding host memslot */
			
 
				+
			
 
				+	memslot = gfn_to_memslot(kvm, gfn);
			
 
				+	if (!memslot || (memslot->flags & KVM_MEMSLOT_INVALID)) {
			
 
				+		if (dsisr & (DSISR_PRTABLE_FAULT | DSISR_BADACCESS)) {
			
 
				+			/* unusual error -> reflect to the guest as a DSI */
			
 
				+			kvmppc_core_queue_data_storage(vcpu, ea, dsisr);
			
 
				+			return RESUME_GUEST;
			
 
				+		}
			
 
				+		/* passthrough of emulated MMIO case... */
			
 
				+		pr_err("emulated MMIO passthrough?\n");
			
 
				+		return -EINVAL;
			
 
				+	}
			
 
				+	if (memslot->flags & KVM_MEM_READONLY) {
			
 
				+		if (writing) {
			
 
				+			/* Give the guest a DSI */
			
 
				+			kvmppc_core_queue_data_storage(vcpu, ea,
			
 
				+					DSISR_ISSTORE | DSISR_PROTFAULT);
			
 
				+			return RESUME_GUEST;
			
 
				+		}
			
 
				+		kvm_ro = true;
			
 
				+	}
			
 
				+
			
 
				+	/* 2. Find the host pte for this L1 guest real address */
			
 
				+
			
 
				+	/* Used to check for invalidations in progress */
			
 
				+	mmu_seq = kvm->mmu_notifier_seq;
			
 
				+	smp_rmb();
			
 
				+
			
 
				+	/* See if can find translation in our partition scoped tables for L1 */
			
 
				+	pte = __pte(0);
			
 
				+	spin_lock(&kvm->mmu_lock);
			
 
				+	pte_p = __find_linux_pte(kvm->arch.pgtable, gpa, NULL, &shift);
			
 
				+	if (!shift)
			
 
				+		shift = PAGE_SHIFT;
			
 
				+	if (pte_p)
			
 
				+		pte = *pte_p;
			
 
				+	spin_unlock(&kvm->mmu_lock);
			
 
				+
			
 
				+	if (!pte_present(pte) || (writing && !(pte_val(pte) & _PAGE_WRITE))) {
			
 
				+		/* No suitable pte found -> try to insert a mapping */
			
 
				+		ret = kvmppc_book3s_instantiate_page(vcpu, gpa, memslot,
			
 
				+					writing, kvm_ro, &pte, &level);
			
 
				+		if (ret == -EAGAIN)
			
 
				+			return RESUME_GUEST;
			
 
				+		else if (ret)
			
 
				+			return ret;
			
 
				+		shift = kvmppc_radix_level_to_shift(level);
			
 
				+	}
			
 
				+
			
 
				+	/* 3. Compute the pte we need to insert for nest_gpa -> host r_addr */
			
 
				+
			
 
				+	/* The permissions is the combination of the host and l1 guest ptes */
			
 
				+	perm |= gpte.may_read ? 0UL : _PAGE_READ;
			
 
				+	perm |= gpte.may_write ? 0UL : _PAGE_WRITE;
			
 
				+	perm |= gpte.may_execute ? 0UL : _PAGE_EXEC;
			
 
				+	pte = __pte(pte_val(pte) & ~perm);
			
 
				+
			
 
				+	/* What size pte can we insert? */
			
 
				+	if (shift > l1_shift) {
			
 
				+		u64 mask;
			
 
				+		unsigned int actual_shift = PAGE_SHIFT;
			
 
				+		if (PMD_SHIFT < l1_shift)
			
 
				+			actual_shift = PMD_SHIFT;
			
 
				+		mask = (1UL << shift) - (1UL << actual_shift);
			
 
				+		pte = __pte(pte_val(pte) | (gpa & mask));
			
 
				+		shift = actual_shift;
			
 
				+	}
			
 
				+	level = kvmppc_radix_shift_to_level(shift);
			
 
				+	n_gpa &= ~((1UL << shift) - 1);
			
 
				+
			
 
				+	/* 4. Insert the pte into our shadow_pgtable */
			
 
				+
			
 
				+	n_rmap = kzalloc(sizeof(*n_rmap), GFP_KERNEL);
			
 
				+	if (!n_rmap)
			
 
				+		return RESUME_GUEST; /* Let the guest try again */
			
 
				+	n_rmap->rmap = (n_gpa & RMAP_NESTED_GPA_MASK) |
			
 
				+		(((unsigned long) gp->l1_lpid) << RMAP_NESTED_LPID_SHIFT);
			
 
				+	rmapp = &memslot->arch.rmap[gfn - memslot->base_gfn];
			
 
				+	ret = kvmppc_create_pte(kvm, gp->shadow_pgtable, pte, n_gpa, level,
			
 
				+				mmu_seq, gp->shadow_lpid, rmapp, &n_rmap);
			
 
				+	if (n_rmap)
			
 
				+		kfree(n_rmap);
			
 
				+	if (ret == -EAGAIN)
			
 
				+		ret = RESUME_GUEST;	/* Let the guest try again */
			
 
				+
			
 
				+	return ret;
			
 
				+
			
 
				+ inval:
			
 
				+	kvmhv_invalidate_shadow_pte(vcpu, gp, n_gpa, NULL);
			
 
				+	return RESUME_GUEST;
			
 
				+}
			
 
				+
			
 
				+long int kvmhv_nested_page_fault(struct kvm_vcpu *vcpu)
			
 
				+{
			
 
				+	struct kvm_nested_guest *gp = vcpu->arch.nested;
			
 
				+	long int ret;
			
 
				+
			
 
				+	mutex_lock(&gp->tlb_lock);
			
 
				+	ret = __kvmhv_nested_page_fault(vcpu, gp);
			
 
				+	mutex_unlock(&gp->tlb_lock);
			
 
				+	return ret;
			
 
				+}
			
 
				+
			
 
				+int kvmhv_nested_next_lpid(struct kvm *kvm, int lpid)
			
 
				+{
			
 
				+	int ret = -1;
			
 
				+
			
 
				+	spin_lock(&kvm->mmu_lock);
			
 
				+	while (++lpid <= kvm->arch.max_nested_lpid) {
			
 
				+		if (kvm->arch.nested_guests[lpid]) {
			
 
				+			ret = lpid;
			
 
				+			break;
			
 
				+		}
			
 
				+	}
			
 
				+	spin_unlock(&kvm->mmu_lock);
			
 
				+	return ret;
			
 
				+}
			
--- a/arch/powerpc/kvm/book3s_hv_ras.c
+++ b/arch/powerpc/kvm/book3s_hv_ras.c
@@ -177,6 +177,7 @@ void kvmppc_subcore_enter_guest(void)
 
				 
			
 
				 	local_paca->sibling_subcore_state->in_guest[subcore_id] = 1;
			
 
				 }
			
 
				+EXPORT_SYMBOL_GPL(kvmppc_subcore_enter_guest);
			
 
				 
			
 
				 void kvmppc_subcore_exit_guest(void)
			
 
				 {
			
@@ -187,6 +188,7 @@ void kvmppc_subcore_exit_guest(void)
 
				 
			
 
				 	local_paca->sibling_subcore_state->in_guest[subcore_id] = 0;
			
 
				 }
			
 
				+EXPORT_SYMBOL_GPL(kvmppc_subcore_exit_guest);
			
 
				 
			
 
				 static bool kvmppc_tb_resync_required(void)
			
 
				 {
			
@@ -331,5 +333,13 @@ long kvmppc_realmode_hmi_handler(void)
 
				 	} else {
			
 
				 		wait_for_tb_resync();
			
 
				 	}
			
 
				+
			
 
				+	/*
			
 
				+	 * Reset tb_offset_applied so the guest exit code won't try
			
 
				+	 * to subtract the previous timebase offset from the timebase.
			
 
				+	 */
			
 
				+	if (local_paca->kvm_hstate.kvm_vcore)
			
 
				+		local_paca->kvm_hstate.kvm_vcore->tb_offset_applied = 0;
			
 
				+
			
 
				 	return 0;
			
 
				 }
			
--- a/arch/powerpc/kvm/book3s_hv_rm_xics.c
+++ b/arch/powerpc/kvm/book3s_hv_rm_xics.c
@@ -136,7 +136,7 @@ static void icp_rm_set_vcpu_irq(struct kvm_vcpu *vcpu,
 
				 
			
 
				 	/* Mark the target VCPU as having an interrupt pending */
			
 
				 	vcpu->stat.queue_intr++;
			
 
				-	set_bit(BOOK3S_IRQPRIO_EXTERNAL_LEVEL, &vcpu->arch.pending_exceptions);
			
 
				+	set_bit(BOOK3S_IRQPRIO_EXTERNAL, &vcpu->arch.pending_exceptions);
			
 
				 
			
 
				 	/* Kick self ? Just set MER and return */
			
 
				 	if (vcpu == this_vcpu) {
			
@@ -170,8 +170,7 @@ static void icp_rm_set_vcpu_irq(struct kvm_vcpu *vcpu,
 
				 static void icp_rm_clr_vcpu_irq(struct kvm_vcpu *vcpu)
			
 
				 {
			
 
				 	/* Note: Only called on self ! */
			
 
				-	clear_bit(BOOK3S_IRQPRIO_EXTERNAL_LEVEL,
			
 
				-		  &vcpu->arch.pending_exceptions);
			
 
				+	clear_bit(BOOK3S_IRQPRIO_EXTERNAL, &vcpu->arch.pending_exceptions);
			
 
				 	mtspr(SPRN_LPCR, mfspr(SPRN_LPCR) & ~LPCR_MER);
			
 
				 }
			
 
				 
			
@@ -768,6 +767,14 @@ static void icp_eoi(struct irq_chip *c, u32 hwirq, __be32 xirr, bool *again)
 
				 	void __iomem *xics_phys;
			
 
				 	int64_t rc;
			
 
				 
			
 
				+	if (kvmhv_on_pseries()) {
			
 
				+		unsigned long retbuf[PLPAR_HCALL_BUFSIZE];
			
 
				+
			
 
				+		iosync();
			
 
				+		plpar_hcall_raw(H_EOI, retbuf, hwirq);
			
 
				+		return;
			
 
				+	}
			
 
				+
			
 
				 	rc = pnv_opal_pci_msi_eoi(c, hwirq);
			
 
				 
			
 
				 	if (rc)
			
--- a/arch/powerpc/kvm/book3s_hv_rmhandlers.S
+++ b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
@@ -28,6 +28,7 @@
 
				 #include <asm/exception-64s.h>
			
 
				 #include <asm/kvm_book3s_asm.h>
			
 
				 #include <asm/book3s/64/mmu-hash.h>
			
 
				+#include <asm/export.h>
			
 
				 #include <asm/tm.h>
			
 
				 #include <asm/opal.h>
			
 
				 #include <asm/xive-regs.h>
			
@@ -46,8 +47,9 @@ END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_300)
 
				 #define NAPPING_NOVCPU	2
			
 
				 
			
 
				 /* Stack frame offsets for kvmppc_hv_entry */
			
 
				-#define SFS			160
			
 
				+#define SFS			208
			
 
				 #define STACK_SLOT_TRAP		(SFS-4)
			
 
				+#define STACK_SLOT_SHORT_PATH	(SFS-8)
			
 
				 #define STACK_SLOT_TID		(SFS-16)
			
 
				 #define STACK_SLOT_PSSCR	(SFS-24)
			
 
				 #define STACK_SLOT_PID		(SFS-32)
			
@@ -56,6 +58,8 @@ END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_300)
 
				 #define STACK_SLOT_DAWR		(SFS-56)
			
 
				 #define STACK_SLOT_DAWRX	(SFS-64)
			
 
				 #define STACK_SLOT_HFSCR	(SFS-72)
			
 
				+/* the following is used by the P9 short path */
			
 
				+#define STACK_SLOT_NVGPRS	(SFS-152)	/* 18 gprs */
			
 
				 
			
 
				 /*
			
 
				  * Call kvmppc_hv_entry in real mode.
			
@@ -113,45 +117,7 @@ END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_207S)
 
				 	mtspr	SPRN_SPRG_VDSO_WRITE,r3
			
 
				 
			
 
				 	/* Reload the host's PMU registers */
			
 
				-	lbz	r4, PACA_PMCINUSE(r13) /* is the host using the PMU? */
			
 
				-	cmpwi	r4, 0
			
 
				-	beq	23f			/* skip if not */
			
 
				-BEGIN_FTR_SECTION
			
 
				-	ld	r3, HSTATE_MMCR0(r13)
			
 
				-	andi.	r4, r3, MMCR0_PMAO_SYNC | MMCR0_PMAO
			
 
				-	cmpwi	r4, MMCR0_PMAO
			
 
				-	beql	kvmppc_fix_pmao
			
 
				-END_FTR_SECTION_IFSET(CPU_FTR_PMAO_BUG)
			
 
				-	lwz	r3, HSTATE_PMC1(r13)
			
 
				-	lwz	r4, HSTATE_PMC2(r13)
			
 
				-	lwz	r5, HSTATE_PMC3(r13)
			
 
				-	lwz	r6, HSTATE_PMC4(r13)
			
 
				-	lwz	r8, HSTATE_PMC5(r13)
			
 
				-	lwz	r9, HSTATE_PMC6(r13)
			
 
				-	mtspr	SPRN_PMC1, r3
			
 
				-	mtspr	SPRN_PMC2, r4
			
 
				-	mtspr	SPRN_PMC3, r5
			
 
				-	mtspr	SPRN_PMC4, r6
			
 
				-	mtspr	SPRN_PMC5, r8
			
 
				-	mtspr	SPRN_PMC6, r9
			
 
				-	ld	r3, HSTATE_MMCR0(r13)
			
 
				-	ld	r4, HSTATE_MMCR1(r13)
			
 
				-	ld	r5, HSTATE_MMCRA(r13)
			
 
				-	ld	r6, HSTATE_SIAR(r13)
			
 
				-	ld	r7, HSTATE_SDAR(r13)
			
 
				-	mtspr	SPRN_MMCR1, r4
			
 
				-	mtspr	SPRN_MMCRA, r5
			
 
				-	mtspr	SPRN_SIAR, r6
			
 
				-	mtspr	SPRN_SDAR, r7
			
 
				-BEGIN_FTR_SECTION
			
 
				-	ld	r8, HSTATE_MMCR2(r13)
			
 
				-	ld	r9, HSTATE_SIER(r13)
			
 
				-	mtspr	SPRN_MMCR2, r8
			
 
				-	mtspr	SPRN_SIER, r9
			
 
				-END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
			
 
				-	mtspr	SPRN_MMCR0, r3
			
 
				-	isync
			
 
				-23:
			
 
				+	bl	kvmhv_load_host_pmu
			
 
				 
			
 
				 	/*
			
 
				 	 * Reload DEC.  HDEC interrupts were disabled when
			
@@ -796,66 +762,23 @@ BEGIN_FTR_SECTION
 
				 	b	91f
			
 
				 END_FTR_SECTION(CPU_FTR_TM | CPU_FTR_P9_TM_HV_ASSIST, 0)
			
 
				 	/*
			
 
				-	 * NOTE THAT THIS TRASHES ALL NON-VOLATILE REGISTERS INCLUDING CR
			
 
				+	 * NOTE THAT THIS TRASHES ALL NON-VOLATILE REGISTERS (but not CR)
			
 
				 	 */
			
 
				 	mr      r3, r4
			
 
				 	ld      r4, VCPU_MSR(r3)
			
 
				+	li	r5, 0			/* don't preserve non-vol regs */
			
 
				 	bl	kvmppc_restore_tm_hv
			
 
				+	nop
			
 
				 	ld	r4, HSTATE_KVM_VCPU(r13)
			
 
				 91:
			
 
				 #endif
			
 
				 
			
 
				-	/* Load guest PMU registers */
			
 
				-	/* R4 is live here (vcpu pointer) */
			
 
				-	li	r3, 1
			
 
				-	sldi	r3, r3, 31		/* MMCR0_FC (freeze counters) bit */
			
 
				-	mtspr	SPRN_MMCR0, r3		/* freeze all counters, disable ints */
			
 
				-	isync
			
 
				-BEGIN_FTR_SECTION
			
 
				-	ld	r3, VCPU_MMCR(r4)
			
 
				-	andi.	r5, r3, MMCR0_PMAO_SYNC | MMCR0_PMAO
			
 
				-	cmpwi	r5, MMCR0_PMAO
			
 
				-	beql	kvmppc_fix_pmao
			
 
				-END_FTR_SECTION_IFSET(CPU_FTR_PMAO_BUG)
			
 
				-	lwz	r3, VCPU_PMC(r4)	/* always load up guest PMU registers */
			
 
				-	lwz	r5, VCPU_PMC + 4(r4)	/* to prevent information leak */
			
 
				-	lwz	r6, VCPU_PMC + 8(r4)
			
 
				-	lwz	r7, VCPU_PMC + 12(r4)
			
 
				-	lwz	r8, VCPU_PMC + 16(r4)
			
 
				-	lwz	r9, VCPU_PMC + 20(r4)
			
 
				-	mtspr	SPRN_PMC1, r3
			
 
				-	mtspr	SPRN_PMC2, r5
			
 
				-	mtspr	SPRN_PMC3, r6
			
 
				-	mtspr	SPRN_PMC4, r7
			
 
				-	mtspr	SPRN_PMC5, r8
			
 
				-	mtspr	SPRN_PMC6, r9
			
 
				-	ld	r3, VCPU_MMCR(r4)
			
 
				-	ld	r5, VCPU_MMCR + 8(r4)
			
 
				-	ld	r6, VCPU_MMCR + 16(r4)
			
 
				-	ld	r7, VCPU_SIAR(r4)
			
 
				-	ld	r8, VCPU_SDAR(r4)
			
 
				-	mtspr	SPRN_MMCR1, r5
			
 
				-	mtspr	SPRN_MMCRA, r6
			
 
				-	mtspr	SPRN_SIAR, r7
			
 
				-	mtspr	SPRN_SDAR, r8
			
 
				-BEGIN_FTR_SECTION
			
 
				-	ld	r5, VCPU_MMCR + 24(r4)
			
 
				-	ld	r6, VCPU_SIER(r4)
			
 
				-	mtspr	SPRN_MMCR2, r5
			
 
				-	mtspr	SPRN_SIER, r6
			
 
				-BEGIN_FTR_SECTION_NESTED(96)
			
 
				-	lwz	r7, VCPU_PMC + 24(r4)
			
 
				-	lwz	r8, VCPU_PMC + 28(r4)
			
 
				-	ld	r9, VCPU_MMCR + 32(r4)
			
 
				-	mtspr	SPRN_SPMC1, r7
			
 
				-	mtspr	SPRN_SPMC2, r8
			
 
				-	mtspr	SPRN_MMCRS, r9
			
 
				-END_FTR_SECTION_NESTED(CPU_FTR_ARCH_300, 0, 96)
			
 
				-END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
			
 
				-	mtspr	SPRN_MMCR0, r3
			
 
				-	isync
			
 
				+	/* Load guest PMU registers; r4 = vcpu pointer here */
			
 
				+	mr	r3, r4
			
 
				+	bl	kvmhv_load_guest_pmu
			
 
				 
			
 
				 	/* Load up FP, VMX and VSX registers */
			
 
				+	ld	r4, HSTATE_KVM_VCPU(r13)
			
 
				 	bl	kvmppc_load_fp
			
 
				 
			
 
				 	ld	r14, VCPU_GPR(R14)(r4)
			
@@ -1100,73 +1023,40 @@ ALT_FTR_SECTION_END_IFCLR(CPU_FTR_ARCH_300)
 
				 no_xive:
			
 
				 #endif /* CONFIG_KVM_XICS */
			
 
				 
			
 
				-deliver_guest_interrupt:
			
 
				-	ld	r6, VCPU_CTR(r4)
			
 
				-	ld	r7, VCPU_XER(r4)
			
 
				-
			
 
				-	mtctr	r6
			
 
				-	mtxer	r7
			
 
				+	li	r0, 0
			
 
				+	stw	r0, STACK_SLOT_SHORT_PATH(r1)
			
 
				 
			
 
				-kvmppc_cede_reentry:		/* r4 = vcpu, r13 = paca */
			
 
				-	ld	r10, VCPU_PC(r4)
			
 
				-	ld	r11, VCPU_MSR(r4)
			
 
				+deliver_guest_interrupt:	/* r4 = vcpu, r13 = paca */
			
 
				+	/* Check if we can deliver an external or decrementer interrupt now */
			
 
				+	ld	r0, VCPU_PENDING_EXC(r4)
			
 
				+BEGIN_FTR_SECTION
			
 
				+	/* On POWER9, also check for emulated doorbell interrupt */
			
 
				+	lbz	r3, VCPU_DBELL_REQ(r4)
			
 
				+	or	r0, r0, r3
			
 
				+END_FTR_SECTION_IFSET(CPU_FTR_ARCH_300)
			
 
				+	cmpdi	r0, 0
			
 
				+	beq	71f
			
 
				+	mr	r3, r4
			
 
				+	bl	kvmppc_guest_entry_inject_int
			
 
				+	ld	r4, HSTATE_KVM_VCPU(r13)
			
 
				+71:
			
 
				 	ld	r6, VCPU_SRR0(r4)
			
 
				 	ld	r7, VCPU_SRR1(r4)
			
 
				 	mtspr	SPRN_SRR0, r6
			
 
				 	mtspr	SPRN_SRR1, r7
			
 
				 
			
 
				+fast_guest_entry_c:
			
 
				+	ld	r10, VCPU_PC(r4)
			
 
				+	ld	r11, VCPU_MSR(r4)
			
 
				 	/* r11 = vcpu->arch.msr & ~MSR_HV */
			
 
				 	rldicl	r11, r11, 63 - MSR_HV_LG, 1
			
 
				 	rotldi	r11, r11, 1 + MSR_HV_LG
			
 
				 	ori	r11, r11, MSR_ME
			
 
				 
			
 
				-	/* Check if we can deliver an external or decrementer interrupt now */
			
 
				-	ld	r0, VCPU_PENDING_EXC(r4)
			
 
				-	rldicl	r0, r0, 64 - BOOK3S_IRQPRIO_EXTERNAL_LEVEL, 63
			
 
				-	cmpdi	cr1, r0, 0
			
 
				-	andi.	r8, r11, MSR_EE
			
 
				-	mfspr	r8, SPRN_LPCR
			
 
				-	/* Insert EXTERNAL_LEVEL bit into LPCR at the MER bit position */
			
 
				-	rldimi	r8, r0, LPCR_MER_SH, 63 - LPCR_MER_SH
			
 
				-	mtspr	SPRN_LPCR, r8
			
 
				-	isync
			
 
				-	beq	5f
			
 
				-	li	r0, BOOK3S_INTERRUPT_EXTERNAL
			
 
				-	bne	cr1, 12f
			
 
				-	mfspr	r0, SPRN_DEC
			
 
				-BEGIN_FTR_SECTION
			
 
				-	/* On POWER9 check whether the guest has large decrementer enabled */
			
 
				-	andis.	r8, r8, LPCR_LD@h
			
 
				-	bne	15f
			
 
				-END_FTR_SECTION_IFSET(CPU_FTR_ARCH_300)
			
 
				-	extsw	r0, r0
			
 
				-15:	cmpdi	r0, 0
			
 
				-	li	r0, BOOK3S_INTERRUPT_DECREMENTER
			
 
				-	bge	5f
			
 
				-
			
 
				-12:	mtspr	SPRN_SRR0, r10
			
 
				-	mr	r10,r0
			
 
				-	mtspr	SPRN_SRR1, r11
			
 
				-	mr	r9, r4
			
 
				-	bl	kvmppc_msr_interrupt
			
 
				-5:
			
 
				-BEGIN_FTR_SECTION
			
 
				-	b	fast_guest_return
			
 
				-END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_300)
			
 
				-	/* On POWER9, check for pending doorbell requests */
			
 
				-	lbz	r0, VCPU_DBELL_REQ(r4)
			
 
				-	cmpwi	r0, 0
			
 
				-	beq	fast_guest_return
			
 
				-	ld	r5, HSTATE_KVM_VCORE(r13)
			
 
				-	/* Set DPDES register so the CPU will take a doorbell interrupt */
			
 
				-	li	r0, 1
			
 
				-	mtspr	SPRN_DPDES, r0
			
 
				-	std	r0, VCORE_DPDES(r5)
			
 
				-	/* Make sure other cpus see vcore->dpdes set before dbell req clear */
			
 
				-	lwsync
			
 
				-	/* Clear the pending doorbell request */
			
 
				-	li	r0, 0
			
 
				-	stb	r0, VCPU_DBELL_REQ(r4)
			
 
				+	ld	r6, VCPU_CTR(r4)
			
 
				+	ld	r7, VCPU_XER(r4)
			
 
				+	mtctr	r6
			
 
				+	mtxer	r7
			
 
				 
			
 
				 /*
			
 
				  * Required state:
			
@@ -1202,7 +1092,7 @@ BEGIN_FTR_SECTION
 
				 END_FTR_SECTION_IFSET(CPU_FTR_HAS_PPR)
			
 
				 
			
 
				 	ld	r5, VCPU_LR(r4)
			
 
				-	lwz	r6, VCPU_CR(r4)
			
 
				+	ld	r6, VCPU_CR(r4)
			
 
				 	mtlr	r5
			
 
				 	mtcr	r6
			
 
				 
			
@@ -1234,6 +1124,83 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_300)
 
				 	HRFI_TO_GUEST
			
 
				 	b	.
			
 
				 
			
 
				+/*
			
 
				+ * Enter the guest on a P9 or later system where we have exactly
			
 
				+ * one vcpu per vcore and we don't need to go to real mode
			
 
				+ * (which implies that host and guest are both using radix MMU mode).
			
 
				+ * r3 = vcpu pointer
			
 
				+ * Most SPRs and all the VSRs have been loaded already.
			
 
				+ */
			
 
				+_GLOBAL(__kvmhv_vcpu_entry_p9)
			
 
				+EXPORT_SYMBOL_GPL(__kvmhv_vcpu_entry_p9)
			
 
				+	mflr	r0
			
 
				+	std	r0, PPC_LR_STKOFF(r1)
			
 
				+	stdu	r1, -SFS(r1)
			
 
				+
			
 
				+	li	r0, 1
			
 
				+	stw	r0, STACK_SLOT_SHORT_PATH(r1)
			
 
				+
			
 
				+	std	r3, HSTATE_KVM_VCPU(r13)
			
 
				+	mfcr	r4
			
 
				+	stw	r4, SFS+8(r1)
			
 
				+
			
 
				+	std	r1, HSTATE_HOST_R1(r13)
			
 
				+
			
 
				+	reg = 14
			
 
				+	.rept	18
			
 
				+	std	reg, STACK_SLOT_NVGPRS + ((reg - 14) * 8)(r1)
			
 
				+	reg = reg + 1
			
 
				+	.endr
			
 
				+
			
 
				+	reg = 14
			
 
				+	.rept	18
			
 
				+	ld	reg, __VCPU_GPR(reg)(r3)
			
 
				+	reg = reg + 1
			
 
				+	.endr
			
 
				+
			
 
				+	mfmsr	r10
			
 
				+	std	r10, HSTATE_HOST_MSR(r13)
			
 
				+
			
 
				+	mr	r4, r3
			
 
				+	b	fast_guest_entry_c
			
 
				+guest_exit_short_path:
			
 
				+
			
 
				+	li	r0, KVM_GUEST_MODE_NONE
			
 
				+	stb	r0, HSTATE_IN_GUEST(r13)
			
 
				+
			
 
				+	reg = 14
			
 
				+	.rept	18
			
 
				+	std	reg, __VCPU_GPR(reg)(r9)
			
 
				+	reg = reg + 1
			
 
				+	.endr
			
 
				+
			
 
				+	reg = 14
			
 
				+	.rept	18
			
 
				+	ld	reg, STACK_SLOT_NVGPRS + ((reg - 14) * 8)(r1)
			
 
				+	reg = reg + 1
			
 
				+	.endr
			
 
				+
			
 
				+	lwz	r4, SFS+8(r1)
			
 
				+	mtcr	r4
			
 
				+
			
 
				+	mr	r3, r12		/* trap number */
			
 
				+
			
 
				+	addi	r1, r1, SFS
			
 
				+	ld	r0, PPC_LR_STKOFF(r1)
			
 
				+	mtlr	r0
			
 
				+
			
 
				+	/* If we are in real mode, do a rfid to get back to the caller */
			
 
				+	mfmsr	r4
			
 
				+	andi.	r5, r4, MSR_IR
			
 
				+	bnelr
			
 
				+	rldicl	r5, r4, 64 - MSR_TS_S_LG, 62	/* extract TS field */
			
 
				+	mtspr	SPRN_SRR0, r0
			
 
				+	ld	r10, HSTATE_HOST_MSR(r13)
			
 
				+	rldimi	r10, r5, MSR_TS_S_LG, 63 - MSR_TS_T_LG
			
 
				+	mtspr	SPRN_SRR1, r10
			
 
				+	RFI_TO_KERNEL
			
 
				+	b	.
			
 
				+
			
 
				 secondary_too_late:
			
 
				 	li	r12, 0
			
 
				 	stw	r12, STACK_SLOT_TRAP(r1)
			
@@ -1313,7 +1280,7 @@ kvmppc_interrupt_hv:
 
				 	std	r3, VCPU_GPR(R12)(r9)
			
 
				 	/* CR is in the high half of r12 */
			
 
				 	srdi	r4, r12, 32
			
 
				-	stw	r4, VCPU_CR(r9)
			
 
				+	std	r4, VCPU_CR(r9)
			
 
				 BEGIN_FTR_SECTION
			
 
				 	ld	r3, HSTATE_CFAR(r13)
			
 
				 	std	r3, VCPU_CFAR(r9)
			
@@ -1387,18 +1354,26 @@ END_FTR_SECTION_IFSET(CPU_FTR_HAS_PPR)
 
				 	std	r3, VCPU_CTR(r9)
			
 
				 	std	r4, VCPU_XER(r9)
			
 
				 
			
 
				-#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
			
 
				-	/* For softpatch interrupt, go off and do TM instruction emulation */
			
 
				-	cmpwi	r12, BOOK3S_INTERRUPT_HV_SOFTPATCH
			
 
				-	beq	kvmppc_tm_emul
			
 
				-#endif
			
 
				+	/* Save more register state  */
			
 
				+	mfdar	r3
			
 
				+	mfdsisr	r4
			
 
				+	std	r3, VCPU_DAR(r9)
			
 
				+	stw	r4, VCPU_DSISR(r9)
			
 
				 
			
 
				 	/* If this is a page table miss then see if it's theirs or ours */
			
 
				 	cmpwi	r12, BOOK3S_INTERRUPT_H_DATA_STORAGE
			
 
				 	beq	kvmppc_hdsi
			
 
				+	std	r3, VCPU_FAULT_DAR(r9)
			
 
				+	stw	r4, VCPU_FAULT_DSISR(r9)
			
 
				 	cmpwi	r12, BOOK3S_INTERRUPT_H_INST_STORAGE
			
 
				 	beq	kvmppc_hisi
			
 
				 
			
 
				+#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
			
 
				+	/* For softpatch interrupt, go off and do TM instruction emulation */
			
 
				+	cmpwi	r12, BOOK3S_INTERRUPT_HV_SOFTPATCH
			
 
				+	beq	kvmppc_tm_emul
			
 
				+#endif
			
 
				+
			
 
				 	/* See if this is a leftover HDEC interrupt */
			
 
				 	cmpwi	r12,BOOK3S_INTERRUPT_HV_DECREMENTER
			
 
				 	bne	2f
			
@@ -1418,10 +1393,14 @@ END_FTR_SECTION_IFSET(CPU_FTR_HAS_PPR)
 
				 BEGIN_FTR_SECTION
			
 
				 	PPC_MSGSYNC
			
 
				 	lwsync
			
 
				+	/* always exit if we're running a nested guest */
			
 
				+	ld	r0, VCPU_NESTED(r9)
			
 
				+	cmpdi	r0, 0
			
 
				+	bne	guest_exit_cont
			
 
				 END_FTR_SECTION_IFSET(CPU_FTR_ARCH_300)
			
 
				 	lbz	r0, HSTATE_HOST_IPI(r13)
			
 
				 	cmpwi	r0, 0
			
 
				-	beq	4f
			
 
				+	beq	maybe_reenter_guest
			
 
				 	b	guest_exit_cont
			
 
				 3:
			
 
				 	/* If it's a hypervisor facility unavailable interrupt, save HFSCR */
			
@@ -1433,82 +1412,16 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_300)
 
				 14:
			
 
				 	/* External interrupt ? */
			
 
				 	cmpwi	r12, BOOK3S_INTERRUPT_EXTERNAL
			
 
				-	bne+	guest_exit_cont
			
 
				-
			
 
				-	/* External interrupt, first check for host_ipi. If this is
			
 
				-	 * set, we know the host wants us out so let's do it now
			
 
				-	 */
			
 
				-	bl	kvmppc_read_intr
			
 
				-
			
 
				-	/*
			
 
				-	 * Restore the active volatile registers after returning from
			
 
				-	 * a C function.
			
 
				-	 */
			
 
				-	ld	r9, HSTATE_KVM_VCPU(r13)
			
 
				-	li	r12, BOOK3S_INTERRUPT_EXTERNAL
			
 
				-
			
 
				-	/*
			
 
				-	 * kvmppc_read_intr return codes:
			
 
				-	 *
			
 
				-	 * Exit to host (r3 > 0)
			
 
				-	 *   1 An interrupt is pending that needs to be handled by the host
			
 
				-	 *     Exit guest and return to host by branching to guest_exit_cont
			
 
				-	 *
			
 
				-	 *   2 Passthrough that needs completion in the host
			
 
				-	 *     Exit guest and return to host by branching to guest_exit_cont
			
 
				-	 *     However, we also set r12 to BOOK3S_INTERRUPT_HV_RM_HARD
			
 
				-	 *     to indicate to the host to complete handling the interrupt
			
 
				-	 *
			
 
				-	 * Before returning to guest, we check if any CPU is heading out
			
 
				-	 * to the host and if so, we head out also. If no CPUs are heading
			
 
				-	 * check return values <= 0.
			
 
				-	 *
			
 
				-	 * Return to guest (r3 <= 0)
			
 
				-	 *  0 No external interrupt is pending
			
 
				-	 * -1 A guest wakeup IPI (which has now been cleared)
			
 
				-	 *    In either case, we return to guest to deliver any pending
			
 
				-	 *    guest interrupts.
			
 
				-	 *
			
 
				-	 * -2 A PCI passthrough external interrupt was handled
			
 
				-	 *    (interrupt was delivered directly to guest)
			
 
				-	 *    Return to guest to deliver any pending guest interrupts.
			
 
				-	 */
			
 
				-
			
 
				-	cmpdi	r3, 1
			
 
				-	ble	1f
			
 
				-
			
 
				-	/* Return code = 2 */
			
 
				-	li	r12, BOOK3S_INTERRUPT_HV_RM_HARD
			
 
				-	stw	r12, VCPU_TRAP(r9)
			
 
				-	b	guest_exit_cont
			
 
				-
			
 
				-1:	/* Return code <= 1 */
			
 
				-	cmpdi	r3, 0
			
 
				-	bgt	guest_exit_cont
			
 
				-
			
 
				-	/* Return code <= 0 */
			
 
				-4:	ld	r5, HSTATE_KVM_VCORE(r13)
			
 
				-	lwz	r0, VCORE_ENTRY_EXIT(r5)
			
 
				-	cmpwi	r0, 0x100
			
 
				-	mr	r4, r9
			
 
				-	blt	deliver_guest_interrupt
			
 
				-
			
 
				-guest_exit_cont:		/* r9 = vcpu, r12 = trap, r13 = paca */
			
 
				-	/* Save more register state  */
			
 
				-	mfdar	r6
			
 
				-	mfdsisr	r7
			
 
				-	std	r6, VCPU_DAR(r9)
			
 
				-	stw	r7, VCPU_DSISR(r9)
			
 
				-	/* don't overwrite fault_dar/fault_dsisr if HDSI */
			
 
				-	cmpwi	r12,BOOK3S_INTERRUPT_H_DATA_STORAGE
			
 
				-	beq	mc_cont
			
 
				-	std	r6, VCPU_FAULT_DAR(r9)
			
 
				-	stw	r7, VCPU_FAULT_DSISR(r9)
			
 
				-
			
 
				+	beq	kvmppc_guest_external
			
 
				 	/* See if it is a machine check */
			
 
				 	cmpwi	r12, BOOK3S_INTERRUPT_MACHINE_CHECK
			
 
				 	beq	machine_check_realmode
			
 
				-mc_cont:
			
 
				+	/* Or a hypervisor maintenance interrupt */
			
 
				+	cmpwi	r12, BOOK3S_INTERRUPT_HMI
			
 
				+	beq	hmi_realmode
			
 
				+
			
 
				+guest_exit_cont:		/* r9 = vcpu, r12 = trap, r13 = paca */
			
 
				+
			
 
				 #ifdef CONFIG_KVM_BOOK3S_HV_EXIT_TIMING
			
 
				 	addi	r3, r9, VCPU_TB_RMEXIT
			
 
				 	mr	r4, r9
			
@@ -1552,6 +1465,11 @@ mc_cont:
 
				 1:
			
 
				 #endif /* CONFIG_KVM_XICS */
			
 
				 
			
 
				+	/* If we came in through the P9 short path, go back out to C now */
			
 
				+	lwz	r0, STACK_SLOT_SHORT_PATH(r1)
			
 
				+	cmpwi	r0, 0
			
 
				+	bne	guest_exit_short_path
			
 
				+
			
 
				 	/* For hash guest, read the guest SLB and save it away */
			
 
				 	ld	r5, VCPU_KVM(r9)
			
 
				 	lbz	r0, KVM_RADIX(r5)
			
@@ -1780,11 +1698,13 @@ BEGIN_FTR_SECTION
 
				 	b	91f
			
 
				 END_FTR_SECTION(CPU_FTR_TM | CPU_FTR_P9_TM_HV_ASSIST, 0)
			
 
				 	/*
			
 
				-	 * NOTE THAT THIS TRASHES ALL NON-VOLATILE REGISTERS INCLUDING CR
			
 
				+	 * NOTE THAT THIS TRASHES ALL NON-VOLATILE REGISTERS (but not CR)
			
 
				 	 */
			
 
				 	mr      r3, r9
			
 
				 	ld      r4, VCPU_MSR(r3)
			
 
				+	li	r5, 0			/* don't preserve non-vol regs */
			
 
				 	bl	kvmppc_save_tm_hv
			
 
				+	nop
			
 
				 	ld	r9, HSTATE_KVM_VCPU(r13)
			
 
				 91:
			
 
				 #endif
			
@@ -1802,90 +1722,19 @@ END_FTR_SECTION(CPU_FTR_TM | CPU_FTR_P9_TM_HV_ASSIST, 0)
 
				 25:
			
 
				 	/* Save PMU registers if requested */
			
 
				 	/* r8 and cr0.eq are live here */
			
 
				+	mr	r3, r9
			
 
				+	li	r4, 1
			
 
				+	beq	21f			/* if no VPA, save PMU stuff anyway */
			
 
				+	lbz	r4, LPPACA_PMCINUSE(r8)
			
 
				+21:	bl	kvmhv_save_guest_pmu
			
 
				+	ld	r9, HSTATE_KVM_VCPU(r13)
			
 
				+
			
 
				+	/* Restore host values of some registers */
			
 
				 BEGIN_FTR_SECTION
			
 
				-	/*
			
 
				-	 * POWER8 seems to have a hardware bug where setting
			
 
				-	 * MMCR0[PMAE] along with MMCR0[PMC1CE] and/or MMCR0[PMCjCE]
			
 
				-	 * when some counters are already negative doesn't seem
			
 
				-	 * to cause a performance monitor alert (and hence interrupt).
			
 
				-	 * The effect of this is that when saving the PMU state,
			
 
				-	 * if there is no PMU alert pending when we read MMCR0
			
 
				-	 * before freezing the counters, but one becomes pending
			
 
				-	 * before we read the counters, we lose it.
			
 
				-	 * To work around this, we need a way to freeze the counters
			
 
				-	 * before reading MMCR0.  Normally, freezing the counters
			
 
				-	 * is done by writing MMCR0 (to set MMCR0[FC]) which
			
 
				-	 * unavoidably writes MMCR0[PMA0] as well.  On POWER8,
			
 
				-	 * we can also freeze the counters using MMCR2, by writing
			
 
				-	 * 1s to all the counter freeze condition bits (there are
			
 
				-	 * 9 bits each for 6 counters).
			
 
				-	 */
			
 
				-	li	r3, -1			/* set all freeze bits */
			
 
				-	clrrdi	r3, r3, 10
			
 
				-	mfspr	r10, SPRN_MMCR2
			
 
				-	mtspr	SPRN_MMCR2, r3
			
 
				-	isync
			
 
				-END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
			
 
				-	li	r3, 1
			
 
				-	sldi	r3, r3, 31		/* MMCR0_FC (freeze counters) bit */
			
 
				-	mfspr	r4, SPRN_MMCR0		/* save MMCR0 */
			
 
				-	mtspr	SPRN_MMCR0, r3		/* freeze all counters, disable ints */
			
 
				-	mfspr	r6, SPRN_MMCRA
			
 
				-	/* Clear MMCRA in order to disable SDAR updates */
			
 
				-	li	r7, 0
			
 
				-	mtspr	SPRN_MMCRA, r7
			
 
				-	isync
			
 
				-	beq	21f			/* if no VPA, save PMU stuff anyway */
			
 
				-	lbz	r7, LPPACA_PMCINUSE(r8)
			
 
				-	cmpwi	r7, 0			/* did they ask for PMU stuff to be saved? */
			
 
				-	bne	21f
			
 
				-	std	r3, VCPU_MMCR(r9)	/* if not, set saved MMCR0 to FC */
			
 
				-	b	22f
			
 
				-21:	mfspr	r5, SPRN_MMCR1
			
 
				-	mfspr	r7, SPRN_SIAR
			
 
				-	mfspr	r8, SPRN_SDAR
			
 
				-	std	r4, VCPU_MMCR(r9)
			
 
				-	std	r5, VCPU_MMCR + 8(r9)
			
 
				-	std	r6, VCPU_MMCR + 16(r9)
			
 
				-BEGIN_FTR_SECTION
			
 
				-	std	r10, VCPU_MMCR + 24(r9)
			
 
				-END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
			
 
				-	std	r7, VCPU_SIAR(r9)
			
 
				-	std	r8, VCPU_SDAR(r9)
			
 
				-	mfspr	r3, SPRN_PMC1
			
 
				-	mfspr	r4, SPRN_PMC2
			
 
				-	mfspr	r5, SPRN_PMC3
			
 
				-	mfspr	r6, SPRN_PMC4
			
 
				-	mfspr	r7, SPRN_PMC5
			
 
				-	mfspr	r8, SPRN_PMC6
			
 
				-	stw	r3, VCPU_PMC(r9)
			
 
				-	stw	r4, VCPU_PMC + 4(r9)
			
 
				-	stw	r5, VCPU_PMC + 8(r9)
			
 
				-	stw	r6, VCPU_PMC + 12(r9)
			
 
				-	stw	r7, VCPU_PMC + 16(r9)
			
 
				-	stw	r8, VCPU_PMC + 20(r9)
			
 
				-BEGIN_FTR_SECTION
			
 
				-	mfspr	r5, SPRN_SIER
			
 
				-	std	r5, VCPU_SIER(r9)
			
 
				-BEGIN_FTR_SECTION_NESTED(96)
			
 
				-	mfspr	r6, SPRN_SPMC1
			
 
				-	mfspr	r7, SPRN_SPMC2
			
 
				-	mfspr	r8, SPRN_MMCRS
			
 
				-	stw	r6, VCPU_PMC + 24(r9)
			
 
				-	stw	r7, VCPU_PMC + 28(r9)
			
 
				-	std	r8, VCPU_MMCR + 32(r9)
			
 
				-	lis	r4, 0x8000
			
 
				-	mtspr	SPRN_MMCRS, r4
			
 
				-END_FTR_SECTION_NESTED(CPU_FTR_ARCH_300, 0, 96)
			
 
				-END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
			
 
				-22:
			
 
				-
			
 
				-	/* Restore host values of some registers */
			
 
				-BEGIN_FTR_SECTION
			
 
				-	ld	r5, STACK_SLOT_CIABR(r1)
			
 
				-	ld	r6, STACK_SLOT_DAWR(r1)
			
 
				-	ld	r7, STACK_SLOT_DAWRX(r1)
			
 
				-	mtspr	SPRN_CIABR, r5
			
 
				+	ld	r5, STACK_SLOT_CIABR(r1)
			
 
				+	ld	r6, STACK_SLOT_DAWR(r1)
			
 
				+	ld	r7, STACK_SLOT_DAWRX(r1)
			
 
				+	mtspr	SPRN_CIABR, r5
			
 
				 	/*
			
 
				 	 * If the DAWR doesn't work, it's ok to write these here as
			
 
				 	 * this value should always be zero
			
@@ -2010,24 +1859,6 @@ BEGIN_FTR_SECTION
 
				 	mtspr	SPRN_DPDES, r8
			
 
				 END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
			
 
				 
			
 
				-	/* If HMI, call kvmppc_realmode_hmi_handler() */
			
 
				-	lwz	r12, STACK_SLOT_TRAP(r1)
			
 
				-	cmpwi	r12, BOOK3S_INTERRUPT_HMI
			
 
				-	bne	27f
			
 
				-	bl	kvmppc_realmode_hmi_handler
			
 
				-	nop
			
 
				-	cmpdi	r3, 0
			
 
				-	/*
			
 
				-	 * At this point kvmppc_realmode_hmi_handler may have resync-ed
			
 
				-	 * the TB, and if it has, we must not subtract the guest timebase
			
 
				-	 * offset from the timebase. So, skip it.
			
 
				-	 *
			
 
				-	 * Also, do not call kvmppc_subcore_exit_guest() because it has
			
 
				-	 * been invoked as part of kvmppc_realmode_hmi_handler().
			
 
				-	 */
			
 
				-	beq	30f
			
 
				-
			
 
				-27:
			
 
				 	/* Subtract timebase offset from timebase */
			
 
				 	ld	r8, VCORE_TB_OFFSET_APPL(r5)
			
 
				 	cmpdi	r8,0
			
@@ -2045,7 +1876,16 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
 
				 	addis	r8,r8,0x100		/* if so, increment upper 40 bits */
			
 
				 	mtspr	SPRN_TBU40,r8
			
 
				 
			
 
				-17:	bl	kvmppc_subcore_exit_guest
			
 
				+17:
			
 
				+	/*
			
 
				+	 * If this is an HMI, we called kvmppc_realmode_hmi_handler
			
 
				+	 * above, which may or may not have already called
			
 
				+	 * kvmppc_subcore_exit_guest.  Fortunately, all that
			
 
				+	 * kvmppc_subcore_exit_guest does is clear a flag, so calling
			
 
				+	 * it again here is benign even if kvmppc_realmode_hmi_handler
			
 
				+	 * has already called it.
			
 
				+	 */
			
 
				+	bl	kvmppc_subcore_exit_guest
			
 
				 	nop
			
 
				 30:	ld	r5,HSTATE_KVM_VCORE(r13)
			
 
				 	ld	r4,VCORE_KVM(r5)	/* pointer to struct kvm */
			
@@ -2099,6 +1939,67 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_300)
 
				 	mtlr	r0
			
 
				 	blr
			
 
				 
			
 
				+kvmppc_guest_external:
			
 
				+	/* External interrupt, first check for host_ipi. If this is
			
 
				+	 * set, we know the host wants us out so let's do it now
			
 
				+	 */
			
 
				+	bl	kvmppc_read_intr
			
 
				+
			
 
				+	/*
			
 
				+	 * Restore the active volatile registers after returning from
			
 
				+	 * a C function.
			
 
				+	 */
			
 
				+	ld	r9, HSTATE_KVM_VCPU(r13)
			
 
				+	li	r12, BOOK3S_INTERRUPT_EXTERNAL
			
 
				+
			
 
				+	/*
			
 
				+	 * kvmppc_read_intr return codes:
			
 
				+	 *
			
 
				+	 * Exit to host (r3 > 0)
			
 
				+	 *   1 An interrupt is pending that needs to be handled by the host
			
 
				+	 *     Exit guest and return to host by branching to guest_exit_cont
			
 
				+	 *
			
 
				+	 *   2 Passthrough that needs completion in the host
			
 
				+	 *     Exit guest and return to host by branching to guest_exit_cont
			
 
				+	 *     However, we also set r12 to BOOK3S_INTERRUPT_HV_RM_HARD
			
 
				+	 *     to indicate to the host to complete handling the interrupt
			
 
				+	 *
			
 
				+	 * Before returning to guest, we check if any CPU is heading out
			
 
				+	 * to the host and if so, we head out also. If no CPUs are heading
			
 
				+	 * out, check return values <= 0.
			
 
				+	 *
			
 
				+	 * Return to guest (r3 <= 0)
			
 
				+	 *  0 No external interrupt is pending
			
 
				+	 * -1 A guest wakeup IPI (which has now been cleared)
			
 
				+	 *    In either case, we return to guest to deliver any pending
			
 
				+	 *    guest interrupts.
			
 
				+	 *
			
 
				+	 * -2 A PCI passthrough external interrupt was handled
			
 
				+	 *    (interrupt was delivered directly to guest)
			
 
				+	 *    Return to guest to deliver any pending guest interrupts.
			
 
				+	 */
			
 
				+
			
 
				+	cmpdi	r3, 1
			
 
				+	ble	1f
			
 
				+
			
 
				+	/* Return code = 2 */
			
 
				+	li	r12, BOOK3S_INTERRUPT_HV_RM_HARD
			
 
				+	stw	r12, VCPU_TRAP(r9)
			
 
				+	b	guest_exit_cont
			
 
				+
			
 
				+1:	/* Return code <= 1 */
			
 
				+	cmpdi	r3, 0
			
 
				+	bgt	guest_exit_cont
			
 
				+
			
 
				+	/* Return code <= 0 */
			
 
				+maybe_reenter_guest:
			
 
				+	ld	r5, HSTATE_KVM_VCORE(r13)
			
 
				+	lwz	r0, VCORE_ENTRY_EXIT(r5)
			
 
				+	cmpwi	r0, 0x100
			
 
				+	mr	r4, r9
			
 
				+	blt	deliver_guest_interrupt
			
 
				+	b	guest_exit_cont
			
 
				+
			
 
				 #ifdef CONFIG_PPC_TRANSACTIONAL_MEM
			
 
				 /*
			
 
				  * Softpatch interrupt for transactional memory emulation cases
			
@@ -2302,6 +2203,10 @@ hcall_try_real_mode:
 
				 	andi.	r0,r11,MSR_PR
			
 
				 	/* sc 1 from userspace - reflect to guest syscall */
			
 
				 	bne	sc_1_fast_return
			
 
				+	/* sc 1 from nested guest - give it to L1 to handle */
			
 
				+	ld	r0, VCPU_NESTED(r9)
			
 
				+	cmpdi	r0, 0
			
 
				+	bne	guest_exit_cont
			
 
				 	clrrdi	r3,r3,2
			
 
				 	cmpldi	r3,hcall_real_table_end - hcall_real_table
			
 
				 	bge	guest_exit_cont
			
@@ -2561,6 +2466,7 @@ hcall_real_table:
 
				 hcall_real_table_end:
			
 
				 
			
 
				 _GLOBAL(kvmppc_h_set_xdabr)
			
 
				+EXPORT_SYMBOL_GPL(kvmppc_h_set_xdabr)
			
 
				 	andi.	r0, r5, DABRX_USER | DABRX_KERNEL
			
 
				 	beq	6f
			
 
				 	li	r0, DABRX_USER | DABRX_KERNEL | DABRX_BTI
			
@@ -2570,6 +2476,7 @@ _GLOBAL(kvmppc_h_set_xdabr)
 
				 	blr
			
 
				 
			
 
				 _GLOBAL(kvmppc_h_set_dabr)
			
 
				+EXPORT_SYMBOL_GPL(kvmppc_h_set_dabr)
			
 
				 	li	r5, DABRX_USER | DABRX_KERNEL
			
 
				 3:
			
 
				 BEGIN_FTR_SECTION
			
@@ -2682,11 +2589,13 @@ BEGIN_FTR_SECTION
 
				 	b	91f
			
 
				 END_FTR_SECTION(CPU_FTR_TM | CPU_FTR_P9_TM_HV_ASSIST, 0)
			
 
				 	/*
			
 
				-	 * NOTE THAT THIS TRASHES ALL NON-VOLATILE REGISTERS INCLUDING CR
			
 
				+	 * NOTE THAT THIS TRASHES ALL NON-VOLATILE REGISTERS (but not CR)
			
 
				 	 */
			
 
				 	ld	r3, HSTATE_KVM_VCPU(r13)
			
 
				 	ld      r4, VCPU_MSR(r3)
			
 
				+	li	r5, 0			/* don't preserve non-vol regs */
			
 
				 	bl	kvmppc_save_tm_hv
			
 
				+	nop
			
 
				 91:
			
 
				 #endif
			
 
				 
			
@@ -2802,11 +2711,13 @@ BEGIN_FTR_SECTION
 
				 	b	91f
			
 
				 END_FTR_SECTION(CPU_FTR_TM | CPU_FTR_P9_TM_HV_ASSIST, 0)
			
 
				 	/*
			
 
				-	 * NOTE THAT THIS TRASHES ALL NON-VOLATILE REGISTERS INCLUDING CR
			
 
				+	 * NOTE THAT THIS TRASHES ALL NON-VOLATILE REGISTERS (but not CR)
			
 
				 	 */
			
 
				 	mr      r3, r4
			
 
				 	ld      r4, VCPU_MSR(r3)
			
 
				+	li	r5, 0			/* don't preserve non-vol regs */
			
 
				 	bl	kvmppc_restore_tm_hv
			
 
				+	nop
			
 
				 	ld	r4, HSTATE_KVM_VCPU(r13)
			
 
				 91:
			
 
				 #endif
			
@@ -2874,13 +2785,7 @@ END_FTR_SECTION(CPU_FTR_TM | CPU_FTR_P9_TM_HV_ASSIST, 0)
 
				 	mr	r9, r4
			
 
				 	cmpdi	r3, 0
			
 
				 	bgt	guest_exit_cont
			
 
				-
			
 
				-	/* see if any other thread is already exiting */
			
 
				-	lwz	r0,VCORE_ENTRY_EXIT(r5)
			
 
				-	cmpwi	r0,0x100
			
 
				-	bge	guest_exit_cont
			
 
				-
			
 
				-	b	kvmppc_cede_reentry	/* if not go back to guest */
			
 
				+	b	maybe_reenter_guest
			
 
				 
			
 
				 	/* cede when already previously prodded case */
			
 
				 kvm_cede_prodded:
			
@@ -2947,12 +2852,12 @@ machine_check_realmode:
 
				 	 */
			
 
				 	ld	r11, VCPU_MSR(r9)
			
 
				 	rldicl.	r0, r11, 64-MSR_HV_LG, 63 /* check if it happened in HV mode */
			
 
				-	bne	mc_cont			/* if so, exit to host */
			
 
				+	bne	guest_exit_cont		/* if so, exit to host */
			
 
				 	/* Check if guest is capable of handling NMI exit */
			
 
				 	ld	r10, VCPU_KVM(r9)
			
 
				 	lbz	r10, KVM_FWNMI(r10)
			
 
				 	cmpdi	r10, 1			/* FWNMI capable? */
			
 
				-	beq	mc_cont			/* if so, exit with KVM_EXIT_NMI. */
			
 
				+	beq	guest_exit_cont		/* if so, exit with KVM_EXIT_NMI. */
			
 
				 
			
 
				 	/* if not, fall through for backward compatibility. */
			
 
				 	andi.	r10, r11, MSR_RI	/* check for unrecoverable exception */
			
@@ -2965,6 +2870,21 @@ machine_check_realmode:
 
				 	bl	kvmppc_msr_interrupt
			
 
				 2:	b	fast_interrupt_c_return
			
 
				 
			
 
				+/*
			
 
				+ * Call C code to handle a HMI in real mode.
			
 
				+ * Only the primary thread does the call; secondary threads are handled
			
 
				+ * by calling hmi_exception_realmode() after kvmppc_hv_entry returns.
			
 
				+ * r9 points to the vcpu on entry
			
 
				+ */
			
 
				+hmi_realmode:
			
 
				+	lbz	r0, HSTATE_PTID(r13)
			
 
				+	cmpwi	r0, 0
			
 
				+	bne	guest_exit_cont
			
 
				+	bl	kvmppc_realmode_hmi_handler
			
 
				+	ld	r9, HSTATE_KVM_VCPU(r13)
			
 
				+	li	r12, BOOK3S_INTERRUPT_HMI
			
 
				+	b	guest_exit_cont
			
 
				+
			
 
				 /*
			
 
				  * Check the reason we woke from nap, and take appropriate action.
			
 
				  * Returns (in r3):
			
@@ -3130,10 +3050,12 @@ END_FTR_SECTION_IFSET(CPU_FTR_ALTIVEC)
 
				  * Save transactional state and TM-related registers.
			
 
				  * Called with r3 pointing to the vcpu struct and r4 containing
			
 
				  * the guest MSR value.
			
 
				- * This can modify all checkpointed registers, but
			
 
				+ * r5 is non-zero iff non-volatile register state needs to be maintained.
			
 
				+ * If r5 == 0, this can modify all checkpointed registers, but
			
 
				  * restores r1 and r2 before exit.
			
 
				  */
			
 
				-kvmppc_save_tm_hv:
			
 
				+_GLOBAL_TOC(kvmppc_save_tm_hv)
			
 
				+EXPORT_SYMBOL_GPL(kvmppc_save_tm_hv)
			
 
				 	/* See if we need to handle fake suspend mode */
			
 
				 BEGIN_FTR_SECTION
			
 
				 	b	__kvmppc_save_tm
			
@@ -3161,12 +3083,6 @@ BEGIN_FTR_SECTION
 
				 END_FTR_SECTION_IFSET(CPU_FTR_P9_TM_XER_SO_BUG)
			
 
				 	nop
			
 
				 
			
 
				-	std	r1, HSTATE_HOST_R1(r13)
			
 
				-
			
 
				-	/* Clear the MSR RI since r1, r13 may be foobar. */
			
 
				-	li	r5, 0
			
 
				-	mtmsrd	r5, 1
			
 
				-
			
 
				 	/* We have to treclaim here because that's the only way to do S->N */
			
 
				 	li	r3, TM_CAUSE_KVM_RESCHED
			
 
				 	TRECLAIM(R3)
			
@@ -3175,22 +3091,13 @@ END_FTR_SECTION_IFSET(CPU_FTR_P9_TM_XER_SO_BUG)
 
				 	 * We were in fake suspend, so we are not going to save the
			
 
				 	 * register state as the guest checkpointed state (since
			
 
				 	 * we already have it), therefore we can now use any volatile GPR.
			
 
				+	 * In fact treclaim in fake suspend state doesn't modify
			
 
				+	 * any registers.
			
 
				 	 */
			
 
				-	/* Reload PACA pointer, stack pointer and TOC. */
			
 
				-	GET_PACA(r13)
			
 
				-	ld	r1, HSTATE_HOST_R1(r13)
			
 
				-	ld	r2, PACATOC(r13)
			
 
				 
			
 
				-	/* Set MSR RI now we have r1 and r13 back. */
			
 
				-	li	r5, MSR_RI
			
 
				-	mtmsrd	r5, 1
			
 
				-
			
 
				-	HMT_MEDIUM
			
 
				-	ld	r6, HSTATE_DSCR(r13)
			
 
				-	mtspr	SPRN_DSCR, r6
			
 
				-BEGIN_FTR_SECTION_NESTED(96)
			
 
				+BEGIN_FTR_SECTION
			
 
				 	bl	pnv_power9_force_smt4_release
			
 
				-END_FTR_SECTION_NESTED(CPU_FTR_P9_TM_XER_SO_BUG, CPU_FTR_P9_TM_XER_SO_BUG, 96)
			
 
				+END_FTR_SECTION_IFSET(CPU_FTR_P9_TM_XER_SO_BUG)
			
 
				 	nop
			
 
				 
			
 
				 4:
			
@@ -3216,10 +3123,12 @@ END_FTR_SECTION_NESTED(CPU_FTR_P9_TM_XER_SO_BUG, CPU_FTR_P9_TM_XER_SO_BUG, 96)
 
				  * Restore transactional state and TM-related registers.
			
 
				  * Called with r3 pointing to the vcpu struct
			
 
				  * and r4 containing the guest MSR value.
			
 
				+ * r5 is non-zero iff non-volatile register state needs to be maintained.
			
 
				  * This potentially modifies all checkpointed registers.
			
 
				  * It restores r1 and r2 from the PACA.
			
 
				  */
			
 
				-kvmppc_restore_tm_hv:
			
 
				+_GLOBAL_TOC(kvmppc_restore_tm_hv)
			
 
				+EXPORT_SYMBOL_GPL(kvmppc_restore_tm_hv)
			
 
				 	/*
			
 
				 	 * If we are doing TM emulation for the guest on a POWER9 DD2,
			
 
				 	 * then we don't actually do a trechkpt -- we either set up
			
@@ -3423,6 +3332,194 @@ kvmppc_msr_interrupt:
 
				 1:	rldimi	r11, r0, MSR_TS_S_LG, 63 - MSR_TS_T_LG
			
 
				 	blr
			
 
				 
			
 
				+/*
			
 
				+ * Load up guest PMU state.  R3 points to the vcpu struct.
			
 
				+ */
			
 
				+_GLOBAL(kvmhv_load_guest_pmu)
			
 
				+EXPORT_SYMBOL_GPL(kvmhv_load_guest_pmu)
			
 
				+	mr	r4, r3
			
 
				+	mflr	r0
			
 
				+	li	r3, 1
			
 
				+	sldi	r3, r3, 31		/* MMCR0_FC (freeze counters) bit */
			
 
				+	mtspr	SPRN_MMCR0, r3		/* freeze all counters, disable ints */
			
 
				+	isync
			
 
				+BEGIN_FTR_SECTION
			
 
				+	ld	r3, VCPU_MMCR(r4)
			
 
				+	andi.	r5, r3, MMCR0_PMAO_SYNC | MMCR0_PMAO
			
 
				+	cmpwi	r5, MMCR0_PMAO
			
 
				+	beql	kvmppc_fix_pmao
			
 
				+END_FTR_SECTION_IFSET(CPU_FTR_PMAO_BUG)
			
 
				+	lwz	r3, VCPU_PMC(r4)	/* always load up guest PMU registers */
			
 
				+	lwz	r5, VCPU_PMC + 4(r4)	/* to prevent information leak */
			
 
				+	lwz	r6, VCPU_PMC + 8(r4)
			
 
				+	lwz	r7, VCPU_PMC + 12(r4)
			
 
				+	lwz	r8, VCPU_PMC + 16(r4)
			
 
				+	lwz	r9, VCPU_PMC + 20(r4)
			
 
				+	mtspr	SPRN_PMC1, r3
			
 
				+	mtspr	SPRN_PMC2, r5
			
 
				+	mtspr	SPRN_PMC3, r6
			
 
				+	mtspr	SPRN_PMC4, r7
			
 
				+	mtspr	SPRN_PMC5, r8
			
 
				+	mtspr	SPRN_PMC6, r9
			
 
				+	ld	r3, VCPU_MMCR(r4)
			
 
				+	ld	r5, VCPU_MMCR + 8(r4)
			
 
				+	ld	r6, VCPU_MMCR + 16(r4)
			
 
				+	ld	r7, VCPU_SIAR(r4)
			
 
				+	ld	r8, VCPU_SDAR(r4)
			
 
				+	mtspr	SPRN_MMCR1, r5
			
 
				+	mtspr	SPRN_MMCRA, r6
			
 
				+	mtspr	SPRN_SIAR, r7
			
 
				+	mtspr	SPRN_SDAR, r8
			
 
				+BEGIN_FTR_SECTION
			
 
				+	ld	r5, VCPU_MMCR + 24(r4)
			
 
				+	ld	r6, VCPU_SIER(r4)
			
 
				+	mtspr	SPRN_MMCR2, r5
			
 
				+	mtspr	SPRN_SIER, r6
			
 
				+BEGIN_FTR_SECTION_NESTED(96)
			
 
				+	lwz	r7, VCPU_PMC + 24(r4)
			
 
				+	lwz	r8, VCPU_PMC + 28(r4)
			
 
				+	ld	r9, VCPU_MMCR + 32(r4)
			
 
				+	mtspr	SPRN_SPMC1, r7
			
 
				+	mtspr	SPRN_SPMC2, r8
			
 
				+	mtspr	SPRN_MMCRS, r9
			
 
				+END_FTR_SECTION_NESTED(CPU_FTR_ARCH_300, 0, 96)
			
 
				+END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
			
 
				+	mtspr	SPRN_MMCR0, r3
			
 
				+	isync
			
 
				+	mtlr	r0
			
 
				+	blr
			
 
				+
			
 
				+/*
			
 
				+ * Reload host PMU state saved in the PACA by kvmhv_save_host_pmu.
			
 
				+ */
			
 
				+_GLOBAL(kvmhv_load_host_pmu)
			
 
				+EXPORT_SYMBOL_GPL(kvmhv_load_host_pmu)
			
 
				+	mflr	r0
			
 
				+	lbz	r4, PACA_PMCINUSE(r13) /* is the host using the PMU? */
			
 
				+	cmpwi	r4, 0
			
 
				+	beq	23f			/* skip if not */
			
 
				+BEGIN_FTR_SECTION
			
 
				+	ld	r3, HSTATE_MMCR0(r13)
			
 
				+	andi.	r4, r3, MMCR0_PMAO_SYNC | MMCR0_PMAO
			
 
				+	cmpwi	r4, MMCR0_PMAO
			
 
				+	beql	kvmppc_fix_pmao
			
 
				+END_FTR_SECTION_IFSET(CPU_FTR_PMAO_BUG)
			
 
				+	lwz	r3, HSTATE_PMC1(r13)
			
 
				+	lwz	r4, HSTATE_PMC2(r13)
			
 
				+	lwz	r5, HSTATE_PMC3(r13)
			
 
				+	lwz	r6, HSTATE_PMC4(r13)
			
 
				+	lwz	r8, HSTATE_PMC5(r13)
			
 
				+	lwz	r9, HSTATE_PMC6(r13)
			
 
				+	mtspr	SPRN_PMC1, r3
			
 
				+	mtspr	SPRN_PMC2, r4
			
 
				+	mtspr	SPRN_PMC3, r5
			
 
				+	mtspr	SPRN_PMC4, r6
			
 
				+	mtspr	SPRN_PMC5, r8
			
 
				+	mtspr	SPRN_PMC6, r9
			
 
				+	ld	r3, HSTATE_MMCR0(r13)
			
 
				+	ld	r4, HSTATE_MMCR1(r13)
			
 
				+	ld	r5, HSTATE_MMCRA(r13)
			
 
				+	ld	r6, HSTATE_SIAR(r13)
			
 
				+	ld	r7, HSTATE_SDAR(r13)
			
 
				+	mtspr	SPRN_MMCR1, r4
			
 
				+	mtspr	SPRN_MMCRA, r5
			
 
				+	mtspr	SPRN_SIAR, r6
			
 
				+	mtspr	SPRN_SDAR, r7
			
 
				+BEGIN_FTR_SECTION
			
 
				+	ld	r8, HSTATE_MMCR2(r13)
			
 
				+	ld	r9, HSTATE_SIER(r13)
			
 
				+	mtspr	SPRN_MMCR2, r8
			
 
				+	mtspr	SPRN_SIER, r9
			
 
				+END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
			
 
				+	mtspr	SPRN_MMCR0, r3
			
 
				+	isync
			
 
				+	mtlr	r0
			
 
				+23:	blr
			
 
				+
			
 
				+/*
			
 
				+ * Save guest PMU state into the vcpu struct.
			
 
				+ * r3 = vcpu, r4 = full save flag (PMU in use flag set in VPA)
			
 
				+ */
			
 
				+_GLOBAL(kvmhv_save_guest_pmu)
			
 
				+EXPORT_SYMBOL_GPL(kvmhv_save_guest_pmu)
			
 
				+	mr	r9, r3
			
 
				+	mr	r8, r4
			
 
				+BEGIN_FTR_SECTION
			
 
				+	/*
			
 
				+	 * POWER8 seems to have a hardware bug where setting
			
 
				+	 * MMCR0[PMAE] along with MMCR0[PMC1CE] and/or MMCR0[PMCjCE]
			
 
				+	 * when some counters are already negative doesn't seem
			
 
				+	 * to cause a performance monitor alert (and hence interrupt).
			
 
				+	 * The effect of this is that when saving the PMU state,
			
 
				+	 * if there is no PMU alert pending when we read MMCR0
			
 
				+	 * before freezing the counters, but one becomes pending
			
 
				+	 * before we read the counters, we lose it.
			
 
				+	 * To work around this, we need a way to freeze the counters
			
 
				+	 * before reading MMCR0.  Normally, freezing the counters
			
 
				+	 * is done by writing MMCR0 (to set MMCR0[FC]) which
			
 
				+	 * unavoidably writes MMCR0[PMAO] as well.  On POWER8,
			
 
				+	 * we can also freeze the counters using MMCR2, by writing
			
 
				+	 * 1s to all the counter freeze condition bits (there are
			
 
				+	 * 9 bits each for 6 counters).
			
 
				+	 */
			
 
				+	li	r3, -1			/* set all freeze bits */
			
 
				+	clrrdi	r3, r3, 10
			
 
				+	mfspr	r10, SPRN_MMCR2
			
 
				+	mtspr	SPRN_MMCR2, r3
			
 
				+	isync
			
 
				+END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
			
 
				+	li	r3, 1
			
 
				+	sldi	r3, r3, 31		/* MMCR0_FC (freeze counters) bit */
			
 
				+	mfspr	r4, SPRN_MMCR0		/* save MMCR0 */
			
 
				+	mtspr	SPRN_MMCR0, r3		/* freeze all counters, disable ints */
			
 
				+	mfspr	r6, SPRN_MMCRA
			
 
				+	/* Clear MMCRA in order to disable SDAR updates */
			
 
				+	li	r7, 0
			
 
				+	mtspr	SPRN_MMCRA, r7
			
 
				+	isync
			
 
				+	cmpwi	r8, 0			/* did they ask for PMU stuff to be saved? */
			
 
				+	bne	21f
			
 
				+	std	r3, VCPU_MMCR(r9)	/* if not, set saved MMCR0 to FC */
			
 
				+	b	22f
			
 
				+21:	mfspr	r5, SPRN_MMCR1
			
 
				+	mfspr	r7, SPRN_SIAR
			
 
				+	mfspr	r8, SPRN_SDAR
			
 
				+	std	r4, VCPU_MMCR(r9)
			
 
				+	std	r5, VCPU_MMCR + 8(r9)
			
 
				+	std	r6, VCPU_MMCR + 16(r9)
			
 
				+BEGIN_FTR_SECTION
			
 
				+	std	r10, VCPU_MMCR + 24(r9)
			
 
				+END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
			
 
				+	std	r7, VCPU_SIAR(r9)
			
 
				+	std	r8, VCPU_SDAR(r9)
			
 
				+	mfspr	r3, SPRN_PMC1
			
 
				+	mfspr	r4, SPRN_PMC2
			
 
				+	mfspr	r5, SPRN_PMC3
			
 
				+	mfspr	r6, SPRN_PMC4
			
 
				+	mfspr	r7, SPRN_PMC5
			
 
				+	mfspr	r8, SPRN_PMC6
			
 
				+	stw	r3, VCPU_PMC(r9)
			
 
				+	stw	r4, VCPU_PMC + 4(r9)
			
 
				+	stw	r5, VCPU_PMC + 8(r9)
			
 
				+	stw	r6, VCPU_PMC + 12(r9)
			
 
				+	stw	r7, VCPU_PMC + 16(r9)
			
 
				+	stw	r8, VCPU_PMC + 20(r9)
			
 
				+BEGIN_FTR_SECTION
			
 
				+	mfspr	r5, SPRN_SIER
			
 
				+	std	r5, VCPU_SIER(r9)
			
 
				+BEGIN_FTR_SECTION_NESTED(96)
			
 
				+	mfspr	r6, SPRN_SPMC1
			
 
				+	mfspr	r7, SPRN_SPMC2
			
 
				+	mfspr	r8, SPRN_MMCRS
			
 
				+	stw	r6, VCPU_PMC + 24(r9)
			
 
				+	stw	r7, VCPU_PMC + 28(r9)
			
 
				+	std	r8, VCPU_MMCR + 32(r9)
			
 
				+	lis	r4, 0x8000
			
 
				+	mtspr	SPRN_MMCRS, r4
			
 
				+END_FTR_SECTION_NESTED(CPU_FTR_ARCH_300, 0, 96)
			
 
				+END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
			
 
				+22:	blr
			
 
				+
			
 
				 /*
			
 
				  * This works around a hardware bug on POWER8E processors, where
			
 
				  * writing a 1 to the MMCR0[PMAO] bit doesn't generate a
			
--- a/arch/powerpc/kvm/book3s_hv_tm.c
+++ b/arch/powerpc/kvm/book3s_hv_tm.c
@@ -130,7 +130,7 @@ int kvmhv_p9_tm_emulation(struct kvm_vcpu *vcpu)
 
				 			return RESUME_GUEST;
			
 
				 		}
			
 
				 		/* Set CR0 to indicate previous transactional state */
			
 
				-		vcpu->arch.cr = (vcpu->arch.cr & 0x0fffffff) |
			
 
				+		vcpu->arch.regs.ccr = (vcpu->arch.regs.ccr & 0x0fffffff) |
			
 
				 			(((msr & MSR_TS_MASK) >> MSR_TS_S_LG) << 28);
			
 
				 		/* L=1 => tresume, L=0 => tsuspend */
			
 
				 		if (instr & (1 << 21)) {
			
@@ -174,7 +174,7 @@ int kvmhv_p9_tm_emulation(struct kvm_vcpu *vcpu)
 
				 		copy_from_checkpoint(vcpu);
			
 
				 
			
 
				 		/* Set CR0 to indicate previous transactional state */
			
 
				-		vcpu->arch.cr = (vcpu->arch.cr & 0x0fffffff) |
			
 
				+		vcpu->arch.regs.ccr = (vcpu->arch.regs.ccr & 0x0fffffff) |
			
 
				 			(((msr & MSR_TS_MASK) >> MSR_TS_S_LG) << 28);
			
 
				 		vcpu->arch.shregs.msr &= ~MSR_TS_MASK;
			
 
				 		return RESUME_GUEST;
			
@@ -204,7 +204,7 @@ int kvmhv_p9_tm_emulation(struct kvm_vcpu *vcpu)
 
				 		copy_to_checkpoint(vcpu);
			
 
				 
			
 
				 		/* Set CR0 to indicate previous transactional state */
			
 
				-		vcpu->arch.cr = (vcpu->arch.cr & 0x0fffffff) |
			
 
				+		vcpu->arch.regs.ccr = (vcpu->arch.regs.ccr & 0x0fffffff) |
			
 
				 			(((msr & MSR_TS_MASK) >> MSR_TS_S_LG) << 28);
			
 
				 		vcpu->arch.shregs.msr = msr | MSR_TS_S;
			
 
				 		return RESUME_GUEST;
			
--- a/arch/powerpc/kvm/book3s_hv_tm_builtin.c
+++ b/arch/powerpc/kvm/book3s_hv_tm_builtin.c
@@ -89,7 +89,8 @@ int kvmhv_p9_tm_emulation_early(struct kvm_vcpu *vcpu)
 
				 		if (instr & (1 << 21))
			
 
				 			vcpu->arch.shregs.msr = (msr & ~MSR_TS_MASK) | MSR_TS_T;
			
 
				 		/* Set CR0 to 0b0010 */
			
 
				-		vcpu->arch.cr = (vcpu->arch.cr & 0x0fffffff) | 0x20000000;
			
 
				+		vcpu->arch.regs.ccr = (vcpu->arch.regs.ccr & 0x0fffffff) |
			
 
				+			0x20000000;
			
 
				 		return 1;
			
 
				 	}
			
 
				 
			
@@ -105,5 +106,5 @@ void kvmhv_emulate_tm_rollback(struct kvm_vcpu *vcpu)
 
				 	vcpu->arch.shregs.msr &= ~MSR_TS_MASK;	/* go to N state */
			
 
				 	vcpu->arch.regs.nip = vcpu->arch.tfhar;
			
 
				 	copy_from_checkpoint(vcpu);
			
 
				-	vcpu->arch.cr = (vcpu->arch.cr & 0x0fffffff) | 0xa0000000;
			
 
				+	vcpu->arch.regs.ccr = (vcpu->arch.regs.ccr & 0x0fffffff) | 0xa0000000;
			
 
				 }
			
--- a/arch/powerpc/kvm/book3s_pr.c
+++ b/arch/powerpc/kvm/book3s_pr.c
@@ -167,7 +167,7 @@ void kvmppc_copy_to_svcpu(struct kvm_vcpu *vcpu)
 
				 	svcpu->gpr[11] = vcpu->arch.regs.gpr[11];
			
 
				 	svcpu->gpr[12] = vcpu->arch.regs.gpr[12];
			
 
				 	svcpu->gpr[13] = vcpu->arch.regs.gpr[13];
			
 
				-	svcpu->cr  = vcpu->arch.cr;
			
 
				+	svcpu->cr  = vcpu->arch.regs.ccr;
			
 
				 	svcpu->xer = vcpu->arch.regs.xer;
			
 
				 	svcpu->ctr = vcpu->arch.regs.ctr;
			
 
				 	svcpu->lr  = vcpu->arch.regs.link;
			
@@ -249,7 +249,7 @@ void kvmppc_copy_from_svcpu(struct kvm_vcpu *vcpu)
 
				 	vcpu->arch.regs.gpr[11] = svcpu->gpr[11];
			
 
				 	vcpu->arch.regs.gpr[12] = svcpu->gpr[12];
			
 
				 	vcpu->arch.regs.gpr[13] = svcpu->gpr[13];
			
 
				-	vcpu->arch.cr  = svcpu->cr;
			
 
				+	vcpu->arch.regs.ccr  = svcpu->cr;
			
 
				 	vcpu->arch.regs.xer = svcpu->xer;
			
 
				 	vcpu->arch.regs.ctr = svcpu->ctr;
			
 
				 	vcpu->arch.regs.link  = svcpu->lr;
			
@@ -1246,7 +1246,6 @@ int kvmppc_handle_exit_pr(struct kvm_run *run, struct kvm_vcpu *vcpu,
 
				 		r = RESUME_GUEST;
			
 
				 		break;
			
 
				 	case BOOK3S_INTERRUPT_EXTERNAL:
			
 
				-	case BOOK3S_INTERRUPT_EXTERNAL_LEVEL:
			
 
				 	case BOOK3S_INTERRUPT_EXTERNAL_HV:
			
 
				 	case BOOK3S_INTERRUPT_H_VIRT:
			
 
				 		vcpu->stat.ext_intr_exits++;
			
--- a/arch/powerpc/kvm/book3s_xics.c
+++ b/arch/powerpc/kvm/book3s_xics.c
@@ -310,7 +310,7 @@ static inline bool icp_try_update(struct kvmppc_icp *icp,
 
				 	 */
			
 
				 	if (new.out_ee) {
			
 
				 		kvmppc_book3s_queue_irqprio(icp->vcpu,
			
 
				-					    BOOK3S_INTERRUPT_EXTERNAL_LEVEL);
			
 
				+					    BOOK3S_INTERRUPT_EXTERNAL);
			
 
				 		if (!change_self)
			
 
				 			kvmppc_fast_vcpu_kick(icp->vcpu);
			
 
				 	}
			
@@ -593,8 +593,7 @@ static noinline unsigned long kvmppc_h_xirr(struct kvm_vcpu *vcpu)
 
				 	u32 xirr;
			
 
				 
			
 
				 	/* First, remove EE from the processor */
			
 
				-	kvmppc_book3s_dequeue_irqprio(icp->vcpu,
			
 
				-				      BOOK3S_INTERRUPT_EXTERNAL_LEVEL);
			
 
				+	kvmppc_book3s_dequeue_irqprio(icp->vcpu, BOOK3S_INTERRUPT_EXTERNAL);
			
 
				 
			
 
				 	/*
			
 
				 	 * ICP State: Accept_Interrupt
			
@@ -754,8 +753,7 @@ static noinline void kvmppc_h_cppr(struct kvm_vcpu *vcpu, unsigned long cppr)
 
				 	 * We can remove EE from the current processor, the update
			
 
				 	 * transaction will set it again if needed
			
 
				 	 */
			
 
				-	kvmppc_book3s_dequeue_irqprio(icp->vcpu,
			
 
				-				      BOOK3S_INTERRUPT_EXTERNAL_LEVEL);
			
 
				+	kvmppc_book3s_dequeue_irqprio(icp->vcpu, BOOK3S_INTERRUPT_EXTERNAL);
			
 
				 
			
 
				 	do {
			
 
				 		old_state = new_state = READ_ONCE(icp->state);
			
@@ -1167,8 +1165,7 @@ int kvmppc_xics_set_icp(struct kvm_vcpu *vcpu, u64 icpval)
 
				 	 * Deassert the CPU interrupt request.
			
 
				 	 * icp_try_update will reassert it if necessary.
			
 
				 	 */
			
 
				-	kvmppc_book3s_dequeue_irqprio(icp->vcpu,
			
 
				-				      BOOK3S_INTERRUPT_EXTERNAL_LEVEL);
			
 
				+	kvmppc_book3s_dequeue_irqprio(icp->vcpu, BOOK3S_INTERRUPT_EXTERNAL);
			
 
				 
			
 
				 	/*
			
 
				 	 * Note that if we displace an interrupt from old_state.xisr,
			
@@ -1393,7 +1390,8 @@ static int kvmppc_xics_create(struct kvm_device *dev, u32 type)
 
				 	}
			
 
				 
			
 
				 #ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
			
 
				-	if (cpu_has_feature(CPU_FTR_ARCH_206)) {
			
 
				+	if (cpu_has_feature(CPU_FTR_ARCH_206) &&
			
 
				+	    cpu_has_feature(CPU_FTR_HVMODE)) {
			
 
				 		/* Enable real mode support */
			
 
				 		xics->real_mode = ENABLE_REALMODE;
			
 
				 		xics->real_mode_dbg = DEBUG_REALMODE;
			
--- a/arch/powerpc/kvm/book3s_xive.c
+++ b/arch/powerpc/kvm/book3s_xive.c
@@ -61,6 +61,69 @@
 
				  */
			
 
				 #define XIVE_Q_GAP	2
			
 
				 
			
 
				+/*
			
 
				+ * Push a vcpu's context to the XIVE on guest entry.
			
 
				+ * This assumes we are in virtual mode (MMU on)
			
 
				+ */
			
 
				+void kvmppc_xive_push_vcpu(struct kvm_vcpu *vcpu)
			
 
				+{
			
 
				+	void __iomem *tima = local_paca->kvm_hstate.xive_tima_virt;
			
 
				+	u64 pq;
			
 
				+
			
 
				+	if (!tima)
			
 
				+		return;
			
 
				+	eieio();
			
 
				+	__raw_writeq(vcpu->arch.xive_saved_state.w01, tima + TM_QW1_OS);
			
 
				+	__raw_writel(vcpu->arch.xive_cam_word, tima + TM_QW1_OS + TM_WORD2);
			
 
				+	vcpu->arch.xive_pushed = 1;
			
 
				+	eieio();
			
 
				+
			
 
				+	/*
			
 
				+	 * We clear the irq_pending flag. There is a small chance of a
			
 
				+	 * race vs. the escalation interrupt happening on another
			
 
				+	 * processor setting it again, but the only consequence is to
			
 
				+	 * cause a spurious wakeup on the next H_CEDE, which is not an
			
 
				+	 * issue.
			
 
				+	 */
			
 
				+	vcpu->arch.irq_pending = 0;
			
 
				+
			
 
				+	/*
			
 
				+	 * In single escalation mode, if the escalation interrupt is
			
 
				+	 * on, we mask it.
			
 
				+	 */
			
 
				+	if (vcpu->arch.xive_esc_on) {
			
 
				+		pq = __raw_readq((void __iomem *)(vcpu->arch.xive_esc_vaddr +
			
 
				+						  XIVE_ESB_SET_PQ_01));
			
 
				+		mb();
			
 
				+
			
 
				+		/*
			
 
				+		 * We have a possible subtle race here: The escalation
			
 
				+		 * interrupt might have fired and be on its way to the
			
 
				+		 * host queue while we mask it, and if we unmask it
			
 
				+		 * early enough (re-cede right away), there is a
			
 
				+		 * theoretical possibility that it fires again, thus
			
 
				+		 * landing in the target queue more than once which is
			
 
				+		 * a big no-no.
			
 
				+		 *
			
 
				+		 * Fortunately, solving this is rather easy. If the
			
 
				+		 * above load setting PQ to 01 returns a previous
			
 
				+		 * value where P is set, then we know the escalation
			
 
				+		 * interrupt is somewhere on its way to the host. In
			
 
				+		 * that case we simply don't clear the xive_esc_on
			
 
				+		 * flag below. It will be eventually cleared by the
			
 
				+		 * handler for the escalation interrupt.
			
 
				+		 *
			
 
				+		 * Then, when doing a cede, we check that flag again
			
 
				+		 * before re-enabling the escalation interrupt, and if
			
 
				+		 * set, we abort the cede.
			
 
				+		 */
			
 
				+		if (!(pq & XIVE_ESB_VAL_P))
			
 
				+			/* Now P is 0, we can clear the flag */
			
 
				+			vcpu->arch.xive_esc_on = 0;
			
 
				+	}
			
 
				+}
			
 
				+EXPORT_SYMBOL_GPL(kvmppc_xive_push_vcpu);
			
 
				+
			
 
				 /*
			
 
				  * This is a simple trigger for a generic XIVE IRQ. This must
			
 
				  * only be called for interrupts that support a trigger page
			
--- a/arch/powerpc/kvm/book3s_xive_template.c
+++ b/arch/powerpc/kvm/book3s_xive_template.c
@@ -280,14 +280,6 @@ X_STATIC unsigned long GLUE(X_PFX,h_xirr)(struct kvm_vcpu *vcpu)
 
				 	/* First collect pending bits from HW */
			
 
				 	GLUE(X_PFX,ack_pending)(xc);
			
 
				 
			
 
				-	/*
			
 
				-	 * Cleanup the old-style bits if needed (they may have been
			
 
				-	 * set by pull or an escalation interrupts).
			
 
				-	 */
			
 
				-	if (test_bit(BOOK3S_IRQPRIO_EXTERNAL, &vcpu->arch.pending_exceptions))
			
 
				-		clear_bit(BOOK3S_IRQPRIO_EXTERNAL_LEVEL,
			
 
				-			  &vcpu->arch.pending_exceptions);
			
 
				-
			
 
				 	pr_devel(" new pending=0x%02x hw_cppr=%d cppr=%d\n",
			
 
				 		 xc->pending, xc->hw_cppr, xc->cppr);
			
 
				 
			
--- a/arch/powerpc/kvm/bookehv_interrupts.S
+++ b/arch/powerpc/kvm/bookehv_interrupts.S
@@ -182,7 +182,7 @@
 
				 	 */
			
 
				 	PPC_LL	r4, PACACURRENT(r13)
			
 
				 	PPC_LL	r4, (THREAD + THREAD_KVM_VCPU)(r4)
			
 
				-	stw	r10, VCPU_CR(r4)
			
 
				+	PPC_STL	r10, VCPU_CR(r4)
			
 
				 	PPC_STL r11, VCPU_GPR(R4)(r4)
			
 
				 	PPC_STL	r5, VCPU_GPR(R5)(r4)
			
 
				 	PPC_STL	r6, VCPU_GPR(R6)(r4)
			
@@ -292,7 +292,7 @@ _GLOBAL(kvmppc_handler_\intno\()_\srr1)
 
				 	PPC_STL	r4, VCPU_GPR(R4)(r11)
			
 
				 	PPC_LL	r4, THREAD_NORMSAVE(0)(r10)
			
 
				 	PPC_STL	r5, VCPU_GPR(R5)(r11)
			
 
				-	stw	r13, VCPU_CR(r11)
			
 
				+	PPC_STL	r13, VCPU_CR(r11)
			
 
				 	mfspr	r5, \srr0
			
 
				 	PPC_STL	r3, VCPU_GPR(R10)(r11)
			
 
				 	PPC_LL	r3, THREAD_NORMSAVE(2)(r10)
			
@@ -319,7 +319,7 @@ _GLOBAL(kvmppc_handler_\intno\()_\srr1)
 
				 	PPC_STL	r4, VCPU_GPR(R4)(r11)
			
 
				 	PPC_LL	r4, GPR9(r8)
			
 
				 	PPC_STL	r5, VCPU_GPR(R5)(r11)
			
 
				-	stw	r9, VCPU_CR(r11)
			
 
				+	PPC_STL	r9, VCPU_CR(r11)
			
 
				 	mfspr	r5, \srr0
			
 
				 	PPC_STL	r3, VCPU_GPR(R8)(r11)
			
 
				 	PPC_LL	r3, GPR10(r8)
			
@@ -643,7 +643,7 @@ lightweight_exit:
 
				 	PPC_LL	r3, VCPU_LR(r4)
			
 
				 	PPC_LL	r5, VCPU_XER(r4)
			
 
				 	PPC_LL	r6, VCPU_CTR(r4)
			
 
				-	lwz	r7, VCPU_CR(r4)
			
 
				+	PPC_LL	r7, VCPU_CR(r4)
			
 
				 	PPC_LL	r8, VCPU_PC(r4)
			
 
				 	PPC_LD(r9, VCPU_SHARED_MSR, r11)
			
 
				 	PPC_LL	r0, VCPU_GPR(R0)(r4)
			
--- a/arch/powerpc/kvm/emulate_loadstore.c
+++ b/arch/powerpc/kvm/emulate_loadstore.c
@@ -117,7 +117,6 @@ int kvmppc_emulate_loadstore(struct kvm_vcpu *vcpu)
 
				 
			
 
				 	emulated = EMULATE_FAIL;
			
 
				 	vcpu->arch.regs.msr = vcpu->arch.shared->msr;
			
 
				-	vcpu->arch.regs.ccr = vcpu->arch.cr;
			
 
				 	if (analyse_instr(&op, &vcpu->arch.regs, inst) == 0) {
			
 
				 		int type = op.type & INSTR_TYPE_MASK;
			
 
				 		int size = GETSIZE(op.type);
			
--- a/arch/powerpc/kvm/powerpc.c
+++ b/arch/powerpc/kvm/powerpc.c
@@ -594,7 +594,8 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
 
				 		r = !!(hv_enabled && radix_enabled());
			
 
				 		break;
			
 
				 	case KVM_CAP_PPC_MMU_HASH_V3:
			
 
				-		r = !!(hv_enabled && cpu_has_feature(CPU_FTR_ARCH_300));
			
 
				+		r = !!(hv_enabled && cpu_has_feature(CPU_FTR_ARCH_300) &&
			
 
				+		       cpu_has_feature(CPU_FTR_HVMODE));
			
 
				 		break;
			
 
				 #endif
			
 
				 	case KVM_CAP_SYNC_MMU:
			
--- a/arch/powerpc/kvm/tm.S
+++ b/arch/powerpc/kvm/tm.S
@@ -28,17 +28,25 @@
 
				  * Save transactional state and TM-related registers.
			
 
				  * Called with:
			
 
				  * - r3 pointing to the vcpu struct
			
 
				- * - r4 points to the MSR with current TS bits:
			
 
				+ * - r4 containing the MSR with current TS bits:
			
 
				  * 	(For HV KVM, it is VCPU_MSR ; For PR KVM, it is host MSR).
			
 
				- * This can modify all checkpointed registers, but
			
 
				- * restores r1, r2 before exit.
			
 
				+ * - r5 containing a flag indicating that non-volatile registers
			
 
				+ *	must be preserved.
			
 
				+ * If r5 == 0, this can modify all checkpointed registers, but
			
 
				+ * restores r1, r2 before exit.  If r5 != 0, this restores the
			
 
				+ * MSR TM/FP/VEC/VSX bits to their state on entry.
			
 
				  */
			
 
				 _GLOBAL(__kvmppc_save_tm)
			
 
				 	mflr	r0
			
 
				 	std	r0, PPC_LR_STKOFF(r1)
			
 
				+	stdu    r1, -SWITCH_FRAME_SIZE(r1)
			
 
				+
			
 
				+	mr	r9, r3
			
 
				+	cmpdi	cr7, r5, 0
			
 
				 
			
 
				 	/* Turn on TM. */
			
 
				 	mfmsr	r8
			
 
				+	mr	r10, r8
			
 
				 	li	r0, 1
			
 
				 	rldimi	r8, r0, MSR_TM_LG, 63-MSR_TM_LG
			
 
				 	ori     r8, r8, MSR_FP
			
@@ -51,6 +59,27 @@ _GLOBAL(__kvmppc_save_tm)
 
				 	std	r1, HSTATE_SCRATCH2(r13)
			
 
				 	std	r3, HSTATE_SCRATCH1(r13)
			
 
				 
			
 
				+	/* Save CR on the stack - even if r5 == 0 we need to get cr7 back. */
			
 
				+	mfcr	r6
			
 
				+	SAVE_GPR(6, r1)
			
 
				+
			
 
				+	/* Save DSCR so we can restore it to avoid running with user value */
			
 
				+	mfspr	r7, SPRN_DSCR
			
 
				+	SAVE_GPR(7, r1)
			
 
				+
			
 
				+	/*
			
 
				+	 * We are going to do treclaim., which will modify all checkpointed
			
 
				+	 * registers.  Save the non-volatile registers on the stack if
			
 
				+	 * preservation of non-volatile state has been requested.
			
 
				+	 */
			
 
				+	beq	cr7, 3f
			
 
				+	SAVE_NVGPRS(r1)
			
 
				+
			
 
				+	/* MSR[TS] will be 0 (non-transactional) once we do treclaim. */
			
 
				+	li	r0, 0
			
 
				+	rldimi	r10, r0, MSR_TS_S_LG, 63 - MSR_TS_T_LG
			
 
				+	SAVE_GPR(10, r1)	/* final MSR value */
			
 
				+3:
			
 
				 #ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
			
 
				 BEGIN_FTR_SECTION
			
 
				 	/* Emulation of the treclaim instruction needs TEXASR before treclaim */
			
@@ -74,22 +103,25 @@ END_FTR_SECTION_IFSET(CPU_FTR_P9_TM_HV_ASSIST)
 
				 	std	r9, PACATMSCRATCH(r13)
			
 
				 	ld	r9, HSTATE_SCRATCH1(r13)
			
 
				 
			
 
				-	/* Get a few more GPRs free. */
			
 
				-	std	r29, VCPU_GPRS_TM(29)(r9)
			
 
				-	std	r30, VCPU_GPRS_TM(30)(r9)
			
 
				-	std	r31, VCPU_GPRS_TM(31)(r9)
			
 
				-
			
 
				-	/* Save away PPR and DSCR soon so don't run with user values. */
			
 
				-	mfspr	r31, SPRN_PPR
			
 
				+	/* Save away PPR soon so we don't run with user value. */
			
 
				+	std	r0, VCPU_GPRS_TM(0)(r9)
			
 
				+	mfspr	r0, SPRN_PPR
			
 
				 	HMT_MEDIUM
			
 
				-	mfspr	r30, SPRN_DSCR
			
 
				-#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
			
 
				-	ld	r29, HSTATE_DSCR(r13)
			
 
				-	mtspr	SPRN_DSCR, r29
			
 
				-#endif
			
 
				 
			
 
				-	/* Save all but r9, r13 & r29-r31 */
			
 
				-	reg = 0
			
 
				+	/* Reload stack pointer. */
			
 
				+	std	r1, VCPU_GPRS_TM(1)(r9)
			
 
				+	ld	r1, HSTATE_SCRATCH2(r13)
			
 
				+
			
 
				+	/* Set MSR RI now we have r1 and r13 back. */
			
 
				+	std	r2, VCPU_GPRS_TM(2)(r9)
			
 
				+	li	r2, MSR_RI
			
 
				+	mtmsrd	r2, 1
			
 
				+
			
 
				+	/* Reload TOC pointer. */
			
 
				+	ld	r2, PACATOC(r13)
			
 
				+
			
 
				+	/* Save all but r0-r2, r9 & r13 */
			
 
				+	reg = 3
			
 
				 	.rept	29
			
 
				 	.if (reg != 9) && (reg != 13)
			
 
				 	std	reg, VCPU_GPRS_TM(reg)(r9)
			
@@ -103,33 +135,29 @@ END_FTR_SECTION_IFSET(CPU_FTR_P9_TM_HV_ASSIST)
 
				 	ld	r4, PACATMSCRATCH(r13)
			
 
				 	std	r4, VCPU_GPRS_TM(9)(r9)
			
 
				 
			
 
				-	/* Reload stack pointer and TOC. */
			
 
				-	ld	r1, HSTATE_SCRATCH2(r13)
			
 
				-	ld	r2, PACATOC(r13)
			
 
				-
			
 
				-	/* Set MSR RI now we have r1 and r13 back. */
			
 
				-	li	r5, MSR_RI
			
 
				-	mtmsrd	r5, 1
			
 
				+	/* Restore host DSCR and CR values, after saving guest values */
			
 
				+	mfcr	r6
			
 
				+	mfspr	r7, SPRN_DSCR
			
 
				+	stw	r6, VCPU_CR_TM(r9)
			
 
				+	std	r7, VCPU_DSCR_TM(r9)
			
 
				+	REST_GPR(6, r1)
			
 
				+	REST_GPR(7, r1)
			
 
				+	mtcr	r6
			
 
				+	mtspr	SPRN_DSCR, r7
			
 
				 
			
 
				-	/* Save away checkpinted SPRs. */
			
 
				-	std	r31, VCPU_PPR_TM(r9)
			
 
				-	std	r30, VCPU_DSCR_TM(r9)
			
 
				+	/* Save away checkpointed SPRs. */
			
 
				+	std	r0, VCPU_PPR_TM(r9)
			
 
				 	mflr	r5
			
 
				-	mfcr	r6
			
 
				 	mfctr	r7
			
 
				 	mfspr	r8, SPRN_AMR
			
 
				 	mfspr	r10, SPRN_TAR
			
 
				 	mfxer	r11
			
 
				 	std	r5, VCPU_LR_TM(r9)
			
 
				-	stw	r6, VCPU_CR_TM(r9)
			
 
				 	std	r7, VCPU_CTR_TM(r9)
			
 
				 	std	r8, VCPU_AMR_TM(r9)
			
 
				 	std	r10, VCPU_TAR_TM(r9)
			
 
				 	std	r11, VCPU_XER_TM(r9)
			
 
				 
			
 
				-	/* Restore r12 as trap number. */
			
 
				-	lwz	r12, VCPU_TRAP(r9)
			
 
				-
			
 
				 	/* Save FP/VSX. */
			
 
				 	addi	r3, r9, VCPU_FPRS_TM
			
 
				 	bl	store_fp_state
			
@@ -137,6 +165,11 @@ END_FTR_SECTION_IFSET(CPU_FTR_P9_TM_HV_ASSIST)
 
				 	bl	store_vr_state
			
 
				 	mfspr	r6, SPRN_VRSAVE
			
 
				 	stw	r6, VCPU_VRSAVE_TM(r9)
			
 
				+
			
 
				+	/* Restore non-volatile registers if requested to */
			
 
				+	beq	cr7, 1f
			
 
				+	REST_NVGPRS(r1)
			
 
				+	REST_GPR(10, r1)
			
 
				 1:
			
 
				 	/*
			
 
				 	 * We need to save these SPRs after the treclaim so that the software
			
@@ -146,12 +179,16 @@ END_FTR_SECTION_IFSET(CPU_FTR_P9_TM_HV_ASSIST)
 
				 	 */
			
 
				 	mfspr	r7, SPRN_TEXASR
			
 
				 	std	r7, VCPU_TEXASR(r9)
			
 
				-11:
			
 
				 	mfspr	r5, SPRN_TFHAR
			
 
				 	mfspr	r6, SPRN_TFIAR
			
 
				 	std	r5, VCPU_TFHAR(r9)
			
 
				 	std	r6, VCPU_TFIAR(r9)
			
 
				 
			
 
				+	/* Restore MSR state if requested */
			
 
				+	beq	cr7, 2f
			
 
				+	mtmsrd	r10, 0
			
 
				+2:
			
 
				+	addi	r1, r1, SWITCH_FRAME_SIZE
			
 
				 	ld	r0, PPC_LR_STKOFF(r1)
			
 
				 	mtlr	r0
			
 
				 	blr
			
@@ -161,49 +198,22 @@ END_FTR_SECTION_IFSET(CPU_FTR_P9_TM_HV_ASSIST)
 
				  * be invoked from C function by PR KVM only.
			
 
				  */
			
 
				 _GLOBAL(_kvmppc_save_tm_pr)
			
 
				-	mflr	r5
			
 
				-	std	r5, PPC_LR_STKOFF(r1)
			
 
				-	stdu    r1, -SWITCH_FRAME_SIZE(r1)
			
 
				-	SAVE_NVGPRS(r1)
			
 
				-
			
 
				-	/* save MSR since TM/math bits might be impacted
			
 
				-	 * by __kvmppc_save_tm().
			
 
				-	 */
			
 
				-	mfmsr	r5
			
 
				-	SAVE_GPR(5, r1)
			
 
				-
			
 
				-	/* also save DSCR/CR/TAR so that it can be recovered later */
			
 
				-	mfspr   r6, SPRN_DSCR
			
 
				-	SAVE_GPR(6, r1)
			
 
				-
			
 
				-	mfcr    r7
			
 
				-	stw     r7, _CCR(r1)
			
 
				+	mflr	r0
			
 
				+	std	r0, PPC_LR_STKOFF(r1)
			
 
				+	stdu    r1, -PPC_MIN_STKFRM(r1)
			
 
				 
			
 
				 	mfspr   r8, SPRN_TAR
			
 
				-	SAVE_GPR(8, r1)
			
 
				+	std	r8, PPC_MIN_STKFRM-8(r1)
			
 
				 
			
 
				+	li	r5, 1		/* preserve non-volatile registers */
			
 
				 	bl	__kvmppc_save_tm
			
 
				 
			
 
				-	REST_GPR(8, r1)
			
 
				+	ld	r8, PPC_MIN_STKFRM-8(r1)
			
 
				 	mtspr   SPRN_TAR, r8
			
 
				 
			
 
				-	ld      r7, _CCR(r1)
			
 
				-	mtcr	r7
			
 
				-
			
 
				-	REST_GPR(6, r1)
			
 
				-	mtspr   SPRN_DSCR, r6
			
 
				-
			
 
				-	/* need preserve current MSR's MSR_TS bits */
			
 
				-	REST_GPR(5, r1)
			
 
				-	mfmsr   r6
			
 
				-	rldicl  r6, r6, 64 - MSR_TS_S_LG, 62
			
 
				-	rldimi  r5, r6, MSR_TS_S_LG, 63 - MSR_TS_T_LG
			
 
				-	mtmsrd  r5
			
 
				-
			
 
				-	REST_NVGPRS(r1)
			
 
				-	addi    r1, r1, SWITCH_FRAME_SIZE
			
 
				-	ld	r5, PPC_LR_STKOFF(r1)
			
 
				-	mtlr	r5
			
 
				+	addi    r1, r1, PPC_MIN_STKFRM
			
 
				+	ld	r0, PPC_LR_STKOFF(r1)
			
 
				+	mtlr	r0
			
 
				 	blr
			
 
				 
			
 
				 EXPORT_SYMBOL_GPL(_kvmppc_save_tm_pr);
			
@@ -215,15 +225,21 @@ EXPORT_SYMBOL_GPL(_kvmppc_save_tm_pr);
 
				  *  - r4 is the guest MSR with desired TS bits:
			
 
				  * 	For HV KVM, it is VCPU_MSR
			
 
				  * 	For PR KVM, it is provided by caller
			
 
				- * This potentially modifies all checkpointed registers.
			
 
				- * It restores r1, r2 from the PACA.
			
 
				+ * - r5 containing a flag indicating that non-volatile registers
			
 
				+ *	must be preserved.
			
 
				+ * If r5 == 0, this potentially modifies all checkpointed registers, but
			
 
				+ * restores r1, r2 from the PACA before exit.
			
 
				+ * If r5 != 0, this restores the MSR TM/FP/VEC/VSX bits to their state on entry.
			
 
				  */
			
 
				 _GLOBAL(__kvmppc_restore_tm)
			
 
				 	mflr	r0
			
 
				 	std	r0, PPC_LR_STKOFF(r1)
			
 
				 
			
 
				+	cmpdi	cr7, r5, 0
			
 
				+
			
 
				 	/* Turn on TM/FP/VSX/VMX so we can restore them. */
			
 
				 	mfmsr	r5
			
 
				+	mr	r10, r5
			
 
				 	li	r6, MSR_TM >> 32
			
 
				 	sldi	r6, r6, 32
			
 
				 	or	r5, r5, r6
			
@@ -244,8 +260,7 @@ _GLOBAL(__kvmppc_restore_tm)
 
				 
			
 
				 	mr	r5, r4
			
 
				 	rldicl. r5, r5, 64 - MSR_TS_S_LG, 62
			
 
				-	beqlr		/* TM not active in guest */
			
 
				-	std	r1, HSTATE_SCRATCH2(r13)
			
 
				+	beq	9f		/* TM not active in guest */
			
 
				 
			
 
				 	/* Make sure the failure summary is set, otherwise we'll program check
			
 
				 	 * when we trechkpt.  It's possible that this might have been not set
			
@@ -255,6 +270,26 @@ _GLOBAL(__kvmppc_restore_tm)
 
				 	oris	r7, r7, (TEXASR_FS)@h
			
 
				 	mtspr	SPRN_TEXASR, r7
			
 
				 
			
 
				+	/*
			
 
				+	 * Make a stack frame and save non-volatile registers if requested.
			
 
				+	 */
			
 
				+	stdu	r1, -SWITCH_FRAME_SIZE(r1)
			
 
				+	std	r1, HSTATE_SCRATCH2(r13)
			
 
				+
			
 
				+	mfcr	r6
			
 
				+	mfspr	r7, SPRN_DSCR
			
 
				+	SAVE_GPR(2, r1)
			
 
				+	SAVE_GPR(6, r1)
			
 
				+	SAVE_GPR(7, r1)
			
 
				+
			
 
				+	beq	cr7, 4f
			
 
				+	SAVE_NVGPRS(r1)
			
 
				+
			
 
				+	/* MSR[TS] will be 1 (suspended) once we do trechkpt */
			
 
				+	li	r0, 1
			
 
				+	rldimi	r10, r0, MSR_TS_S_LG, 63 - MSR_TS_T_LG
			
 
				+	SAVE_GPR(10, r1)	/* final MSR value */
			
 
				+4:
			
 
				 	/*
			
 
				 	 * We need to load up the checkpointed state for the guest.
			
 
				 	 * We need to do this early as it will blow away any GPRs, VSRs and
			
@@ -291,8 +326,6 @@ _GLOBAL(__kvmppc_restore_tm)
 
				 	ld	r29, VCPU_DSCR_TM(r3)
			
 
				 	ld	r30, VCPU_PPR_TM(r3)
			
 
				 
			
 
				-	std	r2, PACATMSCRATCH(r13) /* Save TOC */
			
 
				-
			
 
				 	/* Clear the MSR RI since r1, r13 are all going to be foobar. */
			
 
				 	li	r5, 0
			
 
				 	mtmsrd	r5, 1
			
@@ -318,18 +351,31 @@ _GLOBAL(__kvmppc_restore_tm)
 
				 	/* Now let's get back the state we need. */
			
 
				 	HMT_MEDIUM
			
 
				 	GET_PACA(r13)
			
 
				-#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
			
 
				-	ld	r29, HSTATE_DSCR(r13)
			
 
				-	mtspr	SPRN_DSCR, r29
			
 
				-#endif
			
 
				 	ld	r1, HSTATE_SCRATCH2(r13)
			
 
				-	ld	r2, PACATMSCRATCH(r13)
			
 
				+	REST_GPR(7, r1)
			
 
				+	mtspr	SPRN_DSCR, r7
			
 
				 
			
 
				 	/* Set the MSR RI since we have our registers back. */
			
 
				 	li	r5, MSR_RI
			
 
				 	mtmsrd	r5, 1
			
 
				+
			
 
				+	/* Restore TOC pointer and CR */
			
 
				+	REST_GPR(2, r1)
			
 
				+	REST_GPR(6, r1)
			
 
				+	mtcr	r6
			
 
				+
			
 
				+	/* Restore non-volatile registers if requested to. */
			
 
				+	beq	cr7, 5f
			
 
				+	REST_GPR(10, r1)
			
 
				+	REST_NVGPRS(r1)
			
 
				+
			
 
				+5:	addi	r1, r1, SWITCH_FRAME_SIZE
			
 
				 	ld	r0, PPC_LR_STKOFF(r1)
			
 
				 	mtlr	r0
			
 
				+
			
 
				+9:	/* Restore MSR bits if requested */
			
 
				+	beqlr	cr7
			
 
				+	mtmsrd	r10, 0
			
 
				 	blr
			
 
				 
			
 
				 /*
			
@@ -337,47 +383,23 @@ _GLOBAL(__kvmppc_restore_tm)
 
				  * can be invoked from C function by PR KVM only.
			
 
				  */
			
 
				 _GLOBAL(_kvmppc_restore_tm_pr)
			
 
				-	mflr	r5
			
 
				-	std	r5, PPC_LR_STKOFF(r1)
			
 
				-	stdu    r1, -SWITCH_FRAME_SIZE(r1)
			
 
				-	SAVE_NVGPRS(r1)
			
 
				-
			
 
				-	/* save MSR to avoid TM/math bits change */
			
 
				-	mfmsr	r5
			
 
				-	SAVE_GPR(5, r1)
			
 
				-
			
 
				-	/* also save DSCR/CR/TAR so that it can be recovered later */
			
 
				-	mfspr   r6, SPRN_DSCR
			
 
				-	SAVE_GPR(6, r1)
			
 
				-
			
 
				-	mfcr    r7
			
 
				-	stw     r7, _CCR(r1)
			
 
				+	mflr	r0
			
 
				+	std	r0, PPC_LR_STKOFF(r1)
			
 
				+	stdu    r1, -PPC_MIN_STKFRM(r1)
			
 
				 
			
 
				+	/* save TAR so that it can be recovered later */
			
 
				 	mfspr   r8, SPRN_TAR
			
 
				-	SAVE_GPR(8, r1)
			
 
				+	std	r8, PPC_MIN_STKFRM-8(r1)
			
 
				 
			
 
				+	li	r5, 1
			
 
				 	bl	__kvmppc_restore_tm
			
 
				 
			
 
				-	REST_GPR(8, r1)
			
 
				+	ld	r8, PPC_MIN_STKFRM-8(r1)
			
 
				 	mtspr   SPRN_TAR, r8
			
 
				 
			
 
				-	ld      r7, _CCR(r1)
			
 
				-	mtcr	r7
			
 
				-
			
 
				-	REST_GPR(6, r1)
			
 
				-	mtspr   SPRN_DSCR, r6
			
 
				-
			
 
				-	/* need preserve current MSR's MSR_TS bits */
			
 
				-	REST_GPR(5, r1)
			
 
				-	mfmsr   r6
			
 
				-	rldicl  r6, r6, 64 - MSR_TS_S_LG, 62
			
 
				-	rldimi  r5, r6, MSR_TS_S_LG, 63 - MSR_TS_T_LG
			
 
				-	mtmsrd  r5
			
 
				-
			
 
				-	REST_NVGPRS(r1)
			
 
				-	addi    r1, r1, SWITCH_FRAME_SIZE
			
 
				-	ld	r5, PPC_LR_STKOFF(r1)
			
 
				-	mtlr	r5
			
 
				+	addi    r1, r1, PPC_MIN_STKFRM
			
 
				+	ld	r0, PPC_LR_STKOFF(r1)
			
 
				+	mtlr	r0
			
 
				 	blr
			
 
				 
			
 
				 EXPORT_SYMBOL_GPL(_kvmppc_restore_tm_pr);
			
--- a/arch/powerpc/kvm/trace_book3s.h
+++ b/arch/powerpc/kvm/trace_book3s.h
@@ -14,7 +14,6 @@
 
				 	{0x400, "INST_STORAGE"}, \
			
 
				 	{0x480, "INST_SEGMENT"}, \
			
 
				 	{0x500, "EXTERNAL"}, \
			
 
				-	{0x501, "EXTERNAL_LEVEL"}, \
			
 
				 	{0x502, "EXTERNAL_HV"}, \
			
 
				 	{0x600, "ALIGNMENT"}, \
			
 
				 	{0x700, "PROGRAM"}, \
			
--- a/arch/powerpc/mm/tlb-radix.c
+++ b/arch/powerpc/mm/tlb-radix.c
@@ -830,6 +830,15 @@ void radix__flush_pwc_lpid(unsigned int lpid)
 
				 }
			
 
				 EXPORT_SYMBOL_GPL(radix__flush_pwc_lpid);
			
 
				 
			
 
				+/*
			
 
				+ * Flush partition-scoped translations for lpid (=LPIDR), using RIC_FLUSH_ALL
			
 
				+ */
			
 
				+void radix__flush_tlb_lpid(unsigned int lpid)
			
 
				+{
			
 
				+	_tlbie_lpid(lpid, RIC_FLUSH_ALL);
			
 
				+}
			
 
				+EXPORT_SYMBOL_GPL(radix__flush_tlb_lpid);
			
 
				+
			
 
				 /*
			
 
				  * Flush partition scoped translations from LPID (=LPIDR)
			
 
				  */
			
--- a/tools/perf/arch/powerpc/util/book3s_hv_exits.h
+++ b/tools/perf/arch/powerpc/util/book3s_hv_exits.h
@@ -15,7 +15,6 @@
 
				 	{0x400, "INST_STORAGE"}, \
			
 
				 	{0x480, "INST_SEGMENT"}, \
			
 
				 	{0x500, "EXTERNAL"}, \
			
 
				-	{0x501, "EXTERNAL_LEVEL"}, \
			
 
				 	{0x502, "EXTERNAL_HV"}, \
			
 
				 	{0x600, "ALIGNMENT"}, \
			
 
				 	{0x700, "PROGRAM"}, \