@@ -54,6 +54,9 @@
 #include <asm/dbell.h>
 #include <asm/hmi.h>
 #include <asm/pnv-pci.h>
+#include <asm/mmu.h>
+#include <asm/opal.h>
+#include <asm/xics.h>
 #include <linux/gfp.h>
 #include <linux/vmalloc.h>
 #include <linux/highmem.h>
@@ -62,6 +65,7 @@
 #include <linux/irqbypass.h>
 #include <linux/module.h>
 #include <linux/compiler.h>
+#include <linux/of.h>
 
 #include "book3s.h"
 
@@ -104,23 +108,6 @@ module_param_cb(h_ipi_redirect, &module_param_ops, &h_ipi_redirect,
 MODULE_PARM_DESC(h_ipi_redirect, "Redirect H_IPI wakeup to a free host core");
 #endif
 
-/* Maximum halt poll interval defaults to KVM_HALT_POLL_NS_DEFAULT */
-static unsigned int halt_poll_max_ns = KVM_HALT_POLL_NS_DEFAULT;
-module_param(halt_poll_max_ns, uint, S_IRUGO | S_IWUSR);
-MODULE_PARM_DESC(halt_poll_max_ns, "Maximum halt poll time in ns");
-
-/* Factor by which the vcore halt poll interval is grown, default is to double
- */
-static unsigned int halt_poll_ns_grow = 2;
-module_param(halt_poll_ns_grow, int, S_IRUGO);
-MODULE_PARM_DESC(halt_poll_ns_grow, "Factor halt poll time is grown by");
-
-/* Factor by which the vcore halt poll interval is shrunk, default is to reset
- */
-static unsigned int halt_poll_ns_shrink;
-module_param(halt_poll_ns_shrink, int, S_IRUGO);
-MODULE_PARM_DESC(halt_poll_ns_shrink, "Factor halt poll time is shrunk by");
-
 static void kvmppc_end_cede(struct kvm_vcpu *vcpu);
 static int kvmppc_hv_setup_htab_rma(struct kvm_vcpu *vcpu);
 
@@ -146,12 +133,21 @@ static inline struct kvm_vcpu *next_runnable_thread(struct kvmppc_vcore *vc,
 
 static bool kvmppc_ipi_thread(int cpu)
 {
+	unsigned long msg = PPC_DBELL_TYPE(PPC_DBELL_SERVER);
+
+	/* On POWER9 we can use msgsnd to IPI any cpu */
+	if (cpu_has_feature(CPU_FTR_ARCH_300)) {
+		msg |= get_hard_smp_processor_id(cpu);
+		smp_mb();
+		__asm__ __volatile__ (PPC_MSGSND(%0) : : "r" (msg));
+		return true;
+	}
+
 	/* On POWER8 for IPIs to threads in the same core, use msgsnd */
 	if (cpu_has_feature(CPU_FTR_ARCH_207S)) {
 		preempt_disable();
 		if (cpu_first_thread_sibling(cpu) ==
 		    cpu_first_thread_sibling(smp_processor_id())) {
-			unsigned long msg = PPC_DBELL_TYPE(PPC_DBELL_SERVER);
 			msg |= cpu_thread_in_core(cpu);
 			smp_mb();
 			__asm__ __volatile__ (PPC_MSGSND(%0) : : "r" (msg));
@@ -162,8 +158,12 @@ static bool kvmppc_ipi_thread(int cpu)
 	}
 
 #if defined(CONFIG_PPC_ICP_NATIVE) && defined(CONFIG_SMP)
-	if (cpu >= 0 && cpu < nr_cpu_ids && paca[cpu].kvm_hstate.xics_phys) {
-		xics_wake_cpu(cpu);
+	if (cpu >= 0 && cpu < nr_cpu_ids) {
+		if (paca[cpu].kvm_hstate.xics_phys) {
+			xics_wake_cpu(cpu);
+			return true;
+		}
+		opal_int_set_mfrr(get_hard_smp_processor_id(cpu), IPI_PRIORITY);
 		return true;
 	}
 #endif
@@ -299,41 +299,54 @@ static void kvmppc_set_pvr_hv(struct kvm_vcpu *vcpu, u32 pvr)
 	vcpu->arch.pvr = pvr;
 }
 
+/* Dummy value used in computing PCR value below */
+#define PCR_ARCH_300	(PCR_ARCH_207 << 1)
+
 static int kvmppc_set_arch_compat(struct kvm_vcpu *vcpu, u32 arch_compat)
 {
-	unsigned long pcr = 0;
+	unsigned long host_pcr_bit = 0, guest_pcr_bit = 0;
 	struct kvmppc_vcore *vc = vcpu->arch.vcore;
 
+	/* We can (emulate) our own architecture version and anything older */
+	if (cpu_has_feature(CPU_FTR_ARCH_300))
+		host_pcr_bit = PCR_ARCH_300;
+	else if (cpu_has_feature(CPU_FTR_ARCH_207S))
+		host_pcr_bit = PCR_ARCH_207;
+	else if (cpu_has_feature(CPU_FTR_ARCH_206))
+		host_pcr_bit = PCR_ARCH_206;
+	else
+		host_pcr_bit = PCR_ARCH_205;
+
+	/* Determine lowest PCR bit needed to run guest in given PVR level */
+	guest_pcr_bit = host_pcr_bit;
 	if (arch_compat) {
 		switch (arch_compat) {
 		case PVR_ARCH_205:
-			/*
-			 * If an arch bit is set in PCR, all the defined
-			 * higher-order arch bits also have to be set.
-			 */
-			pcr = PCR_ARCH_206 | PCR_ARCH_205;
+			guest_pcr_bit = PCR_ARCH_205;
 			break;
 		case PVR_ARCH_206:
 		case PVR_ARCH_206p:
-			pcr = PCR_ARCH_206;
+			guest_pcr_bit = PCR_ARCH_206;
 			break;
 		case PVR_ARCH_207:
+			guest_pcr_bit = PCR_ARCH_207;
+			break;
+		case PVR_ARCH_300:
+			guest_pcr_bit = PCR_ARCH_300;
 			break;
 		default:
 			return -EINVAL;
 		}
-
-		if (!cpu_has_feature(CPU_FTR_ARCH_207S)) {
-			/* POWER7 can't emulate POWER8 */
-			if (!(pcr & PCR_ARCH_206))
-				return -EINVAL;
-			pcr &= ~PCR_ARCH_206;
-		}
 	}
 
+	/* Check requested PCR bits don't exceed our capabilities */
+	if (guest_pcr_bit > host_pcr_bit)
+		return -EINVAL;
+
 	spin_lock(&vc->lock);
 	vc->arch_compat = arch_compat;
-	vc->pcr = pcr;
+	/* Set all PCR bits for which guest_pcr_bit <= bit < host_pcr_bit */
+	vc->pcr = host_pcr_bit - guest_pcr_bit;
 	spin_unlock(&vc->lock);
 
 	return 0;
@@ -945,6 +958,7 @@ static int kvmppc_handle_exit_hv(struct kvm_run *run, struct kvm_vcpu *vcpu,
 		break;
 	case BOOK3S_INTERRUPT_EXTERNAL:
 	case BOOK3S_INTERRUPT_H_DOORBELL:
+	case BOOK3S_INTERRUPT_H_VIRT:
 		vcpu->stat.ext_intr_exits++;
 		r = RESUME_GUEST;
 		break;
@@ -1229,6 +1243,12 @@ static int kvmppc_get_one_reg_hv(struct kvm_vcpu *vcpu, u64 id,
 	case KVM_REG_PPC_WORT:
 		*val = get_reg_val(id, vcpu->arch.wort);
 		break;
+	case KVM_REG_PPC_TIDR:
+		*val = get_reg_val(id, vcpu->arch.tid);
+		break;
+	case KVM_REG_PPC_PSSCR:
+		*val = get_reg_val(id, vcpu->arch.psscr);
+		break;
 	case KVM_REG_PPC_VPA_ADDR:
 		spin_lock(&vcpu->arch.vpa_update_lock);
 		*val = get_reg_val(id, vcpu->arch.vpa.next_gpa);
@@ -1288,6 +1308,9 @@ static int kvmppc_get_one_reg_hv(struct kvm_vcpu *vcpu, u64 id,
 	case KVM_REG_PPC_TM_CR:
 		*val = get_reg_val(id, vcpu->arch.cr_tm);
 		break;
+	case KVM_REG_PPC_TM_XER:
+		*val = get_reg_val(id, vcpu->arch.xer_tm);
+		break;
 	case KVM_REG_PPC_TM_LR:
 		*val = get_reg_val(id, vcpu->arch.lr_tm);
 		break;
@@ -1427,6 +1450,12 @@ static int kvmppc_set_one_reg_hv(struct kvm_vcpu *vcpu, u64 id,
 	case KVM_REG_PPC_WORT:
 		vcpu->arch.wort = set_reg_val(id, *val);
 		break;
+	case KVM_REG_PPC_TIDR:
+		vcpu->arch.tid = set_reg_val(id, *val);
+		break;
+	case KVM_REG_PPC_PSSCR:
+		vcpu->arch.psscr = set_reg_val(id, *val) & PSSCR_GUEST_VIS;
+		break;
 	case KVM_REG_PPC_VPA_ADDR:
 		addr = set_reg_val(id, *val);
 		r = -EINVAL;
@@ -1498,6 +1527,9 @@ static int kvmppc_set_one_reg_hv(struct kvm_vcpu *vcpu, u64 id,
 	case KVM_REG_PPC_TM_CR:
 		vcpu->arch.cr_tm = set_reg_val(id, *val);
 		break;
+	case KVM_REG_PPC_TM_XER:
+		vcpu->arch.xer_tm = set_reg_val(id, *val);
+		break;
 	case KVM_REG_PPC_TM_LR:
 		vcpu->arch.lr_tm = set_reg_val(id, *val);
 		break;
@@ -1540,6 +1572,20 @@ static int kvmppc_set_one_reg_hv(struct kvm_vcpu *vcpu, u64 id,
 	return r;
 }
 
+/*
+ * On POWER9, threads are independent and can be in different partitions.
+ * Therefore we consider each thread to be a subcore.
+ * There is a restriction that all threads have to be in the same
+ * MMU mode (radix or HPT), unfortunately, but since we only support
+ * HPT guests on a HPT host so far, that isn't an impediment yet.
+ */
+static int threads_per_vcore(void)
+{
+	if (cpu_has_feature(CPU_FTR_ARCH_300))
+		return 1;
+	return threads_per_subcore;
+}
+
 static struct kvmppc_vcore *kvmppc_vcore_create(struct kvm *kvm, int core)
 {
 	struct kvmppc_vcore *vcore;
@@ -1554,7 +1600,7 @@ static struct kvmppc_vcore *kvmppc_vcore_create(struct kvm *kvm, int core)
 	init_swait_queue_head(&vcore->wq);
 	vcore->preempt_tb = TB_NIL;
 	vcore->lpcr = kvm->arch.lpcr;
-	vcore->first_vcpuid = core * threads_per_subcore;
+	vcore->first_vcpuid = core * threads_per_vcore();
 	vcore->kvm = kvm;
 	INIT_LIST_HEAD(&vcore->preempt_list);
 
@@ -1717,7 +1763,7 @@ static struct kvm_vcpu *kvmppc_core_vcpu_create_hv(struct kvm *kvm,
 	int core;
 	struct kvmppc_vcore *vcore;
 
-	core = id / threads_per_subcore;
+	core = id / threads_per_vcore();
 	if (core >= KVM_MAX_VCORES)
 		goto out;
 
@@ -1935,7 +1981,10 @@ static void kvmppc_wait_for_nap(void)
 {
 	int cpu = smp_processor_id();
 	int i, loops;
+	int n_threads = threads_per_vcore();
 
+	if (n_threads <= 1)
+		return;
 	for (loops = 0; loops < 1000000; ++loops) {
 		/*
 		 * Check if all threads are finished.
@@ -1943,17 +1992,17 @@ static void kvmppc_wait_for_nap(void)
 		 * and the thread clears it when finished, so we look
 		 * for any threads that still have a non-NULL vcore ptr.
 		 */
-		for (i = 1; i < threads_per_subcore; ++i)
+		for (i = 1; i < n_threads; ++i)
 			if (paca[cpu + i].kvm_hstate.kvm_vcore)
 				break;
-		if (i == threads_per_subcore) {
+		if (i == n_threads) {
 			HMT_medium();
 			return;
 		}
 		HMT_low();
 	}
 	HMT_medium();
-	for (i = 1; i < threads_per_subcore; ++i)
+	for (i = 1; i < n_threads; ++i)
 		if (paca[cpu + i].kvm_hstate.kvm_vcore)
 			pr_err("KVM: CPU %d seems to be stuck\n", cpu + i);
 }
@@ -2019,7 +2068,7 @@ static void kvmppc_vcore_preempt(struct kvmppc_vcore *vc)
 
 	vc->vcore_state = VCORE_PREEMPT;
 	vc->pcpu = smp_processor_id();
-	if (vc->num_threads < threads_per_subcore) {
+	if (vc->num_threads < threads_per_vcore()) {
 		spin_lock(&lp->lock);
 		list_add_tail(&vc->preempt_list, &lp->list);
 		spin_unlock(&lp->lock);
@@ -2123,8 +2172,7 @@ static bool can_dynamic_split(struct kvmppc_vcore *vc, struct core_info *cip)
 	cip->subcore_threads[sub] = vc->num_threads;
 	cip->subcore_vm[sub] = vc->kvm;
 	init_master_vcore(vc);
-	list_del(&vc->preempt_list);
-	list_add_tail(&vc->preempt_list, &cip->vcs[sub]);
+	list_move_tail(&vc->preempt_list, &cip->vcs[sub]);
 
 	return true;
 }
@@ -2307,6 +2355,7 @@ static noinline void kvmppc_run_core(struct kvmppc_vcore *vc)
 	unsigned long cmd_bit, stat_bit;
 	int pcpu, thr;
 	int target_threads;
+	int controlled_threads;
 
 	/*
 	 * Remove from the list any threads that have a signal pending
@@ -2324,12 +2373,19 @@ static noinline void kvmppc_run_core(struct kvmppc_vcore *vc)
 	init_master_vcore(vc);
 	vc->preempt_tb = TB_NIL;
 
+	/*
+	 * Number of threads that we will be controlling: the same as
+	 * the number of threads per subcore, except on POWER9,
+	 * where it's 1 because the threads are (mostly) independent.
+	 */
+	controlled_threads = threads_per_vcore();
+
 	/*
 	 * Make sure we are running on primary threads, and that secondary
 	 * threads are offline. Also check if the number of threads in this
 	 * guest are greater than the current system threads per guest.
 	 */
-	if ((threads_per_core > 1) &&
+	if ((controlled_threads > 1) &&
 	    ((vc->num_threads > threads_per_subcore) || !on_primary_thread())) {
 		for_each_runnable_thread(i, vcpu, vc) {
 			vcpu->arch.ret = -EBUSY;
@@ -2345,7 +2401,7 @@ static noinline void kvmppc_run_core(struct kvmppc_vcore *vc)
 	 */
 	init_core_info(&core_info, vc);
 	pcpu = smp_processor_id();
-	target_threads = threads_per_subcore;
+	target_threads = controlled_threads;
 	if (target_smt_mode && target_smt_mode < target_threads)
 		target_threads = target_smt_mode;
 	if (vc->num_threads < target_threads)
@@ -2381,7 +2437,7 @@ static noinline void kvmppc_run_core(struct kvmppc_vcore *vc)
 		smp_wmb();
 	}
 	pcpu = smp_processor_id();
-	for (thr = 0; thr < threads_per_subcore; ++thr)
+	for (thr = 0; thr < controlled_threads; ++thr)
 		paca[pcpu + thr].kvm_hstate.kvm_split_mode = sip;
 
 	/* Initiate micro-threading (split-core) if required */
@@ -2491,7 +2547,7 @@ static noinline void kvmppc_run_core(struct kvmppc_vcore *vc)
 	}
 
 	/* Let secondaries go back to the offline loop */
-	for (i = 0; i < threads_per_subcore; ++i) {
+	for (i = 0; i < controlled_threads; ++i) {
 		kvmppc_release_hwthread(pcpu + i);
 		if (sip && sip->napped[i])
 			kvmppc_ipi_thread(pcpu + i);
@@ -2543,9 +2599,6 @@ static void grow_halt_poll_ns(struct kvmppc_vcore *vc)
 		vc->halt_poll_ns = 10000;
 	else
 		vc->halt_poll_ns *= halt_poll_ns_grow;
-
-	if (vc->halt_poll_ns > halt_poll_max_ns)
-		vc->halt_poll_ns = halt_poll_max_ns;
 }
 
 static void shrink_halt_poll_ns(struct kvmppc_vcore *vc)
@@ -2556,7 +2609,8 @@ static void shrink_halt_poll_ns(struct kvmppc_vcore *vc)
 		vc->halt_poll_ns /= halt_poll_ns_shrink;
 }
 
-/* Check to see if any of the runnable vcpus on the vcore have pending
+/*
+ * Check to see if any of the runnable vcpus on the vcore have pending
  * exceptions or are no longer ceded
  */
 static int kvmppc_vcore_check_block(struct kvmppc_vcore *vc)
@@ -2655,16 +2709,18 @@ out:
 	}
 
 	/* Adjust poll time */
-	if (halt_poll_max_ns) {
+	if (halt_poll_ns) {
 		if (block_ns <= vc->halt_poll_ns)
 			;
 		/* We slept and blocked for longer than the max halt time */
-		else if (vc->halt_poll_ns && block_ns > halt_poll_max_ns)
+		else if (vc->halt_poll_ns && block_ns > halt_poll_ns)
 			shrink_halt_poll_ns(vc);
 		/* We slept and our poll time is too small */
-		else if (vc->halt_poll_ns < halt_poll_max_ns &&
-			 block_ns < halt_poll_max_ns)
+		else if (vc->halt_poll_ns < halt_poll_ns &&
+			 block_ns < halt_poll_ns)
 			grow_halt_poll_ns(vc);
+		if (vc->halt_poll_ns > halt_poll_ns)
+			vc->halt_poll_ns = halt_poll_ns;
 	} else
 		vc->halt_poll_ns = 0;
 
@@ -2971,6 +3027,15 @@ static void kvmppc_core_commit_memory_region_hv(struct kvm *kvm,
 	struct kvm_memslots *slots;
 	struct kvm_memory_slot *memslot;
 
+	/*
+	 * If we are making a new memslot, it might make
+	 * some address that was previously cached as emulated
+	 * MMIO be no longer emulated MMIO, so invalidate
+	 * all the caches of emulated MMIO translations.
+	 */
+	if (npages)
+		atomic64_inc(&kvm->arch.mmio_update);
+
 	if (npages && old->npages) {
 		/*
 		 * If modifying a memslot, reset all the rmap dirty bits.
@@ -3015,6 +3080,22 @@ static void kvmppc_mmu_destroy_hv(struct kvm_vcpu *vcpu)
 	return;
 }
 
+static void kvmppc_setup_partition_table(struct kvm *kvm)
+{
+	unsigned long dw0, dw1;
+
+	/* PS field - page size for VRMA */
+	dw0 = ((kvm->arch.vrma_slb_v & SLB_VSID_L) >> 1) |
+		((kvm->arch.vrma_slb_v & SLB_VSID_LP) << 1);
+	/* HTABSIZE and HTABORG fields */
+	dw0 |= kvm->arch.sdr1;
+
+	/* Second dword has GR=0; other fields are unused since UPRT=0 */
+	dw1 = 0;
+
+	mmu_partition_table_set_entry(kvm->arch.lpid, dw0, dw1);
+}
+
 static int kvmppc_hv_setup_htab_rma(struct kvm_vcpu *vcpu)
 {
 	int err = 0;
@@ -3066,17 +3147,20 @@ static int kvmppc_hv_setup_htab_rma(struct kvm_vcpu *vcpu)
 	      psize == 0x1000000))
 		goto out_srcu;
 
-	/* Update VRMASD field in the LPCR */
 	senc = slb_pgsize_encoding(psize);
 	kvm->arch.vrma_slb_v = senc | SLB_VSID_B_1T |
 		(VRMA_VSID << SLB_VSID_SHIFT_1T);
-	/* the -4 is to account for senc values starting at 0x10 */
-	lpcr = senc << (LPCR_VRMASD_SH - 4);
-
 	/* Create HPTEs in the hash page table for the VRMA */
 	kvmppc_map_vrma(vcpu, memslot, porder);
 
-	kvmppc_update_lpcr(kvm, lpcr, LPCR_VRMASD);
+	/* Update VRMASD field in the LPCR */
+	if (!cpu_has_feature(CPU_FTR_ARCH_300)) {
+		/* the -4 is to account for senc values starting at 0x10 */
+		lpcr = senc << (LPCR_VRMASD_SH - 4);
+		kvmppc_update_lpcr(kvm, lpcr, LPCR_VRMASD);
+	} else {
+		kvmppc_setup_partition_table(kvm);
+	}
 
 	/* Order updates to kvm->arch.lpcr etc. vs. hpte_setup_done */
 	smp_wmb();
@@ -3219,14 +3303,18 @@ static int kvmppc_core_init_vm_hv(struct kvm *kvm)
 	 * Since we don't flush the TLB when tearing down a VM,
 	 * and this lpid might have previously been used,
 	 * make sure we flush on each core before running the new VM.
+	 * On POWER9, the tlbie in mmu_partition_table_set_entry()
+	 * does this flush for us.
 	 */
-	cpumask_setall(&kvm->arch.need_tlb_flush);
+	if (!cpu_has_feature(CPU_FTR_ARCH_300))
+		cpumask_setall(&kvm->arch.need_tlb_flush);
 
 	/* Start out with the default set of hcalls enabled */
 	memcpy(kvm->arch.enabled_hcalls, default_enabled_hcalls,
 	       sizeof(kvm->arch.enabled_hcalls));
 
-	kvm->arch.host_sdr1 = mfspr(SPRN_SDR1);
+	if (!cpu_has_feature(CPU_FTR_ARCH_300))
+		kvm->arch.host_sdr1 = mfspr(SPRN_SDR1);
 
 	/* Init LPCR for virtual RMA mode */
 	kvm->arch.host_lpid = mfspr(SPRN_LPID);
@@ -3239,8 +3327,28 @@ static int kvmppc_core_init_vm_hv(struct kvm *kvm)
 	/* On POWER8 turn on online bit to enable PURR/SPURR */
 	if (cpu_has_feature(CPU_FTR_ARCH_207S))
 		lpcr |= LPCR_ONL;
+	/*
+	 * On POWER9, VPM0 bit is reserved (VPM0=1 behaviour is assumed)
+	 * Set HVICE bit to enable hypervisor virtualization interrupts.
+	 */
+	if (cpu_has_feature(CPU_FTR_ARCH_300)) {
+		lpcr &= ~LPCR_VPM0;
+		lpcr |= LPCR_HVICE;
+	}
+
 	kvm->arch.lpcr = lpcr;
 
+	/*
+	 * Work out how many sets the TLB has, for the use of
+	 * the TLB invalidation loop in book3s_hv_rmhandlers.S.
+	 */
+	if (cpu_has_feature(CPU_FTR_ARCH_300))
+		kvm->arch.tlb_sets = POWER9_TLB_SETS_HASH;	/* 256 */
+	else if (cpu_has_feature(CPU_FTR_ARCH_207S))
+		kvm->arch.tlb_sets = POWER8_TLB_SETS;		/* 512 */
+	else
+		kvm->arch.tlb_sets = POWER7_TLB_SETS;		/* 128 */
+
 	/*
 	 * Track that we now have a HV mode VM active. This blocks secondary
 	 * CPU threads from coming online.
@@ -3305,9 +3413,9 @@ static int kvmppc_core_check_processor_compat_hv(void)
 	    !cpu_has_feature(CPU_FTR_ARCH_206))
 		return -EIO;
 	/*
-	 * Disable KVM for Power9, untill the required bits merged.
+	 * Disable KVM for Power9 in radix mode.
 	 */
-	if (cpu_has_feature(CPU_FTR_ARCH_300))
+	if (cpu_has_feature(CPU_FTR_ARCH_300) && radix_enabled())
 		return -EIO;
 
 	return 0;
@@ -3661,6 +3769,23 @@ static int kvmppc_book3s_init_hv(void)
 	if (r)
 		return r;
 
+	/*
+	 * We need a way of accessing the XICS interrupt controller,
+	 * either directly, via paca[cpu].kvm_hstate.xics_phys, or
+	 * indirectly, via OPAL.
+	 */
+#ifdef CONFIG_SMP
+	if (!get_paca()->kvm_hstate.xics_phys) {
+		struct device_node *np;
+
+		np = of_find_compatible_node(NULL, NULL, "ibm,opal-intc");
+		if (!np) {
+			pr_err("KVM-HV: Cannot determine method for accessing XICS\n");
+			return -ENODEV;
+		}
+	}
+#endif
+
 	kvm_ops_hv.owner = THIS_MODULE;
 	kvmppc_hv_ops = &kvm_ops_hv;
 
@@ -3683,3 +3808,4 @@ module_exit(kvmppc_book3s_exit_hv);
 MODULE_LICENSE("GPL");
 MODULE_ALIAS_MISCDEV(KVM_MINOR);
 MODULE_ALIAS("devname:kvm");
+