Browse Source

Merge tag 'powerpc-4.9-1' of git://git.kernel.org/pub/scm/linux/kernel/git/powerpc/linux

Pull powerpc updates from Michael Ellerman:
 "Highlights:
   - Major rework of Book3S 64-bit exception vectors (Nicholas Piggin)
   - Use gas sections for arranging exception vectors et al.
   - Large set of TM cleanups and selftests (Cyril Bur)
   - Enable transactional memory (TM) lazily for userspace (Cyril Bur)
   - Support for XZ compression in the zImage wrapper (Oliver
     O'Halloran)
   - Add support for bpf constant blinding (Naveen N. Rao)
   - Beginnings of upstream support for PA Semi Nemo motherboards
     (Darren Stevens)

  Fixes:
   - Ensure .mem(init|exit).text are within _stext/_etext (Michael
     Ellerman)
   - xmon: Don't use ld on 32-bit (Michael Ellerman)
   - vdso64: Use double word compare on pointers (Anton Blanchard)
   - powerpc/nvram: Fix an incorrect partition merge (Pan Xinhui)
   - powerpc: Fix usage of _PAGE_RO in hugepage (Christophe Leroy)
   - powerpc/mm: Update FORCE_MAX_ZONEORDER range to allow hugetlb w/4K
     (Aneesh Kumar K.V)
   - Fix memory leak in queue_hotplug_event() error path (Andrew
     Donnellan)
   - Replay hypervisor maintenance interrupt first (Nicholas Piggin)

  Various performance optimisations (Anton Blanchard):
   - Align hot loops of memset() and backwards_memcpy()
   - During context switch, check before setting mm_cpumask
   - Remove static branch prediction in atomic{, 64}_add_unless
   - Only disable HAVE_EFFICIENT_UNALIGNED_ACCESS on POWER7 little
     endian
   - Set default CPU type to POWER8 for little endian builds

  Cleanups & features:
   - Sparse fixes/cleanups (Daniel Axtens)
   - Preserve CFAR value on SLB miss caused by access to bogus address
     (Paul Mackerras)
   - Radix MMU fixups for POWER9 (Aneesh Kumar K.V)
   - Support for setting used_(vsr|vr|spe) in sigreturn path (for CRIU)
     (Simon Guo)
   - Optimise syscall entry for virtual, relocatable case (Nicholas
     Piggin)
   - Optimise MSR handling in exception handling (Nicholas Piggin)
   - Support for kexec with Radix MMU (Benjamin Herrenschmidt)
   - powernv EEH fixes (Russell Currey)
   - Surprise PCI hotplug support for powernv (Gavin Shan)
   - Endian/sparse fixes for powernv PCI (Gavin Shan)
   - Defconfig updates (Anton Blanchard)
   - KVM: PPC: Book3S HV: Migrate pinned pages out of CMA (Balbir Singh)
   - cxl: Flush PSL cache before resetting the adapter (Frederic Barrat)
   - cxl: replace loop with for_each_child_of_node(), remove unneeded
     of_node_put() (Andrew Donnellan)
   - Fix HV facility unavailable to use correct handler (Nicholas
     Piggin)
   - Remove unnecessary syscall trampoline (Nicholas Piggin)
   - fadump: Fix build break when CONFIG_PROC_VMCORE=n (Michael
     Ellerman)
   - Quieten EEH message when no adapters are found (Anton Blanchard)
   - powernv: Add PHB register dump debugfs handle (Russell Currey)
   - Use kprobe blacklist for exception handlers & asm functions
     (Nicholas Piggin)
   - Document the syscall ABI (Nicholas Piggin)
   - MAINTAINERS: Update cxl maintainers (Michael Neuling)
   - powerpc: Remove all usages of NO_IRQ (Michael Ellerman)

  Minor cleanups:
   - Andrew Donnellan, Christophe Leroy, Colin Ian King, Cyril Bur,
     Frederic Barrat, Pan Xinhui, PrasannaKumar Muralidharan, Rui Teng,
     Simon Guo"

* tag 'powerpc-4.9-1' of git://git.kernel.org/pub/scm/linux/kernel/git/powerpc/linux: (156 commits)
  powerpc/bpf: Add support for bpf constant blinding
  powerpc/bpf: Implement support for tail calls
  powerpc/bpf: Introduce accessors for using the tmp local stack space
  powerpc/fadump: Fix build break when CONFIG_PROC_VMCORE=n
  powerpc: tm: Enable transactional memory (TM) lazily for userspace
  powerpc/tm: Add TM Unavailable Exception
  powerpc: Remove do_load_up_transact_{fpu,altivec}
  powerpc: tm: Rename transct_(*) to ck(\1)_state
  powerpc: tm: Always use fp_state and vr_state to store live registers
  selftests/powerpc: Add checks for transactional VSXs in signal contexts
  selftests/powerpc: Add checks for transactional VMXs in signal contexts
  selftests/powerpc: Add checks for transactional FPUs in signal contexts
  selftests/powerpc: Add checks for transactional GPRs in signal contexts
  selftests/powerpc: Check that signals always get delivered
  selftests/powerpc: Add TM tcheck helpers in C
  selftests/powerpc: Allow tests to extend their kill timeout
  selftests/powerpc: Introduce GPR asm helper header file
  selftests/powerpc: Move VMX stack frame macros to header file
  selftests/powerpc: Rework FPU stack placement macros and move to header file
  selftests/powerpc: Check for VSX preservation across userspace preemption
  ...
Linus Torvalds 8 years ago
parent
commit
07021b4359
100 changed files with 3238 additions and 2419 deletions
  1. 105 0
      Documentation/powerpc/syscall64-abi.txt
  2. 2 2
      MAINTAINERS
  3. 3 7
      arch/powerpc/Kconfig
  4. 18 25
      arch/powerpc/Makefile
  5. 55 31
      arch/powerpc/boot/Makefile
  6. 0 1
      arch/powerpc/boot/cuboot-c2k.c
  7. 148 0
      arch/powerpc/boot/decompress.c
  8. 12 0
      arch/powerpc/boot/fixup-headers.sed
  9. 0 204
      arch/powerpc/boot/gunzip_util.c
  10. 0 45
      arch/powerpc/boot/gunzip_util.h
  11. 18 17
      arch/powerpc/boot/main.c
  12. 3 0
      arch/powerpc/boot/ops.h
  13. 14 0
      arch/powerpc/boot/stdbool.h
  14. 13 0
      arch/powerpc/boot/stdint.h
  15. 14 0
      arch/powerpc/boot/types.h
  16. 45 16
      arch/powerpc/boot/wrapper
  17. 39 0
      arch/powerpc/boot/xz_config.h
  18. 12 7
      arch/powerpc/configs/powernv_defconfig
  19. 12 7
      arch/powerpc/configs/ppc64_defconfig
  20. 12 7
      arch/powerpc/configs/pseries_defconfig
  21. 40 3
      arch/powerpc/include/asm/asm-prototypes.h
  22. 2 2
      arch/powerpc/include/asm/atomic.h
  23. 2 1
      arch/powerpc/include/asm/book3s/32/pgtable.h
  24. 5 2
      arch/powerpc/include/asm/book3s/64/pgtable.h
  25. 69 19
      arch/powerpc/include/asm/book3s/64/radix.h
  26. 1 0
      arch/powerpc/include/asm/book3s/64/tlbflush-radix.h
  27. 3 1
      arch/powerpc/include/asm/cputable.h
  28. 60 88
      arch/powerpc/include/asm/exception-64s.h
  29. 0 4
      arch/powerpc/include/asm/fadump.h
  30. 393 0
      arch/powerpc/include/asm/head-64.h
  31. 1 1
      arch/powerpc/include/asm/machdep.h
  32. 3 0
      arch/powerpc/include/asm/mmu-book3e.h
  33. 4 0
      arch/powerpc/include/asm/mmu.h
  34. 3 1
      arch/powerpc/include/asm/mmu_context.h
  35. 3 3
      arch/powerpc/include/asm/mpic_msgr.h
  36. 2 1
      arch/powerpc/include/asm/nohash/32/pgtable.h
  37. 2 1
      arch/powerpc/include/asm/nohash/64/pgtable.h
  38. 1 1
      arch/powerpc/include/asm/parport.h
  39. 3 1
      arch/powerpc/include/asm/pnv-pci.h
  40. 2 0
      arch/powerpc/include/asm/ppc-opcode.h
  41. 16 33
      arch/powerpc/include/asm/ppc_asm.h
  42. 7 9
      arch/powerpc/include/asm/processor.h
  43. 4 1
      arch/powerpc/include/asm/reg.h
  44. 0 2
      arch/powerpc/include/asm/signal.h
  45. 0 5
      arch/powerpc/include/asm/tm.h
  46. 7 8
      arch/powerpc/kernel/Makefile
  47. 6 6
      arch/powerpc/kernel/asm-offsets.c
  48. 19 0
      arch/powerpc/kernel/cputable.c
  49. 3 1
      arch/powerpc/kernel/eeh.c
  50. 9 1
      arch/powerpc/kernel/eeh_driver.c
  51. 1 0
      arch/powerpc/kernel/eeh_pe.c
  52. 0 1
      arch/powerpc/kernel/entry_32.S
  53. 9 12
      arch/powerpc/kernel/entry_64.S
  54. 1008 1060
      arch/powerpc/kernel/exceptions-64s.S
  55. 7 1
      arch/powerpc/kernel/fadump.c
  56. 0 26
      arch/powerpc/kernel/fpu.S
  57. 0 3
      arch/powerpc/kernel/head_32.S
  58. 36 17
      arch/powerpc/kernel/head_64.S
  59. 0 1
      arch/powerpc/kernel/head_8xx.S
  60. 6 3
      arch/powerpc/kernel/hw_breakpoint.c
  61. 1 1
      arch/powerpc/kernel/ibmebus.c
  62. 11 6
      arch/powerpc/kernel/irq.c
  63. 7 7
      arch/powerpc/kernel/legacy_serial.c
  64. 23 52
      arch/powerpc/kernel/machine_kexec_64.c
  65. 3 1
      arch/powerpc/kernel/misc_32.S
  66. 21 9
      arch/powerpc/kernel/misc_64.S
  67. 1 1
      arch/powerpc/kernel/module.c
  68. 5 5
      arch/powerpc/kernel/nvram_64.c
  69. 3 2
      arch/powerpc/kernel/pci-common.c
  70. 1 1
      arch/powerpc/kernel/pci_of_scan.c
  71. 106 75
      arch/powerpc/kernel/process.c
  72. 82 0
      arch/powerpc/kernel/prom_init.c
  73. 98 255
      arch/powerpc/kernel/ptrace.c
  74. 23 18
      arch/powerpc/kernel/signal.c
  75. 10 8
      arch/powerpc/kernel/signal.h
  76. 66 56
      arch/powerpc/kernel/signal_32.c
  77. 114 93
      arch/powerpc/kernel/signal_64.c
  78. 1 0
      arch/powerpc/kernel/syscalls.c
  79. 1 0
      arch/powerpc/kernel/time.c
  80. 50 44
      arch/powerpc/kernel/tm.S
  81. 72 11
      arch/powerpc/kernel/traps.c
  82. 0 6
      arch/powerpc/kernel/vdso64/Makefile
  83. 1 1
      arch/powerpc/kernel/vdso64/datapage.S
  84. 1 1
      arch/powerpc/kernel/vdso64/gettimeofday.S
  85. 0 25
      arch/powerpc/kernel/vector.S
  86. 52 3
      arch/powerpc/kernel/vmlinux.lds.S
  87. 1 1
      arch/powerpc/lib/Makefile
  88. 2 0
      arch/powerpc/lib/mem_64.S
  89. 3 4
      arch/powerpc/mm/Makefile
  90. 2 2
      arch/powerpc/mm/fault.c
  91. 50 31
      arch/powerpc/mm/hash_utils_64.c
  92. 7 0
      arch/powerpc/mm/hugetlbpage.c
  93. 1 1
      arch/powerpc/mm/init_32.c
  94. 77 4
      arch/powerpc/mm/mmu_context_iommu.c
  95. 10 1
      arch/powerpc/mm/pgtable-book3s64.c
  96. 40 0
      arch/powerpc/mm/pgtable-radix.c
  97. 1 1
      arch/powerpc/mm/pgtable.c
  98. 3 5
      arch/powerpc/mm/slb_low.S
  99. 24 0
      arch/powerpc/mm/tlb-radix.c
  100. 2 0
      arch/powerpc/net/bpf_jit.h

+ 105 - 0
Documentation/powerpc/syscall64-abi.txt

@@ -0,0 +1,105 @@
+===============================================
+Power Architecture 64-bit Linux system call ABI
+===============================================
+
+syscall
+=======
+
+syscall calling sequence[*] matches the Power Architecture 64-bit ELF ABI
+specification C function calling sequence, including register preservation
+rules, with the following differences.
+
+[*] Some syscalls (typically low-level management functions) may have
+    different calling sequences (e.g., rt_sigreturn).
+
+Parameters and return value
+---------------------------
+The system call number is specified in r0.
+
+There is a maximum of 6 integer parameters to a syscall, passed in r3-r8.
+
+Both a return value and a return error code are returned. cr0.SO is the return
+error code, and r3 is the return value or error code. When cr0.SO is clear,
+the syscall succeeded and r3 is the return value. When cr0.SO is set, the
+syscall failed and r3 is the error code that generally corresponds to errno.
+
+Stack
+-----
+System calls do not modify the caller's stack frame. For example, the caller's
+stack frame LR and CR save fields are not used.
+
+Register preservation rules
+---------------------------
+Register preservation rules match the ELF ABI calling sequence with the
+following differences:
+
+r0:         Volatile.   (System call number.)
+r3:         Volatile.   (Parameter 1, and return value.)
+r4-r8:      Volatile.   (Parameters 2-6.)
+cr0:        Volatile    (cr0.SO is the return error condition)
+cr1, cr5-7: Nonvolatile.
+lr:         Nonvolatile.
+
+All floating point and vector data registers as well as control and status
+registers are nonvolatile.
+
+Invocation
+----------
+The syscall is performed with the sc instruction, and returns with execution
+continuing at the instruction following the sc instruction.
+
+Transactional Memory
+--------------------
+Syscall behavior can change if the processor is in transactional or suspended
+transaction state, and the syscall can affect the behavior of the transaction.
+
+If the processor is in suspended state when a syscall is made, the syscall
+will be performed as normal, and will return as normal. The syscall will be
+performed in suspended state, so its side effects will be persistent according
+to the usual transactional memory semantics. A syscall may or may not result
+in the transaction being doomed by hardware.
+
+If the processor is in transactional state when a syscall is made, then the
+behavior depends on the presence of PPC_FEATURE2_HTM_NOSC in the AT_HWCAP2 ELF
+auxiliary vector.
+
+- If present, which is the case for newer kernels, then the syscall will not
+  be performed and the transaction will be doomed by the kernel with the
+  failure code TM_CAUSE_SYSCALL | TM_CAUSE_PERSISTENT in the TEXASR SPR.
+
+- If not present (older kernels), then the kernel will suspend the
+  transactional state and the syscall will proceed as in the case of a
+  suspended state syscall, and will resume the transactional state before
+  returning to the caller. This case is not well defined or supported, so this
+  behavior should not be relied upon.
+
+
+vsyscall
+========
+
+vsyscall calling sequence matches the syscall calling sequence, with the
+following differences. Some vsyscalls may have different calling sequences.
+
+Parameters and return value
+---------------------------
+r0 is not used as an input. The vsyscall is selected by its address.
+
+Stack
+-----
+The vsyscall may or may not use the caller's stack frame save areas.
+
+Register preservation rules
+---------------------------
+r0: Volatile.
+cr1, cr5-7: Volatile.
+lr: Volatile.
+
+Invocation
+----------
+The vsyscall is performed with a branch-with-link instruction to the vsyscall
+function address.
+
+Transactional Memory
+--------------------
+vsyscalls will run in the same transactional state as the caller. A vsyscall
+may or may not result in the transaction being doomed by hardware.

+ 2 - 2
MAINTAINERS

@@ -3523,14 +3523,14 @@ F:	drivers/net/ethernet/chelsio/cxgb4vf/
 
 CXL (IBM Coherent Accelerator Processor Interface CAPI) DRIVER
 M:	Ian Munsie <imunsie@au1.ibm.com>
-M:	Michael Neuling <mikey@neuling.org>
+M:	Frederic Barrat <fbarrat@linux.vnet.ibm.com>
 L:	linuxppc-dev@lists.ozlabs.org
 S:	Supported
+F:	arch/powerpc/platforms/powernv/pci-cxl.c
 F:	drivers/misc/cxl/
 F:	include/misc/cxl*
 F:	include/uapi/misc/cxl.h
 F:	Documentation/powerpc/cxl.txt
-F:	Documentation/powerpc/cxl.txt
 F:	Documentation/ABI/testing/sysfs-class-cxl
 
 CXLFLASH (IBM Coherent Accelerator Processor Interface CAPI Flash) SCSI DRIVER

+ 3 - 7
arch/powerpc/Kconfig

@@ -12,11 +12,6 @@ config 64BIT
 	bool
 	default y if PPC64
 
-config WORD_SIZE
-	int
-	default 64 if PPC64
-	default 32 if !PPC64
-
 config ARCH_PHYS_ADDR_T_64BIT
        def_bool PPC64 || PHYS_64BIT
 
@@ -101,7 +96,7 @@ config PPC
 	select VIRT_TO_BUS if !PPC64
 	select HAVE_IDE
 	select HAVE_IOREMAP_PROT
-	select HAVE_EFFICIENT_UNALIGNED_ACCESS if !CPU_LITTLE_ENDIAN
+	select HAVE_EFFICIENT_UNALIGNED_ACCESS if !(CPU_LITTLE_ENDIAN && POWER7_CPU)
 	select HAVE_KPROBES
 	select HAVE_ARCH_KGDB
 	select HAVE_KRETPROBES
@@ -167,6 +162,7 @@ config PPC
 	select GENERIC_CPU_AUTOPROBE
 	select HAVE_VIRT_CPU_ACCOUNTING
 	select HAVE_ARCH_HARDENED_USERCOPY
+	select HAVE_KERNEL_GZIP
 
 config GENERIC_CSUM
 	def_bool CPU_LITTLE_ENDIAN
@@ -637,7 +633,7 @@ config FORCE_MAX_ZONEORDER
 	int "Maximum zone order"
 	range 8 9 if PPC64 && PPC_64K_PAGES
 	default "9" if PPC64 && PPC_64K_PAGES
-	range 9 13 if PPC64 && !PPC_64K_PAGES
+	range 13 13 if PPC64 && !PPC_64K_PAGES
 	default "13" if PPC64 && !PPC_64K_PAGES
 	range 9 64 if PPC32 && PPC_16K_PAGES
 	default "9" if PPC32 && PPC_16K_PAGES

+ 18 - 25
arch/powerpc/Makefile

@@ -43,31 +43,24 @@ NM		:= $(NM) --synthetic
 endif
 endif
 
-ifeq ($(CONFIG_PPC64),y)
-ifeq ($(CONFIG_CPU_LITTLE_ENDIAN),y)
-OLDARCH	:= ppc64le
-else
-OLDARCH	:= ppc64
-endif
-else
-ifeq ($(CONFIG_CPU_LITTLE_ENDIAN),y)
-OLDARCH	:= ppcle
-else
-OLDARCH	:= ppc
-endif
-endif
+# BITS is used as extension for files which are available in a 32 bit
+# and a 64 bit version to simplify shared Makefiles.
+# e.g.: obj-y += foo_$(BITS).o
+export BITS
 
-# It seems there are times we use this Makefile without
-# including the config file, but this replicates the old behaviour
-ifeq ($(CONFIG_WORD_SIZE),)
-CONFIG_WORD_SIZE := 32
+ifdef CONFIG_PPC64
+        BITS := 64
+else
+        BITS := 32
 endif
 
-UTS_MACHINE := $(OLDARCH)
+machine-y = ppc
+machine-$(CONFIG_PPC64) += 64
+machine-$(CONFIG_CPU_LITTLE_ENDIAN) += le
+UTS_MACHINE := $(subst $(space),,$(machine-y))
 
 ifeq ($(CONFIG_CPU_LITTLE_ENDIAN),y)
 override LD	+= -EL
-override CROSS32AS += -mlittle-endian
 LDEMULATION	:= lppc
 GNUTARGET	:= powerpcle
 MULTIPLEWORD	:= -mno-multiple
@@ -89,10 +82,10 @@ aflags-$(CONFIG_CPU_BIG_ENDIAN)		+= $(call cc-option,-mbig-endian)
 aflags-$(CONFIG_CPU_LITTLE_ENDIAN)	+= -mlittle-endian
 
 ifeq ($(HAS_BIARCH),y)
-override AS	+= -a$(CONFIG_WORD_SIZE)
-override LD	+= -m elf$(CONFIG_WORD_SIZE)$(LDEMULATION)
-override CC	+= -m$(CONFIG_WORD_SIZE)
-override AR	:= GNUTARGET=elf$(CONFIG_WORD_SIZE)-$(GNUTARGET) $(AR)
+override AS	+= -a$(BITS)
+override LD	+= -m elf$(BITS)$(LDEMULATION)
+override CC	+= -m$(BITS)
+override AR	:= GNUTARGET=elf$(BITS)-$(GNUTARGET) $(AR)
 endif
 
 LDFLAGS_vmlinux-y := -Bstatic
@@ -179,7 +172,7 @@ KBUILD_CFLAGS	+= $(call cc-option,-msoft-float)
 KBUILD_CFLAGS	+= -pipe -Iarch/$(ARCH) $(CFLAGS-y)
 CPP		= $(CC) -E $(KBUILD_CFLAGS)
 
-CHECKFLAGS	+= -m$(CONFIG_WORD_SIZE) -D__powerpc__ -D__powerpc$(CONFIG_WORD_SIZE)__
+CHECKFLAGS	+= -m$(BITS) -D__powerpc__ -D__powerpc$(BITS)__
 ifdef CONFIG_CPU_BIG_ENDIAN
 CHECKFLAGS	+= -D__BIG_ENDIAN__
 else
@@ -234,7 +227,7 @@ KBUILD_CFLAGS += $(cpu-as-y)
 KBUILD_AFLAGS += $(aflags-y)
 KBUILD_CFLAGS += $(cflags-y)
 
-head-y				:= arch/powerpc/kernel/head_$(CONFIG_WORD_SIZE).o
+head-y				:= arch/powerpc/kernel/head_$(BITS).o
 head-$(CONFIG_8xx)		:= arch/powerpc/kernel/head_8xx.o
 head-$(CONFIG_40x)		:= arch/powerpc/kernel/head_40x.o
 head-$(CONFIG_44x)		:= arch/powerpc/kernel/head_44x.o

+ 55 - 31
arch/powerpc/boot/Makefile

@@ -19,10 +19,15 @@
 
 all: $(obj)/zImage
 
+compress-$(CONFIG_KERNEL_GZIP) := CONFIG_KERNEL_GZIP
+compress-$(CONFIG_KERNEL_XZ)   := CONFIG_KERNEL_XZ
+
 BOOTCFLAGS    := -Wall -Wundef -Wstrict-prototypes -Wno-trigraphs \
 		 -fno-strict-aliasing -Os -msoft-float -pipe \
 		 -fomit-frame-pointer -fno-builtin -fPIC -nostdinc \
-		 -isystem $(shell $(CROSS32CC) -print-file-name=include)
+		 -isystem $(shell $(CROSS32CC) -print-file-name=include) \
+		 -D$(compress-y)
+
 ifdef CONFIG_PPC64_BOOT_WRAPPER
 BOOTCFLAGS	+= -m64
 endif
@@ -59,13 +64,30 @@ $(obj)/treeboot-currituck.o: BOOTCFLAGS += -mcpu=405
 $(obj)/treeboot-akebono.o: BOOTCFLAGS += -mcpu=405
 $(obj)/virtex405-head.o: BOOTAFLAGS += -mcpu=405
 
+# The pre-boot decompressors pull in a lot of kernel headers and other source
+# files. This creates a bit of a dependency headache since we need to copy
+# these files into the build dir, fix up any includes and ensure that dependent
+# files are copied in the right order.
+
+# these need to be separate variables because they are copied out of different
+# directories in the kernel tree. Sure you COULD merge them, but it's a
+# cure-is-worse-than-disease situation.
+zlib-decomp-$(CONFIG_KERNEL_GZIP) := decompress_inflate.c
+zlib-$(CONFIG_KERNEL_GZIP) := inffast.c inflate.c inftrees.c
+zlibheader-$(CONFIG_KERNEL_GZIP) := inffast.h inffixed.h inflate.h inftrees.h infutil.h
+zliblinuxheader-$(CONFIG_KERNEL_GZIP) := zlib.h zconf.h zutil.h
 
-zlib       := inffast.c inflate.c inftrees.c
-zlibheader := inffast.h inffixed.h inflate.h inftrees.h infutil.h
-zliblinuxheader := zlib.h zconf.h zutil.h
+$(addprefix $(obj)/, decompress.o): \
+	$(addprefix $(obj)/,$(zlib-decomp-y))
 
-$(addprefix $(obj)/,$(zlib) cuboot-c2k.o gunzip_util.o main.o): \
-	$(addprefix $(obj)/,$(zliblinuxheader)) $(addprefix $(obj)/,$(zlibheader))
+$(addprefix $(obj)/, $(zlib-decomp-y)): \
+	$(addprefix $(obj)/,$(zliblinuxheader-y)) \
+	$(addprefix $(obj)/,$(zlibheader-y)) \
+	$(addprefix $(obj)/,$(zlib-y))
+
+$(addprefix $(obj)/,$(zlib-y)): \
+	$(addprefix $(obj)/,$(zliblinuxheader-y)) \
+	$(addprefix $(obj)/,$(zlibheader-y))
 
 libfdt       := fdt.c fdt_ro.c fdt_wip.c fdt_sw.c fdt_rw.c fdt_strerror.c
 libfdtheader := fdt.h libfdt.h libfdt_internal.h
@@ -73,10 +95,10 @@ libfdtheader := fdt.h libfdt.h libfdt_internal.h
 $(addprefix $(obj)/,$(libfdt) libfdt-wrapper.o simpleboot.o epapr.o opal.o): \
 	$(addprefix $(obj)/,$(libfdtheader))
 
-src-wlib-y := string.S crt0.S crtsavres.S stdio.c main.c \
+src-wlib-y := string.S crt0.S crtsavres.S stdio.c decompress.c main.c \
 		$(libfdt) libfdt-wrapper.c \
 		ns16550.c serial.c simple_alloc.c div64.S util.S \
-		gunzip_util.c elf_util.c $(zlib) devtree.c stdlib.c \
+		elf_util.c $(zlib-y) devtree.c stdlib.c \
 		oflib.c ofconsole.c cuboot.c mpsc.c cpm-serial.c \
 		uartlite.c mpc52xx-psc.c opal.c opal-calls.S
 src-wlib-$(CONFIG_40x) += 4xx.c planetcore.c
@@ -125,23 +147,20 @@ obj-wlib := $(addsuffix .o, $(basename $(addprefix $(obj)/, $(src-wlib))))
 obj-plat := $(addsuffix .o, $(basename $(addprefix $(obj)/, $(src-plat))))
 obj-plat: $(libfdt)
 
-quiet_cmd_copy_zlib = COPY    $@
-      cmd_copy_zlib = sed "s@__used@@;s@<linux/\([^>]*\).*@\"\1\"@" $< > $@
+quiet_cmd_copy_kern_src = COPY    $@
+      cmd_copy_kern_src = sed -f $(srctree)/arch/powerpc/boot/fixup-headers.sed $< > $@
 
-quiet_cmd_copy_zlibheader = COPY    $@
-      cmd_copy_zlibheader = sed "s@<linux/\([^>]*\).*@\"\1\"@" $< > $@
-# stddef.h for NULL
-quiet_cmd_copy_zliblinuxheader = COPY    $@
-      cmd_copy_zliblinuxheader = sed "s@<linux/string.h>@\"string.h\"@;s@<linux/kernel.h>@<stddef.h>@;s@<linux/\([^>]*\).*@\"\1\"@" $< > $@
+$(addprefix $(obj)/,$(zlib-y)): $(obj)/%: $(srctree)/lib/zlib_inflate/%
+	$(call cmd,copy_kern_src)
 
-$(addprefix $(obj)/,$(zlib)): $(obj)/%: $(srctree)/lib/zlib_inflate/%
-	$(call cmd,copy_zlib)
+$(addprefix $(obj)/,$(zlibheader-y)): $(obj)/%: $(srctree)/lib/zlib_inflate/%
+	$(call cmd,copy_kern_src)
 
-$(addprefix $(obj)/,$(zlibheader)): $(obj)/%: $(srctree)/lib/zlib_inflate/%
-	$(call cmd,copy_zlibheader)
+$(addprefix $(obj)/,$(zliblinuxheader-y)): $(obj)/%: $(srctree)/include/linux/%
+	$(call cmd,copy_kern_src)
 
-$(addprefix $(obj)/,$(zliblinuxheader)): $(obj)/%: $(srctree)/include/linux/%
-	$(call cmd,copy_zliblinuxheader)
+$(addprefix $(obj)/,$(zlib-decomp-y)): $(obj)/%: $(srctree)/lib/%
+	$(call cmd,copy_kern_src)
 
 quiet_cmd_copy_libfdt = COPY    $@
       cmd_copy_libfdt = cp $< $@
@@ -150,17 +169,17 @@ $(addprefix $(obj)/,$(libfdt) $(libfdtheader)): $(obj)/%: $(srctree)/scripts/dtc
 	$(call cmd,copy_libfdt)
 
 $(obj)/empty.c:
-	@touch $@
+	$(Q)touch $@
 
 $(obj)/zImage.lds: $(obj)/%: $(srctree)/$(src)/%.S
 	$(CROSS32CC) $(cpp_flags) -E -Wp,-MD,$(depfile) -P -Upowerpc \
 		-D__ASSEMBLY__ -DLINKER_SCRIPT -o $@ $<
 
 $(obj)/zImage.coff.lds $(obj)/zImage.ps3.lds : $(obj)/%: $(srctree)/$(src)/%.S
-	@cp $< $@
+	$(Q)cp $< $@
 
-clean-files := $(zlib) $(zlibheader) $(zliblinuxheader) \
-		$(libfdt) $(libfdtheader) \
+clean-files := $(zlib-) $(zlibheader-) $(zliblinuxheader-) \
+		$(zlib-decomp-) $(libfdt) $(libfdtheader) \
 		empty.c zImage.coff.lds zImage.ps3.lds zImage.lds
 
 quiet_cmd_bootcc = BOOTCC  $@
@@ -207,10 +226,14 @@ CROSSWRAP := -C "$(CROSS_COMPILE)"
 endif
 endif
 
+compressor-$(CONFIG_KERNEL_GZIP) := gz
+compressor-$(CONFIG_KERNEL_XZ)   := xz
+
 # args (to if_changed): 1 = (this rule), 2 = platform, 3 = dts 4=dtb 5=initrd
 quiet_cmd_wrap	= WRAP    $@
-      cmd_wrap	=$(CONFIG_SHELL) $(wrapper) -c -o $@ -p $2 $(CROSSWRAP) \
-		$(if $3, -s $3)$(if $4, -d $4)$(if $5, -i $5) vmlinux
+      cmd_wrap	=$(CONFIG_SHELL) $(wrapper) -Z $(compressor-y) -c -o $@ -p $2 \
+		$(CROSSWRAP) $(if $3, -s $3)$(if $4, -d $4)$(if $5, -i $5) \
+		vmlinux
 
 image-$(CONFIG_PPC_PSERIES)		+= zImage.pseries
 image-$(CONFIG_PPC_POWERNV)		+= zImage.pseries
@@ -391,9 +414,9 @@ image-y := vmlinux.strip
 endif
 
 $(obj)/zImage:		$(addprefix $(obj)/, $(image-y))
-	@rm -f $@; ln $< $@
+	$(Q)rm -f $@; ln $< $@
 $(obj)/zImage.initrd:	$(addprefix $(obj)/, $(initrd-y))
-	@rm -f $@; ln $< $@
+	$(Q)rm -f $@; ln $< $@
 
 # Only install the vmlinux
 install: $(CONFIGURE) $(addprefix $(obj)/, $(image-y))
@@ -410,8 +433,9 @@ clean-files += $(image-) $(initrd-) cuImage.* dtbImage.* treeImage.* \
 	zImage.maple simpleImage.* otheros.bld *.dtb
 
 # clean up files cached by wrapper
-clean-kernel := vmlinux.strip vmlinux.bin
-clean-kernel += $(addsuffix .gz,$(clean-kernel))
+clean-kernel-base := vmlinux.strip vmlinux.bin
+clean-kernel := $(addsuffix .gz,$(clean-kernel-base))
+clean-kernel += $(addsuffix .xz,$(clean-kernel-base))
 # If not absolute clean-files are relative to $(obj).
 clean-files += $(addprefix $(objtree)/, $(clean-kernel))
 

+ 0 - 1
arch/powerpc/boot/cuboot-c2k.c

@@ -18,7 +18,6 @@
 #include "io.h"
 #include "ops.h"
 #include "elf.h"
-#include "gunzip_util.h"
 #include "mv64x60.h"
 #include "cuboot.h"
 #include "ppcboot.h"

+ 148 - 0
arch/powerpc/boot/decompress.c

@@ -0,0 +1,148 @@
+/*
+ * Wrapper around the kernel's pre-boot decompression library.
+ *
+ * Copyright (C) IBM Corporation 2016.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include "elf.h"
+#include "page.h"
+#include "string.h"
+#include "stdio.h"
+#include "ops.h"
+#include "reg.h"
+#include "types.h"
+
+/*
+ * The decompressor_*.c files play #ifdef games so they can be used in both
+ * pre-boot and regular kernel code. We need these definitions to make the
+ * includes work.
+ */
+
+#define STATIC static
+#define INIT
+#define __always_inline inline
+
+/*
+ * The build process will copy the required zlib source files and headers
+ * out of lib/ and "fix" the includes so they do not pull in other kernel
+ * headers.
+ */
+
+#ifdef CONFIG_KERNEL_GZIP
+#	include "decompress_inflate.c"
+#endif
+
+#ifdef CONFIG_KERNEL_XZ
+#	include "xz_config.h"
+#	include "../../../lib/decompress_unxz.c"
+#endif
+
+/* globals for tracking the state of the decompression */
+static unsigned long decompressed_bytes;
+static unsigned long limit;
+static unsigned long skip;
+static char *output_buffer;
+
+/*
+ * flush() is called by __decompress() when the decompressor's scratch buffer is
+ * full.
+ */
+static long flush(void *v, unsigned long buffer_size)
+{
+	unsigned long end = decompressed_bytes + buffer_size;
+	unsigned long size = buffer_size;
+	unsigned long offset = 0;
+	char *in = v;
+	char *out;
+
+	/*
+	 * if we hit our decompression limit, we need to fake an error to abort
+	 * the in-progress decompression.
+	 */
+	if (decompressed_bytes >= limit)
+		return -1;
+
+	/* skip this entire block */
+	if (end <= skip) {
+		decompressed_bytes += buffer_size;
+		return buffer_size;
+	}
+
+	/* skip some data at the start, but keep the rest of the block */
+	if (decompressed_bytes < skip && end > skip) {
+		offset = skip - decompressed_bytes;
+
+		in += offset;
+		size -= offset;
+		decompressed_bytes += offset;
+	}
+
+	out = &output_buffer[decompressed_bytes - skip];
+	size = min(decompressed_bytes + size, limit) - decompressed_bytes;
+
+	memcpy(out, in, size);
+	decompressed_bytes += size;
+
+	return buffer_size;
+}
+
+static void print_err(char *s)
+{
+	/* suppress the "error" when we terminate the decompressor */
+	if (decompressed_bytes >= limit)
+		return;
+
+	printf("Decompression error: '%s'\n\r", s);
+}
+
+/**
+ * partial_decompress - decompresses part or all of a compressed buffer
+ * @inbuf:       input buffer
+ * @input_size:  length of the input buffer
+ * @outbuf:      output buffer
+ * @output_size: length of the output buffer
+ * @skip:        number of output bytes to ignore
+ *
+ * This function takes compressed data from inbuf, decompresses and writes it to
+ * outbuf. Once output_size bytes are written to the output buffer, or the
+ * stream is exhausted the function will return the number of bytes that were
+ * decompressed. Otherwise it will return whatever error code the decompressor
+ * reported (NB: This is specific to each decompressor type).
+ *
+ * The skip functionality is mainly there so the program can discover
+ * the size of the compressed image so that it can ask firmware (if present)
+ * for an appropriately sized buffer.
+ */
+long partial_decompress(void *inbuf, unsigned long input_size,
+	void *outbuf, unsigned long output_size, unsigned long _skip)
+{
+	int ret;
+
+	/*
+	 * The skipped bytes need to be included in the size of data we want
+	 * to decompress.
+	 */
+	output_size += _skip;
+
+	decompressed_bytes = 0;
+	output_buffer = outbuf;
+	limit = output_size;
+	skip = _skip;
+
+	ret = __decompress(inbuf, input_size, NULL, flush, outbuf,
+		output_size, NULL, print_err);
+
+	/*
+	 * If decompression was aborted due to an actual error rather than
+	 * a fake error that we used to abort, then we should report it.
+	 */
+	if (decompressed_bytes < limit)
+		return ret;
+
+	return decompressed_bytes - skip;
+}

+ 12 - 0
arch/powerpc/boot/fixup-headers.sed

@@ -0,0 +1,12 @@
+# Copyright 2016 IBM Corporation.
+#
+# This program is free software; you can redistribute it and/or modify it
+# under the terms of the GNU General Public License version 2 or later as
+# published by the Free Software Foundation.
+
+s@#include <linux/decompress/mm\.h>@@;
+s@\"zlib_inflate/\([^\"]*\).*@"\1"@;
+s@<linux/kernel.h>@<stddef.h>@;
+
+s@__used@@;
+s@<linux/\([^>]*\).*@"\1"@;

+ 0 - 204
arch/powerpc/boot/gunzip_util.c

@@ -1,204 +0,0 @@
-/*
- * Copyright 2007 David Gibson, IBM Corporation.
- * Based on earlier work, Copyright (C) Paul Mackerras 1997.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- */
-
-#include <stddef.h>
-#include "string.h"
-#include "stdio.h"
-#include "ops.h"
-#include "gunzip_util.h"
-
-#define HEAD_CRC	2
-#define EXTRA_FIELD	4
-#define ORIG_NAME	8
-#define COMMENT		0x10
-#define RESERVED	0xe0
-
-/**
- * gunzip_start - prepare to decompress gzip data
- * @state:     decompressor state structure to be initialized
- * @src:       buffer containing gzip compressed or uncompressed data
- * @srclen:    size in bytes of the buffer at src
- *
- * If the buffer at @src contains a gzip header, this function
- * initializes zlib to decompress the data, storing the decompression
- * state in @state.  The other functions in this file can then be used
- * to decompress data from the gzipped stream.
- *
- * If the buffer at @src does not contain a gzip header, it is assumed
- * to contain uncompressed data.  The buffer information is recorded
- * in @state and the other functions in this file will simply copy
- * data from the uncompressed data stream at @src.
- *
- * Any errors, such as bad compressed data, cause an error to be
- * printed an the platform's exit() function to be called.
- */
-void gunzip_start(struct gunzip_state *state, void *src, int srclen)
-{
-	char *hdr = src;
-	int hdrlen = 0;
-
-	memset(state, 0, sizeof(*state));
-
-	/* Check for gzip magic number */
-	if ((hdr[0] == 0x1f) && (hdr[1] == 0x8b)) {
-		/* gzip data, initialize zlib parameters */
-		int r, flags;
-
-		state->s.workspace = state->scratch;
-		if (zlib_inflate_workspacesize() > sizeof(state->scratch))
-			fatal("insufficient scratch space for gunzip\n\r");
-
-		/* skip header */
-		hdrlen = 10;
-		flags = hdr[3];
-		if (hdr[2] != Z_DEFLATED || (flags & RESERVED) != 0)
-			fatal("bad gzipped data\n\r");
-		if ((flags & EXTRA_FIELD) != 0)
-			hdrlen = 12 + hdr[10] + (hdr[11] << 8);
-		if ((flags & ORIG_NAME) != 0)
-			while (hdr[hdrlen++] != 0)
-				;
-		if ((flags & COMMENT) != 0)
-			while (hdr[hdrlen++] != 0)
-				;
-		if ((flags & HEAD_CRC) != 0)
-			hdrlen += 2;
-		if (hdrlen >= srclen)
-			fatal("gunzip_start: ran out of data in header\n\r");
-
-		r = zlib_inflateInit2(&state->s, -MAX_WBITS);
-		if (r != Z_OK)
-			fatal("inflateInit2 returned %d\n\r", r);
-	}
-
-	state->s.total_in = hdrlen;
-	state->s.next_in = src + hdrlen;
-	state->s.avail_in = srclen - hdrlen;
-}
-
-/**
- * gunzip_partial - extract bytes from a gzip data stream
- * @state:     gzip state structure previously initialized by gunzip_start()
- * @dst:       buffer to store extracted data
- * @dstlen:    maximum number of bytes to extract
- *
- * This function extracts at most @dstlen bytes from the data stream
- * previously associated with @state by gunzip_start(), decompressing
- * if necessary.  Exactly @dstlen bytes are extracted unless the data
- * stream doesn't contain enough bytes, in which case the entire
- * remainder of the stream is decompressed.
- *
- * Returns the actual number of bytes extracted.  If any errors occur,
- * such as a corrupted compressed stream, an error is printed an the
- * platform's exit() function is called.
- */
-int gunzip_partial(struct gunzip_state *state, void *dst, int dstlen)
-{
-	int len;
-
-	if (state->s.workspace) {
-		/* gunzipping */
-		int r;
-
-		state->s.next_out = dst;
-		state->s.avail_out = dstlen;
-		r = zlib_inflate(&state->s, Z_FULL_FLUSH);
-		if (r != Z_OK && r != Z_STREAM_END)
-			fatal("inflate returned %d msg: %s\n\r", r, state->s.msg);
-		len = state->s.next_out - (Byte *)dst;
-	} else {
-		/* uncompressed image */
-		len = min(state->s.avail_in, (uLong)dstlen);
-		memcpy(dst, state->s.next_in, len);
-		state->s.next_in += len;
-		state->s.avail_in -= len;
-	}
-	return len;
-}
-
-/**
- * gunzip_exactly - extract a fixed number of bytes from a gzip data stream
- * @state:     gzip state structure previously initialized by gunzip_start()
- * @dst:       buffer to store extracted data
- * @dstlen:    number of bytes to extract
- *
- * This function extracts exactly @dstlen bytes from the data stream
- * previously associated with @state by gunzip_start(), decompressing
- * if necessary.
- *
- * If there are less @dstlen bytes available in the data stream, or if
- * any other errors occur, such as a corrupted compressed stream, an
- * error is printed an the platform's exit() function is called.
- */
-void gunzip_exactly(struct gunzip_state *state, void *dst, int dstlen)
-{
-	int len;
-
-	len  = gunzip_partial(state, dst, dstlen);
-	if (len < dstlen)
-		fatal("\n\rgunzip_exactly: ran out of data!"
-				" Wanted %d, got %d.\n\r", dstlen, len);
-}
-
-/**
- * gunzip_discard - discard bytes from a gzip data stream
- * @state:     gzip state structure previously initialized by gunzip_start()
- * @len:       number of bytes to discard
- *
- * This function extracts, then discards exactly @len bytes from the
- * data stream previously associated with @state by gunzip_start().
- * Subsequent gunzip_partial(), gunzip_exactly() or gunzip_finish()
- * calls will extract the data following the discarded bytes in the
- * data stream.
- *
- * If there are less @len bytes available in the data stream, or if
- * any other errors occur, such as a corrupted compressed stream, an
- * error is printed an the platform's exit() function is called.
- */
-void gunzip_discard(struct gunzip_state *state, int len)
-{
-	static char discard_buf[128];
-
-	while (len > sizeof(discard_buf)) {
-		gunzip_exactly(state, discard_buf, sizeof(discard_buf));
-		len -= sizeof(discard_buf);
-	}
-
-	if (len > 0)
-		gunzip_exactly(state, discard_buf, len);
-}
-
-/**
- * gunzip_finish - extract all remaining bytes from a gzip data stream
- * @state:     gzip state structure previously initialized by gunzip_start()
- * @dst:       buffer to store extracted data
- * @dstlen:    maximum number of bytes to extract
- *
- * This function extracts all remaining data, or at most @dstlen
- * bytes, from the stream previously associated with @state by
- * gunzip_start().  zlib is then shut down, so it is an error to use
- * any of the functions in this file on @state until it is
- * re-initialized with another call to gunzip_start().
- *
- * If any errors occur, such as a corrupted compressed stream, an
- * error is printed an the platform's exit() function is called.
- */
-int gunzip_finish(struct gunzip_state *state, void *dst, int dstlen)
-{
-	int len;
-
-	len = gunzip_partial(state, dst, dstlen);
-
-	if (state->s.workspace) {
-		zlib_inflateEnd(&state->s);
-	}
-
-	return len;
-}

+ 0 - 45
arch/powerpc/boot/gunzip_util.h

@@ -1,45 +0,0 @@
-/*
- * Decompression convenience functions
- *
- * Copyright 2007 David Gibson, IBM Corporation.
- *
- * This file is licensed under the terms of the GNU General Public
- * License version 2.  This program is licensed "as is" without any
- * warranty of any kind, whether express or implied.
- */
-#ifndef _PPC_BOOT_GUNZIP_UTIL_H_
-#define _PPC_BOOT_GUNZIP_UTIL_H_
-
-#include "zlib.h"
-
-/*
- * These functions are designed to make life easy for decompressing
- * kernel images, initrd images or any other gzip compressed image,
- * particularly if its useful to decompress part of the image (e.g. to
- * examine headers) before decompressing the remainder.
- *
- * To use:
- *     - declare a gunzip_state structure
- *     - use gunzip_start() to initialize the state, associating it
- *       with a stream of compressed data
- *     - use gunzip_partial(), gunzip_exactly() and gunzip_discard()
- *       in any combination to extract pieces of data from the stream
- *     - Finally use gunzip_finish() to extract the tail of the
- *       compressed stream and wind up zlib
- */
-
-/* scratch space for gunzip; 46912 is from zlib_inflate_workspacesize() */
-#define GUNZIP_SCRATCH_SIZE	46912
-
-struct gunzip_state {
-	z_stream s;
-	char scratch[46912];
-};
-
-void gunzip_start(struct gunzip_state *state, void *src, int srclen);
-int gunzip_partial(struct gunzip_state *state, void *dst, int dstlen);
-void gunzip_exactly(struct gunzip_state *state, void *dst, int len);
-void gunzip_discard(struct gunzip_state *state, int len);
-int gunzip_finish(struct gunzip_state *state, void *dst, int len);
-
-#endif /* _PPC_BOOT_GUNZIP_UTIL_H_ */

+ 18 - 17
arch/powerpc/boot/main.c

@@ -15,11 +15,8 @@
 #include "string.h"
 #include "stdio.h"
 #include "ops.h"
-#include "gunzip_util.h"
 #include "reg.h"
 
-static struct gunzip_state gzstate;
-
 struct addr_range {
 	void *addr;
 	unsigned long size;
@@ -30,15 +27,14 @@ struct addr_range {
 static struct addr_range prep_kernel(void)
 {
 	char elfheader[256];
-	void *vmlinuz_addr = _vmlinux_start;
+	unsigned char *vmlinuz_addr = (unsigned char *)_vmlinux_start;
 	unsigned long vmlinuz_size = _vmlinux_end - _vmlinux_start;
 	void *addr = 0;
 	struct elf_info ei;
-	int len;
+	long len;
 
-	/* gunzip the ELF header of the kernel */
-	gunzip_start(&gzstate, vmlinuz_addr, vmlinuz_size);
-	gunzip_exactly(&gzstate, elfheader, sizeof(elfheader));
+	partial_decompress(vmlinuz_addr, vmlinuz_size,
+		elfheader, sizeof(elfheader), 0);
 
 	if (!parse_elf64(elfheader, &ei) && !parse_elf32(elfheader, &ei))
 		fatal("Error: not a valid PPC32 or PPC64 ELF file!\n\r");
@@ -51,7 +47,7 @@ static struct addr_range prep_kernel(void)
 	 * the kernel bss must be claimed (it will be zero'd by the
 	 * kernel itself)
 	 */
-	printf("Allocating 0x%lx bytes for kernel ...\n\r", ei.memsize);
+	printf("Allocating 0x%lx bytes for kernel...\n\r", ei.memsize);
 
 	if (platform_ops.vmlinux_alloc) {
 		addr = platform_ops.vmlinux_alloc(ei.memsize);
@@ -71,16 +67,21 @@ static struct addr_range prep_kernel(void)
 					"device tree\n\r");
 	}
 
-	/* Finally, gunzip the kernel */
-	printf("gunzipping (0x%p <- 0x%p:0x%p)...", addr,
+	/* Finally, decompress the kernel */
+	printf("Decompressing (0x%p <- 0x%p:0x%p)...\n\r", addr,
 	       vmlinuz_addr, vmlinuz_addr+vmlinuz_size);
-	/* discard up to the actual load data */
-	gunzip_discard(&gzstate, ei.elfoffset - sizeof(elfheader));
-	len = gunzip_finish(&gzstate, addr, ei.loadsize);
+
+	len = partial_decompress(vmlinuz_addr, vmlinuz_size,
+		addr, ei.loadsize, ei.elfoffset);
+
+	if (len < 0)
+		fatal("Decompression failed with error code %ld\n\r", len);
+
 	if (len != ei.loadsize)
-		fatal("ran out of data!  only got 0x%x of 0x%lx bytes.\n\r",
-				len, ei.loadsize);
-	printf("done 0x%x bytes\n\r", len);
+		 fatal("Decompression error: got 0x%lx bytes, expected 0x%lx.\n\r",
+			 len, ei.loadsize);
+
+	printf("Done! Decompressed 0x%lx bytes\n\r", len);
 
 	flush_cache(addr, ei.loadsize);
 

+ 3 - 0
arch/powerpc/boot/ops.h

@@ -260,4 +260,7 @@ int __ilog2_u32(u32 n)
 	return 31 - bit;
 }
 
+long partial_decompress(void *inbuf, unsigned long input_size, void *outbuf,
+	unsigned long output_size, unsigned long skip);
+
 #endif /* _PPC_BOOT_OPS_H_ */

+ 14 - 0
arch/powerpc/boot/stdbool.h

@@ -0,0 +1,14 @@
+/*
+ * Copyright (C) IBM Corporation 2016.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * This file is only necessary because some of the pre-boot decompressors
+ * expect stdbool.h to be available.
+ *
+ */
+
+#include "types.h"

+ 13 - 0
arch/powerpc/boot/stdint.h

@@ -0,0 +1,13 @@
+/*
+ * Copyright (C) IBM Corporation 2016.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * This file is only necessary because some of the pre-boot decompressors
+ * expect stdint.h to be available.
+ */
+
+#include "types.h"

+ 14 - 0
arch/powerpc/boot/types.h

@@ -1,6 +1,8 @@
 #ifndef _TYPES_H_
 #define _TYPES_H_
 
+#include <stdbool.h>
+
 #define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0]))
 
 typedef unsigned char		u8;
@@ -34,4 +36,16 @@ typedef s64 int64_t;
 	(void) (&_x == &_y);	\
 	_x > _y ? _x : _y; })
 
+/*
+ * min_t/max_t - compare a and b after casting both to @type.
+ * The arguments are parenthesised so the cast applies to the whole
+ * argument expression (e.g. "x + y"), not just its first operand.
+ */
+#define min_t(type, a, b) min(((type)(a)), ((type)(b)))
+#define max_t(type, a, b) max(((type)(a)), ((type)(b)))
+
+typedef int bool;
+
+#ifndef true
+#define true 1
+#endif
+
+#ifndef false
+#define false 0
+#endif
 #endif /* _TYPES_H_ */

+ 45 - 16
arch/powerpc/boot/wrapper

@@ -20,6 +20,8 @@
 # -D dir	specify directory containing data files used by script
 #		(default ./arch/powerpc/boot)
 # -W dir	specify working directory for temporary files (default .)
+# -z		use gzip (legacy)
+# -Z zsuffix    compression to use (gz, xz or none)
 
 # Stop execution if any command fails
 set -e
@@ -38,7 +40,7 @@ dtb=
 dts=
 cacheit=
 binary=
-gzip=.gz
+compression=.gz
 pie=
 format=
 
@@ -59,7 +61,8 @@ tmpdir=.
 usage() {
     echo 'Usage: wrapper [-o output] [-p platform] [-i initrd]' >&2
     echo '       [-d devtree] [-s tree.dts] [-c] [-C cross-prefix]' >&2
-    echo '       [-D datadir] [-W workingdir] [--no-gzip] [vmlinux]' >&2
+    echo '       [-D datadir] [-W workingdir] [-Z (gz|xz|none)]' >&2
+    echo '       [--no-compression] [vmlinux]' >&2
     exit 1
 }
 
@@ -126,8 +129,24 @@ while [ "$#" -gt 0 ]; do
 	[ "$#" -gt 0 ] || usage
 	tmpdir="$1"
 	;;
+    -z)
+	compression=.gz
+	;;
+    -Z)
+	shift
+	[ "$#" -gt 0 ] || usage
+        # Only gz, xz and none are accepted; anything else is a usage error.
+        [ "$1" = "gz" -o "$1" = "xz" -o "$1" = "none" ] || usage
+
+	compression=".$1"
+
+        # "none" means no compression suffix at all
+        if [ "$compression" = ".none" ]; then
+                compression=
+        fi
+	;;
     --no-gzip)
-        gzip=
+        # a "feature" of the wrapper script is that it can be used outside
+        # the kernel tree. So keeping this around for backwards compatibility.
+        compression=
         ;;
     -?)
 	usage
@@ -140,6 +159,7 @@ while [ "$#" -gt 0 ]; do
     shift
 done
 
+
 if [ -n "$dts" ]; then
     if [ ! -r "$dts" -a -r "$object/dts/$dts" ]; then
 	dts="$object/dts/$dts"
@@ -212,7 +232,7 @@ miboot|uboot*)
     ;;
 cuboot*)
     binary=y
-    gzip=
+    compression=
     case "$platform" in
     *-mpc866ads|*-mpc885ads|*-adder875*|*-ep88xc)
         platformo=$object/cuboot-8xx.o
@@ -243,7 +263,7 @@ cuboot*)
 ps3)
     platformo="$object/ps3-head.o $object/ps3-hvcall.o $object/ps3.o"
     lds=$object/zImage.ps3.lds
-    gzip=
+    compression=
     ext=bin
     objflags="-O binary --set-section-flags=.bss=contents,alloc,load,data"
     ksection=.kernel:vmlinux.bin
@@ -310,27 +330,37 @@ mvme7100)
 esac
 
 vmz="$tmpdir/`basename \"$kernel\"`.$ext"
-if [ -z "$cacheit" -o ! -f "$vmz$gzip" -o "$vmz$gzip" -ot "$kernel" ]; then
-    ${CROSS}objcopy $objflags "$kernel" "$vmz.$$"
 
-    strip_size=$(stat -c %s $vmz.$$)
+# Calculate the vmlinux.strip size
+${CROSS}objcopy $objflags "$kernel" "$vmz.$$"
+strip_size=$(stat -c %s $vmz.$$)
 
-    if [ -n "$gzip" ]; then
+if [ -z "$cacheit" -o ! -f "$vmz$compression" -o "$vmz$compression" -ot "$kernel" ]; then
+    # recompress the image if we need to
+    case $compression in
+    .xz)
+        xz --check=crc32 -f -6 "$vmz.$$"
+        ;;
+    .gz)
         gzip -n -f -9 "$vmz.$$"
-    fi
+        ;;
+    *)
+        # drop the compression suffix so the stripped vmlinux is used
+        compression=
+	;;
+    esac
 
     if [ -n "$cacheit" ]; then
-	mv -f "$vmz.$$$gzip" "$vmz$gzip"
+	mv -f "$vmz.$$$compression" "$vmz$compression"
     else
 	vmz="$vmz.$$"
     fi
 else
-    # Calculate the vmlinux.strip size
-    ${CROSS}objcopy $objflags "$kernel" "$vmz.$$"
-    strip_size=$(stat -c %s $vmz.$$)
     rm -f $vmz.$$
 fi
 
+vmz="$vmz$compression"
+
 if [ "$make_space" = "y" ]; then
 	# Round the size to next higher MB limit
 	round_size=$(((strip_size + 0xfffff) & 0xfff00000))
@@ -346,8 +376,6 @@ if [ "$make_space" = "y" ]; then
 	fi
 fi
 
-vmz="$vmz$gzip"
-
 # Extract kernel version information, some platforms want to include
 # it in the image header
 version=`${CROSS}strings "$kernel" | grep '^Linux version [-0-9.]' | \
@@ -417,6 +445,7 @@ if [ "$platform" != "miboot" ]; then
     if [ -n "$link_address" ] ; then
         text_start="-Ttext $link_address"
     fi
+#link everything
     ${CROSS}ld -m $format -T $lds $text_start $pie -o "$ofile" \
 	$platformo $tmp $object/wrapper.a
     rm $tmp

+ 39 - 0
arch/powerpc/boot/xz_config.h

@@ -0,0 +1,39 @@
+#ifndef __XZ_CONFIG_H__
+#define __XZ_CONFIG_H__
+
+/*
+ * most of this is copied from lib/xz/xz_private.h, we can't use their defines
+ * since the boot wrapper is not built in the same environment as the rest of
+ * the kernel.
+ */
+
+#include "types.h"
+#include "swab.h"
+
+/* Read the 32-bit word that p points at and return swab32() of it. */
+static inline uint32_t swab32p(void *p)
+{
+	return swab32(*(uint32_t *)p);
+}
+
+#ifdef __LITTLE_ENDIAN__
+#define get_le32(p) (*((uint32_t *) (p)))
+#else
+#define get_le32(p) swab32p(p)
+#endif
+
+#define memeq(a, b, size) (memcmp(a, b, size) == 0)
+#define memzero(buf, size) memset(buf, 0, size)
+
+/* prevent the inclusion of the xz-preboot MM headers */
+#define DECOMPR_MM_H
+#define memmove memmove
+#define XZ_EXTERN static
+
+/* xz.h needs to be included directly since we need enum xz_mode */
+#include "../../../include/linux/xz.h"
+
+#undef XZ_EXTERN
+
+#endif

+ 12 - 7
arch/powerpc/configs/powernv_defconfig

@@ -15,6 +15,8 @@ CONFIG_TASK_XACCT=y
 CONFIG_TASK_IO_ACCOUNTING=y
 CONFIG_IKCONFIG=y
 CONFIG_IKCONFIG_PROC=y
+CONFIG_LOG_BUF_SHIFT=18
+CONFIG_LOG_CPU_MAX_BUF_SHIFT=13
 CONFIG_NUMA_BALANCING=y
 CONFIG_CGROUPS=y
 CONFIG_MEMCG=y
@@ -95,7 +97,7 @@ CONFIG_BLK_DEV_IDECD=y
 CONFIG_BLK_DEV_GENERIC=y
 CONFIG_BLK_DEV_AMD74XX=y
 CONFIG_BLK_DEV_SD=y
-CONFIG_CHR_DEV_ST=y
+CONFIG_CHR_DEV_ST=m
 CONFIG_BLK_DEV_SR=y
 CONFIG_BLK_DEV_SR_VENDOR=y
 CONFIG_CHR_DEV_SG=y
@@ -107,7 +109,7 @@ CONFIG_SCSI_CXGB4_ISCSI=m
 CONFIG_SCSI_BNX2_ISCSI=m
 CONFIG_BE2ISCSI=m
 CONFIG_SCSI_MPT2SAS=m
-CONFIG_SCSI_SYM53C8XX_2=y
+CONFIG_SCSI_SYM53C8XX_2=m
 CONFIG_SCSI_SYM53C8XX_DMA_ADDRESSING_MODE=0
 CONFIG_SCSI_IPR=y
 CONFIG_SCSI_QLA_FC=m
@@ -149,10 +151,10 @@ CONFIG_TUN=m
 CONFIG_VETH=m
 CONFIG_VIRTIO_NET=m
 CONFIG_VHOST_NET=m
-CONFIG_VORTEX=y
+CONFIG_VORTEX=m
 CONFIG_ACENIC=m
 CONFIG_ACENIC_OMIT_TIGON_I=y
-CONFIG_PCNET32=y
+CONFIG_PCNET32=m
 CONFIG_TIGON3=y
 CONFIG_BNX2X=m
 CONFIG_CHELSIO_T1=m
@@ -163,6 +165,7 @@ CONFIG_E1000=y
 CONFIG_E1000E=y
 CONFIG_IXGB=m
 CONFIG_IXGBE=m
+CONFIG_I40E=m
 CONFIG_MLX4_EN=m
 CONFIG_MYRI10GE=m
 CONFIG_QLGE=m
@@ -238,7 +241,7 @@ CONFIG_EXT2_FS_SECURITY=y
 CONFIG_EXT4_FS=y
 CONFIG_EXT4_FS_POSIX_ACL=y
 CONFIG_EXT4_FS_SECURITY=y
-CONFIG_REISERFS_FS=y
+CONFIG_REISERFS_FS=m
 CONFIG_REISERFS_FS_XATTR=y
 CONFIG_REISERFS_FS_POSIX_ACL=y
 CONFIG_REISERFS_FS_SECURITY=y
@@ -253,10 +256,10 @@ CONFIG_NILFS2_FS=m
 CONFIG_AUTOFS4_FS=m
 CONFIG_FUSE_FS=m
 CONFIG_OVERLAY_FS=m
-CONFIG_ISO9660_FS=y
+CONFIG_ISO9660_FS=m
 CONFIG_UDF_FS=m
 CONFIG_MSDOS_FS=y
-CONFIG_VFAT_FS=y
+CONFIG_VFAT_FS=m
 CONFIG_PROC_KCORE=y
 CONFIG_TMPFS=y
 CONFIG_TMPFS_POSIX_ACL=y
@@ -310,6 +313,8 @@ CONFIG_CRYPTO_TEA=m
 CONFIG_CRYPTO_TWOFISH=m
 CONFIG_CRYPTO_LZO=m
 CONFIG_CRYPTO_DEV_NX=y
+CONFIG_CRYPTO_DEV_VMX=y
+CONFIG_CRYPTO_DEV_VMX_ENCRYPT=m
 CONFIG_VIRTUALIZATION=y
 CONFIG_KVM_BOOK3S_64=m
 CONFIG_KVM_BOOK3S_64_HV=m

+ 12 - 7
arch/powerpc/configs/ppc64_defconfig

@@ -10,6 +10,8 @@ CONFIG_TASKSTATS=y
 CONFIG_TASK_DELAY_ACCT=y
 CONFIG_IKCONFIG=y
 CONFIG_IKCONFIG_PROC=y
+CONFIG_LOG_BUF_SHIFT=18
+CONFIG_LOG_CPU_MAX_BUF_SHIFT=13
 CONFIG_CGROUPS=y
 CONFIG_CPUSETS=y
 CONFIG_BLK_DEV_INITRD=y
@@ -90,7 +92,7 @@ CONFIG_BLK_DEV_AMD74XX=y
 CONFIG_BLK_DEV_IDE_PMAC=y
 CONFIG_BLK_DEV_IDE_PMAC_ATA100FIRST=y
 CONFIG_BLK_DEV_SD=y
-CONFIG_CHR_DEV_ST=y
+CONFIG_CHR_DEV_ST=m
 CONFIG_BLK_DEV_SR=y
 CONFIG_BLK_DEV_SR_VENDOR=y
 CONFIG_CHR_DEV_SG=y
@@ -103,7 +105,7 @@ CONFIG_BE2ISCSI=m
 CONFIG_SCSI_MPT2SAS=m
 CONFIG_SCSI_IBMVSCSI=y
 CONFIG_SCSI_IBMVFC=m
-CONFIG_SCSI_SYM53C8XX_2=y
+CONFIG_SCSI_SYM53C8XX_2=m
 CONFIG_SCSI_SYM53C8XX_DMA_ADDRESSING_MODE=0
 CONFIG_SCSI_IPR=y
 CONFIG_SCSI_QLA_FC=m
@@ -149,10 +151,10 @@ CONFIG_NETCONSOLE=y
 CONFIG_TUN=m
 CONFIG_VIRTIO_NET=m
 CONFIG_VHOST_NET=m
-CONFIG_VORTEX=y
+CONFIG_VORTEX=m
 CONFIG_ACENIC=m
 CONFIG_ACENIC_OMIT_TIGON_I=y
-CONFIG_PCNET32=y
+CONFIG_PCNET32=m
 CONFIG_TIGON3=y
 CONFIG_BNX2X=m
 CONFIG_CHELSIO_T1=m
@@ -165,6 +167,7 @@ CONFIG_E1000=y
 CONFIG_E1000E=y
 CONFIG_IXGB=m
 CONFIG_IXGBE=m
+CONFIG_I40E=m
 CONFIG_MLX4_EN=m
 CONFIG_MYRI10GE=m
 CONFIG_PASEMI_MAC=y
@@ -269,7 +272,7 @@ CONFIG_EXT2_FS_SECURITY=y
 CONFIG_EXT4_FS=y
 CONFIG_EXT4_FS_POSIX_ACL=y
 CONFIG_EXT4_FS_SECURITY=y
-CONFIG_REISERFS_FS=y
+CONFIG_REISERFS_FS=m
 CONFIG_REISERFS_FS_XATTR=y
 CONFIG_REISERFS_FS_POSIX_ACL=y
 CONFIG_REISERFS_FS_SECURITY=y
@@ -284,10 +287,10 @@ CONFIG_NILFS2_FS=m
 CONFIG_AUTOFS4_FS=m
 CONFIG_FUSE_FS=m
 CONFIG_OVERLAY_FS=m
-CONFIG_ISO9660_FS=y
+CONFIG_ISO9660_FS=m
 CONFIG_UDF_FS=m
 CONFIG_MSDOS_FS=y
-CONFIG_VFAT_FS=y
+CONFIG_VFAT_FS=m
 CONFIG_PROC_KCORE=y
 CONFIG_TMPFS=y
 CONFIG_TMPFS_POSIX_ACL=y
@@ -347,6 +350,8 @@ CONFIG_CRYPTO_LZO=m
 # CONFIG_CRYPTO_ANSI_CPRNG is not set
 CONFIG_CRYPTO_DEV_NX=y
 CONFIG_CRYPTO_DEV_NX_ENCRYPT=m
+CONFIG_CRYPTO_DEV_VMX=y
+CONFIG_CRYPTO_DEV_VMX_ENCRYPT=m
 CONFIG_VIRTUALIZATION=y
 CONFIG_KVM_BOOK3S_64=m
 CONFIG_KVM_BOOK3S_64_HV=m

+ 12 - 7
arch/powerpc/configs/pseries_defconfig

@@ -15,6 +15,8 @@ CONFIG_TASK_XACCT=y
 CONFIG_TASK_IO_ACCOUNTING=y
 CONFIG_IKCONFIG=y
 CONFIG_IKCONFIG_PROC=y
+CONFIG_LOG_BUF_SHIFT=18
+CONFIG_LOG_CPU_MAX_BUF_SHIFT=13
 CONFIG_NUMA_BALANCING=y
 CONFIG_NUMA_BALANCING_DEFAULT_ENABLED=y
 CONFIG_CGROUPS=y
@@ -95,7 +97,7 @@ CONFIG_BLK_DEV_IDECD=y
 CONFIG_BLK_DEV_GENERIC=y
 CONFIG_BLK_DEV_AMD74XX=y
 CONFIG_BLK_DEV_SD=y
-CONFIG_CHR_DEV_ST=y
+CONFIG_CHR_DEV_ST=m
 CONFIG_BLK_DEV_SR=y
 CONFIG_BLK_DEV_SR_VENDOR=y
 CONFIG_CHR_DEV_SG=y
@@ -108,7 +110,7 @@ CONFIG_BE2ISCSI=m
 CONFIG_SCSI_MPT2SAS=m
 CONFIG_SCSI_IBMVSCSI=y
 CONFIG_SCSI_IBMVFC=m
-CONFIG_SCSI_SYM53C8XX_2=y
+CONFIG_SCSI_SYM53C8XX_2=m
 CONFIG_SCSI_SYM53C8XX_DMA_ADDRESSING_MODE=0
 CONFIG_SCSI_IPR=y
 CONFIG_SCSI_QLA_FC=m
@@ -150,10 +152,10 @@ CONFIG_TUN=m
 CONFIG_VETH=m
 CONFIG_VIRTIO_NET=m
 CONFIG_VHOST_NET=m
-CONFIG_VORTEX=y
+CONFIG_VORTEX=m
 CONFIG_ACENIC=m
 CONFIG_ACENIC_OMIT_TIGON_I=y
-CONFIG_PCNET32=y
+CONFIG_PCNET32=m
 CONFIG_TIGON3=y
 CONFIG_BNX2X=m
 CONFIG_CHELSIO_T1=m
@@ -166,6 +168,7 @@ CONFIG_E1000=y
 CONFIG_E1000E=y
 CONFIG_IXGB=m
 CONFIG_IXGBE=m
+CONFIG_I40E=m
 CONFIG_MLX4_EN=m
 CONFIG_MYRI10GE=m
 CONFIG_QLGE=m
@@ -241,7 +244,7 @@ CONFIG_EXT2_FS_SECURITY=y
 CONFIG_EXT4_FS=y
 CONFIG_EXT4_FS_POSIX_ACL=y
 CONFIG_EXT4_FS_SECURITY=y
-CONFIG_REISERFS_FS=y
+CONFIG_REISERFS_FS=m
 CONFIG_REISERFS_FS_XATTR=y
 CONFIG_REISERFS_FS_POSIX_ACL=y
 CONFIG_REISERFS_FS_SECURITY=y
@@ -256,10 +259,10 @@ CONFIG_NILFS2_FS=m
 CONFIG_AUTOFS4_FS=m
 CONFIG_FUSE_FS=m
 CONFIG_OVERLAY_FS=m
-CONFIG_ISO9660_FS=y
+CONFIG_ISO9660_FS=m
 CONFIG_UDF_FS=m
 CONFIG_MSDOS_FS=y
-CONFIG_VFAT_FS=y
+CONFIG_VFAT_FS=m
 CONFIG_PROC_KCORE=y
 CONFIG_TMPFS=y
 CONFIG_TMPFS_POSIX_ACL=y
@@ -314,6 +317,8 @@ CONFIG_CRYPTO_LZO=m
 # CONFIG_CRYPTO_ANSI_CPRNG is not set
 CONFIG_CRYPTO_DEV_NX=y
 CONFIG_CRYPTO_DEV_NX_ENCRYPT=m
+CONFIG_CRYPTO_DEV_VMX=y
+CONFIG_CRYPTO_DEV_VMX_ENCRYPT=m
 CONFIG_VIRTUALIZATION=y
 CONFIG_KVM_BOOK3S_64=m
 CONFIG_KVM_BOOK3S_64_HV=m

+ 40 - 3
arch/powerpc/include/asm/asm-prototypes.h

@@ -15,6 +15,8 @@
 #include <linux/threads.h>
 #include <linux/kprobes.h>
 
+#include <uapi/asm/ucontext.h>
+
 /* SMP */
 extern struct thread_info *current_set[NR_CPUS];
 extern struct thread_info *secondary_ti;
@@ -52,8 +54,8 @@ void SMIException(struct pt_regs *regs);
 void handle_hmi_exception(struct pt_regs *regs);
 void instruction_breakpoint_exception(struct pt_regs *regs);
 void RunModeException(struct pt_regs *regs);
-void __kprobes single_step_exception(struct pt_regs *regs);
-void __kprobes program_check_exception(struct pt_regs *regs);
+void single_step_exception(struct pt_regs *regs);
+void program_check_exception(struct pt_regs *regs);
 void alignment_exception(struct pt_regs *regs);
 void StackOverflow(struct pt_regs *regs);
 void nonrecoverable_exception(struct pt_regs *regs);
@@ -70,6 +72,41 @@ void unrecoverable_exception(struct pt_regs *regs);
 void kernel_bad_stack(struct pt_regs *regs);
 void system_reset_exception(struct pt_regs *regs);
 void machine_check_exception(struct pt_regs *regs);
-void __kprobes emulation_assist_interrupt(struct pt_regs *regs);
+void emulation_assist_interrupt(struct pt_regs *regs);
+
+/* signals, syscalls and interrupts */
+#ifdef CONFIG_PPC64
+int sys_swapcontext(struct ucontext __user *old_ctx,
+		    struct ucontext __user *new_ctx,
+		    long ctx_size, long r6, long r7, long r8, struct pt_regs *regs);
+#else
+long sys_swapcontext(struct ucontext __user *old_ctx,
+		    struct ucontext __user *new_ctx,
+		    int ctx_size, int r6, int r7, int r8, struct pt_regs *regs);
+#endif
+long sys_switch_endian(void);
+notrace unsigned int __check_irq_replay(void);
+void notrace restore_interrupts(void);
+
+/* ptrace */
+long do_syscall_trace_enter(struct pt_regs *regs);
+void do_syscall_trace_leave(struct pt_regs *regs);
+
+/* process */
+void restore_math(struct pt_regs *regs);
+void restore_tm_state(struct pt_regs *regs);
+
+/* prom_init (OpenFirmware) */
+unsigned long __init prom_init(unsigned long r3, unsigned long r4,
+			       unsigned long pp,
+			       unsigned long r6, unsigned long r7,
+			       unsigned long kbase);
+
+/* setup */
+void __init early_setup(unsigned long dt_ptr);
+void early_setup_secondary(void);
+
+/* time */
+void accumulate_stolen_time(void);
 
 #endif /* _ASM_POWERPC_ASM_PROTOTYPES_H */

+ 2 - 2
arch/powerpc/include/asm/atomic.h

@@ -233,7 +233,7 @@ static __inline__ int __atomic_add_unless(atomic_t *v, int a, int u)
 	PPC_ATOMIC_ENTRY_BARRIER
 "1:	lwarx	%0,0,%1		# __atomic_add_unless\n\
 	cmpw	0,%0,%3 \n\
-	beq-	2f \n\
+	beq	2f \n\
 	add	%0,%2,%0 \n"
 	PPC405_ERR77(0,%2)
 "	stwcx.	%0,0,%1 \n\
@@ -539,7 +539,7 @@ static __inline__ int atomic64_add_unless(atomic64_t *v, long a, long u)
 	PPC_ATOMIC_ENTRY_BARRIER
 "1:	ldarx	%0,0,%1		# __atomic_add_unless\n\
 	cmpd	0,%0,%3 \n\
-	beq-	2f \n\
+	beq	2f \n\
 	add	%0,%2,%0 \n"
 "	stdcx.	%0,0,%1 \n\
 	bne-	1b \n"

+ 2 - 1
arch/powerpc/include/asm/book3s/32/pgtable.h

@@ -223,7 +223,8 @@ static inline void huge_ptep_set_wrprotect(struct mm_struct *mm,
 }
 
 
-static inline void __ptep_set_access_flags(pte_t *ptep, pte_t entry)
+static inline void __ptep_set_access_flags(struct mm_struct *mm,
+					   pte_t *ptep, pte_t entry)
 {
 	unsigned long set = pte_val(entry) &
 		(_PAGE_DIRTY | _PAGE_ACCESSED | _PAGE_RW | _PAGE_EXEC);

+ 5 - 2
arch/powerpc/include/asm/book3s/64/pgtable.h

@@ -6,6 +6,8 @@
  */
 #define _PAGE_BIT_SWAP_TYPE	0
 
+#define _PAGE_RO		0
+
 #define _PAGE_EXEC		0x00001 /* execute permission */
 #define _PAGE_WRITE		0x00002 /* write access allowed */
 #define _PAGE_READ		0x00004	/* read access allowed */
@@ -565,10 +567,11 @@ static inline bool check_pte_access(unsigned long access, unsigned long ptev)
  * Generic functions with hash/radix callbacks
  */
 
-static inline void __ptep_set_access_flags(pte_t *ptep, pte_t entry)
+static inline void __ptep_set_access_flags(struct mm_struct *mm,
+					   pte_t *ptep, pte_t entry)
 {
 	if (radix_enabled())
-		return radix__ptep_set_access_flags(ptep, entry);
+		return radix__ptep_set_access_flags(mm, ptep, entry);
 	return hash__ptep_set_access_flags(ptep, entry);
 }
 

+ 69 - 19
arch/powerpc/include/asm/book3s/64/radix.h

@@ -11,6 +11,11 @@
 #include <asm/book3s/64/radix-4k.h>
 #endif
 
+#ifndef __ASSEMBLY__
+#include <asm/book3s/64/tlbflush-radix.h>
+#include <asm/cpu_has_feature.h>
+#endif
+
 /* An empty PTE can still have a R or C writeback */
 #define RADIX_PTE_NONE_MASK		(_PAGE_DIRTY | _PAGE_ACCESSED)
 
@@ -105,11 +110,8 @@
 #define RADIX_PUD_TABLE_SIZE	(sizeof(pud_t) << RADIX_PUD_INDEX_SIZE)
 #define RADIX_PGD_TABLE_SIZE	(sizeof(pgd_t) << RADIX_PGD_INDEX_SIZE)
 
-static inline unsigned long radix__pte_update(struct mm_struct *mm,
-					unsigned long addr,
-					pte_t *ptep, unsigned long clr,
-					unsigned long set,
-					int huge)
+static inline unsigned long __radix_pte_update(pte_t *ptep, unsigned long clr,
+					       unsigned long set)
 {
 	pte_t pte;
 	unsigned long old_pte, new_pte;
@@ -121,9 +123,39 @@ static inline unsigned long radix__pte_update(struct mm_struct *mm,
 
 	} while (!pte_xchg(ptep, __pte(old_pte), __pte(new_pte)));
 
-	/* We already do a sync in cmpxchg, is ptesync needed ?*/
+	return old_pte;
+}
+
+
+static inline unsigned long radix__pte_update(struct mm_struct *mm,
+					unsigned long addr,
+					pte_t *ptep, unsigned long clr,
+					unsigned long set,
+					int huge)
+{
+	unsigned long old_pte;
+
+	if (cpu_has_feature(CPU_FTR_POWER9_DD1)) {
+
+		unsigned long new_pte;
+
+		old_pte = __radix_pte_update(ptep, ~0, 0);
+		asm volatile("ptesync" : : : "memory");
+		/*
+		 * new value of pte
+		 */
+		new_pte = (old_pte | set) & ~clr;
+
+		/*
+		 * For now let's do heavy pid flush
+		 * radix__flush_tlb_page_psize(mm, addr, mmu_virtual_psize);
+		 */
+		radix__flush_tlb_mm(mm);
+
+		__radix_pte_update(ptep, 0, new_pte);
+	} else
+		old_pte = __radix_pte_update(ptep, clr, set);
 	asm volatile("ptesync" : : : "memory");
-	/* huge pages use the old page table lock */
 	if (!huge)
 		assert_pte_locked(mm, addr);
 
@@ -134,20 +166,33 @@ static inline unsigned long radix__pte_update(struct mm_struct *mm,
  * Set the dirty and/or accessed bits atomically in a linux PTE, this
  * function doesn't need to invalidate tlb.
  */
-static inline void radix__ptep_set_access_flags(pte_t *ptep, pte_t entry)
+static inline void radix__ptep_set_access_flags(struct mm_struct *mm,
+						pte_t *ptep, pte_t entry)
 {
-	pte_t pte;
-	unsigned long old_pte, new_pte;
+
 	unsigned long set = pte_val(entry) & (_PAGE_DIRTY | _PAGE_ACCESSED |
 					      _PAGE_RW | _PAGE_EXEC);
-	do {
-		pte = READ_ONCE(*ptep);
-		old_pte = pte_val(pte);
+
+	if (cpu_has_feature(CPU_FTR_POWER9_DD1)) {
+
+		unsigned long old_pte, new_pte;
+
+		old_pte = __radix_pte_update(ptep, ~0, 0);
+		asm volatile("ptesync" : : : "memory");
+		/*
+		 * new value of pte
+		 */
 		new_pte = old_pte | set;
 
-	} while (!pte_xchg(ptep, __pte(old_pte), __pte(new_pte)));
+		/*
+		 * For now let's do heavy pid flush
+		 * radix__flush_tlb_page_psize(mm, addr, mmu_virtual_psize);
+		 */
+		radix__flush_tlb_mm(mm);
 
-	/* We already do a sync in cmpxchg, is ptesync needed ?*/
+		__radix_pte_update(ptep, 0, new_pte);
+	} else
+		__radix_pte_update(ptep, 0, set);
 	asm volatile("ptesync" : : : "memory");
 }
 
@@ -233,14 +278,19 @@ static inline unsigned long radix__get_tree_size(void)
 {
 	unsigned long rts_field;
 	/*
-	 * we support 52 bits, hence 52-31 = 21, 0b10101
+	 * We support 52 bits, hence:
+	 *  DD1    52-28 = 24, 0b11000
+	 *  Others 52-31 = 21, 0b10101
 	 * RTS encoding details
 	 * bits 0 - 3 of rts -> bits 6 - 8 unsigned long
 	 * bits 4 - 5 of rts -> bits 62 - 63 of unsigned long
 	 */
-	rts_field = (0x5UL << 5); /* 6 - 8 bits */
-	rts_field |= (0x2UL << 61);
-
+	if (cpu_has_feature(CPU_FTR_POWER9_DD1))
+		rts_field = (0x3UL << 61);
+	else {
+		rts_field = (0x5UL << 5); /* 6 - 8 bits */
+		rts_field |= (0x2UL << 61);
+	}
 	return rts_field;
 }
 #endif /* __ASSEMBLY__ */

+ 1 - 0
arch/powerpc/include/asm/book3s/64/tlbflush-radix.h

@@ -41,4 +41,5 @@ extern void radix__flush_tlb_page_psize(struct mm_struct *mm, unsigned long vmad
 extern void radix__flush_tlb_lpid_va(unsigned long lpid, unsigned long gpa,
 				     unsigned long page_size);
 extern void radix__flush_tlb_lpid(unsigned long lpid);
+extern void radix__flush_tlb_all(void);
 #endif

+ 3 - 1
arch/powerpc/include/asm/cputable.h

@@ -212,6 +212,7 @@ enum {
 #define CPU_FTR_DABRX			LONG_ASM_CONST(0x0800000000000000)
 #define CPU_FTR_PMAO_BUG		LONG_ASM_CONST(0x1000000000000000)
 #define CPU_FTR_SUBCORE			LONG_ASM_CONST(0x2000000000000000)
+#define CPU_FTR_POWER9_DD1		LONG_ASM_CONST(0x4000000000000000)
 
 #ifndef __ASSEMBLY__
 
@@ -472,6 +473,7 @@ enum {
 	    CPU_FTR_ICSWX | CPU_FTR_CFAR | CPU_FTR_HVMODE | CPU_FTR_VMX_COPY | \
 	    CPU_FTR_DBELL | CPU_FTR_HAS_PPR | CPU_FTR_DAWR | \
 	    CPU_FTR_ARCH_207S | CPU_FTR_TM_COMP | CPU_FTR_ARCH_300)
+#define CPU_FTRS_POWER9_DD1 (CPU_FTRS_POWER9 | CPU_FTR_POWER9_DD1)
 #define CPU_FTRS_CELL	(CPU_FTR_USE_TB | CPU_FTR_LWSYNC | \
 	    CPU_FTR_PPCAS_ARCH_V2 | CPU_FTR_CTRL | \
 	    CPU_FTR_ALTIVEC_COMP | CPU_FTR_MMCRA | CPU_FTR_SMT | \
@@ -490,7 +492,7 @@ enum {
 	    (CPU_FTRS_POWER4 | CPU_FTRS_PPC970 | CPU_FTRS_POWER5 | \
 	     CPU_FTRS_POWER6 | CPU_FTRS_POWER7 | CPU_FTRS_POWER8E | \
 	     CPU_FTRS_POWER8 | CPU_FTRS_POWER8_DD1 | CPU_FTRS_CELL | \
-	     CPU_FTRS_PA6T | CPU_FTR_VSX | CPU_FTRS_POWER9)
+	     CPU_FTRS_PA6T | CPU_FTR_VSX | CPU_FTRS_POWER9 | CPU_FTRS_POWER9_DD1)
 #endif
 #else
 enum {

+ 60 - 88
arch/powerpc/include/asm/exception-64s.h

@@ -34,6 +34,7 @@
  * exception handlers (including pSeries LPAR) and iSeries LPAR
  * implementations as possible.
  */
+#include <asm/head-64.h>
 
 #define EX_R9		0
 #define EX_R10		8
@@ -52,7 +53,6 @@
 
 #ifdef CONFIG_RELOCATABLE
 #define __EXCEPTION_RELON_PROLOG_PSERIES_1(label, h)			\
-	ld	r12,PACAKBASE(r13);	/* get high part of &label */	\
 	mfspr	r11,SPRN_##h##SRR0;	/* save SRR0 */			\
 	LOAD_HANDLER(r12,label);					\
 	mtctr	r12;							\
@@ -84,13 +84,14 @@
 
 /*
  * We're short on space and time in the exception prolog, so we can't
- * use the normal SET_REG_IMMEDIATE macro. Normally we just need the
- * low halfword of the address, but for Kdump we need the whole low
- * word.
+ * use the normal LOAD_REG_IMMEDIATE macro to load the address of label.
+ * Instead we get the base of the kernel from paca->kernelbase and or in the low
+ * part of label. This requires that the label be within 64KB of kernelbase, and
+ * that kernelbase be 64K aligned.
  */
 #define LOAD_HANDLER(reg, label)					\
-	/* Handlers must be within 64K of kbase, which must be 64k aligned */ \
-	ori	reg,reg,(label)-_stext;	/* virt addr of handler ... */
+	ld	reg,PACAKBASE(r13);	/* get high part of &label */	\
+	ori	reg,reg,(FIXED_SYMBOL_ABS_ADDR(label))@l;
 
 /* Exception register prefixes */
 #define EXC_HV	H
@@ -175,7 +176,6 @@ END_FTR_SECTION_NESTED(ftr,ftr,943)
 	__EXCEPTION_PROLOG_1(area, extra, vec)
 
 #define __EXCEPTION_PROLOG_PSERIES_1(label, h)				\
-	ld	r12,PACAKBASE(r13);	/* get high part of &label */	\
 	ld	r10,PACAKMSR(r13);	/* get MSR value for kernel */	\
 	mfspr	r11,SPRN_##h##SRR0;	/* save SRR0 */			\
 	LOAD_HANDLER(r12,label)						\
@@ -192,10 +192,10 @@ END_FTR_SECTION_NESTED(ftr,ftr,943)
 	EXCEPTION_PROLOG_1(area, extra, vec);				\
 	EXCEPTION_PROLOG_PSERIES_1(label, h);
 
-#define __KVMTEST(n)							\
-	lbz	r10,HSTATE_IN_GUEST(r13);			\
+#define __KVMTEST(h, n)							\
+	lbz	r10,HSTATE_IN_GUEST(r13);				\
 	cmpwi	r10,0;							\
-	bne	do_kvm_##n
+	bne	do_kvm_##h##n
 
 #ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
 /*
@@ -208,8 +208,7 @@ END_FTR_SECTION_NESTED(ftr,ftr,943)
 #define kvmppc_interrupt kvmppc_interrupt_pr
 #endif
 
-#define __KVM_HANDLER(area, h, n)					\
-do_kvm_##n:								\
+#define __KVM_HANDLER_PROLOG(area, n)					\
 	BEGIN_FTR_SECTION_NESTED(947)					\
 	ld	r10,area+EX_CFAR(r13);					\
 	std	r10,HSTATE_CFAR(r13);					\
@@ -222,21 +221,23 @@ do_kvm_##n:								\
 	stw	r9,HSTATE_SCRATCH1(r13);				\
 	ld	r9,area+EX_R9(r13);					\
 	std	r12,HSTATE_SCRATCH0(r13);				\
+
+#define __KVM_HANDLER(area, h, n)					\
+	__KVM_HANDLER_PROLOG(area, n)					\
 	li	r12,n;							\
 	b	kvmppc_interrupt
 
 #define __KVM_HANDLER_SKIP(area, h, n)					\
-do_kvm_##n:								\
 	cmpwi	r10,KVM_GUEST_MODE_SKIP;				\
 	ld	r10,area+EX_R10(r13);					\
 	beq	89f;							\
-	stw	r9,HSTATE_SCRATCH1(r13);			\
+	stw	r9,HSTATE_SCRATCH1(r13);				\
 	BEGIN_FTR_SECTION_NESTED(948)					\
 	ld	r9,area+EX_PPR(r13);					\
 	std	r9,HSTATE_PPR(r13);					\
 	END_FTR_SECTION_NESTED(CPU_FTR_HAS_PPR,CPU_FTR_HAS_PPR,948);	\
 	ld	r9,area+EX_R9(r13);					\
-	std	r12,HSTATE_SCRATCH0(r13);			\
+	std	r12,HSTATE_SCRATCH0(r13);				\
 	li	r12,n;							\
 	b	kvmppc_interrupt;					\
 89:	mtocrf	0x80,r9;						\
@@ -244,12 +245,12 @@ do_kvm_##n:								\
 	b	kvmppc_skip_##h##interrupt
 
 #ifdef CONFIG_KVM_BOOK3S_64_HANDLER
-#define KVMTEST(n)			__KVMTEST(n)
+#define KVMTEST(h, n)			__KVMTEST(h, n)
 #define KVM_HANDLER(area, h, n)		__KVM_HANDLER(area, h, n)
 #define KVM_HANDLER_SKIP(area, h, n)	__KVM_HANDLER_SKIP(area, h, n)
 
 #else
-#define KVMTEST(n)
+#define KVMTEST(h, n)
 #define KVM_HANDLER(area, h, n)
 #define KVM_HANDLER_SKIP(area, h, n)
 #endif
@@ -333,94 +334,79 @@ do_kvm_##n:								\
 /*
  * Exception vectors.
  */
-#define STD_EXCEPTION_PSERIES(vec, label)		\
-	. = vec;					\
-	.globl label##_pSeries;				\
-label##_pSeries:					\
+#define STD_EXCEPTION_PSERIES(vec, label)			\
 	SET_SCRATCH0(r13);		/* save r13 */		\
-	EXCEPTION_PROLOG_PSERIES(PACA_EXGEN, label##_common,	\
-				 EXC_STD, KVMTEST, vec)
+	EXCEPTION_PROLOG_PSERIES(PACA_EXGEN, label,		\
+				 EXC_STD, KVMTEST_PR, vec);	\
 
 /* Version of above for when we have to branch out-of-line */
+#define __OOL_EXCEPTION(vec, label, hdlr)			\
+	SET_SCRATCH0(r13)					\
+	EXCEPTION_PROLOG_0(PACA_EXGEN)				\
+	b hdlr;
+
 #define STD_EXCEPTION_PSERIES_OOL(vec, label)			\
-	.globl label##_pSeries;					\
-label##_pSeries:						\
-	EXCEPTION_PROLOG_1(PACA_EXGEN, KVMTEST, vec);	\
-	EXCEPTION_PROLOG_PSERIES_1(label##_common, EXC_STD)
-
-#define STD_EXCEPTION_HV(loc, vec, label)		\
-	. = loc;					\
-	.globl label##_hv;				\
-label##_hv:						\
+	EXCEPTION_PROLOG_1(PACA_EXGEN, KVMTEST_PR, vec);	\
+	EXCEPTION_PROLOG_PSERIES_1(label, EXC_STD)
+
+#define STD_EXCEPTION_HV(loc, vec, label)			\
 	SET_SCRATCH0(r13);	/* save r13 */			\
-	EXCEPTION_PROLOG_PSERIES(PACA_EXGEN, label##_common,	\
-				 EXC_HV, KVMTEST, vec)
+	EXCEPTION_PROLOG_PSERIES(PACA_EXGEN, label,		\
+				 EXC_HV, KVMTEST_HV, vec);
 
-/* Version of above for when we have to branch out-of-line */
-#define STD_EXCEPTION_HV_OOL(vec, label)		\
-	.globl label##_hv;				\
-label##_hv:						\
-	EXCEPTION_PROLOG_1(PACA_EXGEN, KVMTEST, vec);	\
-	EXCEPTION_PROLOG_PSERIES_1(label##_common, EXC_HV)
+#define STD_EXCEPTION_HV_OOL(vec, label)			\
+	EXCEPTION_PROLOG_1(PACA_EXGEN, KVMTEST_HV, vec);	\
+	EXCEPTION_PROLOG_PSERIES_1(label, EXC_HV)
 
 #define STD_RELON_EXCEPTION_PSERIES(loc, vec, label)	\
-	. = loc;					\
-	.globl label##_relon_pSeries;			\
-label##_relon_pSeries:					\
 	/* No guest interrupts come through here */	\
 	SET_SCRATCH0(r13);		/* save r13 */	\
-	EXCEPTION_RELON_PROLOG_PSERIES(PACA_EXGEN, label##_common, \
-				       EXC_STD, NOTEST, vec)
+	EXCEPTION_RELON_PROLOG_PSERIES(PACA_EXGEN, label, EXC_STD, NOTEST, vec);
 
 #define STD_RELON_EXCEPTION_PSERIES_OOL(vec, label)		\
-	.globl label##_relon_pSeries;				\
-label##_relon_pSeries:						\
 	EXCEPTION_PROLOG_1(PACA_EXGEN, NOTEST, vec);		\
-	EXCEPTION_RELON_PROLOG_PSERIES_1(label##_common, EXC_STD)
+	EXCEPTION_RELON_PROLOG_PSERIES_1(label, EXC_STD)
 
 #define STD_RELON_EXCEPTION_HV(loc, vec, label)		\
-	. = loc;					\
-	.globl label##_relon_hv;			\
-label##_relon_hv:					\
 	/* No guest interrupts come through here */	\
 	SET_SCRATCH0(r13);	/* save r13 */		\
-	EXCEPTION_RELON_PROLOG_PSERIES(PACA_EXGEN, label##_common, \
-				       EXC_HV, NOTEST, vec)
+	EXCEPTION_RELON_PROLOG_PSERIES(PACA_EXGEN, label, EXC_HV, NOTEST, vec);
 
 #define STD_RELON_EXCEPTION_HV_OOL(vec, label)			\
-	.globl label##_relon_hv;				\
-label##_relon_hv:						\
 	EXCEPTION_PROLOG_1(PACA_EXGEN, NOTEST, vec);		\
-	EXCEPTION_RELON_PROLOG_PSERIES_1(label##_common, EXC_HV)
+	EXCEPTION_RELON_PROLOG_PSERIES_1(label, EXC_HV)
 
 /* This associate vector numbers with bits in paca->irq_happened */
 #define SOFTEN_VALUE_0x500	PACA_IRQ_EE
-#define SOFTEN_VALUE_0x502	PACA_IRQ_EE
 #define SOFTEN_VALUE_0x900	PACA_IRQ_DEC
-#define SOFTEN_VALUE_0x982	PACA_IRQ_DEC
+#define SOFTEN_VALUE_0x980	PACA_IRQ_DEC
 #define SOFTEN_VALUE_0xa00	PACA_IRQ_DBELL
 #define SOFTEN_VALUE_0xe80	PACA_IRQ_DBELL
-#define SOFTEN_VALUE_0xe82	PACA_IRQ_DBELL
 #define SOFTEN_VALUE_0xe60	PACA_IRQ_HMI
-#define SOFTEN_VALUE_0xe62	PACA_IRQ_HMI
 #define SOFTEN_VALUE_0xea0	PACA_IRQ_EE
-#define SOFTEN_VALUE_0xea2	PACA_IRQ_EE
 
 #define __SOFTEN_TEST(h, vec)						\
 	lbz	r10,PACASOFTIRQEN(r13);					\
 	cmpwi	r10,0;							\
 	li	r10,SOFTEN_VALUE_##vec;					\
 	beq	masked_##h##interrupt
+
 #define _SOFTEN_TEST(h, vec)	__SOFTEN_TEST(h, vec)
 
 #define SOFTEN_TEST_PR(vec)						\
-	KVMTEST(vec);							\
+	KVMTEST(EXC_STD, vec);						\
 	_SOFTEN_TEST(EXC_STD, vec)
 
 #define SOFTEN_TEST_HV(vec)						\
-	KVMTEST(vec);							\
+	KVMTEST(EXC_HV, vec);						\
 	_SOFTEN_TEST(EXC_HV, vec)
 
+#define KVMTEST_PR(vec)							\
+	KVMTEST(EXC_STD, vec)
+
+#define KVMTEST_HV(vec)							\
+	KVMTEST(EXC_HV, vec)
+
 #define SOFTEN_NOTEST_PR(vec)		_SOFTEN_TEST(EXC_STD, vec)
 #define SOFTEN_NOTEST_HV(vec)		_SOFTEN_TEST(EXC_HV, vec)
 
@@ -428,58 +414,47 @@ label##_relon_hv:						\
 	SET_SCRATCH0(r13);    /* save r13 */				\
 	EXCEPTION_PROLOG_0(PACA_EXGEN);					\
 	__EXCEPTION_PROLOG_1(PACA_EXGEN, extra, vec);			\
-	EXCEPTION_PROLOG_PSERIES_1(label##_common, h);
+	EXCEPTION_PROLOG_PSERIES_1(label, h);
 
 #define _MASKABLE_EXCEPTION_PSERIES(vec, label, h, extra)		\
 	__MASKABLE_EXCEPTION_PSERIES(vec, label, h, extra)
 
 #define MASKABLE_EXCEPTION_PSERIES(loc, vec, label)			\
-	. = loc;							\
-	.globl label##_pSeries;						\
-label##_pSeries:							\
 	_MASKABLE_EXCEPTION_PSERIES(vec, label,				\
 				    EXC_STD, SOFTEN_TEST_PR)
 
+#define MASKABLE_EXCEPTION_PSERIES_OOL(vec, label)			\
+	EXCEPTION_PROLOG_1(PACA_EXGEN, SOFTEN_TEST_PR, vec);		\
+	EXCEPTION_PROLOG_PSERIES_1(label, EXC_STD)
+
 #define MASKABLE_EXCEPTION_HV(loc, vec, label)				\
-	. = loc;							\
-	.globl label##_hv;						\
-label##_hv:								\
 	_MASKABLE_EXCEPTION_PSERIES(vec, label,				\
 				    EXC_HV, SOFTEN_TEST_HV)
 
 #define MASKABLE_EXCEPTION_HV_OOL(vec, label)				\
-	.globl label##_hv;						\
-label##_hv:								\
 	EXCEPTION_PROLOG_1(PACA_EXGEN, SOFTEN_TEST_HV, vec);		\
-	EXCEPTION_PROLOG_PSERIES_1(label##_common, EXC_HV);
+	EXCEPTION_PROLOG_PSERIES_1(label, EXC_HV)
 
 #define __MASKABLE_RELON_EXCEPTION_PSERIES(vec, label, h, extra)	\
 	SET_SCRATCH0(r13);    /* save r13 */				\
 	EXCEPTION_PROLOG_0(PACA_EXGEN);					\
-	__EXCEPTION_PROLOG_1(PACA_EXGEN, extra, vec);		\
-	EXCEPTION_RELON_PROLOG_PSERIES_1(label##_common, h);
-#define _MASKABLE_RELON_EXCEPTION_PSERIES(vec, label, h, extra)	\
+	__EXCEPTION_PROLOG_1(PACA_EXGEN, extra, vec);			\
+	EXCEPTION_RELON_PROLOG_PSERIES_1(label, h)
+
+#define _MASKABLE_RELON_EXCEPTION_PSERIES(vec, label, h, extra)		\
 	__MASKABLE_RELON_EXCEPTION_PSERIES(vec, label, h, extra)
 
 #define MASKABLE_RELON_EXCEPTION_PSERIES(loc, vec, label)		\
-	. = loc;							\
-	.globl label##_relon_pSeries;					\
-label##_relon_pSeries:							\
 	_MASKABLE_RELON_EXCEPTION_PSERIES(vec, label,			\
 					  EXC_STD, SOFTEN_NOTEST_PR)
 
 #define MASKABLE_RELON_EXCEPTION_HV(loc, vec, label)			\
-	. = loc;							\
-	.globl label##_relon_hv;					\
-label##_relon_hv:							\
 	_MASKABLE_RELON_EXCEPTION_PSERIES(vec, label,			\
 					  EXC_HV, SOFTEN_NOTEST_HV)
 
 #define MASKABLE_RELON_EXCEPTION_HV_OOL(vec, label)			\
-	.globl label##_relon_hv;					\
-label##_relon_hv:							\
 	EXCEPTION_PROLOG_1(PACA_EXGEN, SOFTEN_NOTEST_HV, vec);		\
-	EXCEPTION_PROLOG_PSERIES_1(label##_common, EXC_HV);
+	EXCEPTION_PROLOG_PSERIES_1(label, EXC_HV)
 
 /*
  * Our exception common code can be passed various "additions"
@@ -505,9 +480,6 @@ BEGIN_FTR_SECTION				\
 END_FTR_SECTION_IFSET(CPU_FTR_CTRL)
 
 #define EXCEPTION_COMMON(trap, label, hdlr, ret, additions)	\
-	.align	7;						\
-	.globl label##_common;					\
-label##_common:							\
 	EXCEPTION_PROLOG_COMMON(trap, PACA_EXGEN);		\
 	/* Volatile regs are potentially clobbered here */	\
 	additions;						\

+ 0 - 4
arch/powerpc/include/asm/fadump.h

@@ -45,10 +45,6 @@
 
 #define memblock_num_regions(memblock_type)	(memblock.memblock_type.cnt)
 
-#ifndef ELF_CORE_EFLAGS
-#define ELF_CORE_EFLAGS 0
-#endif
-
 /* Firmware provided dump sections */
 #define FADUMP_CPU_STATE_DATA	0x0001
 #define FADUMP_HPTE_REGION	0x0002

+ 393 - 0
arch/powerpc/include/asm/head-64.h

@@ -0,0 +1,393 @@
+#ifndef _ASM_POWERPC_HEAD_64_H
+#define _ASM_POWERPC_HEAD_64_H
+
+#include <asm/cache.h>
+
+/*
+ * We can't do CPP stringification and concatenation directly into the section
+ * name for some reason, so these macros can do it for us.
+ */
+.macro define_ftsec name
+	.section ".head.text.\name\()","ax",@progbits
+.endm
+.macro define_data_ftsec name
+	.section ".head.data.\name\()","a",@progbits
+.endm
+.macro use_ftsec name
+	.section ".head.text.\name\()"
+.endm
+
+/*
+ * Fixed (location) sections are used by opening fixed sections and emitting
+ * fixed section entries into them before closing them. Multiple fixed sections
+ * can be open at any time.
+ *
+ * Each fixed section created in a .S file must have corresponding linkage
+ * directives including location, added to  arch/powerpc/kernel/vmlinux.lds.S
+ *
+ * For each fixed section, code is generated into it in the order in which it
+ * appears in the source.  Fixed section entries can be placed at a fixed
+ * location within the section using _LOCATION postfix variants. These must
+ * be ordered according to their relative placements within the section.
+ *
+ * OPEN_FIXED_SECTION(section_name, start_address, end_address)
+ * FIXED_SECTION_ENTRY_BEGIN(section_name, label1)
+ *
+ * USE_FIXED_SECTION(section_name)
+ * label3:
+ *     li  r10,128
+ *     mv  r11,r10
+ *
+ * FIXED_SECTION_ENTRY_BEGIN_LOCATION(section_name, label2, start_address)
+ * FIXED_SECTION_ENTRY_END_LOCATION(section_name, label2, end_address)
+ * CLOSE_FIXED_SECTION(section_name)
+ *
+ * ZERO_FIXED_SECTION can be used to emit zeroed data.
+ *
+ * Troubleshooting:
+ * - If the build dies with "Error: attempt to move .org backwards" at
+ *   CLOSE_FIXED_SECTION() or elsewhere, there may be something
+ *   unexpected being added there. Remove the '. = x_len' line, rebuild, and
+ *   check what is pushing the section down.
+ * - If the build dies in linking, check arch/powerpc/kernel/vmlinux.lds.S
+ *   for instructions.
+ * - If the kernel crashes or hangs in very early boot, it could be linker
+ *   stubs at the start of the main text.
+ */
+
+#define OPEN_FIXED_SECTION(sname, start, end)			\
+	sname##_start = (start);				\
+	sname##_end = (end);					\
+	sname##_len = (end) - (start);				\
+	define_ftsec sname;					\
+	. = 0x0;						\
+start_##sname:
+
+#define OPEN_TEXT_SECTION(start)				\
+	text_start = (start);					\
+	.section ".text","ax",@progbits;			\
+	. = 0x0;						\
+start_text:
+
+#define ZERO_FIXED_SECTION(sname, start, end)			\
+	sname##_start = (start);				\
+	sname##_end = (end);					\
+	sname##_len = (end) - (start);				\
+	define_data_ftsec sname;				\
+	. = 0x0;						\
+	. = sname##_len;
+
+#define USE_FIXED_SECTION(sname)				\
+	fs_label = start_##sname;				\
+	fs_start = sname##_start;				\
+	use_ftsec sname;
+
+#define USE_TEXT_SECTION()					\
+	fs_label = start_text;					\
+	fs_start = text_start;					\
+	.text
+
+#define CLOSE_FIXED_SECTION(sname)				\
+	USE_FIXED_SECTION(sname);				\
+	. = sname##_len;					\
+end_##sname:
+
+
+#define __FIXED_SECTION_ENTRY_BEGIN(sname, name, __align)	\
+	USE_FIXED_SECTION(sname);				\
+	.align __align;						\
+	.global name;						\
+name:
+
+#define FIXED_SECTION_ENTRY_BEGIN(sname, name)			\
+	__FIXED_SECTION_ENTRY_BEGIN(sname, name, 0)
+
+#define FIXED_SECTION_ENTRY_BEGIN_LOCATION(sname, name, start)		\
+	USE_FIXED_SECTION(sname);				\
+	name##_start = (start);					\
+	.if (start) < sname##_start;				\
+	.error "Fixed section underflow";			\
+	.abort;							\
+	.endif;							\
+	. = (start) - sname##_start;				\
+	.global name;						\
+name:
+
+#define FIXED_SECTION_ENTRY_END_LOCATION(sname, name, end)		\
+	.if (end) > sname##_end;				\
+	.error "Fixed section overflow";			\
+	.abort;							\
+	.endif;							\
+	.if (. - name > end - name##_start);			\
+	.error "Fixed entry overflow";				\
+	.abort;							\
+	.endif;							\
+	. = ((end) - sname##_start);				\
+
+
+/*
+ * These macros are used to change symbols in other fixed sections to be
+ * absolute or related to our current fixed section.
+ *
+ * - DEFINE_FIXED_SYMBOL / FIXED_SYMBOL_ABS_ADDR is used to find the
+ *   absolute address of a symbol within a fixed section, from any section.
+ *
+ * - ABS_ADDR is used to find the absolute address of any symbol, from within
+ *   a fixed section.
+ */
+#define DEFINE_FIXED_SYMBOL(label)				\
+	label##_absolute = (label - fs_label + fs_start)
+
+#define FIXED_SYMBOL_ABS_ADDR(label)				\
+	(label##_absolute)
+
+#define ABS_ADDR(label) (label - fs_label + fs_start)
+
+/*
+ * Following are the BOOK3S exception handler helper macros.
+ * Handlers come in a number of types, and each type has a number of varieties.
+ *
+ * EXC_REAL_*        - real, unrelocated exception vectors
+ * EXC_VIRT_*        - virt (AIL), unrelocated exception vectors
+ * TRAMP_REAL_*   - real, unrelocated helpers (virt can call these)
+ * TRAMP_VIRT_*  - virt, unreloc helpers (in practice, real can use)
+ * TRAMP_KVM         - KVM handlers that get put into real, unrelocated
+ * EXC_COMMON_*  - virt, relocated common handlers
+ *
+ * The EXC handlers are given a name, and branch to name_common, or the
+ * appropriate KVM or masking function. Vector handler varieties are as
+ * follows:
+ *
+ * EXC_{REAL|VIRT}_BEGIN/END - used to open-code the exception
+ *
+ * EXC_{REAL|VIRT}  - standard exception
+ *
+ * EXC_{REAL|VIRT}_suffix
+ *     where _suffix is:
+ *   - _MASKABLE               - maskable exception
+ *   - _OOL                    - out of line with trampoline to common handler
+ *   - _HV                     - HV exception
+ *
+ * There can be combinations, e.g., EXC_VIRT_OOL_MASKABLE_HV
+ *
+ * The one unusual case is __EXC_REAL_OOL_HV_DIRECT, which is
+ * an OOL vector that branches to a specified handler rather than the usual
+ * trampoline that goes to common. It, and other underscore macros, should
+ * be used with care.
+ *
+ * KVM handlers come in the following varieties:
+ * TRAMP_KVM
+ * TRAMP_KVM_SKIP
+ * TRAMP_KVM_HV
+ * TRAMP_KVM_HV_SKIP
+ *
+ * COMMON handlers come in the following varieties:
+ * EXC_COMMON_BEGIN/END - used to open-code the handler
+ * EXC_COMMON
+ * EXC_COMMON_ASYNC
+ * EXC_COMMON_HV
+ *
+ * TRAMP_REAL and TRAMP_VIRT can be used with BEGIN/END. KVM
+ * and OOL handlers are implemented as types of TRAMP and TRAMP_VIRT handlers.
+ */
+
+#define EXC_REAL_BEGIN(name, start, end)			\
+	FIXED_SECTION_ENTRY_BEGIN_LOCATION(real_vectors, exc_real_##start##_##name, start)
+
+#define EXC_REAL_END(name, start, end)			\
+	FIXED_SECTION_ENTRY_END_LOCATION(real_vectors, exc_real_##start##_##name, end)
+
+#define EXC_VIRT_BEGIN(name, start, end)			\
+	FIXED_SECTION_ENTRY_BEGIN_LOCATION(virt_vectors, exc_virt_##start##_##name, start)
+
+#define EXC_VIRT_END(name, start, end)			\
+	FIXED_SECTION_ENTRY_END_LOCATION(virt_vectors, exc_virt_##start##_##name, end)
+
+#define EXC_COMMON_BEGIN(name)					\
+	USE_TEXT_SECTION();						\
+	.align	7;							\
+	.global name;							\
+	DEFINE_FIXED_SYMBOL(name);					\
+name:
+
+#define TRAMP_REAL_BEGIN(name)					\
+	FIXED_SECTION_ENTRY_BEGIN(real_trampolines, name)
+
+#define TRAMP_VIRT_BEGIN(name)					\
+	FIXED_SECTION_ENTRY_BEGIN(virt_trampolines, name)
+
+#ifdef CONFIG_KVM_BOOK3S_64_HANDLER
+#define TRAMP_KVM_BEGIN(name)						\
+	TRAMP_REAL_BEGIN(name)
+#else
+#define TRAMP_KVM_BEGIN(name)
+#endif
+
+#define EXC_REAL_NONE(start, end)				\
+	FIXED_SECTION_ENTRY_BEGIN_LOCATION(real_vectors, exc_real_##start##_##unused, start); \
+	FIXED_SECTION_ENTRY_END_LOCATION(real_vectors, exc_real_##start##_##unused, end)
+
+#define EXC_VIRT_NONE(start, end)				\
+	FIXED_SECTION_ENTRY_BEGIN_LOCATION(virt_vectors, exc_virt_##start##_##unused, start); \
+	FIXED_SECTION_ENTRY_END_LOCATION(virt_vectors, exc_virt_##start##_##unused, end);
+
+
+#define EXC_REAL(name, start, end)				\
+	EXC_REAL_BEGIN(name, start, end);			\
+	STD_EXCEPTION_PSERIES(start, name##_common);			\
+	EXC_REAL_END(name, start, end);
+
+#define EXC_VIRT(name, start, end, realvec)			\
+	EXC_VIRT_BEGIN(name, start, end);			\
+	STD_RELON_EXCEPTION_PSERIES(start, realvec, name##_common);	\
+	EXC_VIRT_END(name, start, end);
+
+#define EXC_REAL_MASKABLE(name, start, end)			\
+	EXC_REAL_BEGIN(name, start, end);			\
+	MASKABLE_EXCEPTION_PSERIES(start, start, name##_common);	\
+	EXC_REAL_END(name, start, end);
+
+#define EXC_VIRT_MASKABLE(name, start, end, realvec)		\
+	EXC_VIRT_BEGIN(name, start, end);			\
+	MASKABLE_RELON_EXCEPTION_PSERIES(start, realvec, name##_common); \
+	EXC_VIRT_END(name, start, end);
+
+#define EXC_REAL_HV(name, start, end)			\
+	EXC_REAL_BEGIN(name, start, end);			\
+	STD_EXCEPTION_HV(start, start, name##_common);			\
+	EXC_REAL_END(name, start, end);
+
+#define EXC_VIRT_HV(name, start, end, realvec)		\
+	EXC_VIRT_BEGIN(name, start, end);			\
+	STD_RELON_EXCEPTION_HV(start, realvec, name##_common);		\
+	EXC_VIRT_END(name, start, end);
+
+#define __EXC_REAL_OOL(name, start, end)			\
+	EXC_REAL_BEGIN(name, start, end);			\
+	__OOL_EXCEPTION(start, label, tramp_real_##name);		\
+	EXC_REAL_END(name, start, end);
+
+#define __TRAMP_REAL_REAL_OOL(name, vec)				\
+	TRAMP_REAL_BEGIN(tramp_real_##name);				\
+	STD_EXCEPTION_PSERIES_OOL(vec, name##_common);			\
+
+#define EXC_REAL_OOL(name, start, end)			\
+	__EXC_REAL_OOL(name, start, end);			\
+	__TRAMP_REAL_REAL_OOL(name, start);
+
+#define __EXC_REAL_OOL_MASKABLE(name, start, end)		\
+	__EXC_REAL_OOL(name, start, end);
+
+#define __TRAMP_REAL_REAL_OOL_MASKABLE(name, vec)			\
+	TRAMP_REAL_BEGIN(tramp_real_##name);				\
+	MASKABLE_EXCEPTION_PSERIES_OOL(vec, name##_common);		\
+
+#define EXC_REAL_OOL_MASKABLE(name, start, end)		\
+	__EXC_REAL_OOL_MASKABLE(name, start, end);		\
+	__TRAMP_REAL_REAL_OOL_MASKABLE(name, start);
+
+#define __EXC_REAL_OOL_HV_DIRECT(name, start, end, handler)	\
+	EXC_REAL_BEGIN(name, start, end);			\
+	__OOL_EXCEPTION(start, label, handler);				\
+	EXC_REAL_END(name, start, end);
+
+#define __EXC_REAL_OOL_HV(name, start, end)			\
+	__EXC_REAL_OOL(name, start, end);
+
+#define __TRAMP_REAL_REAL_OOL_HV(name, vec)				\
+	TRAMP_REAL_BEGIN(tramp_real_##name);				\
+	STD_EXCEPTION_HV_OOL(vec, name##_common);			\
+
+#define EXC_REAL_OOL_HV(name, start, end)			\
+	__EXC_REAL_OOL_HV(name, start, end);			\
+	__TRAMP_REAL_REAL_OOL_HV(name, start);
+
+#define __EXC_REAL_OOL_MASKABLE_HV(name, start, end)		\
+	__EXC_REAL_OOL(name, start, end);
+
+#define __TRAMP_REAL_REAL_OOL_MASKABLE_HV(name, vec)			\
+	TRAMP_REAL_BEGIN(tramp_real_##name);				\
+	MASKABLE_EXCEPTION_HV_OOL(vec, name##_common);			\
+
+#define EXC_REAL_OOL_MASKABLE_HV(name, start, end)		\
+	__EXC_REAL_OOL_MASKABLE_HV(name, start, end);	\
+	__TRAMP_REAL_REAL_OOL_MASKABLE_HV(name, start);
+
+#define __EXC_VIRT_OOL(name, start, end)			\
+	EXC_VIRT_BEGIN(name, start, end);			\
+	__OOL_EXCEPTION(start, label, tramp_virt_##name);		\
+	EXC_VIRT_END(name, start, end);
+
+#define __TRAMP_REAL_VIRT_OOL(name, realvec)				\
+	TRAMP_VIRT_BEGIN(tramp_virt_##name);			\
+	STD_RELON_EXCEPTION_PSERIES_OOL(realvec, name##_common);	\
+
+#define EXC_VIRT_OOL(name, start, end, realvec)		\
+	__EXC_VIRT_OOL(name, start, end);			\
+	__TRAMP_REAL_VIRT_OOL(name, realvec);
+
+#define __EXC_VIRT_OOL_MASKABLE(name, start, end)		\
+	__EXC_VIRT_OOL(name, start, end);
+
+#define __TRAMP_REAL_VIRT_OOL_MASKABLE(name, realvec)		\
+	TRAMP_VIRT_BEGIN(tramp_virt_##name);			\
+	MASKABLE_RELON_EXCEPTION_PSERIES_OOL(realvec, name##_common);	\
+
+#define EXC_VIRT_OOL_MASKABLE(name, start, end, realvec)	\
+	__EXC_VIRT_OOL_MASKABLE(name, start, end);		\
+	__TRAMP_REAL_VIRT_OOL_MASKABLE(name, realvec);
+
+#define __EXC_VIRT_OOL_HV(name, start, end)			\
+	__EXC_VIRT_OOL(name, start, end);
+
+#define __TRAMP_REAL_VIRT_OOL_HV(name, realvec)			\
+	TRAMP_VIRT_BEGIN(tramp_virt_##name);			\
+	STD_RELON_EXCEPTION_HV_OOL(realvec, name##_common);		\
+
+#define EXC_VIRT_OOL_HV(name, start, end, realvec)		\
+	__EXC_VIRT_OOL_HV(name, start, end);			\
+	__TRAMP_REAL_VIRT_OOL_HV(name, realvec);
+
+#define __EXC_VIRT_OOL_MASKABLE_HV(name, start, end)		\
+	__EXC_VIRT_OOL(name, start, end);
+
+#define __TRAMP_REAL_VIRT_OOL_MASKABLE_HV(name, realvec)		\
+	TRAMP_VIRT_BEGIN(tramp_virt_##name);			\
+	MASKABLE_RELON_EXCEPTION_HV_OOL(realvec, name##_common);	\
+
+#define EXC_VIRT_OOL_MASKABLE_HV(name, start, end, realvec)	\
+	__EXC_VIRT_OOL_MASKABLE_HV(name, start, end);	\
+	__TRAMP_REAL_VIRT_OOL_MASKABLE_HV(name, realvec);
+
+#define TRAMP_KVM(area, n)						\
+	TRAMP_KVM_BEGIN(do_kvm_##n);					\
+	KVM_HANDLER(area, EXC_STD, n);					\
+
+#define TRAMP_KVM_SKIP(area, n)						\
+	TRAMP_KVM_BEGIN(do_kvm_##n);					\
+	KVM_HANDLER_SKIP(area, EXC_STD, n);				\
+
+/*
+ * HV variant exceptions get the 0x2 bit added to their trap number.
+ */
+#define TRAMP_KVM_HV(area, n)						\
+	TRAMP_KVM_BEGIN(do_kvm_H##n);					\
+	KVM_HANDLER(area, EXC_HV, n + 0x2);				\
+
+#define TRAMP_KVM_HV_SKIP(area, n)					\
+	TRAMP_KVM_BEGIN(do_kvm_H##n);					\
+	KVM_HANDLER_SKIP(area, EXC_HV, n + 0x2);			\
+
+#define EXC_COMMON(name, realvec, hdlr)				\
+	EXC_COMMON_BEGIN(name);					\
+	STD_EXCEPTION_COMMON(realvec, name, hdlr);			\
+
+#define EXC_COMMON_ASYNC(name, realvec, hdlr)			\
+	EXC_COMMON_BEGIN(name);					\
+	STD_EXCEPTION_COMMON_ASYNC(realvec, name, hdlr);		\
+
+#define EXC_COMMON_HV(name, realvec, hdlr)				\
+	EXC_COMMON_BEGIN(name);					\
+	STD_EXCEPTION_COMMON(realvec + 0x2, name, hdlr);		\
+
+#endif	/* _ASM_POWERPC_HEAD_64_H */

+ 1 - 1
arch/powerpc/include/asm/machdep.h

@@ -61,7 +61,7 @@ struct machdep_calls {
 
 	void		(*init_IRQ)(void);
 
-	/* Return an irq, or NO_IRQ to indicate there are none pending. */
+	/* Return an irq, or 0 to indicate there are none pending. */
 	unsigned int	(*get_irq)(void);
 
 	/* PCI stuff */

+ 3 - 0
arch/powerpc/include/asm/mmu-book3e.h

@@ -313,6 +313,9 @@ extern int book3e_htw_mode;
  * return 1, indicating that the tlb requires preloading.
  */
 #define HUGETLB_NEED_PRELOAD
+
+#define mmu_cleanup_all NULL
+
 #endif
 
 #endif /* !__ASSEMBLY__ */

+ 4 - 0
arch/powerpc/include/asm/mmu.h

@@ -204,6 +204,10 @@ extern unsigned int __start___mmu_ftr_fixup, __stop___mmu_ftr_fixup;
  * make it match the size our of bolted TLB area
  */
 extern u64 ppc64_rma_size;
+
+/* Cleanup function used by kexec */
+extern void mmu_cleanup_all(void);
+extern void radix__mmu_cleanup_all(void);
 #endif /* CONFIG_PPC64 */
 
 struct mm_struct;

+ 3 - 1
arch/powerpc/include/asm/mmu_context.h

@@ -18,6 +18,7 @@ extern void destroy_context(struct mm_struct *mm);
 #ifdef CONFIG_SPAPR_TCE_IOMMU
 struct mm_iommu_table_group_mem_t;
 
+extern int isolate_lru_page(struct page *page);	/* from internal.h */
 extern bool mm_iommu_preregistered(void);
 extern long mm_iommu_get(unsigned long ua, unsigned long entries,
 		struct mm_iommu_table_group_mem_t **pmem);
@@ -71,7 +72,8 @@ static inline void switch_mm(struct mm_struct *prev, struct mm_struct *next,
 			     struct task_struct *tsk)
 {
 	/* Mark this context has been used on the new CPU */
-	cpumask_set_cpu(smp_processor_id(), mm_cpumask(next));
+	if (!cpumask_test_cpu(smp_processor_id(), mm_cpumask(next)))
+		cpumask_set_cpu(smp_processor_id(), mm_cpumask(next));
 
 	/* 32-bit keeps track of the current PGDIR in the thread struct */
 #ifdef CONFIG_PPC32

+ 3 - 3
arch/powerpc/include/asm/mpic_msgr.h

@@ -122,9 +122,9 @@ static inline void mpic_msgr_set_destination(struct mpic_msgr *msgr,
  * @msgr:	the message register whose IRQ is to be returned
  *
  * Returns the IRQ number associated with the given message register.
- * NO_IRQ is returned if this message register is not capable of
- * receiving interrupts.  What message register can and cannot receive
- * interrupts is specified in the device tree for the system.
+ * 0 is returned if this message register is not capable of receiving
+ * interrupts.  What message register can and cannot receive interrupts is
+ * specified in the device tree for the system.
  */
 static inline int mpic_msgr_get_irq(struct mpic_msgr *msgr)
 {

+ 2 - 1
arch/powerpc/include/asm/nohash/32/pgtable.h

@@ -267,7 +267,8 @@ static inline void huge_ptep_set_wrprotect(struct mm_struct *mm,
 }
 
 
-static inline void __ptep_set_access_flags(pte_t *ptep, pte_t entry)
+static inline void __ptep_set_access_flags(struct mm_struct *mm,
+					   pte_t *ptep, pte_t entry)
 {
 	unsigned long set = pte_val(entry) &
 		(_PAGE_DIRTY | _PAGE_ACCESSED | _PAGE_RW | _PAGE_EXEC);

+ 2 - 1
arch/powerpc/include/asm/nohash/64/pgtable.h

@@ -300,7 +300,8 @@ static inline void pte_clear(struct mm_struct *mm, unsigned long addr,
 /* Set the dirty and/or accessed bits atomically in a linux PTE, this
  * function doesn't need to flush the hash entry
  */
-static inline void __ptep_set_access_flags(pte_t *ptep, pte_t entry)
+static inline void __ptep_set_access_flags(struct mm_struct *mm,
+					   pte_t *ptep, pte_t entry)
 {
 	unsigned long bits = pte_val(entry) &
 		(_PAGE_DIRTY | _PAGE_ACCESSED | _PAGE_RW | _PAGE_EXEC);

+ 1 - 1
arch/powerpc/include/asm/parport.h

@@ -28,7 +28,7 @@ static int parport_pc_find_nonpci_ports (int autoirq, int autodma)
 		io1 = prop[1]; io2 = prop[2];
 
 		virq = irq_of_parse_and_map(np, 0);
-		if (virq == NO_IRQ)
+		if (!virq)
 			continue;
 
 		if (parport_pc_probe_port(io1, io2, virq, autodma, NULL, 0)

+ 3 - 1
arch/powerpc/include/asm/pnv-pci.h

@@ -16,7 +16,7 @@
 #include <misc/cxl-base.h>
 #include <asm/opal-api.h>
 
-#define PCI_SLOT_ID_PREFIX	0x8000000000000000
+#define PCI_SLOT_ID_PREFIX	(1UL << 63)
 #define PCI_SLOT_ID(phb_id, bdfn)	\
 	(PCI_SLOT_ID_PREFIX | ((uint64_t)(bdfn) << 16) | (phb_id))
 
@@ -63,6 +63,8 @@ struct pnv_php_slot {
 #define PNV_PHP_STATE_POPULATED		2
 #define PNV_PHP_STATE_OFFLINE		3
 	int				state;
+	int				irq;
+	struct workqueue_struct		*wq;
 	struct device_node		*dn;
 	struct pci_dev			*pdev;
 	struct pci_bus			*bus;

+ 2 - 0
arch/powerpc/include/asm/ppc-opcode.h

@@ -236,6 +236,7 @@
 #define PPC_INST_STWU			0x94000000
 #define PPC_INST_MFLR			0x7c0802a6
 #define PPC_INST_MTLR			0x7c0803a6
+#define PPC_INST_MTCTR			0x7c0903a6
 #define PPC_INST_CMPWI			0x2c000000
 #define PPC_INST_CMPDI			0x2c200000
 #define PPC_INST_CMPW			0x7c000000
@@ -250,6 +251,7 @@
 #define PPC_INST_SUB			0x7c000050
 #define PPC_INST_BLR			0x4e800020
 #define PPC_INST_BLRL			0x4e800021
+#define PPC_INST_BCTR			0x4e800420
 #define PPC_INST_MULLD			0x7c0001d2
 #define PPC_INST_MULLW			0x7c0001d6
 #define PPC_INST_MULHWU			0x7c000016

+ 16 - 33
arch/powerpc/include/asm/ppc_asm.h

@@ -201,14 +201,12 @@ END_FW_FTR_SECTION_IFSET(FW_FEATURE_SPLPAR)
 #ifdef PPC64_ELF_ABI_v2
 
 #define _GLOBAL(name) \
-	.section ".text"; \
 	.align 2 ; \
 	.type name,@function; \
 	.globl name; \
 name:
 
 #define _GLOBAL_TOC(name) \
-	.section ".text"; \
 	.align 2 ; \
 	.type name,@function; \
 	.globl name; \
@@ -217,13 +215,6 @@ name: \
 	addi r2,r2,(.TOC.-0b)@l; \
 	.localentry name,.-name
 
-#define _KPROBE(name) \
-	.section ".kprobes.text","a"; \
-	.align 2 ; \
-	.type name,@function; \
-	.globl name; \
-name:
-
 #define DOTSYM(a)	a
 
 #else
@@ -232,35 +223,20 @@ name:
 #define GLUE(a,b) XGLUE(a,b)
 
 #define _GLOBAL(name) \
-	.section ".text"; \
 	.align 2 ; \
 	.globl name; \
 	.globl GLUE(.,name); \
-	.section ".opd","aw"; \
+	.pushsection ".opd","aw"; \
 name: \
 	.quad GLUE(.,name); \
 	.quad .TOC.@tocbase; \
 	.quad 0; \
-	.previous; \
+	.popsection; \
 	.type GLUE(.,name),@function; \
 GLUE(.,name):
 
 #define _GLOBAL_TOC(name) _GLOBAL(name)
 
-#define _KPROBE(name) \
-	.section ".kprobes.text","a"; \
-	.align 2 ; \
-	.globl name; \
-	.globl GLUE(.,name); \
-	.section ".opd","aw"; \
-name: \
-	.quad GLUE(.,name); \
-	.quad .TOC.@tocbase; \
-	.quad 0; \
-	.previous; \
-	.type GLUE(.,name),@function; \
-GLUE(.,name):
-
 #define DOTSYM(a)	GLUE(.,a)
 
 #endif
@@ -272,20 +248,28 @@ GLUE(.,name):
 n:
 
 #define _GLOBAL(n)	\
-	.text;		\
 	.stabs __stringify(n:F-1),N_FUN,0,0,n;\
 	.globl n;	\
 n:
 
 #define _GLOBAL_TOC(name) _GLOBAL(name)
 
-#define _KPROBE(n)	\
-	.section ".kprobes.text","a";	\
-	.globl	n;	\
-n:
-
 #endif
 
+/*
+ * __kprobes (the C annotation) puts the symbol into the .kprobes.text
+ * section, which gets emitted at the end of regular text.
+ *
+ * _ASM_NOKPROBE_SYMBOL and NOKPROBE_SYMBOL just add the symbol to
+ * a blacklist. The former is for core kprobe functions/data, the
+ * latter is for those that incidentally must be excluded from probing
+ * and allows them to be linked at a more optimal location within text.
+ */
+#define _ASM_NOKPROBE_SYMBOL(entry)			\
+	.pushsection "_kprobe_blacklist","aw";		\
+	PPC_LONG (entry) ;				\
+	.popsection
+
 #define FUNC_START(name)	_GLOBAL(name)
 #define FUNC_END(name)
 
@@ -527,7 +511,6 @@ END_FTR_SECTION_IFCLR(CPU_FTR_601)
 #endif
 #define MTMSRD(r)	mtmsr	r
 #define MTMSR_EERI(reg)	mtmsr	reg
-#define CLR_TOP32(r)
 #endif
 
 #endif /* __KERNEL__ */

+ 7 - 9
arch/powerpc/include/asm/processor.h

@@ -147,7 +147,7 @@ typedef struct {
 } mm_segment_t;
 
 #define TS_FPR(i) fp_state.fpr[i][TS_FPROFFSET]
-#define TS_TRANS_FPR(i) transact_fp.fpr[i][TS_FPROFFSET]
+#define TS_CKFPR(i) ckfp_state.fpr[i][TS_FPROFFSET]
 
 /* FP and VSX 0-31 register set */
 struct thread_fp_state {
@@ -257,6 +257,7 @@ struct thread_struct {
 	int		used_spe;	/* set if process has used spe */
 #endif /* CONFIG_SPE */
 #ifdef CONFIG_PPC_TRANSACTIONAL_MEM
+	u8	load_tm;
 	u64		tm_tfhar;	/* Transaction fail handler addr */
 	u64		tm_texasr;	/* Transaction exception & summary */
 	u64		tm_tfiar;	/* Transaction fail instr address reg */
@@ -267,20 +268,17 @@ struct thread_struct {
 	unsigned long	tm_dscr;
 
 	/*
-	 * Transactional FP and VSX 0-31 register set.
-	 * NOTE: the sense of these is the opposite of the integer ckpt_regs!
+	 * Checkpointed FP and VSX 0-31 register set.
 	 *
 	 * When a transaction is active/signalled/scheduled etc., *regs is the
 	 * most recent set of/speculated GPRs with ckpt_regs being the older
 	 * checkpointed regs to which we roll back if transaction aborts.
 	 *
-	 * However, fpr[] is the checkpointed 'base state' of FP regs, and
-	 * transact_fpr[] is the new set of transactional values.
-	 * VRs work the same way.
+	 * These are analogous to how ckpt_regs and pt_regs work
 	 */
-	struct thread_fp_state transact_fp;
-	struct thread_vr_state transact_vr;
-	unsigned long	transact_vrsave;
+	struct thread_fp_state ckfp_state; /* Checkpointed FP state */
+	struct thread_vr_state ckvr_state; /* Checkpointed VR state */
+	unsigned long	ckvrsave; /* Checkpointed VRSAVE */
 #endif /* CONFIG_PPC_TRANSACTIONAL_MEM */
 #ifdef CONFIG_KVM_BOOK3S_32_HANDLER
 	void*		kvm_shadow_vcpu; /* KVM internal data */

+ 4 - 1
arch/powerpc/include/asm/reg.h

@@ -475,6 +475,9 @@
 #define HID0_POWER8_1TO4LPAR	__MASK(51)
 #define HID0_POWER8_DYNLPARDIS	__MASK(48)
 
+/* POWER9 HID0 bits */
+#define HID0_POWER9_RADIX	__MASK(63 - 8)
+
 #define SPRN_HID1	0x3F1		/* Hardware Implementation Register 1 */
 #ifdef CONFIG_6xx
 #define HID1_EMCP	(1<<31)		/* 7450 Machine Check Pin Enable */
@@ -1248,7 +1251,7 @@ static inline void mtmsr_isync(unsigned long val)
 				     : "memory")
 #endif
 
-extern void msr_check_and_set(unsigned long bits);
+extern unsigned long msr_check_and_set(unsigned long bits);
 extern bool strict_msr_control;
 extern void __msr_check_and_clear(unsigned long bits);
 static inline void msr_check_and_clear(unsigned long bits)

+ 0 - 2
arch/powerpc/include/asm/signal.h

@@ -5,6 +5,4 @@
 #include <uapi/asm/signal.h>
 #include <uapi/asm/ptrace.h>
 
-extern unsigned long get_tm_stackpointer(struct pt_regs *regs);
-
 #endif /* _ASM_POWERPC_SIGNAL_H */

+ 0 - 5
arch/powerpc/include/asm/tm.h

@@ -9,11 +9,6 @@
 
 #ifndef __ASSEMBLY__
 
-#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
-extern void do_load_up_transact_fpu(struct thread_struct *thread);
-extern void do_load_up_transact_altivec(struct thread_struct *thread);
-#endif
-
 extern void tm_enable(void);
 extern void tm_reclaim(struct thread_struct *thread,
 		       unsigned long orig_msr, uint8_t cause);

+ 7 - 8
arch/powerpc/kernel/Makefile

@@ -31,8 +31,7 @@ obj-y				:= cputable.o ptrace.o syscalls.o \
 				   process.o systbl.o idle.o \
 				   signal.o sysfs.o cacheinfo.o time.o \
 				   prom.o traps.o setup-common.o \
-				   udbg.o misc.o io.o dma.o \
-				   misc_$(CONFIG_WORD_SIZE).o \
+				   udbg.o misc.o io.o dma.o misc_$(BITS).o \
 				   of_platform.o prom_parse.o
 obj-$(CONFIG_PPC64)		+= setup_64.o sys_ppc32.o \
 				   signal_64.o ptrace32.o \
@@ -70,23 +69,23 @@ obj-$(CONFIG_HIBERNATION)	+= swsusp.o suspend.o
 ifeq ($(CONFIG_FSL_BOOKE),y)
 obj-$(CONFIG_HIBERNATION)	+= swsusp_booke.o
 else
-obj-$(CONFIG_HIBERNATION)	+= swsusp_$(CONFIG_WORD_SIZE).o
+obj-$(CONFIG_HIBERNATION)	+= swsusp_$(BITS).o
 endif
 obj64-$(CONFIG_HIBERNATION)	+= swsusp_asm64.o
-obj-$(CONFIG_MODULES)		+= module.o module_$(CONFIG_WORD_SIZE).o
+obj-$(CONFIG_MODULES)		+= module.o module_$(BITS).o
 obj-$(CONFIG_44x)		+= cpu_setup_44x.o
 obj-$(CONFIG_PPC_FSL_BOOK3E)	+= cpu_setup_fsl_booke.o
 obj-$(CONFIG_PPC_DOORBELL)	+= dbell.o
 obj-$(CONFIG_JUMP_LABEL)	+= jump_label.o
 
-extra-y				:= head_$(CONFIG_WORD_SIZE).o
+extra-y				:= head_$(BITS).o
 extra-$(CONFIG_40x)		:= head_40x.o
 extra-$(CONFIG_44x)		:= head_44x.o
 extra-$(CONFIG_FSL_BOOKE)	:= head_fsl_booke.o
 extra-$(CONFIG_8xx)		:= head_8xx.o
 extra-y				+= vmlinux.lds
 
-obj-$(CONFIG_RELOCATABLE)	+= reloc_$(CONFIG_WORD_SIZE).o
+obj-$(CONFIG_RELOCATABLE)	+= reloc_$(BITS).o
 
 obj-$(CONFIG_PPC32)		+= entry_32.o setup_32.o
 obj-$(CONFIG_PPC64)		+= dma-iommu.o iommu.o
@@ -104,11 +103,11 @@ obj-$(CONFIG_STACKTRACE)	+= stacktrace.o
 obj-$(CONFIG_SWIOTLB)		+= dma-swiotlb.o
 
 pci64-$(CONFIG_PPC64)		+= pci_dn.o pci-hotplug.o isa-bridge.o
-obj-$(CONFIG_PCI)		+= pci_$(CONFIG_WORD_SIZE).o $(pci64-y) \
+obj-$(CONFIG_PCI)		+= pci_$(BITS).o $(pci64-y) \
 				   pci-common.o pci_of_scan.o
 obj-$(CONFIG_PCI_MSI)		+= msi.o
 obj-$(CONFIG_KEXEC)		+= machine_kexec.o crash.o \
-				   machine_kexec_$(CONFIG_WORD_SIZE).o
+				   machine_kexec_$(BITS).o
 obj-$(CONFIG_AUDIT)		+= audit.o
 obj64-$(CONFIG_AUDIT)		+= compat_audit.o
 

+ 6 - 6
arch/powerpc/kernel/asm-offsets.c

@@ -142,12 +142,12 @@ int main(void)
 	DEFINE(THREAD_TM_PPR, offsetof(struct thread_struct, tm_ppr));
 	DEFINE(THREAD_TM_DSCR, offsetof(struct thread_struct, tm_dscr));
 	DEFINE(PT_CKPT_REGS, offsetof(struct thread_struct, ckpt_regs));
-	DEFINE(THREAD_TRANSACT_VRSTATE, offsetof(struct thread_struct,
-						 transact_vr));
-	DEFINE(THREAD_TRANSACT_VRSAVE, offsetof(struct thread_struct,
-					    transact_vrsave));
-	DEFINE(THREAD_TRANSACT_FPSTATE, offsetof(struct thread_struct,
-						 transact_fp));
+	DEFINE(THREAD_CKVRSTATE, offsetof(struct thread_struct,
+						 ckvr_state));
+	DEFINE(THREAD_CKVRSAVE, offsetof(struct thread_struct,
+					    ckvrsave));
+	DEFINE(THREAD_CKFPSTATE, offsetof(struct thread_struct,
+						 ckfp_state));
 	/* Local pt_regs on stack for Transactional Memory funcs. */
 	DEFINE(TM_FRAME_SIZE, STACK_FRAME_OVERHEAD +
 	       sizeof(struct pt_regs) + 16);

+ 19 - 0
arch/powerpc/kernel/cputable.c

@@ -506,6 +506,25 @@ static struct cpu_spec __initdata cpu_specs[] = {
 		.machine_check_early	= __machine_check_early_realmode_p8,
 		.platform		= "power8",
 	},
+	{	/* Power9 DD1 */
+		.pvr_mask		= 0xffffff00,
+		.pvr_value		= 0x004e0100,
+		.cpu_name		= "POWER9 (raw)",
+		.cpu_features		= CPU_FTRS_POWER9_DD1,
+		.cpu_user_features	= COMMON_USER_POWER9,
+		.cpu_user_features2	= COMMON_USER2_POWER9,
+		.mmu_features		= MMU_FTRS_POWER9,
+		.icache_bsize		= 128,
+		.dcache_bsize		= 128,
+		.num_pmcs		= 6,
+		.pmc_type		= PPC_PMC_IBM,
+		.oprofile_cpu_type	= "ppc64/power9",
+		.oprofile_type		= PPC_OPROFILE_INVALID,
+		.cpu_setup		= __setup_cpu_power9,
+		.cpu_restore		= __restore_cpu_power9,
+		.flush_tlb		= __flush_tlb_power9,
+		.platform		= "power9",
+	},
 	{	/* Power9 */
 		.pvr_mask		= 0xffff0000,
 		.pvr_value		= 0x004e0000,

+ 3 - 1
arch/powerpc/kernel/eeh.c

@@ -116,6 +116,7 @@ struct eeh_ops *eeh_ops = NULL;
 
 /* Lock to avoid races due to multiple reports of an error */
 DEFINE_RAW_SPINLOCK(confirm_error_lock);
+EXPORT_SYMBOL_GPL(confirm_error_lock);
 
 /* Lock to protect passed flags */
 static DEFINE_MUTEX(eeh_dev_mutex);
@@ -1044,7 +1045,7 @@ int eeh_init(void)
 	if (eeh_enabled())
 		pr_info("EEH: PCI Enhanced I/O Error Handling Enabled\n");
 	else
-		pr_warn("EEH: No capable adapters found\n");
+		pr_info("EEH: No capable adapters found\n");
 
 	return ret;
 }
@@ -1502,6 +1503,7 @@ int eeh_pe_set_option(struct eeh_pe *pe, int option)
 		break;
 	case EEH_OPT_THAW_MMIO:
 	case EEH_OPT_THAW_DMA:
+	case EEH_OPT_FREEZE_PE:
 		if (!eeh_ops || !eeh_ops->set_option) {
 			ret = -ENOENT;
 			break;

+ 9 - 1
arch/powerpc/kernel/eeh_driver.c

@@ -993,9 +993,17 @@ static void eeh_handle_special_event(void)
 
 				/* Notify all devices to be down */
 				eeh_pe_state_clear(pe, EEH_PE_PRI_BUS);
-				bus = eeh_pe_bus_get(phb_pe);
 				eeh_pe_dev_traverse(pe,
 					eeh_report_failure, NULL);
+				bus = eeh_pe_bus_get(phb_pe);
+				if (!bus) {
+					pr_err("%s: Cannot find PCI bus for "
+					       "PHB#%d-PE#%x\n",
+					       __func__,
+					       pe->phb->global_number,
+					       pe->addr);
+					break;
+				}
 				pci_hp_remove_devices(bus);
 			}
 			pci_unlock_rescan_remove();

+ 1 - 0
arch/powerpc/kernel/eeh_pe.c

@@ -581,6 +581,7 @@ void eeh_pe_state_mark(struct eeh_pe *pe, int state)
 {
 	eeh_pe_traverse(pe, __eeh_pe_state_mark, &state);
 }
+EXPORT_SYMBOL_GPL(eeh_pe_state_mark);
 
 static void *__eeh_pe_dev_mode_mark(void *data, void *flag)
 {

+ 0 - 1
arch/powerpc/kernel/entry_32.S

@@ -654,7 +654,6 @@ END_FTR_SECTION_IFSET(CPU_FTR_SPE)
 #endif /* CONFIG_SMP */
 
 	tophys(r0,r4)
-	CLR_TOP32(r0)
 	mtspr	SPRN_SPRG_THREAD,r0	/* Update current THREAD phys addr */
 	lwz	r1,KSP(r4)	/* Load new stack pointer */
 

+ 9 - 12
arch/powerpc/kernel/entry_64.S

@@ -139,7 +139,7 @@ END_FW_FTR_SECTION_IFSET(FW_FEATURE_SPLPAR)
 #ifdef CONFIG_PPC_BOOK3E
 	wrteei	1
 #else
-	ld	r11,PACAKMSR(r13)
+	li	r11,MSR_RI
 	ori	r11,r11,MSR_EE
 	mtmsrd	r11,1
 #endif /* CONFIG_PPC_BOOK3E */
@@ -195,7 +195,6 @@ system_call:			/* label this so stack traces look sane */
 #ifdef CONFIG_PPC_BOOK3E
 	wrteei	0
 #else
-	ld	r10,PACAKMSR(r13)
 	/*
 	 * For performance reasons we clear RI the same time that we
 	 * clear EE. We only need to clear RI just before we restore r13
@@ -203,8 +202,7 @@ system_call:			/* label this so stack traces look sane */
 	 * We have to be careful to restore RI if we branch anywhere from
 	 * here (eg syscall_exit_work).
 	 */
-	li	r9,MSR_RI
-	andc	r11,r10,r9
+	li	r11,0
 	mtmsrd	r11,1
 #endif /* CONFIG_PPC_BOOK3E */
 
@@ -221,13 +219,12 @@ system_call:			/* label this so stack traces look sane */
 #endif
 2:	addi    r3,r1,STACK_FRAME_OVERHEAD
 #ifdef CONFIG_PPC_BOOK3S
+	li	r10,MSR_RI
 	mtmsrd	r10,1		/* Restore RI */
 #endif
 	bl	restore_math
 #ifdef CONFIG_PPC_BOOK3S
-	ld	r10,PACAKMSR(r13)
-	li	r9,MSR_RI
-	andc	r11,r10,r9 /* Re-clear RI */
+	li	r11,0
 	mtmsrd	r11,1
 #endif
 	ld	r8,_MSR(r1)
@@ -308,6 +305,7 @@ syscall_enosys:
 	
 syscall_exit_work:
 #ifdef CONFIG_PPC_BOOK3S
+	li	r10,MSR_RI
 	mtmsrd	r10,1		/* Restore RI */
 #endif
 	/* If TIF_RESTOREALL is set, don't scribble on either r3 or ccr.
@@ -354,7 +352,7 @@ END_FTR_SECTION_IFSET(CPU_FTR_HAS_PPR)
 #ifdef CONFIG_PPC_BOOK3E
 	wrteei	1
 #else
-	ld	r10,PACAKMSR(r13)
+	li	r10,MSR_RI
 	ori	r10,r10,MSR_EE
 	mtmsrd	r10,1
 #endif /* CONFIG_PPC_BOOK3E */
@@ -619,7 +617,7 @@ _GLOBAL(ret_from_except_lite)
 #ifdef CONFIG_PPC_BOOK3E
 	wrteei	0
 #else
-	ld	r10,PACAKMSR(r13) /* Get kernel MSR without EE */
+	li	r10,MSR_RI
 	mtmsrd	r10,1		  /* Update machine state */
 #endif /* CONFIG_PPC_BOOK3E */
 
@@ -751,7 +749,7 @@ resume_kernel:
 #ifdef CONFIG_PPC_BOOK3E
 	wrteei	0
 #else
-	ld	r10,PACAKMSR(r13) /* Get kernel MSR without EE */
+	li	r10,MSR_RI
 	mtmsrd	r10,1		  /* Update machine state */
 #endif /* CONFIG_PPC_BOOK3E */
 #endif /* CONFIG_PREEMPT */
@@ -841,8 +839,7 @@ END_FTR_SECTION_IFSET(CPU_FTR_HAS_PPR)
 	 * userspace and we take an exception after restoring r13,
 	 * we end up corrupting the userspace r13 value.
 	 */
-	ld	r4,PACAKMSR(r13) /* Get kernel MSR without EE */
-	andc	r4,r4,r0	 /* r0 contains MSR_RI here */
+	li	r4,0
 	mtmsrd	r4,1
 
 #ifdef CONFIG_PPC_TRANSACTIONAL_MEM

+ 1008 - 1060
arch/powerpc/kernel/exceptions-64s.S

@@ -16,72 +16,71 @@
 #include <asm/exception-64s.h>
 #include <asm/ptrace.h>
 #include <asm/cpuidle.h>
+#include <asm/head-64.h>
 
 /*
+ * There are a few constraints to be concerned with.
+ * - Real mode exceptions code/data must be located at their physical location.
+ * - Virtual mode exceptions must be mapped at their 0xc000... location.
+ * - Fixed location code must not call directly beyond the __end_interrupts
+ *   area when built with CONFIG_RELOCATABLE. LOAD_HANDLER / bctr sequence
+ *   must be used.
+ * - LOAD_HANDLER targets must be within first 64K of physical 0 /
+ *   virtual 0xc00...
+ * - Conditional branch targets must be within +/-32K of caller.
+ *
+ * "Virtual exceptions" run with relocation on (MSR_IR=1, MSR_DR=1), and
+ * therefore don't have to run in physically located code or rfid to
+ * virtual mode kernel code. However on relocatable kernels they do have
+ * to branch to KERNELBASE offset because the rest of the kernel (outside
+ * the exception vectors) may be located elsewhere.
+ *
+ * Virtual exceptions correspond with physical, except their entry points
+ * are offset by 0xc000000000000000 and also tend to get an added 0x4000
+ * offset applied. Virtual exceptions are enabled with the Alternate
+ * Interrupt Location (AIL) bit set in the LPCR. However this does not
+ * guarantee they will be delivered virtually. Some conditions (see the ISA)
+ * cause exceptions to be delivered in real mode.
+ *
+ * It's impossible to receive interrupts below 0x300 via AIL.
+ *
+ * KVM: None of the virtual exceptions are from the guest. Anything that
+ * escalated to HV=1 from HV=0 is delivered via real mode handlers.
+ *
+ *
  * We lay out physical memory as follows:
  * 0x0000 - 0x00ff : Secondary processor spin code
- * 0x0100 - 0x17ff : pSeries Interrupt prologs
- * 0x1800 - 0x4000 : interrupt support common interrupt prologs
- * 0x4000 - 0x5fff : pSeries interrupts with IR=1,DR=1
- * 0x6000 - 0x6fff : more interrupt support including for IR=1,DR=1
+ * 0x0100 - 0x18ff : Real mode pSeries interrupt vectors
+ * 0x1900 - 0x3fff : Real mode trampolines
+ * 0x4000 - 0x58ff : Relon (IR=1,DR=1) mode pSeries interrupt vectors
+ * 0x5900 - 0x6fff : Relon mode trampolines
  * 0x7000 - 0x7fff : FWNMI data area
- * 0x8000 - 0x8fff : Initial (CPU0) segment table
- * 0x9000 -        : Early init and support code
+ * 0x8000 -   .... : Common interrupt handlers, remaining early
+ *                   setup code, rest of kernel.
+ *
+ * We could reclaim 0x4000-0x42ff for real mode trampolines if the space
+ * is necessary. Until then it's more consistent to explicitly put VIRT_NONE
+ * vectors there.
  */
-	/* Syscall routine is used twice, in reloc-off and reloc-on paths */
-#define SYSCALL_PSERIES_1 					\
-BEGIN_FTR_SECTION						\
-	cmpdi	r0,0x1ebe ; 					\
-	beq-	1f ;						\
-END_FTR_SECTION_IFSET(CPU_FTR_REAL_LE)				\
-	mr	r9,r13 ;					\
-	GET_PACA(r13) ;						\
-	mfspr	r11,SPRN_SRR0 ;					\
-0:
-
-#define SYSCALL_PSERIES_2_RFID 					\
-	mfspr	r12,SPRN_SRR1 ;					\
-	ld	r10,PACAKBASE(r13) ; 				\
-	LOAD_HANDLER(r10, system_call_entry) ; 			\
-	mtspr	SPRN_SRR0,r10 ; 				\
-	ld	r10,PACAKMSR(r13) ;				\
-	mtspr	SPRN_SRR1,r10 ; 				\
-	rfid ; 							\
-	b	. ;	/* prevent speculative execution */
-
-#define SYSCALL_PSERIES_3					\
-	/* Fast LE/BE switch system call */			\
-1:	mfspr	r12,SPRN_SRR1 ;					\
-	xori	r12,r12,MSR_LE ;				\
-	mtspr	SPRN_SRR1,r12 ;					\
-	rfid ;		/* return to userspace */		\
-	b	. ;	/* prevent speculative execution */
-
-#if defined(CONFIG_RELOCATABLE)
-	/*
-	 * We can't branch directly so we do it via the CTR which
-	 * is volatile across system calls.
-	 */
-#define SYSCALL_PSERIES_2_DIRECT				\
-	mflr	r10 ;						\
-	ld	r12,PACAKBASE(r13) ; 				\
-	LOAD_HANDLER(r12, system_call_entry) ;			\
-	mtctr	r12 ;						\
-	mfspr	r12,SPRN_SRR1 ;					\
-	/* Re-use of r13... No spare regs to do this */	\
-	li	r13,MSR_RI ;					\
-	mtmsrd 	r13,1 ;						\
-	GET_PACA(r13) ;	/* get r13 back */			\
-	bctr ;
+OPEN_FIXED_SECTION(real_vectors,        0x0100, 0x1900)
+OPEN_FIXED_SECTION(real_trampolines,    0x1900, 0x4000)
+OPEN_FIXED_SECTION(virt_vectors,        0x4000, 0x5900)
+OPEN_FIXED_SECTION(virt_trampolines,    0x5900, 0x7000)
+#if defined(CONFIG_PPC_PSERIES) || defined(CONFIG_PPC_POWERNV)
+/*
+ * Data area reserved for FWNMI option.
+ * This address (0x7000) is fixed by the RPA.
+ * pseries and powernv need to keep the whole page from
+ * 0x7000 to 0x8000 free for use by the firmware
+ */
+ZERO_FIXED_SECTION(fwnmi_page,          0x7000, 0x8000)
+OPEN_TEXT_SECTION(0x8000)
 #else
-	/* We can branch directly */
-#define SYSCALL_PSERIES_2_DIRECT				\
-	mfspr	r12,SPRN_SRR1 ;					\
-	li	r10,MSR_RI ;					\
-	mtmsrd 	r10,1 ;			/* Set RI (EE=0) */	\
-	b	system_call_common ;
+OPEN_TEXT_SECTION(0x7000)
 #endif
 
+USE_FIXED_SECTION(real_vectors)
+
 /*
  * This is the start of the interrupt handlers for pSeries
  * This code runs with relocation off.
@@ -90,12 +89,13 @@ END_FTR_SECTION_IFSET(CPU_FTR_REAL_LE)				\
  * Therefore any relative branches in this section must only
  * branch to labels in this section.
  */
-	. = 0x100
 	.globl __start_interrupts
 __start_interrupts:
 
-	.globl system_reset_pSeries;
-system_reset_pSeries:
+/* No virt vectors corresponding with 0x0..0x100 */
+EXC_VIRT_NONE(0x4000, 0x4100)
+
+EXC_REAL_BEGIN(system_reset, 0x100, 0x200)
 	SET_SCRATCH0(r13)
 #ifdef CONFIG_PPC_P7_NAP
 BEGIN_FTR_SECTION
@@ -136,9 +136,22 @@ END_FTR_SECTION_IFSET(CPU_FTR_HVMODE | CPU_FTR_ARCH_206)
 #endif /* CONFIG_PPC_P7_NAP */
 	EXCEPTION_PROLOG_PSERIES(PACA_EXGEN, system_reset_common, EXC_STD,
 				 NOTEST, 0x100)
+EXC_REAL_END(system_reset, 0x100, 0x200)
+EXC_VIRT_NONE(0x4100, 0x4200)
+EXC_COMMON(system_reset_common, 0x100, system_reset_exception)
+
+#ifdef CONFIG_PPC_PSERIES
+/*
+ * Vectors for the FWNMI option.  Share common code.
+ */
+TRAMP_REAL_BEGIN(system_reset_fwnmi)
+	SET_SCRATCH0(r13)		/* save r13 */
+	EXCEPTION_PROLOG_PSERIES(PACA_EXGEN, system_reset_common, EXC_STD,
+				 NOTEST, 0x100)
+#endif /* CONFIG_PPC_PSERIES */
 
-	. = 0x200
-machine_check_pSeries_1:
+
+EXC_REAL_BEGIN(machine_check, 0x200, 0x300)
 	/* This is moved out of line as it can be patched by FW, but
 	 * some code path might still want to branch into the original
 	 * vector
@@ -158,253 +171,9 @@ BEGIN_FTR_SECTION
 FTR_SECTION_ELSE
 	b	machine_check_pSeries_0
 ALT_FTR_SECTION_END_IFSET(CPU_FTR_HVMODE)
-
-	. = 0x300
-	.globl data_access_pSeries
-data_access_pSeries:
-	SET_SCRATCH0(r13)
-	EXCEPTION_PROLOG_PSERIES(PACA_EXGEN, data_access_common, EXC_STD,
-				 KVMTEST, 0x300)
-
-	. = 0x380
-	.globl data_access_slb_pSeries
-data_access_slb_pSeries:
-	SET_SCRATCH0(r13)
-	EXCEPTION_PROLOG_0(PACA_EXSLB)
-	EXCEPTION_PROLOG_1(PACA_EXSLB, KVMTEST, 0x380)
-	std	r3,PACA_EXSLB+EX_R3(r13)
-	mfspr	r3,SPRN_DAR
-	mfspr	r12,SPRN_SRR1
-#ifndef CONFIG_RELOCATABLE
-	b	slb_miss_realmode
-#else
-	/*
-	 * We can't just use a direct branch to slb_miss_realmode
-	 * because the distance from here to there depends on where
-	 * the kernel ends up being put.
-	 */
-	mfctr	r11
-	ld	r10,PACAKBASE(r13)
-	LOAD_HANDLER(r10, slb_miss_realmode)
-	mtctr	r10
-	bctr
-#endif
-
-	STD_EXCEPTION_PSERIES(0x400, instruction_access)
-
-	. = 0x480
-	.globl instruction_access_slb_pSeries
-instruction_access_slb_pSeries:
-	SET_SCRATCH0(r13)
-	EXCEPTION_PROLOG_0(PACA_EXSLB)
-	EXCEPTION_PROLOG_1(PACA_EXSLB, KVMTEST, 0x480)
-	std	r3,PACA_EXSLB+EX_R3(r13)
-	mfspr	r3,SPRN_SRR0		/* SRR0 is faulting address */
-	mfspr	r12,SPRN_SRR1
-#ifndef CONFIG_RELOCATABLE
-	b	slb_miss_realmode
-#else
-	mfctr	r11
-	ld	r10,PACAKBASE(r13)
-	LOAD_HANDLER(r10, slb_miss_realmode)
-	mtctr	r10
-	bctr
-#endif
-
-	/* We open code these as we can't have a ". = x" (even with
-	 * x = "." within a feature section
-	 */
-	. = 0x500;
-	.globl hardware_interrupt_pSeries;
-	.globl hardware_interrupt_hv;
-hardware_interrupt_pSeries:
-hardware_interrupt_hv:
-	BEGIN_FTR_SECTION
-		_MASKABLE_EXCEPTION_PSERIES(0x502, hardware_interrupt,
-					    EXC_HV, SOFTEN_TEST_HV)
-		KVM_HANDLER(PACA_EXGEN, EXC_HV, 0x502)
-	FTR_SECTION_ELSE
-		_MASKABLE_EXCEPTION_PSERIES(0x500, hardware_interrupt,
-					    EXC_STD, SOFTEN_TEST_PR)
-		KVM_HANDLER(PACA_EXGEN, EXC_STD, 0x500)
-	ALT_FTR_SECTION_END_IFSET(CPU_FTR_HVMODE | CPU_FTR_ARCH_206)
-
-	STD_EXCEPTION_PSERIES(0x600, alignment)
-	KVM_HANDLER(PACA_EXGEN, EXC_STD, 0x600)
-
-	STD_EXCEPTION_PSERIES(0x700, program_check)
-	KVM_HANDLER(PACA_EXGEN, EXC_STD, 0x700)
-
-	STD_EXCEPTION_PSERIES(0x800, fp_unavailable)
-	KVM_HANDLER(PACA_EXGEN, EXC_STD, 0x800)
-
-	. = 0x900
-	.globl decrementer_pSeries
-decrementer_pSeries:
-	_MASKABLE_EXCEPTION_PSERIES(0x900, decrementer, EXC_STD, SOFTEN_TEST_PR)
-
-	STD_EXCEPTION_HV(0x980, 0x982, hdecrementer)
-
-	MASKABLE_EXCEPTION_PSERIES(0xa00, 0xa00, doorbell_super)
-	KVM_HANDLER(PACA_EXGEN, EXC_STD, 0xa00)
-
-	STD_EXCEPTION_PSERIES(0xb00, trap_0b)
-	KVM_HANDLER(PACA_EXGEN, EXC_STD, 0xb00)
-
-	. = 0xc00
-	.globl	system_call_pSeries
-system_call_pSeries:
-	 /*
-	  * If CONFIG_KVM_BOOK3S_64_HANDLER is set, save the PPR (on systems
-	  * that support it) before changing to HMT_MEDIUM. That allows the KVM
-	  * code to save that value into the guest state (it is the guest's PPR
-	  * value). Otherwise just change to HMT_MEDIUM as userspace has
-	  * already saved the PPR.
-	  */
-#ifdef CONFIG_KVM_BOOK3S_64_HANDLER
-	SET_SCRATCH0(r13)
-	GET_PACA(r13)
-	std	r9,PACA_EXGEN+EX_R9(r13)
-	OPT_GET_SPR(r9, SPRN_PPR, CPU_FTR_HAS_PPR);
-	HMT_MEDIUM;
-	std	r10,PACA_EXGEN+EX_R10(r13)
-	OPT_SAVE_REG_TO_PACA(PACA_EXGEN+EX_PPR, r9, CPU_FTR_HAS_PPR);
-	mfcr	r9
-	KVMTEST(0xc00)
-	GET_SCRATCH0(r13)
-#else
-	HMT_MEDIUM;
-#endif
-	SYSCALL_PSERIES_1
-	SYSCALL_PSERIES_2_RFID
-	SYSCALL_PSERIES_3
-	KVM_HANDLER(PACA_EXGEN, EXC_STD, 0xc00)
-
-	STD_EXCEPTION_PSERIES(0xd00, single_step)
-	KVM_HANDLER(PACA_EXGEN, EXC_STD, 0xd00)
-
-	/* At 0xe??? we have a bunch of hypervisor exceptions, we branch
-	 * out of line to handle them
-	 */
-	. = 0xe00
-hv_data_storage_trampoline:
-	SET_SCRATCH0(r13)
-	EXCEPTION_PROLOG_0(PACA_EXGEN)
-	b	h_data_storage_hv
-
-	. = 0xe20
-hv_instr_storage_trampoline:
-	SET_SCRATCH0(r13)
-	EXCEPTION_PROLOG_0(PACA_EXGEN)
-	b	h_instr_storage_hv
-
-	. = 0xe40
-emulation_assist_trampoline:
-	SET_SCRATCH0(r13)
-	EXCEPTION_PROLOG_0(PACA_EXGEN)
-	b	emulation_assist_hv
-
-	. = 0xe60
-hv_exception_trampoline:
-	SET_SCRATCH0(r13)
-	EXCEPTION_PROLOG_0(PACA_EXGEN)
-	b	hmi_exception_early
-
-	. = 0xe80
-hv_doorbell_trampoline:
-	SET_SCRATCH0(r13)
-	EXCEPTION_PROLOG_0(PACA_EXGEN)
-	b	h_doorbell_hv
-
-	. = 0xea0
-hv_virt_irq_trampoline:
-	SET_SCRATCH0(r13)
-	EXCEPTION_PROLOG_0(PACA_EXGEN)
-	b	h_virt_irq_hv
-
-	/* We need to deal with the Altivec unavailable exception
-	 * here which is at 0xf20, thus in the middle of the
-	 * prolog code of the PerformanceMonitor one. A little
-	 * trickery is thus necessary
-	 */
-	. = 0xf00
-performance_monitor_pseries_trampoline:
-	SET_SCRATCH0(r13)
-	EXCEPTION_PROLOG_0(PACA_EXGEN)
-	b	performance_monitor_pSeries
-
-	. = 0xf20
-altivec_unavailable_pseries_trampoline:
-	SET_SCRATCH0(r13)
-	EXCEPTION_PROLOG_0(PACA_EXGEN)
-	b	altivec_unavailable_pSeries
-
-	. = 0xf40
-vsx_unavailable_pseries_trampoline:
-	SET_SCRATCH0(r13)
-	EXCEPTION_PROLOG_0(PACA_EXGEN)
-	b	vsx_unavailable_pSeries
-
-	. = 0xf60
-facility_unavailable_trampoline:
-	SET_SCRATCH0(r13)
-	EXCEPTION_PROLOG_0(PACA_EXGEN)
-	b	facility_unavailable_pSeries
-
-	. = 0xf80
-hv_facility_unavailable_trampoline:
-	SET_SCRATCH0(r13)
-	EXCEPTION_PROLOG_0(PACA_EXGEN)
-	b	facility_unavailable_hv
-
-#ifdef CONFIG_CBE_RAS
-	STD_EXCEPTION_HV(0x1200, 0x1202, cbe_system_error)
-	KVM_HANDLER_SKIP(PACA_EXGEN, EXC_HV, 0x1202)
-#endif /* CONFIG_CBE_RAS */
-
-	STD_EXCEPTION_PSERIES(0x1300, instruction_breakpoint)
-	KVM_HANDLER_SKIP(PACA_EXGEN, EXC_STD, 0x1300)
-
-	. = 0x1500
-	.global denorm_exception_hv
-denorm_exception_hv:
-	mtspr	SPRN_SPRG_HSCRATCH0,r13
-	EXCEPTION_PROLOG_0(PACA_EXGEN)
-	EXCEPTION_PROLOG_1(PACA_EXGEN, NOTEST, 0x1500)
-
-#ifdef CONFIG_PPC_DENORMALISATION
-	mfspr	r10,SPRN_HSRR1
-	mfspr	r11,SPRN_HSRR0		/* save HSRR0 */
-	andis.	r10,r10,(HSRR1_DENORM)@h /* denorm? */
-	addi	r11,r11,-4		/* HSRR0 is next instruction */
-	bne+	denorm_assist
-#endif
-
-	KVMTEST(0x1500)
-	EXCEPTION_PROLOG_PSERIES_1(denorm_common, EXC_HV)
-	KVM_HANDLER_SKIP(PACA_EXGEN, EXC_STD, 0x1500)
-
-#ifdef CONFIG_CBE_RAS
-	STD_EXCEPTION_HV(0x1600, 0x1602, cbe_maintenance)
-	KVM_HANDLER_SKIP(PACA_EXGEN, EXC_HV, 0x1602)
-#endif /* CONFIG_CBE_RAS */
-
-	STD_EXCEPTION_PSERIES(0x1700, altivec_assist)
-	KVM_HANDLER(PACA_EXGEN, EXC_STD, 0x1700)
-
-#ifdef CONFIG_CBE_RAS
-	STD_EXCEPTION_HV(0x1800, 0x1802, cbe_thermal)
-	KVM_HANDLER_SKIP(PACA_EXGEN, EXC_HV, 0x1802)
-#else
-	. = 0x1800
-#endif /* CONFIG_CBE_RAS */
-
-
-/*** Out of line interrupts support ***/
-
-	.align	7
-	/* moved from 0x200 */
-machine_check_powernv_early:
+EXC_REAL_END(machine_check, 0x200, 0x300)
+EXC_VIRT_NONE(0x4200, 0x4300)
+TRAMP_REAL_BEGIN(machine_check_powernv_early)
 BEGIN_FTR_SECTION
 	EXCEPTION_PROLOG_1(PACA_EXMC, NOTEST, 0x200)
 	/*
@@ -457,7 +226,6 @@ BEGIN_FTR_SECTION
 	mfmsr	r11			/* get MSR value */
 	ori	r11,r11,MSR_ME		/* turn on ME bit */
 	ori	r11,r11,MSR_RI		/* turn on RI bit */
-	ld	r12,PACAKBASE(r13)	/* get high part of &label */
 	LOAD_HANDLER(r12, machine_check_handle_early)
 1:	mtspr	SPRN_SRR0,r12
 	mtspr	SPRN_SRR1,r11
@@ -470,7 +238,6 @@ BEGIN_FTR_SECTION
 	 */
 	addi	r1,r1,INT_FRAME_SIZE	/* go back to previous stack frame */
 	ld	r11,PACAKMSR(r13)
-	ld	r12,PACAKBASE(r13)
 	LOAD_HANDLER(r12, unrecover_mce)
 	li	r10,MSR_ME
 	andc	r11,r11,r10		/* Turn off MSR_ME */
@@ -478,20 +245,19 @@ BEGIN_FTR_SECTION
 	b	.	/* prevent speculative execution */
 END_FTR_SECTION_IFSET(CPU_FTR_HVMODE)
 
-machine_check_pSeries:
+TRAMP_REAL_BEGIN(machine_check_pSeries)
 	.globl machine_check_fwnmi
 machine_check_fwnmi:
 	SET_SCRATCH0(r13)		/* save r13 */
 	EXCEPTION_PROLOG_0(PACA_EXMC)
 machine_check_pSeries_0:
-	EXCEPTION_PROLOG_1(PACA_EXMC, KVMTEST, 0x200)
+	EXCEPTION_PROLOG_1(PACA_EXMC, KVMTEST_PR, 0x200)
 	/*
 	 * The following is essentially EXCEPTION_PROLOG_PSERIES_1 with the
 	 * difference that MSR_RI is not enabled, because PACA_EXMC is being
 	 * used, so nested machine check corrupts it. machine_check_common
 	 * enables MSR_RI.
 	 */
-	ld	r12,PACAKBASE(r13)
 	ld	r10,PACAKMSR(r13)
 	xori	r10,r10,MSR_RI
 	mfspr	r11,SPRN_SRR0
@@ -502,287 +268,243 @@ machine_check_pSeries_0:
 	rfid
 	b	.	/* prevent speculative execution */
 
-	KVM_HANDLER_SKIP(PACA_EXMC, EXC_STD, 0x200)
-	KVM_HANDLER_SKIP(PACA_EXGEN, EXC_STD, 0x300)
-	KVM_HANDLER_SKIP(PACA_EXSLB, EXC_STD, 0x380)
-	KVM_HANDLER(PACA_EXGEN, EXC_STD, 0x400)
-	KVM_HANDLER(PACA_EXSLB, EXC_STD, 0x480)
-	KVM_HANDLER(PACA_EXGEN, EXC_STD, 0x900)
-	KVM_HANDLER(PACA_EXGEN, EXC_HV, 0x982)
-
-#ifdef CONFIG_PPC_DENORMALISATION
-denorm_assist:
-BEGIN_FTR_SECTION
-/*
- * To denormalise we need to move a copy of the register to itself.
- * For POWER6 do that here for all FP regs.
- */
-	mfmsr	r10
-	ori	r10,r10,(MSR_FP|MSR_FE0|MSR_FE1)
-	xori	r10,r10,(MSR_FE0|MSR_FE1)
-	mtmsrd	r10
-	sync
-
-#define FMR2(n)  fmr (n), (n) ; fmr n+1, n+1
-#define FMR4(n)  FMR2(n) ; FMR2(n+2)
-#define FMR8(n)  FMR4(n) ; FMR4(n+4)
-#define FMR16(n) FMR8(n) ; FMR8(n+8)
-#define FMR32(n) FMR16(n) ; FMR16(n+16)
-	FMR32(0)
+TRAMP_KVM_SKIP(PACA_EXMC, 0x200)
 
-FTR_SECTION_ELSE
-/*
- * To denormalise we need to move a copy of the register to itself.
- * For POWER7 do that here for the first 32 VSX registers only.
- */
-	mfmsr	r10
-	oris	r10,r10,MSR_VSX@h
-	mtmsrd	r10
-	sync
+EXC_COMMON_BEGIN(machine_check_common)
+	/*
+	 * Machine check is different because we use a different
+	 * save area: PACA_EXMC instead of PACA_EXGEN.
+	 */
+	mfspr	r10,SPRN_DAR
+	std	r10,PACA_EXMC+EX_DAR(r13)
+	mfspr	r10,SPRN_DSISR
+	stw	r10,PACA_EXMC+EX_DSISR(r13)
+	EXCEPTION_PROLOG_COMMON(0x200, PACA_EXMC)
+	FINISH_NAP
+	RECONCILE_IRQ_STATE(r10, r11)
+	ld	r3,PACA_EXMC+EX_DAR(r13)
+	lwz	r4,PACA_EXMC+EX_DSISR(r13)
+	/* Enable MSR_RI when finished with PACA_EXMC */
+	li	r10,MSR_RI
+	mtmsrd 	r10,1
+	std	r3,_DAR(r1)
+	std	r4,_DSISR(r1)
+	bl	save_nvgprs
+	addi	r3,r1,STACK_FRAME_OVERHEAD
+	bl	machine_check_exception
+	b	ret_from_except
 
-#define XVCPSGNDP2(n) XVCPSGNDP(n,n,n) ; XVCPSGNDP(n+1,n+1,n+1)
-#define XVCPSGNDP4(n) XVCPSGNDP2(n) ; XVCPSGNDP2(n+2)
-#define XVCPSGNDP8(n) XVCPSGNDP4(n) ; XVCPSGNDP4(n+4)
-#define XVCPSGNDP16(n) XVCPSGNDP8(n) ; XVCPSGNDP8(n+8)
-#define XVCPSGNDP32(n) XVCPSGNDP16(n) ; XVCPSGNDP16(n+16)
-	XVCPSGNDP32(0)
+#define MACHINE_CHECK_HANDLER_WINDUP			\
+	/* Clear MSR_RI before setting SRR0 and SRR1. */\
+	li	r0,MSR_RI;				\
+	mfmsr	r9;		/* get MSR value */	\
+	andc	r9,r9,r0;				\
+	mtmsrd	r9,1;		/* Clear MSR_RI */	\
+	/* Move original SRR0 and SRR1 into the respective regs */	\
+	ld	r9,_MSR(r1);				\
+	mtspr	SPRN_SRR1,r9;				\
+	ld	r3,_NIP(r1);				\
+	mtspr	SPRN_SRR0,r3;				\
+	ld	r9,_CTR(r1);				\
+	mtctr	r9;					\
+	ld	r9,_XER(r1);				\
+	mtxer	r9;					\
+	ld	r9,_LINK(r1);				\
+	mtlr	r9;					\
+	REST_GPR(0, r1);				\
+	REST_8GPRS(2, r1);				\
+	REST_GPR(10, r1);				\
+	ld	r11,_CCR(r1);				\
+	mtcr	r11;					\
+	/* Decrement paca->in_mce. */			\
+	lhz	r12,PACA_IN_MCE(r13);			\
+	subi	r12,r12,1;				\
+	sth	r12,PACA_IN_MCE(r13);			\
+	REST_GPR(11, r1);				\
+	REST_2GPRS(12, r1);				\
+	/* restore original r1. */			\
+	ld	r1,GPR1(r1)
 
-ALT_FTR_SECTION_END_IFCLR(CPU_FTR_ARCH_206)
+	/*
+	 * Handle machine check early in real mode. We come here with
+	 * ME=1, MMU (IR=0 and DR=0) off and using MC emergency stack.
+	 */
+EXC_COMMON_BEGIN(machine_check_handle_early)
+	std	r0,GPR0(r1)	/* Save r0 */
+	EXCEPTION_PROLOG_COMMON_3(0x200)
+	bl	save_nvgprs
+	addi	r3,r1,STACK_FRAME_OVERHEAD
+	bl	machine_check_early
+	std	r3,RESULT(r1)	/* Save result */
+	ld	r12,_MSR(r1)
+#ifdef	CONFIG_PPC_P7_NAP
+	/*
+	 * Check if thread was in power saving mode. We come here when any
+	 * of the following is true:
+	 * a. thread wasn't in power saving mode
+	 * b. thread was in power saving mode with no state loss,
+	 *    supervisor state loss or hypervisor state loss.
+	 *
+	 * Go back to nap/sleep/winkle mode again if (b) is true.
+	 */
+	rlwinm.	r11,r12,47-31,30,31	/* Was it in power saving mode? */
+	beq	4f			/* No, it wasn't */
+	/* Thread was in power saving mode. Go back to nap again. */
+	cmpwi	r11,2
+	blt	3f
+	/* Supervisor/Hypervisor state loss */
+	li	r0,1
+	stb	r0,PACA_NAPSTATELOST(r13)
+3:	bl	machine_check_queue_event
+	MACHINE_CHECK_HANDLER_WINDUP
+	GET_PACA(r13)
+	ld	r1,PACAR1(r13)
+	/*
+	 * Check what idle state this CPU was in and go back to same mode
+	 * again.
+	 */
+	lbz	r3,PACA_THREAD_IDLE_STATE(r13)
+	cmpwi	r3,PNV_THREAD_NAP
+	bgt	10f
+	IDLE_STATE_ENTER_SEQ(PPC_NAP)
+	/* No return */
+10:
+	cmpwi	r3,PNV_THREAD_SLEEP
+	bgt	2f
+	IDLE_STATE_ENTER_SEQ(PPC_SLEEP)
+	/* No return */
 
-BEGIN_FTR_SECTION
-	b	denorm_done
-END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_207S)
-/*
- * To denormalise we need to move a copy of the register to itself.
- * For POWER8 we need to do that for all 64 VSX registers
- */
-	XVCPSGNDP32(32)
-denorm_done:
-	mtspr	SPRN_HSRR0,r11
-	mtcrf	0x80,r9
-	ld	r9,PACA_EXGEN+EX_R9(r13)
-	RESTORE_PPR_PACA(PACA_EXGEN, r10)
-BEGIN_FTR_SECTION
-	ld	r10,PACA_EXGEN+EX_CFAR(r13)
-	mtspr	SPRN_CFAR,r10
-END_FTR_SECTION_IFSET(CPU_FTR_CFAR)
-	ld	r10,PACA_EXGEN+EX_R10(r13)
-	ld	r11,PACA_EXGEN+EX_R11(r13)
-	ld	r12,PACA_EXGEN+EX_R12(r13)
-	ld	r13,PACA_EXGEN+EX_R13(r13)
-	HRFID
-	b	.
+2:
+	/*
+	 * Go back to winkle. Please note that this thread was woken up in
+	 * machine check from winkle and has not restored the per-subcore
+	 * state. Hence before going back to winkle, set the last bit of
+	 * HSPRG0 to 1. This will make sure that if this thread gets woken
+	 * up again at reset vector 0x100 then it will get a chance to
+	 * restore the subcore state.
+	 */
+	ori	r13,r13,1
+	SET_PACA(r13)
+	IDLE_STATE_ENTER_SEQ(PPC_WINKLE)
+	/* No return */
+4:
 #endif
-
-	.align	7
-	/* moved from 0xe00 */
-	STD_EXCEPTION_HV_OOL(0xe02, h_data_storage)
-	KVM_HANDLER_SKIP(PACA_EXGEN, EXC_HV, 0xe02)
-	STD_EXCEPTION_HV_OOL(0xe22, h_instr_storage)
-	KVM_HANDLER(PACA_EXGEN, EXC_HV, 0xe22)
-	STD_EXCEPTION_HV_OOL(0xe42, emulation_assist)
-	KVM_HANDLER(PACA_EXGEN, EXC_HV, 0xe42)
-	MASKABLE_EXCEPTION_HV_OOL(0xe62, hmi_exception)
-	KVM_HANDLER(PACA_EXGEN, EXC_HV, 0xe62)
-
-	MASKABLE_EXCEPTION_HV_OOL(0xe82, h_doorbell)
-	KVM_HANDLER(PACA_EXGEN, EXC_HV, 0xe82)
-
-	MASKABLE_EXCEPTION_HV_OOL(0xea2, h_virt_irq)
-	KVM_HANDLER(PACA_EXGEN, EXC_HV, 0xea2)
-
-	/* moved from 0xf00 */
-	STD_EXCEPTION_PSERIES_OOL(0xf00, performance_monitor)
-	KVM_HANDLER(PACA_EXGEN, EXC_STD, 0xf00)
-	STD_EXCEPTION_PSERIES_OOL(0xf20, altivec_unavailable)
-	KVM_HANDLER(PACA_EXGEN, EXC_STD, 0xf20)
-	STD_EXCEPTION_PSERIES_OOL(0xf40, vsx_unavailable)
-	KVM_HANDLER(PACA_EXGEN, EXC_STD, 0xf40)
-	STD_EXCEPTION_PSERIES_OOL(0xf60, facility_unavailable)
-	KVM_HANDLER(PACA_EXGEN, EXC_STD, 0xf60)
-	STD_EXCEPTION_HV_OOL(0xf82, facility_unavailable)
-	KVM_HANDLER(PACA_EXGEN, EXC_HV, 0xf82)
-
-/*
- * An interrupt came in while soft-disabled. We set paca->irq_happened, then:
- * - If it was a decrementer interrupt, we bump the dec to max and and return.
- * - If it was a doorbell we return immediately since doorbells are edge
- *   triggered and won't automatically refire.
- * - If it was a HMI we return immediately since we handled it in realmode
- *   and it won't refire.
- * - else we hard disable and return.
- * This is called with r10 containing the value to OR to the paca field.
- */
-#define MASKED_INTERRUPT(_H)				\
-masked_##_H##interrupt:					\
-	std	r11,PACA_EXGEN+EX_R11(r13);		\
-	lbz	r11,PACAIRQHAPPENED(r13);		\
-	or	r11,r11,r10;				\
-	stb	r11,PACAIRQHAPPENED(r13);		\
-	cmpwi	r10,PACA_IRQ_DEC;			\
-	bne	1f;					\
-	lis	r10,0x7fff;				\
-	ori	r10,r10,0xffff;				\
-	mtspr	SPRN_DEC,r10;				\
-	b	2f;					\
-1:	cmpwi	r10,PACA_IRQ_DBELL;			\
-	beq	2f;					\
-	cmpwi	r10,PACA_IRQ_HMI;			\
-	beq	2f;					\
-	mfspr	r10,SPRN_##_H##SRR1;			\
-	rldicl	r10,r10,48,1; /* clear MSR_EE */	\
-	rotldi	r10,r10,16;				\
-	mtspr	SPRN_##_H##SRR1,r10;			\
-2:	mtcrf	0x80,r9;				\
-	ld	r9,PACA_EXGEN+EX_R9(r13);		\
-	ld	r10,PACA_EXGEN+EX_R10(r13);		\
-	ld	r11,PACA_EXGEN+EX_R11(r13);		\
-	GET_SCRATCH0(r13);				\
-	##_H##rfid;					\
-	b	.
-	
-	MASKED_INTERRUPT()
-	MASKED_INTERRUPT(H)
-
-/*
- * Called from arch_local_irq_enable when an interrupt needs
- * to be resent. r3 contains 0x500, 0x900, 0xa00 or 0xe80 to indicate
- * which kind of interrupt. MSR:EE is already off. We generate a
- * stackframe like if a real interrupt had happened.
- *
- * Note: While MSR:EE is off, we need to make sure that _MSR
- * in the generated frame has EE set to 1 or the exception
- * handler will not properly re-enable them.
- */
-_GLOBAL(__replay_interrupt)
-	/* We are going to jump to the exception common code which
-	 * will retrieve various register values from the PACA which
-	 * we don't give a damn about, so we don't bother storing them.
+	/*
+	 * Check if we are coming from hypervisor userspace. If yes then we
+	 * continue in host kernel in V mode to deliver the MC event.
 	 */
-	mfmsr	r12
-	mflr	r11
-	mfcr	r9
-	ori	r12,r12,MSR_EE
-	cmpwi	r3,0x900
-	beq	decrementer_common
-	cmpwi	r3,0x500
-	beq	hardware_interrupt_common
-BEGIN_FTR_SECTION
-	cmpwi	r3,0xe80
-	beq	h_doorbell_common
-	cmpwi	r3,0xea0
-	beq	h_virt_irq_common
-	cmpwi	r3,0xe60
-	beq	hmi_exception_common
-FTR_SECTION_ELSE
-	cmpwi	r3,0xa00
-	beq	doorbell_super_common
-ALT_FTR_SECTION_END_IFSET(CPU_FTR_HVMODE)
-	blr
-
-#ifdef CONFIG_PPC_PSERIES
-/*
- * Vectors for the FWNMI option.  Share common code.
- */
-	.globl system_reset_fwnmi
-      .align 7
-system_reset_fwnmi:
-	SET_SCRATCH0(r13)		/* save r13 */
-	EXCEPTION_PROLOG_PSERIES(PACA_EXGEN, system_reset_common, EXC_STD,
-				 NOTEST, 0x100)
-
-#endif /* CONFIG_PPC_PSERIES */
+	rldicl.	r11,r12,4,63		/* See if MC hit while in HV mode. */
+	beq	5f
+	andi.	r11,r12,MSR_PR		/* See if coming from user. */
+	bne	9f			/* continue in V mode if we are. */
 
+5:
 #ifdef CONFIG_KVM_BOOK3S_64_HANDLER
-kvmppc_skip_interrupt:
 	/*
-	 * Here all GPRs are unchanged from when the interrupt happened
-	 * except for r13, which is saved in SPRG_SCRATCH0.
+	 * We are coming from kernel context. Check if we are coming from
+	 * guest. If yes, then we can continue. We will fall through
+	 * do_kvm_200->kvmppc_interrupt to deliver the MC event to guest.
 	 */
-	mfspr	r13, SPRN_SRR0
-	addi	r13, r13, 4
-	mtspr	SPRN_SRR0, r13
-	GET_SCRATCH0(r13)
+	lbz	r11,HSTATE_IN_GUEST(r13)
+	cmpwi	r11,0			/* Check if coming from guest */
+	bne	9f			/* continue if we are. */
+#endif
+	/*
+	 * At this point we are not sure about what context we come from.
+	 * Queue up the MCE event and return from the interrupt.
+	 * But before that, check if this is an un-recoverable exception.
+	 * If yes, then stay on emergency stack and panic.
+	 */
+	andi.	r11,r12,MSR_RI
+	bne	2f
+1:	mfspr	r11,SPRN_SRR0
+	LOAD_HANDLER(r10,unrecover_mce)
+	mtspr	SPRN_SRR0,r10
+	ld	r10,PACAKMSR(r13)
+	/*
+	 * We are going down. But there are chances that we might get hit by
+	 * another MCE during panic path and we may run into unstable state
+	 * with no way out. Hence, turn ME bit off while going down, so that
+	 * when another MCE is hit during panic path, system will checkstop
+	 * and hypervisor will get restarted cleanly by SP.
+	 */
+	li	r3,MSR_ME
+	andc	r10,r10,r3		/* Turn off MSR_ME */
+	mtspr	SPRN_SRR1,r10
 	rfid
 	b	.
-
-kvmppc_skip_Hinterrupt:
+2:
 	/*
-	 * Here all GPRs are unchanged from when the interrupt happened
-	 * except for r13, which is saved in SPRG_SCRATCH0.
+	 * Check if we have successfully handled/recovered from error, if not
+	 * then stay on emergency stack and panic.
 	 */
-	mfspr	r13, SPRN_HSRR0
-	addi	r13, r13, 4
-	mtspr	SPRN_HSRR0, r13
-	GET_SCRATCH0(r13)
-	hrfid
-	b	.
-#endif
+	ld	r3,RESULT(r1)	/* Load result */
+	cmpdi	r3,0		/* see if we handled MCE successfully */
 
-/*
- * Ensure that any handlers that get invoked from the exception prologs
- * above are below the first 64KB (0x10000) of the kernel image because
- * the prologs assemble the addresses of these handlers using the
- * LOAD_HANDLER macro, which uses an ori instruction.
- */
+	beq	1b		/* if !handled then panic */
+	/*
+	 * Return from MC interrupt.
+	 * Queue up the MCE event so that we can log it later, while
+	 * returning from kernel or opal call.
+	 */
+	bl	machine_check_queue_event
+	MACHINE_CHECK_HANDLER_WINDUP
+	rfid
+9:
+	/* Deliver the machine check to host kernel in V mode. */
+	MACHINE_CHECK_HANDLER_WINDUP
+	b	machine_check_pSeries
 
-/*** Common interrupt handlers ***/
+EXC_COMMON_BEGIN(unrecover_mce)
+	/* Invoke machine_check_exception to print MCE event and panic. */
+	addi	r3,r1,STACK_FRAME_OVERHEAD
+	bl	machine_check_exception
+	/*
+	 * We will not reach here. Even if we did, there is no way out. Call
+	 * unrecoverable_exception and die.
+	 */
+1:	addi	r3,r1,STACK_FRAME_OVERHEAD
+	bl	unrecoverable_exception
+	b	1b
 
-	STD_EXCEPTION_COMMON(0x100, system_reset, system_reset_exception)
 
-	STD_EXCEPTION_COMMON_ASYNC(0x500, hardware_interrupt, do_IRQ)
-	STD_EXCEPTION_COMMON_ASYNC(0x900, decrementer, timer_interrupt)
-	STD_EXCEPTION_COMMON(0x980, hdecrementer, hdec_interrupt)
-#ifdef CONFIG_PPC_DOORBELL
-	STD_EXCEPTION_COMMON_ASYNC(0xa00, doorbell_super, doorbell_exception)
-#else
-	STD_EXCEPTION_COMMON_ASYNC(0xa00, doorbell_super, unknown_exception)
-#endif
-	STD_EXCEPTION_COMMON(0xb00, trap_0b, unknown_exception)
-	STD_EXCEPTION_COMMON(0xd00, single_step, single_step_exception)
-	STD_EXCEPTION_COMMON(0xe00, trap_0e, unknown_exception)
-	STD_EXCEPTION_COMMON(0xe40, emulation_assist, emulation_assist_interrupt)
-	STD_EXCEPTION_COMMON_ASYNC(0xe60, hmi_exception, handle_hmi_exception)
-#ifdef CONFIG_PPC_DOORBELL
-	STD_EXCEPTION_COMMON_ASYNC(0xe80, h_doorbell, doorbell_exception)
-#else
-	STD_EXCEPTION_COMMON_ASYNC(0xe80, h_doorbell, unknown_exception)
-#endif
-	STD_EXCEPTION_COMMON_ASYNC(0xea0, h_virt_irq, do_IRQ)
-	STD_EXCEPTION_COMMON_ASYNC(0xf00, performance_monitor, performance_monitor_exception)
-	STD_EXCEPTION_COMMON(0x1300, instruction_breakpoint, instruction_breakpoint_exception)
-	STD_EXCEPTION_COMMON(0x1502, denorm, unknown_exception)
-#ifdef CONFIG_ALTIVEC
-	STD_EXCEPTION_COMMON(0x1700, altivec_assist, altivec_assist_exception)
-#else
-	STD_EXCEPTION_COMMON(0x1700, altivec_assist, unknown_exception)
-#endif
+EXC_REAL(data_access, 0x300, 0x380)
+EXC_VIRT(data_access, 0x4300, 0x4380, 0x300)
+TRAMP_KVM_SKIP(PACA_EXGEN, 0x300)
 
+EXC_COMMON_BEGIN(data_access_common)
 	/*
-	 * Relocation-on interrupts: A subset of the interrupts can be delivered
-	 * with IR=1/DR=1, if AIL==2 and MSR.HV won't be changed by delivering
-	 * it.  Addresses are the same as the original interrupt addresses, but
-	 * offset by 0xc000000000004000.
-	 * It's impossible to receive interrupts below 0x300 via this mechanism.
-	 * KVM: None of these traps are from the guest ; anything that escalated
-	 * to HV=1 from HV=0 is delivered via real mode handlers.
+	 * Here r13 points to the paca, r9 contains the saved CR,
+	 * SRR0 and SRR1 are saved in r11 and r12,
+	 * r9 - r13 are saved in paca->exgen.
 	 */
+	mfspr	r10,SPRN_DAR
+	std	r10,PACA_EXGEN+EX_DAR(r13)
+	mfspr	r10,SPRN_DSISR
+	stw	r10,PACA_EXGEN+EX_DSISR(r13)
+	EXCEPTION_PROLOG_COMMON(0x300, PACA_EXGEN)
+	RECONCILE_IRQ_STATE(r10, r11)
+	ld	r12,_MSR(r1)
+	ld	r3,PACA_EXGEN+EX_DAR(r13)
+	lwz	r4,PACA_EXGEN+EX_DSISR(r13)
+	li	r5,0x300
+	std	r3,_DAR(r1)
+	std	r4,_DSISR(r1)
+BEGIN_MMU_FTR_SECTION
+	b	do_hash_page		/* Try to handle as hpte fault */
+MMU_FTR_SECTION_ELSE
+	b	handle_page_fault
+ALT_MMU_FTR_SECTION_END_IFCLR(MMU_FTR_TYPE_RADIX)
 
-	/*
-	 * This uses the standard macro, since the original 0x300 vector
-	 * only has extra guff for STAB-based processors -- which never
-	 * come here.
-	 */
-	STD_RELON_EXCEPTION_PSERIES(0x4300, 0x300, data_access)
-	. = 0x4380
-	.globl data_access_slb_relon_pSeries
-data_access_slb_relon_pSeries:
+
+EXC_REAL_BEGIN(data_access_slb, 0x380, 0x400)
 	SET_SCRATCH0(r13)
 	EXCEPTION_PROLOG_0(PACA_EXSLB)
-	EXCEPTION_PROLOG_1(PACA_EXSLB, NOTEST, 0x380)
+	EXCEPTION_PROLOG_1(PACA_EXSLB, KVMTEST_PR, 0x380)
 	std	r3,PACA_EXSLB+EX_R3(r13)
 	mfspr	r3,SPRN_DAR
 	mfspr	r12,SPRN_SRR1
+	crset	4*cr6+eq
 #ifndef CONFIG_RELOCATABLE
 	b	slb_miss_realmode
 #else
@@ -792,220 +514,221 @@ data_access_slb_relon_pSeries:
 	 * the kernel ends up being put.
 	 */
 	mfctr	r11
-	ld	r10,PACAKBASE(r13)
 	LOAD_HANDLER(r10, slb_miss_realmode)
 	mtctr	r10
 	bctr
 #endif
+EXC_REAL_END(data_access_slb, 0x380, 0x400)
 
-	STD_RELON_EXCEPTION_PSERIES(0x4400, 0x400, instruction_access)
-	. = 0x4480
-	.globl instruction_access_slb_relon_pSeries
-instruction_access_slb_relon_pSeries:
+EXC_VIRT_BEGIN(data_access_slb, 0x4380, 0x4400)
 	SET_SCRATCH0(r13)
 	EXCEPTION_PROLOG_0(PACA_EXSLB)
-	EXCEPTION_PROLOG_1(PACA_EXSLB, NOTEST, 0x480)
+	EXCEPTION_PROLOG_1(PACA_EXSLB, NOTEST, 0x380)
 	std	r3,PACA_EXSLB+EX_R3(r13)
-	mfspr	r3,SPRN_SRR0		/* SRR0 is faulting address */
+	mfspr	r3,SPRN_DAR
 	mfspr	r12,SPRN_SRR1
+	crset	4*cr6+eq
 #ifndef CONFIG_RELOCATABLE
 	b	slb_miss_realmode
 #else
+	/*
+	 * We can't just use a direct branch to slb_miss_realmode
+	 * because the distance from here to there depends on where
+	 * the kernel ends up being put.
+	 */
 	mfctr	r11
-	ld	r10,PACAKBASE(r13)
 	LOAD_HANDLER(r10, slb_miss_realmode)
 	mtctr	r10
 	bctr
 #endif
+EXC_VIRT_END(data_access_slb, 0x4380, 0x4400)
+TRAMP_KVM_SKIP(PACA_EXSLB, 0x380)
 
-	. = 0x4500
-	.globl hardware_interrupt_relon_pSeries;
-	.globl hardware_interrupt_relon_hv;
-hardware_interrupt_relon_pSeries:
-hardware_interrupt_relon_hv:
-	BEGIN_FTR_SECTION
-		_MASKABLE_RELON_EXCEPTION_PSERIES(0x502, hardware_interrupt, EXC_HV, SOFTEN_TEST_HV)
-	FTR_SECTION_ELSE
-		_MASKABLE_RELON_EXCEPTION_PSERIES(0x500, hardware_interrupt, EXC_STD, SOFTEN_TEST_PR)
-	ALT_FTR_SECTION_END_IFSET(CPU_FTR_HVMODE)
-	STD_RELON_EXCEPTION_PSERIES(0x4600, 0x600, alignment)
-	STD_RELON_EXCEPTION_PSERIES(0x4700, 0x700, program_check)
-	STD_RELON_EXCEPTION_PSERIES(0x4800, 0x800, fp_unavailable)
-	MASKABLE_RELON_EXCEPTION_PSERIES(0x4900, 0x900, decrementer)
-	STD_RELON_EXCEPTION_HV(0x4980, 0x982, hdecrementer)
-	MASKABLE_RELON_EXCEPTION_PSERIES(0x4a00, 0xa00, doorbell_super)
-	STD_RELON_EXCEPTION_PSERIES(0x4b00, 0xb00, trap_0b)
-
-	. = 0x4c00
-	.globl system_call_relon_pSeries
-system_call_relon_pSeries:
-	HMT_MEDIUM
-	SYSCALL_PSERIES_1
-	SYSCALL_PSERIES_2_DIRECT
-	SYSCALL_PSERIES_3
 
-	STD_RELON_EXCEPTION_PSERIES(0x4d00, 0xd00, single_step)
+EXC_REAL(instruction_access, 0x400, 0x480)
+EXC_VIRT(instruction_access, 0x4400, 0x4480, 0x400)
+TRAMP_KVM(PACA_EXGEN, 0x400)
 
-	. = 0x4e00
-	b	.	/* Can't happen, see v2.07 Book III-S section 6.5 */
+EXC_COMMON_BEGIN(instruction_access_common)
+	EXCEPTION_PROLOG_COMMON(0x400, PACA_EXGEN)
+	RECONCILE_IRQ_STATE(r10, r11)
+	ld	r12,_MSR(r1)
+	ld	r3,_NIP(r1)
+	andis.	r4,r12,0x5820
+	li	r5,0x400
+	std	r3,_DAR(r1)
+	std	r4,_DSISR(r1)
+BEGIN_MMU_FTR_SECTION
+	b	do_hash_page		/* Try to handle as hpte fault */
+MMU_FTR_SECTION_ELSE
+	b	handle_page_fault
+ALT_MMU_FTR_SECTION_END_IFCLR(MMU_FTR_TYPE_RADIX)
 
-	. = 0x4e20
-	b	.	/* Can't happen, see v2.07 Book III-S section 6.5 */
 
-	. = 0x4e40
-emulation_assist_relon_trampoline:
+EXC_REAL_BEGIN(instruction_access_slb, 0x480, 0x500)
 	SET_SCRATCH0(r13)
-	EXCEPTION_PROLOG_0(PACA_EXGEN)
-	b	emulation_assist_relon_hv
-
-	. = 0x4e60
-	b	.	/* Can't happen, see v2.07 Book III-S section 6.5 */
+	EXCEPTION_PROLOG_0(PACA_EXSLB)
+	EXCEPTION_PROLOG_1(PACA_EXSLB, KVMTEST_PR, 0x480)
+	std	r3,PACA_EXSLB+EX_R3(r13)
+	mfspr	r3,SPRN_SRR0		/* SRR0 is faulting address */
+	mfspr	r12,SPRN_SRR1
+	crclr	4*cr6+eq
+#ifndef CONFIG_RELOCATABLE
+	b	slb_miss_realmode
+#else
+	mfctr	r11
+	LOAD_HANDLER(r10, slb_miss_realmode)
+	mtctr	r10
+	bctr
+#endif
+EXC_REAL_END(instruction_access_slb, 0x480, 0x500)
 
-	. = 0x4e80
-h_doorbell_relon_trampoline:
+EXC_VIRT_BEGIN(instruction_access_slb, 0x4480, 0x4500)
 	SET_SCRATCH0(r13)
-	EXCEPTION_PROLOG_0(PACA_EXGEN)
-	b	h_doorbell_relon_hv
+	EXCEPTION_PROLOG_0(PACA_EXSLB)
+	EXCEPTION_PROLOG_1(PACA_EXSLB, NOTEST, 0x480)
+	std	r3,PACA_EXSLB+EX_R3(r13)
+	mfspr	r3,SPRN_SRR0		/* SRR0 is faulting address */
+	mfspr	r12,SPRN_SRR1
+	crclr	4*cr6+eq
+#ifndef CONFIG_RELOCATABLE
+	b	slb_miss_realmode
+#else
+	mfctr	r11
+	LOAD_HANDLER(r10, slb_miss_realmode)
+	mtctr	r10
+	bctr
+#endif
+EXC_VIRT_END(instruction_access_slb, 0x4480, 0x4500)
+TRAMP_KVM(PACA_EXSLB, 0x480)
 
-	. = 0x4ea0
-h_virt_irq_relon_trampoline:
-	SET_SCRATCH0(r13)
-	EXCEPTION_PROLOG_0(PACA_EXGEN)
-	b	h_virt_irq_relon_hv
 
-	. = 0x4f00
-performance_monitor_relon_pseries_trampoline:
-	SET_SCRATCH0(r13)
-	EXCEPTION_PROLOG_0(PACA_EXGEN)
-	b	performance_monitor_relon_pSeries
+/* This handler is used by both 0x380 and 0x480 slb miss interrupts */
+EXC_COMMON_BEGIN(slb_miss_realmode)
+	/*
+	 * r13 points to the PACA, r9 contains the saved CR,
+	 * r12 contains the saved SRR1, SRR0 is still ready for return
+	 * r3 has the faulting address
+	 * r9 - r13 are saved in paca->exslb.
+	 * r3 is saved in paca->exslb (PACA_EXSLB+EX_R3)
+	 * cr6.eq is set for a D-SLB miss, clear for an I-SLB miss
+	 * We assume we aren't going to take any exceptions during this
+	 * procedure.
+	 */
+	mflr	r10
+#ifdef CONFIG_RELOCATABLE
+	mtctr	r11
+#endif
 
-	. = 0x4f20
-altivec_unavailable_relon_pseries_trampoline:
-	SET_SCRATCH0(r13)
-	EXCEPTION_PROLOG_0(PACA_EXGEN)
-	b	altivec_unavailable_relon_pSeries
+	stw	r9,PACA_EXSLB+EX_CCR(r13)	/* save CR in exc. frame */
+	std	r10,PACA_EXSLB+EX_LR(r13)	/* save LR */
+	std	r3,PACA_EXSLB+EX_DAR(r13)
 
-	. = 0x4f40
-vsx_unavailable_relon_pseries_trampoline:
-	SET_SCRATCH0(r13)
-	EXCEPTION_PROLOG_0(PACA_EXGEN)
-	b	vsx_unavailable_relon_pSeries
+	crset	4*cr0+eq
+#ifdef CONFIG_PPC_STD_MMU_64
+BEGIN_MMU_FTR_SECTION
+	bl	slb_allocate_realmode
+END_MMU_FTR_SECTION_IFCLR(MMU_FTR_TYPE_RADIX)
+#endif
 
-	. = 0x4f60
-facility_unavailable_relon_trampoline:
-	SET_SCRATCH0(r13)
-	EXCEPTION_PROLOG_0(PACA_EXGEN)
-	b	facility_unavailable_relon_pSeries
+	ld	r10,PACA_EXSLB+EX_LR(r13)
+	ld	r3,PACA_EXSLB+EX_R3(r13)
+	lwz	r9,PACA_EXSLB+EX_CCR(r13)	/* get saved CR */
+	mtlr	r10
 
-	. = 0x4f80
-hv_facility_unavailable_relon_trampoline:
-	SET_SCRATCH0(r13)
-	EXCEPTION_PROLOG_0(PACA_EXGEN)
-	b	hv_facility_unavailable_relon_hv
+	beq	8f		/* if bad address, make full stack frame */
 
-	STD_RELON_EXCEPTION_PSERIES(0x5300, 0x1300, instruction_breakpoint)
-#ifdef CONFIG_PPC_DENORMALISATION
-	. = 0x5500
-	b	denorm_exception_hv
-#endif
-	STD_RELON_EXCEPTION_PSERIES(0x5700, 0x1700, altivec_assist)
+	andi.	r10,r12,MSR_RI	/* check for unrecoverable exception */
+	beq-	2f
+
+	/* All done -- return from exception. */
+
+.machine	push
+.machine	"power4"
+	mtcrf	0x80,r9
+	mtcrf	0x02,r9		/* I/D indication is in cr6 */
+	mtcrf	0x01,r9		/* slb_allocate uses cr0 and cr7 */
+.machine	pop
+
+	RESTORE_PPR_PACA(PACA_EXSLB, r9)
+	ld	r9,PACA_EXSLB+EX_R9(r13)
+	ld	r10,PACA_EXSLB+EX_R10(r13)
+	ld	r11,PACA_EXSLB+EX_R11(r13)
+	ld	r12,PACA_EXSLB+EX_R12(r13)
+	ld	r13,PACA_EXSLB+EX_R13(r13)
+	rfid
+	b	.	/* prevent speculative execution */
 
-	.align	7
-system_call_entry:
-	b	system_call_common
+2:	mfspr	r11,SPRN_SRR0
+	LOAD_HANDLER(r10,unrecov_slb)
+	mtspr	SPRN_SRR0,r10
+	ld	r10,PACAKMSR(r13)
+	mtspr	SPRN_SRR1,r10
+	rfid
+	b	.
 
-ppc64_runlatch_on_trampoline:
-	b	__ppc64_runlatch_on
+8:	mfspr	r11,SPRN_SRR0
+	LOAD_HANDLER(r10,bad_addr_slb)
+	mtspr	SPRN_SRR0,r10
+	ld	r10,PACAKMSR(r13)
+	mtspr	SPRN_SRR1,r10
+	rfid
+	b	.
 
-/*
- * Here r13 points to the paca, r9 contains the saved CR,
- * SRR0 and SRR1 are saved in r11 and r12,
- * r9 - r13 are saved in paca->exgen.
- */
-	.align	7
-	.globl data_access_common
-data_access_common:
-	mfspr	r10,SPRN_DAR
-	std	r10,PACA_EXGEN+EX_DAR(r13)
-	mfspr	r10,SPRN_DSISR
-	stw	r10,PACA_EXGEN+EX_DSISR(r13)
-	EXCEPTION_PROLOG_COMMON(0x300, PACA_EXGEN)
+EXC_COMMON_BEGIN(unrecov_slb)
+	EXCEPTION_PROLOG_COMMON(0x4100, PACA_EXSLB)
 	RECONCILE_IRQ_STATE(r10, r11)
-	ld	r12,_MSR(r1)
-	ld	r3,PACA_EXGEN+EX_DAR(r13)
-	lwz	r4,PACA_EXGEN+EX_DSISR(r13)
-	li	r5,0x300
-	std	r3,_DAR(r1)
-	std	r4,_DSISR(r1)
-BEGIN_MMU_FTR_SECTION
-	b	do_hash_page		/* Try to handle as hpte fault */
-MMU_FTR_SECTION_ELSE
-	b	handle_page_fault
-ALT_MMU_FTR_SECTION_END_IFCLR(MMU_FTR_TYPE_RADIX)
+	bl	save_nvgprs
+1:	addi	r3,r1,STACK_FRAME_OVERHEAD
+	bl	unrecoverable_exception
+	b	1b
 
-	.align  7
-	.globl  h_data_storage_common
-h_data_storage_common:
-	mfspr   r10,SPRN_HDAR
-	std     r10,PACA_EXGEN+EX_DAR(r13)
-	mfspr   r10,SPRN_HDSISR
-	stw     r10,PACA_EXGEN+EX_DSISR(r13)
-	EXCEPTION_PROLOG_COMMON(0xe00, PACA_EXGEN)
-	bl      save_nvgprs
+EXC_COMMON_BEGIN(bad_addr_slb)
+	EXCEPTION_PROLOG_COMMON(0x380, PACA_EXSLB)
 	RECONCILE_IRQ_STATE(r10, r11)
-	addi    r3,r1,STACK_FRAME_OVERHEAD
-	bl      unknown_exception
-	b       ret_from_except
+	ld	r3, PACA_EXSLB+EX_DAR(r13)
+	std	r3, _DAR(r1)
+	beq	cr6, 2f
+	li	r10, 0x480		/* fix trap number for I-SLB miss */
+	std	r10, _TRAP(r1)
+2:	bl	save_nvgprs
+	addi	r3, r1, STACK_FRAME_OVERHEAD
+	bl	slb_miss_bad_addr
+	b	ret_from_except
 
-	.align	7
-	.globl instruction_access_common
-instruction_access_common:
-	EXCEPTION_PROLOG_COMMON(0x400, PACA_EXGEN)
-	RECONCILE_IRQ_STATE(r10, r11)
-	ld	r12,_MSR(r1)
-	ld	r3,_NIP(r1)
-	andis.	r4,r12,0x5820
-	li	r5,0x400
-	std	r3,_DAR(r1)
-	std	r4,_DSISR(r1)
-BEGIN_MMU_FTR_SECTION
-	b	do_hash_page		/* Try to handle as hpte fault */
-MMU_FTR_SECTION_ELSE
-	b	handle_page_fault
-ALT_MMU_FTR_SECTION_END_IFCLR(MMU_FTR_TYPE_RADIX)
+EXC_REAL_BEGIN(hardware_interrupt, 0x500, 0x600)
+	.globl hardware_interrupt_hv;
+hardware_interrupt_hv:
+	BEGIN_FTR_SECTION
+		_MASKABLE_EXCEPTION_PSERIES(0x500, hardware_interrupt_common,
+					    EXC_HV, SOFTEN_TEST_HV)
+do_kvm_H0x500:
+		KVM_HANDLER(PACA_EXGEN, EXC_HV, 0x502)
+	FTR_SECTION_ELSE
+		_MASKABLE_EXCEPTION_PSERIES(0x500, hardware_interrupt_common,
+					    EXC_STD, SOFTEN_TEST_PR)
+do_kvm_0x500:
+		KVM_HANDLER(PACA_EXGEN, EXC_STD, 0x500)
+	ALT_FTR_SECTION_END_IFSET(CPU_FTR_HVMODE | CPU_FTR_ARCH_206)
+EXC_REAL_END(hardware_interrupt, 0x500, 0x600)
 
-	STD_EXCEPTION_COMMON(0xe20, h_instr_storage, unknown_exception)
+EXC_VIRT_BEGIN(hardware_interrupt, 0x4500, 0x4600)
+	.globl hardware_interrupt_relon_hv;
+hardware_interrupt_relon_hv:
+	BEGIN_FTR_SECTION
+		_MASKABLE_RELON_EXCEPTION_PSERIES(0x500, hardware_interrupt_common, EXC_HV, SOFTEN_TEST_HV)
+	FTR_SECTION_ELSE
+		_MASKABLE_RELON_EXCEPTION_PSERIES(0x500, hardware_interrupt_common, EXC_STD, SOFTEN_TEST_PR)
+	ALT_FTR_SECTION_END_IFSET(CPU_FTR_HVMODE)
+EXC_VIRT_END(hardware_interrupt, 0x4500, 0x4600)
 
-	/*
-	 * Machine check is different because we use a different
-	 * save area: PACA_EXMC instead of PACA_EXGEN.
-	 */
-	.align	7
-	.globl machine_check_common
-machine_check_common:
+EXC_COMMON_ASYNC(hardware_interrupt_common, 0x500, do_IRQ)
 
-	mfspr	r10,SPRN_DAR
-	std	r10,PACA_EXMC+EX_DAR(r13)
-	mfspr	r10,SPRN_DSISR
-	stw	r10,PACA_EXMC+EX_DSISR(r13)
-	EXCEPTION_PROLOG_COMMON(0x200, PACA_EXMC)
-	FINISH_NAP
-	RECONCILE_IRQ_STATE(r10, r11)
-	ld	r3,PACA_EXMC+EX_DAR(r13)
-	lwz	r4,PACA_EXMC+EX_DSISR(r13)
-	/* Enable MSR_RI when finished with PACA_EXMC */
-	li	r10,MSR_RI
-	mtmsrd 	r10,1
-	std	r3,_DAR(r1)
-	std	r4,_DSISR(r1)
-	bl	save_nvgprs
-	addi	r3,r1,STACK_FRAME_OVERHEAD
-	bl	machine_check_exception
-	b	ret_from_except
 
-	.align	7
-	.globl alignment_common
-alignment_common:
+EXC_REAL(alignment, 0x600, 0x700)
+EXC_VIRT(alignment, 0x4600, 0x4700, 0x600)
+TRAMP_KVM(PACA_EXGEN, 0x600)
+EXC_COMMON_BEGIN(alignment_common)
 	mfspr	r10,SPRN_DAR
 	std	r10,PACA_EXGEN+EX_DAR(r13)
 	mfspr	r10,SPRN_DSISR
@@ -1021,9 +744,11 @@ alignment_common:
 	bl	alignment_exception
 	b	ret_from_except
 
-	.align	7
-	.globl program_check_common
-program_check_common:
+
+EXC_REAL(program_check, 0x700, 0x800)
+EXC_VIRT(program_check, 0x4700, 0x4800, 0x700)
+TRAMP_KVM(PACA_EXGEN, 0x700)
+EXC_COMMON_BEGIN(program_check_common)
 	EXCEPTION_PROLOG_COMMON(0x700, PACA_EXGEN)
 	bl	save_nvgprs
 	RECONCILE_IRQ_STATE(r10, r11)
@@ -1031,9 +756,11 @@ program_check_common:
 	bl	program_check_exception
 	b	ret_from_except
 
-	.align	7
-	.globl fp_unavailable_common
-fp_unavailable_common:
+
+EXC_REAL(fp_unavailable, 0x800, 0x900)
+EXC_VIRT(fp_unavailable, 0x4800, 0x4900, 0x800)
+TRAMP_KVM(PACA_EXGEN, 0x800)
+EXC_COMMON_BEGIN(fp_unavailable_common)
 	EXCEPTION_PROLOG_COMMON(0x800, PACA_EXGEN)
 	bne	1f			/* if from user, just load it up */
 	bl	save_nvgprs
@@ -1061,9 +788,250 @@ END_FTR_SECTION_IFSET(CPU_FTR_TM)
 	bl	fp_unavailable_tm
 	b	ret_from_except
 #endif
-	.align	7
-	.globl altivec_unavailable_common
-altivec_unavailable_common:
+
+
+EXC_REAL_MASKABLE(decrementer, 0x900, 0x980)
+EXC_VIRT_MASKABLE(decrementer, 0x4900, 0x4980, 0x900)
+TRAMP_KVM(PACA_EXGEN, 0x900)
+EXC_COMMON_ASYNC(decrementer_common, 0x900, timer_interrupt)
+
+
+EXC_REAL_HV(hdecrementer, 0x980, 0xa00)
+EXC_VIRT_HV(hdecrementer, 0x4980, 0x4a00, 0x980)
+TRAMP_KVM_HV(PACA_EXGEN, 0x980)
+EXC_COMMON(hdecrementer_common, 0x980, hdec_interrupt)
+
+
+EXC_REAL_MASKABLE(doorbell_super, 0xa00, 0xb00)
+EXC_VIRT_MASKABLE(doorbell_super, 0x4a00, 0x4b00, 0xa00)
+TRAMP_KVM(PACA_EXGEN, 0xa00)
+#ifdef CONFIG_PPC_DOORBELL
+EXC_COMMON_ASYNC(doorbell_super_common, 0xa00, doorbell_exception)
+#else
+EXC_COMMON_ASYNC(doorbell_super_common, 0xa00, unknown_exception)
+#endif
+
+
+EXC_REAL(trap_0b, 0xb00, 0xc00)
+EXC_VIRT(trap_0b, 0x4b00, 0x4c00, 0xb00)
+TRAMP_KVM(PACA_EXGEN, 0xb00)
+EXC_COMMON(trap_0b_common, 0xb00, unknown_exception)
+
+
+#define LOAD_SYSCALL_HANDLER(reg)				\
+	ld	reg,PACAKBASE(r13);				\
+	ori	reg,reg,(ABS_ADDR(system_call_common))@l;
+
+/* Syscall routine is used twice, in reloc-off and reloc-on paths */
+#define SYSCALL_PSERIES_1 					\
+BEGIN_FTR_SECTION						\
+	cmpdi	r0,0x1ebe ; 					\
+	beq-	1f ;						\
+END_FTR_SECTION_IFSET(CPU_FTR_REAL_LE)				\
+	mr	r9,r13 ;					\
+	GET_PACA(r13) ;						\
+	mfspr	r11,SPRN_SRR0 ;					\
+0:
+
+#define SYSCALL_PSERIES_2_RFID 					\
+	mfspr	r12,SPRN_SRR1 ;					\
+	LOAD_SYSCALL_HANDLER(r10) ; 				\
+	mtspr	SPRN_SRR0,r10 ; 				\
+	ld	r10,PACAKMSR(r13) ;				\
+	mtspr	SPRN_SRR1,r10 ; 				\
+	rfid ; 							\
+	b	. ;	/* prevent speculative execution */
+
+#define SYSCALL_PSERIES_3					\
+	/* Fast LE/BE switch system call */			\
+1:	mfspr	r12,SPRN_SRR1 ;					\
+	xori	r12,r12,MSR_LE ;				\
+	mtspr	SPRN_SRR1,r12 ;					\
+	rfid ;		/* return to userspace */		\
+	b	. ;	/* prevent speculative execution */
+
+#if defined(CONFIG_RELOCATABLE)
+	/*
+	 * We can't branch directly so we do it via the CTR which
+	 * is volatile across system calls.
+	 */
+#define SYSCALL_PSERIES_2_DIRECT				\
+	LOAD_SYSCALL_HANDLER(r12) ;				\
+	mtctr	r12 ;						\
+	mfspr	r12,SPRN_SRR1 ;					\
+	li	r10,MSR_RI ;					\
+	mtmsrd 	r10,1 ;						\
+	bctr ;
+#else
+	/* We can branch directly */
+#define SYSCALL_PSERIES_2_DIRECT				\
+	mfspr	r12,SPRN_SRR1 ;					\
+	li	r10,MSR_RI ;					\
+	mtmsrd 	r10,1 ;			/* Set RI (EE=0) */	\
+	b	system_call_common ;
+#endif
+
+EXC_REAL_BEGIN(system_call, 0xc00, 0xd00)
+	 /*
+	  * If CONFIG_KVM_BOOK3S_64_HANDLER is set, save the PPR (on systems
+	  * that support it) before changing to HMT_MEDIUM. That allows the KVM
+	  * code to save that value into the guest state (it is the guest's PPR
+	  * value). Otherwise just change to HMT_MEDIUM as userspace has
+	  * already saved the PPR.
+	  */
+#ifdef CONFIG_KVM_BOOK3S_64_HANDLER
+	SET_SCRATCH0(r13)
+	GET_PACA(r13)
+	std	r9,PACA_EXGEN+EX_R9(r13)
+	OPT_GET_SPR(r9, SPRN_PPR, CPU_FTR_HAS_PPR);
+	HMT_MEDIUM;
+	std	r10,PACA_EXGEN+EX_R10(r13)
+	OPT_SAVE_REG_TO_PACA(PACA_EXGEN+EX_PPR, r9, CPU_FTR_HAS_PPR);
+	mfcr	r9
+	KVMTEST_PR(0xc00)
+	GET_SCRATCH0(r13)
+#else
+	HMT_MEDIUM;
+#endif
+	SYSCALL_PSERIES_1
+	SYSCALL_PSERIES_2_RFID
+	SYSCALL_PSERIES_3
+EXC_REAL_END(system_call, 0xc00, 0xd00)
+
+EXC_VIRT_BEGIN(system_call, 0x4c00, 0x4d00)
+	HMT_MEDIUM
+	SYSCALL_PSERIES_1
+	SYSCALL_PSERIES_2_DIRECT
+	SYSCALL_PSERIES_3
+EXC_VIRT_END(system_call, 0x4c00, 0x4d00)
+
+TRAMP_KVM(PACA_EXGEN, 0xc00)
+
+
+EXC_REAL(single_step, 0xd00, 0xe00)
+EXC_VIRT(single_step, 0x4d00, 0x4e00, 0xd00)
+TRAMP_KVM(PACA_EXGEN, 0xd00)
+EXC_COMMON(single_step_common, 0xd00, single_step_exception)
+
+EXC_REAL_OOL_HV(h_data_storage, 0xe00, 0xe20)
+EXC_VIRT_NONE(0x4e00, 0x4e20)
+TRAMP_KVM_HV_SKIP(PACA_EXGEN, 0xe00)
+EXC_COMMON_BEGIN(h_data_storage_common)
+	mfspr   r10,SPRN_HDAR
+	std     r10,PACA_EXGEN+EX_DAR(r13)
+	mfspr   r10,SPRN_HDSISR
+	stw     r10,PACA_EXGEN+EX_DSISR(r13)
+	EXCEPTION_PROLOG_COMMON(0xe00, PACA_EXGEN)
+	bl      save_nvgprs
+	RECONCILE_IRQ_STATE(r10, r11)
+	addi    r3,r1,STACK_FRAME_OVERHEAD
+	bl      unknown_exception
+	b       ret_from_except
+
+
+EXC_REAL_OOL_HV(h_instr_storage, 0xe20, 0xe40)
+EXC_VIRT_NONE(0x4e20, 0x4e40)
+TRAMP_KVM_HV(PACA_EXGEN, 0xe20)
+EXC_COMMON(h_instr_storage_common, 0xe20, unknown_exception)
+
+
+EXC_REAL_OOL_HV(emulation_assist, 0xe40, 0xe60)
+EXC_VIRT_OOL_HV(emulation_assist, 0x4e40, 0x4e60, 0xe40)
+TRAMP_KVM_HV(PACA_EXGEN, 0xe40)
+EXC_COMMON(emulation_assist_common, 0xe40, emulation_assist_interrupt)
+
+
+/*
+ * hmi_exception trampoline is a special case. It jumps to hmi_exception_early
+ * first, and then eventually from there to the trampoline to get into virtual
+ * mode.
+ */
+__EXC_REAL_OOL_HV_DIRECT(hmi_exception, 0xe60, 0xe80, hmi_exception_early)
+__TRAMP_REAL_REAL_OOL_MASKABLE_HV(hmi_exception, 0xe60)
+EXC_VIRT_NONE(0x4e60, 0x4e80)
+TRAMP_KVM_HV(PACA_EXGEN, 0xe60)
+TRAMP_REAL_BEGIN(hmi_exception_early)
+	EXCEPTION_PROLOG_1(PACA_EXGEN, KVMTEST_HV, 0xe60)
+	mr	r10,r1			/* Save r1			*/
+	ld	r1,PACAEMERGSP(r13)	/* Use emergency stack		*/
+	subi	r1,r1,INT_FRAME_SIZE	/* alloc stack frame		*/
+	std	r9,_CCR(r1)		/* save CR in stackframe	*/
+	mfspr	r11,SPRN_HSRR0		/* Save HSRR0 */
+	std	r11,_NIP(r1)		/* save HSRR0 in stackframe	*/
+	mfspr	r12,SPRN_HSRR1		/* Save HSRR1 */
+	std	r12,_MSR(r1)		/* save HSRR1 in stackframe	*/
+	std	r10,0(r1)		/* make stack chain pointer	*/
+	std	r0,GPR0(r1)		/* save r0 in stackframe	*/
+	std	r10,GPR1(r1)		/* save r1 in stackframe	*/
+	EXCEPTION_PROLOG_COMMON_2(PACA_EXGEN)
+	EXCEPTION_PROLOG_COMMON_3(0xe60)
+	addi	r3,r1,STACK_FRAME_OVERHEAD
+	bl	hmi_exception_realmode
+	/* Windup the stack. */
+	/* Move original HSRR0 and HSRR1 into the respective regs */
+	ld	r9,_MSR(r1)
+	mtspr	SPRN_HSRR1,r9
+	ld	r3,_NIP(r1)
+	mtspr	SPRN_HSRR0,r3
+	ld	r9,_CTR(r1)
+	mtctr	r9
+	ld	r9,_XER(r1)
+	mtxer	r9
+	ld	r9,_LINK(r1)
+	mtlr	r9
+	REST_GPR(0, r1)
+	REST_8GPRS(2, r1)
+	REST_GPR(10, r1)
+	ld	r11,_CCR(r1)
+	mtcr	r11
+	REST_GPR(11, r1)
+	REST_2GPRS(12, r1)
+	/* restore original r1. */
+	ld	r1,GPR1(r1)
+
+	/*
+	 * Go to virtual mode and pull the HMI event information from
+	 * firmware.
+	 */
+	.globl hmi_exception_after_realmode
+hmi_exception_after_realmode:
+	SET_SCRATCH0(r13)
+	EXCEPTION_PROLOG_0(PACA_EXGEN)
+	b	tramp_real_hmi_exception
+
+EXC_COMMON_ASYNC(hmi_exception_common, 0xe60, handle_hmi_exception)
+
+
+EXC_REAL_OOL_MASKABLE_HV(h_doorbell, 0xe80, 0xea0)
+EXC_VIRT_OOL_MASKABLE_HV(h_doorbell, 0x4e80, 0x4ea0, 0xe80)
+TRAMP_KVM_HV(PACA_EXGEN, 0xe80)
+#ifdef CONFIG_PPC_DOORBELL
+EXC_COMMON_ASYNC(h_doorbell_common, 0xe80, doorbell_exception)
+#else
+EXC_COMMON_ASYNC(h_doorbell_common, 0xe80, unknown_exception)
+#endif
+
+
+EXC_REAL_OOL_MASKABLE_HV(h_virt_irq, 0xea0, 0xec0)
+EXC_VIRT_OOL_MASKABLE_HV(h_virt_irq, 0x4ea0, 0x4ec0, 0xea0)
+TRAMP_KVM_HV(PACA_EXGEN, 0xea0)
+EXC_COMMON_ASYNC(h_virt_irq_common, 0xea0, do_IRQ)
+
+
+EXC_REAL_NONE(0xec0, 0xf00)
+EXC_VIRT_NONE(0x4ec0, 0x4f00)
+
+
+EXC_REAL_OOL(performance_monitor, 0xf00, 0xf20)
+EXC_VIRT_OOL(performance_monitor, 0x4f00, 0x4f20, 0xf00)
+TRAMP_KVM(PACA_EXGEN, 0xf00)
+EXC_COMMON_ASYNC(performance_monitor_common, 0xf00, performance_monitor_exception)
+
+
+EXC_REAL_OOL(altivec_unavailable, 0xf20, 0xf40)
+EXC_VIRT_OOL(altivec_unavailable, 0x4f20, 0x4f40, 0xf20)
+TRAMP_KVM(PACA_EXGEN, 0xf20)
+EXC_COMMON_BEGIN(altivec_unavailable_common)
 	EXCEPTION_PROLOG_COMMON(0xf20, PACA_EXGEN)
 #ifdef CONFIG_ALTIVEC
 BEGIN_FTR_SECTION
@@ -1096,9 +1064,11 @@ END_FTR_SECTION_IFSET(CPU_FTR_ALTIVEC)
 	bl	altivec_unavailable_exception
 	b	ret_from_except
 
-	.align	7
-	.globl vsx_unavailable_common
-vsx_unavailable_common:
+
+EXC_REAL_OOL(vsx_unavailable, 0xf40, 0xf60)
+EXC_VIRT_OOL(vsx_unavailable, 0x4f40, 0x4f60, 0xf40)
+TRAMP_KVM(PACA_EXGEN, 0xf40)
+EXC_COMMON_BEGIN(vsx_unavailable_common)
 	EXCEPTION_PROLOG_COMMON(0xf40, PACA_EXGEN)
 #ifdef CONFIG_VSX
 BEGIN_FTR_SECTION
@@ -1130,349 +1100,284 @@ END_FTR_SECTION_IFSET(CPU_FTR_VSX)
 	bl	vsx_unavailable_exception
 	b	ret_from_except
 
-	/* Equivalents to the above handlers for relocation-on interrupt vectors */
-	STD_RELON_EXCEPTION_HV_OOL(0xe40, emulation_assist)
-	MASKABLE_RELON_EXCEPTION_HV_OOL(0xe80, h_doorbell)
-	MASKABLE_RELON_EXCEPTION_HV_OOL(0xea0, h_virt_irq)
 
-	STD_RELON_EXCEPTION_PSERIES_OOL(0xf00, performance_monitor)
-	STD_RELON_EXCEPTION_PSERIES_OOL(0xf20, altivec_unavailable)
-	STD_RELON_EXCEPTION_PSERIES_OOL(0xf40, vsx_unavailable)
-	STD_RELON_EXCEPTION_PSERIES_OOL(0xf60, facility_unavailable)
-	STD_RELON_EXCEPTION_HV_OOL(0xf80, hv_facility_unavailable)
+EXC_REAL_OOL(facility_unavailable, 0xf60, 0xf80)
+EXC_VIRT_OOL(facility_unavailable, 0x4f60, 0x4f80, 0xf60)
+TRAMP_KVM(PACA_EXGEN, 0xf60)
+EXC_COMMON(facility_unavailable_common, 0xf60, facility_unavailable_exception)
 
-	/*
-	 * The __end_interrupts marker must be past the out-of-line (OOL)
-	 * handlers, so that they are copied to real address 0x100 when running
-	 * a relocatable kernel. This ensures they can be reached from the short
-	 * trampoline handlers (like 0x4f00, 0x4f20, etc.) which branch
-	 * directly, without using LOAD_HANDLER().
-	 */
-	.align	7
-	.globl	__end_interrupts
-__end_interrupts:
 
-#if defined(CONFIG_PPC_PSERIES) || defined(CONFIG_PPC_POWERNV)
-/*
- * Data area reserved for FWNMI option.
- * This address (0x7000) is fixed by the RPA.
- */
-	.= 0x7000
-	.globl fwnmi_data_area
-fwnmi_data_area:
+EXC_REAL_OOL_HV(h_facility_unavailable, 0xf80, 0xfa0)
+EXC_VIRT_OOL_HV(h_facility_unavailable, 0x4f80, 0x4fa0, 0xf80)
+TRAMP_KVM_HV(PACA_EXGEN, 0xf80)
+EXC_COMMON(h_facility_unavailable_common, 0xf80, facility_unavailable_exception)
 
-	/* pseries and powernv need to keep the whole page from
-	 * 0x7000 to 0x8000 free for use by the firmware
-	 */
-	. = 0x8000
-#endif /* defined(CONFIG_PPC_PSERIES) || defined(CONFIG_PPC_POWERNV) */
 
-	STD_EXCEPTION_COMMON(0xf60, facility_unavailable, facility_unavailable_exception)
-	STD_EXCEPTION_COMMON(0xf80, hv_facility_unavailable, facility_unavailable_exception)
+EXC_REAL_NONE(0xfa0, 0x1200)
+EXC_VIRT_NONE(0x4fa0, 0x5200)
 
 #ifdef CONFIG_CBE_RAS
-	STD_EXCEPTION_COMMON(0x1200, cbe_system_error, cbe_system_error_exception)
-	STD_EXCEPTION_COMMON(0x1600, cbe_maintenance, cbe_maintenance_exception)
-	STD_EXCEPTION_COMMON(0x1800, cbe_thermal, cbe_thermal_exception)
-#endif /* CONFIG_CBE_RAS */
-
-	.globl hmi_exception_early
-hmi_exception_early:
-	EXCEPTION_PROLOG_1(PACA_EXGEN, KVMTEST, 0xe62)
-	mr	r10,r1			/* Save r1			*/
-	ld	r1,PACAEMERGSP(r13)	/* Use emergency stack		*/
-	subi	r1,r1,INT_FRAME_SIZE	/* alloc stack frame		*/
-	std	r9,_CCR(r1)		/* save CR in stackframe	*/
-	mfspr	r11,SPRN_HSRR0		/* Save HSRR0 */
-	std	r11,_NIP(r1)		/* save HSRR0 in stackframe	*/
-	mfspr	r12,SPRN_HSRR1		/* Save SRR1 */
-	std	r12,_MSR(r1)		/* save SRR1 in stackframe	*/
-	std	r10,0(r1)		/* make stack chain pointer	*/
-	std	r0,GPR0(r1)		/* save r0 in stackframe	*/
-	std	r10,GPR1(r1)		/* save r1 in stackframe	*/
-	EXCEPTION_PROLOG_COMMON_2(PACA_EXGEN)
-	EXCEPTION_PROLOG_COMMON_3(0xe60)
-	addi	r3,r1,STACK_FRAME_OVERHEAD
-	bl	hmi_exception_realmode
-	/* Windup the stack. */
-	/* Move original HSRR0 and HSRR1 into the respective regs */
-	ld	r9,_MSR(r1)
-	mtspr	SPRN_HSRR1,r9
-	ld	r3,_NIP(r1)
-	mtspr	SPRN_HSRR0,r3
-	ld	r9,_CTR(r1)
-	mtctr	r9
-	ld	r9,_XER(r1)
-	mtxer	r9
-	ld	r9,_LINK(r1)
-	mtlr	r9
-	REST_GPR(0, r1)
-	REST_8GPRS(2, r1)
-	REST_GPR(10, r1)
-	ld	r11,_CCR(r1)
-	mtcr	r11
-	REST_GPR(11, r1)
-	REST_2GPRS(12, r1)
-	/* restore original r1. */
-	ld	r1,GPR1(r1)
+EXC_REAL_HV(cbe_system_error, 0x1200, 0x1300)
+EXC_VIRT_NONE(0x5200, 0x5300)
+TRAMP_KVM_HV_SKIP(PACA_EXGEN, 0x1200)
+EXC_COMMON(cbe_system_error_common, 0x1200, cbe_system_error_exception)
+#else /* CONFIG_CBE_RAS */
+EXC_REAL_NONE(0x1200, 0x1300)
+EXC_VIRT_NONE(0x5200, 0x5300)
+#endif
 
-	/*
-	 * Go to virtual mode and pull the HMI event information from
-	 * firmware.
-	 */
-	.globl hmi_exception_after_realmode
-hmi_exception_after_realmode:
-	SET_SCRATCH0(r13)
+
+EXC_REAL(instruction_breakpoint, 0x1300, 0x1400)
+EXC_VIRT(instruction_breakpoint, 0x5300, 0x5400, 0x1300)
+TRAMP_KVM_SKIP(PACA_EXGEN, 0x1300)
+EXC_COMMON(instruction_breakpoint_common, 0x1300, instruction_breakpoint_exception)
+
+EXC_REAL_NONE(0x1400, 0x1500)
+EXC_VIRT_NONE(0x5400, 0x5500)
+
+EXC_REAL_BEGIN(denorm_exception_hv, 0x1500, 0x1600)
+	mtspr	SPRN_SPRG_HSCRATCH0,r13
 	EXCEPTION_PROLOG_0(PACA_EXGEN)
-	b	hmi_exception_hv
+	EXCEPTION_PROLOG_1(PACA_EXGEN, NOTEST, 0x1500)
 
+#ifdef CONFIG_PPC_DENORMALISATION
+	mfspr	r10,SPRN_HSRR1
+	mfspr	r11,SPRN_HSRR0		/* save HSRR0 */
+	andis.	r10,r10,(HSRR1_DENORM)@h /* denorm? */
+	addi	r11,r11,-4		/* HSRR0 is next instruction */
+	bne+	denorm_assist
+#endif
 
-#define MACHINE_CHECK_HANDLER_WINDUP			\
-	/* Clear MSR_RI before setting SRR0 and SRR1. */\
-	li	r0,MSR_RI;				\
-	mfmsr	r9;		/* get MSR value */	\
-	andc	r9,r9,r0;				\
-	mtmsrd	r9,1;		/* Clear MSR_RI */	\
-	/* Move original SRR0 and SRR1 into the respective regs */	\
-	ld	r9,_MSR(r1);				\
-	mtspr	SPRN_SRR1,r9;				\
-	ld	r3,_NIP(r1);				\
-	mtspr	SPRN_SRR0,r3;				\
-	ld	r9,_CTR(r1);				\
-	mtctr	r9;					\
-	ld	r9,_XER(r1);				\
-	mtxer	r9;					\
-	ld	r9,_LINK(r1);				\
-	mtlr	r9;					\
-	REST_GPR(0, r1);				\
-	REST_8GPRS(2, r1);				\
-	REST_GPR(10, r1);				\
-	ld	r11,_CCR(r1);				\
-	mtcr	r11;					\
-	/* Decrement paca->in_mce. */			\
-	lhz	r12,PACA_IN_MCE(r13);			\
-	subi	r12,r12,1;				\
-	sth	r12,PACA_IN_MCE(r13);			\
-	REST_GPR(11, r1);				\
-	REST_2GPRS(12, r1);				\
-	/* restore original r1. */			\
-	ld	r1,GPR1(r1)
+	KVMTEST_PR(0x1500)
+	EXCEPTION_PROLOG_PSERIES_1(denorm_common, EXC_HV)
+EXC_REAL_END(denorm_exception_hv, 0x1500, 0x1600)
 
-	/*
-	 * Handle machine check early in real mode. We come here with
-	 * ME=1, MMU (IR=0 and DR=0) off and using MC emergency stack.
-	 */
-	.align	7
-	.globl machine_check_handle_early
-machine_check_handle_early:
-	std	r0,GPR0(r1)	/* Save r0 */
-	EXCEPTION_PROLOG_COMMON_3(0x200)
-	bl	save_nvgprs
-	addi	r3,r1,STACK_FRAME_OVERHEAD
-	bl	machine_check_early
-	std	r3,RESULT(r1)	/* Save result */
-	ld	r12,_MSR(r1)
-#ifdef	CONFIG_PPC_P7_NAP
-	/*
-	 * Check if thread was in power saving mode. We come here when any
-	 * of the following is true:
-	 * a. thread wasn't in power saving mode
-	 * b. thread was in power saving mode with no state loss,
-	 *    supervisor state loss or hypervisor state loss.
-	 *
-	 * Go back to nap/sleep/winkle mode again if (b) is true.
-	 */
-	rlwinm.	r11,r12,47-31,30,31	/* Was it in power saving mode? */
-	beq	4f			/* No, it wasn;t */
-	/* Thread was in power saving mode. Go back to nap again. */
-	cmpwi	r11,2
-	blt	3f
-	/* Supervisor/Hypervisor state loss */
-	li	r0,1
-	stb	r0,PACA_NAPSTATELOST(r13)
-3:	bl	machine_check_queue_event
-	MACHINE_CHECK_HANDLER_WINDUP
-	GET_PACA(r13)
-	ld	r1,PACAR1(r13)
-	/*
-	 * Check what idle state this CPU was in and go back to same mode
-	 * again.
-	 */
-	lbz	r3,PACA_THREAD_IDLE_STATE(r13)
-	cmpwi	r3,PNV_THREAD_NAP
-	bgt	10f
-	IDLE_STATE_ENTER_SEQ(PPC_NAP)
-	/* No return */
-10:
-	cmpwi	r3,PNV_THREAD_SLEEP
-	bgt	2f
-	IDLE_STATE_ENTER_SEQ(PPC_SLEEP)
-	/* No return */
+#ifdef CONFIG_PPC_DENORMALISATION
+EXC_VIRT_BEGIN(denorm_exception, 0x5500, 0x5600)
+	b	exc_real_0x1500_denorm_exception_hv
+EXC_VIRT_END(denorm_exception, 0x5500, 0x5600)
+#else
+EXC_VIRT_NONE(0x5500, 0x5600)
+#endif
 
-2:
-	/*
-	 * Go back to winkle. Please note that this thread was woken up in
-	 * machine check from winkle and have not restored the per-subcore
-	 * state. Hence before going back to winkle, set last bit of HSPGR0
-	 * to 1. This will make sure that if this thread gets woken up
-	 * again at reset vector 0x100 then it will get chance to restore
-	 * the subcore state.
-	 */
-	ori	r13,r13,1
-	SET_PACA(r13)
-	IDLE_STATE_ENTER_SEQ(PPC_WINKLE)
-	/* No return */
-4:
+TRAMP_KVM_SKIP(PACA_EXGEN, 0x1500)
+
+#ifdef CONFIG_PPC_DENORMALISATION
+TRAMP_REAL_BEGIN(denorm_assist)
+BEGIN_FTR_SECTION
+/*
+ * To denormalise we need to move a copy of the register to itself.
+ * For POWER6 do that here for all FP regs.
+ */
+	mfmsr	r10
+	ori	r10,r10,(MSR_FP|MSR_FE0|MSR_FE1)
+	xori	r10,r10,(MSR_FE0|MSR_FE1)
+	mtmsrd	r10
+	sync
+
+#define FMR2(n)  fmr (n), (n) ; fmr n+1, n+1
+#define FMR4(n)  FMR2(n) ; FMR2(n+2)
+#define FMR8(n)  FMR4(n) ; FMR4(n+4)
+#define FMR16(n) FMR8(n) ; FMR8(n+8)
+#define FMR32(n) FMR16(n) ; FMR16(n+16)
+	FMR32(0)
+
+FTR_SECTION_ELSE
+/*
+ * To denormalise we need to move a copy of the register to itself.
+ * For POWER7 do that here for the first 32 VSX registers only.
+ */
+	mfmsr	r10
+	oris	r10,r10,MSR_VSX@h
+	mtmsrd	r10
+	sync
+
+#define XVCPSGNDP2(n) XVCPSGNDP(n,n,n) ; XVCPSGNDP(n+1,n+1,n+1)
+#define XVCPSGNDP4(n) XVCPSGNDP2(n) ; XVCPSGNDP2(n+2)
+#define XVCPSGNDP8(n) XVCPSGNDP4(n) ; XVCPSGNDP4(n+4)
+#define XVCPSGNDP16(n) XVCPSGNDP8(n) ; XVCPSGNDP8(n+8)
+#define XVCPSGNDP32(n) XVCPSGNDP16(n) ; XVCPSGNDP16(n+16)
+	XVCPSGNDP32(0)
+
+ALT_FTR_SECTION_END_IFCLR(CPU_FTR_ARCH_206)
+
+BEGIN_FTR_SECTION
+	b	denorm_done
+END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_207S)
+/*
+ * To denormalise we need to move a copy of the register to itself.
+ * For POWER8 we need to do that for all 64 VSX registers
+ */
+	XVCPSGNDP32(32)
+denorm_done:
+	mtspr	SPRN_HSRR0,r11
+	mtcrf	0x80,r9
+	ld	r9,PACA_EXGEN+EX_R9(r13)
+	RESTORE_PPR_PACA(PACA_EXGEN, r10)
+BEGIN_FTR_SECTION
+	ld	r10,PACA_EXGEN+EX_CFAR(r13)
+	mtspr	SPRN_CFAR,r10
+END_FTR_SECTION_IFSET(CPU_FTR_CFAR)
+	ld	r10,PACA_EXGEN+EX_R10(r13)
+	ld	r11,PACA_EXGEN+EX_R11(r13)
+	ld	r12,PACA_EXGEN+EX_R12(r13)
+	ld	r13,PACA_EXGEN+EX_R13(r13)
+	HRFID
+	b	.
+#endif
+
+EXC_COMMON_HV(denorm_common, 0x1500, unknown_exception)
+
+
+#ifdef CONFIG_CBE_RAS
+EXC_REAL_HV(cbe_maintenance, 0x1600, 0x1700)
+EXC_VIRT_NONE(0x5600, 0x5700)
+TRAMP_KVM_HV_SKIP(PACA_EXGEN, 0x1600)
+EXC_COMMON(cbe_maintenance_common, 0x1600, cbe_maintenance_exception)
+#else /* CONFIG_CBE_RAS */
+EXC_REAL_NONE(0x1600, 0x1700)
+EXC_VIRT_NONE(0x5600, 0x5700)
 #endif
-	/*
-	 * Check if we are coming from hypervisor userspace. If yes then we
-	 * continue in host kernel in V mode to deliver the MC event.
-	 */
-	rldicl.	r11,r12,4,63		/* See if MC hit while in HV mode. */
-	beq	5f
-	andi.	r11,r12,MSR_PR		/* See if coming from user. */
-	bne	9f			/* continue in V mode if we are. */
 
-5:
-#ifdef CONFIG_KVM_BOOK3S_64_HANDLER
-	/*
-	 * We are coming from kernel context. Check if we are coming from
-	 * guest. if yes, then we can continue. We will fall through
-	 * do_kvm_200->kvmppc_interrupt to deliver the MC event to guest.
-	 */
-	lbz	r11,HSTATE_IN_GUEST(r13)
-	cmpwi	r11,0			/* Check if coming from guest */
-	bne	9f			/* continue if we are. */
+
+EXC_REAL(altivec_assist, 0x1700, 0x1800)
+EXC_VIRT(altivec_assist, 0x5700, 0x5800, 0x1700)
+TRAMP_KVM(PACA_EXGEN, 0x1700)
+#ifdef CONFIG_ALTIVEC
+EXC_COMMON(altivec_assist_common, 0x1700, altivec_assist_exception)
+#else
+EXC_COMMON(altivec_assist_common, 0x1700, unknown_exception)
 #endif
-	/*
-	 * At this point we are not sure about what context we come from.
-	 * Queue up the MCE event and return from the interrupt.
-	 * But before that, check if this is an un-recoverable exception.
-	 * If yes, then stay on emergency stack and panic.
-	 */
-	andi.	r11,r12,MSR_RI
-	bne	2f
-1:	mfspr	r11,SPRN_SRR0
-	ld	r10,PACAKBASE(r13)
-	LOAD_HANDLER(r10,unrecover_mce)
-	mtspr	SPRN_SRR0,r10
-	ld	r10,PACAKMSR(r13)
-	/*
-	 * We are going down. But there are chances that we might get hit by
-	 * another MCE during panic path and we may run into unstable state
-	 * with no way out. Hence, turn ME bit off while going down, so that
-	 * when another MCE is hit during panic path, system will checkstop
-	 * and hypervisor will get restarted cleanly by SP.
-	 */
-	li	r3,MSR_ME
-	andc	r10,r10,r3		/* Turn off MSR_ME */
-	mtspr	SPRN_SRR1,r10
-	rfid
+
+
+#ifdef CONFIG_CBE_RAS
+EXC_REAL_HV(cbe_thermal, 0x1800, 0x1900)
+EXC_VIRT_NONE(0x5800, 0x5900)
+TRAMP_KVM_HV_SKIP(PACA_EXGEN, 0x1800)
+EXC_COMMON(cbe_thermal_common, 0x1800, cbe_thermal_exception)
+#else /* CONFIG_CBE_RAS */
+EXC_REAL_NONE(0x1800, 0x1900)
+EXC_VIRT_NONE(0x5800, 0x5900)
+#endif
+
+
+/*
+ * An interrupt came in while soft-disabled. We set paca->irq_happened, then:
+ * - If it was a decrementer interrupt, we bump the dec to max and return.
+ * - If it was a doorbell we return immediately since doorbells are edge
+ *   triggered and won't automatically refire.
+ * - If it was a HMI we return immediately since we handled it in realmode
+ *   and it won't refire.
+ * - else we hard disable and return.
+ * This is called with r10 containing the value to OR to the paca field.
+ */
+#define MASKED_INTERRUPT(_H)				\
+masked_##_H##interrupt:					\
+	std	r11,PACA_EXGEN+EX_R11(r13); /* free r11 as scratch */	\
+	lbz	r11,PACAIRQHAPPENED(r13);		\
+	or	r11,r11,r10; /* merge the caller-supplied bit in r10 */	\
+	stb	r11,PACAIRQHAPPENED(r13);		\
+	cmpwi	r10,PACA_IRQ_DEC;			\
+	bne	1f; /* not a decrementer */		\
+	lis	r10,0x7fff; /* r10 = 0x7fffffff with the ori below */	\
+	ori	r10,r10,0xffff;				\
+	mtspr	SPRN_DEC,r10; /* bump the dec to max */	\
+	b	2f;					\
+1:	cmpwi	r10,PACA_IRQ_DBELL;			\
+	beq	2f; /* doorbell: return immediately */	\
+	cmpwi	r10,PACA_IRQ_HMI;			\
+	beq	2f; /* HMI: return immediately */	\
+	mfspr	r10,SPRN_##_H##SRR1;			\
+	rldicl	r10,r10,48,1; /* clear MSR_EE */	\
+	rotldi	r10,r10,16; /* 48 + 16 = 64: rotate back into place */	\
+	mtspr	SPRN_##_H##SRR1,r10; /* return with EE cleared (hard disable) */ \
+2:	mtcrf	0x80,r9; /* reload CR field 0 from r9 */	\
+	ld	r9,PACA_EXGEN+EX_R9(r13);		\
+	ld	r10,PACA_EXGEN+EX_R10(r13);		\
+	ld	r11,PACA_EXGEN+EX_R11(r13);		\
+	GET_SCRATCH0(r13); /* reload r13 from the scratch SPR */	\
+	##_H##rfid;					\
 	b	.
-2:
-	/*
-	 * Check if we have successfully handled/recovered from error, if not
-	 * then stay on emergency stack and panic.
-	 */
-	ld	r3,RESULT(r1)	/* Load result */
-	cmpdi	r3,0		/* see if we handled MCE successfully */
 
-	beq	1b		/* if !handled then panic */
+/*
+ * Real mode exceptions actually use this too, but alternate
+ * instruction code patches (which end up in the common .text area)
+ * cannot reach these if they are put there.
+ */
+USE_FIXED_SECTION(virt_trampolines)
+	MASKED_INTERRUPT()
+	MASKED_INTERRUPT(H)
+
+#ifdef CONFIG_KVM_BOOK3S_64_HANDLER
+TRAMP_REAL_BEGIN(kvmppc_skip_interrupt)
 	/*
-	 * Return from MC interrupt.
-	 * Queue up the MCE event so that we can log it later, while
-	 * returning from kernel or opal call.
+	 * Here all GPRs are unchanged from when the interrupt happened
+	 * except for r13, which is saved in SPRG_SCRATCH0.
 	 */
-	bl	machine_check_queue_event
-	MACHINE_CHECK_HANDLER_WINDUP
+	mfspr	r13, SPRN_SRR0
+	addi	r13, r13, 4
+	mtspr	SPRN_SRR0, r13
+	GET_SCRATCH0(r13)
 	rfid
-9:
-	/* Deliver the machine check to host kernel in V mode. */
-	MACHINE_CHECK_HANDLER_WINDUP
-	b	machine_check_pSeries
+	b	.
 
-unrecover_mce:
-	/* Invoke machine_check_exception to print MCE event and panic. */
-	addi	r3,r1,STACK_FRAME_OVERHEAD
-	bl	machine_check_exception
+TRAMP_REAL_BEGIN(kvmppc_skip_Hinterrupt)
 	/*
-	 * We will not reach here. Even if we did, there is no way out. Call
-	 * unrecoverable_exception and die.
+	 * Here all GPRs are unchanged from when the interrupt happened
+	 * except for r13, which is saved in SPRG_SCRATCH0.
 	 */
-1:	addi	r3,r1,STACK_FRAME_OVERHEAD
-	bl	unrecoverable_exception
-	b	1b
-/*
- * r13 points to the PACA, r9 contains the saved CR,
- * r12 contain the saved SRR1, SRR0 is still ready for return
- * r3 has the faulting address
- * r9 - r13 are saved in paca->exslb.
- * r3 is saved in paca->slb_r3
- * We assume we aren't going to take any exceptions during this procedure.
- */
-slb_miss_realmode:
-	mflr	r10
-#ifdef CONFIG_RELOCATABLE
-	mtctr	r11
-#endif
-
-	stw	r9,PACA_EXSLB+EX_CCR(r13)	/* save CR in exc. frame */
-	std	r10,PACA_EXSLB+EX_LR(r13)	/* save LR */
-
-#ifdef CONFIG_PPC_STD_MMU_64
-BEGIN_MMU_FTR_SECTION
-	bl	slb_allocate_realmode
-END_MMU_FTR_SECTION_IFCLR(MMU_FTR_TYPE_RADIX)
+	mfspr	r13, SPRN_HSRR0
+	addi	r13, r13, 4
+	mtspr	SPRN_HSRR0, r13
+	GET_SCRATCH0(r13)
+	hrfid
+	b	.
 #endif
-	/* All done -- return from exception. */
 
-	ld	r10,PACA_EXSLB+EX_LR(r13)
-	ld	r3,PACA_EXSLB+EX_R3(r13)
-	lwz	r9,PACA_EXSLB+EX_CCR(r13)	/* get saved CR */
+/*
+ * Ensure that any handlers that get invoked from the exception prologs
+ * above are within the first 64KB (below 0x10000) of the kernel image because
+ * the prologs assemble the addresses of these handlers using the
+ * LOAD_HANDLER macro, which uses an ori instruction.
+ */
 
-	mtlr	r10
-	andi.	r10,r12,MSR_RI	/* check for unrecoverable exception */
-BEGIN_MMU_FTR_SECTION
-	beq-	2f
-FTR_SECTION_ELSE
-	b	2f
-ALT_MMU_FTR_SECTION_END_IFCLR(MMU_FTR_TYPE_RADIX)
+/*** Common interrupt handlers ***/
 
-.machine	push
-.machine	"power4"
-	mtcrf	0x80,r9
-	mtcrf	0x01,r9		/* slb_allocate uses cr0 and cr7 */
-.machine	pop
 
-	RESTORE_PPR_PACA(PACA_EXSLB, r9)
-	ld	r9,PACA_EXSLB+EX_R9(r13)
-	ld	r10,PACA_EXSLB+EX_R10(r13)
-	ld	r11,PACA_EXSLB+EX_R11(r13)
-	ld	r12,PACA_EXSLB+EX_R12(r13)
-	ld	r13,PACA_EXSLB+EX_R13(r13)
-	rfid
-	b	.	/* prevent speculative execution */
+	/*
+	 * Relocation-on interrupts: A subset of the interrupts can be delivered
+	 * with IR=1/DR=1, if AIL==2 and MSR.HV won't be changed by delivering
+	 * it.  Addresses are the same as the original interrupt addresses, but
+	 * offset by 0xc000000000004000.
+	 * It's impossible to receive interrupts below 0x300 via this mechanism.
+	 * KVM: None of these traps are from the guest; anything that escalated
+	 * to HV=1 from HV=0 is delivered via real mode handlers.
+	 */
 
-2:	mfspr	r11,SPRN_SRR0
-	ld	r10,PACAKBASE(r13)
-	LOAD_HANDLER(r10,unrecov_slb)
-	mtspr	SPRN_SRR0,r10
-	ld	r10,PACAKMSR(r13)
-	mtspr	SPRN_SRR1,r10
-	rfid
-	b	.
+	/*
+	 * This uses the standard macro, since the original 0x300 vector
+	 * only has extra guff for STAB-based processors -- which never
+	 * come here.
+	 */
 
-unrecov_slb:
-	EXCEPTION_PROLOG_COMMON(0x4100, PACA_EXSLB)
-	RECONCILE_IRQ_STATE(r10, r11)
-	bl	save_nvgprs
-1:	addi	r3,r1,STACK_FRAME_OVERHEAD
-	bl	unrecoverable_exception
-	b	1b
+EXC_COMMON_BEGIN(ppc64_runlatch_on_trampoline)
+	b	__ppc64_runlatch_on
 
+USE_FIXED_SECTION(virt_trampolines)
+	/*
+	 * The __end_interrupts marker must be past the out-of-line (OOL)
+	 * handlers, so that they are copied to real address 0x100 when running
+	 * a relocatable kernel. This ensures they can be reached from the short
+	 * trampoline handlers (like 0x4f00, 0x4f20, etc.) which branch
+	 * directly, without using LOAD_HANDLER().
+	 */
+	.align	7
+	.globl	__end_interrupts
+__end_interrupts:
+DEFINE_FIXED_SYMBOL(__end_interrupts)
 
 #ifdef CONFIG_PPC_970_NAP
-power4_fixup_nap:
+TRAMP_REAL_BEGIN(power4_fixup_nap)
 	andc	r9,r9,r10
 	std	r9,TI_LOCAL_FLAGS(r11)
 	ld	r10,_LINK(r1)		/* make idle task do the */
@@ -1480,6 +1385,13 @@ power4_fixup_nap:
 	blr
 #endif
 
+CLOSE_FIXED_SECTION(real_vectors);
+CLOSE_FIXED_SECTION(real_trampolines);
+CLOSE_FIXED_SECTION(virt_vectors);
+CLOSE_FIXED_SECTION(virt_trampolines);
+
+USE_TEXT_SECTION()
+
 /*
  * Hash table stuff
  */
@@ -1625,3 +1537,39 @@ END_FTR_SECTION_IFSET(CPU_FTR_CFAR)
 1:	addi	r3,r1,STACK_FRAME_OVERHEAD
 	bl	kernel_bad_stack
 	b	1b
+
+/*
+ * Called from arch_local_irq_enable when an interrupt needs
+ * to be resent. r3 contains 0x500, 0x900, 0xa00, 0xe60, 0xe80 or 0xea0
+ * to indicate which kind of interrupt. MSR:EE is already off. We generate a
+ * stackframe like if a real interrupt had happened.
+ *
+ * Note: While MSR:EE is off, we need to make sure that _MSR
+ * in the generated frame has EE set to 1 or the exception
+ * handler will not properly re-enable them.
+ */
+_GLOBAL(__replay_interrupt)
+	/* We are going to jump to the exception common code which
+	 * will retrieve various register values from the PACA which
+	 * we don't give a damn about, so we don't bother storing them.
+	 *
+	 * NOTE(review): r9/r11/r12 are loaded with CR/LR/MSR below,
+	 * presumably because the *_common handlers take the interrupted
+	 * CR/NIP/MSR in those registers -- confirm against
+	 * EXCEPTION_PROLOG_COMMON.
+	 */
+	mfmsr	r12			/* r12 = current MSR */
+	mflr	r11			/* r11 = LR, i.e. our caller's return address */
+	mfcr	r9			/* r9 = current CR */
+	ori	r12,r12,MSR_EE		/* EE set in the r12 copy only; MSR itself untouched */
+	cmpwi	r3,0x900		/* dispatch on the vector number passed in r3 */
+	beq	decrementer_common
+	cmpwi	r3,0x500
+	beq	hardware_interrupt_common
+BEGIN_FTR_SECTION
+	cmpwi	r3,0xe80		/* presumably the CPU_FTR_HVMODE-set variant -- confirm */
+	beq	h_doorbell_common
+	cmpwi	r3,0xea0
+	beq	h_virt_irq_common
+	cmpwi	r3,0xe60
+	beq	hmi_exception_common
+FTR_SECTION_ELSE
+	cmpwi	r3,0xa00		/* alternative used when the feature test fails */
+	beq	doorbell_super_common
+ALT_FTR_SECTION_END_IFSET(CPU_FTR_HVMODE)
+	blr				/* r3 matched none of the vectors: just return */

+ 7 - 1
arch/powerpc/kernel/fadump.c

@@ -778,7 +778,11 @@ static int fadump_init_elfcore_header(char *bufp)
 	elf->e_entry = 0;
 	elf->e_phoff = sizeof(struct elfhdr);
 	elf->e_shoff = 0;
-	elf->e_flags = ELF_CORE_EFLAGS;
+#if defined(_CALL_ELF)
+	elf->e_flags = _CALL_ELF;
+#else
+	elf->e_flags = 0;
+#endif
 	elf->e_ehsize = sizeof(struct elfhdr);
 	elf->e_phentsize = sizeof(struct elf_phdr);
 	elf->e_phnum = 0;
@@ -1104,7 +1108,9 @@ static ssize_t fadump_release_memory_store(struct kobject *kobj,
 		 * Take away the '/proc/vmcore'. We are releasing the dump
 		 * memory, hence it will not be valid anymore.
 		 */
+#ifdef CONFIG_PROC_VMCORE
 		vmcore_cleanup();
+#endif
 		fadump_invalidate_release_mem();
 
 	} else

+ 0 - 26
arch/powerpc/kernel/fpu.S

@@ -50,32 +50,6 @@ END_FTR_SECTION_IFSET(CPU_FTR_VSX);					\
 #define REST_32FPVSRS(n,c,base) __REST_32FPVSRS(n,__REG_##c,__REG_##base)
 #define SAVE_32FPVSRS(n,c,base) __SAVE_32FPVSRS(n,__REG_##c,__REG_##base)
 
-#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
-/* void do_load_up_transact_fpu(struct thread_struct *thread)
- *
- * This is similar to load_up_fpu but for the transactional version of the FP
- * register set.  It doesn't mess with the task MSR or valid flags.
- * Furthermore, we don't do lazy FP with TM currently.
- */
-_GLOBAL(do_load_up_transact_fpu)
-	mfmsr	r6
-	ori	r5,r6,MSR_FP
-#ifdef CONFIG_VSX
-BEGIN_FTR_SECTION
-	oris	r5,r5,MSR_VSX@h
-END_FTR_SECTION_IFSET(CPU_FTR_VSX)
-#endif
-	SYNC
-	MTMSRD(r5)
-
-	addi	r7,r3,THREAD_TRANSACT_FPSTATE
-	lfd	fr0,FPSTATE_FPSCR(r7)
-	MTFSF_L(fr0)
-	REST_32FPVSRS(0, R4, R7)
-
-	blr
-#endif /* CONFIG_PPC_TRANSACTIONAL_MEM */
-
 /*
  * Load state from memory into FP registers including FPSCR.
  * Assumes the caller has enabled FP in the MSR.

+ 0 - 3
arch/powerpc/kernel/head_32.S

@@ -266,7 +266,6 @@ __secondary_hold_acknowledge:
 
 
 #define EXCEPTION_PROLOG_2	\
-	CLR_TOP32(r11);		\
 	stw	r10,_CCR(r11);		/* save registers */ \
 	stw	r12,GPR12(r11);	\
 	stw	r9,GPR9(r11);	\
@@ -862,7 +861,6 @@ __secondary_start:
 	/* ptr to phys current thread */
 	tophys(r4,r2)
 	addi	r4,r4,THREAD	/* phys address of our thread_struct */
-	CLR_TOP32(r4)
 	mtspr	SPRN_SPRG_THREAD,r4
 	li	r3,0
 	mtspr	SPRN_SPRG_RTAS,r3	/* 0 => not in RTAS */
@@ -949,7 +947,6 @@ start_here:
 	/* ptr to phys current thread */
 	tophys(r4,r2)
 	addi	r4,r4,THREAD	/* init task's THREAD */
-	CLR_TOP32(r4)
 	mtspr	SPRN_SPRG_THREAD,r4
 	li	r3,0
 	mtspr	SPRN_SPRG_RTAS,r3	/* 0 => not in RTAS */

+ 36 - 17
arch/powerpc/kernel/head_64.S

@@ -28,6 +28,7 @@
 #include <asm/page.h>
 #include <asm/mmu.h>
 #include <asm/ppc_asm.h>
+#include <asm/head-64.h>
 #include <asm/asm-offsets.h>
 #include <asm/bug.h>
 #include <asm/cputable.h>
@@ -65,9 +66,14 @@
  *   2. The kernel is entered at __start
  */
 
-	.text
-	.globl  _stext
-_stext:
+OPEN_FIXED_SECTION(first_256B, 0x0, 0x100)
+USE_FIXED_SECTION(first_256B)
+	/*
+	 * Offsets are relative from the start of fixed section, and
+	 * first_256B starts at 0. Offsets are a bit easier to use here
+	 * than the fixed section entry macros.
+	 */
+	. = 0x0
 _GLOBAL(__start)
 	/* NOP this out unconditionally */
 BEGIN_FTR_SECTION
@@ -104,6 +110,7 @@ __secondary_hold_acknowledge:
 	. = 0x5c
 	.globl	__run_at_load
 __run_at_load:
+DEFINE_FIXED_SYMBOL(__run_at_load)
 	.long	0x72756e30	/* "run0" -- relocate to 0 by default */
 #endif
 
@@ -133,7 +140,7 @@ __secondary_hold:
 	/* Tell the master cpu we're here */
 	/* Relocation is off & we are located at an address less */
 	/* than 0x100, so only need to grab low order offset.    */
-	std	r24,__secondary_hold_acknowledge-_stext(0)
+	std	r24,(ABS_ADDR(__secondary_hold_acknowledge))(0)
 	sync
 
 	li	r26,0
@@ -141,7 +148,7 @@ __secondary_hold:
 	tovirt(r26,r26)
 #endif
 	/* All secondary cpus wait here until told to start. */
-100:	ld	r12,__secondary_hold_spinloop-_stext(r26)
+100:	ld	r12,(ABS_ADDR(__secondary_hold_spinloop))(r26)
 	cmpdi	0,r12,0
 	beq	100b
 
@@ -166,12 +173,13 @@ __secondary_hold:
 #else
 	BUG_OPCODE
 #endif
+CLOSE_FIXED_SECTION(first_256B)
 
 /* This value is used to mark exception frames on the stack. */
 	.section ".toc","aw"
 exception_marker:
 	.tc	ID_72656773_68657265[TC],0x7265677368657265
-	.text
+	.previous
 
 /*
  * On server, we include the exception vectors code here as it
@@ -180,8 +188,12 @@ exception_marker:
  */
 #ifdef CONFIG_PPC_BOOK3S
 #include "exceptions-64s.S"
+#else
+OPEN_TEXT_SECTION(0x100)
 #endif
 
+USE_TEXT_SECTION()
+
 #ifdef CONFIG_PPC_BOOK3E
 /*
  * The booting_thread_hwid holds the thread id we want to boot in cpu
@@ -558,7 +570,7 @@ __after_prom_start:
 #if defined(CONFIG_PPC_BOOK3E)
 	tovirt(r26,r26)		/* on booke, we already run at PAGE_OFFSET */
 #endif
-	lwz	r7,__run_at_load-_stext(r26)
+	lwz	r7,(FIXED_SYMBOL_ABS_ADDR(__run_at_load))(r26)
 #if defined(CONFIG_PPC_BOOK3E)
 	tophys(r26,r26)
 #endif
@@ -601,7 +613,7 @@ __after_prom_start:
 #if defined(CONFIG_PPC_BOOK3E)
 	tovirt(r26,r26)		/* on booke, we already run at PAGE_OFFSET */
 #endif
-	lwz	r7,__run_at_load-_stext(r26)
+	lwz	r7,(FIXED_SYMBOL_ABS_ADDR(__run_at_load))(r26)
 	cmplwi	cr0,r7,1
 	bne	3f
 
@@ -611,28 +623,35 @@ __after_prom_start:
 	sub	r5,r5,r11
 #else
 	/* just copy interrupts */
-	LOAD_REG_IMMEDIATE(r5, __end_interrupts - _stext)
+	LOAD_REG_IMMEDIATE(r5, FIXED_SYMBOL_ABS_ADDR(__end_interrupts))
 #endif
 	b	5f
 3:
 #endif
-	lis	r5,(copy_to_here - _stext)@ha
-	addi	r5,r5,(copy_to_here - _stext)@l /* # bytes of memory to copy */
+	/* # bytes of memory to copy */
+	lis	r5,(ABS_ADDR(copy_to_here))@ha
+	addi	r5,r5,(ABS_ADDR(copy_to_here))@l
 
 	bl	copy_and_flush		/* copy the first n bytes	 */
 					/* this includes the code being	 */
 					/* executed here.		 */
-	addis	r8,r3,(4f - _stext)@ha	/* Jump to the copy of this code */
-	addi	r12,r8,(4f - _stext)@l	/* that we just made */
+	/* Jump to the copy of this code that we just made */
+	addis	r8,r3,(ABS_ADDR(4f))@ha
+	addi	r12,r8,(ABS_ADDR(4f))@l
 	mtctr	r12
 	bctr
 
 .balign 8
-p_end:	.llong	_end - _stext
+p_end: .llong _end - copy_to_here
 
-4:	/* Now copy the rest of the kernel up to _end */
-	addis	r5,r26,(p_end - _stext)@ha
-	ld	r5,(p_end - _stext)@l(r5)	/* get _end */
+4:
+	/*
+	 * Now copy the rest of the kernel up to _end, add
+	 * _end - copy_to_here to the copy limit and run again.
+	 */
+	addis   r8,r26,(ABS_ADDR(p_end))@ha
+	ld      r8,(ABS_ADDR(p_end))@l(r8)
+	add	r5,r5,r8
 5:	bl	copy_and_flush		/* copy the rest */
 
 9:	b	start_here_multiplatform

+ 0 - 1
arch/powerpc/kernel/head_8xx.S

@@ -151,7 +151,6 @@ turn_on_mmu:
 
 
 #define EXCEPTION_PROLOG_2	\
-	CLR_TOP32(r11);		\
 	stw	r10,_CCR(r11);		/* save registers */ \
 	stw	r12,GPR12(r11);	\
 	stw	r9,GPR9(r11);	\

+ 6 - 3
arch/powerpc/kernel/hw_breakpoint.c

@@ -206,7 +206,7 @@ void thread_change_pc(struct task_struct *tsk, struct pt_regs *regs)
 /*
  * Handle debug exception notifications.
  */
-int __kprobes hw_breakpoint_handler(struct die_args *args)
+int hw_breakpoint_handler(struct die_args *args)
 {
 	int rc = NOTIFY_STOP;
 	struct perf_event *bp;
@@ -290,11 +290,12 @@ out:
 	rcu_read_unlock();
 	return rc;
 }
+NOKPROBE_SYMBOL(hw_breakpoint_handler);
 
 /*
  * Handle single-step exceptions following a DABR hit.
  */
-static int __kprobes single_step_dabr_instruction(struct die_args *args)
+static int single_step_dabr_instruction(struct die_args *args)
 {
 	struct pt_regs *regs = args->regs;
 	struct perf_event *bp = NULL;
@@ -329,11 +330,12 @@ static int __kprobes single_step_dabr_instruction(struct die_args *args)
 
 	return NOTIFY_STOP;
 }
+NOKPROBE_SYMBOL(single_step_dabr_instruction);
 
 /*
  * Handle debug exception notifications.
  */
-int __kprobes hw_breakpoint_exceptions_notify(
+int hw_breakpoint_exceptions_notify(
 		struct notifier_block *unused, unsigned long val, void *data)
 {
 	int ret = NOTIFY_DONE;
@@ -349,6 +351,7 @@ int __kprobes hw_breakpoint_exceptions_notify(
 
 	return ret;
 }
+NOKPROBE_SYMBOL(hw_breakpoint_exceptions_notify);
 
 /*
  * Release the user breakpoints used by ptrace

+ 1 - 1
arch/powerpc/kernel/ibmebus.c

@@ -227,7 +227,7 @@ int ibmebus_request_irq(u32 ist, irq_handler_t handler,
 {
 	unsigned int irq = irq_create_mapping(NULL, ist);
 
-	if (irq == NO_IRQ)
+	if (!irq)
 		return -EINVAL;
 
 	return request_irq(irq, handler, irq_flags, devname, dev_id);

+ 11 - 6
arch/powerpc/kernel/irq.c

@@ -67,6 +67,7 @@
 #include <asm/smp.h>
 #include <asm/debug.h>
 #include <asm/livepatch.h>
+#include <asm/asm-prototypes.h>
 
 #ifdef CONFIG_PPC64
 #include <asm/paca.h>
@@ -155,6 +156,15 @@ notrace unsigned int __check_irq_replay(void)
 		lv1_get_version_info(&tmp, &tmp2);
 	}
 
+	/*
+	 * Check if a Hypervisor Maintenance Interrupt (HMI) happened.
+	 * This is a higher priority interrupt than the others, so
+	 * replay it first.
+	 */
+	local_paca->irq_happened &= ~PACA_IRQ_HMI;
+	if (happened & PACA_IRQ_HMI)
+		return 0xe60;
+
 	/*
 	 * We may have missed a decrementer interrupt. We check the
 	 * decrementer itself rather than the paca irq_happened field
@@ -190,11 +200,6 @@ notrace unsigned int __check_irq_replay(void)
 	}
 #endif /* CONFIG_PPC_BOOK3E */
 
-	/* Check if an hypervisor Maintenance interrupt happened */
-	local_paca->irq_happened &= ~PACA_IRQ_HMI;
-	if (happened & PACA_IRQ_HMI)
-		return 0xe60;
-
 	/* There should be nothing left ! */
 	BUG_ON(local_paca->irq_happened != 0);
 
@@ -514,7 +519,7 @@ void __do_irq(struct pt_regs *regs)
 	may_hard_irq_enable();
 
 	/* And finally process it */
-	if (unlikely(irq == NO_IRQ))
+	if (unlikely(!irq))
 		__this_cpu_inc(irq_stat.spurious_irqs);
 	else
 		generic_handle_irq(irq);

+ 7 - 7
arch/powerpc/kernel/legacy_serial.c

@@ -193,10 +193,10 @@ static int __init add_legacy_soc_port(struct device_node *np,
 	 */
 	if (tsi && !strcmp(tsi->type, "tsi-bridge"))
 		return add_legacy_port(np, -1, UPIO_TSI, addr, addr,
-				       NO_IRQ, legacy_port_flags, 0);
+				       0, legacy_port_flags, 0);
 	else
 		return add_legacy_port(np, -1, UPIO_MEM, addr, addr,
-				       NO_IRQ, legacy_port_flags, 0);
+				       0, legacy_port_flags, 0);
 }
 
 static int __init add_legacy_isa_port(struct device_node *np,
@@ -242,7 +242,7 @@ static int __init add_legacy_isa_port(struct device_node *np,
 
 	/* Add port, irq will be dealt with later */
 	return add_legacy_port(np, index, UPIO_PORT, be32_to_cpu(reg[1]),
-			       taddr, NO_IRQ, legacy_port_flags, 0);
+			       taddr, 0, legacy_port_flags, 0);
 
 }
 
@@ -314,7 +314,7 @@ static int __init add_legacy_pci_port(struct device_node *np,
 	/* Add port, irq will be dealt with later. We passed a translated
 	 * IO port value. It will be fixed up later along with the irq
 	 */
-	return add_legacy_port(np, index, iotype, base, addr, NO_IRQ,
+	return add_legacy_port(np, index, iotype, base, addr, 0,
 			       legacy_port_flags, np != pci_dev);
 }
 #endif
@@ -462,14 +462,14 @@ static void __init fixup_port_irq(int index,
 	DBG("fixup_port_irq(%d)\n", index);
 
 	virq = irq_of_parse_and_map(np, 0);
-	if (virq == NO_IRQ && legacy_serial_infos[index].irq_check_parent) {
+	if (!virq && legacy_serial_infos[index].irq_check_parent) {
 		np = of_get_parent(np);
 		if (np == NULL)
 			return;
 		virq = irq_of_parse_and_map(np, 0);
 		of_node_put(np);
 	}
-	if (virq == NO_IRQ)
+	if (!virq)
 		return;
 
 	port->irq = virq;
@@ -543,7 +543,7 @@ static int __init serial_dev_init(void)
 		struct plat_serial8250_port *port = &legacy_serial_ports[i];
 		struct device_node *np = legacy_serial_infos[i].np;
 
-		if (port->irq == NO_IRQ)
+		if (!port->irq)
 			fixup_port_irq(i, np, port);
 		if (port->iotype == UPIO_PORT)
 			fixup_port_pio(i, np, port);

+ 23 - 52
arch/powerpc/kernel/machine_kexec_64.c

@@ -23,6 +23,7 @@
 #include <asm/current.h>
 #include <asm/machdep.h>
 #include <asm/cacheflush.h>
+#include <asm/firmware.h>
 #include <asm/paca.h>
 #include <asm/mmu.h>
 #include <asm/sections.h>	/* _end */
@@ -31,21 +32,6 @@
 #include <asm/hw_breakpoint.h>
 #include <asm/asm-prototypes.h>
 
-#ifdef CONFIG_PPC_BOOK3E
-int default_machine_kexec_prepare(struct kimage *image)
-{
-	int i;
-	/*
-	 * Since we use the kernel fault handlers and paging code to
-	 * handle the virtual mode, we must make sure no destination
-	 * overlaps kernel static data or bss.
-	 */
-	for (i = 0; i < image->nr_segments; i++)
-		if (image->segment[i].mem < __pa(_end))
-			return -ETXTBSY;
-	return 0;
-}
-#else
 int default_machine_kexec_prepare(struct kimage *image)
 {
 	int i;
@@ -55,9 +41,6 @@ int default_machine_kexec_prepare(struct kimage *image)
 	const unsigned long *basep;
 	const unsigned int *sizep;
 
-	if (!mmu_hash_ops.hpte_clear_all)
-		return -ENOENT;
-
 	/*
 	 * Since we use the kernel fault handlers and paging code to
 	 * handle the virtual mode, we must make sure no destination
@@ -67,31 +50,6 @@ int default_machine_kexec_prepare(struct kimage *image)
 		if (image->segment[i].mem < __pa(_end))
 			return -ETXTBSY;
 
-	/*
-	 * For non-LPAR, we absolutely can not overwrite the mmu hash
-	 * table, since we are still using the bolted entries in it to
-	 * do the copy.  Check that here.
-	 *
-	 * It is safe if the end is below the start of the blocked
-	 * region (end <= low), or if the beginning is after the
-	 * end of the blocked region (begin >= high).  Use the
-	 * boolean identity !(a || b)  === (!a && !b).
-	 */
-#ifdef CONFIG_PPC_STD_MMU_64
-	if (htab_address) {
-		low = __pa(htab_address);
-		high = low + htab_size_bytes;
-
-		for (i = 0; i < image->nr_segments; i++) {
-			begin = image->segment[i].mem;
-			end = begin + image->segment[i].memsz;
-
-			if ((begin < high) && (end > low))
-				return -ETXTBSY;
-		}
-	}
-#endif /* CONFIG_PPC_STD_MMU_64 */
-
 	/* We also should not overwrite the tce tables */
 	for_each_node_by_type(node, "pci") {
 		basep = of_get_property(node, "linux,tce-base", NULL);
@@ -113,7 +71,6 @@ int default_machine_kexec_prepare(struct kimage *image)
 
 	return 0;
 }
-#endif /* !CONFIG_PPC_BOOK3E */
 
 static void copy_segments(unsigned long ind)
 {
@@ -332,11 +289,14 @@ struct paca_struct kexec_paca;
 /* Our assembly helper, in misc_64.S */
 extern void kexec_sequence(void *newstack, unsigned long start,
 			   void *image, void *control,
-			   void (*clear_all)(void)) __noreturn;
+			   void (*clear_all)(void),
+			   bool copy_with_mmu_off) __noreturn;
 
 /* too late to fail here */
 void default_machine_kexec(struct kimage *image)
 {
+	bool copy_with_mmu_off;
+
 	/* prepare control code if any */
 
 	/*
@@ -374,18 +334,29 @@ void default_machine_kexec(struct kimage *image)
 	/* XXX: If anyone does 'dynamic lppacas' this will also need to be
 	 * switched to a static version!
 	 */
+	/*
+	 * On Book3S, the copy must happen with the MMU off if we are either
+	 * using Radix page tables or we are not in an LPAR since we can
+	 * overwrite the page tables while copying.
+	 *
+	 * In an LPAR, we keep the MMU on otherwise we can't access beyond
+	 * the RMA. On BookE there is no real MMU off mode, so we have to
+	 * keep it enabled as well (but then we have bolted TLB entries).
+	 */
+#ifdef CONFIG_PPC_BOOK3E
+	copy_with_mmu_off = false;
+#else
+	copy_with_mmu_off = radix_enabled() ||
+		!(firmware_has_feature(FW_FEATURE_LPAR) ||
+		  firmware_has_feature(FW_FEATURE_PS3_LV1));
+#endif
 
 	/* Some things are best done in assembly.  Finding globals with
 	 * a toc is easier in C, so pass in what we can.
 	 */
 	kexec_sequence(&kexec_stack, image->start, image,
-			page_address(image->control_code_page),
-#ifdef CONFIG_PPC_STD_MMU
-			mmu_hash_ops.hpte_clear_all
-#else
-			NULL
-#endif
-	);
+		       page_address(image->control_code_page),
+		       mmu_cleanup_all, copy_with_mmu_off);
 	/* NOTREACHED */
 }
 

+ 3 - 1
arch/powerpc/kernel/misc_32.S

@@ -328,7 +328,7 @@ END_FTR_SECTION_IFSET(CPU_FTR_UNIFIED_ID_CACHE)
  *
  * flush_icache_range(unsigned long start, unsigned long stop)
  */
-_KPROBE(flush_icache_range)
+_GLOBAL(flush_icache_range)
 BEGIN_FTR_SECTION
 	PURGE_PREFETCHED_INS
 	blr				/* for 601, do nothing */
@@ -358,6 +358,8 @@ END_FTR_SECTION_IFSET(CPU_FTR_COHERENT_ICACHE)
 	sync				/* additional sync needed on g4 */
 	isync
 	blr
+_ASM_NOKPROBE_SYMBOL(flush_icache_range)
+
 /*
  * Flush a particular page from the data cache to RAM.
  * Note: this is necessary because the instruction cache does *not*

+ 21 - 9
arch/powerpc/kernel/misc_64.S

@@ -66,7 +66,7 @@ PPC64_CACHES:
  *   flush all bytes from start through stop-1 inclusive
  */
 
-_KPROBE(flush_icache_range)
+_GLOBAL(flush_icache_range)
 BEGIN_FTR_SECTION
 	PURGE_PREFETCHED_INS
 	blr
@@ -109,7 +109,8 @@ END_FTR_SECTION_IFSET(CPU_FTR_COHERENT_ICACHE)
 	bdnz	2b
 	isync
 	blr
-	.previous .text
+_ASM_NOKPROBE_SYMBOL(flush_icache_range)
+
 /*
  * Like above, but only do the D-cache.
  *
@@ -591,7 +592,8 @@ real_mode:	/* assume normal blr return */
 #endif
 
 /*
- * kexec_sequence(newstack, start, image, control, clear_all())
+ * kexec_sequence(newstack, start, image, control, clear_all(),
+ *		  copy_with_mmu_off)
  *
  * does the grungy work with stack switching and real mode switches
  * also does simple calls to other code
@@ -627,7 +629,7 @@ _GLOBAL(kexec_sequence)
 	mr	r29,r5			/* image (virt) */
 	mr	r28,r6			/* control, unused */
 	mr	r27,r7			/* clear_all() fn desc */
-	mr	r26,r8			/* spare */
+	mr	r26,r8			/* copy_with_mmu_off */
 	lhz	r25,PACAHWCPUID(r13)	/* get our phys cpu from paca */
 
 	/* disable interrupts, we are overwriting kernel data next */
@@ -639,15 +641,24 @@ _GLOBAL(kexec_sequence)
 	mtmsrd	r3,1
 #endif
 
+	/* We need to turn the MMU off unless we are in hash mode
+	 * under a hypervisor
+	 */
+	cmpdi	r26,0
+	beq	1f
+	bl	real_mode
+1:
 	/* copy dest pages, flush whole dest image */
 	mr	r3,r29
 	bl	kexec_copy_flush	/* (image) */
 
-	/* turn off mmu */
+	/* turn off mmu now if not done earlier */
+	cmpdi	r26,0
+	bne	1f
 	bl	real_mode
 
 	/* copy  0x100 bytes starting at start to 0 */
-	li	r3,0
+1:	li	r3,0
 	mr	r4,r30		/* start, aka phys mem offset */
 	li	r5,0x100
 	li	r6,0
@@ -659,7 +670,9 @@ _GLOBAL(kexec_sequence)
 	li	r6,1
 	stw	r6,kexec_flag-1b(5)
 
-#ifndef CONFIG_PPC_BOOK3E
+	cmpdi	r27,0
+	beq	1f
+
 	/* clear out hardware hash page table and tlb */
 #ifdef PPC64_ELF_ABI_v1
 	ld	r12,0(r27)		/* deref function descriptor */
@@ -668,7 +681,6 @@ _GLOBAL(kexec_sequence)
 #endif
 	mtctr	r12
 	bctrl				/* mmu_hash_ops.hpte_clear_all(void); */
-#endif /* !CONFIG_PPC_BOOK3E */
 
 /*
  *   kexec image calling is:
@@ -695,7 +707,7 @@ _GLOBAL(kexec_sequence)
  *    are the boot cpu ?????
  *    other device tree differences (prop sizes, va vs pa, etc)...
  */
-	mr	r3,r25	# my phys cpu
+1:	mr	r3,r25	# my phys cpu
 	mr	r4,r30	# start, aka phys mem offset
 	mtlr	4
 	li	r5,0

+ 1 - 1
arch/powerpc/kernel/module.c

@@ -27,7 +27,7 @@
 #include <linux/sort.h>
 #include <asm/setup.h>
 
-LIST_HEAD(module_bug_list);
+static LIST_HEAD(module_bug_list);
 
 static const Elf_Shdr *find_section(const Elf_Ehdr *hdr,
 				    const Elf_Shdr *sechdrs,

+ 5 - 5
arch/powerpc/kernel/nvram_64.c

@@ -542,9 +542,9 @@ static ssize_t nvram_pstore_read(u64 *id, enum pstore_type_id *type,
 			time->tv_nsec = 0;
 		}
 		*buf = kmemdup(buff + hdr_size, length, GFP_KERNEL);
+		kfree(buff);
 		if (*buf == NULL)
 			return -ENOMEM;
-		kfree(buff);
 
 		*ecc_notice_size = 0;
 		if (err_type == ERR_TYPE_KERNEL_PANIC_GZ)
@@ -851,7 +851,7 @@ static long dev_nvram_ioctl(struct file *file, unsigned int cmd,
 	}
 }
 
-const struct file_operations nvram_fops = {
+static const struct file_operations nvram_fops = {
 	.owner		= THIS_MODULE,
 	.llseek		= dev_nvram_llseek,
 	.read		= dev_nvram_read,
@@ -956,7 +956,7 @@ int __init nvram_remove_partition(const char *name, int sig,
 
 		/* Make partition a free partition */
 		part->header.signature = NVRAM_SIG_FREE;
-		strncpy(part->header.name, "wwwwwwwwwwww", 12);
+		memset(part->header.name, 'w', 12);
 		part->header.checksum = nvram_checksum(&part->header);
 		rc = nvram_write_header(part);
 		if (rc <= 0) {
@@ -974,8 +974,8 @@ int __init nvram_remove_partition(const char *name, int sig,
 		}
 		if (prev) {
 			prev->header.length += part->header.length;
-			prev->header.checksum = nvram_checksum(&part->header);
-			rc = nvram_write_header(part);
+			prev->header.checksum = nvram_checksum(&prev->header);
+			rc = nvram_write_header(prev);
 			if (rc <= 0) {
 				printk(KERN_ERR "nvram_remove_partition: nvram_write failed (%d)\n", rc);
 				return rc;

+ 3 - 2
arch/powerpc/kernel/pci-common.c

@@ -360,7 +360,7 @@ static int pci_read_irq_line(struct pci_dev *pci_dev)
 			 line, pin);
 
 		virq = irq_create_mapping(NULL, line);
-		if (virq != NO_IRQ)
+		if (virq)
 			irq_set_irq_type(virq, IRQ_TYPE_LEVEL_LOW);
 	} else {
 		pr_debug(" Got one, spec %d cells (0x%08x 0x%08x...) on %s\n",
@@ -369,7 +369,8 @@ static int pci_read_irq_line(struct pci_dev *pci_dev)
 
 		virq = irq_create_of_mapping(&oirq);
 	}
-	if(virq == NO_IRQ) {
+
+	if (!virq) {
 		pr_debug(" Failed to map !\n");
 		return -1;
 	}

+ 1 - 1
arch/powerpc/kernel/pci_of_scan.c

@@ -178,7 +178,7 @@ struct pci_dev *of_create_pci_dev(struct device_node *node,
 		dev->hdr_type = PCI_HEADER_TYPE_NORMAL;
 		dev->rom_base_reg = PCI_ROM_ADDRESS;
 		/* Maybe do a default OF mapping here */
-		dev->irq = NO_IRQ;
+		dev->irq = 0;
 	}
 
 	of_pci_parse_addrs(node, dev);

+ 106 - 75
arch/powerpc/kernel/process.c

@@ -59,6 +59,7 @@
 #include <asm/exec.h>
 #include <asm/livepatch.h>
 #include <asm/cpu_has_feature.h>
+#include <asm/asm-prototypes.h>
 
 #include <linux/kprobes.h>
 #include <linux/kdebug.h>
@@ -88,7 +89,13 @@ static void check_if_tm_restore_required(struct task_struct *tsk)
 		set_thread_flag(TIF_RESTORE_TM);
 	}
 }
+
+static inline bool msr_tm_active(unsigned long msr)
+{
+	return MSR_TM_ACTIVE(msr);
+}
 #else
+static inline bool msr_tm_active(unsigned long msr) { return false; }
 static inline void check_if_tm_restore_required(struct task_struct *tsk) { }
 #endif /* CONFIG_PPC_TRANSACTIONAL_MEM */
 
@@ -104,7 +111,7 @@ static int __init enable_strict_msr_control(char *str)
 }
 early_param("ppc_strict_facility_enable", enable_strict_msr_control);
 
-void msr_check_and_set(unsigned long bits)
+unsigned long msr_check_and_set(unsigned long bits)
 {
 	unsigned long oldmsr = mfmsr();
 	unsigned long newmsr;
@@ -118,6 +125,8 @@ void msr_check_and_set(unsigned long bits)
 
 	if (oldmsr != newmsr)
 		mtmsr_isync(newmsr);
+
+	return newmsr;
 }
 
 void __msr_check_and_clear(unsigned long bits)
@@ -196,19 +205,30 @@ EXPORT_SYMBOL_GPL(flush_fp_to_thread);
 
 void enable_kernel_fp(void)
 {
+	unsigned long cpumsr;
+
 	WARN_ON(preemptible());
 
-	msr_check_and_set(MSR_FP);
+	cpumsr = msr_check_and_set(MSR_FP);
 
 	if (current->thread.regs && (current->thread.regs->msr & MSR_FP)) {
 		check_if_tm_restore_required(current);
+		/*
+		 * If a thread has already been reclaimed then the
+		 * checkpointed registers are on the CPU but have definitely
+		 * been saved by the reclaim code. Don't need to and *cannot*
+		 * giveup as this would save to the 'live' structure not the
+		 * checkpointed structure.
+		 */
+		if(!msr_tm_active(cpumsr) && msr_tm_active(current->thread.regs->msr))
+			return;
 		__giveup_fpu(current);
 	}
 }
 EXPORT_SYMBOL(enable_kernel_fp);
 
 static int restore_fp(struct task_struct *tsk) {
-	if (tsk->thread.load_fp) {
+	if (tsk->thread.load_fp || msr_tm_active(tsk->thread.regs->msr)) {
 		load_fp_state(&current->thread.fp_state);
 		current->thread.load_fp++;
 		return 1;
@@ -248,12 +268,23 @@ EXPORT_SYMBOL(giveup_altivec);
 
 void enable_kernel_altivec(void)
 {
+	unsigned long cpumsr;
+
 	WARN_ON(preemptible());
 
-	msr_check_and_set(MSR_VEC);
+	cpumsr = msr_check_and_set(MSR_VEC);
 
 	if (current->thread.regs && (current->thread.regs->msr & MSR_VEC)) {
 		check_if_tm_restore_required(current);
+		/*
+		 * If a thread has already been reclaimed then the
+		 * checkpointed registers are on the CPU but have definitely
+		 * been saved by the reclaim code. Don't need to and *cannot*
+		 * giveup as this would save  to the 'live' structure not the
+		 * checkpointed structure.
+		 */
+		if(!msr_tm_active(cpumsr) && msr_tm_active(current->thread.regs->msr))
+			return;
 		__giveup_altivec(current);
 	}
 }
@@ -278,7 +309,8 @@ EXPORT_SYMBOL_GPL(flush_altivec_to_thread);
 
 static int restore_altivec(struct task_struct *tsk)
 {
-	if (cpu_has_feature(CPU_FTR_ALTIVEC) && tsk->thread.load_vec) {
+	if (cpu_has_feature(CPU_FTR_ALTIVEC) &&
+		(tsk->thread.load_vec || msr_tm_active(tsk->thread.regs->msr))) {
 		load_vr_state(&tsk->thread.vr_state);
 		tsk->thread.used_vr = 1;
 		tsk->thread.load_vec++;
@@ -321,12 +353,23 @@ static void save_vsx(struct task_struct *tsk)
 
 void enable_kernel_vsx(void)
 {
+	unsigned long cpumsr;
+
 	WARN_ON(preemptible());
 
-	msr_check_and_set(MSR_FP|MSR_VEC|MSR_VSX);
+	cpumsr = msr_check_and_set(MSR_FP|MSR_VEC|MSR_VSX);
 
 	if (current->thread.regs && (current->thread.regs->msr & MSR_VSX)) {
 		check_if_tm_restore_required(current);
+		/*
+		 * If a thread has already been reclaimed then the
+		 * checkpointed registers are on the CPU but have definitely
+		 * been saved by the reclaim code. Don't need to and *cannot*
+		 * giveup as this would save  to the 'live' structure not the
+		 * checkpointed structure.
+		 */
+		if(!msr_tm_active(cpumsr) && msr_tm_active(current->thread.regs->msr))
+			return;
 		if (current->thread.regs->msr & MSR_FP)
 			__giveup_fpu(current);
 		if (current->thread.regs->msr & MSR_VEC)
@@ -438,6 +481,7 @@ void giveup_all(struct task_struct *tsk)
 		return;
 
 	msr_check_and_set(msr_all_available);
+	check_if_tm_restore_required(tsk);
 
 #ifdef CONFIG_PPC_FPU
 	if (usermsr & MSR_FP)
@@ -464,7 +508,8 @@ void restore_math(struct pt_regs *regs)
 {
 	unsigned long msr;
 
-	if (!current->thread.load_fp && !loadvec(current->thread))
+	if (!msr_tm_active(regs->msr) &&
+		!current->thread.load_fp && !loadvec(current->thread))
 		return;
 
 	msr = regs->msr;
@@ -767,29 +812,15 @@ static inline bool hw_brk_match(struct arch_hw_breakpoint *a,
 }
 
 #ifdef CONFIG_PPC_TRANSACTIONAL_MEM
+
+static inline bool tm_enabled(struct task_struct *tsk)
+{
+	return tsk && tsk->thread.regs && (tsk->thread.regs->msr & MSR_TM);
+}
+
 static void tm_reclaim_thread(struct thread_struct *thr,
 			      struct thread_info *ti, uint8_t cause)
 {
-	unsigned long msr_diff = 0;
-
-	/*
-	 * If FP/VSX registers have been already saved to the
-	 * thread_struct, move them to the transact_fp array.
-	 * We clear the TIF_RESTORE_TM bit since after the reclaim
-	 * the thread will no longer be transactional.
-	 */
-	if (test_ti_thread_flag(ti, TIF_RESTORE_TM)) {
-		msr_diff = thr->ckpt_regs.msr & ~thr->regs->msr;
-		if (msr_diff & MSR_FP)
-			memcpy(&thr->transact_fp, &thr->fp_state,
-			       sizeof(struct thread_fp_state));
-		if (msr_diff & MSR_VEC)
-			memcpy(&thr->transact_vr, &thr->vr_state,
-			       sizeof(struct thread_vr_state));
-		clear_ti_thread_flag(ti, TIF_RESTORE_TM);
-		msr_diff &= MSR_FP | MSR_VEC | MSR_VSX | MSR_FE0 | MSR_FE1;
-	}
-
 	/*
 	 * Use the current MSR TM suspended bit to track if we have
 	 * checkpointed state outstanding.
@@ -808,15 +839,9 @@ static void tm_reclaim_thread(struct thread_struct *thr,
 	if (!MSR_TM_SUSPENDED(mfmsr()))
 		return;
 
-	tm_reclaim(thr, thr->regs->msr, cause);
+	giveup_all(container_of(thr, struct task_struct, thread));
 
-	/* Having done the reclaim, we now have the checkpointed
-	 * FP/VSX values in the registers.  These might be valid
-	 * even if we have previously called enable_kernel_fp() or
-	 * flush_fp_to_thread(), so update thr->regs->msr to
-	 * indicate their current validity.
-	 */
-	thr->regs->msr |= msr_diff;
+	tm_reclaim(thr, thr->ckpt_regs.msr, cause);
 }
 
 void tm_reclaim_current(uint8_t cause)
@@ -832,8 +857,8 @@ static inline void tm_reclaim_task(struct task_struct *tsk)
 	 *
 	 * In switching we need to maintain a 2nd register state as
 	 * oldtask->thread.ckpt_regs.  We tm_reclaim(oldproc); this saves the
-	 * checkpointed (tbegin) state in ckpt_regs and saves the transactional
-	 * (current) FPRs into oldtask->thread.transact_fpr[].
+	 * checkpointed (tbegin) state in ckpt_regs, ckfp_state and
+	 * ckvr_state
 	 *
 	 * We also context switch (save) TFHAR/TEXASR/TFIAR in here.
 	 */
@@ -845,14 +870,6 @@ static inline void tm_reclaim_task(struct task_struct *tsk)
 	if (!MSR_TM_ACTIVE(thr->regs->msr))
 		goto out_and_saveregs;
 
-	/* Stash the original thread MSR, as giveup_fpu et al will
-	 * modify it.  We hold onto it to see whether the task used
-	 * FP & vector regs.  If the TIF_RESTORE_TM flag is set,
-	 * ckpt_regs.msr is already set.
-	 */
-	if (!test_ti_thread_flag(task_thread_info(tsk), TIF_RESTORE_TM))
-		thr->ckpt_regs.msr = thr->regs->msr;
-
 	TM_DEBUG("--- tm_reclaim on pid %d (NIP=%lx, "
 		 "ccr=%lx, msr=%lx, trap=%lx)\n",
 		 tsk->pid, thr->regs->nip,
@@ -881,6 +898,9 @@ void tm_recheckpoint(struct thread_struct *thread,
 {
 	unsigned long flags;
 
+	if (!(thread->regs->msr & MSR_TM))
+		return;
+
 	/* We really can't be interrupted here as the TEXASR registers can't
 	 * change and later in the trecheckpoint code, we have a userspace R1.
 	 * So let's hard disable over this region.
@@ -910,10 +930,10 @@ static inline void tm_recheckpoint_new_task(struct task_struct *new)
 	 * If the task was using FP, we non-lazily reload both the original and
 	 * the speculative FP register states.  This is because the kernel
 	 * doesn't see if/when a TM rollback occurs, so if we take an FP
-	 * unavoidable later, we are unable to determine which set of FP regs
+	 * unavailable later, we are unable to determine which set of FP regs
 	 * need to be restored.
 	 */
-	if (!new->thread.regs)
+	if (!tm_enabled(new))
 		return;
 
 	if (!MSR_TM_ACTIVE(new->thread.regs->msr)){
@@ -926,35 +946,35 @@ static inline void tm_recheckpoint_new_task(struct task_struct *new)
 		 "(new->msr 0x%lx, new->origmsr 0x%lx)\n",
 		 new->pid, new->thread.regs->msr, msr);
 
-	/* This loads the checkpointed FP/VEC state, if used */
 	tm_recheckpoint(&new->thread, msr);
 
-	/* This loads the speculative FP/VEC state, if used */
-	if (msr & MSR_FP) {
-		do_load_up_transact_fpu(&new->thread);
-		new->thread.regs->msr |=
-			(MSR_FP | new->thread.fpexc_mode);
-	}
-#ifdef CONFIG_ALTIVEC
-	if (msr & MSR_VEC) {
-		do_load_up_transact_altivec(&new->thread);
-		new->thread.regs->msr |= MSR_VEC;
-	}
-#endif
-	/* We may as well turn on VSX too since all the state is restored now */
-	if (msr & MSR_VSX)
-		new->thread.regs->msr |= MSR_VSX;
+	/*
+	 * The checkpointed state has been restored but the live state has
+	 * not, ensure all the math functionality is turned off to trigger
+	 * restore_math() to reload.
+	 */
+	new->thread.regs->msr &= ~(MSR_FP | MSR_VEC | MSR_VSX);
 
 	TM_DEBUG("*** tm_recheckpoint of pid %d complete "
 		 "(kernel msr 0x%lx)\n",
 		 new->pid, mfmsr());
 }
 
-static inline void __switch_to_tm(struct task_struct *prev)
+static inline void __switch_to_tm(struct task_struct *prev,
+		struct task_struct *new)
 {
 	if (cpu_has_feature(CPU_FTR_TM)) {
-		tm_enable();
-		tm_reclaim_task(prev);
+		if (tm_enabled(prev) || tm_enabled(new))
+			tm_enable();
+
+		if (tm_enabled(prev)) {
+			prev->thread.load_tm++;
+			tm_reclaim_task(prev);
+			if (!MSR_TM_ACTIVE(prev->thread.regs->msr) && prev->thread.load_tm == 0)
+				prev->thread.regs->msr &= ~MSR_TM;
+		}
+
+		tm_recheckpoint_new_task(new);
 	}
 }
 
@@ -976,6 +996,12 @@ void restore_tm_state(struct pt_regs *regs)
 {
 	unsigned long msr_diff;
 
+	/*
+	 * This is the only moment we should clear TIF_RESTORE_TM as
+	 * it is here that ckpt_regs.msr and pt_regs.msr become the same
+	 * again, anything else could lead to an incorrect ckpt_msr being
+	 * saved and therefore incorrect signal contexts.
+	 */
 	clear_thread_flag(TIF_RESTORE_TM);
 	if (!MSR_TM_ACTIVE(regs->msr))
 		return;
@@ -983,6 +1009,13 @@ void restore_tm_state(struct pt_regs *regs)
 	msr_diff = current->thread.ckpt_regs.msr & ~regs->msr;
 	msr_diff &= MSR_FP | MSR_VEC | MSR_VSX;
 
+	/* Set the load flags so restore_math() reloads what msr_diff needs */
+	if (msr_diff & MSR_FP)
+		current->thread.load_fp = 1;
+#ifdef CONFIG_ALTIVEC
+	if (cpu_has_feature(CPU_FTR_ALTIVEC) && (msr_diff & MSR_VEC))
+		current->thread.load_vec = 1;
+#endif
 	restore_math(regs);
 
 	regs->msr |= msr_diff;
@@ -990,7 +1023,7 @@ void restore_tm_state(struct pt_regs *regs)
 
 #else
 #define tm_recheckpoint_new_task(new)
-#define __switch_to_tm(prev)
+#define __switch_to_tm(prev, new)
 #endif /* CONFIG_PPC_TRANSACTIONAL_MEM */
 
 static inline void save_sprs(struct thread_struct *t)
@@ -1131,11 +1164,11 @@ struct task_struct *__switch_to(struct task_struct *prev,
 	 */
 	save_sprs(&prev->thread);
 
-	__switch_to_tm(prev);
-
 	/* Save FPU, Altivec, VSX and SPE state */
 	giveup_all(prev);
 
+	__switch_to_tm(prev, new);
+
 	/*
 	 * We can't take a PMU exception inside _switch() since there is a
 	 * window where the kernel stack SLB and the kernel stack are out
@@ -1143,8 +1176,6 @@ struct task_struct *__switch_to(struct task_struct *prev,
 	 */
 	hard_irq_disable();
 
-	tm_recheckpoint_new_task(new);
-
 	/*
 	 * Call restore_sprs() before calling _switch(). If we move it after
 	 * _switch() then we miss out on calling it for new tasks. The reason
@@ -1379,9 +1410,11 @@ int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src)
 	 * transitions the CPU out of TM mode.  Hence we need to call
 	 * tm_recheckpoint_new_task() (on the same task) to restore the
 	 * checkpointed state back and the TM mode.
+	 *
+	 * Can't pass dst because it isn't ready. Doesn't matter, passing
+	 * dst is only important for __switch_to()
 	 */
-	__switch_to_tm(src);
-	tm_recheckpoint_new_task(src);
+	__switch_to_tm(src, src);
 
 	*dst = *src;
 
@@ -1623,8 +1656,6 @@ void start_thread(struct pt_regs *regs, unsigned long start, unsigned long sp)
 	current->thread.used_spe = 0;
 #endif /* CONFIG_SPE */
 #ifdef CONFIG_PPC_TRANSACTIONAL_MEM
-	if (cpu_has_feature(CPU_FTR_TM))
-		regs->msr |= MSR_TM;
 	current->thread.tm_tfhar = 0;
 	current->thread.tm_texasr = 0;
 	current->thread.tm_tfiar = 0;

+ 82 - 0
arch/powerpc/kernel/prom_init.c

@@ -42,6 +42,7 @@
 #include <asm/sections.h>
 #include <asm/machdep.h>
 #include <asm/opal.h>
+#include <asm/asm-prototypes.h>
 
 #include <linux/linux_logo.h>
 
@@ -2643,6 +2644,86 @@ static void __init fixup_device_tree_efika(void)
 #define fixup_device_tree_efika()
 #endif
 
+#ifdef CONFIG_PPC_PASEMI_NEMO
+/*
+ * CFE supplied on Nemo is broken in several ways, biggest
+ * problem is that it reassigns ISA interrupts to unused mpic ints.
+ * Add an interrupt-controller property for the io-bridge to use
+ * and correct the ints so we can attach them to an irq_domain
+ */
+static void __init fixup_device_tree_pasemi(void)
+{
+	u32 irqs[2], parent, len, empty = 0;
+	char *path, *pci_path;
+	phandle iob, node;
+
+	/* Locate the root pci node; nothing to fix up without it */
+	path = "/pxp@0,e0000000";
+	iob = call_prom("finddevice", 1, 1, ADDR(path));
+	if (!PHANDLE_VALID(iob))
+		return;
+
+	/* Bail out if the interrupt-controller property already exists */
+	if (prom_getproplen(iob, "interrupt-controller") != PROM_ERROR)
+		return;
+
+	prom_printf("adding interrupt-controller property for SB600...\n");
+
+	prom_setprop(iob, path, "interrupt-controller", &empty, 0);
+
+	pci_path = "/pxp@0,e0000000/pci@11";
+	node = call_prom("finddevice", 1, 1, ADDR(pci_path));
+	parent = ADDR(iob);
+
+	while (prom_next_node(&node)) {
+		/* Only nodes with a non-empty "interrupts" property matter */
+		if (!PHANDLE_VALID(node))
+			continue;
+
+		len = prom_getproplen(node, "interrupts");
+		if (len == 0 || len == PROM_ERROR)
+			continue;
+
+		prom_getprop(node, "interrupts", &irqs, sizeof(irqs));
+		if (irqs[0] < 212 || irqs[0] > 222)
+			continue;
+
+		/* Remap the first cell (known to be 212..222 here) */
+		if (irqs[0] <= 215)
+			irqs[0] -= 203;
+		else if (irqs[0] <= 220)
+			irqs[0] -= 213;
+		else if (irqs[0] == 221)
+			irqs[0] = 14;
+		else
+			irqs[0] = 8;
+
+		prom_setprop(node, pci_path, "interrupts", irqs,
+					sizeof(irqs));
+		prom_setprop(node, pci_path, "interrupt-parent", &parent,
+					sizeof(parent));
+	}
+
+	/*
+	 * The io-bridge node has device_type 'io-bridge'; switch it to 'isa'
+	 * so that generic isa-bridge code can add the SB600 and its on-board
+	 * peripherals.
+	 */
+	path = "/pxp@0,e0000000/io-bridge@0";
+	iob = call_prom("finddevice", 1, 1, ADDR(path));
+	if (!PHANDLE_VALID(iob))
+		return;
+
+	/* device_type is already present, so this only overwrites it. */
+
+	prom_printf("Changing device_type of SB600 node...\n");
+
+	prom_setprop(iob, path, "device_type", "isa", sizeof("isa"));
+}
+#else	/* !CONFIG_PPC_PASEMI_NEMO */
+static inline void fixup_device_tree_pasemi(void) { }
+#endif
+
 static void __init fixup_device_tree(void)
 {
 	fixup_device_tree_maple();
@@ -2650,6 +2731,7 @@ static void __init fixup_device_tree(void)
 	fixup_device_tree_chrp();
 	fixup_device_tree_pmac();
 	fixup_device_tree_efika();
+	fixup_device_tree_pasemi();
 }
 
 static void __init prom_find_boot_cpu(void)

+ 98 - 255
arch/powerpc/kernel/ptrace.c

@@ -39,6 +39,7 @@
 #include <asm/pgtable.h>
 #include <asm/switch_to.h>
 #include <asm/tm.h>
+#include <asm/asm-prototypes.h>
 
 #define CREATE_TRACE_POINTS
 #include <trace/events/syscalls.h>
@@ -402,13 +403,9 @@ static int gpr_set(struct task_struct *target, const struct user_regset *regset,
 }
 
 /*
- * When the transaction is active, 'transact_fp' holds the current running
- * value of all FPR registers and 'fp_state' holds the last checkpointed
- * value of all FPR registers for the current transaction. When transaction
- * is not active 'fp_state' holds the current running state of all the FPR
- * registers. So this function which returns the current running values of
- * all the FPR registers, needs to know whether any transaction is active
- * or not.
+ * Regardless of transactions, 'fp_state' holds the current running
+ * value of all FPR registers and 'ckfp_state' holds the last checkpointed
+ * value of all FPR registers for the current transaction.
  *
  * Userspace interface buffer layout:
  *
@@ -416,13 +413,6 @@ static int gpr_set(struct task_struct *target, const struct user_regset *regset,
  *	u64	fpr[32];
  *	u64	fpscr;
  * };
- *
- * There are two config options CONFIG_VSX and CONFIG_PPC_TRANSACTIONAL_MEM
- * which determines the final code in this function. All the combinations of
- * these two config options are possible except the one below as transactional
- * memory config pulls in CONFIG_VSX automatically.
- *
- *	!defined(CONFIG_VSX) && defined(CONFIG_PPC_TRANSACTIONAL_MEM)
  */
 static int fpr_get(struct task_struct *target, const struct user_regset *regset,
 		   unsigned int pos, unsigned int count,
@@ -431,50 +421,29 @@ static int fpr_get(struct task_struct *target, const struct user_regset *regset,
 #ifdef CONFIG_VSX
 	u64 buf[33];
 	int i;
-#endif
-	flush_fp_to_thread(target);
 
-#if defined(CONFIG_VSX) && defined(CONFIG_PPC_TRANSACTIONAL_MEM)
-	/* copy to local buffer then write that out */
-	if (MSR_TM_ACTIVE(target->thread.regs->msr)) {
-		flush_altivec_to_thread(target);
-		flush_tmregs_to_thread(target);
-		for (i = 0; i < 32 ; i++)
-			buf[i] = target->thread.TS_TRANS_FPR(i);
-		buf[32] = target->thread.transact_fp.fpscr;
-	} else {
-		for (i = 0; i < 32 ; i++)
-			buf[i] = target->thread.TS_FPR(i);
-		buf[32] = target->thread.fp_state.fpscr;
-	}
-	return user_regset_copyout(&pos, &count, &kbuf, &ubuf, buf, 0, -1);
-#endif
+	flush_fp_to_thread(target);
 
-#if defined(CONFIG_VSX) && !defined(CONFIG_PPC_TRANSACTIONAL_MEM)
 	/* copy to local buffer then write that out */
 	for (i = 0; i < 32 ; i++)
 		buf[i] = target->thread.TS_FPR(i);
 	buf[32] = target->thread.fp_state.fpscr;
 	return user_regset_copyout(&pos, &count, &kbuf, &ubuf, buf, 0, -1);
-#endif
-
-#if !defined(CONFIG_VSX) && !defined(CONFIG_PPC_TRANSACTIONAL_MEM)
+#else
 	BUILD_BUG_ON(offsetof(struct thread_fp_state, fpscr) !=
 		     offsetof(struct thread_fp_state, fpr[32]));
 
+	flush_fp_to_thread(target);
+
 	return user_regset_copyout(&pos, &count, &kbuf, &ubuf,
 				   &target->thread.fp_state, 0, -1);
 #endif
 }
 
 /*
- * When the transaction is active, 'transact_fp' holds the current running
- * value of all FPR registers and 'fp_state' holds the last checkpointed
- * value of all FPR registers for the current transaction. When transaction
- * is not active 'fp_state' holds the current running state of all the FPR
- * registers. So this function which setss the current running values of
- * all the FPR registers, needs to know whether any transaction is active
- * or not.
+ * Regardless of transactions, 'fp_state' holds the current running
+ * value of all FPR registers and 'ckfp_state' holds the last checkpointed
+ * value of all FPR registers for the current transaction.
  *
  * Userspace interface buffer layout:
  *
@@ -483,12 +452,6 @@ static int fpr_get(struct task_struct *target, const struct user_regset *regset,
  *	u64	fpscr;
  * };
  *
- * There are two config options CONFIG_VSX and CONFIG_PPC_TRANSACTIONAL_MEM
- * which determines the final code in this function. All the combinations of
- * these two config options are possible except the one below as transactional
- * memory config pulls in CONFIG_VSX automatically.
- *
- *	!defined(CONFIG_VSX) && defined(CONFIG_PPC_TRANSACTIONAL_MEM)
  */
 static int fpr_set(struct task_struct *target, const struct user_regset *regset,
 		   unsigned int pos, unsigned int count,
@@ -497,44 +460,24 @@ static int fpr_set(struct task_struct *target, const struct user_regset *regset,
 #ifdef CONFIG_VSX
 	u64 buf[33];
 	int i;
-#endif
+
 	flush_fp_to_thread(target);
 
-#if defined(CONFIG_VSX) && defined(CONFIG_PPC_TRANSACTIONAL_MEM)
 	/* copy to local buffer then write that out */
 	i = user_regset_copyin(&pos, &count, &kbuf, &ubuf, buf, 0, -1);
 	if (i)
 		return i;
 
-	if (MSR_TM_ACTIVE(target->thread.regs->msr)) {
-		flush_altivec_to_thread(target);
-		flush_tmregs_to_thread(target);
-		for (i = 0; i < 32 ; i++)
-			target->thread.TS_TRANS_FPR(i) = buf[i];
-		target->thread.transact_fp.fpscr = buf[32];
-	} else {
-		for (i = 0; i < 32 ; i++)
-			target->thread.TS_FPR(i) = buf[i];
-		target->thread.fp_state.fpscr = buf[32];
-	}
-	return 0;
-#endif
-
-#if defined(CONFIG_VSX) && !defined(CONFIG_PPC_TRANSACTIONAL_MEM)
-	/* copy to local buffer then write that out */
-	i = user_regset_copyin(&pos, &count, &kbuf, &ubuf, buf, 0, -1);
-	if (i)
-		return i;
 	for (i = 0; i < 32 ; i++)
 		target->thread.TS_FPR(i) = buf[i];
 	target->thread.fp_state.fpscr = buf[32];
 	return 0;
-#endif
-
-#if !defined(CONFIG_VSX) && !defined(CONFIG_PPC_TRANSACTIONAL_MEM)
+#else
 	BUILD_BUG_ON(offsetof(struct thread_fp_state, fpscr) !=
 		     offsetof(struct thread_fp_state, fpr[32]));
 
+	flush_fp_to_thread(target);
+
 	return user_regset_copyin(&pos, &count, &kbuf, &ubuf,
 				  &target->thread.fp_state, 0, -1);
 #endif
@@ -562,13 +505,10 @@ static int vr_active(struct task_struct *target,
 }
 
 /*
- * When the transaction is active, 'transact_vr' holds the current running
- * value of all the VMX registers and 'vr_state' holds the last checkpointed
- * value of all the VMX registers for the current transaction to fall back
- * on in case it aborts. When transaction is not active 'vr_state' holds
- * the current running state of all the VMX registers. So this function which
- * gets the current running values of all the VMX registers, needs to know
- * whether any transaction is active or not.
+ * Regardless of transactions, 'vr_state' holds the current running
+ * value of all the VMX registers and 'ckvr_state' holds the last
+ * checkpointed value of all the VMX registers for the current
+ * transaction to fall back on in case it aborts.
  *
  * Userspace interface buffer layout:
  *
@@ -582,7 +522,6 @@ static int vr_get(struct task_struct *target, const struct user_regset *regset,
 		  unsigned int pos, unsigned int count,
 		  void *kbuf, void __user *ubuf)
 {
-	struct thread_vr_state *addr;
 	int ret;
 
 	flush_altivec_to_thread(target);
@@ -590,19 +529,8 @@ static int vr_get(struct task_struct *target, const struct user_regset *regset,
 	BUILD_BUG_ON(offsetof(struct thread_vr_state, vscr) !=
 		     offsetof(struct thread_vr_state, vr[32]));
 
-#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
-	if (MSR_TM_ACTIVE(target->thread.regs->msr)) {
-		flush_fp_to_thread(target);
-		flush_tmregs_to_thread(target);
-		addr = &target->thread.transact_vr;
-	} else {
-		addr = &target->thread.vr_state;
-	}
-#else
-	addr = &target->thread.vr_state;
-#endif
 	ret = user_regset_copyout(&pos, &count, &kbuf, &ubuf,
-				  addr, 0,
+				  &target->thread.vr_state, 0,
 				  33 * sizeof(vector128));
 	if (!ret) {
 		/*
@@ -614,14 +542,7 @@ static int vr_get(struct task_struct *target, const struct user_regset *regset,
 		} vrsave;
 		memset(&vrsave, 0, sizeof(vrsave));
 
-#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
-		if (MSR_TM_ACTIVE(target->thread.regs->msr))
-			vrsave.word = target->thread.transact_vrsave;
-		else
-			vrsave.word = target->thread.vrsave;
-#else
 		vrsave.word = target->thread.vrsave;
-#endif
 
 		ret = user_regset_copyout(&pos, &count, &kbuf, &ubuf, &vrsave,
 					  33 * sizeof(vector128), -1);
@@ -631,13 +552,10 @@ static int vr_get(struct task_struct *target, const struct user_regset *regset,
 }
 
 /*
- * When the transaction is active, 'transact_vr' holds the current running
- * value of all the VMX registers and 'vr_state' holds the last checkpointed
- * value of all the VMX registers for the current transaction to fall back
- * on in case it aborts. When transaction is not active 'vr_state' holds
- * the current running state of all the VMX registers. So this function which
- * sets the current running values of all the VMX registers, needs to know
- * whether any transaction is active or not.
+ * Regardless of transactions, 'vr_state' holds the current running
+ * value of all the VMX registers and 'ckvr_state' holds the last
+ * checkpointed value of all the VMX registers for the current
+ * transaction to fall back on in case it aborts.
  *
  * Userspace interface buffer layout:
  *
@@ -651,7 +569,6 @@ static int vr_set(struct task_struct *target, const struct user_regset *regset,
 		  unsigned int pos, unsigned int count,
 		  const void *kbuf, const void __user *ubuf)
 {
-	struct thread_vr_state *addr;
 	int ret;
 
 	flush_altivec_to_thread(target);
@@ -659,19 +576,8 @@ static int vr_set(struct task_struct *target, const struct user_regset *regset,
 	BUILD_BUG_ON(offsetof(struct thread_vr_state, vscr) !=
 		     offsetof(struct thread_vr_state, vr[32]));
 
-#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
-	if (MSR_TM_ACTIVE(target->thread.regs->msr)) {
-		flush_fp_to_thread(target);
-		flush_tmregs_to_thread(target);
-		addr = &target->thread.transact_vr;
-	} else {
-		addr = &target->thread.vr_state;
-	}
-#else
-	addr = &target->thread.vr_state;
-#endif
 	ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf,
-				 addr, 0,
+				 &target->thread.vr_state, 0,
 				 33 * sizeof(vector128));
 	if (!ret && count > 0) {
 		/*
@@ -683,27 +589,12 @@ static int vr_set(struct task_struct *target, const struct user_regset *regset,
 		} vrsave;
 		memset(&vrsave, 0, sizeof(vrsave));
 
-#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
-		if (MSR_TM_ACTIVE(target->thread.regs->msr))
-			vrsave.word = target->thread.transact_vrsave;
-		else
-			vrsave.word = target->thread.vrsave;
-#else
 		vrsave.word = target->thread.vrsave;
-#endif
+
 		ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, &vrsave,
 					 33 * sizeof(vector128), -1);
-		if (!ret) {
-
-#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
-			if (MSR_TM_ACTIVE(target->thread.regs->msr))
-				target->thread.transact_vrsave = vrsave.word;
-			else
-				target->thread.vrsave = vrsave.word;
-#else
+		if (!ret)
 			target->thread.vrsave = vrsave.word;
-#endif
-		}
 	}
 
 	return ret;
@@ -725,13 +616,10 @@ static int vsr_active(struct task_struct *target,
 }
 
 /*
- * When the transaction is active, 'transact_fp' holds the current running
- * value of all FPR registers and 'fp_state' holds the last checkpointed
- * value of all FPR registers for the current transaction. When transaction
- * is not active 'fp_state' holds the current running state of all the FPR
- * registers. So this function which returns the current running values of
- * all the FPR registers, needs to know whether any transaction is active
- * or not.
+ * Regardless of transactions, 'fp_state' holds the current running
+ * value of all FPR registers and 'ckfp_state' holds the last
+ * checkpointed value of all FPR registers for the current
+ * transaction.
  *
  * Userspace interface buffer layout:
  *
@@ -746,27 +634,14 @@ static int vsr_get(struct task_struct *target, const struct user_regset *regset,
 	u64 buf[32];
 	int ret, i;
 
-#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
+	flush_tmregs_to_thread(target);
 	flush_fp_to_thread(target);
 	flush_altivec_to_thread(target);
-	flush_tmregs_to_thread(target);
-#endif
 	flush_vsx_to_thread(target);
 
-#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
-	if (MSR_TM_ACTIVE(target->thread.regs->msr)) {
-		for (i = 0; i < 32 ; i++)
-			buf[i] = target->thread.
-				transact_fp.fpr[i][TS_VSRLOWOFFSET];
-	} else {
-		for (i = 0; i < 32 ; i++)
-			buf[i] = target->thread.
-				fp_state.fpr[i][TS_VSRLOWOFFSET];
-	}
-#else
 	for (i = 0; i < 32 ; i++)
 		buf[i] = target->thread.fp_state.fpr[i][TS_VSRLOWOFFSET];
-#endif
+
 	ret = user_regset_copyout(&pos, &count, &kbuf, &ubuf,
 				  buf, 0, 32 * sizeof(double));
 
@@ -774,12 +649,10 @@ static int vsr_get(struct task_struct *target, const struct user_regset *regset,
 }
 
 /*
- * When the transaction is active, 'transact_fp' holds the current running
- * value of all FPR registers and 'fp_state' holds the last checkpointed
- * value of all FPR registers for the current transaction. When transaction
- * is not active 'fp_state' holds the current running state of all the FPR
- * registers. So this function which sets the current running values of all
- * the FPR registers, needs to know whether any transaction is active or not.
+ * Regardless of transactions, 'fp_state' holds the current running
+ * value of all FPR registers and 'ckfp_state' holds the last
+ * checkpointed value of all FPR registers for the current
+ * transaction.
  *
  * Userspace interface buffer layout:
  *
@@ -794,31 +667,16 @@ static int vsr_set(struct task_struct *target, const struct user_regset *regset,
 	u64 buf[32];
 	int ret,i;
 
-#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
+	flush_tmregs_to_thread(target);
 	flush_fp_to_thread(target);
 	flush_altivec_to_thread(target);
-	flush_tmregs_to_thread(target);
-#endif
 	flush_vsx_to_thread(target);
 
 	ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf,
 				 buf, 0, 32 * sizeof(double));
-
-#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
-	if (MSR_TM_ACTIVE(target->thread.regs->msr)) {
-		for (i = 0; i < 32 ; i++)
-			target->thread.transact_fp.
-				fpr[i][TS_VSRLOWOFFSET] = buf[i];
-	} else {
+	if (!ret)
 		for (i = 0; i < 32 ; i++)
-			target->thread.fp_state.
-				fpr[i][TS_VSRLOWOFFSET] = buf[i];
-	}
-#else
-	for (i = 0; i < 32 ; i++)
-		target->thread.fp_state.fpr[i][TS_VSRLOWOFFSET] = buf[i];
-#endif
-
+			target->thread.fp_state.fpr[i][TS_VSRLOWOFFSET] = buf[i];
 
 	return ret;
 }
@@ -944,9 +802,9 @@ static int tm_cgpr_get(struct task_struct *target,
 	if (!MSR_TM_ACTIVE(target->thread.regs->msr))
 		return -ENODATA;
 
+	flush_tmregs_to_thread(target);
 	flush_fp_to_thread(target);
 	flush_altivec_to_thread(target);
-	flush_tmregs_to_thread(target);
 
 	ret = user_regset_copyout(&pos, &count, &kbuf, &ubuf,
 				  &target->thread.ckpt_regs,
@@ -1009,9 +867,9 @@ static int tm_cgpr_set(struct task_struct *target,
 	if (!MSR_TM_ACTIVE(target->thread.regs->msr))
 		return -ENODATA;
 
+	flush_tmregs_to_thread(target);
 	flush_fp_to_thread(target);
 	flush_altivec_to_thread(target);
-	flush_tmregs_to_thread(target);
 
 	ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf,
 				 &target->thread.ckpt_regs,
@@ -1087,7 +945,7 @@ static int tm_cfpr_active(struct task_struct *target,
  *
  * This function gets in transaction checkpointed FPR registers.
  *
- * When the transaction is active 'fp_state' holds the checkpointed
+ * When the transaction is active 'ckfp_state' holds the checkpointed
  * values for the current transaction to fall back on if it aborts
  * in between. This function gets those checkpointed FPR registers.
  * The userspace interface buffer layout is as follows.
@@ -1111,14 +969,14 @@ static int tm_cfpr_get(struct task_struct *target,
 	if (!MSR_TM_ACTIVE(target->thread.regs->msr))
 		return -ENODATA;
 
+	flush_tmregs_to_thread(target);
 	flush_fp_to_thread(target);
 	flush_altivec_to_thread(target);
-	flush_tmregs_to_thread(target);
 
 	/* copy to local buffer then write that out */
 	for (i = 0; i < 32 ; i++)
-		buf[i] = target->thread.TS_FPR(i);
-	buf[32] = target->thread.fp_state.fpscr;
+		buf[i] = target->thread.TS_CKFPR(i);
+	buf[32] = target->thread.ckfp_state.fpscr;
 	return user_regset_copyout(&pos, &count, &kbuf, &ubuf, buf, 0, -1);
 }
 
@@ -1133,7 +991,7 @@ static int tm_cfpr_get(struct task_struct *target,
  *
  * This function sets in transaction checkpointed FPR registers.
  *
- * When the transaction is active 'fp_state' holds the checkpointed
+ * When the transaction is active 'ckfp_state' holds the checkpointed
  * FPR register values for the current transaction to fall back on
  * if it aborts in between. This function sets these checkpointed
  * FPR registers. The userspace interface buffer layout is as follows.
@@ -1157,17 +1015,17 @@ static int tm_cfpr_set(struct task_struct *target,
 	if (!MSR_TM_ACTIVE(target->thread.regs->msr))
 		return -ENODATA;
 
+	flush_tmregs_to_thread(target);
 	flush_fp_to_thread(target);
 	flush_altivec_to_thread(target);
-	flush_tmregs_to_thread(target);
 
 	/* copy to local buffer then write that out */
 	i = user_regset_copyin(&pos, &count, &kbuf, &ubuf, buf, 0, -1);
 	if (i)
 		return i;
 	for (i = 0; i < 32 ; i++)
-		target->thread.TS_FPR(i) = buf[i];
-	target->thread.fp_state.fpscr = buf[32];
+		target->thread.TS_CKFPR(i) = buf[i];
+	target->thread.ckfp_state.fpscr = buf[32];
 	return 0;
 }
 
@@ -1202,7 +1060,7 @@ static int tm_cvmx_active(struct task_struct *target,
  *
  * This function gets in transaction checkpointed VMX registers.
  *
- * When the transaction is active 'vr_state' and 'vr_save' hold
+ * When the transaction is active 'ckvr_state' and 'ckvrsave' hold
  * the checkpointed values for the current transaction to fall
  * back on if it aborts in between. The userspace interface buffer
  * layout is as follows.
@@ -1229,12 +1087,12 @@ static int tm_cvmx_get(struct task_struct *target,
 		return -ENODATA;
 
 	/* Flush the state */
+	flush_tmregs_to_thread(target);
 	flush_fp_to_thread(target);
 	flush_altivec_to_thread(target);
-	flush_tmregs_to_thread(target);
 
 	ret = user_regset_copyout(&pos, &count, &kbuf, &ubuf,
-					&target->thread.vr_state, 0,
+					&target->thread.ckvr_state, 0,
 					33 * sizeof(vector128));
 	if (!ret) {
 		/*
@@ -1245,7 +1103,7 @@ static int tm_cvmx_get(struct task_struct *target,
 			u32 word;
 		} vrsave;
 		memset(&vrsave, 0, sizeof(vrsave));
-		vrsave.word = target->thread.vrsave;
+		vrsave.word = target->thread.ckvrsave;
 		ret = user_regset_copyout(&pos, &count, &kbuf, &ubuf, &vrsave,
 						33 * sizeof(vector128), -1);
 	}
@@ -1264,7 +1122,7 @@ static int tm_cvmx_get(struct task_struct *target,
  *
  * This function sets in transaction checkpointed VMX registers.
  *
- * When the transaction is active 'vr_state' and 'vr_save' hold
+ * When the transaction is active 'ckvr_state' and 'ckvrsave' hold
  * the checkpointed values for the current transaction to fall
  * back on if it aborts in between. The userspace interface buffer
  * layout is as follows.
@@ -1290,12 +1148,12 @@ static int tm_cvmx_set(struct task_struct *target,
 	if (!MSR_TM_ACTIVE(target->thread.regs->msr))
 		return -ENODATA;
 
+	flush_tmregs_to_thread(target);
 	flush_fp_to_thread(target);
 	flush_altivec_to_thread(target);
-	flush_tmregs_to_thread(target);
 
 	ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf,
-					&target->thread.vr_state, 0,
+					&target->thread.ckvr_state, 0,
 					33 * sizeof(vector128));
 	if (!ret && count > 0) {
 		/*
@@ -1306,11 +1164,11 @@ static int tm_cvmx_set(struct task_struct *target,
 			u32 word;
 		} vrsave;
 		memset(&vrsave, 0, sizeof(vrsave));
-		vrsave.word = target->thread.vrsave;
+		vrsave.word = target->thread.ckvrsave;
 		ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, &vrsave,
 						33 * sizeof(vector128), -1);
 		if (!ret)
-			target->thread.vrsave = vrsave.word;
+			target->thread.ckvrsave = vrsave.word;
 	}
 
 	return ret;
@@ -1348,7 +1206,7 @@ static int tm_cvsx_active(struct task_struct *target,
  *
  * This function gets in transaction checkpointed VSX registers.
  *
- * When the transaction is active 'fp_state' holds the checkpointed
+ * When the transaction is active 'ckfp_state' holds the checkpointed
  * values for the current transaction to fall back on if it aborts
  * in between. This function gets those checkpointed VSX registers.
  * The userspace interface buffer layout is as follows.
@@ -1372,13 +1230,13 @@ static int tm_cvsx_get(struct task_struct *target,
 		return -ENODATA;
 
 	/* Flush the state */
+	flush_tmregs_to_thread(target);
 	flush_fp_to_thread(target);
 	flush_altivec_to_thread(target);
-	flush_tmregs_to_thread(target);
 	flush_vsx_to_thread(target);
 
 	for (i = 0; i < 32 ; i++)
-		buf[i] = target->thread.fp_state.fpr[i][TS_VSRLOWOFFSET];
+		buf[i] = target->thread.ckfp_state.fpr[i][TS_VSRLOWOFFSET];
 	ret = user_regset_copyout(&pos, &count, &kbuf, &ubuf,
 				  buf, 0, 32 * sizeof(double));
 
@@ -1396,7 +1254,7 @@ static int tm_cvsx_get(struct task_struct *target,
  *
  * This function sets in transaction checkpointed VSX registers.
  *
- * When the transaction is active 'fp_state' holds the checkpointed
+ * When the transaction is active 'ckfp_state' holds the checkpointed
  * VSX register values for the current transaction to fall back on
  * if it aborts in between. This function sets these checkpointed
  * VSX registers. The userspace interface buffer layout is as follows.
@@ -1420,15 +1278,16 @@ static int tm_cvsx_set(struct task_struct *target,
 		return -ENODATA;
 
 	/* Flush the state */
+	flush_tmregs_to_thread(target);
 	flush_fp_to_thread(target);
 	flush_altivec_to_thread(target);
-	flush_tmregs_to_thread(target);
 	flush_vsx_to_thread(target);
 
 	ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf,
 				 buf, 0, 32 * sizeof(double));
-	for (i = 0; i < 32 ; i++)
-		target->thread.fp_state.fpr[i][TS_VSRLOWOFFSET] = buf[i];
+	if (!ret)
+		for (i = 0; i < 32 ; i++)
+			target->thread.ckfp_state.fpr[i][TS_VSRLOWOFFSET] = buf[i];
 
 	return ret;
 }
@@ -1484,9 +1343,9 @@ static int tm_spr_get(struct task_struct *target,
 		return -ENODEV;
 
 	/* Flush the states */
+	flush_tmregs_to_thread(target);
 	flush_fp_to_thread(target);
 	flush_altivec_to_thread(target);
-	flush_tmregs_to_thread(target);
 
 	/* TFHAR register */
 	ret = user_regset_copyout(&pos, &count, &kbuf, &ubuf,
@@ -1540,9 +1399,9 @@ static int tm_spr_set(struct task_struct *target,
 		return -ENODEV;
 
 	/* Flush the states */
+	flush_tmregs_to_thread(target);
 	flush_fp_to_thread(target);
 	flush_altivec_to_thread(target);
-	flush_tmregs_to_thread(target);
 
 	/* TFHAR register */
 	ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf,
@@ -2065,33 +1924,12 @@ static const struct user_regset_view user_ppc_native_view = {
 static int gpr32_get_common(struct task_struct *target,
 		     const struct user_regset *regset,
 		     unsigned int pos, unsigned int count,
-			    void *kbuf, void __user *ubuf, bool tm_active)
+			    void *kbuf, void __user *ubuf,
+			    unsigned long *regs)
 {
-	const unsigned long *regs = &target->thread.regs->gpr[0];
-	const unsigned long *ckpt_regs;
 	compat_ulong_t *k = kbuf;
 	compat_ulong_t __user *u = ubuf;
 	compat_ulong_t reg;
-	int i;
-
-#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
-	ckpt_regs = &target->thread.ckpt_regs.gpr[0];
-#endif
-	if (tm_active) {
-		regs = ckpt_regs;
-	} else {
-		if (target->thread.regs == NULL)
-			return -EIO;
-
-		if (!FULL_REGS(target->thread.regs)) {
-			/*
-			 * We have a partial register set.
-			 * Fill 14-31 with bogus values.
-			 */
-			for (i = 14; i < 32; i++)
-				target->thread.regs->gpr[i] = NV_REG_POISON;
-		}
-	}
 
 	pos /= sizeof(reg);
 	count /= sizeof(reg);
@@ -2133,29 +1971,13 @@ static int gpr32_get_common(struct task_struct *target,
 static int gpr32_set_common(struct task_struct *target,
 		     const struct user_regset *regset,
 		     unsigned int pos, unsigned int count,
-		     const void *kbuf, const void __user *ubuf, bool tm_active)
+		     const void *kbuf, const void __user *ubuf,
+		     unsigned long *regs)
 {
-	unsigned long *regs = &target->thread.regs->gpr[0];
-	unsigned long *ckpt_regs;
 	const compat_ulong_t *k = kbuf;
 	const compat_ulong_t __user *u = ubuf;
 	compat_ulong_t reg;
 
-#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
-	ckpt_regs = &target->thread.ckpt_regs.gpr[0];
-#endif
-
-	if (tm_active) {
-		regs = ckpt_regs;
-	} else {
-		regs = &target->thread.regs->gpr[0];
-
-		if (target->thread.regs == NULL)
-			return -EIO;
-
-		CHECK_FULL_REGS(target->thread.regs);
-	}
-
 	pos /= sizeof(reg);
 	count /= sizeof(reg);
 
@@ -2220,7 +2042,8 @@ static int tm_cgpr32_get(struct task_struct *target,
 		     unsigned int pos, unsigned int count,
 		     void *kbuf, void __user *ubuf)
 {
-	return gpr32_get_common(target, regset, pos, count, kbuf, ubuf, 1);
+	return gpr32_get_common(target, regset, pos, count, kbuf, ubuf,
+			&target->thread.ckpt_regs.gpr[0]);
 }
 
 static int tm_cgpr32_set(struct task_struct *target,
@@ -2228,7 +2051,8 @@ static int tm_cgpr32_set(struct task_struct *target,
 		     unsigned int pos, unsigned int count,
 		     const void *kbuf, const void __user *ubuf)
 {
-	return gpr32_set_common(target, regset, pos, count, kbuf, ubuf, 1);
+	return gpr32_set_common(target, regset, pos, count, kbuf, ubuf,
+			&target->thread.ckpt_regs.gpr[0]);
 }
 #endif /* CONFIG_PPC_TRANSACTIONAL_MEM */
 
@@ -2237,7 +2061,21 @@ static int gpr32_get(struct task_struct *target,
 		     unsigned int pos, unsigned int count,
 		     void *kbuf, void __user *ubuf)
 {
-	return gpr32_get_common(target, regset, pos, count, kbuf, ubuf, 0);
+	int i;
+
+	if (target->thread.regs == NULL)
+		return -EIO;
+
+	if (!FULL_REGS(target->thread.regs)) {
+		/*
+		 * We have a partial register set.
+		 * Fill 14-31 with bogus values.
+		 */
+		for (i = 14; i < 32; i++)
+			target->thread.regs->gpr[i] = NV_REG_POISON;
+	}
+	return gpr32_get_common(target, regset, pos, count, kbuf, ubuf,
+			&target->thread.regs->gpr[0]);
 }
 
 static int gpr32_set(struct task_struct *target,
@@ -2245,7 +2083,12 @@ static int gpr32_set(struct task_struct *target,
 		     unsigned int pos, unsigned int count,
 		     const void *kbuf, const void __user *ubuf)
 {
-	return gpr32_set_common(target, regset, pos, count, kbuf, ubuf, 0);
+	if (target->thread.regs == NULL)
+		return -EIO;
+
+	CHECK_FULL_REGS(target->thread.regs);
+	return gpr32_set_common(target, regset, pos, count, kbuf, ubuf,
+			&target->thread.regs->gpr[0]);
 }
 
 /*

+ 23 - 18
arch/powerpc/kernel/signal.c

@@ -99,22 +99,24 @@ static void check_syscall_restart(struct pt_regs *regs, struct k_sigaction *ka,
 	}
 }
 
-static void do_signal(struct pt_regs *regs)
+static void do_signal(struct task_struct *tsk)
 {
 	sigset_t *oldset = sigmask_to_save();
 	struct ksignal ksig;
 	int ret;
 	int is32 = is_32bit_task();
 
+	BUG_ON(tsk != current);
+
 	get_signal(&ksig);
 
 	/* Is there any syscall restart business here ? */
-	check_syscall_restart(regs, &ksig.ka, ksig.sig > 0);
+	check_syscall_restart(tsk->thread.regs, &ksig.ka, ksig.sig > 0);
 
 	if (ksig.sig <= 0) {
 		/* No signal to deliver -- put the saved sigmask back */
 		restore_saved_sigmask();
-		regs->trap = 0;
+		tsk->thread.regs->trap = 0;
 		return;               /* no signals delivered */
 	}
 
@@ -124,23 +126,22 @@ static void do_signal(struct pt_regs *regs)
 	 * user space. The DABR will have been cleared if it
 	 * triggered inside the kernel.
 	 */
-	if (current->thread.hw_brk.address &&
-		current->thread.hw_brk.type)
-		__set_breakpoint(&current->thread.hw_brk);
+	if (tsk->thread.hw_brk.address && tsk->thread.hw_brk.type)
+		__set_breakpoint(&tsk->thread.hw_brk);
 #endif
 	/* Re-enable the breakpoints for the signal stack */
-	thread_change_pc(current, regs);
+	thread_change_pc(tsk, tsk->thread.regs);
 
 	if (is32) {
         	if (ksig.ka.sa.sa_flags & SA_SIGINFO)
-			ret = handle_rt_signal32(&ksig, oldset, regs);
+			ret = handle_rt_signal32(&ksig, oldset, tsk);
 		else
-			ret = handle_signal32(&ksig, oldset, regs);
+			ret = handle_signal32(&ksig, oldset, tsk);
 	} else {
-		ret = handle_rt_signal64(&ksig, oldset, regs);
+		ret = handle_rt_signal64(&ksig, oldset, tsk);
 	}
 
-	regs->trap = 0;
+	tsk->thread.regs->trap = 0;
 	signal_setup_done(ret, &ksig, test_thread_flag(TIF_SINGLESTEP));
 }
 
@@ -151,8 +152,10 @@ void do_notify_resume(struct pt_regs *regs, unsigned long thread_info_flags)
 	if (thread_info_flags & _TIF_UPROBE)
 		uprobe_notify_resume(regs);
 
-	if (thread_info_flags & _TIF_SIGPENDING)
-		do_signal(regs);
+	if (thread_info_flags & _TIF_SIGPENDING) {
+		BUG_ON(regs != current->thread.regs);
+		do_signal(current);
+	}
 
 	if (thread_info_flags & _TIF_NOTIFY_RESUME) {
 		clear_thread_flag(TIF_NOTIFY_RESUME);
@@ -162,7 +165,7 @@ void do_notify_resume(struct pt_regs *regs, unsigned long thread_info_flags)
 	user_enter();
 }
 
-unsigned long get_tm_stackpointer(struct pt_regs *regs)
+unsigned long get_tm_stackpointer(struct task_struct *tsk)
 {
 	/* When in an active transaction that takes a signal, we need to be
 	 * careful with the stack.  It's possible that the stack has moved back
@@ -187,11 +190,13 @@ unsigned long get_tm_stackpointer(struct pt_regs *regs)
 	 */
 
 #ifdef CONFIG_PPC_TRANSACTIONAL_MEM
-	if (MSR_TM_ACTIVE(regs->msr)) {
+	BUG_ON(tsk != current);
+
+	if (MSR_TM_ACTIVE(tsk->thread.regs->msr)) {
 		tm_reclaim_current(TM_CAUSE_SIGNAL);
-		if (MSR_TM_TRANSACTIONAL(regs->msr))
-			return current->thread.ckpt_regs.gpr[1];
+		if (MSR_TM_TRANSACTIONAL(tsk->thread.regs->msr))
+			return tsk->thread.ckpt_regs.gpr[1];
 	}
 #endif
-	return regs->gpr[1];
+	return tsk->thread.regs->gpr[1];
 }

+ 10 - 8
arch/powerpc/kernel/signal.h

@@ -16,39 +16,41 @@ extern void __user *get_sigframe(struct ksignal *ksig, unsigned long sp,
 				  size_t frame_size, int is_32);
 
 extern int handle_signal32(struct ksignal *ksig, sigset_t *oldset,
-			   struct pt_regs *regs);
+			   struct task_struct *tsk);
 
 extern int handle_rt_signal32(struct ksignal *ksig, sigset_t *oldset,
-			      struct pt_regs *regs);
+			      struct task_struct *tsk);
 
 extern unsigned long copy_fpr_to_user(void __user *to,
 				      struct task_struct *task);
-extern unsigned long copy_transact_fpr_to_user(void __user *to,
+extern unsigned long copy_ckfpr_to_user(void __user *to,
 					       struct task_struct *task);
 extern unsigned long copy_fpr_from_user(struct task_struct *task,
 					void __user *from);
-extern unsigned long copy_transact_fpr_from_user(struct task_struct *task,
+extern unsigned long copy_ckfpr_from_user(struct task_struct *task,
 						 void __user *from);
+extern unsigned long get_tm_stackpointer(struct task_struct *tsk);
+
 #ifdef CONFIG_VSX
 extern unsigned long copy_vsx_to_user(void __user *to,
 				      struct task_struct *task);
-extern unsigned long copy_transact_vsx_to_user(void __user *to,
+extern unsigned long copy_ckvsx_to_user(void __user *to,
 					       struct task_struct *task);
 extern unsigned long copy_vsx_from_user(struct task_struct *task,
 					void __user *from);
-extern unsigned long copy_transact_vsx_from_user(struct task_struct *task,
+extern unsigned long copy_ckvsx_from_user(struct task_struct *task,
 						 void __user *from);
 #endif
 
 #ifdef CONFIG_PPC64
 
 extern int handle_rt_signal64(struct ksignal *ksig, sigset_t *set,
-			      struct pt_regs *regs);
+			      struct task_struct *tsk);
 
 #else /* CONFIG_PPC64 */
 
 static inline int handle_rt_signal64(struct ksignal *ksig, sigset_t *set,
-				     struct pt_regs *regs)
+				     struct task_struct *tsk)
 {
 	return -EFAULT;
 }

+ 66 - 56
arch/powerpc/kernel/signal_32.c

@@ -44,6 +44,7 @@
 #include <asm/vdso.h>
 #include <asm/switch_to.h>
 #include <asm/tm.h>
+#include <asm/asm-prototypes.h>
 #ifdef CONFIG_PPC64
 #include "ppc32.h"
 #include <asm/unistd.h>
@@ -315,7 +316,7 @@ unsigned long copy_vsx_from_user(struct task_struct *task,
 }
 
 #ifdef CONFIG_PPC_TRANSACTIONAL_MEM
-unsigned long copy_transact_fpr_to_user(void __user *to,
+unsigned long copy_ckfpr_to_user(void __user *to,
 				  struct task_struct *task)
 {
 	u64 buf[ELF_NFPREG];
@@ -323,12 +324,12 @@ unsigned long copy_transact_fpr_to_user(void __user *to,
 
 	/* copy the FPRs to a local buffer, then write that out to userspace */
 	for (i = 0; i < (ELF_NFPREG - 1) ; i++)
-		buf[i] = task->thread.TS_TRANS_FPR(i);
-	buf[i] = task->thread.transact_fp.fpscr;
+		buf[i] = task->thread.TS_CKFPR(i);
+	buf[i] = task->thread.ckfp_state.fpscr;
 	return __copy_to_user(to, buf, ELF_NFPREG * sizeof(double));
 }
 
-unsigned long copy_transact_fpr_from_user(struct task_struct *task,
+unsigned long copy_ckfpr_from_user(struct task_struct *task,
 					  void __user *from)
 {
 	u64 buf[ELF_NFPREG];
@@ -337,13 +338,13 @@ unsigned long copy_transact_fpr_from_user(struct task_struct *task,
 	if (__copy_from_user(buf, from, ELF_NFPREG * sizeof(double)))
 		return 1;
 	for (i = 0; i < (ELF_NFPREG - 1) ; i++)
-		task->thread.TS_TRANS_FPR(i) = buf[i];
-	task->thread.transact_fp.fpscr = buf[i];
+		task->thread.TS_CKFPR(i) = buf[i];
+	task->thread.ckfp_state.fpscr = buf[i];
 
 	return 0;
 }
 
-unsigned long copy_transact_vsx_to_user(void __user *to,
+unsigned long copy_ckvsx_to_user(void __user *to,
 				  struct task_struct *task)
 {
 	u64 buf[ELF_NVSRHALFREG];
@@ -351,11 +352,11 @@ unsigned long copy_transact_vsx_to_user(void __user *to,
 
 	/* copy VSX state to a local buffer, then write that out to userspace */
 	for (i = 0; i < ELF_NVSRHALFREG; i++)
-		buf[i] = task->thread.transact_fp.fpr[i][TS_VSRLOWOFFSET];
+		buf[i] = task->thread.ckfp_state.fpr[i][TS_VSRLOWOFFSET];
 	return __copy_to_user(to, buf, ELF_NVSRHALFREG * sizeof(double));
 }
 
-unsigned long copy_transact_vsx_from_user(struct task_struct *task,
+unsigned long copy_ckvsx_from_user(struct task_struct *task,
 					  void __user *from)
 {
 	u64 buf[ELF_NVSRHALFREG];
@@ -364,7 +365,7 @@ unsigned long copy_transact_vsx_from_user(struct task_struct *task,
 	if (__copy_from_user(buf, from, ELF_NVSRHALFREG * sizeof(double)))
 		return 1;
 	for (i = 0; i < ELF_NVSRHALFREG ; i++)
-		task->thread.transact_fp.fpr[i][TS_VSRLOWOFFSET] = buf[i];
+		task->thread.ckfp_state.fpr[i][TS_VSRLOWOFFSET] = buf[i];
 	return 0;
 }
 #endif /* CONFIG_PPC_TRANSACTIONAL_MEM */
@@ -384,17 +385,17 @@ inline unsigned long copy_fpr_from_user(struct task_struct *task,
 }
 
 #ifdef CONFIG_PPC_TRANSACTIONAL_MEM
-inline unsigned long copy_transact_fpr_to_user(void __user *to,
+inline unsigned long copy_ckfpr_to_user(void __user *to,
 					 struct task_struct *task)
 {
-	return __copy_to_user(to, task->thread.transact_fp.fpr,
+	return __copy_to_user(to, task->thread.ckfp_state.fpr,
 			      ELF_NFPREG * sizeof(double));
 }
 
-inline unsigned long copy_transact_fpr_from_user(struct task_struct *task,
+inline unsigned long copy_ckfpr_from_user(struct task_struct *task,
 						 void __user *from)
 {
-	return __copy_from_user(task->thread.transact_fp.fpr, from,
+	return __copy_from_user(task->thread.ckfp_state.fpr, from,
 				ELF_NFPREG * sizeof(double));
 }
 #endif /* CONFIG_PPC_TRANSACTIONAL_MEM */
@@ -525,9 +526,6 @@ static int save_tm_user_regs(struct pt_regs *regs,
 	 */
 	regs->msr &= ~MSR_TS_MASK;
 
-	/* Make sure floating point registers are stored in regs */
-	flush_fp_to_thread(current);
-
 	/* Save both sets of general registers */
 	if (save_general_regs(&current->thread.ckpt_regs, frame)
 	    || save_general_regs(regs, tm_frame))
@@ -545,18 +543,17 @@ static int save_tm_user_regs(struct pt_regs *regs,
 #ifdef CONFIG_ALTIVEC
 	/* save altivec registers */
 	if (current->thread.used_vr) {
-		flush_altivec_to_thread(current);
-		if (__copy_to_user(&frame->mc_vregs, &current->thread.vr_state,
+		if (__copy_to_user(&frame->mc_vregs, &current->thread.ckvr_state,
 				   ELF_NVRREG * sizeof(vector128)))
 			return 1;
 		if (msr & MSR_VEC) {
 			if (__copy_to_user(&tm_frame->mc_vregs,
-					   &current->thread.transact_vr,
+					   &current->thread.vr_state,
 					   ELF_NVRREG * sizeof(vector128)))
 				return 1;
 		} else {
 			if (__copy_to_user(&tm_frame->mc_vregs,
-					   &current->thread.vr_state,
+					   &current->thread.ckvr_state,
 					   ELF_NVRREG * sizeof(vector128)))
 				return 1;
 		}
@@ -573,28 +570,28 @@ static int save_tm_user_regs(struct pt_regs *regs,
 	 * most significant bits of that same vector. --BenH
 	 */
 	if (cpu_has_feature(CPU_FTR_ALTIVEC))
-		current->thread.vrsave = mfspr(SPRN_VRSAVE);
-	if (__put_user(current->thread.vrsave,
+		current->thread.ckvrsave = mfspr(SPRN_VRSAVE);
+	if (__put_user(current->thread.ckvrsave,
 		       (u32 __user *)&frame->mc_vregs[32]))
 		return 1;
 	if (msr & MSR_VEC) {
-		if (__put_user(current->thread.transact_vrsave,
+		if (__put_user(current->thread.vrsave,
 			       (u32 __user *)&tm_frame->mc_vregs[32]))
 			return 1;
 	} else {
-		if (__put_user(current->thread.vrsave,
+		if (__put_user(current->thread.ckvrsave,
 			       (u32 __user *)&tm_frame->mc_vregs[32]))
 			return 1;
 	}
 #endif /* CONFIG_ALTIVEC */
 
-	if (copy_fpr_to_user(&frame->mc_fregs, current))
+	if (copy_ckfpr_to_user(&frame->mc_fregs, current))
 		return 1;
 	if (msr & MSR_FP) {
-		if (copy_transact_fpr_to_user(&tm_frame->mc_fregs, current))
+		if (copy_fpr_to_user(&tm_frame->mc_fregs, current))
 			return 1;
 	} else {
-		if (copy_fpr_to_user(&tm_frame->mc_fregs, current))
+		if (copy_ckfpr_to_user(&tm_frame->mc_fregs, current))
 			return 1;
 	}
 
@@ -606,15 +603,14 @@ static int save_tm_user_regs(struct pt_regs *regs,
 	 * contains valid data
 	 */
 	if (current->thread.used_vsr) {
-		flush_vsx_to_thread(current);
-		if (copy_vsx_to_user(&frame->mc_vsregs, current))
+		if (copy_ckvsx_to_user(&frame->mc_vsregs, current))
 			return 1;
 		if (msr & MSR_VSX) {
-			if (copy_transact_vsx_to_user(&tm_frame->mc_vsregs,
+			if (copy_vsx_to_user(&tm_frame->mc_vsregs,
 						      current))
 				return 1;
 		} else {
-			if (copy_vsx_to_user(&tm_frame->mc_vsregs, current))
+			if (copy_ckvsx_to_user(&tm_frame->mc_vsregs, current))
 				return 1;
 		}
 
@@ -698,6 +694,7 @@ static long restore_user_regs(struct pt_regs *regs,
 		if (__copy_from_user(&current->thread.vr_state, &sr->mc_vregs,
 				     sizeof(sr->mc_vregs)))
 			return 1;
+		current->thread.used_vr = true;
 	} else if (current->thread.used_vr)
 		memset(&current->thread.vr_state, 0,
 		       ELF_NVRREG * sizeof(vector128));
@@ -724,6 +721,7 @@ static long restore_user_regs(struct pt_regs *regs,
 		 */
 		if (copy_vsx_from_user(current, &sr->mc_vsregs))
 			return 1;
+		current->thread.used_vsr = true;
 	} else if (current->thread.used_vsr)
 		for (i = 0; i < 32 ; i++)
 			current->thread.fp_state.fpr[i][TS_VSRLOWOFFSET] = 0;
@@ -743,6 +741,7 @@ static long restore_user_regs(struct pt_regs *regs,
 		if (__copy_from_user(current->thread.evr, &sr->mc_vregs,
 				     ELF_NEVRREG * sizeof(u32)))
 			return 1;
+		current->thread.used_spe = true;
 	} else if (current->thread.used_spe)
 		memset(current->thread.evr, 0, ELF_NEVRREG * sizeof(u32));
 
@@ -793,33 +792,34 @@ static long restore_tm_user_regs(struct pt_regs *regs,
 	regs->msr &= ~MSR_VEC;
 	if (msr & MSR_VEC) {
 		/* restore altivec registers from the stack */
-		if (__copy_from_user(&current->thread.vr_state, &sr->mc_vregs,
+		if (__copy_from_user(&current->thread.ckvr_state, &sr->mc_vregs,
 				     sizeof(sr->mc_vregs)) ||
-		    __copy_from_user(&current->thread.transact_vr,
+		    __copy_from_user(&current->thread.vr_state,
 				     &tm_sr->mc_vregs,
 				     sizeof(sr->mc_vregs)))
 			return 1;
+		current->thread.used_vr = true;
 	} else if (current->thread.used_vr) {
 		memset(&current->thread.vr_state, 0,
 		       ELF_NVRREG * sizeof(vector128));
-		memset(&current->thread.transact_vr, 0,
+		memset(&current->thread.ckvr_state, 0,
 		       ELF_NVRREG * sizeof(vector128));
 	}
 
 	/* Always get VRSAVE back */
-	if (__get_user(current->thread.vrsave,
+	if (__get_user(current->thread.ckvrsave,
 		       (u32 __user *)&sr->mc_vregs[32]) ||
-	    __get_user(current->thread.transact_vrsave,
+	    __get_user(current->thread.vrsave,
 		       (u32 __user *)&tm_sr->mc_vregs[32]))
 		return 1;
 	if (cpu_has_feature(CPU_FTR_ALTIVEC))
-		mtspr(SPRN_VRSAVE, current->thread.vrsave);
+		mtspr(SPRN_VRSAVE, current->thread.ckvrsave);
 #endif /* CONFIG_ALTIVEC */
 
 	regs->msr &= ~(MSR_FP | MSR_FE0 | MSR_FE1);
 
 	if (copy_fpr_from_user(current, &sr->mc_fregs) ||
-	    copy_transact_fpr_from_user(current, &tm_sr->mc_fregs))
+	    copy_ckfpr_from_user(current, &tm_sr->mc_fregs))
 		return 1;
 
 #ifdef CONFIG_VSX
@@ -829,13 +829,14 @@ static long restore_tm_user_regs(struct pt_regs *regs,
 		 * Restore altivec registers from the stack to a local
 		 * buffer, then write this out to the thread_struct
 		 */
-		if (copy_vsx_from_user(current, &sr->mc_vsregs) ||
-		    copy_transact_vsx_from_user(current, &tm_sr->mc_vsregs))
+		if (copy_vsx_from_user(current, &tm_sr->mc_vsregs) ||
+		    copy_ckvsx_from_user(current, &sr->mc_vsregs))
 			return 1;
+		current->thread.used_vsr = true;
 	} else if (current->thread.used_vsr)
 		for (i = 0; i < 32 ; i++) {
 			current->thread.fp_state.fpr[i][TS_VSRLOWOFFSET] = 0;
-			current->thread.transact_fp.fpr[i][TS_VSRLOWOFFSET] = 0;
+			current->thread.ckfp_state.fpr[i][TS_VSRLOWOFFSET] = 0;
 		}
 #endif /* CONFIG_VSX */
 
@@ -848,6 +849,7 @@ static long restore_tm_user_regs(struct pt_regs *regs,
 		if (__copy_from_user(current->thread.evr, &sr->mc_vregs,
 				     ELF_NEVRREG * sizeof(u32)))
 			return 1;
+		current->thread.used_spe = true;
 	} else if (current->thread.used_spe)
 		memset(current->thread.evr, 0, ELF_NEVRREG * sizeof(u32));
 
@@ -877,13 +879,14 @@ static long restore_tm_user_regs(struct pt_regs *regs,
 	tm_recheckpoint(&current->thread, msr);
 
 	/* This loads the speculative FP/VEC state, if used */
+	msr_check_and_set(msr & (MSR_FP | MSR_VEC));
 	if (msr & MSR_FP) {
-		do_load_up_transact_fpu(&current->thread);
+		load_fp_state(&current->thread.fp_state);
 		regs->msr |= (MSR_FP | current->thread.fpexc_mode);
 	}
 #ifdef CONFIG_ALTIVEC
 	if (msr & MSR_VEC) {
-		do_load_up_transact_altivec(&current->thread);
+		load_vr_state(&current->thread.vr_state);
 		regs->msr |= MSR_VEC;
 	}
 #endif
@@ -971,7 +974,7 @@ int copy_siginfo_from_user32(siginfo_t *to, struct compat_siginfo __user *from)
  * (one which gets siginfo).
  */
 int handle_rt_signal32(struct ksignal *ksig, sigset_t *oldset,
-		       struct pt_regs *regs)
+		       struct task_struct *tsk)
 {
 	struct rt_sigframe __user *rt_sf;
 	struct mcontext __user *frame;
@@ -980,10 +983,13 @@ int handle_rt_signal32(struct ksignal *ksig, sigset_t *oldset,
 	unsigned long newsp = 0;
 	int sigret;
 	unsigned long tramp;
+	struct pt_regs *regs = tsk->thread.regs;
+
+	BUG_ON(tsk != current);
 
 	/* Set up Signal Frame */
 	/* Put a Real Time Context onto stack */
-	rt_sf = get_sigframe(ksig, get_tm_stackpointer(regs), sizeof(*rt_sf), 1);
+	rt_sf = get_sigframe(ksig, get_tm_stackpointer(tsk), sizeof(*rt_sf), 1);
 	addr = rt_sf;
 	if (unlikely(rt_sf == NULL))
 		goto badframe;
@@ -1000,9 +1006,9 @@ int handle_rt_signal32(struct ksignal *ksig, sigset_t *oldset,
 	/* Save user registers on the stack */
 	frame = &rt_sf->uc.uc_mcontext;
 	addr = frame;
-	if (vdso32_rt_sigtramp && current->mm->context.vdso_base) {
+	if (vdso32_rt_sigtramp && tsk->mm->context.vdso_base) {
 		sigret = 0;
-		tramp = current->mm->context.vdso_base + vdso32_rt_sigtramp;
+		tramp = tsk->mm->context.vdso_base + vdso32_rt_sigtramp;
 	} else {
 		sigret = __NR_rt_sigreturn;
 		tramp = (unsigned long) frame->tramp;
@@ -1029,7 +1035,7 @@ int handle_rt_signal32(struct ksignal *ksig, sigset_t *oldset,
 	}
 	regs->link = tramp;
 
-	current->thread.fp_state.fpscr = 0;	/* turn off all fp exceptions */
+	tsk->thread.fp_state.fpscr = 0;	/* turn off all fp exceptions */
 
 	/* create a stack frame for the caller of the handler */
 	newsp = ((unsigned long)rt_sf) - (__SIGNAL_FRAMESIZE + 16);
@@ -1054,7 +1060,7 @@ badframe:
 		printk_ratelimited(KERN_INFO
 				   "%s[%d]: bad frame in handle_rt_signal32: "
 				   "%p nip %08lx lr %08lx\n",
-				   current->comm, current->pid,
+				   tsk->comm, tsk->pid,
 				   addr, regs->nip, regs->link);
 
 	return 1;
@@ -1410,7 +1416,8 @@ int sys_debug_setcontext(struct ucontext __user *ctx,
 /*
  * OK, we're invoking a handler
  */
-int handle_signal32(struct ksignal *ksig, sigset_t *oldset, struct pt_regs *regs)
+int handle_signal32(struct ksignal *ksig, sigset_t *oldset,
+		struct task_struct *tsk)
 {
 	struct sigcontext __user *sc;
 	struct sigframe __user *frame;
@@ -1418,9 +1425,12 @@ int handle_signal32(struct ksignal *ksig, sigset_t *oldset, struct pt_regs *regs
 	unsigned long newsp = 0;
 	int sigret;
 	unsigned long tramp;
+	struct pt_regs *regs = tsk->thread.regs;
+
+	BUG_ON(tsk != current);
 
 	/* Set up Signal Frame */
-	frame = get_sigframe(ksig, get_tm_stackpointer(regs), sizeof(*frame), 1);
+	frame = get_sigframe(ksig, get_tm_stackpointer(tsk), sizeof(*frame), 1);
 	if (unlikely(frame == NULL))
 		goto badframe;
 	sc = (struct sigcontext __user *) &frame->sctx;
@@ -1439,9 +1449,9 @@ int handle_signal32(struct ksignal *ksig, sigset_t *oldset, struct pt_regs *regs
 	    || __put_user(ksig->sig, &sc->signal))
 		goto badframe;
 
-	if (vdso32_sigtramp && current->mm->context.vdso_base) {
+	if (vdso32_sigtramp && tsk->mm->context.vdso_base) {
 		sigret = 0;
-		tramp = current->mm->context.vdso_base + vdso32_sigtramp;
+		tramp = tsk->mm->context.vdso_base + vdso32_sigtramp;
 	} else {
 		sigret = __NR_sigreturn;
 		tramp = (unsigned long) frame->mctx.tramp;
@@ -1463,7 +1473,7 @@ int handle_signal32(struct ksignal *ksig, sigset_t *oldset, struct pt_regs *regs
 
 	regs->link = tramp;
 
-	current->thread.fp_state.fpscr = 0;	/* turn off all fp exceptions */
+	tsk->thread.fp_state.fpscr = 0;	/* turn off all fp exceptions */
 
 	/* create a stack frame for the caller of the handler */
 	newsp = ((unsigned long)frame) - __SIGNAL_FRAMESIZE;
@@ -1483,7 +1493,7 @@ badframe:
 		printk_ratelimited(KERN_INFO
 				   "%s[%d]: bad frame in handle_signal32: "
 				   "%p nip %08lx lr %08lx\n",
-				   current->comm, current->pid,
+				   tsk->comm, tsk->pid,
 				   frame, regs->nip, regs->link);
 
 	return 1;

+ 114 - 93
arch/powerpc/kernel/signal_64.c

@@ -35,6 +35,7 @@
 #include <asm/vdso.h>
 #include <asm/switch_to.h>
 #include <asm/tm.h>
+#include <asm/asm-prototypes.h>
 
 #include "signal.h"
 
@@ -90,9 +91,9 @@ static elf_vrreg_t __user *sigcontext_vmx_regs(struct sigcontext __user *sc)
  * Set up the sigcontext for the signal frame.
  */
 
-static long setup_sigcontext(struct sigcontext __user *sc, struct pt_regs *regs,
-		 int signr, sigset_t *set, unsigned long handler,
-		 int ctx_has_vsx_region)
+static long setup_sigcontext(struct sigcontext __user *sc,
+		struct task_struct *tsk, int signr, sigset_t *set,
+		unsigned long handler, int ctx_has_vsx_region)
 {
 	/* When CONFIG_ALTIVEC is set, we _always_ setup v_regs even if the
 	 * process never used altivec yet (MSR_VEC is zero in pt_regs of
@@ -106,17 +107,20 @@ static long setup_sigcontext(struct sigcontext __user *sc, struct pt_regs *regs,
 	elf_vrreg_t __user *v_regs = sigcontext_vmx_regs(sc);
 	unsigned long vrsave;
 #endif
+	struct pt_regs *regs = tsk->thread.regs;
 	unsigned long msr = regs->msr;
 	long err = 0;
 
+	BUG_ON(tsk != current);
+
 #ifdef CONFIG_ALTIVEC
 	err |= __put_user(v_regs, &sc->v_regs);
 
 	/* save altivec registers */
-	if (current->thread.used_vr) {
-		flush_altivec_to_thread(current);
+	if (tsk->thread.used_vr) {
+		flush_altivec_to_thread(tsk);
 		/* Copy 33 vec registers (vr0..31 and vscr) to the stack */
-		err |= __copy_to_user(v_regs, &current->thread.vr_state,
+		err |= __copy_to_user(v_regs, &tsk->thread.vr_state,
 				      33 * sizeof(vector128));
 		/* set MSR_VEC in the MSR value in the frame to indicate that sc->v_reg)
 		 * contains valid data.
@@ -129,16 +133,16 @@ static long setup_sigcontext(struct sigcontext __user *sc, struct pt_regs *regs,
 	vrsave = 0;
 	if (cpu_has_feature(CPU_FTR_ALTIVEC)) {
 		vrsave = mfspr(SPRN_VRSAVE);
-		current->thread.vrsave = vrsave;
+		tsk->thread.vrsave = vrsave;
 	}
 
 	err |= __put_user(vrsave, (u32 __user *)&v_regs[33]);
 #else /* CONFIG_ALTIVEC */
 	err |= __put_user(0, &sc->v_regs);
 #endif /* CONFIG_ALTIVEC */
-	flush_fp_to_thread(current);
+	flush_fp_to_thread(tsk);
 	/* copy fpr regs and fpscr */
-	err |= copy_fpr_to_user(&sc->fp_regs, current);
+	err |= copy_fpr_to_user(&sc->fp_regs, tsk);
 
 	/*
 	 * Clear the MSR VSX bit to indicate there is no valid state attached
@@ -151,10 +155,10 @@ static long setup_sigcontext(struct sigcontext __user *sc, struct pt_regs *regs,
 	 * then out to userspace.  Update v_regs to point after the
 	 * VMX data.
 	 */
-	if (current->thread.used_vsr && ctx_has_vsx_region) {
-		flush_vsx_to_thread(current);
+	if (tsk->thread.used_vsr && ctx_has_vsx_region) {
+		flush_vsx_to_thread(tsk);
 		v_regs += ELF_NVRREG;
-		err |= copy_vsx_to_user(v_regs, current);
+		err |= copy_vsx_to_user(v_regs, tsk);
 		/* set MSR_VSX in the MSR value in the frame to
 		 * indicate that sc->vs_reg) contains valid data.
 		 */
@@ -187,7 +191,7 @@ static long setup_sigcontext(struct sigcontext __user *sc, struct pt_regs *regs,
  */
 static long setup_tm_sigcontexts(struct sigcontext __user *sc,
 				 struct sigcontext __user *tm_sc,
-				 struct pt_regs *regs,
+				 struct task_struct *tsk,
 				 int signr, sigset_t *set, unsigned long handler)
 {
 	/* When CONFIG_ALTIVEC is set, we _always_ setup v_regs even if the
@@ -202,9 +206,12 @@ static long setup_tm_sigcontexts(struct sigcontext __user *sc,
 	elf_vrreg_t __user *v_regs = sigcontext_vmx_regs(sc);
 	elf_vrreg_t __user *tm_v_regs = sigcontext_vmx_regs(tm_sc);
 #endif
-	unsigned long msr = regs->msr;
+	struct pt_regs *regs = tsk->thread.regs;
+	unsigned long msr = tsk->thread.ckpt_regs.msr;
 	long err = 0;
 
+	BUG_ON(tsk != current);
+
 	BUG_ON(!MSR_TM_ACTIVE(regs->msr));
 
 	/* Remove TM bits from thread's MSR.  The MSR in the sigcontext
@@ -214,28 +221,25 @@ static long setup_tm_sigcontexts(struct sigcontext __user *sc,
 	 */
 	regs->msr &= ~MSR_TS_MASK;
 
-	flush_fp_to_thread(current);
-
 #ifdef CONFIG_ALTIVEC
 	err |= __put_user(v_regs, &sc->v_regs);
 	err |= __put_user(tm_v_regs, &tm_sc->v_regs);
 
 	/* save altivec registers */
-	if (current->thread.used_vr) {
-		flush_altivec_to_thread(current);
+	if (tsk->thread.used_vr) {
 		/* Copy 33 vec registers (vr0..31 and vscr) to the stack */
-		err |= __copy_to_user(v_regs, &current->thread.vr_state,
+		err |= __copy_to_user(v_regs, &tsk->thread.ckvr_state,
 				      33 * sizeof(vector128));
 		/* If VEC was enabled there are transactional VRs valid too,
 		 * else they're a copy of the checkpointed VRs.
 		 */
 		if (msr & MSR_VEC)
 			err |= __copy_to_user(tm_v_regs,
-					      &current->thread.transact_vr,
+					      &tsk->thread.vr_state,
 					      33 * sizeof(vector128));
 		else
 			err |= __copy_to_user(tm_v_regs,
-					      &current->thread.vr_state,
+					      &tsk->thread.ckvr_state,
 					      33 * sizeof(vector128));
 
 		/* set MSR_VEC in the MSR value in the frame to indicate
@@ -247,13 +251,13 @@ static long setup_tm_sigcontexts(struct sigcontext __user *sc,
 	 * use altivec.
 	 */
 	if (cpu_has_feature(CPU_FTR_ALTIVEC))
-		current->thread.vrsave = mfspr(SPRN_VRSAVE);
-	err |= __put_user(current->thread.vrsave, (u32 __user *)&v_regs[33]);
+		tsk->thread.ckvrsave = mfspr(SPRN_VRSAVE);
+	err |= __put_user(tsk->thread.ckvrsave, (u32 __user *)&v_regs[33]);
 	if (msr & MSR_VEC)
-		err |= __put_user(current->thread.transact_vrsave,
+		err |= __put_user(tsk->thread.vrsave,
 				  (u32 __user *)&tm_v_regs[33]);
 	else
-		err |= __put_user(current->thread.vrsave,
+		err |= __put_user(tsk->thread.ckvrsave,
 				  (u32 __user *)&tm_v_regs[33]);
 
 #else /* CONFIG_ALTIVEC */
@@ -262,11 +266,11 @@ static long setup_tm_sigcontexts(struct sigcontext __user *sc,
 #endif /* CONFIG_ALTIVEC */
 
 	/* copy fpr regs and fpscr */
-	err |= copy_fpr_to_user(&sc->fp_regs, current);
+	err |= copy_ckfpr_to_user(&sc->fp_regs, tsk);
 	if (msr & MSR_FP)
-		err |= copy_transact_fpr_to_user(&tm_sc->fp_regs, current);
+		err |= copy_fpr_to_user(&tm_sc->fp_regs, tsk);
 	else
-		err |= copy_fpr_to_user(&tm_sc->fp_regs, current);
+		err |= copy_ckfpr_to_user(&tm_sc->fp_regs, tsk);
 
 #ifdef CONFIG_VSX
 	/*
@@ -274,17 +278,16 @@ static long setup_tm_sigcontexts(struct sigcontext __user *sc,
 	 * then out to userspace.  Update v_regs to point after the
 	 * VMX data.
 	 */
-	if (current->thread.used_vsr) {
-		flush_vsx_to_thread(current);
+	if (tsk->thread.used_vsr) {
 		v_regs += ELF_NVRREG;
 		tm_v_regs += ELF_NVRREG;
 
-		err |= copy_vsx_to_user(v_regs, current);
+		err |= copy_ckvsx_to_user(v_regs, tsk);
 
 		if (msr & MSR_VSX)
-			err |= copy_transact_vsx_to_user(tm_v_regs, current);
+			err |= copy_vsx_to_user(tm_v_regs, tsk);
 		else
-			err |= copy_vsx_to_user(tm_v_regs, current);
+			err |= copy_ckvsx_to_user(tm_v_regs, tsk);
 
 		/* set MSR_VSX in the MSR value in the frame to
 		 * indicate that sc->vs_reg) contains valid data.
@@ -298,7 +301,7 @@ static long setup_tm_sigcontexts(struct sigcontext __user *sc,
 	WARN_ON(!FULL_REGS(regs));
 	err |= __copy_to_user(&tm_sc->gp_regs, regs, GP_REGS_SIZE);
 	err |= __copy_to_user(&sc->gp_regs,
-			      &current->thread.ckpt_regs, GP_REGS_SIZE);
+			      &tsk->thread.ckpt_regs, GP_REGS_SIZE);
 	err |= __put_user(msr, &tm_sc->gp_regs[PT_MSR]);
 	err |= __put_user(msr, &sc->gp_regs[PT_MSR]);
 	err |= __put_user(signr, &sc->signal);
@@ -314,7 +317,7 @@ static long setup_tm_sigcontexts(struct sigcontext __user *sc,
  * Restore the sigcontext from the signal frame.
  */
 
-static long restore_sigcontext(struct pt_regs *regs, sigset_t *set, int sig,
+static long restore_sigcontext(struct task_struct *tsk, sigset_t *set, int sig,
 			      struct sigcontext __user *sc)
 {
 #ifdef CONFIG_ALTIVEC
@@ -323,10 +326,13 @@ static long restore_sigcontext(struct pt_regs *regs, sigset_t *set, int sig,
 	unsigned long err = 0;
 	unsigned long save_r13 = 0;
 	unsigned long msr;
+	struct pt_regs *regs = tsk->thread.regs;
 #ifdef CONFIG_VSX
 	int i;
 #endif
 
+	BUG_ON(tsk != current);
+
 	/* If this is not a signal return, we preserve the TLS in r13 */
 	if (!sig)
 		save_r13 = regs->gpr[13];
@@ -356,7 +362,7 @@ static long restore_sigcontext(struct pt_regs *regs, sigset_t *set, int sig,
 
 	/*
 	 * Force reload of FP/VEC.
-	 * This has to be done before copying stuff into current->thread.fpr/vr
+	 * This has to be done before copying stuff into tsk->thread.fpr/vr
 	 * for the reasons explained in the previous comment.
 	 */
 	regs->msr &= ~(MSR_FP | MSR_FE0 | MSR_FE1 | MSR_VEC | MSR_VSX);
@@ -368,21 +374,23 @@ static long restore_sigcontext(struct pt_regs *regs, sigset_t *set, int sig,
 	if (v_regs && !access_ok(VERIFY_READ, v_regs, 34 * sizeof(vector128)))
 		return -EFAULT;
 	/* Copy 33 vec registers (vr0..31 and vscr) from the stack */
-	if (v_regs != NULL && (msr & MSR_VEC) != 0)
-		err |= __copy_from_user(&current->thread.vr_state, v_regs,
+	if (v_regs != NULL && (msr & MSR_VEC) != 0) {
+		err |= __copy_from_user(&tsk->thread.vr_state, v_regs,
 					33 * sizeof(vector128));
-	else if (current->thread.used_vr)
-		memset(&current->thread.vr_state, 0, 33 * sizeof(vector128));
+		tsk->thread.used_vr = true;
+	} else if (tsk->thread.used_vr) {
+		memset(&tsk->thread.vr_state, 0, 33 * sizeof(vector128));
+	}
 	/* Always get VRSAVE back */
 	if (v_regs != NULL)
-		err |= __get_user(current->thread.vrsave, (u32 __user *)&v_regs[33]);
+		err |= __get_user(tsk->thread.vrsave, (u32 __user *)&v_regs[33]);
 	else
-		current->thread.vrsave = 0;
+		tsk->thread.vrsave = 0;
 	if (cpu_has_feature(CPU_FTR_ALTIVEC))
-		mtspr(SPRN_VRSAVE, current->thread.vrsave);
+		mtspr(SPRN_VRSAVE, tsk->thread.vrsave);
 #endif /* CONFIG_ALTIVEC */
 	/* restore floating point */
-	err |= copy_fpr_from_user(current, &sc->fp_regs);
+	err |= copy_fpr_from_user(tsk, &sc->fp_regs);
 #ifdef CONFIG_VSX
 	/*
 	 * Get additional VSX data. Update v_regs to point after the
@@ -390,11 +398,13 @@ static long restore_sigcontext(struct pt_regs *regs, sigset_t *set, int sig,
 	 * buffer for formatting, then into the taskstruct.
 	 */
 	v_regs += ELF_NVRREG;
-	if ((msr & MSR_VSX) != 0)
-		err |= copy_vsx_from_user(current, v_regs);
-	else
+	if ((msr & MSR_VSX) != 0) {
+		err |= copy_vsx_from_user(tsk, v_regs);
+		tsk->thread.used_vsr = true;
+	} else {
 		for (i = 0; i < 32 ; i++)
-			current->thread.fp_state.fpr[i][TS_VSRLOWOFFSET] = 0;
+			tsk->thread.fp_state.fpr[i][TS_VSRLOWOFFSET] = 0;
+	}
 #endif
 	return err;
 }
@@ -404,7 +414,7 @@ static long restore_sigcontext(struct pt_regs *regs, sigset_t *set, int sig,
  * Restore the two sigcontexts from the frame of a transactional processes.
  */
 
-static long restore_tm_sigcontexts(struct pt_regs *regs,
+static long restore_tm_sigcontexts(struct task_struct *tsk,
 				   struct sigcontext __user *sc,
 				   struct sigcontext __user *tm_sc)
 {
@@ -413,12 +423,16 @@ static long restore_tm_sigcontexts(struct pt_regs *regs,
 #endif
 	unsigned long err = 0;
 	unsigned long msr;
+	struct pt_regs *regs = tsk->thread.regs;
 #ifdef CONFIG_VSX
 	int i;
 #endif
+
+	BUG_ON(tsk != current);
+
 	/* copy the GPRs */
 	err |= __copy_from_user(regs->gpr, tm_sc->gp_regs, sizeof(regs->gpr));
-	err |= __copy_from_user(&current->thread.ckpt_regs, sc->gp_regs,
+	err |= __copy_from_user(&tsk->thread.ckpt_regs, sc->gp_regs,
 				sizeof(regs->gpr));
 
 	/*
@@ -430,7 +444,7 @@ static long restore_tm_sigcontexts(struct pt_regs *regs,
 	 * we don't need to re-copy them here.
 	 */
 	err |= __get_user(regs->nip, &tm_sc->gp_regs[PT_NIP]);
-	err |= __get_user(current->thread.tm_tfhar, &sc->gp_regs[PT_NIP]);
+	err |= __get_user(tsk->thread.tm_tfhar, &sc->gp_regs[PT_NIP]);
 
 	/* get MSR separately, transfer the LE bit if doing signal return */
 	err |= __get_user(msr, &sc->gp_regs[PT_MSR]);
@@ -449,13 +463,13 @@ static long restore_tm_sigcontexts(struct pt_regs *regs,
 	err |= __get_user(regs->link, &tm_sc->gp_regs[PT_LNK]);
 	err |= __get_user(regs->xer, &tm_sc->gp_regs[PT_XER]);
 	err |= __get_user(regs->ccr, &tm_sc->gp_regs[PT_CCR]);
-	err |= __get_user(current->thread.ckpt_regs.ctr,
+	err |= __get_user(tsk->thread.ckpt_regs.ctr,
 			  &sc->gp_regs[PT_CTR]);
-	err |= __get_user(current->thread.ckpt_regs.link,
+	err |= __get_user(tsk->thread.ckpt_regs.link,
 			  &sc->gp_regs[PT_LNK]);
-	err |= __get_user(current->thread.ckpt_regs.xer,
+	err |= __get_user(tsk->thread.ckpt_regs.xer,
 			  &sc->gp_regs[PT_XER]);
-	err |= __get_user(current->thread.ckpt_regs.ccr,
+	err |= __get_user(tsk->thread.ckpt_regs.ccr,
 			  &sc->gp_regs[PT_CCR]);
 
 	/* These regs are not checkpointed; they can go in 'regs'. */
@@ -466,7 +480,7 @@ static long restore_tm_sigcontexts(struct pt_regs *regs,
 
 	/*
 	 * Force reload of FP/VEC.
-	 * This has to be done before copying stuff into current->thread.fpr/vr
+	 * This has to be done before copying stuff into tsk->thread.fpr/vr
 	 * for the reasons explained in the previous comment.
 	 */
 	regs->msr &= ~(MSR_FP | MSR_FE0 | MSR_FE1 | MSR_VEC | MSR_VSX);
@@ -483,32 +497,33 @@ static long restore_tm_sigcontexts(struct pt_regs *regs,
 		return -EFAULT;
 	/* Copy 33 vec registers (vr0..31 and vscr) from the stack */
 	if (v_regs != NULL && tm_v_regs != NULL && (msr & MSR_VEC) != 0) {
-		err |= __copy_from_user(&current->thread.vr_state, v_regs,
+		err |= __copy_from_user(&tsk->thread.ckvr_state, v_regs,
 					33 * sizeof(vector128));
-		err |= __copy_from_user(&current->thread.transact_vr, tm_v_regs,
+		err |= __copy_from_user(&tsk->thread.vr_state, tm_v_regs,
 					33 * sizeof(vector128));
+		current->thread.used_vr = true;
 	}
-	else if (current->thread.used_vr) {
-		memset(&current->thread.vr_state, 0, 33 * sizeof(vector128));
-		memset(&current->thread.transact_vr, 0, 33 * sizeof(vector128));
+	else if (tsk->thread.used_vr) {
+		memset(&tsk->thread.vr_state, 0, 33 * sizeof(vector128));
+		memset(&tsk->thread.ckvr_state, 0, 33 * sizeof(vector128));
 	}
 	/* Always get VRSAVE back */
 	if (v_regs != NULL && tm_v_regs != NULL) {
-		err |= __get_user(current->thread.vrsave,
+		err |= __get_user(tsk->thread.ckvrsave,
 				  (u32 __user *)&v_regs[33]);
-		err |= __get_user(current->thread.transact_vrsave,
+		err |= __get_user(tsk->thread.vrsave,
 				  (u32 __user *)&tm_v_regs[33]);
 	}
 	else {
-		current->thread.vrsave = 0;
-		current->thread.transact_vrsave = 0;
+		tsk->thread.vrsave = 0;
+		tsk->thread.ckvrsave = 0;
 	}
 	if (cpu_has_feature(CPU_FTR_ALTIVEC))
-		mtspr(SPRN_VRSAVE, current->thread.vrsave);
+		mtspr(SPRN_VRSAVE, tsk->thread.vrsave);
 #endif /* CONFIG_ALTIVEC */
 	/* restore floating point */
-	err |= copy_fpr_from_user(current, &sc->fp_regs);
-	err |= copy_transact_fpr_from_user(current, &tm_sc->fp_regs);
+	err |= copy_fpr_from_user(tsk, &tm_sc->fp_regs);
+	err |= copy_ckfpr_from_user(tsk, &sc->fp_regs);
 #ifdef CONFIG_VSX
 	/*
 	 * Get additional VSX data. Update v_regs to point after the
@@ -518,32 +533,31 @@ static long restore_tm_sigcontexts(struct pt_regs *regs,
 	if (v_regs && ((msr & MSR_VSX) != 0)) {
 		v_regs += ELF_NVRREG;
 		tm_v_regs += ELF_NVRREG;
-		err |= copy_vsx_from_user(current, v_regs);
-		err |= copy_transact_vsx_from_user(current, tm_v_regs);
+		err |= copy_vsx_from_user(tsk, tm_v_regs);
+		err |= copy_ckvsx_from_user(tsk, v_regs);
+		tsk->thread.used_vsr = true;
 	} else {
 		for (i = 0; i < 32 ; i++) {
-			current->thread.fp_state.fpr[i][TS_VSRLOWOFFSET] = 0;
-			current->thread.transact_fp.fpr[i][TS_VSRLOWOFFSET] = 0;
+			tsk->thread.fp_state.fpr[i][TS_VSRLOWOFFSET] = 0;
+			tsk->thread.ckfp_state.fpr[i][TS_VSRLOWOFFSET] = 0;
 		}
 	}
 #endif
 	tm_enable();
 	/* Make sure the transaction is marked as failed */
-	current->thread.tm_texasr |= TEXASR_FS;
+	tsk->thread.tm_texasr |= TEXASR_FS;
 	/* This loads the checkpointed FP/VEC state, if used */
-	tm_recheckpoint(&current->thread, msr);
+	tm_recheckpoint(&tsk->thread, msr);
 
-	/* This loads the speculative FP/VEC state, if used */
+	msr_check_and_set(msr & (MSR_FP | MSR_VEC));
 	if (msr & MSR_FP) {
-		do_load_up_transact_fpu(&current->thread);
-		regs->msr |= (MSR_FP | current->thread.fpexc_mode);
+		load_fp_state(&tsk->thread.fp_state);
+		regs->msr |= (MSR_FP | tsk->thread.fpexc_mode);
 	}
-#ifdef CONFIG_ALTIVEC
 	if (msr & MSR_VEC) {
-		do_load_up_transact_altivec(&current->thread);
+		load_vr_state(&tsk->thread.vr_state);
 		regs->msr |= MSR_VEC;
 	}
-#endif
 
 	return err;
 }
@@ -594,6 +608,8 @@ int sys_swapcontext(struct ucontext __user *old_ctx,
 	unsigned long new_msr = 0;
 	int ctx_has_vsx_region = 0;
 
+	BUG_ON(regs != current->thread.regs);
+
 	if (new_ctx &&
 	    get_user(new_msr, &new_ctx->uc_mcontext.gp_regs[PT_MSR]))
 		return -EFAULT;
@@ -616,7 +632,7 @@ int sys_swapcontext(struct ucontext __user *old_ctx,
 
 	if (old_ctx != NULL) {
 		if (!access_ok(VERIFY_WRITE, old_ctx, ctx_size)
-		    || setup_sigcontext(&old_ctx->uc_mcontext, regs, 0, NULL, 0,
+		    || setup_sigcontext(&old_ctx->uc_mcontext, current, 0, NULL, 0,
 					ctx_has_vsx_region)
 		    || __copy_to_user(&old_ctx->uc_sigmask,
 				      &current->blocked, sizeof(sigset_t)))
@@ -644,7 +660,7 @@ int sys_swapcontext(struct ucontext __user *old_ctx,
 	if (__copy_from_user(&set, &new_ctx->uc_sigmask, sizeof(set)))
 		do_exit(SIGSEGV);
 	set_current_blocked(&set);
-	if (restore_sigcontext(regs, NULL, 0, &new_ctx->uc_mcontext))
+	if (restore_sigcontext(current, NULL, 0, &new_ctx->uc_mcontext))
 		do_exit(SIGSEGV);
 
 	/* This returns like rt_sigreturn */
@@ -667,6 +683,8 @@ int sys_rt_sigreturn(unsigned long r3, unsigned long r4, unsigned long r5,
 	unsigned long msr;
 #endif
 
+	BUG_ON(current->thread.regs != regs);
+
 	/* Always make any pending restarted system calls return -EINTR */
 	current->restart_block.fn = do_no_restart_syscall;
 
@@ -698,14 +716,14 @@ int sys_rt_sigreturn(unsigned long r3, unsigned long r4, unsigned long r5,
 		struct ucontext __user *uc_transact;
 		if (__get_user(uc_transact, &uc->uc_link))
 			goto badframe;
-		if (restore_tm_sigcontexts(regs, &uc->uc_mcontext,
+		if (restore_tm_sigcontexts(current, &uc->uc_mcontext,
 					   &uc_transact->uc_mcontext))
 			goto badframe;
 	}
 	else
 	/* Fall through, for non-TM restore */
 #endif
-	if (restore_sigcontext(regs, NULL, 1, &uc->uc_mcontext))
+	if (restore_sigcontext(current, NULL, 1, &uc->uc_mcontext))
 		goto badframe;
 
 	if (restore_altstack(&uc->uc_stack))
@@ -724,13 +742,17 @@ badframe:
 	return 0;
 }
 
-int handle_rt_signal64(struct ksignal *ksig, sigset_t *set, struct pt_regs *regs)
+int handle_rt_signal64(struct ksignal *ksig, sigset_t *set,
+		struct task_struct *tsk)
 {
 	struct rt_sigframe __user *frame;
 	unsigned long newsp = 0;
 	long err = 0;
+	struct pt_regs *regs = tsk->thread.regs;
+
+	BUG_ON(tsk != current);
 
-	frame = get_sigframe(ksig, get_tm_stackpointer(regs), sizeof(*frame), 0);
+	frame = get_sigframe(ksig, get_tm_stackpointer(tsk), sizeof(*frame), 0);
 	if (unlikely(frame == NULL))
 		goto badframe;
 
@@ -751,14 +773,13 @@ int handle_rt_signal64(struct ksignal *ksig, sigset_t *set, struct pt_regs *regs
 		err |= __put_user(&frame->uc_transact, &frame->uc.uc_link);
 		err |= setup_tm_sigcontexts(&frame->uc.uc_mcontext,
 					    &frame->uc_transact.uc_mcontext,
-					    regs, ksig->sig,
-					    NULL,
+					    tsk, ksig->sig, NULL,
 					    (unsigned long)ksig->ka.sa.sa_handler);
 	} else
 #endif
 	{
 		err |= __put_user(0, &frame->uc.uc_link);
-		err |= setup_sigcontext(&frame->uc.uc_mcontext, regs, ksig->sig,
+		err |= setup_sigcontext(&frame->uc.uc_mcontext, tsk, ksig->sig,
 					NULL, (unsigned long)ksig->ka.sa.sa_handler,
 					1);
 	}
@@ -767,11 +788,11 @@ int handle_rt_signal64(struct ksignal *ksig, sigset_t *set, struct pt_regs *regs
 		goto badframe;
 
 	/* Make sure signal handler doesn't get spurious FP exceptions */
-	current->thread.fp_state.fpscr = 0;
+	tsk->thread.fp_state.fpscr = 0;
 
 	/* Set up to return from userspace. */
-	if (vdso64_rt_sigtramp && current->mm->context.vdso_base) {
-		regs->link = current->mm->context.vdso_base + vdso64_rt_sigtramp;
+	if (vdso64_rt_sigtramp && tsk->mm->context.vdso_base) {
+		regs->link = tsk->mm->context.vdso_base + vdso64_rt_sigtramp;
 	} else {
 		err |= setup_trampoline(__NR_rt_sigreturn, &frame->tramp[0]);
 		if (err)
@@ -821,7 +842,7 @@ int handle_rt_signal64(struct ksignal *ksig, sigset_t *set, struct pt_regs *regs
 badframe:
 	if (show_unhandled_signals)
 		printk_ratelimited(regs->msr & MSR_64BIT ? fmt64 : fmt32,
-				   current->comm, current->pid, "setup_rt_frame",
+				   tsk->comm, tsk->pid, "setup_rt_frame",
 				   (long)frame, regs->nip, regs->link);
 
 	return 1;

+ 1 - 0
arch/powerpc/kernel/syscalls.c

@@ -40,6 +40,7 @@
 #include <asm/syscalls.h>
 #include <asm/time.h>
 #include <asm/unistd.h>
+#include <asm/asm-prototypes.h>
 
 static inline unsigned long do_mmap2(unsigned long addr, size_t len,
 			unsigned long prot, unsigned long flags,

+ 1 - 0
arch/powerpc/kernel/time.c

@@ -73,6 +73,7 @@
 #include <asm/vdso_datapage.h>
 #include <asm/firmware.h>
 #include <asm/cputime.h>
+#include <asm/asm-prototypes.h>
 
 /* powerpc clocksource/clockevent code */
 

+ 50 - 44
arch/powerpc/kernel/tm.S

@@ -108,6 +108,7 @@ _GLOBAL(tm_reclaim)
 	/* We've a struct pt_regs at [r1+STACK_FRAME_OVERHEAD]. */
 
 	std	r3, STK_PARAM(R3)(r1)
+	std	r4, STK_PARAM(R4)(r1)
 	SAVE_NVGPRS(r1)
 
 	/* We need to setup MSR for VSX register save instructions. */
@@ -126,43 +127,6 @@ _GLOBAL(tm_reclaim)
 	mtmsrd	r15
 	std	r14, TM_FRAME_L0(r1)
 
-	/* Stash the stack pointer away for use after reclaim */
-	std	r1, PACAR1(r13)
-
-	/* ******************** FPR/VR/VSRs ************
-	 * Before reclaiming, capture the current/transactional FPR/VR
-	* versions /if used/.
-	 *
-	 * (If VSX used, FP and VMX are implied.  Or, we don't need to look
-	 * at MSR.VSX as copying FP regs if .FP, vector regs if .VMX covers it.)
-	 *
-	 * We're passed the thread's MSR as parameter 2.
-	 *
-	 * We enabled VEC/FP/VSX in the msr above, so we can execute these
-	 * instructions!
-	 */
-	andis.		r0, r4, MSR_VEC@h
-	beq	dont_backup_vec
-
-	addi	r7, r3, THREAD_TRANSACT_VRSTATE
-	SAVE_32VRS(0, r6, r7)	/* r6 scratch, r7 transact vr state */
-	mfvscr	v0
-	li	r6, VRSTATE_VSCR
-	stvx	v0, r7, r6
-dont_backup_vec:
-	mfspr	r0, SPRN_VRSAVE
-	std	r0, THREAD_TRANSACT_VRSAVE(r3)
-
-	andi.	r0, r4, MSR_FP
-	beq	dont_backup_fp
-
-	addi	r7, r3, THREAD_TRANSACT_FPSTATE
-	SAVE_32FPRS_VSRS(0, R6, R7)	/* r6 scratch, r7 transact fp state */
-
-	mffs    fr0
-	stfd    fr0,FPSTATE_FPSCR(r7)
-
-dont_backup_fp:
 	/* Do sanity check on MSR to make sure we are suspended */
 	li	r7, (MSR_TS_S)@higher
 	srdi	r6, r14, 32
@@ -170,6 +134,9 @@ dont_backup_fp:
 1:	tdeqi   r6, 0
 	EMIT_BUG_ENTRY 1b,__FILE__,__LINE__,0
 
+	/* Stash the stack pointer away for use after reclaim */
+	std	r1, PACAR1(r13)
+
 	/* Clear MSR RI since we are about to change r1, EE is already off. */
 	li	r4, 0
 	mtmsrd	r4, 1
@@ -273,6 +240,43 @@ dont_backup_fp:
 	 * MSR.
 	 */
 
+
+	/* ******************** FPR/VR/VSRs ************
+	 * After reclaiming, capture the checkpointed FPRs/VRs /if used/.
+	 *
+	 * (If VSX used, FP and VMX are implied.  Or, we don't need to look
+	 * at MSR.VSX as copying FP regs if .FP, vector regs if .VMX covers it.)
+	 *
+	 * We're passed the thread's MSR as the second parameter
+	 *
+	 * We enabled VEC/FP/VSX in the msr above, so we can execute these
+	 * instructions!
+	 */
+	ld	r4, STK_PARAM(R4)(r1)		/* Second parameter, MSR */
+	mr	r3, r12
+	andis.		r0, r4, MSR_VEC@h
+	beq	dont_backup_vec
+
+	addi	r7, r3, THREAD_CKVRSTATE
+	SAVE_32VRS(0, r6, r7)	/* r6 scratch, r7 checkpointed vr state */
+	mfvscr	v0
+	li	r6, VRSTATE_VSCR
+	stvx	v0, r7, r6
+dont_backup_vec:
+	mfspr	r0, SPRN_VRSAVE
+	std	r0, THREAD_CKVRSAVE(r3)
+
+	andi.	r0, r4, MSR_FP
+	beq	dont_backup_fp
+
+	addi	r7, r3, THREAD_CKFPSTATE
+	SAVE_32FPRS_VSRS(0, R6, R7)	/* r6 scratch, r7 checkpointed fp state */
+
+	mffs    fr0
+	stfd    fr0,FPSTATE_FPSCR(r7)
+
+dont_backup_fp:
+
 	/* TM regs, incl TEXASR -- these live in thread_struct.  Note they've
 	 * been updated by the treclaim, to explain to userland the failure
 	 * cause (aborted).
@@ -288,6 +292,7 @@ dont_backup_fp:
 
 	/* Restore original MSR/IRQ state & clear TM mode */
 	ld	r14, TM_FRAME_L0(r1)		/* Orig MSR */
+
 	li	r15, 0
 	rldimi  r14, r15, MSR_TS_LG, (63-MSR_TS_LG)-1
 	mtmsrd  r14
@@ -356,28 +361,29 @@ _GLOBAL(__tm_recheckpoint)
 	mtmsr	r5
 
 #ifdef CONFIG_ALTIVEC
-	/* FP and VEC registers:  These are recheckpointed from thread.fpr[]
-	 * and thread.vr[] respectively.  The thread.transact_fpr[] version
-	 * is more modern, and will be loaded subsequently by any FPUnavailable
-	 * trap.
+	/*
+	 * FP and VEC registers: These are recheckpointed from
+	 * thread.ckfp_state and thread.ckvr_state respectively. The
+	 * thread.fp_state[] version holds the 'live' (transactional)
+	 * state and will be loaded subsequently by any FPUnavailable trap.
 	 */
 	andis.	r0, r4, MSR_VEC@h
 	beq	dont_restore_vec
 
-	addi	r8, r3, THREAD_VRSTATE
+	addi	r8, r3, THREAD_CKVRSTATE
 	li	r5, VRSTATE_VSCR
 	lvx	v0, r8, r5
 	mtvscr	v0
 	REST_32VRS(0, r5, r8)			/* r5 scratch, r8 ptr */
 dont_restore_vec:
-	ld	r5, THREAD_VRSAVE(r3)
+	ld	r5, THREAD_CKVRSAVE(r3)
 	mtspr	SPRN_VRSAVE, r5
 #endif
 
 	andi.	r0, r4, MSR_FP
 	beq	dont_restore_fp
 
-	addi	r8, r3, THREAD_FPSTATE
+	addi	r8, r3, THREAD_CKFPSTATE
 	lfd	fr0, FPSTATE_FPSCR(r8)
 	MTFSF_L(fr0)
 	REST_32FPRS_VSRS(0, R4, R8)

+ 72 - 11
arch/powerpc/kernel/traps.c

@@ -117,7 +117,7 @@ static int die_owner = -1;
 static unsigned int die_nest_count;
 static int die_counter;
 
-static unsigned __kprobes long oops_begin(struct pt_regs *regs)
+static unsigned long oops_begin(struct pt_regs *regs)
 {
 	int cpu;
 	unsigned long flags;
@@ -144,8 +144,9 @@ static unsigned __kprobes long oops_begin(struct pt_regs *regs)
 		pmac_backlight_unblank();
 	return flags;
 }
+NOKPROBE_SYMBOL(oops_begin);
 
-static void __kprobes oops_end(unsigned long flags, struct pt_regs *regs,
+static void oops_end(unsigned long flags, struct pt_regs *regs,
 			       int signr)
 {
 	bust_spinlocks(0);
@@ -196,8 +197,9 @@ static void __kprobes oops_end(unsigned long flags, struct pt_regs *regs,
 		panic("Fatal exception");
 	do_exit(signr);
 }
+NOKPROBE_SYMBOL(oops_end);
 
-static int __kprobes __die(const char *str, struct pt_regs *regs, long err)
+static int __die(const char *str, struct pt_regs *regs, long err)
 {
 	printk("Oops: %s, sig: %ld [#%d]\n", str, err, ++die_counter);
 #ifdef CONFIG_PREEMPT
@@ -221,6 +223,7 @@ static int __kprobes __die(const char *str, struct pt_regs *regs, long err)
 
 	return 0;
 }
+NOKPROBE_SYMBOL(__die);
 
 void die(const char *str, struct pt_regs *regs, long err)
 {
@@ -802,7 +805,7 @@ void RunModeException(struct pt_regs *regs)
 	_exception(SIGTRAP, regs, 0, 0);
 }
 
-void __kprobes single_step_exception(struct pt_regs *regs)
+void single_step_exception(struct pt_regs *regs)
 {
 	enum ctx_state prev_state = exception_enter();
 
@@ -819,6 +822,7 @@ void __kprobes single_step_exception(struct pt_regs *regs)
 bail:
 	exception_exit(prev_state);
 }
+NOKPROBE_SYMBOL(single_step_exception);
 
 /*
  * After we have successfully emulated an instruction, we have to
@@ -1140,7 +1144,7 @@ static int emulate_math(struct pt_regs *regs)
 static inline int emulate_math(struct pt_regs *regs) { return -1; }
 #endif
 
-void __kprobes program_check_exception(struct pt_regs *regs)
+void program_check_exception(struct pt_regs *regs)
 {
 	enum ctx_state prev_state = exception_enter();
 	unsigned int reason = get_reason(regs);
@@ -1260,16 +1264,18 @@ sigill:
 bail:
 	exception_exit(prev_state);
 }
+NOKPROBE_SYMBOL(program_check_exception);
 
 /*
  * This occurs when running in hypervisor mode on POWER6 or later
  * and an illegal instruction is encountered.
  */
-void __kprobes emulation_assist_interrupt(struct pt_regs *regs)
+void emulation_assist_interrupt(struct pt_regs *regs)
 {
 	regs->msr |= REASON_ILLEGAL;
 	program_check_exception(regs);
 }
+NOKPROBE_SYMBOL(emulation_assist_interrupt);
 
 void alignment_exception(struct pt_regs *regs)
 {
@@ -1310,6 +1316,18 @@ bail:
 	exception_exit(prev_state);
 }
 
+void slb_miss_bad_addr(struct pt_regs *regs)
+{
+	enum ctx_state prev_state = exception_enter();
+
+	if (user_mode(regs))
+		_exception(SIGSEGV, regs, SEGV_BNDERR, regs->dar);
+	else
+		bad_page_fault(regs, regs->dar, SIGSEGV);
+
+	exception_exit(prev_state);
+}
+
 void StackOverflow(struct pt_regs *regs)
 {
 	printk(KERN_CRIT "Kernel stack overflow in process %p, r1=%lx\n",
@@ -1372,6 +1390,22 @@ void vsx_unavailable_exception(struct pt_regs *regs)
 }
 
 #ifdef CONFIG_PPC64
+static void tm_unavailable(struct pt_regs *regs)
+{
+#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
+	if (user_mode(regs)) {
+		current->thread.load_tm++;
+		regs->msr |= MSR_TM;
+		tm_enable();
+		tm_restore_sprs(&current->thread);
+		return;
+	}
+#endif
+	pr_emerg("Unrecoverable TM Unavailable Exception "
+			"%lx at %lx\n", regs->trap, regs->nip);
+	die("Unrecoverable TM Unavailable Exception", regs, SIGABRT);
+}
+
 void facility_unavailable_exception(struct pt_regs *regs)
 {
 	static char *facility_strings[] = {
@@ -1451,6 +1485,27 @@ void facility_unavailable_exception(struct pt_regs *regs)
 		return;
 	}
 
+	if (status == FSCR_TM_LG) {
+		/*
+		 * If we're here then the hardware is TM aware because it
+		 * generated an exception with FSCR_TM set.
+		 *
+		 * If cpu_has_feature(CPU_FTR_TM) is false, then either firmware
+		 * told us not to do TM, or the kernel is not built with TM
+		 * support.
+		 *
+		 * If both of those things are true, then userspace can spam the
+		 * console by triggering the printk() below just by continually
+		 * doing tbegin (or any TM instruction). So in that case just
+		 * send the process a SIGILL immediately.
+		 */
+		if (!cpu_has_feature(CPU_FTR_TM))
+			goto out;
+
+		tm_unavailable(regs);
+		return;
+	}
+
 	if ((status < ARRAY_SIZE(facility_strings)) &&
 	    facility_strings[status])
 		facility = facility_strings[status];
@@ -1463,6 +1518,7 @@ void facility_unavailable_exception(struct pt_regs *regs)
 		"%sFacility '%s' unavailable, exception at 0x%lx, MSR=%lx\n",
 		hv ? "Hypervisor " : "", facility, regs->nip, regs->msr);
 
+out:
 	if (user_mode(regs)) {
 		_exception(SIGILL, regs, ILL_ILLOPC, regs->nip);
 		return;
@@ -1504,7 +1560,8 @@ void fp_unavailable_tm(struct pt_regs *regs)
 
 	/* If VMX is in use, get the transactional values back */
 	if (regs->msr & MSR_VEC) {
-		do_load_up_transact_altivec(&current->thread);
+		msr_check_and_set(MSR_VEC);
+		load_vr_state(&current->thread.vr_state);
 		/* At this point all the VSX state is loaded, so enable it */
 		regs->msr |= MSR_VSX;
 	}
@@ -1525,7 +1582,8 @@ void altivec_unavailable_tm(struct pt_regs *regs)
 	current->thread.used_vr = 1;
 
 	if (regs->msr & MSR_FP) {
-		do_load_up_transact_fpu(&current->thread);
+		msr_check_and_set(MSR_FP);
+		load_fp_state(&current->thread.fp_state);
 		regs->msr |= MSR_VSX;
 	}
 }
@@ -1564,10 +1622,12 @@ void vsx_unavailable_tm(struct pt_regs *regs)
 	 */
 	tm_recheckpoint(&current->thread, regs->msr & ~orig_msr);
 
+	msr_check_and_set(orig_msr & (MSR_FP | MSR_VEC));
+
 	if (orig_msr & MSR_FP)
-		do_load_up_transact_fpu(&current->thread);
+		load_fp_state(&current->thread.fp_state);
 	if (orig_msr & MSR_VEC)
-		do_load_up_transact_altivec(&current->thread);
+		load_vr_state(&current->thread.vr_state);
 }
 #endif /* CONFIG_PPC_TRANSACTIONAL_MEM */
 
@@ -1656,7 +1716,7 @@ static void handle_debug(struct pt_regs *regs, unsigned long debug_status)
 		mtspr(SPRN_DBCR0, current->thread.debug.dbcr0);
 }
 
-void __kprobes DebugException(struct pt_regs *regs, unsigned long debug_status)
+void DebugException(struct pt_regs *regs, unsigned long debug_status)
 {
 	current->thread.debug.dbsr = debug_status;
 
@@ -1717,6 +1777,7 @@ void __kprobes DebugException(struct pt_regs *regs, unsigned long debug_status)
 	} else
 		handle_debug(regs, debug_status);
 }
+NOKPROBE_SYMBOL(DebugException);
 #endif /* CONFIG_PPC_ADV_DEBUG_REGS */
 
 #if !defined(CONFIG_TAU_INT)

+ 0 - 6
arch/powerpc/kernel/vdso64/Makefile

@@ -31,15 +31,9 @@ $(obj)/%.so: OBJCOPYFLAGS := -S
 $(obj)/%.so: $(obj)/%.so.dbg FORCE
 	$(call if_changed,objcopy)
 
-# assembly rules for the .S files
-$(obj-vdso64): %.o: %.S FORCE
-	$(call if_changed_dep,vdso64as)
-
 # actual build commands
 quiet_cmd_vdso64ld = VDSO64L $@
       cmd_vdso64ld = $(CC) $(c_flags) -o $@ -Wl,-T$(filter %.lds,$^) $(filter %.o,$^)
-quiet_cmd_vdso64as = VDSO64A $@
-      cmd_vdso64as = $(CC) $(a_flags) -c -o $@ $<
 
 # install commands for the unstripped file
 quiet_cmd_vdso_install = INSTALL $@

+ 1 - 1
arch/powerpc/kernel/vdso64/datapage.S

@@ -59,7 +59,7 @@ V_FUNCTION_BEGIN(__kernel_get_syscall_map)
 	bl	V_LOCAL_FUNC(__get_datapage)
 	mtlr	r12
 	addi	r3,r3,CFG_SYSCALL_MAP64
-	cmpli	cr0,r4,0
+	cmpldi	cr0,r4,0
 	crclr	cr0*4+so
 	beqlr
 	li	r0,NR_syscalls

+ 1 - 1
arch/powerpc/kernel/vdso64/gettimeofday.S

@@ -145,7 +145,7 @@ V_FUNCTION_BEGIN(__kernel_clock_getres)
 	bne	cr0,99f
 
 	li	r3,0
-	cmpli	cr0,r4,0
+	cmpldi	cr0,r4,0
 	crclr	cr0*4+so
 	beqlr
 	lis	r5,CLOCK_REALTIME_RES@h

+ 0 - 25
arch/powerpc/kernel/vector.S

@@ -7,31 +7,6 @@
 #include <asm/page.h>
 #include <asm/ptrace.h>
 
-#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
-/* void do_load_up_transact_altivec(struct thread_struct *thread)
- *
- * This is similar to load_up_altivec but for the transactional version of the
- * vector regs.  It doesn't mess with the task MSR or valid flags.
- * Furthermore, VEC laziness is not supported with TM currently.
- */
-_GLOBAL(do_load_up_transact_altivec)
-	mfmsr	r6
-	oris	r5,r6,MSR_VEC@h
-	MTMSRD(r5)
-	isync
-
-	li	r4,1
-	stw	r4,THREAD_USED_VR(r3)
-
-	li	r10,THREAD_TRANSACT_VRSTATE+VRSTATE_VSCR
-	lvx	v0,r10,r3
-	mtvscr	v0
-	addi	r10,r3,THREAD_TRANSACT_VRSTATE
-	REST_32VRS(0,r4,r10)
-
-	blr
-#endif
-
 /*
  * Load state from memory into VMX registers including VSCR.
  * Assumes the caller has enabled VMX in the MSR.

+ 52 - 3
arch/powerpc/kernel/vmlinux.lds.S

@@ -44,11 +44,58 @@ SECTIONS
  * Text, read only data and other permanent read-only sections
  */
 
-	/* Text and gots */
+	_text = .;
+	_stext = .;
+
+	/*
+	 * Head text.
+	 * This needs to be in its own output section to avoid ld placing
+	 * branch trampoline stubs randomly throughout the fixed sections,
+	 * which it will do (even if the branch comes from another section)
+	 * in order to optimize stub generation.
+	 */
+	.head.text : AT(ADDR(.head.text) - LOAD_OFFSET) {
+#ifdef CONFIG_PPC64
+		KEEP(*(.head.text.first_256B));
+#ifdef CONFIG_PPC_BOOK3E
+# define END_FIXED	0x100
+#else
+		KEEP(*(.head.text.real_vectors));
+		*(.head.text.real_trampolines);
+		KEEP(*(.head.text.virt_vectors));
+		*(.head.text.virt_trampolines);
+# if defined(CONFIG_PPC_PSERIES) || defined(CONFIG_PPC_POWERNV)
+		KEEP(*(.head.data.fwnmi_page));
+#  define END_FIXED	0x8000
+# else
+#  define END_FIXED	0x7000
+# endif
+#endif
+		ASSERT((. == END_FIXED), "vmlinux.lds.S: fixed section overflow error");
+#else /* !CONFIG_PPC64 */
+		HEAD_TEXT
+#endif
+	} :kernel
+
+	/*
+	 * If the build dies here, it's likely code in head_64.S is referencing
+	 * labels it can't reach, and the linker inserting stubs without the
+	 * assembler's knowledge. To debug, remove the above assert and
+	 * rebuild. Look for branch stubs in the fixed section region.
+	 *
+	 * Linker stub generation could be allowed in "trampoline"
+	 * sections if absolutely necessary, but this would require
+	 * some rework of the fixed sections. Before resorting to this,
+	 * consider references that have sufficient addressing range,
+	 * (e.g., hand coded trampolines) so the linker does not have
+	 * to add stubs.
+	 *
+	 * Linker stubs at the top of the main text section are currently not
+	 * detected, and will result in a crash at boot due to offsets being
+	 * wrong.
+	 */
 	.text : AT(ADDR(.text) - LOAD_OFFSET) {
 		ALIGN_FUNCTION();
-		HEAD_TEXT
-		_text = .;
 		/* careful! __ftr_alt_* sections need to be close to .text */
 		*(.text .fixup __ftr_alt_* .ref.text)
 		SCHED_TEXT
@@ -56,6 +103,8 @@ SECTIONS
 		KPROBES_TEXT
 		IRQENTRY_TEXT
 		SOFTIRQENTRY_TEXT
+		MEM_KEEP(init.text)
+		MEM_KEEP(exit.text)
 
 #ifdef CONFIG_PPC32
 		*(.got1)

+ 1 - 1
arch/powerpc/lib/Makefile

@@ -22,7 +22,7 @@ obj64-$(CONFIG_SMP)	+= locks.o
 obj64-$(CONFIG_ALTIVEC)	+= vmx-helper.o
 
 ifeq ($(CONFIG_GENERIC_CSUM),)
-obj-y			+= checksum_$(CONFIG_WORD_SIZE).o checksum_wrappers.o
+obj-y			+= checksum_$(BITS).o checksum_wrappers.o
 endif
 
 obj-$(CONFIG_PPC_EMULATE_SSTEP)	+= sstep.o ldstfp.o

+ 2 - 0
arch/powerpc/lib/mem_64.S

@@ -37,6 +37,7 @@ _GLOBAL(memset)
 	clrldi	r5,r5,58
 	mtctr	r0
 	beq	5f
+	.balign 16
 4:	std	r4,0(r6)
 	std	r4,8(r6)
 	std	r4,16(r6)
@@ -90,6 +91,7 @@ _GLOBAL(backwards_memcpy)
 	andi.	r0,r6,3
 	mtctr	r7
 	bne	5f
+	.balign 16
 1:	lwz	r7,-4(r4)
 	lwzu	r8,-8(r4)
 	stw	r7,-4(r6)

+ 3 - 4
arch/powerpc/mm/Makefile

@@ -7,17 +7,16 @@ subdir-ccflags-$(CONFIG_PPC_WERROR) := -Werror
 ccflags-$(CONFIG_PPC64)	:= $(NO_MINIMAL_TOC)
 
 obj-y				:= fault.o mem.o pgtable.o mmap.o \
-				   init_$(CONFIG_WORD_SIZE).o \
-				   pgtable_$(CONFIG_WORD_SIZE).o
+				   init_$(BITS).o pgtable_$(BITS).o
 obj-$(CONFIG_PPC_MMU_NOHASH)	+= mmu_context_nohash.o tlb_nohash.o \
 				   tlb_nohash_low.o
-obj-$(CONFIG_PPC_BOOK3E)	+= tlb_low_$(CONFIG_WORD_SIZE)e.o
+obj-$(CONFIG_PPC_BOOK3E)	+= tlb_low_$(BITS)e.o
 hash64-$(CONFIG_PPC_NATIVE)	:= hash_native_64.o
 obj-$(CONFIG_PPC_BOOK3E_64)   += pgtable-book3e.o
 obj-$(CONFIG_PPC_STD_MMU_64)	+= pgtable-hash64.o hash_utils_64.o slb_low.o slb.o $(hash64-y) mmu_context_book3s64.o pgtable-book3s64.o
 obj-$(CONFIG_PPC_RADIX_MMU)	+= pgtable-radix.o tlb-radix.o
 obj-$(CONFIG_PPC_STD_MMU_32)	+= ppc_mmu_32.o hash_low_32.o mmu_context_hash32.o
-obj-$(CONFIG_PPC_STD_MMU)	+= tlb_hash$(CONFIG_WORD_SIZE).o
+obj-$(CONFIG_PPC_STD_MMU)	+= tlb_hash$(BITS).o
 ifeq ($(CONFIG_PPC_STD_MMU_64),y)
 obj-$(CONFIG_PPC_4K_PAGES)	+= hash64_4k.o
 obj-$(CONFIG_PPC_64K_PAGES)	+= hash64_64k.o

+ 2 - 2
arch/powerpc/mm/fault.c

@@ -205,7 +205,7 @@ static int mm_fault_error(struct pt_regs *regs, unsigned long addr, int fault)
  * The return value is 0 if the fault was handled, or the signal
  * number if this is a kernel fault that can't be handled here.
  */
-int __kprobes do_page_fault(struct pt_regs *regs, unsigned long address,
+int do_page_fault(struct pt_regs *regs, unsigned long address,
 			    unsigned long error_code)
 {
 	enum ctx_state prev_state = exception_enter();
@@ -498,8 +498,8 @@ bad_area_nosemaphore:
 bail:
 	exception_exit(prev_state);
 	return rc;
-
 }
+NOKPROBE_SYMBOL(do_page_fault);
 
 /*
  * bad_page_fault is called when we have a bad access from the kernel.

+ 50 - 31
arch/powerpc/mm/hash_utils_64.c

@@ -766,6 +766,29 @@ int remove_section_mapping(unsigned long start, unsigned long end)
 }
 #endif /* CONFIG_MEMORY_HOTPLUG */
 
+static void update_hid_for_hash(void)
+{
+	unsigned long hid0;
+	unsigned long rb = 3UL << PPC_BITLSHIFT(53); /* IS = 3 */
+
+	asm volatile("ptesync": : :"memory");
+	/* prs = 0, ric = 2, rs = 0, r = 0 is = 3 */
+	asm volatile(PPC_TLBIE_5(%0, %4, %3, %2, %1)
+		     : : "r"(rb), "i"(0), "i"(0), "i"(2), "r"(0) : "memory");
+	asm volatile("eieio; tlbsync; ptesync; isync; slbia": : :"memory");
+	/*
+	 * now switch the HID
+	 */
+	hid0  = mfspr(SPRN_HID0);
+	hid0 &= ~HID0_POWER9_RADIX;
+	mtspr(SPRN_HID0, hid0);
+	asm volatile("isync": : :"memory");
+
+	/* Wait for it to happen */
+	while ((mfspr(SPRN_HID0) & HID0_POWER9_RADIX))
+		cpu_relax();
+}
+
 static void __init hash_init_partition_table(phys_addr_t hash_table,
 					     unsigned long htab_size)
 {
@@ -792,6 +815,8 @@ static void __init hash_init_partition_table(phys_addr_t hash_table,
 	 */
 	partition_tb->patb1 = 0;
 	pr_info("Partition table %p\n", partition_tb);
+	if (cpu_has_feature(CPU_FTR_POWER9_DD1))
+		update_hid_for_hash();
 	/*
 	 * update partition table control register,
 	 * 64 K size.
@@ -1515,6 +1540,29 @@ out_exit:
 	local_irq_restore(flags);
 }
 
+#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
+static inline void tm_flush_hash_page(int local)
+{
+	/*
+	 * Transactions are not aborted by tlbiel, only tlbie. Without, syncing a
+	 * page back to a block device w/PIO could pick up transactional data
+	 * (bad!) so we force an abort here. Before the sync the page will be
+	 * made read-only, which will flush_hash_page. BIG ISSUE here: if the
+	 * kernel uses a page from userspace without unmapping it first, it may
+	 * see the speculated version.
+	 */
+	if (local && cpu_has_feature(CPU_FTR_TM) && current->thread.regs &&
+	    MSR_TM_ACTIVE(current->thread.regs->msr)) {
+		tm_enable();
+		tm_abort(TM_CAUSE_TLBI);
+	}
+}
+#else
+static inline void tm_flush_hash_page(int local)
+{
+}
+#endif
+
 /* WARNING: This is called from hash_low_64.S, if you change this prototype,
  *          do not forget to update the assembly call site !
  */
@@ -1541,21 +1589,7 @@ void flush_hash_page(unsigned long vpn, real_pte_t pte, int psize, int ssize,
 					     ssize, local);
 	} pte_iterate_hashed_end();
 
-#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
-	/* Transactions are not aborted by tlbiel, only tlbie.
-	 * Without, syncing a page back to a block device w/ PIO could pick up
-	 * transactional data (bad!) so we force an abort here.  Before the
-	 * sync the page will be made read-only, which will flush_hash_page.
-	 * BIG ISSUE here: if the kernel uses a page from userspace without
-	 * unmapping it first, it may see the speculated version.
-	 */
-	if (local && cpu_has_feature(CPU_FTR_TM) &&
-	    current->thread.regs &&
-	    MSR_TM_ACTIVE(current->thread.regs->msr)) {
-		tm_enable();
-		tm_abort(TM_CAUSE_TLBI);
-	}
-#endif
+	tm_flush_hash_page(local);
 }
 
 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
@@ -1612,22 +1646,7 @@ void flush_hash_hugepage(unsigned long vsid, unsigned long addr,
 					     MMU_PAGE_16M, ssize, local);
 	}
 tm_abort:
-#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
-	/* Transactions are not aborted by tlbiel, only tlbie.
-	 * Without, syncing a page back to a block device w/ PIO could pick up
-	 * transactional data (bad!) so we force an abort here.  Before the
-	 * sync the page will be made read-only, which will flush_hash_page.
-	 * BIG ISSUE here: if the kernel uses a page from userspace without
-	 * unmapping it first, it may see the speculated version.
-	 */
-	if (local && cpu_has_feature(CPU_FTR_TM) &&
-	    current->thread.regs &&
-	    MSR_TM_ACTIVE(current->thread.regs->msr)) {
-		tm_enable();
-		tm_abort(TM_CAUSE_TLBI);
-	}
-#endif
-	return;
+	tm_flush_hash_page(local);
 }
 #endif /* CONFIG_TRANSPARENT_HUGEPAGE */
 

+ 7 - 0
arch/powerpc/mm/hugetlbpage.c

@@ -1019,8 +1019,15 @@ int gup_hugepte(pte_t *ptep, unsigned long sz, unsigned long addr,
 
 	pte = READ_ONCE(*ptep);
 	mask = _PAGE_PRESENT | _PAGE_READ;
+
+	/*
+	 * On some CPUs like the 8xx, _PAGE_RW hence _PAGE_WRITE is defined
+	 * as 0 and _PAGE_RO has to be set when a page is not writable
+	 */
 	if (write)
 		mask |= _PAGE_WRITE;
+	else
+		mask |= _PAGE_RO;
 
 	if ((pte_val(pte) & mask) != mask)
 		return 0;

+ 1 - 1
arch/powerpc/mm/init_32.c

@@ -137,7 +137,7 @@ void __init MMU_init(void)
 	if (memblock.memory.cnt > 1) {
 #ifndef CONFIG_WII
 		memblock_enforce_memory_limit(memblock.memory.regions[0].size);
-		printk(KERN_WARNING "Only using first contiguous memory region");
+		pr_warn("Only using first contiguous memory region\n");
 #else
 		wii_memory_fixups();
 #endif

+ 77 - 4
arch/powerpc/mm/mmu_context_iommu.c

@@ -15,6 +15,9 @@
 #include <linux/rculist.h>
 #include <linux/vmalloc.h>
 #include <linux/mutex.h>
+#include <linux/migrate.h>
+#include <linux/hugetlb.h>
+#include <linux/swap.h>
 #include <asm/mmu_context.h>
 
 static DEFINE_MUTEX(mem_list_mutex);
@@ -72,6 +75,55 @@ bool mm_iommu_preregistered(void)
 }
 EXPORT_SYMBOL_GPL(mm_iommu_preregistered);
 
+/*
+ * Taken from alloc_migrate_target with changes to remove CMA allocations
+ */
+struct page *new_iommu_non_cma_page(struct page *page, unsigned long private,
+					int **resultp)
+{
+	gfp_t gfp_mask = GFP_USER;
+	struct page *new_page;
+
+	if (PageHuge(page) || PageTransHuge(page) || PageCompound(page))
+		return NULL;
+
+	if (PageHighMem(page))
+		gfp_mask |= __GFP_HIGHMEM;
+
+	/*
+	 * We don't want the allocation to force an OOM if possible
+	 */
+	new_page = alloc_page(gfp_mask | __GFP_NORETRY | __GFP_NOWARN);
+	return new_page;
+}
+
+static int mm_iommu_move_page_from_cma(struct page *page)
+{
+	int ret = 0;
+	LIST_HEAD(cma_migrate_pages);
+
+	/* Ignore huge pages for now */
+	if (PageHuge(page) || PageTransHuge(page) || PageCompound(page))
+		return -EBUSY;
+
+	lru_add_drain();
+	ret = isolate_lru_page(page);
+	if (ret)
+		return ret;
+
+	list_add(&page->lru, &cma_migrate_pages);
+	put_page(page); /* Drop the gup reference */
+
+	ret = migrate_pages(&cma_migrate_pages, new_iommu_non_cma_page,
+				NULL, 0, MIGRATE_SYNC, MR_CMA);
+	if (ret) {
+		if (!list_empty(&cma_migrate_pages))
+			putback_movable_pages(&cma_migrate_pages);
+	}
+
+	return 0;
+}
+
 long mm_iommu_get(unsigned long ua, unsigned long entries,
 		struct mm_iommu_table_group_mem_t **pmem)
 {
@@ -124,15 +176,36 @@ long mm_iommu_get(unsigned long ua, unsigned long entries,
 	for (i = 0; i < entries; ++i) {
 		if (1 != get_user_pages_fast(ua + (i << PAGE_SHIFT),
 					1/* pages */, 1/* iswrite */, &page)) {
+			ret = -EFAULT;
 			for (j = 0; j < i; ++j)
-				put_page(pfn_to_page(
-						mem->hpas[j] >> PAGE_SHIFT));
+				put_page(pfn_to_page(mem->hpas[j] >>
+						PAGE_SHIFT));
 			vfree(mem->hpas);
 			kfree(mem);
-			ret = -EFAULT;
 			goto unlock_exit;
 		}
-
+		/*
+		 * If we get a page from the CMA zone, since we are going to
+		 * be pinning these entries, we might as well move them out
+		 * of the CMA zone if possible. NOTE: faulting in + migration
+		 * can be expensive. Batching can be considered later
+		 */
+		if (get_pageblock_migratetype(page) == MIGRATE_CMA) {
+			if (mm_iommu_move_page_from_cma(page))
+				goto populate;
+			if (1 != get_user_pages_fast(ua + (i << PAGE_SHIFT),
+						1/* pages */, 1/* iswrite */,
+						&page)) {
+				ret = -EFAULT;
+				for (j = 0; j < i; ++j)
+					put_page(pfn_to_page(mem->hpas[j] >>
+								PAGE_SHIFT));
+				vfree(mem->hpas);
+				kfree(mem);
+				goto unlock_exit;
+			}
+		}
+populate:
 		mem->hpas[i] = page_to_pfn(page) << PAGE_SHIFT;
 	}
 

+ 10 - 1
arch/powerpc/mm/pgtable-book3s64.c

@@ -35,7 +35,7 @@ int pmdp_set_access_flags(struct vm_area_struct *vma, unsigned long address,
 #endif
 	changed = !pmd_same(*(pmdp), entry);
 	if (changed) {
-		__ptep_set_access_flags(pmdp_ptep(pmdp), pmd_pte(entry));
+		__ptep_set_access_flags(vma->vm_mm, pmdp_ptep(pmdp), pmd_pte(entry));
 		flush_pmd_tlb_range(vma, address, address + HPAGE_PMD_SIZE);
 	}
 	return changed;
@@ -116,3 +116,12 @@ void update_mmu_cache_pmd(struct vm_area_struct *vma, unsigned long addr,
 	return;
 }
 #endif /* CONFIG_TRANSPARENT_HUGEPAGE */
+
+/* For use by kexec */
+void mmu_cleanup_all(void)
+{
+	if (radix_enabled())
+		radix__mmu_cleanup_all();
+	else if (mmu_hash_ops.hpte_clear_all)
+		mmu_hash_ops.hpte_clear_all();
+}

+ 40 - 0
arch/powerpc/mm/pgtable-radix.c

@@ -294,6 +294,32 @@ found:
 	return;
 }
 
+static void update_hid_for_radix(void)
+{
+	unsigned long hid0;
+	unsigned long rb = 3UL << PPC_BITLSHIFT(53); /* IS = 3 */
+
+	asm volatile("ptesync": : :"memory");
+	/* prs = 0, ric = 2, rs = 0, r = 1 is = 3 */
+	asm volatile(PPC_TLBIE_5(%0, %4, %3, %2, %1)
+		     : : "r"(rb), "i"(1), "i"(0), "i"(2), "r"(0) : "memory");
+	/* prs = 1, ric = 2, rs = 0, r = 1 is = 3 */
+	asm volatile(PPC_TLBIE_5(%0, %4, %3, %2, %1)
+		     : : "r"(rb), "i"(1), "i"(1), "i"(2), "r"(0) : "memory");
+	asm volatile("eieio; tlbsync; ptesync; isync; slbia": : :"memory");
+	/*
+	 * now switch the HID
+	 */
+	hid0  = mfspr(SPRN_HID0);
+	hid0 |= HID0_POWER9_RADIX;
+	mtspr(SPRN_HID0, hid0);
+	asm volatile("isync": : :"memory");
+
+	/* Wait for it to happen */
+	while (!(mfspr(SPRN_HID0) & HID0_POWER9_RADIX))
+		cpu_relax();
+}
+
 void __init radix__early_init_mmu(void)
 {
 	unsigned long lpcr;
@@ -345,6 +371,8 @@ void __init radix__early_init_mmu(void)
 
 	if (!firmware_has_feature(FW_FEATURE_LPAR)) {
 		radix_init_native();
+		if (cpu_has_feature(CPU_FTR_POWER9_DD1))
+			update_hid_for_radix();
 		lpcr = mfspr(SPRN_LPCR);
 		mtspr(SPRN_LPCR, lpcr | LPCR_UPRT | LPCR_HR);
 		radix_init_partition_table();
@@ -368,6 +396,18 @@ void radix__early_init_mmu_secondary(void)
 	}
 }
 
+void radix__mmu_cleanup_all(void)
+{
+	unsigned long lpcr;
+
+	if (!firmware_has_feature(FW_FEATURE_LPAR)) {
+		lpcr = mfspr(SPRN_LPCR);
+		mtspr(SPRN_LPCR, lpcr & ~LPCR_UPRT);
+		mtspr(SPRN_PTCR, 0);
+		radix__flush_tlb_all();
+	}
+}
+
 void radix__setup_initial_memory_limit(phys_addr_t first_memblock_base,
 				phys_addr_t first_memblock_size)
 {

+ 1 - 1
arch/powerpc/mm/pgtable.c

@@ -224,7 +224,7 @@ int ptep_set_access_flags(struct vm_area_struct *vma, unsigned long address,
 	if (changed) {
 		if (!is_vm_hugetlb_page(vma))
 			assert_pte_locked(vma->vm_mm, address);
-		__ptep_set_access_flags(ptep, entry);
+		__ptep_set_access_flags(vma->vm_mm, ptep, entry);
 		flush_tlb_page(vma, address);
 	}
 	return changed;

+ 3 - 5
arch/powerpc/mm/slb_low.S

@@ -178,11 +178,9 @@ BEGIN_FTR_SECTION
 END_MMU_FTR_SECTION_IFSET(MMU_FTR_1T_SEGMENT)
 	b	slb_finish_load
 
-8:	/* invalid EA */
-	li	r10,0			/* BAD_VSID */
-	li	r9,0			/* BAD_VSID */
-	li	r11,SLB_VSID_USER	/* flags don't much matter */
-	b	slb_finish_load
+8:	/* invalid EA - return an error indication */
+	crset	4*cr0+eq		/* indicate failure */
+	blr
 
 /*
  * Finish loading of an SLB entry and return

+ 24 - 0
arch/powerpc/mm/tlb-radix.c

@@ -400,3 +400,27 @@ void radix__flush_pmd_tlb_range(struct vm_area_struct *vma,
 	radix__flush_tlb_range_psize(vma->vm_mm, start, end, MMU_PAGE_2M);
 }
 EXPORT_SYMBOL(radix__flush_pmd_tlb_range);
+
+void radix__flush_tlb_all(void)
+{
+	unsigned long rb,prs,r,rs;
+	unsigned long ric = RIC_FLUSH_ALL;
+
+	rb = 0x3 << PPC_BITLSHIFT(53); /* IS = 3 */
+	prs = 0; /* partition scoped */
+	r = 1;   /* radix format */
+	rs = 1 & ((1UL << 32) - 1); /* any LPID value to flush guest mappings */
+
+	asm volatile("ptesync": : :"memory");
+	/*
+	 * now flush guest entries by passing PRS = 1 and LPID != 0
+	 */
+	asm volatile(PPC_TLBIE_5(%0, %4, %3, %2, %1)
+		     : : "r"(rb), "i"(r), "i"(1), "i"(ric), "r"(rs) : "memory");
+	/*
+	 * now flush host entries by passing PRS = 0 and LPID == 0
+	 */
+	asm volatile(PPC_TLBIE_5(%0, %4, %3, %2, %1)
+		     : : "r"(rb), "i"(r), "i"(prs), "i"(ric), "r"(0) : "memory");
+	asm volatile("eieio; tlbsync; ptesync": : :"memory");
+}

+ 2 - 0
arch/powerpc/net/bpf_jit.h

@@ -40,6 +40,8 @@
 #define PPC_BLR()		EMIT(PPC_INST_BLR)
 #define PPC_BLRL()		EMIT(PPC_INST_BLRL)
 #define PPC_MTLR(r)		EMIT(PPC_INST_MTLR | ___PPC_RT(r))
+#define PPC_BCTR()		EMIT(PPC_INST_BCTR)
+#define PPC_MTCTR(r)		EMIT(PPC_INST_MTCTR | ___PPC_RT(r))
 #define PPC_ADDI(d, a, i)	EMIT(PPC_INST_ADDI | ___PPC_RT(d) |	      \
 				     ___PPC_RA(a) | IMM_L(i))
 #define PPC_MR(d, a)		PPC_OR(d, a, a)

Some files were not shown because too many files changed in this diff