
Merge branch 'for-linus' of git://ftp.arm.linux.org.uk/~rmk/linux-arm

Pull ARM updates from Russell King:
 "Included in this update:

   - perf updates from Will Deacon:

     The main changes are callchain stability fixes from Jean Pihet and
     event mapping and PMU name rework from Mark Rutland

     The latter is preparatory work for enabling some code re-use with
     arm64 in the future.

   - updates for nommu from Uwe Kleine-König:

     Two different fixes for the same problem that has kept some ARM
     nommu configurations from booting since 3.6-rc1: user_addr_max
     returned the biggest available RAM address, which made some
     copy_from_user variants fail to read from XIP memory.

   - deprecate legacy OMAP DMA API, in preparation for its removal.

     The popular drivers have been converted over, leaving a very small
     number of rarely used drivers, which hopefully can be converted
     during the next cycle with a bit more visibility (and hopefully
     people popping out of the woodwork to help test)

   - more tweaks for BE systems, particularly with the kernel image
     format.  In connection with this, I've cleaned up the way we
     generate the linker script for the decompressor.

   - removal of hard-coded assumptions of the kernel stack size, making
     everywhere depend on the value of THREAD_SIZE_ORDER.

   - MCPM updates from Nicolas Pitre.

   - Make it easier to perform proper CPU part number checks (which
     should always include the vendor field).

   - Assembly code optimisation - use the "bx" instruction when
     returning from a function on ARMv6+ rather than "mov pc, reg".

   - Save the last kernel misaligned fault location and report it via
     the procfs alignment file.

   - Clean up the way we create the initial stack frame, which is a
     repeated pattern in several different locations.

   - Support for 8-byte get_user(), needed for some DRM implementations
     (a usage sketch follows this changelog).

   - MCS locking from Will Deacon.

   - Save and restore a few more Cortex-A9 registers (for errata
     workarounds)

   - Fix various aspects of the SWP emulation, and the ELF hwcap for the
     SWP instruction.

   - Update LPAE logic for pte_write and pmd_write to make it more
     correct.

   - Support for Broadcom Brahma15 CPU cores.

   - ARM assembly crypto updates from Ard Biesheuvel"
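
A usage sketch for the 8-byte get_user() item above: with the 64-bit case
wired into getuser.S and uaccess.h, a single get_user() call can fetch a
u64 from user space. The helper below is hypothetical (not part of this
merge) and assumes only the standard uaccess API:

  #include <linux/types.h>
  #include <linux/uaccess.h>	/* get_user() */

  /*
   * Hypothetical example: read one 64-bit value from a user pointer.
   * With 8-byte get_user() support, the same call pattern now works
   * for u64 just as it already did for u8/u16/u32.
   */
  static int read_user_u64(const u64 __user *uptr, u64 *out)
  {
  	u64 val;

  	if (get_user(val, uptr))	/* non-zero means -EFAULT */
  		return -EFAULT;

  	*out = val;
  	return 0;
  }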

* 'for-linus' of git://ftp.arm.linux.org.uk/~rmk/linux-arm: (53 commits)
  ARM: add comments to the early page table remap code
  ARM: 8122/1: smp_scu: enable SCU standby support
  ARM: 8121/1: smp_scu: use macro for SCU enable bit
  ARM: 8120/1: crypto: sha512: add ARM NEON implementation
  ARM: 8119/1: crypto: sha1: add ARM NEON implementation
  ARM: 8118/1: crypto: sha1/make use of common SHA-1 structures
  ARM: 8113/1: remove remaining definitions of PLAT_PHYS_OFFSET from <mach/memory.h>
  ARM: 8111/1: Enable erratum 798181 for Broadcom Brahma-B15
  ARM: 8110/1: do CPU-specific init for Broadcom Brahma15 cores
  ARM: 8109/1: mm: Modify pte_write and pmd_write logic for LPAE
  ARM: 8108/1: mm: Introduce {pte,pmd}_isset and {pte,pmd}_isclear
  ARM: hwcap: disable HWCAP_SWP if the CPU advertises it has exclusives
  ARM: SWP emulation: only initialise on ARMv7 CPUs
  ARM: SWP emulation: always enable when SMP is enabled
  ARM: 8103/1: save/restore Cortex-A9 CP15 registers on suspend/resume
  ARM: 8098/1: mcs lock: implement wfe-based polling for MCS locking
  ARM: 8091/2: add get_user() support for 8 byte types
  ARM: 8097/1: unistd.h: relocate comments back to place
  ARM: 8096/1: Describe required sort order for textofs-y (TEXT_OFFSET)
  ARM: 8090/1: add revision info for PL310 errata 588369 and 727915
  ...
Linus Torvalds, 11 years ago (commit c489d98c8c)
100 changed files with 2583 additions and 1489 deletions
  1. arch/arm/Kconfig (+15, -2)
  2. arch/arm/Makefile (+3, -0)
  3. arch/arm/boot/compressed/Makefile (+1, -4)
  4. arch/arm/boot/compressed/head.S (+5, -3)
  5. arch/arm/boot/compressed/vmlinux.lds.S (+14, -3)
  6. arch/arm/common/mcpm_entry.c (+52, -0)
  7. arch/arm/crypto/Makefile (+4, -0)
  8. arch/arm/crypto/aes-armv4.S (+2, -1)
  9. arch/arm/crypto/sha1-armv7-neon.S (+634, -0)
  10. arch/arm/crypto/sha1_glue.c (+27, -31)
  11. arch/arm/crypto/sha1_neon_glue.c (+197, -0)
  12. arch/arm/crypto/sha512-armv7-neon.S (+455, -0)
  13. arch/arm/crypto/sha512_neon_glue.c (+305, -0)
  14. arch/arm/include/asm/assembler.h (+26, -3)
  15. arch/arm/include/asm/cputype.h (+24, -13)
  16. arch/arm/include/asm/crypto/sha1.h (+10, -0)
  17. arch/arm/include/asm/entry-macro-multi.S (+1, -1)
  18. arch/arm/include/asm/glue-proc.h (+9, -9)
  19. arch/arm/include/asm/mcpm.h (+16, -0)
  20. arch/arm/include/asm/mcs_spinlock.h (+23, -0)
  21. arch/arm/include/asm/memory.h (+3, -7)
  22. arch/arm/include/asm/perf_event.h (+0, -9)
  23. arch/arm/include/asm/pgtable-3level-hwdef.h (+2, -1)
  24. arch/arm/include/asm/pgtable-3level.h (+30, -19)
  25. arch/arm/include/asm/pgtable.h (+11, -7)
  26. arch/arm/include/asm/pmu.h (+19, -0)
  27. arch/arm/include/asm/ptrace.h (+6, -0)
  28. arch/arm/include/asm/smp_scu.h (+1, -1)
  29. arch/arm/include/asm/stacktrace.h (+15, -0)
  30. arch/arm/include/asm/thread_info.h (+2, -1)
  31. arch/arm/include/asm/uaccess.h (+20, -2)
  32. arch/arm/include/asm/unistd.h (+10, -0)
  33. arch/arm/include/uapi/asm/unistd.h (+0, -11)
  34. arch/arm/kernel/debug.S (+5, -5)
  35. arch/arm/kernel/entry-armv.S (+21, -21)
  36. arch/arm/kernel/entry-common.S (+7, -6)
  37. arch/arm/kernel/entry-header.S (+0, -14)
  38. arch/arm/kernel/fiqasm.S (+2, -2)
  39. arch/arm/kernel/head-common.S (+4, -3)
  40. arch/arm/kernel/head-nommu.S (+4, -4)
  41. arch/arm/kernel/head.S (+9, -9)
  42. arch/arm/kernel/hyp-stub.S (+3, -3)
  43. arch/arm/kernel/iwmmxt.S (+8, -8)
  44. arch/arm/kernel/perf_event.c (+12, -6)
  45. arch/arm/kernel/perf_event_cpu.c (+36, -30)
  46. arch/arm/kernel/perf_event_v6.c (+79, -228)
  47. arch/arm/kernel/perf_event_v7.c (+216, -751)
  48. arch/arm/kernel/perf_event_xscale.c (+17, -104)
  49. arch/arm/kernel/relocate_kernel.S (+2, -1)
  50. arch/arm/kernel/setup.c (+22, -7)
  51. arch/arm/kernel/sleep.S (+1, -1)
  52. arch/arm/kernel/smp_scu.c (+10, -2)
  53. arch/arm/kernel/smp_tlb.c (+12, -8)
  54. arch/arm/kernel/swp_emulate.c (+4, -0)
  55. arch/arm/kernel/time.c (+1, -4)
  56. arch/arm/kernel/traps.c (+4, -2)
  57. arch/arm/kernel/unwind.c (+3, -5)
  58. arch/arm/kernel/vmlinux.lds.S (+0, -1)
  59. arch/arm/kvm/guest.c (+1, -7)
  60. arch/arm/kvm/init.S (+2, -1)
  61. arch/arm/lib/ashldi3.S (+2, -1)
  62. arch/arm/lib/ashrdi3.S (+2, -1)
  63. arch/arm/lib/backtrace.S (+1, -1)
  64. arch/arm/lib/bitops.h (+3, -2)
  65. arch/arm/lib/bswapsdi2.S (+3, -2)
  66. arch/arm/lib/call_with_stack.S (+2, -2)
  67. arch/arm/lib/csumpartial.S (+1, -1)
  68. arch/arm/lib/csumpartialcopygeneric.S (+3, -2)
  69. arch/arm/lib/delay-loop.S (+9, -9)
  70. arch/arm/lib/div64.S (+7, -6)
  71. arch/arm/lib/findbit.S (+5, -5)
  72. arch/arm/lib/getuser.S (+40, -5)
  73. arch/arm/lib/io-readsb.S (+1, -1)
  74. arch/arm/lib/io-readsl.S (+3, -3)
  75. arch/arm/lib/io-readsw-armv3.S (+2, -2)
  76. arch/arm/lib/io-readsw-armv4.S (+1, -1)
  77. arch/arm/lib/io-writesb.S (+1, -1)
  78. arch/arm/lib/io-writesl.S (+5, -5)
  79. arch/arm/lib/io-writesw-armv3.S (+2, -2)
  80. arch/arm/lib/io-writesw-armv4.S (+2, -2)
  81. arch/arm/lib/lib1funcs.S (+13, -13)
  82. arch/arm/lib/lshrdi3.S (+2, -1)
  83. arch/arm/lib/memchr.S (+1, -1)
  84. arch/arm/lib/memset.S (+1, -1)
  85. arch/arm/lib/memzero.S (+1, -1)
  86. arch/arm/lib/muldi3.S (+2, -1)
  87. arch/arm/lib/putuser.S (+5, -5)
  88. arch/arm/lib/strchr.S (+1, -1)
  89. arch/arm/lib/strrchr.S (+1, -1)
  90. arch/arm/lib/ucmpdi2.S (+3, -2)
  91. arch/arm/mach-davinci/sleep.S (+1, -1)
  92. arch/arm/mach-ebsa110/include/mach/memory.h (+0, -5)
  93. arch/arm/mach-ep93xx/crunch-bits.S (+3, -3)
  94. arch/arm/mach-ep93xx/include/mach/memory.h (+0, -22)
  95. arch/arm/mach-exynos/Kconfig (+1, -0)
  96. arch/arm/mach-exynos/mcpm-exynos.c (+16, -1)
  97. arch/arm/mach-exynos/platsmp.c (+2, -2)
  98. arch/arm/mach-exynos/pm.c (+5, -6)
  99. arch/arm/mach-footbridge/include/mach/memory.h (+0, -5)
  100. arch/arm/mach-imx/suspend-imx6.S (+3, -2)

arch/arm/Kconfig (+15, -2)

@@ -263,8 +263,22 @@ config NEED_MACH_MEMORY_H
 
 config PHYS_OFFSET
 	hex "Physical address of main memory" if MMU
-	depends on !ARM_PATCH_PHYS_VIRT && !NEED_MACH_MEMORY_H
+	depends on !ARM_PATCH_PHYS_VIRT
 	default DRAM_BASE if !MMU
+	default 0x00000000 if ARCH_EBSA110 || \
+			EP93XX_SDCE3_SYNC_PHYS_OFFSET || \
+			ARCH_FOOTBRIDGE || \
+			ARCH_INTEGRATOR || \
+			ARCH_IOP13XX || \
+			ARCH_KS8695 || \
+			(ARCH_REALVIEW && !REALVIEW_HIGH_PHYS_OFFSET)
+	default 0x10000000 if ARCH_OMAP1 || ARCH_RPC
+	default 0x20000000 if ARCH_S5PV210
+	default 0x70000000 if REALVIEW_HIGH_PHYS_OFFSET
+	default 0xc0000000 if EP93XX_SDCE0_PHYS_OFFSET || ARCH_SA1100
+	default 0xd0000000 if EP93XX_SDCE1_PHYS_OFFSET
+	default 0xe0000000 if EP93XX_SDCE2_PHYS_OFFSET
+	default 0xf0000000 if EP93XX_SDCE3_ASYNC_PHYS_OFFSET
 	help
 	  Please provide the physical address corresponding to the
 	  location of main memory in your system.
@@ -436,7 +450,6 @@ config ARCH_EP93XX
 	select ARM_VIC
 	select CLKDEV_LOOKUP
 	select CPU_ARM920T
-	select NEED_MACH_MEMORY_H
 	help
 	  This enables support for the Cirrus EP93xx series of CPUs.
 

arch/arm/Makefile (+3, -0)

@@ -127,6 +127,9 @@ CHECKFLAGS	+= -D__arm__
 
 #Default value
 head-y		:= arch/arm/kernel/head$(MMUEXT).o
+
+# Text offset. This list is sorted numerically by address in order to
+# provide a means to avoid/resolve conflicts in multi-arch kernels.
 textofs-y	:= 0x00008000
 textofs-$(CONFIG_ARCH_CLPS711X) := 0x00028000
 # We don't want the htc bootloader to corrupt kernel during resume

arch/arm/boot/compressed/Makefile (+1, -4)

@@ -81,7 +81,7 @@ ZTEXTADDR	:= 0
 ZBSSADDR	:= ALIGN(8)
 endif
 
-SEDFLAGS	= s/TEXT_START/$(ZTEXTADDR)/;s/BSS_START/$(ZBSSADDR)/
+CPPFLAGS_vmlinux.lds := -DTEXT_START="$(ZTEXTADDR)" -DBSS_START="$(ZBSSADDR)"
 
 suffix_$(CONFIG_KERNEL_GZIP) = gzip
 suffix_$(CONFIG_KERNEL_LZO)  = lzo
@@ -199,8 +199,5 @@ CFLAGS_font.o := -Dstatic=
 $(obj)/font.c: $(FONTC)
 	$(call cmd,shipped)
 
-$(obj)/vmlinux.lds: $(obj)/vmlinux.lds.in arch/arm/boot/Makefile $(KCONFIG_CONFIG)
-	@sed "$(SEDFLAGS)" < $< > $@
-
 $(obj)/hyp-stub.S: $(srctree)/arch/$(SRCARCH)/kernel/hyp-stub.S
 	$(call cmd,shipped)

arch/arm/boot/compressed/head.S (+5, -3)

@@ -125,9 +125,11 @@ start:
 THUMB(		adr	r12, BSYM(1f)	)
 THUMB(		bx	r12		)
 
-		.word	0x016f2818		@ Magic numbers to help the loader
-		.word	start			@ absolute load/run zImage address
-		.word	_edata			@ zImage end address
+		.word	_magic_sig	@ Magic numbers to help the loader
+		.word	_magic_start	@ absolute load/run zImage address
+		.word	_magic_end	@ zImage end address
+		.word	0x04030201	@ endianness flag
+
 THUMB(		.thumb			)
 1:
 ARM_BE8(	setend	be )			@ go BE8 if compiled for BE8

arch/arm/boot/compressed/vmlinux.lds.in → arch/arm/boot/compressed/vmlinux.lds.S (+14, -3)

@@ -1,12 +1,20 @@
 /*
- *  linux/arch/arm/boot/compressed/vmlinux.lds.in
- *
  *  Copyright (C) 2000 Russell King
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License version 2 as
  * published by the Free Software Foundation.
  */
+
+#ifdef CONFIG_CPU_ENDIAN_BE8
+#define ZIMAGE_MAGIC(x) ( (((x) >> 24) & 0x000000ff) | \
+			  (((x) >>  8) & 0x0000ff00) | \
+			  (((x) <<  8) & 0x00ff0000) | \
+			  (((x) << 24) & 0xff000000) )
+#else
+#define ZIMAGE_MAGIC(x) (x)
+#endif
+
 OUTPUT_ARCH(arm)
 ENTRY(_start)
 SECTIONS
@@ -57,6 +65,10 @@ SECTIONS
   .pad			: { BYTE(0); . = ALIGN(8); }
   _edata = .;
 
+  _magic_sig = ZIMAGE_MAGIC(0x016f2818);
+  _magic_start = ZIMAGE_MAGIC(_start);
+  _magic_end = ZIMAGE_MAGIC(_edata);
+
   . = BSS_START;
   __bss_start = .;
   .bss			: { *(.bss) }
@@ -73,4 +85,3 @@ SECTIONS
   .stab.indexstr 0	: { *(.stab.indexstr) }
   .comment 0		: { *(.comment) }
 }
-

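A quick check of the ZIMAGE_MAGIC() byte swap above: on a BE8 kernel the
zImage header words are swapped at link time, so the bytes stored in the
image match a little-endian build, which is what boot loaders expect. A
standalone user-space sketch of the same computation (test harness only,
not kernel code):

  #include <assert.h>
  #include <stdint.h>

  /* Same 32-bit byte swap as the ZIMAGE_MAGIC() macro above. */
  static uint32_t zimage_magic_be8(uint32_t x)
  {
  	return ((x >> 24) & 0x000000ff) |
  	       ((x >>  8) & 0x0000ff00) |
  	       ((x <<  8) & 0x00ff0000) |
  	       ((x << 24) & 0xff000000);
  }

  int main(void)
  {
  	/* 0x016f2818 stored byte-reversed: 01 6f 28 18 -> 18 28 6f 01 */
  	assert(zimage_magic_be8(0x016f2818) == 0x18286f01);
  	return 0;
  }
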
arch/arm/common/mcpm_entry.c (+52, -0)

@@ -12,11 +12,13 @@
 #include <linux/kernel.h>
 #include <linux/init.h>
 #include <linux/irqflags.h>
+#include <linux/cpu_pm.h>
 
 #include <asm/mcpm.h>
 #include <asm/cacheflush.h>
 #include <asm/idmap.h>
 #include <asm/cputype.h>
+#include <asm/suspend.h>
 
 extern unsigned long mcpm_entry_vectors[MAX_NR_CLUSTERS][MAX_CPUS_PER_CLUSTER];
 
@@ -146,6 +148,56 @@ int mcpm_cpu_powered_up(void)
 	return 0;
 }
 
+#ifdef CONFIG_ARM_CPU_SUSPEND
+
+static int __init nocache_trampoline(unsigned long _arg)
+{
+	void (*cache_disable)(void) = (void *)_arg;
+	unsigned int mpidr = read_cpuid_mpidr();
+	unsigned int cpu = MPIDR_AFFINITY_LEVEL(mpidr, 0);
+	unsigned int cluster = MPIDR_AFFINITY_LEVEL(mpidr, 1);
+	phys_reset_t phys_reset;
+
+	mcpm_set_entry_vector(cpu, cluster, cpu_resume);
+	setup_mm_for_reboot();
+
+	__mcpm_cpu_going_down(cpu, cluster);
+	BUG_ON(!__mcpm_outbound_enter_critical(cpu, cluster));
+	cache_disable();
+	__mcpm_outbound_leave_critical(cluster, CLUSTER_DOWN);
+	__mcpm_cpu_down(cpu, cluster);
+
+	phys_reset = (phys_reset_t)(unsigned long)virt_to_phys(cpu_reset);
+	phys_reset(virt_to_phys(mcpm_entry_point));
+	BUG();
+}
+
+int __init mcpm_loopback(void (*cache_disable)(void))
+{
+	int ret;
+
+	/*
+	 * We're going to soft-restart the current CPU through the
+	 * low-level MCPM code by leveraging the suspend/resume
+	 * infrastructure. Let's play it safe by using cpu_pm_enter()
+	 * in case the CPU init code path resets the VFP or similar.
+	 */
+	local_irq_disable();
+	local_fiq_disable();
+	ret = cpu_pm_enter();
+	if (!ret) {
+		ret = cpu_suspend((unsigned long)cache_disable, nocache_trampoline);
+		cpu_pm_exit();
+	}
+	local_fiq_enable();
+	local_irq_enable();
+	if (ret)
+		pr_err("%s returned %d\n", __func__, ret);
+	return ret;
+}
+
+#endif
+
 struct sync_struct mcpm_sync;
 
 /*

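The new mcpm_loopback() above gives platform code a way to run the MCPM
power-down/power-up sequence once on the boot CPU, so that CPU-specific
init performed on the way back in through mcpm_entry_point also applies
to it. A hedged sketch of a caller (the my_plat_* names are illustrative;
only mcpm_loopback() and the v7_exit_coherency_flush() macro come from
the kernel):

  #include <linux/init.h>
  #include <asm/cacheflush.h>
  #include <asm/mcpm.h>

  /* Cache-disable callback: flush and disable the local L1 cache and
   * exit SMP coherency, as required before the soft restart. */
  static void my_plat_cache_disable(void)
  {
  	v7_exit_coherency_flush(louis);
  }

  static int __init my_plat_mcpm_init(void)
  {
  	/* ... register this platform's mcpm_platform_ops first ... */

  	/* Soft-restart the boot CPU through the MCPM entry point. */
  	return mcpm_loopback(my_plat_cache_disable);
  }
  early_initcall(my_plat_mcpm_init);
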
arch/arm/crypto/Makefile (+4, -0)

@@ -5,10 +5,14 @@
 obj-$(CONFIG_CRYPTO_AES_ARM) += aes-arm.o
 obj-$(CONFIG_CRYPTO_AES_ARM_BS) += aes-arm-bs.o
 obj-$(CONFIG_CRYPTO_SHA1_ARM) += sha1-arm.o
+obj-$(CONFIG_CRYPTO_SHA1_ARM_NEON) += sha1-arm-neon.o
+obj-$(CONFIG_CRYPTO_SHA512_ARM_NEON) += sha512-arm-neon.o
 
 aes-arm-y	:= aes-armv4.o aes_glue.o
 aes-arm-bs-y	:= aesbs-core.o aesbs-glue.o
 sha1-arm-y	:= sha1-armv4-large.o sha1_glue.o
+sha1-arm-neon-y	:= sha1-armv7-neon.o sha1_neon_glue.o
+sha512-arm-neon-y := sha512-armv7-neon.o sha512_neon_glue.o
 
 quiet_cmd_perl = PERL    $@
       cmd_perl = $(PERL) $(<) > $(@)

arch/arm/crypto/aes-armv4.S (+2, -1)

@@ -35,6 +35,7 @@
 @ that is being targetted.
 
 #include <linux/linkage.h>
+#include <asm/assembler.h>
 
 .text
 
@@ -648,7 +649,7 @@ _armv4_AES_set_encrypt_key:
 
 .Ldone:	mov	r0,#0
 	ldmia   sp!,{r4-r12,lr}
-.Labrt:	mov	pc,lr
+.Labrt:	ret	lr
 ENDPROC(private_AES_set_encrypt_key)
 
 .align	5

arch/arm/crypto/sha1-armv7-neon.S (+634, -0)

@@ -0,0 +1,634 @@
+/* sha1-armv7-neon.S - ARM/NEON accelerated SHA-1 transform function
+ *
+ * Copyright © 2013-2014 Jussi Kivilinna <jussi.kivilinna@iki.fi>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option)
+ * any later version.
+ */
+
+#include <linux/linkage.h>
+
+
+.syntax unified
+.code   32
+.fpu neon
+
+.text
+
+
+/* Context structure */
+
+#define state_h0 0
+#define state_h1 4
+#define state_h2 8
+#define state_h3 12
+#define state_h4 16
+
+
+/* Constants */
+
+#define K1  0x5A827999
+#define K2  0x6ED9EBA1
+#define K3  0x8F1BBCDC
+#define K4  0xCA62C1D6
+.align 4
+.LK_VEC:
+.LK1:	.long K1, K1, K1, K1
+.LK2:	.long K2, K2, K2, K2
+.LK3:	.long K3, K3, K3, K3
+.LK4:	.long K4, K4, K4, K4
+
+
+/* Register macros */
+
+#define RSTATE r0
+#define RDATA r1
+#define RNBLKS r2
+#define ROLDSTACK r3
+#define RWK lr
+
+#define _a r4
+#define _b r5
+#define _c r6
+#define _d r7
+#define _e r8
+
+#define RT0 r9
+#define RT1 r10
+#define RT2 r11
+#define RT3 r12
+
+#define W0 q0
+#define W1 q1
+#define W2 q2
+#define W3 q3
+#define W4 q4
+#define W5 q5
+#define W6 q6
+#define W7 q7
+
+#define tmp0 q8
+#define tmp1 q9
+#define tmp2 q10
+#define tmp3 q11
+
+#define qK1 q12
+#define qK2 q13
+#define qK3 q14
+#define qK4 q15
+
+
+/* Round function macros. */
+
+#define WK_offs(i) (((i) & 15) * 4)
+
+#define _R_F1(a,b,c,d,e,i,pre1,pre2,pre3,i16,\
+	      W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
+	ldr RT3, [sp, WK_offs(i)]; \
+		pre1(i16,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28); \
+	bic RT0, d, b; \
+	add e, e, a, ror #(32 - 5); \
+	and RT1, c, b; \
+		pre2(i16,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28); \
+	add RT0, RT0, RT3; \
+	add e, e, RT1; \
+	ror b, #(32 - 30); \
+		pre3(i16,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28); \
+	add e, e, RT0;
+
+#define _R_F2(a,b,c,d,e,i,pre1,pre2,pre3,i16,\
+	      W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
+	ldr RT3, [sp, WK_offs(i)]; \
+		pre1(i16,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28); \
+	eor RT0, d, b; \
+	add e, e, a, ror #(32 - 5); \
+	eor RT0, RT0, c; \
+		pre2(i16,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28); \
+	add e, e, RT3; \
+	ror b, #(32 - 30); \
+		pre3(i16,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28); \
+	add e, e, RT0; \
+
+#define _R_F3(a,b,c,d,e,i,pre1,pre2,pre3,i16,\
+	      W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
+	ldr RT3, [sp, WK_offs(i)]; \
+		pre1(i16,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28); \
+	eor RT0, b, c; \
+	and RT1, b, c; \
+	add e, e, a, ror #(32 - 5); \
+		pre2(i16,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28); \
+	and RT0, RT0, d; \
+	add RT1, RT1, RT3; \
+	add e, e, RT0; \
+	ror b, #(32 - 30); \
+		pre3(i16,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28); \
+	add e, e, RT1;
+
+#define _R_F4(a,b,c,d,e,i,pre1,pre2,pre3,i16,\
+	      W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
+	_R_F2(a,b,c,d,e,i,pre1,pre2,pre3,i16,\
+	      W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28)
+
+#define _R(a,b,c,d,e,f,i,pre1,pre2,pre3,i16,\
+           W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
+	_R_##f(a,b,c,d,e,i,pre1,pre2,pre3,i16,\
+	       W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28)
+
+#define R(a,b,c,d,e,f,i) \
+	_R_##f(a,b,c,d,e,i,dummy,dummy,dummy,i16,\
+	       W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28)
+
+#define dummy(...)
+
+
+/* Input expansion macros. */
+
+/********* Precalc macros for rounds 0-15 *************************************/
+
+#define W_PRECALC_00_15() \
+	add       RWK, sp, #(WK_offs(0));			\
+	\
+	vld1.32   {tmp0, tmp1}, [RDATA]!;			\
+	vrev32.8  W0, tmp0;		/* big => little */	\
+	vld1.32   {tmp2, tmp3}, [RDATA]!;			\
+	vadd.u32  tmp0, W0, curK;				\
+	vrev32.8  W7, tmp1;		/* big => little */	\
+	vrev32.8  W6, tmp2;		/* big => little */	\
+	vadd.u32  tmp1, W7, curK;				\
+	vrev32.8  W5, tmp3;		/* big => little */	\
+	vadd.u32  tmp2, W6, curK;				\
+	vst1.32   {tmp0, tmp1}, [RWK]!;				\
+	vadd.u32  tmp3, W5, curK;				\
+	vst1.32   {tmp2, tmp3}, [RWK];				\
+
+#define WPRECALC_00_15_0(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
+	vld1.32   {tmp0, tmp1}, [RDATA]!;			\
+
+#define WPRECALC_00_15_1(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
+	add       RWK, sp, #(WK_offs(0));			\
+
+#define WPRECALC_00_15_2(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
+	vrev32.8  W0, tmp0;		/* big => little */	\
+
+#define WPRECALC_00_15_3(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
+	vld1.32   {tmp2, tmp3}, [RDATA]!;			\
+
+#define WPRECALC_00_15_4(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
+	vadd.u32  tmp0, W0, curK;				\
+
+#define WPRECALC_00_15_5(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
+	vrev32.8  W7, tmp1;		/* big => little */	\
+
+#define WPRECALC_00_15_6(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
+	vrev32.8  W6, tmp2;		/* big => little */	\
+
+#define WPRECALC_00_15_7(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
+	vadd.u32  tmp1, W7, curK;				\
+
+#define WPRECALC_00_15_8(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
+	vrev32.8  W5, tmp3;		/* big => little */	\
+
+#define WPRECALC_00_15_9(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
+	vadd.u32  tmp2, W6, curK;				\
+
+#define WPRECALC_00_15_10(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
+	vst1.32   {tmp0, tmp1}, [RWK]!;				\
+
+#define WPRECALC_00_15_11(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
+	vadd.u32  tmp3, W5, curK;				\
+
+#define WPRECALC_00_15_12(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
+	vst1.32   {tmp2, tmp3}, [RWK];				\
+
+
+/********* Precalc macros for rounds 16-31 ************************************/
+
+#define WPRECALC_16_31_0(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
+	veor      tmp0, tmp0;			\
+	vext.8    W, W_m16, W_m12, #8;		\
+
+#define WPRECALC_16_31_1(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
+	add       RWK, sp, #(WK_offs(i));	\
+	vext.8    tmp0, W_m04, tmp0, #4;	\
+
+#define WPRECALC_16_31_2(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
+	veor      tmp0, tmp0, W_m16;		\
+	veor.32   W, W, W_m08;			\
+
+#define WPRECALC_16_31_3(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
+	veor      tmp1, tmp1;			\
+	veor      W, W, tmp0;			\
+
+#define WPRECALC_16_31_4(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
+	vshl.u32  tmp0, W, #1;			\
+
+#define WPRECALC_16_31_5(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
+	vext.8    tmp1, tmp1, W, #(16-12);	\
+	vshr.u32  W, W, #31;			\
+
+#define WPRECALC_16_31_6(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
+	vorr      tmp0, tmp0, W;		\
+	vshr.u32  W, tmp1, #30;			\
+
+#define WPRECALC_16_31_7(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
+	vshl.u32  tmp1, tmp1, #2;		\
+
+#define WPRECALC_16_31_8(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
+	veor      tmp0, tmp0, W;		\
+
+#define WPRECALC_16_31_9(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
+	veor      W, tmp0, tmp1;		\
+
+#define WPRECALC_16_31_10(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
+	vadd.u32  tmp0, W, curK;		\
+
+#define WPRECALC_16_31_11(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
+	vst1.32   {tmp0}, [RWK];
+
+
+/********* Precalc macros for rounds 32-79 ************************************/
+
+#define WPRECALC_32_79_0(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
+	veor W, W_m28; \
+
+#define WPRECALC_32_79_1(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
+	vext.8 tmp0, W_m08, W_m04, #8; \
+
+#define WPRECALC_32_79_2(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
+	veor W, W_m16; \
+
+#define WPRECALC_32_79_3(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
+	veor W, tmp0; \
+
+#define WPRECALC_32_79_4(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
+	add RWK, sp, #(WK_offs(i&~3)); \
+
+#define WPRECALC_32_79_5(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
+	vshl.u32 tmp1, W, #2; \
+
+#define WPRECALC_32_79_6(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
+	vshr.u32 tmp0, W, #30; \
+
+#define WPRECALC_32_79_7(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
+	vorr W, tmp0, tmp1; \
+
+#define WPRECALC_32_79_8(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
+	vadd.u32 tmp0, W, curK; \
+
+#define WPRECALC_32_79_9(i,W,W_m04,W_m08,W_m12,W_m16,W_m20,W_m24,W_m28) \
+	vst1.32 {tmp0}, [RWK];
+
+
+/*
+ * Transform nblks*64 bytes (nblks*16 32-bit words) at DATA.
+ *
+ * unsigned int
+ * sha1_transform_neon (void *ctx, const unsigned char *data,
+ *                      unsigned int nblks)
+ */
+.align 3
+ENTRY(sha1_transform_neon)
+  /* input:
+   *	r0: ctx, CTX
+   *	r1: data (64*nblks bytes)
+   *	r2: nblks
+   */
+
+  cmp RNBLKS, #0;
+  beq .Ldo_nothing;
+
+  push {r4-r12, lr};
+  /*vpush {q4-q7};*/
+
+  adr RT3, .LK_VEC;
+
+  mov ROLDSTACK, sp;
+
+  /* Align stack. */
+  sub RT0, sp, #(16*4);
+  and RT0, #(~(16-1));
+  mov sp, RT0;
+
+  vld1.32 {qK1-qK2}, [RT3]!; /* Load K1,K2 */
+
+  /* Get the values of the chaining variables. */
+  ldm RSTATE, {_a-_e};
+
+  vld1.32 {qK3-qK4}, [RT3]; /* Load K3,K4 */
+
+#undef curK
+#define curK qK1
+  /* Precalc 0-15. */
+  W_PRECALC_00_15();
+
+.Loop:
+  /* Transform 0-15 + Precalc 16-31. */
+  _R( _a, _b, _c, _d, _e, F1,  0,
+      WPRECALC_16_31_0, WPRECALC_16_31_1, WPRECALC_16_31_2, 16,
+      W4, W5, W6, W7, W0, _, _, _ );
+  _R( _e, _a, _b, _c, _d, F1,  1,
+      WPRECALC_16_31_3, WPRECALC_16_31_4, WPRECALC_16_31_5, 16,
+      W4, W5, W6, W7, W0, _, _, _ );
+  _R( _d, _e, _a, _b, _c, F1,  2,
+      WPRECALC_16_31_6, WPRECALC_16_31_7, WPRECALC_16_31_8, 16,
+      W4, W5, W6, W7, W0, _, _, _ );
+  _R( _c, _d, _e, _a, _b, F1,  3,
+      WPRECALC_16_31_9, WPRECALC_16_31_10,WPRECALC_16_31_11,16,
+      W4, W5, W6, W7, W0, _, _, _ );
+
+#undef curK
+#define curK qK2
+  _R( _b, _c, _d, _e, _a, F1,  4,
+      WPRECALC_16_31_0, WPRECALC_16_31_1, WPRECALC_16_31_2, 20,
+      W3, W4, W5, W6, W7, _, _, _ );
+  _R( _a, _b, _c, _d, _e, F1,  5,
+      WPRECALC_16_31_3, WPRECALC_16_31_4, WPRECALC_16_31_5, 20,
+      W3, W4, W5, W6, W7, _, _, _ );
+  _R( _e, _a, _b, _c, _d, F1,  6,
+      WPRECALC_16_31_6, WPRECALC_16_31_7, WPRECALC_16_31_8, 20,
+      W3, W4, W5, W6, W7, _, _, _ );
+  _R( _d, _e, _a, _b, _c, F1,  7,
+      WPRECALC_16_31_9, WPRECALC_16_31_10,WPRECALC_16_31_11,20,
+      W3, W4, W5, W6, W7, _, _, _ );
+
+  _R( _c, _d, _e, _a, _b, F1,  8,
+      WPRECALC_16_31_0, WPRECALC_16_31_1, WPRECALC_16_31_2, 24,
+      W2, W3, W4, W5, W6, _, _, _ );
+  _R( _b, _c, _d, _e, _a, F1,  9,
+      WPRECALC_16_31_3, WPRECALC_16_31_4, WPRECALC_16_31_5, 24,
+      W2, W3, W4, W5, W6, _, _, _ );
+  _R( _a, _b, _c, _d, _e, F1, 10,
+      WPRECALC_16_31_6, WPRECALC_16_31_7, WPRECALC_16_31_8, 24,
+      W2, W3, W4, W5, W6, _, _, _ );
+  _R( _e, _a, _b, _c, _d, F1, 11,
+      WPRECALC_16_31_9, WPRECALC_16_31_10,WPRECALC_16_31_11,24,
+      W2, W3, W4, W5, W6, _, _, _ );
+
+  _R( _d, _e, _a, _b, _c, F1, 12,
+      WPRECALC_16_31_0, WPRECALC_16_31_1, WPRECALC_16_31_2, 28,
+      W1, W2, W3, W4, W5, _, _, _ );
+  _R( _c, _d, _e, _a, _b, F1, 13,
+      WPRECALC_16_31_3, WPRECALC_16_31_4, WPRECALC_16_31_5, 28,
+      W1, W2, W3, W4, W5, _, _, _ );
+  _R( _b, _c, _d, _e, _a, F1, 14,
+      WPRECALC_16_31_6, WPRECALC_16_31_7, WPRECALC_16_31_8, 28,
+      W1, W2, W3, W4, W5, _, _, _ );
+  _R( _a, _b, _c, _d, _e, F1, 15,
+      WPRECALC_16_31_9, WPRECALC_16_31_10,WPRECALC_16_31_11,28,
+      W1, W2, W3, W4, W5, _, _, _ );
+
+  /* Transform 16-63 + Precalc 32-79. */
+  _R( _e, _a, _b, _c, _d, F1, 16,
+      WPRECALC_32_79_0, WPRECALC_32_79_1, WPRECALC_32_79_2, 32,
+      W0, W1, W2, W3, W4, W5, W6, W7);
+  _R( _d, _e, _a, _b, _c, F1, 17,
+      WPRECALC_32_79_3, WPRECALC_32_79_4, WPRECALC_32_79_5, 32,
+      W0, W1, W2, W3, W4, W5, W6, W7);
+  _R( _c, _d, _e, _a, _b, F1, 18,
+      WPRECALC_32_79_6, dummy,            WPRECALC_32_79_7, 32,
+      W0, W1, W2, W3, W4, W5, W6, W7);
+  _R( _b, _c, _d, _e, _a, F1, 19,
+      WPRECALC_32_79_8, dummy,            WPRECALC_32_79_9, 32,
+      W0, W1, W2, W3, W4, W5, W6, W7);
+
+  _R( _a, _b, _c, _d, _e, F2, 20,
+      WPRECALC_32_79_0, WPRECALC_32_79_1, WPRECALC_32_79_2, 36,
+      W7, W0, W1, W2, W3, W4, W5, W6);
+  _R( _e, _a, _b, _c, _d, F2, 21,
+      WPRECALC_32_79_3, WPRECALC_32_79_4, WPRECALC_32_79_5, 36,
+      W7, W0, W1, W2, W3, W4, W5, W6);
+  _R( _d, _e, _a, _b, _c, F2, 22,
+      WPRECALC_32_79_6, dummy,            WPRECALC_32_79_7, 36,
+      W7, W0, W1, W2, W3, W4, W5, W6);
+  _R( _c, _d, _e, _a, _b, F2, 23,
+      WPRECALC_32_79_8, dummy,            WPRECALC_32_79_9, 36,
+      W7, W0, W1, W2, W3, W4, W5, W6);
+
+#undef curK
+#define curK qK3
+  _R( _b, _c, _d, _e, _a, F2, 24,
+      WPRECALC_32_79_0, WPRECALC_32_79_1, WPRECALC_32_79_2, 40,
+      W6, W7, W0, W1, W2, W3, W4, W5);
+  _R( _a, _b, _c, _d, _e, F2, 25,
+      WPRECALC_32_79_3, WPRECALC_32_79_4, WPRECALC_32_79_5, 40,
+      W6, W7, W0, W1, W2, W3, W4, W5);
+  _R( _e, _a, _b, _c, _d, F2, 26,
+      WPRECALC_32_79_6, dummy,            WPRECALC_32_79_7, 40,
+      W6, W7, W0, W1, W2, W3, W4, W5);
+  _R( _d, _e, _a, _b, _c, F2, 27,
+      WPRECALC_32_79_8, dummy,            WPRECALC_32_79_9, 40,
+      W6, W7, W0, W1, W2, W3, W4, W5);
+
+  _R( _c, _d, _e, _a, _b, F2, 28,
+      WPRECALC_32_79_0, WPRECALC_32_79_1, WPRECALC_32_79_2, 44,
+      W5, W6, W7, W0, W1, W2, W3, W4);
+  _R( _b, _c, _d, _e, _a, F2, 29,
+      WPRECALC_32_79_3, WPRECALC_32_79_4, WPRECALC_32_79_5, 44,
+      W5, W6, W7, W0, W1, W2, W3, W4);
+  _R( _a, _b, _c, _d, _e, F2, 30,
+      WPRECALC_32_79_6, dummy,            WPRECALC_32_79_7, 44,
+      W5, W6, W7, W0, W1, W2, W3, W4);
+  _R( _e, _a, _b, _c, _d, F2, 31,
+      WPRECALC_32_79_8, dummy,            WPRECALC_32_79_9, 44,
+      W5, W6, W7, W0, W1, W2, W3, W4);
+
+  _R( _d, _e, _a, _b, _c, F2, 32,
+      WPRECALC_32_79_0, WPRECALC_32_79_1, WPRECALC_32_79_2, 48,
+      W4, W5, W6, W7, W0, W1, W2, W3);
+  _R( _c, _d, _e, _a, _b, F2, 33,
+      WPRECALC_32_79_3, WPRECALC_32_79_4, WPRECALC_32_79_5, 48,
+      W4, W5, W6, W7, W0, W1, W2, W3);
+  _R( _b, _c, _d, _e, _a, F2, 34,
+      WPRECALC_32_79_6, dummy,            WPRECALC_32_79_7, 48,
+      W4, W5, W6, W7, W0, W1, W2, W3);
+  _R( _a, _b, _c, _d, _e, F2, 35,
+      WPRECALC_32_79_8, dummy,            WPRECALC_32_79_9, 48,
+      W4, W5, W6, W7, W0, W1, W2, W3);
+
+  _R( _e, _a, _b, _c, _d, F2, 36,
+      WPRECALC_32_79_0, WPRECALC_32_79_1, WPRECALC_32_79_2, 52,
+      W3, W4, W5, W6, W7, W0, W1, W2);
+  _R( _d, _e, _a, _b, _c, F2, 37,
+      WPRECALC_32_79_3, WPRECALC_32_79_4, WPRECALC_32_79_5, 52,
+      W3, W4, W5, W6, W7, W0, W1, W2);
+  _R( _c, _d, _e, _a, _b, F2, 38,
+      WPRECALC_32_79_6, dummy,            WPRECALC_32_79_7, 52,
+      W3, W4, W5, W6, W7, W0, W1, W2);
+  _R( _b, _c, _d, _e, _a, F2, 39,
+      WPRECALC_32_79_8, dummy,            WPRECALC_32_79_9, 52,
+      W3, W4, W5, W6, W7, W0, W1, W2);
+
+  _R( _a, _b, _c, _d, _e, F3, 40,
+      WPRECALC_32_79_0, WPRECALC_32_79_1, WPRECALC_32_79_2, 56,
+      W2, W3, W4, W5, W6, W7, W0, W1);
+  _R( _e, _a, _b, _c, _d, F3, 41,
+      WPRECALC_32_79_3, WPRECALC_32_79_4, WPRECALC_32_79_5, 56,
+      W2, W3, W4, W5, W6, W7, W0, W1);
+  _R( _d, _e, _a, _b, _c, F3, 42,
+      WPRECALC_32_79_6, dummy,            WPRECALC_32_79_7, 56,
+      W2, W3, W4, W5, W6, W7, W0, W1);
+  _R( _c, _d, _e, _a, _b, F3, 43,
+      WPRECALC_32_79_8, dummy,            WPRECALC_32_79_9, 56,
+      W2, W3, W4, W5, W6, W7, W0, W1);
+
+#undef curK
+#define curK qK4
+  _R( _b, _c, _d, _e, _a, F3, 44,
+      WPRECALC_32_79_0, WPRECALC_32_79_1, WPRECALC_32_79_2, 60,
+      W1, W2, W3, W4, W5, W6, W7, W0);
+  _R( _a, _b, _c, _d, _e, F3, 45,
+      WPRECALC_32_79_3, WPRECALC_32_79_4, WPRECALC_32_79_5, 60,
+      W1, W2, W3, W4, W5, W6, W7, W0);
+  _R( _e, _a, _b, _c, _d, F3, 46,
+      WPRECALC_32_79_6, dummy,            WPRECALC_32_79_7, 60,
+      W1, W2, W3, W4, W5, W6, W7, W0);
+  _R( _d, _e, _a, _b, _c, F3, 47,
+      WPRECALC_32_79_8, dummy,            WPRECALC_32_79_9, 60,
+      W1, W2, W3, W4, W5, W6, W7, W0);
+
+  _R( _c, _d, _e, _a, _b, F3, 48,
+      WPRECALC_32_79_0, WPRECALC_32_79_1, WPRECALC_32_79_2, 64,
+      W0, W1, W2, W3, W4, W5, W6, W7);
+  _R( _b, _c, _d, _e, _a, F3, 49,
+      WPRECALC_32_79_3, WPRECALC_32_79_4, WPRECALC_32_79_5, 64,
+      W0, W1, W2, W3, W4, W5, W6, W7);
+  _R( _a, _b, _c, _d, _e, F3, 50,
+      WPRECALC_32_79_6, dummy,            WPRECALC_32_79_7, 64,
+      W0, W1, W2, W3, W4, W5, W6, W7);
+  _R( _e, _a, _b, _c, _d, F3, 51,
+      WPRECALC_32_79_8, dummy,            WPRECALC_32_79_9, 64,
+      W0, W1, W2, W3, W4, W5, W6, W7);
+
+  _R( _d, _e, _a, _b, _c, F3, 52,
+      WPRECALC_32_79_0, WPRECALC_32_79_1, WPRECALC_32_79_2, 68,
+      W7, W0, W1, W2, W3, W4, W5, W6);
+  _R( _c, _d, _e, _a, _b, F3, 53,
+      WPRECALC_32_79_3, WPRECALC_32_79_4, WPRECALC_32_79_5, 68,
+      W7, W0, W1, W2, W3, W4, W5, W6);
+  _R( _b, _c, _d, _e, _a, F3, 54,
+      WPRECALC_32_79_6, dummy,            WPRECALC_32_79_7, 68,
+      W7, W0, W1, W2, W3, W4, W5, W6);
+  _R( _a, _b, _c, _d, _e, F3, 55,
+      WPRECALC_32_79_8, dummy,            WPRECALC_32_79_9, 68,
+      W7, W0, W1, W2, W3, W4, W5, W6);
+
+  _R( _e, _a, _b, _c, _d, F3, 56,
+      WPRECALC_32_79_0, WPRECALC_32_79_1, WPRECALC_32_79_2, 72,
+      W6, W7, W0, W1, W2, W3, W4, W5);
+  _R( _d, _e, _a, _b, _c, F3, 57,
+      WPRECALC_32_79_3, WPRECALC_32_79_4, WPRECALC_32_79_5, 72,
+      W6, W7, W0, W1, W2, W3, W4, W5);
+  _R( _c, _d, _e, _a, _b, F3, 58,
+      WPRECALC_32_79_6, dummy,            WPRECALC_32_79_7, 72,
+      W6, W7, W0, W1, W2, W3, W4, W5);
+  _R( _b, _c, _d, _e, _a, F3, 59,
+      WPRECALC_32_79_8, dummy,            WPRECALC_32_79_9, 72,
+      W6, W7, W0, W1, W2, W3, W4, W5);
+
+  subs RNBLKS, #1;
+
+  _R( _a, _b, _c, _d, _e, F4, 60,
+      WPRECALC_32_79_0, WPRECALC_32_79_1, WPRECALC_32_79_2, 76,
+      W5, W6, W7, W0, W1, W2, W3, W4);
+  _R( _e, _a, _b, _c, _d, F4, 61,
+      WPRECALC_32_79_3, WPRECALC_32_79_4, WPRECALC_32_79_5, 76,
+      W5, W6, W7, W0, W1, W2, W3, W4);
+  _R( _d, _e, _a, _b, _c, F4, 62,
+      WPRECALC_32_79_6, dummy,            WPRECALC_32_79_7, 76,
+      W5, W6, W7, W0, W1, W2, W3, W4);
+  _R( _c, _d, _e, _a, _b, F4, 63,
+      WPRECALC_32_79_8, dummy,            WPRECALC_32_79_9, 76,
+      W5, W6, W7, W0, W1, W2, W3, W4);
+
+  beq .Lend;
+
+  /* Transform 64-79 + Precalc 0-15 of next block. */
+#undef curK
+#define curK qK1
+  _R( _b, _c, _d, _e, _a, F4, 64,
+      WPRECALC_00_15_0, dummy, dummy, _, _, _, _, _, _, _, _, _ );
+  _R( _a, _b, _c, _d, _e, F4, 65,
+      WPRECALC_00_15_1, dummy, dummy, _, _, _, _, _, _, _, _, _ );
+  _R( _e, _a, _b, _c, _d, F4, 66,
+      WPRECALC_00_15_2, dummy, dummy, _, _, _, _, _, _, _, _, _ );
+  _R( _d, _e, _a, _b, _c, F4, 67,
+      WPRECALC_00_15_3, dummy, dummy, _, _, _, _, _, _, _, _, _ );
+
+  _R( _c, _d, _e, _a, _b, F4, 68,
+      dummy,            dummy, dummy, _, _, _, _, _, _, _, _, _ );
+  _R( _b, _c, _d, _e, _a, F4, 69,
+      dummy,            dummy, dummy, _, _, _, _, _, _, _, _, _ );
+  _R( _a, _b, _c, _d, _e, F4, 70,
+      WPRECALC_00_15_4, dummy, dummy, _, _, _, _, _, _, _, _, _ );
+  _R( _e, _a, _b, _c, _d, F4, 71,
+      WPRECALC_00_15_5, dummy, dummy, _, _, _, _, _, _, _, _, _ );
+
+  _R( _d, _e, _a, _b, _c, F4, 72,
+      dummy,            dummy, dummy, _, _, _, _, _, _, _, _, _ );
+  _R( _c, _d, _e, _a, _b, F4, 73,
+      dummy,            dummy, dummy, _, _, _, _, _, _, _, _, _ );
+  _R( _b, _c, _d, _e, _a, F4, 74,
+      WPRECALC_00_15_6, dummy, dummy, _, _, _, _, _, _, _, _, _ );
+  _R( _a, _b, _c, _d, _e, F4, 75,
+      WPRECALC_00_15_7, dummy, dummy, _, _, _, _, _, _, _, _, _ );
+
+  _R( _e, _a, _b, _c, _d, F4, 76,
+      WPRECALC_00_15_8, dummy, dummy, _, _, _, _, _, _, _, _, _ );
+  _R( _d, _e, _a, _b, _c, F4, 77,
+      WPRECALC_00_15_9, dummy, dummy, _, _, _, _, _, _, _, _, _ );
+  _R( _c, _d, _e, _a, _b, F4, 78,
+      WPRECALC_00_15_10, dummy, dummy, _, _, _, _, _, _, _, _, _ );
+  _R( _b, _c, _d, _e, _a, F4, 79,
+      WPRECALC_00_15_11, dummy, WPRECALC_00_15_12, _, _, _, _, _, _, _, _, _ );
+
+  /* Update the chaining variables. */
+  ldm RSTATE, {RT0-RT3};
+  add _a, RT0;
+  ldr RT0, [RSTATE, #state_h4];
+  add _b, RT1;
+  add _c, RT2;
+  add _d, RT3;
+  add _e, RT0;
+  stm RSTATE, {_a-_e};
+
+  b .Loop;
+
+.Lend:
+  /* Transform 64-79 */
+  R( _b, _c, _d, _e, _a, F4, 64 );
+  R( _a, _b, _c, _d, _e, F4, 65 );
+  R( _e, _a, _b, _c, _d, F4, 66 );
+  R( _d, _e, _a, _b, _c, F4, 67 );
+  R( _c, _d, _e, _a, _b, F4, 68 );
+  R( _b, _c, _d, _e, _a, F4, 69 );
+  R( _a, _b, _c, _d, _e, F4, 70 );
+  R( _e, _a, _b, _c, _d, F4, 71 );
+  R( _d, _e, _a, _b, _c, F4, 72 );
+  R( _c, _d, _e, _a, _b, F4, 73 );
+  R( _b, _c, _d, _e, _a, F4, 74 );
+  R( _a, _b, _c, _d, _e, F4, 75 );
+  R( _e, _a, _b, _c, _d, F4, 76 );
+  R( _d, _e, _a, _b, _c, F4, 77 );
+  R( _c, _d, _e, _a, _b, F4, 78 );
+  R( _b, _c, _d, _e, _a, F4, 79 );
+
+  mov sp, ROLDSTACK;
+
+  /* Update the chaining variables. */
+  ldm RSTATE, {RT0-RT3};
+  add _a, RT0;
+  ldr RT0, [RSTATE, #state_h4];
+  add _b, RT1;
+  add _c, RT2;
+  add _d, RT3;
+  /*vpop {q4-q7};*/
+  add _e, RT0;
+  stm RSTATE, {_a-_e};
+
+  pop {r4-r12, pc};
+
+.Ldo_nothing:
+  bx lr
+ENDPROC(sha1_transform_neon)

arch/arm/crypto/sha1_glue.c (+27, -31)

@@ -23,32 +23,27 @@
 #include <linux/types.h>
 #include <crypto/sha.h>
 #include <asm/byteorder.h>
+#include <asm/crypto/sha1.h>
 
-struct SHA1_CTX {
-	uint32_t h0,h1,h2,h3,h4;
-	u64 count;
-	u8 data[SHA1_BLOCK_SIZE];
-};
 
-asmlinkage void sha1_block_data_order(struct SHA1_CTX *digest,
+asmlinkage void sha1_block_data_order(u32 *digest,
 		const unsigned char *data, unsigned int rounds);
 
 
 static int sha1_init(struct shash_desc *desc)
 {
-	struct SHA1_CTX *sctx = shash_desc_ctx(desc);
-	memset(sctx, 0, sizeof(*sctx));
-	sctx->h0 = SHA1_H0;
-	sctx->h1 = SHA1_H1;
-	sctx->h2 = SHA1_H2;
-	sctx->h3 = SHA1_H3;
-	sctx->h4 = SHA1_H4;
+	struct sha1_state *sctx = shash_desc_ctx(desc);
+
+	*sctx = (struct sha1_state){
+		.state = { SHA1_H0, SHA1_H1, SHA1_H2, SHA1_H3, SHA1_H4 },
+	};
+
 	return 0;
 }
 
 
-static int __sha1_update(struct SHA1_CTX *sctx, const u8 *data,
-			       unsigned int len, unsigned int partial)
+static int __sha1_update(struct sha1_state *sctx, const u8 *data,
+			 unsigned int len, unsigned int partial)
 {
 	unsigned int done = 0;
 
@@ -56,43 +51,44 @@ static int __sha1_update(struct SHA1_CTX *sctx, const u8 *data,
 
 	if (partial) {
 		done = SHA1_BLOCK_SIZE - partial;
-		memcpy(sctx->data + partial, data, done);
-		sha1_block_data_order(sctx, sctx->data, 1);
+		memcpy(sctx->buffer + partial, data, done);
+		sha1_block_data_order(sctx->state, sctx->buffer, 1);
 	}
 
 	if (len - done >= SHA1_BLOCK_SIZE) {
 		const unsigned int rounds = (len - done) / SHA1_BLOCK_SIZE;
-		sha1_block_data_order(sctx, data + done, rounds);
+		sha1_block_data_order(sctx->state, data + done, rounds);
 		done += rounds * SHA1_BLOCK_SIZE;
 	}
 
-	memcpy(sctx->data, data + done, len - done);
+	memcpy(sctx->buffer, data + done, len - done);
 	return 0;
 }
 
 
-static int sha1_update(struct shash_desc *desc, const u8 *data,
-			     unsigned int len)
+int sha1_update_arm(struct shash_desc *desc, const u8 *data,
+		    unsigned int len)
 {
-	struct SHA1_CTX *sctx = shash_desc_ctx(desc);
+	struct sha1_state *sctx = shash_desc_ctx(desc);
 	unsigned int partial = sctx->count % SHA1_BLOCK_SIZE;
 	int res;
 
 	/* Handle the fast case right here */
 	if (partial + len < SHA1_BLOCK_SIZE) {
 		sctx->count += len;
-		memcpy(sctx->data + partial, data, len);
+		memcpy(sctx->buffer + partial, data, len);
 		return 0;
 	}
 	res = __sha1_update(sctx, data, len, partial);
 	return res;
 }
+EXPORT_SYMBOL_GPL(sha1_update_arm);
 
 
 /* Add padding and return the message digest. */
 static int sha1_final(struct shash_desc *desc, u8 *out)
 {
-	struct SHA1_CTX *sctx = shash_desc_ctx(desc);
+	struct sha1_state *sctx = shash_desc_ctx(desc);
 	unsigned int i, index, padlen;
 	__be32 *dst = (__be32 *)out;
 	__be64 bits;
@@ -106,7 +102,7 @@ static int sha1_final(struct shash_desc *desc, u8 *out)
 	/* We need to fill a whole block for __sha1_update() */
 	if (padlen <= 56) {
 		sctx->count += padlen;
-		memcpy(sctx->data + index, padding, padlen);
+		memcpy(sctx->buffer + index, padding, padlen);
 	} else {
 		__sha1_update(sctx, padding, padlen, index);
 	}
@@ -114,7 +110,7 @@ static int sha1_final(struct shash_desc *desc, u8 *out)
 
 	/* Store state in digest */
 	for (i = 0; i < 5; i++)
-		dst[i] = cpu_to_be32(((u32 *)sctx)[i]);
+		dst[i] = cpu_to_be32(sctx->state[i]);
 
 	/* Wipe context */
 	memset(sctx, 0, sizeof(*sctx));
@@ -124,7 +120,7 @@ static int sha1_final(struct shash_desc *desc, u8 *out)
 
 static int sha1_export(struct shash_desc *desc, void *out)
 {
-	struct SHA1_CTX *sctx = shash_desc_ctx(desc);
+	struct sha1_state *sctx = shash_desc_ctx(desc);
 	memcpy(out, sctx, sizeof(*sctx));
 	return 0;
 }
@@ -132,7 +128,7 @@ static int sha1_export(struct shash_desc *desc, void *out)
 
 static int sha1_import(struct shash_desc *desc, const void *in)
 {
-	struct SHA1_CTX *sctx = shash_desc_ctx(desc);
+	struct sha1_state *sctx = shash_desc_ctx(desc);
 	memcpy(sctx, in, sizeof(*sctx));
 	return 0;
 }
@@ -141,12 +137,12 @@ static int sha1_import(struct shash_desc *desc, const void *in)
 static struct shash_alg alg = {
 	.digestsize	=	SHA1_DIGEST_SIZE,
 	.init		=	sha1_init,
-	.update		=	sha1_update,
+	.update		=	sha1_update_arm,
 	.final		=	sha1_final,
 	.export		=	sha1_export,
 	.import		=	sha1_import,
-	.descsize	=	sizeof(struct SHA1_CTX),
-	.statesize	=	sizeof(struct SHA1_CTX),
+	.descsize	=	sizeof(struct sha1_state),
+	.statesize	=	sizeof(struct sha1_state),
 	.base		=	{
 		.cra_name	=	"sha1",
 		.cra_driver_name=	"sha1-asm",

arch/arm/crypto/sha1_neon_glue.c (+197, -0)

@@ -0,0 +1,197 @@
+/*
+ * Glue code for the SHA1 Secure Hash Algorithm assembler implementation using
+ * ARM NEON instructions.
+ *
+ * Copyright © 2014 Jussi Kivilinna <jussi.kivilinna@iki.fi>
+ *
+ * This file is based on sha1_generic.c and sha1_ssse3_glue.c:
+ *  Copyright (c) Alan Smithee.
+ *  Copyright (c) Andrew McDonald <andrew@mcdonald.org.uk>
+ *  Copyright (c) Jean-Francois Dive <jef@linuxbe.org>
+ *  Copyright (c) Mathias Krause <minipli@googlemail.com>
+ *  Copyright (c) Chandramouli Narayanan <mouli@linux.intel.com>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option)
+ * any later version.
+ *
+ */
+
+#include <crypto/internal/hash.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/mm.h>
+#include <linux/cryptohash.h>
+#include <linux/types.h>
+#include <crypto/sha.h>
+#include <asm/byteorder.h>
+#include <asm/neon.h>
+#include <asm/simd.h>
+#include <asm/crypto/sha1.h>
+
+
+asmlinkage void sha1_transform_neon(void *state_h, const char *data,
+				    unsigned int rounds);
+
+
+static int sha1_neon_init(struct shash_desc *desc)
+{
+	struct sha1_state *sctx = shash_desc_ctx(desc);
+
+	*sctx = (struct sha1_state){
+		.state = { SHA1_H0, SHA1_H1, SHA1_H2, SHA1_H3, SHA1_H4 },
+	};
+
+	return 0;
+}
+
+static int __sha1_neon_update(struct shash_desc *desc, const u8 *data,
+			       unsigned int len, unsigned int partial)
+{
+	struct sha1_state *sctx = shash_desc_ctx(desc);
+	unsigned int done = 0;
+
+	sctx->count += len;
+
+	if (partial) {
+		done = SHA1_BLOCK_SIZE - partial;
+		memcpy(sctx->buffer + partial, data, done);
+		sha1_transform_neon(sctx->state, sctx->buffer, 1);
+	}
+
+	if (len - done >= SHA1_BLOCK_SIZE) {
+		const unsigned int rounds = (len - done) / SHA1_BLOCK_SIZE;
+
+		sha1_transform_neon(sctx->state, data + done, rounds);
+		done += rounds * SHA1_BLOCK_SIZE;
+	}
+
+	memcpy(sctx->buffer, data + done, len - done);
+
+	return 0;
+}
+
+static int sha1_neon_update(struct shash_desc *desc, const u8 *data,
+			     unsigned int len)
+{
+	struct sha1_state *sctx = shash_desc_ctx(desc);
+	unsigned int partial = sctx->count % SHA1_BLOCK_SIZE;
+	int res;
+
+	/* Handle the fast case right here */
+	if (partial + len < SHA1_BLOCK_SIZE) {
+		sctx->count += len;
+		memcpy(sctx->buffer + partial, data, len);
+
+		return 0;
+	}
+
+	if (!may_use_simd()) {
+		res = sha1_update_arm(desc, data, len);
+	} else {
+		kernel_neon_begin();
+		res = __sha1_neon_update(desc, data, len, partial);
+		kernel_neon_end();
+	}
+
+	return res;
+}
+
+
+/* Add padding and return the message digest. */
+static int sha1_neon_final(struct shash_desc *desc, u8 *out)
+{
+	struct sha1_state *sctx = shash_desc_ctx(desc);
+	unsigned int i, index, padlen;
+	__be32 *dst = (__be32 *)out;
+	__be64 bits;
+	static const u8 padding[SHA1_BLOCK_SIZE] = { 0x80, };
+
+	bits = cpu_to_be64(sctx->count << 3);
+
+	/* Pad out to 56 mod 64 and append length */
+	index = sctx->count % SHA1_BLOCK_SIZE;
+	padlen = (index < 56) ? (56 - index) : ((SHA1_BLOCK_SIZE+56) - index);
+	if (!may_use_simd()) {
+		sha1_update_arm(desc, padding, padlen);
+		sha1_update_arm(desc, (const u8 *)&bits, sizeof(bits));
+	} else {
+		kernel_neon_begin();
+		/* We need to fill a whole block for __sha1_neon_update() */
+		if (padlen <= 56) {
+			sctx->count += padlen;
+			memcpy(sctx->buffer + index, padding, padlen);
+		} else {
+			__sha1_neon_update(desc, padding, padlen, index);
+		}
+		__sha1_neon_update(desc, (const u8 *)&bits, sizeof(bits), 56);
+		kernel_neon_end();
+	}
+
+	/* Store state in digest */
+	for (i = 0; i < 5; i++)
+		dst[i] = cpu_to_be32(sctx->state[i]);
+
+	/* Wipe context */
+	memset(sctx, 0, sizeof(*sctx));
+
+	return 0;
+}
+
+static int sha1_neon_export(struct shash_desc *desc, void *out)
+{
+	struct sha1_state *sctx = shash_desc_ctx(desc);
+
+	memcpy(out, sctx, sizeof(*sctx));
+
+	return 0;
+}
+
+static int sha1_neon_import(struct shash_desc *desc, const void *in)
+{
+	struct sha1_state *sctx = shash_desc_ctx(desc);
+
+	memcpy(sctx, in, sizeof(*sctx));
+
+	return 0;
+}
+
+static struct shash_alg alg = {
+	.digestsize	=	SHA1_DIGEST_SIZE,
+	.init		=	sha1_neon_init,
+	.update		=	sha1_neon_update,
+	.final		=	sha1_neon_final,
+	.export		=	sha1_neon_export,
+	.import		=	sha1_neon_import,
+	.descsize	=	sizeof(struct sha1_state),
+	.statesize	=	sizeof(struct sha1_state),
+	.base		=	{
+		.cra_name		= "sha1",
+		.cra_driver_name	= "sha1-neon",
+		.cra_priority		= 250,
+		.cra_flags		= CRYPTO_ALG_TYPE_SHASH,
+		.cra_blocksize		= SHA1_BLOCK_SIZE,
+		.cra_module		= THIS_MODULE,
+	}
+};
+
+static int __init sha1_neon_mod_init(void)
+{
+	if (!cpu_has_neon())
+		return -ENODEV;
+
+	return crypto_register_shash(&alg);
+}
+
+static void __exit sha1_neon_mod_fini(void)
+{
+	crypto_unregister_shash(&alg);
+}
+
+module_init(sha1_neon_mod_init);
+module_exit(sha1_neon_mod_fini);
+
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("SHA1 Secure Hash Algorithm, NEON accelerated");
+MODULE_ALIAS("sha1");

arch/arm/crypto/sha512-armv7-neon.S (+455, -0)

@@ -0,0 +1,455 @@
+/* sha512-armv7-neon.S  -  ARM/NEON assembly implementation of SHA-512 transform
+ *
+ * Copyright © 2013-2014 Jussi Kivilinna <jussi.kivilinna@iki.fi>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option)
+ * any later version.
+ */
+
+#include <linux/linkage.h>
+
+
+.syntax unified
+.code   32
+.fpu neon
+
+.text
+
+/* structure of SHA512_CONTEXT */
+#define hd_a 0
+#define hd_b ((hd_a) + 8)
+#define hd_c ((hd_b) + 8)
+#define hd_d ((hd_c) + 8)
+#define hd_e ((hd_d) + 8)
+#define hd_f ((hd_e) + 8)
+#define hd_g ((hd_f) + 8)
+
+/* register macros */
+#define RK %r2
+
+#define RA d0
+#define RB d1
+#define RC d2
+#define RD d3
+#define RE d4
+#define RF d5
+#define RG d6
+#define RH d7
+
+#define RT0 d8
+#define RT1 d9
+#define RT2 d10
+#define RT3 d11
+#define RT4 d12
+#define RT5 d13
+#define RT6 d14
+#define RT7 d15
+
+#define RT01q q4
+#define RT23q q5
+#define RT45q q6
+#define RT67q q7
+
+#define RW0 d16
+#define RW1 d17
+#define RW2 d18
+#define RW3 d19
+#define RW4 d20
+#define RW5 d21
+#define RW6 d22
+#define RW7 d23
+#define RW8 d24
+#define RW9 d25
+#define RW10 d26
+#define RW11 d27
+#define RW12 d28
+#define RW13 d29
+#define RW14 d30
+#define RW15 d31
+
+#define RW01q q8
+#define RW23q q9
+#define RW45q q10
+#define RW67q q11
+#define RW89q q12
+#define RW1011q q13
+#define RW1213q q14
+#define RW1415q q15
+
+/***********************************************************************
+ * ARM assembly implementation of sha512 transform
+ ***********************************************************************/
+#define rounds2_0_63(ra, rb, rc, rd, re, rf, rg, rh, rw0, rw1, rw01q, rw2, \
+                     rw23q, rw1415q, rw9, rw10, interleave_op, arg1) \
+	/* t1 = h + Sum1 (e) + Ch (e, f, g) + k[t] + w[t]; */ \
+	vshr.u64 RT2, re, #14; \
+	vshl.u64 RT3, re, #64 - 14; \
+	interleave_op(arg1); \
+	vshr.u64 RT4, re, #18; \
+	vshl.u64 RT5, re, #64 - 18; \
+	vld1.64 {RT0}, [RK]!; \
+	veor.64 RT23q, RT23q, RT45q; \
+	vshr.u64 RT4, re, #41; \
+	vshl.u64 RT5, re, #64 - 41; \
+	vadd.u64 RT0, RT0, rw0; \
+	veor.64 RT23q, RT23q, RT45q; \
+	vmov.64 RT7, re; \
+	veor.64 RT1, RT2, RT3; \
+	vbsl.64 RT7, rf, rg; \
+	\
+	vadd.u64 RT1, RT1, rh; \
+	vshr.u64 RT2, ra, #28; \
+	vshl.u64 RT3, ra, #64 - 28; \
+	vadd.u64 RT1, RT1, RT0; \
+	vshr.u64 RT4, ra, #34; \
+	vshl.u64 RT5, ra, #64 - 34; \
+	vadd.u64 RT1, RT1, RT7; \
+	\
+	/* h = Sum0 (a) + Maj (a, b, c); */ \
+	veor.64 RT23q, RT23q, RT45q; \
+	vshr.u64 RT4, ra, #39; \
+	vshl.u64 RT5, ra, #64 - 39; \
+	veor.64 RT0, ra, rb; \
+	veor.64 RT23q, RT23q, RT45q; \
+	vbsl.64 RT0, rc, rb; \
+	vadd.u64 rd, rd, RT1; /* d+=t1; */ \
+	veor.64 rh, RT2, RT3; \
+	\
+	/* t1 = g + Sum1 (d) + Ch (d, e, f) + k[t] + w[t]; */ \
+	vshr.u64 RT2, rd, #14; \
+	vshl.u64 RT3, rd, #64 - 14; \
+	vadd.u64 rh, rh, RT0; \
+	vshr.u64 RT4, rd, #18; \
+	vshl.u64 RT5, rd, #64 - 18; \
+	vadd.u64 rh, rh, RT1; /* h+=t1; */ \
+	vld1.64 {RT0}, [RK]!; \
+	veor.64 RT23q, RT23q, RT45q; \
+	vshr.u64 RT4, rd, #41; \
+	vshl.u64 RT5, rd, #64 - 41; \
+	vadd.u64 RT0, RT0, rw1; \
+	veor.64 RT23q, RT23q, RT45q; \
+	vmov.64 RT7, rd; \
+	veor.64 RT1, RT2, RT3; \
+	vbsl.64 RT7, re, rf; \
+	\
+	vadd.u64 RT1, RT1, rg; \
+	vshr.u64 RT2, rh, #28; \
+	vshl.u64 RT3, rh, #64 - 28; \
+	vadd.u64 RT1, RT1, RT0; \
+	vshr.u64 RT4, rh, #34; \
+	vshl.u64 RT5, rh, #64 - 34; \
+	vadd.u64 RT1, RT1, RT7; \
+	\
+	/* g = Sum0 (h) + Maj (h, a, b); */ \
+	veor.64 RT23q, RT23q, RT45q; \
+	vshr.u64 RT4, rh, #39; \
+	vshl.u64 RT5, rh, #64 - 39; \
+	veor.64 RT0, rh, ra; \
+	veor.64 RT23q, RT23q, RT45q; \
+	vbsl.64 RT0, rb, ra; \
+	vadd.u64 rc, rc, RT1; /* c+=t1; */ \
+	veor.64 rg, RT2, RT3; \
+	\
+	/* w[0] += S1 (w[14]) + w[9] + S0 (w[1]); */ \
+	/* w[1] += S1 (w[15]) + w[10] + S0 (w[2]); */ \
+	\
+	/**** S0(w[1:2]) */ \
+	\
+	/* w[0:1] += w[9:10] */ \
+	/* RT23q = rw1:rw2 */ \
+	vext.u64 RT23q, rw01q, rw23q, #1; \
+	vadd.u64 rw0, rw9; \
+	vadd.u64 rg, rg, RT0; \
+	vadd.u64 rw1, rw10;\
+	vadd.u64 rg, rg, RT1; /* g+=t1; */ \
+	\
+	vshr.u64 RT45q, RT23q, #1; \
+	vshl.u64 RT67q, RT23q, #64 - 1; \
+	vshr.u64 RT01q, RT23q, #8; \
+	veor.u64 RT45q, RT45q, RT67q; \
+	vshl.u64 RT67q, RT23q, #64 - 8; \
+	veor.u64 RT45q, RT45q, RT01q; \
+	vshr.u64 RT01q, RT23q, #7; \
+	veor.u64 RT45q, RT45q, RT67q; \
+	\
+	/**** S1(w[14:15]) */ \
+	vshr.u64 RT23q, rw1415q, #6; \
+	veor.u64 RT01q, RT01q, RT45q; \
+	vshr.u64 RT45q, rw1415q, #19; \
+	vshl.u64 RT67q, rw1415q, #64 - 19; \
+	veor.u64 RT23q, RT23q, RT45q; \
+	vshr.u64 RT45q, rw1415q, #61; \
+	veor.u64 RT23q, RT23q, RT67q; \
+	vshl.u64 RT67q, rw1415q, #64 - 61; \
+	veor.u64 RT23q, RT23q, RT45q; \
+	vadd.u64 rw01q, RT01q; /* w[0:1] += S(w[1:2]) */ \
+	veor.u64 RT01q, RT23q, RT67q;
+#define vadd_RT01q(rw01q) \
+	/* w[0:1] += S(w[14:15]) */ \
+	vadd.u64 rw01q, RT01q;
+
+#define dummy(_) /*_*/
+
+#define rounds2_64_79(ra, rb, rc, rd, re, rf, rg, rh, rw0, rw1, \
+	              interleave_op1, arg1, interleave_op2, arg2) \
+	/* t1 = h + Sum1 (e) + Ch (e, f, g) + k[t] + w[t]; */ \
+	vshr.u64 RT2, re, #14; \
+	vshl.u64 RT3, re, #64 - 14; \
+	interleave_op1(arg1); \
+	vshr.u64 RT4, re, #18; \
+	vshl.u64 RT5, re, #64 - 18; \
+	interleave_op2(arg2); \
+	vld1.64 {RT0}, [RK]!; \
+	veor.64 RT23q, RT23q, RT45q; \
+	vshr.u64 RT4, re, #41; \
+	vshl.u64 RT5, re, #64 - 41; \
+	vadd.u64 RT0, RT0, rw0; \
+	veor.64 RT23q, RT23q, RT45q; \
+	vmov.64 RT7, re; \
+	veor.64 RT1, RT2, RT3; \
+	vbsl.64 RT7, rf, rg; \
+	\
+	vadd.u64 RT1, RT1, rh; \
+	vshr.u64 RT2, ra, #28; \
+	vshl.u64 RT3, ra, #64 - 28; \
+	vadd.u64 RT1, RT1, RT0; \
+	vshr.u64 RT4, ra, #34; \
+	vshl.u64 RT5, ra, #64 - 34; \
+	vadd.u64 RT1, RT1, RT7; \
+	\
+	/* h = Sum0 (a) + Maj (a, b, c); */ \
+	veor.64 RT23q, RT23q, RT45q; \
+	vshr.u64 RT4, ra, #39; \
+	vshl.u64 RT5, ra, #64 - 39; \
+	veor.64 RT0, ra, rb; \
+	veor.64 RT23q, RT23q, RT45q; \
+	vbsl.64 RT0, rc, rb; \
+	vadd.u64 rd, rd, RT1; /* d+=t1; */ \
+	veor.64 rh, RT2, RT3; \
+	\
+	/* t1 = g + Sum1 (d) + Ch (d, e, f) + k[t] + w[t]; */ \
+	vshr.u64 RT2, rd, #14; \
+	vshl.u64 RT3, rd, #64 - 14; \
+	vadd.u64 rh, rh, RT0; \
+	vshr.u64 RT4, rd, #18; \
+	vshl.u64 RT5, rd, #64 - 18; \
+	vadd.u64 rh, rh, RT1; /* h+=t1; */ \
+	vld1.64 {RT0}, [RK]!; \
+	veor.64 RT23q, RT23q, RT45q; \
+	vshr.u64 RT4, rd, #41; \
+	vshl.u64 RT5, rd, #64 - 41; \
+	vadd.u64 RT0, RT0, rw1; \
+	veor.64 RT23q, RT23q, RT45q; \
+	vmov.64 RT7, rd; \
+	veor.64 RT1, RT2, RT3; \
+	vbsl.64 RT7, re, rf; \
+	\
+	vadd.u64 RT1, RT1, rg; \
+	vshr.u64 RT2, rh, #28; \
+	vshl.u64 RT3, rh, #64 - 28; \
+	vadd.u64 RT1, RT1, RT0; \
+	vshr.u64 RT4, rh, #34; \
+	vshl.u64 RT5, rh, #64 - 34; \
+	vadd.u64 RT1, RT1, RT7; \
+	\
+	/* g = Sum0 (h) + Maj (h, a, b); */ \
+	veor.64 RT23q, RT23q, RT45q; \
+	vshr.u64 RT4, rh, #39; \
+	vshl.u64 RT5, rh, #64 - 39; \
+	veor.64 RT0, rh, ra; \
+	veor.64 RT23q, RT23q, RT45q; \
+	vbsl.64 RT0, rb, ra; \
+	vadd.u64 rc, rc, RT1; /* c+=t1; */ \
+	veor.64 rg, RT2, RT3;
+
+#define vadd_rg_RT0(rg) \
+	vadd.u64 rg, rg, RT0;
+#define vadd_rg_RT1(rg) \
+	vadd.u64 rg, rg, RT1; /* g+=t1; */
+
+.align 3
+ENTRY(sha512_transform_neon)
+	/* Input:
+	 *	%r0: SHA512_CONTEXT
+	 *	%r1: data
+	 *	%r2: u64 k[] constants
+	 *	%r3: nblks
+	 */
+	push {%lr};
+
+	mov %lr, #0;
+
+	/* Load context to d0-d7 */
+	vld1.64 {RA-RD}, [%r0]!;
+	vld1.64 {RE-RH}, [%r0];
+	sub %r0, #(4*8);
+
+	/* Load input to w[16], d16-d31 */
+	/* NOTE: Assumes that on ARMv7 unaligned accesses are always allowed. */
+	vld1.64 {RW0-RW3}, [%r1]!;
+	vld1.64 {RW4-RW7}, [%r1]!;
+	vld1.64 {RW8-RW11}, [%r1]!;
+	vld1.64 {RW12-RW15}, [%r1]!;
+#ifdef __ARMEL__
+	/* byteswap */
+	vrev64.8 RW01q, RW01q;
+	vrev64.8 RW23q, RW23q;
+	vrev64.8 RW45q, RW45q;
+	vrev64.8 RW67q, RW67q;
+	vrev64.8 RW89q, RW89q;
+	vrev64.8 RW1011q, RW1011q;
+	vrev64.8 RW1213q, RW1213q;
+	vrev64.8 RW1415q, RW1415q;
+#endif
+
+	/* EABI says that d8-d15 must be preserved by callee. */
+	/*vpush {RT0-RT7};*/
+
+.Loop:
+	rounds2_0_63(RA, RB, RC, RD, RE, RF, RG, RH, RW0, RW1, RW01q, RW2,
+		     RW23q, RW1415q, RW9, RW10, dummy, _);
+	b .Lenter_rounds;
+
+.Loop_rounds:
+	rounds2_0_63(RA, RB, RC, RD, RE, RF, RG, RH, RW0, RW1, RW01q, RW2,
+		     RW23q, RW1415q, RW9, RW10, vadd_RT01q, RW1415q);
+.Lenter_rounds:
+	rounds2_0_63(RG, RH, RA, RB, RC, RD, RE, RF, RW2, RW3, RW23q, RW4,
+		     RW45q, RW01q, RW11, RW12, vadd_RT01q, RW01q);
+	rounds2_0_63(RE, RF, RG, RH, RA, RB, RC, RD, RW4, RW5, RW45q, RW6,
+		     RW67q, RW23q, RW13, RW14, vadd_RT01q, RW23q);
+	rounds2_0_63(RC, RD, RE, RF, RG, RH, RA, RB, RW6, RW7, RW67q, RW8,
+		     RW89q, RW45q, RW15, RW0, vadd_RT01q, RW45q);
+	rounds2_0_63(RA, RB, RC, RD, RE, RF, RG, RH, RW8, RW9, RW89q, RW10,
+		     RW1011q, RW67q, RW1, RW2, vadd_RT01q, RW67q);
+	rounds2_0_63(RG, RH, RA, RB, RC, RD, RE, RF, RW10, RW11, RW1011q, RW12,
+		     RW1213q, RW89q, RW3, RW4, vadd_RT01q, RW89q);
+	add %lr, #16;
+	rounds2_0_63(RE, RF, RG, RH, RA, RB, RC, RD, RW12, RW13, RW1213q, RW14,
+		     RW1415q, RW1011q, RW5, RW6, vadd_RT01q, RW1011q);
+	cmp %lr, #64;
+	rounds2_0_63(RC, RD, RE, RF, RG, RH, RA, RB, RW14, RW15, RW1415q, RW0,
+		     RW01q, RW1213q, RW7, RW8, vadd_RT01q, RW1213q);
+	bne .Loop_rounds;
+
+	subs %r3, #1;
+
+	rounds2_64_79(RA, RB, RC, RD, RE, RF, RG, RH, RW0, RW1,
+		      vadd_RT01q, RW1415q, dummy, _);
+	rounds2_64_79(RG, RH, RA, RB, RC, RD, RE, RF, RW2, RW3,
+		      vadd_rg_RT0, RG, vadd_rg_RT1, RG);
+	beq .Lhandle_tail;
+	vld1.64 {RW0-RW3}, [%r1]!;
+	rounds2_64_79(RE, RF, RG, RH, RA, RB, RC, RD, RW4, RW5,
+		      vadd_rg_RT0, RE, vadd_rg_RT1, RE);
+	rounds2_64_79(RC, RD, RE, RF, RG, RH, RA, RB, RW6, RW7,
+		      vadd_rg_RT0, RC, vadd_rg_RT1, RC);
+#ifdef __ARMEL__
+	vrev64.8 RW01q, RW01q;
+	vrev64.8 RW23q, RW23q;
+#endif
+	vld1.64 {RW4-RW7}, [%r1]!;
+	rounds2_64_79(RA, RB, RC, RD, RE, RF, RG, RH, RW8, RW9,
+		      vadd_rg_RT0, RA, vadd_rg_RT1, RA);
+	rounds2_64_79(RG, RH, RA, RB, RC, RD, RE, RF, RW10, RW11,
+		      vadd_rg_RT0, RG, vadd_rg_RT1, RG);
+#ifdef __ARMEL__
+	vrev64.8 RW45q, RW45q;
+	vrev64.8 RW67q, RW67q;
+#endif
+	vld1.64 {RW8-RW11}, [%r1]!;
+	rounds2_64_79(RE, RF, RG, RH, RA, RB, RC, RD, RW12, RW13,
+		      vadd_rg_RT0, RE, vadd_rg_RT1, RE);
+	rounds2_64_79(RC, RD, RE, RF, RG, RH, RA, RB, RW14, RW15,
+		      vadd_rg_RT0, RC, vadd_rg_RT1, RC);
+#ifdef __ARMEL__
+	vrev64.8 RW89q, RW89q;
+	vrev64.8 RW1011q, RW1011q;
+#endif
+	vld1.64 {RW12-RW15}, [%r1]!;
+	vadd_rg_RT0(RA);
+	vadd_rg_RT1(RA);
+
+	/* Load context */
+	vld1.64 {RT0-RT3}, [%r0]!;
+	vld1.64 {RT4-RT7}, [%r0];
+	sub %r0, #(4*8);
+
+#ifdef __ARMEL__
+	vrev64.8 RW1213q, RW1213q;
+	vrev64.8 RW1415q, RW1415q;
+#endif
+
+	vadd.u64 RA, RT0;
+	vadd.u64 RB, RT1;
+	vadd.u64 RC, RT2;
+	vadd.u64 RD, RT3;
+	vadd.u64 RE, RT4;
+	vadd.u64 RF, RT5;
+	vadd.u64 RG, RT6;
+	vadd.u64 RH, RT7;
+
+	/* Store the first half of context */
+	vst1.64 {RA-RD}, [%r0]!;
+	sub RK, $(8*80);
+	vst1.64 {RE-RH}, [%r0]; /* Store the last half of context */
+	mov %lr, #0;
+	sub %r0, #(4*8);
+
+	b .Loop;
+
+.Lhandle_tail:
+	rounds2_64_79(RE, RF, RG, RH, RA, RB, RC, RD, RW4, RW5,
+		      vadd_rg_RT0, RE, vadd_rg_RT1, RE);
+	rounds2_64_79(RC, RD, RE, RF, RG, RH, RA, RB, RW6, RW7,
+		      vadd_rg_RT0, RC, vadd_rg_RT1, RC);
+	rounds2_64_79(RA, RB, RC, RD, RE, RF, RG, RH, RW8, RW9,
+		      vadd_rg_RT0, RA, vadd_rg_RT1, RA);
+	rounds2_64_79(RG, RH, RA, RB, RC, RD, RE, RF, RW10, RW11,
+		      vadd_rg_RT0, RG, vadd_rg_RT1, RG);
+	rounds2_64_79(RE, RF, RG, RH, RA, RB, RC, RD, RW12, RW13,
+		      vadd_rg_RT0, RE, vadd_rg_RT1, RE);
+	rounds2_64_79(RC, RD, RE, RF, RG, RH, RA, RB, RW14, RW15,
+		      vadd_rg_RT0, RC, vadd_rg_RT1, RC);
+
+	/* Load context to d16-d23 */
+	vld1.64 {RW0-RW3}, [%r0]!;
+	vadd_rg_RT0(RA);
+	vld1.64 {RW4-RW7}, [%r0];
+	vadd_rg_RT1(RA);
+	sub %r0, #(4*8);
+
+	vadd.u64 RA, RW0;
+	vadd.u64 RB, RW1;
+	vadd.u64 RC, RW2;
+	vadd.u64 RD, RW3;
+	vadd.u64 RE, RW4;
+	vadd.u64 RF, RW5;
+	vadd.u64 RG, RW6;
+	vadd.u64 RH, RW7;
+
+	/* Store the first half of context */
+	vst1.64 {RA-RD}, [%r0]!;
+
+	/* Clear used registers */
+	/* d16-d31 */
+	veor.u64 RW01q, RW01q;
+	veor.u64 RW23q, RW23q;
+	veor.u64 RW45q, RW45q;
+	veor.u64 RW67q, RW67q;
+	vst1.64 {RE-RH}, [%r0]; /* Store the last half of context */
+	veor.u64 RW89q, RW89q;
+	veor.u64 RW1011q, RW1011q;
+	veor.u64 RW1213q, RW1213q;
+	veor.u64 RW1415q, RW1415q;
+	/* d8-d15 */
+	/*vpop {RT0-RT7};*/
+	/* d0-d7 (q0-q3) */
+	veor.u64 %q0, %q0;
+	veor.u64 %q1, %q1;
+	veor.u64 %q2, %q2;
+	veor.u64 %q3, %q3;
+
+	pop {%pc};
+ENDPROC(sha512_transform_neon)

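For reference, a minimal scalar sketch (ours, not part of the patch) of the SHA-512 round from FIPS 180-4 that the rounds2_0_63/rounds2_64_79 macros above compute two at a time: each paired vshr #n / vshl #64-n sequence is one of the rotations below, and vbsl implements the Ch/Maj bit-selects. ROR64 and the function name are ours:

#include <stdint.h>

#define ROR64(x, n)	(((x) >> (n)) | ((x) << (64 - (n))))

static void sha512_round(uint64_t s[8], uint64_t k, uint64_t w)
{
	uint64_t e = s[4], a = s[0];
	uint64_t sum1 = ROR64(e, 14) ^ ROR64(e, 18) ^ ROR64(e, 41);
	uint64_t ch   = (e & s[5]) ^ (~e & s[6]);	/* the vbsl on RT7 */
	uint64_t t1   = s[7] + sum1 + ch + k + w;
	uint64_t sum0 = ROR64(a, 28) ^ ROR64(a, 34) ^ ROR64(a, 39);
	uint64_t maj  = (a & s[1]) ^ (a & s[2]) ^ (s[1] & s[2]);

	/* rotate the working variables h..a */
	s[7] = s[6]; s[6] = s[5]; s[5] = s[4];
	s[4] = s[3] + t1;
	s[3] = s[2]; s[2] = s[1]; s[1] = s[0];
	s[0] = t1 + sum0 + maj;
}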
+ 305 - 0
arch/arm/crypto/sha512_neon_glue.c

@@ -0,0 +1,305 @@
+/*
+ * Glue code for the SHA512 Secure Hash Algorithm assembly implementation
+ * using NEON instructions.
+ *
+ * Copyright © 2014 Jussi Kivilinna <jussi.kivilinna@iki.fi>
+ *
+ * This file is based on sha512_ssse3_glue.c:
+ *   Copyright (C) 2013 Intel Corporation
+ *   Author: Tim Chen <tim.c.chen@linux.intel.com>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the Free
+ * Software Foundation; either version 2 of the License, or (at your option)
+ * any later version.
+ *
+ */
+
+#include <crypto/internal/hash.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/mm.h>
+#include <linux/cryptohash.h>
+#include <linux/types.h>
+#include <linux/string.h>
+#include <crypto/sha.h>
+#include <asm/byteorder.h>
+#include <asm/simd.h>
+#include <asm/neon.h>
+
+
+static const u64 sha512_k[] = {
+	0x428a2f98d728ae22ULL, 0x7137449123ef65cdULL,
+	0xb5c0fbcfec4d3b2fULL, 0xe9b5dba58189dbbcULL,
+	0x3956c25bf348b538ULL, 0x59f111f1b605d019ULL,
+	0x923f82a4af194f9bULL, 0xab1c5ed5da6d8118ULL,
+	0xd807aa98a3030242ULL, 0x12835b0145706fbeULL,
+	0x243185be4ee4b28cULL, 0x550c7dc3d5ffb4e2ULL,
+	0x72be5d74f27b896fULL, 0x80deb1fe3b1696b1ULL,
+	0x9bdc06a725c71235ULL, 0xc19bf174cf692694ULL,
+	0xe49b69c19ef14ad2ULL, 0xefbe4786384f25e3ULL,
+	0x0fc19dc68b8cd5b5ULL, 0x240ca1cc77ac9c65ULL,
+	0x2de92c6f592b0275ULL, 0x4a7484aa6ea6e483ULL,
+	0x5cb0a9dcbd41fbd4ULL, 0x76f988da831153b5ULL,
+	0x983e5152ee66dfabULL, 0xa831c66d2db43210ULL,
+	0xb00327c898fb213fULL, 0xbf597fc7beef0ee4ULL,
+	0xc6e00bf33da88fc2ULL, 0xd5a79147930aa725ULL,
+	0x06ca6351e003826fULL, 0x142929670a0e6e70ULL,
+	0x27b70a8546d22ffcULL, 0x2e1b21385c26c926ULL,
+	0x4d2c6dfc5ac42aedULL, 0x53380d139d95b3dfULL,
+	0x650a73548baf63deULL, 0x766a0abb3c77b2a8ULL,
+	0x81c2c92e47edaee6ULL, 0x92722c851482353bULL,
+	0xa2bfe8a14cf10364ULL, 0xa81a664bbc423001ULL,
+	0xc24b8b70d0f89791ULL, 0xc76c51a30654be30ULL,
+	0xd192e819d6ef5218ULL, 0xd69906245565a910ULL,
+	0xf40e35855771202aULL, 0x106aa07032bbd1b8ULL,
+	0x19a4c116b8d2d0c8ULL, 0x1e376c085141ab53ULL,
+	0x2748774cdf8eeb99ULL, 0x34b0bcb5e19b48a8ULL,
+	0x391c0cb3c5c95a63ULL, 0x4ed8aa4ae3418acbULL,
+	0x5b9cca4f7763e373ULL, 0x682e6ff3d6b2b8a3ULL,
+	0x748f82ee5defb2fcULL, 0x78a5636f43172f60ULL,
+	0x84c87814a1f0ab72ULL, 0x8cc702081a6439ecULL,
+	0x90befffa23631e28ULL, 0xa4506cebde82bde9ULL,
+	0xbef9a3f7b2c67915ULL, 0xc67178f2e372532bULL,
+	0xca273eceea26619cULL, 0xd186b8c721c0c207ULL,
+	0xeada7dd6cde0eb1eULL, 0xf57d4f7fee6ed178ULL,
+	0x06f067aa72176fbaULL, 0x0a637dc5a2c898a6ULL,
+	0x113f9804bef90daeULL, 0x1b710b35131c471bULL,
+	0x28db77f523047d84ULL, 0x32caab7b40c72493ULL,
+	0x3c9ebe0a15c9bebcULL, 0x431d67c49c100d4cULL,
+	0x4cc5d4becb3e42b6ULL, 0x597f299cfc657e2aULL,
+	0x5fcb6fab3ad6faecULL, 0x6c44198c4a475817ULL
+};
+
+
+asmlinkage void sha512_transform_neon(u64 *digest, const void *data,
+				      const u64 k[], unsigned int num_blks);
+
+
+static int sha512_neon_init(struct shash_desc *desc)
+{
+	struct sha512_state *sctx = shash_desc_ctx(desc);
+
+	sctx->state[0] = SHA512_H0;
+	sctx->state[1] = SHA512_H1;
+	sctx->state[2] = SHA512_H2;
+	sctx->state[3] = SHA512_H3;
+	sctx->state[4] = SHA512_H4;
+	sctx->state[5] = SHA512_H5;
+	sctx->state[6] = SHA512_H6;
+	sctx->state[7] = SHA512_H7;
+	sctx->count[0] = sctx->count[1] = 0;
+
+	return 0;
+}
+
+static int __sha512_neon_update(struct shash_desc *desc, const u8 *data,
+				unsigned int len, unsigned int partial)
+{
+	struct sha512_state *sctx = shash_desc_ctx(desc);
+	unsigned int done = 0;
+
+	sctx->count[0] += len;
+	if (sctx->count[0] < len)
+		sctx->count[1]++;
+
+	if (partial) {
+		done = SHA512_BLOCK_SIZE - partial;
+		memcpy(sctx->buf + partial, data, done);
+		sha512_transform_neon(sctx->state, sctx->buf, sha512_k, 1);
+	}
+
+	if (len - done >= SHA512_BLOCK_SIZE) {
+		const unsigned int rounds = (len - done) / SHA512_BLOCK_SIZE;
+
+		sha512_transform_neon(sctx->state, data + done, sha512_k,
+				      rounds);
+
+		done += rounds * SHA512_BLOCK_SIZE;
+	}
+
+	memcpy(sctx->buf, data + done, len - done);
+
+	return 0;
+}
+
+static int sha512_neon_update(struct shash_desc *desc, const u8 *data,
+			     unsigned int len)
+{
+	struct sha512_state *sctx = shash_desc_ctx(desc);
+	unsigned int partial = sctx->count[0] % SHA512_BLOCK_SIZE;
+	int res;
+
+	/* Handle the fast case right here */
+	if (partial + len < SHA512_BLOCK_SIZE) {
+		sctx->count[0] += len;
+		if (sctx->count[0] < len)
+			sctx->count[1]++;
+		memcpy(sctx->buf + partial, data, len);
+
+		return 0;
+	}
+
+	if (!may_use_simd()) {
+		res = crypto_sha512_update(desc, data, len);
+	} else {
+		kernel_neon_begin();
+		res = __sha512_neon_update(desc, data, len, partial);
+		kernel_neon_end();
+	}
+
+	return res;
+}
+
+
+/* Add padding and return the message digest. */
+static int sha512_neon_final(struct shash_desc *desc, u8 *out)
+{
+	struct sha512_state *sctx = shash_desc_ctx(desc);
+	unsigned int i, index, padlen;
+	__be64 *dst = (__be64 *)out;
+	__be64 bits[2];
+	static const u8 padding[SHA512_BLOCK_SIZE] = { 0x80, };
+
+	/* save number of bits */
+	bits[1] = cpu_to_be64(sctx->count[0] << 3);
+	bits[0] = cpu_to_be64(sctx->count[1] << 3 | sctx->count[0] >> 61);
+
+	/* Pad out to 112 mod 128 and append length */
+	index = sctx->count[0] & 0x7f;
+	padlen = (index < 112) ? (112 - index) : ((128+112) - index);
+
+	if (!may_use_simd()) {
+		crypto_sha512_update(desc, padding, padlen);
+		crypto_sha512_update(desc, (const u8 *)&bits, sizeof(bits));
+	} else {
+		kernel_neon_begin();
+		/* We need to fill a whole block for __sha512_neon_update() */
+		if (padlen <= 112) {
+			sctx->count[0] += padlen;
+			if (sctx->count[0] < padlen)
+				sctx->count[1]++;
+			memcpy(sctx->buf + index, padding, padlen);
+		} else {
+			__sha512_neon_update(desc, padding, padlen, index);
+		}
+		__sha512_neon_update(desc, (const u8 *)&bits,
+					sizeof(bits), 112);
+		kernel_neon_end();
+	}
+
+	/* Store state in digest */
+	for (i = 0; i < 8; i++)
+		dst[i] = cpu_to_be64(sctx->state[i]);
+
+	/* Wipe context */
+	memset(sctx, 0, sizeof(*sctx));
+
+	return 0;
+}
+
+static int sha512_neon_export(struct shash_desc *desc, void *out)
+{
+	struct sha512_state *sctx = shash_desc_ctx(desc);
+
+	memcpy(out, sctx, sizeof(*sctx));
+
+	return 0;
+}
+
+static int sha512_neon_import(struct shash_desc *desc, const void *in)
+{
+	struct sha512_state *sctx = shash_desc_ctx(desc);
+
+	memcpy(sctx, in, sizeof(*sctx));
+
+	return 0;
+}
+
+static int sha384_neon_init(struct shash_desc *desc)
+{
+	struct sha512_state *sctx = shash_desc_ctx(desc);
+
+	sctx->state[0] = SHA384_H0;
+	sctx->state[1] = SHA384_H1;
+	sctx->state[2] = SHA384_H2;
+	sctx->state[3] = SHA384_H3;
+	sctx->state[4] = SHA384_H4;
+	sctx->state[5] = SHA384_H5;
+	sctx->state[6] = SHA384_H6;
+	sctx->state[7] = SHA384_H7;
+
+	sctx->count[0] = sctx->count[1] = 0;
+
+	return 0;
+}
+
+static int sha384_neon_final(struct shash_desc *desc, u8 *hash)
+{
+	u8 D[SHA512_DIGEST_SIZE];
+
+	sha512_neon_final(desc, D);
+
+	memcpy(hash, D, SHA384_DIGEST_SIZE);
+	memset(D, 0, SHA512_DIGEST_SIZE);
+
+	return 0;
+}
+
+static struct shash_alg algs[] = { {
+	.digestsize	=	SHA512_DIGEST_SIZE,
+	.init		=	sha512_neon_init,
+	.update		=	sha512_neon_update,
+	.final		=	sha512_neon_final,
+	.export		=	sha512_neon_export,
+	.import		=	sha512_neon_import,
+	.descsize	=	sizeof(struct sha512_state),
+	.statesize	=	sizeof(struct sha512_state),
+	.base		=	{
+		.cra_name	=	"sha512",
+		.cra_driver_name =	"sha512-neon",
+		.cra_priority	=	250,
+		.cra_flags	=	CRYPTO_ALG_TYPE_SHASH,
+		.cra_blocksize	=	SHA512_BLOCK_SIZE,
+		.cra_module	=	THIS_MODULE,
+	}
+},  {
+	.digestsize	=	SHA384_DIGEST_SIZE,
+	.init		=	sha384_neon_init,
+	.update		=	sha512_neon_update,
+	.final		=	sha384_neon_final,
+	.export		=	sha512_neon_export,
+	.import		=	sha512_neon_import,
+	.descsize	=	sizeof(struct sha512_state),
+	.statesize	=	sizeof(struct sha512_state),
+	.base		=	{
+		.cra_name	=	"sha384",
+		.cra_driver_name =	"sha384-neon",
+		.cra_priority	=	250,
+		.cra_flags	=	CRYPTO_ALG_TYPE_SHASH,
+		.cra_blocksize	=	SHA384_BLOCK_SIZE,
+		.cra_module	=	THIS_MODULE,
+	}
+} };
+
+static int __init sha512_neon_mod_init(void)
+{
+	if (!cpu_has_neon())
+		return -ENODEV;
+
+	return crypto_register_shashes(algs, ARRAY_SIZE(algs));
+}
+
+static void __exit sha512_neon_mod_fini(void)
+{
+	crypto_unregister_shashes(algs, ARRAY_SIZE(algs));
+}
+
+module_init(sha512_neon_mod_init);
+module_exit(sha512_neon_mod_fini);
+
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("SHA512 Secure Hash Algorithm, NEON accelerated");
+
+MODULE_ALIAS("sha512");
+MODULE_ALIAS("sha384");

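A hypothetical usage sketch (not part of the patch): hashing a buffer through the crypto API, which will prefer "sha512-neon" (cra_priority 250) over the generic C implementation once this module is loaded. example_sha512() is ours:

#include <crypto/hash.h>
#include <crypto/sha.h>
#include <linux/err.h>
#include <linux/slab.h>

static int example_sha512(const u8 *buf, unsigned int len,
			  u8 digest[SHA512_DIGEST_SIZE])
{
	struct crypto_shash *tfm;
	struct shash_desc *desc;
	int err;

	tfm = crypto_alloc_shash("sha512", 0, 0);
	if (IS_ERR(tfm))
		return PTR_ERR(tfm);

	desc = kmalloc(sizeof(*desc) + crypto_shash_descsize(tfm), GFP_KERNEL);
	if (!desc) {
		crypto_free_shash(tfm);
		return -ENOMEM;
	}
	desc->tfm = tfm;
	desc->flags = 0;

	err = crypto_shash_digest(desc, buf, len, digest);	/* init+update+final */

	kfree(desc);
	crypto_free_shash(tfm);
	return err;
}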
+ 26 - 3
arch/arm/include/asm/assembler.h

@@ -24,6 +24,8 @@
 #include <asm/domain.h>
 #include <asm/opcodes-virt.h>
 #include <asm/asm-offsets.h>
+#include <asm/page.h>
+#include <asm/thread_info.h>
 
 #define IOMEM(x)	(x)
 
@@ -179,10 +181,10 @@
 * Get current thread_info.
 */
 	.macro	get_thread_info, rd
- ARM(	mov	\rd, sp, lsr #13	)
+ ARM(	mov	\rd, sp, lsr #THREAD_SIZE_ORDER + PAGE_SHIFT	)
 THUMB(	mov	\rd, sp			)
- THUMB(	lsr	\rd, \rd, #13		)
-	mov	\rd, \rd, lsl #13
+ THUMB(	lsr	\rd, \rd, #THREAD_SIZE_ORDER + PAGE_SHIFT	)
+	mov	\rd, \rd, lsl #THREAD_SIZE_ORDER + PAGE_SHIFT
 	.endm
 
 /*
@@ -425,4 +427,25 @@ THUMB(	orr	\reg , \reg , #PSR_T_BIT	)
 #endif
 	.endm
 
+	.irp	c,,eq,ne,cs,cc,mi,pl,vs,vc,hi,ls,ge,lt,gt,le,hs,lo
+	.macro	ret\c, reg
+#if __LINUX_ARM_ARCH__ < 6
+	mov\c	pc, \reg
+#else
+	.ifeqs	"\reg", "lr"
+	bx\c	\reg
+	.else
+	mov\c	pc, \reg
+	.endif
+#endif
+	.endm
+	.endr
+
+	.macro	ret.w, reg
+	ret	\reg
+#ifdef CONFIG_THUMB2_KERNEL
+	nop
+#endif
+	.endm
+
 #endif /* __ASM_ASSEMBLER_H__ */

+ 24 - 13
arch/arm/include/asm/cputype.h

@@ -62,17 +62,18 @@
 #define ARM_CPU_IMP_ARM			0x41
 #define ARM_CPU_IMP_INTEL		0x69
 
-#define ARM_CPU_PART_ARM1136		0xB360
-#define ARM_CPU_PART_ARM1156		0xB560
-#define ARM_CPU_PART_ARM1176		0xB760
-#define ARM_CPU_PART_ARM11MPCORE	0xB020
-#define ARM_CPU_PART_CORTEX_A8		0xC080
-#define ARM_CPU_PART_CORTEX_A9		0xC090
-#define ARM_CPU_PART_CORTEX_A5		0xC050
-#define ARM_CPU_PART_CORTEX_A15		0xC0F0
-#define ARM_CPU_PART_CORTEX_A7		0xC070
-#define ARM_CPU_PART_CORTEX_A12		0xC0D0
-#define ARM_CPU_PART_CORTEX_A17		0xC0E0
+/* ARM implemented processors */
+#define ARM_CPU_PART_ARM1136		0x4100b360
+#define ARM_CPU_PART_ARM1156		0x4100b560
+#define ARM_CPU_PART_ARM1176		0x4100b760
+#define ARM_CPU_PART_ARM11MPCORE	0x4100b020
+#define ARM_CPU_PART_CORTEX_A8		0x4100c080
+#define ARM_CPU_PART_CORTEX_A9		0x4100c090
+#define ARM_CPU_PART_CORTEX_A5		0x4100c050
+#define ARM_CPU_PART_CORTEX_A7		0x4100c070
+#define ARM_CPU_PART_CORTEX_A12		0x4100c0d0
+#define ARM_CPU_PART_CORTEX_A17		0x4100c0e0
+#define ARM_CPU_PART_CORTEX_A15		0x4100c0f0
 
 #define ARM_CPU_XSCALE_ARCH_MASK	0xe000
 #define ARM_CPU_XSCALE_ARCH_V1		0x2000
@@ -171,14 +172,24 @@ static inline unsigned int __attribute_const__ read_cpuid_implementor(void)
 	return (read_cpuid_id() & 0xFF000000) >> 24;
 }
 
-static inline unsigned int __attribute_const__ read_cpuid_part_number(void)
+/*
+ * The CPU part number is meaningless without referring to the CPU
+ * implementer: implementers are free to define their own part numbers
+ * which are permitted to clash with other implementer part numbers.
+ */
+static inline unsigned int __attribute_const__ read_cpuid_part(void)
+{
+	return read_cpuid_id() & 0xff00fff0;
+}
+
+static inline unsigned int __attribute_const__ __deprecated read_cpuid_part_number(void)
 {
 	return read_cpuid_id() & 0xFFF0;
 }
 
 static inline unsigned int __attribute_const__ xscale_cpu_arch_version(void)
 {
-	return read_cpuid_part_number() & ARM_CPU_XSCALE_ARCH_MASK;
+	return read_cpuid_id() & ARM_CPU_XSCALE_ARCH_MASK;
 }
 
 static inline unsigned int __attribute_const__ read_cpuid_cachetype(void)

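Illustration (ours, not part of the patch): with the implementer field folded in, a part-number check reads as below and can no longer match another vendor's part that happens to share the low bits. The function name is ours:

#include <asm/cputype.h>

static bool __init have_cortex_a9(void)
{
	/* 0x4100c090 = implementer 0x41 (ARM) | part number 0xc09 */
	return read_cpuid_part() == ARM_CPU_PART_CORTEX_A9;
}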
+ 10 - 0
arch/arm/include/asm/crypto/sha1.h

@@ -0,0 +1,10 @@
+#ifndef ASM_ARM_CRYPTO_SHA1_H
+#define ASM_ARM_CRYPTO_SHA1_H
+
+#include <linux/crypto.h>
+#include <crypto/sha.h>
+
+extern int sha1_update_arm(struct shash_desc *desc, const u8 *data,
+			   unsigned int len);
+
+#endif

+ 1 - 1
arch/arm/include/asm/entry-macro-multi.S

@@ -35,5 +35,5 @@
 \symbol_name:
 	mov	r8, lr
 	arch_irq_handler_default
-	mov     pc, r8
+	ret	r8
 	.endm

+ 9 - 9
arch/arm/include/asm/glue-proc.h

@@ -221,15 +221,6 @@
 # endif
 #endif
 
-#ifdef CONFIG_CPU_V7
-# ifdef CPU_NAME
-#  undef  MULTI_CPU
-#  define MULTI_CPU
-# else
-#  define CPU_NAME cpu_v7
-# endif
-#endif
-
 #ifdef CONFIG_CPU_V7M
 # ifdef CPU_NAME
 #  undef  MULTI_CPU
@@ -248,6 +239,15 @@
 # endif
 #endif
 
+#ifdef CONFIG_CPU_V7
+/*
+ * Cortex-A9 needs a different suspend/resume function, so we need
+ * multiple CPU support for ARMv7 anyway.
+ */
+#  undef  MULTI_CPU
+#  define MULTI_CPU
+#endif
+
 #ifndef MULTI_CPU
 #define cpu_proc_init			__glue(CPU_NAME,_proc_init)
 #define cpu_proc_fin			__glue(CPU_NAME,_proc_fin)

+ 16 - 0
arch/arm/include/asm/mcpm.h

@@ -217,6 +217,22 @@ int __mcpm_cluster_state(unsigned int cluster);
 int __init mcpm_sync_init(
 	void (*power_up_setup)(unsigned int affinity_level));
 
+/**
+ * mcpm_loopback - make a run through the MCPM low-level code
+ *
+ * @cache_disable: pointer to function performing cache disabling
+ *
+ * This exercises the MCPM machinery by soft resetting the CPU and branching
+ * to the MCPM low-level entry code before returning to the caller.
+ * The @cache_disable function must do the necessary cache disabling to
+ * let the regular kernel init code turn it back on as if the CPU was
+ * hotplugged in. The MCPM state machine is set as if the cluster was
+ * initialized meaning the power_up_setup callback passed to mcpm_sync_init()
+ * will be invoked for all affinity levels. This may be useful to initialize
+ * some resources such as enabling the CCI that requires the cache to be off,
+ * or simply for testing purposes.
+ */
+int __init mcpm_loopback(void (*cache_disable)(void));
+
 void __init mcpm_smp_set_ops(void);
 
 #else

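A hypothetical caller sketch: my_power_ops and my_cache_disable are placeholders for a platform's real MCPM backend, and v7_exit_coherency_flush() is one way to satisfy the cache-off requirement on ARMv7:

#include <asm/cacheflush.h>
#include <asm/mcpm.h>

static const struct mcpm_platform_ops my_power_ops;	/* placeholder */

static void my_cache_disable(void)
{
	/* clean/invalidate all cache levels and turn the C bit off */
	v7_exit_coherency_flush(all);
}

static int __init my_pm_init(void)
{
	int ret = mcpm_platform_register(&my_power_ops);

	if (!ret)
		/* runs the power_up_setup callback once for all affinity levels */
		ret = mcpm_loopback(my_cache_disable);
	return ret;
}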
+ 23 - 0
arch/arm/include/asm/mcs_spinlock.h

@@ -0,0 +1,23 @@
+#ifndef __ASM_MCS_LOCK_H
+#define __ASM_MCS_LOCK_H
+
+#ifdef CONFIG_SMP
+#include <asm/spinlock.h>
+
+/* MCS spin-locking. */
+#define arch_mcs_spin_lock_contended(lock)				\
+do {									\
+	/* Ensure prior stores are observed before we enter wfe. */	\
+	smp_mb();							\
+	while (!(smp_load_acquire(lock)))				\
+		wfe();							\
+} while (0)								\
+
+#define arch_mcs_spin_unlock_contended(lock)				\
+do {									\
+	smp_store_release(lock, 1);					\
+	dsb_sev();							\
+} while (0)
+
+#endif	/* CONFIG_SMP */
+#endif	/* __ASM_MCS_LOCK_H */

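For context, a slightly simplified sketch of how the generic MCS slow path (kernel/locking/mcs_spinlock.h) invokes the contended hooks defined above:

struct mcs_spinlock {
	struct mcs_spinlock *next;
	int locked;		/* becomes 1 when the lock is handed to us */
};

static inline void mcs_spin_lock(struct mcs_spinlock **lock,
				 struct mcs_spinlock *node)
{
	struct mcs_spinlock *prev;

	node->locked = 0;
	node->next = NULL;

	prev = xchg(lock, node);		/* queue ourselves at the tail */
	if (likely(prev == NULL))
		return;				/* uncontended: lock acquired */
	ACCESS_ONCE(prev->next) = node;

	/* sleeps in wfe() until the owner stores 1 and does dsb_sev() */
	arch_mcs_spin_lock_contended(&node->locked);
}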
+ 3 - 7
arch/arm/include/asm/memory.h

@@ -91,9 +91,7 @@
 * of this define that was meant to.
 * Fortunately, there is no reference for this in noMMU mode, for now.
 */
-#ifndef TASK_SIZE
-#define TASK_SIZE		(CONFIG_DRAM_SIZE)
-#endif
+#define TASK_SIZE		UL(0xffffffff)
 
 #ifndef TASK_UNMAPPED_BASE
 #define TASK_UNMAPPED_BASE	UL(0x00000000)
@@ -150,13 +148,11 @@
 
 /*
  * PLAT_PHYS_OFFSET is the offset (from zero) of the start of physical
- * memory.  This is used for XIP and NoMMU kernels, or by kernels which
- * have their own mach/memory.h.  Assembly code must always use
+ * memory.  This is used for XIP and NoMMU kernels, and on platforms that don't
+ * have CONFIG_ARM_PATCH_PHYS_VIRT. Assembly code must always use
 * PLAT_PHYS_OFFSET and not PHYS_OFFSET.
 */
-#ifndef PLAT_PHYS_OFFSET
 #define PLAT_PHYS_OFFSET	UL(CONFIG_PHYS_OFFSET)
-#endif
 
 #ifndef __ASSEMBLY__
 

+ 0 - 9
arch/arm/include/asm/perf_event.h

@@ -12,15 +12,6 @@
 #ifndef __ARM_PERF_EVENT_H__
 #define __ARM_PERF_EVENT_H__
 
-/*
- * The ARMv7 CPU PMU supports up to 32 event counters.
- */
-#define ARMPMU_MAX_HWEVENTS		32
-
-#define HW_OP_UNSUPPORTED		0xFFFF
-#define C(_x)				PERF_COUNT_HW_CACHE_##_x
-#define CACHE_OP_UNSUPPORTED		0xFFFF
-
 #ifdef CONFIG_HW_PERF_EVENTS
 struct pt_regs;
 extern unsigned long perf_instruction_pointer(struct pt_regs *regs);

+ 2 - 1
arch/arm/include/asm/pgtable-3level-hwdef.h

@@ -43,7 +43,7 @@
 #define PMD_SECT_BUFFERABLE	(_AT(pmdval_t, 1) << 2)
 #define PMD_SECT_CACHEABLE	(_AT(pmdval_t, 1) << 3)
 #define PMD_SECT_USER		(_AT(pmdval_t, 1) << 6)		/* AP[1] */
-#define PMD_SECT_RDONLY		(_AT(pmdval_t, 1) << 7)		/* AP[2] */
+#define PMD_SECT_AP2		(_AT(pmdval_t, 1) << 7)		/* read only */
 #define PMD_SECT_S		(_AT(pmdval_t, 3) << 8)
 #define PMD_SECT_AF		(_AT(pmdval_t, 1) << 10)
 #define PMD_SECT_nG		(_AT(pmdval_t, 1) << 11)
@@ -72,6 +72,7 @@
 #define PTE_TABLE_BIT		(_AT(pteval_t, 1) << 1)
 #define PTE_BUFFERABLE		(_AT(pteval_t, 1) << 2)		/* AttrIndx[0] */
 #define PTE_CACHEABLE		(_AT(pteval_t, 1) << 3)		/* AttrIndx[1] */
+#define PTE_AP2			(_AT(pteval_t, 1) << 7)		/* AP[2] */
 #define PTE_EXT_SHARED		(_AT(pteval_t, 3) << 8)		/* SH[1:0], inner shareable */
 #define PTE_EXT_AF		(_AT(pteval_t, 1) << 10)	/* Access Flag */
 #define PTE_EXT_NG		(_AT(pteval_t, 1) << 11)	/* nG */

+ 30 - 19
arch/arm/include/asm/pgtable-3level.h

@@ -79,18 +79,19 @@
 #define L_PTE_PRESENT		(_AT(pteval_t, 3) << 0)		/* Present */
 #define L_PTE_FILE		(_AT(pteval_t, 1) << 2)		/* only when !PRESENT */
 #define L_PTE_USER		(_AT(pteval_t, 1) << 6)		/* AP[1] */
-#define L_PTE_RDONLY		(_AT(pteval_t, 1) << 7)		/* AP[2] */
 #define L_PTE_SHARED		(_AT(pteval_t, 3) << 8)		/* SH[1:0], inner shareable */
 #define L_PTE_YOUNG		(_AT(pteval_t, 1) << 10)	/* AF */
 #define L_PTE_XN		(_AT(pteval_t, 1) << 54)	/* XN */
-#define L_PTE_DIRTY		(_AT(pteval_t, 1) << 55)	/* unused */
-#define L_PTE_SPECIAL		(_AT(pteval_t, 1) << 56)	/* unused */
+#define L_PTE_DIRTY		(_AT(pteval_t, 1) << 55)
+#define L_PTE_SPECIAL		(_AT(pteval_t, 1) << 56)
 #define L_PTE_NONE		(_AT(pteval_t, 1) << 57)	/* PROT_NONE */
+#define L_PTE_RDONLY		(_AT(pteval_t, 1) << 58)	/* READ ONLY */
 
-#define PMD_SECT_VALID		(_AT(pmdval_t, 1) << 0)
-#define PMD_SECT_DIRTY		(_AT(pmdval_t, 1) << 55)
-#define PMD_SECT_SPLITTING	(_AT(pmdval_t, 1) << 56)
-#define PMD_SECT_NONE		(_AT(pmdval_t, 1) << 57)
+#define L_PMD_SECT_VALID	(_AT(pmdval_t, 1) << 0)
+#define L_PMD_SECT_DIRTY	(_AT(pmdval_t, 1) << 55)
+#define L_PMD_SECT_SPLITTING	(_AT(pmdval_t, 1) << 56)
+#define L_PMD_SECT_NONE		(_AT(pmdval_t, 1) << 57)
+#define L_PMD_SECT_RDONLY	(_AT(pteval_t, 1) << 58)
 
 /*
  * To be used in assembly code with the upper page attributes.
@@ -207,27 +208,32 @@ static inline pmd_t *pmd_offset(pud_t *pud, unsigned long addr)
 #define pte_huge(pte)		(pte_val(pte) && !(pte_val(pte) & PTE_TABLE_BIT))
 #define pte_mkhuge(pte)		(__pte(pte_val(pte) & ~PTE_TABLE_BIT))
 
-#define pmd_young(pmd)		(pmd_val(pmd) & PMD_SECT_AF)
+#define pmd_isset(pmd, val)	((u32)(val) == (val) ? pmd_val(pmd) & (val) \
+						: !!(pmd_val(pmd) & (val)))
+#define pmd_isclear(pmd, val)	(!(pmd_val(pmd) & (val)))
+
+#define pmd_young(pmd)		(pmd_isset((pmd), PMD_SECT_AF))
 
 #define __HAVE_ARCH_PMD_WRITE
-#define pmd_write(pmd)		(!(pmd_val(pmd) & PMD_SECT_RDONLY))
+#define pmd_write(pmd)		(pmd_isclear((pmd), L_PMD_SECT_RDONLY))
+#define pmd_dirty(pmd)		(pmd_isset((pmd), L_PMD_SECT_DIRTY))
 
 #define pmd_hugewillfault(pmd)	(!pmd_young(pmd) || !pmd_write(pmd))
 #define pmd_thp_or_huge(pmd)	(pmd_huge(pmd) || pmd_trans_huge(pmd))
 
 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
-#define pmd_trans_huge(pmd)	(pmd_val(pmd) && !(pmd_val(pmd) & PMD_TABLE_BIT))
-#define pmd_trans_splitting(pmd) (pmd_val(pmd) & PMD_SECT_SPLITTING)
+#define pmd_trans_huge(pmd)	(pmd_val(pmd) && !pmd_table(pmd))
+#define pmd_trans_splitting(pmd) (pmd_isset((pmd), L_PMD_SECT_SPLITTING))
 #endif
 
 #define PMD_BIT_FUNC(fn,op) \
 static inline pmd_t pmd_##fn(pmd_t pmd) { pmd_val(pmd) op; return pmd; }
 
-PMD_BIT_FUNC(wrprotect,	|= PMD_SECT_RDONLY);
+PMD_BIT_FUNC(wrprotect,	|= L_PMD_SECT_RDONLY);
 PMD_BIT_FUNC(mkold,	&= ~PMD_SECT_AF);
-PMD_BIT_FUNC(mksplitting, |= PMD_SECT_SPLITTING);
-PMD_BIT_FUNC(mkwrite,   &= ~PMD_SECT_RDONLY);
-PMD_BIT_FUNC(mkdirty,   |= PMD_SECT_DIRTY);
+PMD_BIT_FUNC(mksplitting, |= L_PMD_SECT_SPLITTING);
+PMD_BIT_FUNC(mkwrite,   &= ~L_PMD_SECT_RDONLY);
+PMD_BIT_FUNC(mkdirty,   |= L_PMD_SECT_DIRTY);
 PMD_BIT_FUNC(mkyoung,   |= PMD_SECT_AF);
 
 #define pmd_mkhuge(pmd)		(__pmd(pmd_val(pmd) & ~PMD_TABLE_BIT))
@@ -241,8 +247,8 @@ PMD_BIT_FUNC(mkyoung,   |= PMD_SECT_AF);
 
 static inline pmd_t pmd_modify(pmd_t pmd, pgprot_t newprot)
 {
-	const pmdval_t mask = PMD_SECT_USER | PMD_SECT_XN | PMD_SECT_RDONLY |
-				PMD_SECT_VALID | PMD_SECT_NONE;
+	const pmdval_t mask = PMD_SECT_USER | PMD_SECT_XN | L_PMD_SECT_RDONLY |
+				L_PMD_SECT_VALID | L_PMD_SECT_NONE;
 	pmd_val(pmd) = (pmd_val(pmd) & ~mask) | (pgprot_val(newprot) & mask);
 	return pmd;
 }
@@ -253,8 +259,13 @@ static inline void set_pmd_at(struct mm_struct *mm, unsigned long addr,
 	BUG_ON(addr >= TASK_SIZE);
 
 	/* create a faulting entry if PROT_NONE protected */
-	if (pmd_val(pmd) & PMD_SECT_NONE)
-		pmd_val(pmd) &= ~PMD_SECT_VALID;
+	if (pmd_val(pmd) & L_PMD_SECT_NONE)
+		pmd_val(pmd) &= ~L_PMD_SECT_VALID;
+
+	if (pmd_write(pmd) && pmd_dirty(pmd))
+		pmd_val(pmd) &= ~PMD_SECT_AP2;
+	else
+		pmd_val(pmd) |= PMD_SECT_AP2;
 
 	*pmdp = __pmd(pmd_val(pmd) | PMD_SECT_nG);
 	flush_pmd_entry(pmdp);

+ 11 - 7
arch/arm/include/asm/pgtable.h

@@ -214,18 +214,22 @@ static inline pte_t *pmd_page_vaddr(pmd_t pmd)
 
 #define pte_clear(mm,addr,ptep)	set_pte_ext(ptep, __pte(0), 0)
 
+#define pte_isset(pte, val)	((u32)(val) == (val) ? pte_val(pte) & (val) \
+						: !!(pte_val(pte) & (val)))
+#define pte_isclear(pte, val)	(!(pte_val(pte) & (val)))
+
 #define pte_none(pte)		(!pte_val(pte))
-#define pte_present(pte)	(pte_val(pte) & L_PTE_PRESENT)
-#define pte_valid(pte)		(pte_val(pte) & L_PTE_VALID)
+#define pte_present(pte)	(pte_isset((pte), L_PTE_PRESENT))
+#define pte_valid(pte)		(pte_isset((pte), L_PTE_VALID))
 #define pte_accessible(mm, pte)	(mm_tlb_flush_pending(mm) ? pte_present(pte) : pte_valid(pte))
-#define pte_write(pte)		(!(pte_val(pte) & L_PTE_RDONLY))
-#define pte_dirty(pte)		(pte_val(pte) & L_PTE_DIRTY)
-#define pte_young(pte)		(pte_val(pte) & L_PTE_YOUNG)
-#define pte_exec(pte)		(!(pte_val(pte) & L_PTE_XN))
+#define pte_write(pte)		(pte_isclear((pte), L_PTE_RDONLY))
+#define pte_dirty(pte)		(pte_isset((pte), L_PTE_DIRTY))
+#define pte_young(pte)		(pte_isset((pte), L_PTE_YOUNG))
+#define pte_exec(pte)		(pte_isclear((pte), L_PTE_XN))
 #define pte_special(pte)	(0)
 
 #define pte_valid_user(pte)	\
-	(pte_valid(pte) && (pte_val(pte) & L_PTE_USER) && pte_young(pte))
+	(pte_valid(pte) && pte_isset((pte), L_PTE_USER) && pte_young(pte))
 
 #if __LINUX_ARM_ARCH__ < 6
 static inline void __sync_icache_dcache(pte_t pteval)

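Why the "(u32)(val) == (val)" dance: with 64-bit pteval_t under LPAE, a plain "pte_val(pte) & L_PTE_XN" tests bit 54 and silently becomes 0 if the macro's result is ever stored in a 32-bit type. A userspace demonstration (ours, not kernel code):

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint64_t pteval = 1ULL << 54;		/* e.g. L_PTE_XN */
	int truncated = pteval & (1ULL << 54);	/* high bit lost on conversion: 0 */
	int fixed = !!(pteval & (1ULL << 54));	/* what pte_isset() does: 1 */

	printf("%d %d\n", truncated, fixed);	/* prints "0 1" */
	return 0;
}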
+ 19 - 0
arch/arm/include/asm/pmu.h

@@ -42,6 +42,25 @@ struct arm_pmu_platdata {
 
 #ifdef CONFIG_HW_PERF_EVENTS
 
+/*
+ * The ARMv7 CPU PMU supports up to 32 event counters.
+ */
+#define ARMPMU_MAX_HWEVENTS		32
+
+#define HW_OP_UNSUPPORTED		0xFFFF
+#define C(_x)				PERF_COUNT_HW_CACHE_##_x
+#define CACHE_OP_UNSUPPORTED		0xFFFF
+
+#define PERF_MAP_ALL_UNSUPPORTED					\
+	[0 ... PERF_COUNT_HW_MAX - 1] = HW_OP_UNSUPPORTED
+
+#define PERF_CACHE_MAP_ALL_UNSUPPORTED					\
+[0 ... C(MAX) - 1] = {							\
+	[0 ... C(OP_MAX) - 1] = {					\
+		[0 ... C(RESULT_MAX) - 1] = CACHE_OP_UNSUPPORTED,	\
+	},								\
+}
+
 /* The events for a given PMU register set. */
 struct pmu_hw_events {
 	/*

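These macros rely on GNU C range designated initializers; a PMU event map can then default everything to "unsupported" and override only what the CPU implements, since later initializers take precedence. A minimal sketch (the override value is illustrative):

static const unsigned example_perf_map[PERF_COUNT_HW_MAX] = {
	PERF_MAP_ALL_UNSUPPORTED,
	[PERF_COUNT_HW_CPU_CYCLES] = 0x11,	/* later entries override the range */
};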
+ 6 - 0
arch/arm/include/asm/ptrace.h

@@ -84,6 +84,12 @@ static inline long regs_return_value(struct pt_regs *regs)
 
 #define instruction_pointer(regs)	(regs)->ARM_pc
 
+#ifdef CONFIG_THUMB2_KERNEL
+#define frame_pointer(regs) (regs)->ARM_r7
+#else
+#define frame_pointer(regs) (regs)->ARM_fp
+#endif
+
 static inline void instruction_pointer_set(struct pt_regs *regs,
 					   unsigned long val)
 {

+ 1 - 1
arch/arm/include/asm/smp_scu.h

@@ -11,7 +11,7 @@
 
 static inline bool scu_a9_has_base(void)
 {
-	return read_cpuid_part_number() == ARM_CPU_PART_CORTEX_A9;
+	return read_cpuid_part() == ARM_CPU_PART_CORTEX_A9;
 }
 
 static inline unsigned long scu_a9_get_base(void)

+ 15 - 0
arch/arm/include/asm/stacktrace.h

@@ -1,13 +1,28 @@
 #ifndef __ASM_STACKTRACE_H
 #define __ASM_STACKTRACE_H
 
+#include <asm/ptrace.h>
+
 struct stackframe {
+	/*
+	 * FP member should hold R7 when CONFIG_THUMB2_KERNEL is enabled
+	 * and R11 otherwise.
+	 */
 	unsigned long fp;
 	unsigned long sp;
 	unsigned long lr;
 	unsigned long pc;
 };
 
+static __always_inline
+void arm_get_current_stackframe(struct pt_regs *regs, struct stackframe *frame)
+{
+		frame->fp = frame_pointer(regs);
+		frame->sp = regs->ARM_sp;
+		frame->lr = regs->ARM_lr;
+		frame->pc = regs->ARM_pc;
+}
+
 extern int unwind_frame(struct stackframe *frame);
 extern void walk_stackframe(struct stackframe *frame,
 			    int (*fn)(struct stackframe *, void *), void *data);

+ 2 - 1
arch/arm/include/asm/thread_info.h

@@ -14,9 +14,10 @@
 
 #include <linux/compiler.h>
 #include <asm/fpstate.h>
+#include <asm/page.h>
 
 #define THREAD_SIZE_ORDER	1
-#define THREAD_SIZE		8192
+#define THREAD_SIZE		(PAGE_SIZE << THREAD_SIZE_ORDER)
 #define THREAD_START_SP		(THREAD_SIZE - 8)
 
 #ifndef __ASSEMBLY__

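The C counterpart (essentially what this header already provides upstream): thread_info lives at the bottom of the THREAD_SIZE-aligned stack, so masking the stack pointer finds it, exactly as the get_thread_info assembly macro now does with THREAD_SIZE_ORDER + PAGE_SHIFT:

static inline struct thread_info *current_thread_info(void)
{
	register unsigned long sp asm ("sp");
	return (struct thread_info *)(sp & ~(THREAD_SIZE - 1));
}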
+ 20 - 2
arch/arm/include/asm/uaccess.h

@@ -107,6 +107,8 @@ static inline void set_fs(mm_segment_t fs)
 extern int __get_user_1(void *);
 extern int __get_user_2(void *);
 extern int __get_user_4(void *);
+extern int __get_user_lo8(void *);
+extern int __get_user_8(void *);
 
 #define __GUP_CLOBBER_1	"lr", "cc"
 #ifdef CONFIG_CPU_USE_DOMAINS
@@ -115,6 +117,8 @@ extern int __get_user_4(void *);
 #define __GUP_CLOBBER_2 "lr", "cc"
 #endif
 #define __GUP_CLOBBER_4	"lr", "cc"
+#define __GUP_CLOBBER_lo8 "lr", "cc"
+#define __GUP_CLOBBER_8	"lr", "cc"
 
 #define __get_user_x(__r2,__p,__e,__l,__s)				\
 	   __asm__ __volatile__ (					\
@@ -125,11 +129,19 @@ extern int __get_user_4(void *);
 		: "0" (__p), "r" (__l)					\
 		: __GUP_CLOBBER_##__s)
 
+/* narrowing a double-word get into a single 32bit word register: */
+#ifdef __ARMEB__
+#define __get_user_xb(__r2, __p, __e, __l, __s)				\
+	__get_user_x(__r2, __p, __e, __l, lo8)
+#else
+#define __get_user_xb __get_user_x
+#endif
+
 #define __get_user_check(x,p)							\
 	({								\
 		unsigned long __limit = current_thread_info()->addr_limit - 1; \
 		register const typeof(*(p)) __user *__p asm("r0") = (p);\
-		register unsigned long __r2 asm("r2");			\
+		register typeof(x) __r2 asm("r2");			\
 		register unsigned long __l asm("r1") = __limit;		\
 		register int __e asm("r0");				\
 		switch (sizeof(*(__p))) {				\
@@ -142,6 +154,12 @@ extern int __get_user_4(void *);
 		case 4:							\
 			__get_user_x(__r2, __p, __e, __l, 4);		\
 			break;						\
+		case 8:							\
+			if (sizeof((x)) < 8)				\
+				__get_user_xb(__r2, __p, __e, __l, 4);	\
+			else						\
+				__get_user_x(__r2, __p, __e, __l, 8);	\
+			break;						\
 		default: __e = __get_user_bad(); break;			\
 		}							\
 		x = (typeof(*(p))) __r2;				\
@@ -224,7 +242,7 @@ static inline void set_fs(mm_segment_t fs)
 #define access_ok(type,addr,size)	(__range_ok(addr,size) == 0)
 
 #define user_addr_max() \
-	(segment_eq(get_fs(), USER_DS) ? TASK_SIZE : ~0UL)
+	(segment_eq(get_fs(), KERNEL_DS) ? ~0UL : get_fs())
 
 /*
  * The "__xxx" versions of the user access functions do not verify the

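A usage sketch for the new 8-byte case (function and variable names are ours):

#include <linux/kernel.h>
#include <linux/uaccess.h>

static int example_read_u64(void __user *p)
{
	u64 full;
	u32 low;
	int err;

	err = get_user(full, (u64 __user *)p);	/* new: __get_user_8 */
	if (err)
		return err;

	/*
	 * Narrowing read: sizeof(low) < 8, so only one 32-bit word is
	 * fetched; on big-endian, __get_user_xb redirects to
	 * __get_user_lo8 so the *low* word of the u64 is returned.
	 */
	err = get_user(low, (u64 __user *)p);
	if (!err)
		pr_info("full=%llx low=%x\n", full, low);
	return err;
}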
+ 10 - 0
arch/arm/include/asm/unistd.h

@@ -15,7 +15,17 @@
 
 #include <uapi/asm/unistd.h>
 
+/*
+ * This may need to be greater than __NR_last_syscall+1 in order to
+ * account for the padding in the syscall table
+ */
 #define __NR_syscalls  (384)
+
+/*
+ * *NOTE*: This is a ghost syscall private to the kernel.  Only the
+ * __kuser_cmpxchg code in entry-armv.S should be aware of its
+ * existence.  Don't ever use this from user code.
+ */
 #define __ARM_NR_cmpxchg		(__ARM_NR_BASE+0x00fff0)
 
 #define __ARCH_WANT_STAT64

+ 0 - 11
arch/arm/include/uapi/asm/unistd.h

@@ -410,11 +410,6 @@
 #define __NR_sched_getattr		(__NR_SYSCALL_BASE+381)
 #define __NR_renameat2			(__NR_SYSCALL_BASE+382)
 
-/*
- * This may need to be greater than __NR_last_syscall+1 in order to
- * account for the padding in the syscall table
- */
-
 /*
  * The following SWIs are ARM private.
  */
@@ -425,12 +420,6 @@
 #define __ARM_NR_usr32			(__ARM_NR_BASE+4)
 #define __ARM_NR_set_tls		(__ARM_NR_BASE+5)
 
-/*
- * *NOTE*: This is a ghost syscall private to the kernel.  Only the
- * __kuser_cmpxchg code in entry-armv.S should be aware of its
- * existence.  Don't ever use this from user code.
- */
-
 /*
  * The following syscalls are obsolete and no longer available for EABI.
  */

+ 5 - 5
arch/arm/kernel/debug.S

@@ -90,7 +90,7 @@ ENTRY(printascii)
 		ldrneb	r1, [r0], #1
 		teqne	r1, #0
 		bne	1b
-		mov	pc, lr
+		ret	lr
 ENDPROC(printascii)
 
 ENTRY(printch)
@@ -105,7 +105,7 @@ ENTRY(debug_ll_addr)
 		addruart r2, r3, ip
 		str	r2, [r0]
 		str	r3, [r1]
-		mov	pc, lr
+		ret	lr
 ENDPROC(debug_ll_addr)
 #endif
 
@@ -116,7 +116,7 @@ ENTRY(printascii)
 		mov	r0, #0x04		@ SYS_WRITE0
 	ARM(	svc	#0x123456	)
 	THUMB(	svc	#0xab		)
-		mov	pc, lr
+		ret	lr
 ENDPROC(printascii)
 
 ENTRY(printch)
@@ -125,14 +125,14 @@ ENTRY(printch)
 		mov	r0, #0x03		@ SYS_WRITEC
 	ARM(	svc	#0x123456	)
 	THUMB(	svc	#0xab		)
-		mov	pc, lr
+		ret	lr
 ENDPROC(printch)
 
 ENTRY(debug_ll_addr)
 		mov	r2, #0
 		str	r2, [r0]
 		str	r2, [r1]
-		mov	pc, lr
+		ret	lr
 ENDPROC(debug_ll_addr)
 
 #endif

+ 21 - 21
arch/arm/kernel/entry-armv.S

@@ -224,7 +224,7 @@ svc_preempt:
 1:	bl	preempt_schedule_irq		@ irq en/disable is done inside
 	ldr	r0, [tsk, #TI_FLAGS]		@ get new tasks TI_FLAGS
 	tst	r0, #_TIF_NEED_RESCHED
-	moveq	pc, r8				@ go again
+	reteq	r8				@ go again
 	b	1b
 #endif
 
@@ -490,7 +490,7 @@ ENDPROC(__und_usr)
 	.pushsection .fixup, "ax"
 	.align	2
 4:	str     r4, [sp, #S_PC]			@ retry current instruction
-	mov	pc, r9
+	ret	r9
 	.popsection
 	.pushsection __ex_table,"a"
 	.long	1b, 4b
@@ -552,7 +552,7 @@ call_fpe:
 #endif
 	tst	r0, #0x08000000			@ only CDP/CPRT/LDC/STC have bit 27
 	tstne	r0, #0x04000000			@ bit 26 set on both ARM and Thumb-2
-	moveq	pc, lr
+	reteq	lr
 	and	r8, r0, #0x00000f00		@ mask out CP number
 THUMB(	lsr	r8, r8, #8		)
 	mov	r7, #1
@@ -571,33 +571,33 @@ call_fpe:
 THUMB(	add	pc, r8			)
 	nop
 
-	movw_pc	lr				@ CP#0
+	ret.w	lr				@ CP#0
 	W(b)	do_fpe				@ CP#1 (FPE)
 	W(b)	do_fpe				@ CP#2 (FPE)
-	movw_pc	lr				@ CP#3
+	ret.w	lr				@ CP#3
 #ifdef CONFIG_CRUNCH
 	b	crunch_task_enable		@ CP#4 (MaverickCrunch)
 	b	crunch_task_enable		@ CP#5 (MaverickCrunch)
 	b	crunch_task_enable		@ CP#6 (MaverickCrunch)
 #else
-	movw_pc	lr				@ CP#4
-	movw_pc	lr				@ CP#5
-	movw_pc	lr				@ CP#6
+	ret.w	lr				@ CP#4
+	ret.w	lr				@ CP#5
+	ret.w	lr				@ CP#6
 #endif
-	movw_pc	lr				@ CP#7
-	movw_pc	lr				@ CP#8
-	movw_pc	lr				@ CP#9
+	ret.w	lr				@ CP#7
+	ret.w	lr				@ CP#8
+	ret.w	lr				@ CP#9
 #ifdef CONFIG_VFP
 	W(b)	do_vfp				@ CP#10 (VFP)
 	W(b)	do_vfp				@ CP#11 (VFP)
 #else
-	movw_pc	lr				@ CP#10 (VFP)
-	movw_pc	lr				@ CP#11 (VFP)
+	ret.w	lr				@ CP#10 (VFP)
+	ret.w	lr				@ CP#11 (VFP)
 #endif
-	movw_pc	lr				@ CP#12
-	movw_pc	lr				@ CP#13
-	movw_pc	lr				@ CP#14 (Debug)
-	movw_pc	lr				@ CP#15 (Control)
+	ret.w	lr				@ CP#12
+	ret.w	lr				@ CP#13
+	ret.w	lr				@ CP#14 (Debug)
+	ret.w	lr				@ CP#15 (Control)
 
 #ifdef NEED_CPU_ARCHITECTURE
 	.align	2
@@ -649,7 +649,7 @@ ENTRY(fp_enter)
 	.popsection
 
 ENTRY(no_fp)
-	mov	pc, lr
+	ret	lr
 ENDPROC(no_fp)
 
 __und_usr_fault_32:
@@ -745,7 +745,7 @@ ENDPROC(__switch_to)
 #ifdef CONFIG_ARM_THUMB
 	bx	\reg
 #else
-	mov	pc, \reg
+	ret	\reg
 #endif
 	.endm
 
@@ -837,7 +837,7 @@ kuser_cmpxchg64_fixup:
 #if __LINUX_ARM_ARCH__ < 6
 	bcc	kuser_cmpxchg32_fixup
 #endif
-	mov	pc, lr
+	ret	lr
 	.previous
 
 #else
@@ -905,7 +905,7 @@ kuser_cmpxchg32_fixup:
 	subs	r8, r4, r7
 	rsbcss	r8, r8, #(2b - 1b)
 	strcs	r7, [sp, #S_PC]
-	mov	pc, lr
+	ret	lr
 	.previous
 
 #else

+ 7 - 6
arch/arm/kernel/entry-common.S

@@ -8,6 +8,7 @@
 * published by the Free Software Foundation.
 */
 
+#include <asm/assembler.h>
 #include <asm/unistd.h>
 #include <asm/ftrace.h>
 #include <asm/unwind.h>
@@ -88,7 +89,7 @@ ENTRY(ret_from_fork)
 	cmp	r5, #0
 	movne	r0, r4
 	adrne	lr, BSYM(1f)
-	movne	pc, r5
+	retne	r5
 1:	get_thread_info tsk
 	b	ret_slow_syscall
 ENDPROC(ret_from_fork)
@@ -290,7 +291,7 @@ ENDPROC(ftrace_graph_caller_old)
 
 .macro mcount_exit
 	ldmia	sp!, {r0-r3, ip, lr}
-	mov	pc, ip
+	ret	ip
 .endm
 
 ENTRY(__gnu_mcount_nc)
@@ -298,7 +299,7 @@ UNWIND(.fnstart)
 #ifdef CONFIG_DYNAMIC_FTRACE
 	mov	ip, lr
 	ldmia	sp!, {lr}
-	mov	pc, ip
+	ret	ip
 #else
 	__mcount
 #endif
@@ -333,12 +334,12 @@ return_to_handler:
 	bl	ftrace_return_to_handler
 	mov	lr, r0			@ r0 has real ret addr
 	ldmia	sp!, {r0-r3}
-	mov	pc, lr
+	ret	lr
 #endif
 
 ENTRY(ftrace_stub)
 .Lftrace_stub:
-	mov	pc, lr
+	ret	lr
 ENDPROC(ftrace_stub)
 
 #endif /* CONFIG_FUNCTION_TRACER */
@@ -561,7 +562,7 @@ sys_mmap2:
 		streq	r5, [sp, #4]
 		beq	sys_mmap_pgoff
 		mov	r0, #-EINVAL
-		mov	pc, lr
+		ret	lr
 #else
 		str	r5, [sp, #4]
 		b	sys_mmap_pgoff

+ 0 - 14
arch/arm/kernel/entry-header.S

@@ -240,12 +240,6 @@
 	movs	pc, lr				@ return & move spsr_svc into cpsr
 	.endm
 
-	@
-	@ 32-bit wide "mov pc, reg"
-	@
-	.macro	movw_pc, reg
-	mov	pc, \reg
-	.endm
 #else	/* CONFIG_THUMB2_KERNEL */
 	.macro	svc_exit, rpsr, irq = 0
 	.if	\irq != 0
@@ -304,14 +298,6 @@
 	movs	pc, lr				@ return & move spsr_svc into cpsr
 	.endm
 #endif	/* ifdef CONFIG_CPU_V7M / else */
-
-	@
-	@ 32-bit wide "mov pc, reg"
-	@
-	.macro	movw_pc, reg
-	mov	pc, \reg
-	nop
-	.endm
 #endif	/* !CONFIG_THUMB2_KERNEL */
 
 /*

+ 2 - 2
arch/arm/kernel/fiqasm.S

@@ -32,7 +32,7 @@ ENTRY(__set_fiq_regs)
 	ldr	lr, [r0]
 	msr	cpsr_c, r1	@ return to SVC mode
 	mov	r0, r0		@ avoid hazard prior to ARMv4
-	mov	pc, lr
+	ret	lr
 ENDPROC(__set_fiq_regs)
 
 ENTRY(__get_fiq_regs)
@@ -45,5 +45,5 @@ ENTRY(__get_fiq_regs)
 	str	lr, [r0]
 	msr	cpsr_c, r1	@ return to SVC mode
 	mov	r0, r0		@ avoid hazard prior to ARMv4
-	mov	pc, lr
+	ret	lr
 ENDPROC(__get_fiq_regs)

+ 4 - 3
arch/arm/kernel/head-common.S

@@ -10,6 +10,7 @@
 * published by the Free Software Foundation.
 *
 */
+#include <asm/assembler.h>
 
 #define ATAG_CORE 0x54410001
 #define ATAG_CORE_SIZE ((2*4 + 3*4) >> 2)
@@ -61,10 +62,10 @@ __vet_atags:
 	cmp	r5, r6
 	bne	1f
 
-2:	mov	pc, lr				@ atag/dtb pointer is ok
+2:	ret	lr				@ atag/dtb pointer is ok
 
 1:	mov	r2, #0
-	mov	pc, lr
+	ret	lr
 ENDPROC(__vet_atags)
 
 /*
@@ -162,7 +163,7 @@ __lookup_processor_type:
 	cmp	r5, r6
 	blo	1b
 	mov	r5, #0				@ unknown processor
-2:	mov	pc, lr
+2:	ret	lr
 ENDPROC(__lookup_processor_type)
 
 /*

+ 4 - 4
arch/arm/kernel/head-nommu.S

@@ -82,7 +82,7 @@ ENTRY(stext)
 	adr	lr, BSYM(1f)			@ return (PIC) address
 ARM(	add	pc, r10, #PROCINFO_INITFUNC	)
 THUMB(	add	r12, r10, #PROCINFO_INITFUNC	)
- THUMB(	mov	pc, r12				)
+ THUMB(	ret	r12				)
 1:	b	__after_proc_init
ENDPROC(stext)
 
@@ -119,7 +119,7 @@ ENTRY(secondary_startup)
 	mov	r13, r12			@ __secondary_switched address
 ARM(	add	pc, r10, #PROCINFO_INITFUNC	)
 THUMB(	add	r12, r10, #PROCINFO_INITFUNC	)
- THUMB(	mov	pc, r12				)
+ THUMB(	ret	r12				)
 ENDPROC(secondary_startup)
 
 ENTRY(__secondary_switched)
@@ -164,7 +164,7 @@ __after_proc_init:
 #endif
 	mcr	p15, 0, r0, c1, c0, 0		@ write control reg
 #endif /* CONFIG_CPU_CP15 */
-	mov	pc, r13
+	ret	r13
 ENDPROC(__after_proc_init)
 	.ltorg
 
@@ -254,7 +254,7 @@ ENTRY(__setup_mpu)
 	orr	r0, r0, #CR_M			@ Set SCTRL.M (MPU on)
 	mcr	p15, 0, r0, c1, c0, 0		@ Enable MPU
 	isb
-	mov pc,lr
+	ret	lr
 ENDPROC(__setup_mpu)
 #endif
 #include "head-common.S"

+ 9 - 9
arch/arm/kernel/head.S

@@ -140,7 +140,7 @@ ENTRY(stext)
 	mov	r8, r4				@ set TTBR1 to swapper_pg_dir
 ARM(	add	pc, r10, #PROCINFO_INITFUNC	)
 THUMB(	add	r12, r10, #PROCINFO_INITFUNC	)
- THUMB(	mov	pc, r12				)
+ THUMB(	ret	r12				)
 1:	b	__enable_mmu
 ENDPROC(stext)
 	.ltorg
@@ -335,7 +335,7 @@ __create_page_tables:
 	sub	r4, r4, #0x1000		@ point to the PGD table
 	mov	r4, r4, lsr #ARCH_PGD_SHIFT
 #endif
-	mov	pc, lr
+	ret	lr
 ENDPROC(__create_page_tables)
 	.ltorg
 	.align
@@ -383,7 +383,7 @@ ENTRY(secondary_startup)
 ARM(	add	pc, r10, #PROCINFO_INITFUNC	) @ initialise processor
 						  @ (return control reg)
 THUMB(	add	r12, r10, #PROCINFO_INITFUNC	)
- THUMB(	mov	pc, r12				)
+ THUMB(	ret	r12				)
 ENDPROC(secondary_startup)
 
 	/*
@@ -468,7 +468,7 @@ ENTRY(__turn_mmu_on)
 	instr_sync
 	mov	r3, r3
 	mov	r3, r13
-	mov	pc, r3
+	ret	r3
 __turn_mmu_on_end:
 ENDPROC(__turn_mmu_on)
 	.popsection
@@ -487,7 +487,7 @@ __fixup_smp:
 	orr	r4, r4, #0x0000b000
 	orr	r4, r4, #0x00000020	@ val 0x4100b020
 	teq	r3, r4			@ ARM 11MPCore?
-	moveq	pc, lr			@ yes, assume SMP
+	reteq	lr			@ yes, assume SMP
 
 	mrc	p15, 0, r0, c0, c0, 5	@ read MPIDR
 	and	r0, r0, #0xc0000000	@ multiprocessing extensions and
@@ -500,7 +500,7 @@ __fixup_smp:
 	orr	r4, r4, #0x0000c000
 	orr	r4, r4, #0x00000090
 	teq	r3, r4			@ Check for ARM Cortex-A9
-	movne	pc, lr			@ Not ARM Cortex-A9,
+	retne	lr			@ Not ARM Cortex-A9,
 
 	@ If a future SoC *does* use 0x0 as the PERIPH_BASE, then the
 	@ below address check will need to be #ifdef'd or equivalent
@@ -512,7 +512,7 @@ __fixup_smp:
ARM_BE8(rev	r0, r0)			@ byteswap if big endian
 	and	r0, r0, #0x3		@ number of CPUs
 	teq	r0, #0x0		@ is 1?
-	movne	pc, lr
+	retne	lr
 
 __fixup_smp_on_up:
 	adr	r0, 1f
@@ -539,7 +539,7 @@ smp_on_up:
 	.text
 __do_fixup_smp_on_up:
 	cmp	r4, r5
-	movhs	pc, lr
+	reths	lr
 	ldmia	r4!, {r0, r6}
 ARM(	str	r6, [r0, r3]	)
 THUMB(	add	r0, r0, r3	)
@@ -672,7 +672,7 @@ ARM_BE8(rev16	ip, ip)
 2:	cmp	r4, r5
 	ldrcc	r7, [r4], #4	@ use branch for delay slot
 	bcc	1b
-	mov	pc, lr
+	ret	lr
 #endif
 ENDPROC(__fixup_a_pv_table)
 

+ 3 - 3
arch/arm/kernel/hyp-stub.S

@@ -99,7 +99,7 @@ ENTRY(__hyp_stub_install_secondary)
 	 * immediately.
 	 */
 	compare_cpu_mode_with_primary	r4, r5, r6, r7
-	movne	pc, lr
+	retne	lr
 
 	/*
 	 * Once we have given up on one CPU, we do not try to install the
@@ -111,7 +111,7 @@ ENTRY(__hyp_stub_install_secondary)
 	 */
 
 	cmp	r4, #HYP_MODE
-	movne	pc, lr			@ give up if the CPU is not in HYP mode
+	retne	lr			@ give up if the CPU is not in HYP mode
 
 /*
 * Configure HSCTLR to set correct exception endianness/instruction set
@@ -201,7 +201,7 @@ ENDPROC(__hyp_get_vectors)
 	@ fall through
 ENTRY(__hyp_set_vectors)
 	__HVC(0)
-	mov	pc, lr
+	ret	lr
 ENDPROC(__hyp_set_vectors)
 
 #ifndef ZIMAGE

+ 8 - 8
arch/arm/kernel/iwmmxt.S

@@ -100,7 +100,7 @@ ENTRY(iwmmxt_task_enable)
 	get_thread_info r10
 #endif
 4:	dec_preempt_count r10, r3
-	mov	pc, r9				@ normal exit from exception
+	ret	r9				@ normal exit from exception
 
 concan_save:
 
@@ -144,7 +144,7 @@ concan_dump:
 	wstrd	wR15, [r1, #MMX_WR15]
 
 2:	teq	r0, #0				@ anything to load?
-	moveq	pc, lr				@ if not, return
+	reteq	lr				@ if not, return
 
 concan_load:
 
@@ -177,10 +177,10 @@ concan_load:
 	@ clear CUP/MUP (only if r1 != 0)
 	teq	r1, #0
 	mov 	r2, #0
-	moveq	pc, lr
+	reteq	lr
 
 	tmcr	wCon, r2
-	mov	pc, lr
+	ret	lr
 
 /*
 * Back up Concan regs to save area and disable access to them
@@ -266,7 +266,7 @@ ENTRY(iwmmxt_task_copy)
 	mov	r3, lr				@ preserve return address
 	bl	concan_dump
 	msr	cpsr_c, ip			@ restore interrupt mode
-	mov	pc, r3
+	ret	r3
 
 /*
 * Restore Concan state from given memory address
@@ -302,7 +302,7 @@ ENTRY(iwmmxt_task_restore)
 	mov	r3, lr				@ preserve return address
 	bl	concan_load
 	msr	cpsr_c, ip			@ restore interrupt mode
-	mov	pc, r3
+	ret	r3
 
 /*
 * Concan handling on task switch
@@ -324,7 +324,7 @@ ENTRY(iwmmxt_task_switch)
 	add	r3, r0, #TI_IWMMXT_STATE	@ get next task Concan save area
 	ldr	r2, [r2]			@ get current Concan owner
 	teq	r2, r3				@ next task owns it?
-	movne	pc, lr				@ no: leave Concan disabled
+	retne	lr				@ no: leave Concan disabled
 
 1:	@ flip Concan access
 	XSC(eor	r1, r1, #0x3)
@@ -351,7 +351,7 @@ ENTRY(iwmmxt_task_release)
 	eors	r0, r0, r1			@ if equal...
 	streq	r0, [r3]			@ then clear ownership
 	msr	cpsr_c, r2			@ restore interrupts
-	mov	pc, lr
+	ret	lr
 
 	.data
 concan_owner:

+ 12 - 6
arch/arm/kernel/perf_event.c

@@ -560,11 +560,16 @@ user_backtrace(struct frame_tail __user *tail,
 	       struct perf_callchain_entry *entry)
 {
 	struct frame_tail buftail;
+	unsigned long err;

-	/* Also check accessibility of one struct frame_tail beyond */
 	if (!access_ok(VERIFY_READ, tail, sizeof(buftail)))
 		return NULL;
-	if (__copy_from_user_inatomic(&buftail, tail, sizeof(buftail)))
+
+	pagefault_disable();
+	err = __copy_from_user_inatomic(&buftail, tail, sizeof(buftail));
+	pagefault_enable();
+
+	if (err)
 		return NULL;

 	perf_callchain_store(entry, buftail.lr);
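The hunk above makes user_backtrace() safe for perf interrupt context: the inatomic copy is bracketed by pagefault_disable()/pagefault_enable(), so a fault while reading the user stack fails fast instead of trying to handle the fault and sleep. A minimal sketch of the same pattern follows; read_user_word() is a hypothetical name used only for illustration:

	#include <linux/uaccess.h>

	/* Sketch: read one word from user space in a context that must
	 * not sleep. Hypothetical helper, not part of this diff. */
	static int read_user_word(const unsigned long __user *uaddr,
				  unsigned long *val)
	{
		unsigned long err;

		if (!access_ok(VERIFY_READ, uaddr, sizeof(*val)))
			return -EFAULT;

		pagefault_disable();	/* a fault now returns an error */
		err = __copy_from_user_inatomic(val, uaddr, sizeof(*val));
		pagefault_enable();

		return err ? -EFAULT : 0;
	}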
@@ -590,6 +595,10 @@ perf_callchain_user(struct perf_callchain_entry *entry, struct pt_regs *regs)
 	}

 	perf_callchain_store(entry, regs->ARM_pc);
+
+	if (!current->mm)
+		return;
+
 	tail = (struct frame_tail __user *)regs->ARM_fp - 1;

 	while ((entry->nr < PERF_MAX_STACK_DEPTH) &&
@@ -621,10 +630,7 @@ perf_callchain_kernel(struct perf_callchain_entry *entry, struct pt_regs *regs)
 		return;
 	}

-	fr.fp = regs->ARM_fp;
-	fr.sp = regs->ARM_sp;
-	fr.lr = regs->ARM_lr;
-	fr.pc = regs->ARM_pc;
+	arm_get_current_stackframe(regs, &fr);
 	walk_stackframe(&fr, callchain_trace, entry);
 }
 
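The perf_callchain_kernel() hunk swaps four open-coded register copies for a single arm_get_current_stackframe() call. Assuming the struct stackframe fields used by walk_stackframe(), the helper amounts to the sketch below (the sketch_ prefix marks it as illustrative, not the kernel's definition):

	/* Sketch: seed a stackframe from pt_regs in one call, mirroring
	 * the four assignments this diff removes. */
	static inline void sketch_get_current_stackframe(struct pt_regs *regs,
							 struct stackframe *frame)
	{
		frame->fp = regs->ARM_fp;
		frame->sp = regs->ARM_sp;
		frame->lr = regs->ARM_lr;
		frame->pc = regs->ARM_pc;
	}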

+ 36 - 30
arch/arm/kernel/perf_event_cpu.c

@@ -233,14 +233,17 @@ static struct of_device_id cpu_pmu_of_device_ids[] = {
 	{.compatible = "arm,cortex-a7-pmu",	.data = armv7_a7_pmu_init},
 	{.compatible = "arm,cortex-a5-pmu",	.data = armv7_a5_pmu_init},
 	{.compatible = "arm,arm11mpcore-pmu",	.data = armv6mpcore_pmu_init},
-	{.compatible = "arm,arm1176-pmu",	.data = armv6pmu_init},
-	{.compatible = "arm,arm1136-pmu",	.data = armv6pmu_init},
+	{.compatible = "arm,arm1176-pmu",	.data = armv6_1176_pmu_init},
+	{.compatible = "arm,arm1136-pmu",	.data = armv6_1136_pmu_init},
 	{.compatible = "qcom,krait-pmu",	.data = krait_pmu_init},
 	{},
 };

 static struct platform_device_id cpu_pmu_plat_device_ids[] = {
 	{.name = "arm-pmu"},
+	{.name = "armv6-pmu"},
+	{.name = "armv7-pmu"},
+	{.name = "xscale-pmu"},
 	{},
 };

@@ -250,40 +253,43 @@ static struct platform_device_id cpu_pmu_plat_device_ids[] = {
 static int probe_current_pmu(struct arm_pmu *pmu)
 {
 	int cpu = get_cpu();
-	unsigned long implementor = read_cpuid_implementor();
-	unsigned long part_number = read_cpuid_part_number();
 	int ret = -ENODEV;

 	pr_info("probing PMU on CPU %d\n", cpu);

+	switch (read_cpuid_part()) {
 	/* ARM Ltd CPUs. */
-	if (implementor == ARM_CPU_IMP_ARM) {
-		switch (part_number) {
-		case ARM_CPU_PART_ARM1136:
-		case ARM_CPU_PART_ARM1156:
-		case ARM_CPU_PART_ARM1176:
-			ret = armv6pmu_init(pmu);
-			break;
-		case ARM_CPU_PART_ARM11MPCORE:
-			ret = armv6mpcore_pmu_init(pmu);
-			break;
-		case ARM_CPU_PART_CORTEX_A8:
-			ret = armv7_a8_pmu_init(pmu);
-			break;
-		case ARM_CPU_PART_CORTEX_A9:
-			ret = armv7_a9_pmu_init(pmu);
-			break;
-		}
-	/* Intel CPUs [xscale]. */
-	} else if (implementor == ARM_CPU_IMP_INTEL) {
-		switch (xscale_cpu_arch_version()) {
-		case ARM_CPU_XSCALE_ARCH_V1:
-			ret = xscale1pmu_init(pmu);
-			break;
-		case ARM_CPU_XSCALE_ARCH_V2:
-			ret = xscale2pmu_init(pmu);
-			break;
+	case ARM_CPU_PART_ARM1136:
+		ret = armv6_1136_pmu_init(pmu);
+		break;
+	case ARM_CPU_PART_ARM1156:
+		ret = armv6_1156_pmu_init(pmu);
+		break;
+	case ARM_CPU_PART_ARM1176:
+		ret = armv6_1176_pmu_init(pmu);
+		break;
+	case ARM_CPU_PART_ARM11MPCORE:
+		ret = armv6mpcore_pmu_init(pmu);
+		break;
+	case ARM_CPU_PART_CORTEX_A8:
+		ret = armv7_a8_pmu_init(pmu);
+		break;
+	case ARM_CPU_PART_CORTEX_A9:
+		ret = armv7_a9_pmu_init(pmu);
+		break;
+
+	default:
+		if (read_cpuid_implementor() == ARM_CPU_IMP_INTEL) {
+			switch (xscale_cpu_arch_version()) {
+			case ARM_CPU_XSCALE_ARCH_V1:
+				ret = xscale1pmu_init(pmu);
+				break;
+			case ARM_CPU_XSCALE_ARCH_V2:
+				ret = xscale2pmu_init(pmu);
+				break;
+			}
 		}
+		break;
 	}

 	put_cpu();

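probe_current_pmu() now switches on read_cpuid_part() instead of comparing read_cpuid_implementor() and read_cpuid_part_number() separately, so every part-number match implicitly carries the vendor field; only the XScale arch-version probe still needs an explicit implementor check in the default case. The idea reduces to masking MIDR once, roughly as sketched below (the exact mask is an assumption for illustration, not quoted from this diff):

	#include <asm/cputype.h>

	/* Sketch: keep the implementor and part fields of MIDR in one
	 * value so a single switch can match vendor-qualified parts.
	 * The 0xff00fff0 mask is assumed here for illustration. */
	static inline unsigned int sketch_read_cpuid_part(void)
	{
		return read_cpuid_id() & 0xff00fff0;
	}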
+ 79 - 228
arch/arm/kernel/perf_event_v6.c

@@ -65,13 +65,11 @@ enum armv6_counters {
  * accesses/misses in hardware.
  */
 static const unsigned armv6_perf_map[PERF_COUNT_HW_MAX] = {
+	PERF_MAP_ALL_UNSUPPORTED,
 	[PERF_COUNT_HW_CPU_CYCLES]		= ARMV6_PERFCTR_CPU_CYCLES,
 	[PERF_COUNT_HW_INSTRUCTIONS]		= ARMV6_PERFCTR_INSTR_EXEC,
-	[PERF_COUNT_HW_CACHE_REFERENCES]	= HW_OP_UNSUPPORTED,
-	[PERF_COUNT_HW_CACHE_MISSES]		= HW_OP_UNSUPPORTED,
 	[PERF_COUNT_HW_BRANCH_INSTRUCTIONS]	= ARMV6_PERFCTR_BR_EXEC,
 	[PERF_COUNT_HW_BRANCH_MISSES]		= ARMV6_PERFCTR_BR_MISPREDICT,
-	[PERF_COUNT_HW_BUS_CYCLES]		= HW_OP_UNSUPPORTED,
 	[PERF_COUNT_HW_STALLED_CYCLES_FRONTEND]	= ARMV6_PERFCTR_IBUF_STALL,
 	[PERF_COUNT_HW_STALLED_CYCLES_BACKEND]	= ARMV6_PERFCTR_LSU_FULL_STALL,
 };
@@ -79,116 +77,31 @@ static const unsigned armv6_perf_map[PERF_COUNT_HW_MAX] = {
 static const unsigned armv6_perf_cache_map[PERF_COUNT_HW_CACHE_MAX]
 					  [PERF_COUNT_HW_CACHE_OP_MAX]
 					  [PERF_COUNT_HW_CACHE_RESULT_MAX] = {
-	[C(L1D)] = {
-		/*
-		 * The performance counters don't differentiate between read
-		 * and write accesses/misses so this isn't strictly correct,
-		 * but it's the best we can do. Writes and reads get
-		 * combined.
-		 */
-		[C(OP_READ)] = {
-			[C(RESULT_ACCESS)]	= ARMV6_PERFCTR_DCACHE_ACCESS,
-			[C(RESULT_MISS)]	= ARMV6_PERFCTR_DCACHE_MISS,
-		},
-		[C(OP_WRITE)] = {
-			[C(RESULT_ACCESS)]	= ARMV6_PERFCTR_DCACHE_ACCESS,
-			[C(RESULT_MISS)]	= ARMV6_PERFCTR_DCACHE_MISS,
-		},
-		[C(OP_PREFETCH)] = {
-			[C(RESULT_ACCESS)]	= CACHE_OP_UNSUPPORTED,
-			[C(RESULT_MISS)]	= CACHE_OP_UNSUPPORTED,
-		},
-	},
-	[C(L1I)] = {
-		[C(OP_READ)] = {
-			[C(RESULT_ACCESS)]	= CACHE_OP_UNSUPPORTED,
-			[C(RESULT_MISS)]	= ARMV6_PERFCTR_ICACHE_MISS,
-		},
-		[C(OP_WRITE)] = {
-			[C(RESULT_ACCESS)]	= CACHE_OP_UNSUPPORTED,
-			[C(RESULT_MISS)]	= CACHE_OP_UNSUPPORTED,
-		},
-		[C(OP_PREFETCH)] = {
-			[C(RESULT_ACCESS)]	= CACHE_OP_UNSUPPORTED,
-			[C(RESULT_MISS)]	= CACHE_OP_UNSUPPORTED,
-		},
-	},
-	[C(LL)] = {
-		[C(OP_READ)] = {
-			[C(RESULT_ACCESS)]	= CACHE_OP_UNSUPPORTED,
-			[C(RESULT_MISS)]	= CACHE_OP_UNSUPPORTED,
-		},
-		[C(OP_WRITE)] = {
-			[C(RESULT_ACCESS)]	= CACHE_OP_UNSUPPORTED,
-			[C(RESULT_MISS)]	= CACHE_OP_UNSUPPORTED,
-		},
-		[C(OP_PREFETCH)] = {
-			[C(RESULT_ACCESS)]	= CACHE_OP_UNSUPPORTED,
-			[C(RESULT_MISS)]	= CACHE_OP_UNSUPPORTED,
-		},
-	},
-	[C(DTLB)] = {
-		/*
-		 * The ARM performance counters can count micro DTLB misses,
-		 * micro ITLB misses and main TLB misses. There isn't an event
-		 * for TLB misses, so use the micro misses here and if users
-		 * want the main TLB misses they can use a raw counter.
-		 */
-		[C(OP_READ)] = {
-			[C(RESULT_ACCESS)]	= CACHE_OP_UNSUPPORTED,
-			[C(RESULT_MISS)]	= ARMV6_PERFCTR_DTLB_MISS,
-		},
-		[C(OP_WRITE)] = {
-			[C(RESULT_ACCESS)]	= CACHE_OP_UNSUPPORTED,
-			[C(RESULT_MISS)]	= ARMV6_PERFCTR_DTLB_MISS,
-		},
-		[C(OP_PREFETCH)] = {
-			[C(RESULT_ACCESS)]	= CACHE_OP_UNSUPPORTED,
-			[C(RESULT_MISS)]	= CACHE_OP_UNSUPPORTED,
-		},
-	},
-	[C(ITLB)] = {
-		[C(OP_READ)] = {
-			[C(RESULT_ACCESS)]	= CACHE_OP_UNSUPPORTED,
-			[C(RESULT_MISS)]	= ARMV6_PERFCTR_ITLB_MISS,
-		},
-		[C(OP_WRITE)] = {
-			[C(RESULT_ACCESS)]	= CACHE_OP_UNSUPPORTED,
-			[C(RESULT_MISS)]	= ARMV6_PERFCTR_ITLB_MISS,
-		},
-		[C(OP_PREFETCH)] = {
-			[C(RESULT_ACCESS)]	= CACHE_OP_UNSUPPORTED,
-			[C(RESULT_MISS)]	= CACHE_OP_UNSUPPORTED,
-		},
-	},
-	[C(BPU)] = {
-		[C(OP_READ)] = {
-			[C(RESULT_ACCESS)]	= CACHE_OP_UNSUPPORTED,
-			[C(RESULT_MISS)]	= CACHE_OP_UNSUPPORTED,
-		},
-		[C(OP_WRITE)] = {
-			[C(RESULT_ACCESS)]	= CACHE_OP_UNSUPPORTED,
-			[C(RESULT_MISS)]	= CACHE_OP_UNSUPPORTED,
-		},
-		[C(OP_PREFETCH)] = {
-			[C(RESULT_ACCESS)]	= CACHE_OP_UNSUPPORTED,
-			[C(RESULT_MISS)]	= CACHE_OP_UNSUPPORTED,
-		},
-	},
-	[C(NODE)] = {
-		[C(OP_READ)] = {
-			[C(RESULT_ACCESS)]	= CACHE_OP_UNSUPPORTED,
-			[C(RESULT_MISS)]	= CACHE_OP_UNSUPPORTED,
-		},
-		[C(OP_WRITE)] = {
-			[C(RESULT_ACCESS)]	= CACHE_OP_UNSUPPORTED,
-			[C(RESULT_MISS)]	= CACHE_OP_UNSUPPORTED,
-		},
-		[C(OP_PREFETCH)] = {
-			[C(RESULT_ACCESS)]	= CACHE_OP_UNSUPPORTED,
-			[C(RESULT_MISS)]	= CACHE_OP_UNSUPPORTED,
-		},
-	},
+	PERF_CACHE_MAP_ALL_UNSUPPORTED,
+
+	/*
+	 * The performance counters don't differentiate between read and write
+	 * accesses/misses so this isn't strictly correct, but it's the best we
+	 * can do. Writes and reads get combined.
+	 */
+	[C(L1D)][C(OP_READ)][C(RESULT_ACCESS)]	= ARMV6_PERFCTR_DCACHE_ACCESS,
+	[C(L1D)][C(OP_READ)][C(RESULT_MISS)]	= ARMV6_PERFCTR_DCACHE_MISS,
+	[C(L1D)][C(OP_WRITE)][C(RESULT_ACCESS)]	= ARMV6_PERFCTR_DCACHE_ACCESS,
+	[C(L1D)][C(OP_WRITE)][C(RESULT_MISS)]	= ARMV6_PERFCTR_DCACHE_MISS,
+
+	[C(L1I)][C(OP_READ)][C(RESULT_MISS)]	= ARMV6_PERFCTR_ICACHE_MISS,
+
+	/*
+	 * The ARM performance counters can count micro DTLB misses, micro ITLB
+	 * misses and main TLB misses. There isn't an event for TLB misses, so
+	 * use the micro misses here and if users want the main TLB misses they
+	 * can use a raw counter.
+	 */
+	[C(DTLB)][C(OP_READ)][C(RESULT_MISS)]	= ARMV6_PERFCTR_DTLB_MISS,
+	[C(DTLB)][C(OP_WRITE)][C(RESULT_MISS)]	= ARMV6_PERFCTR_DTLB_MISS,
+
+	[C(ITLB)][C(OP_READ)][C(RESULT_MISS)]	= ARMV6_PERFCTR_ITLB_MISS,
+	[C(ITLB)][C(OP_WRITE)][C(RESULT_MISS)]	= ARMV6_PERFCTR_ITLB_MISS,
 };

 enum armv6mpcore_perf_types {
@@ -220,13 +133,11 @@ enum armv6mpcore_perf_types {
  * accesses/misses in hardware.
  */
 static const unsigned armv6mpcore_perf_map[PERF_COUNT_HW_MAX] = {
+	PERF_MAP_ALL_UNSUPPORTED,
 	[PERF_COUNT_HW_CPU_CYCLES]		= ARMV6MPCORE_PERFCTR_CPU_CYCLES,
 	[PERF_COUNT_HW_INSTRUCTIONS]		= ARMV6MPCORE_PERFCTR_INSTR_EXEC,
-	[PERF_COUNT_HW_CACHE_REFERENCES]	= HW_OP_UNSUPPORTED,
-	[PERF_COUNT_HW_CACHE_MISSES]		= HW_OP_UNSUPPORTED,
 	[PERF_COUNT_HW_BRANCH_INSTRUCTIONS]	= ARMV6MPCORE_PERFCTR_BR_EXEC,
 	[PERF_COUNT_HW_BRANCH_MISSES]		= ARMV6MPCORE_PERFCTR_BR_MISPREDICT,
-	[PERF_COUNT_HW_BUS_CYCLES]		= HW_OP_UNSUPPORTED,
 	[PERF_COUNT_HW_STALLED_CYCLES_FRONTEND]	= ARMV6MPCORE_PERFCTR_IBUF_STALL,
 	[PERF_COUNT_HW_STALLED_CYCLES_BACKEND]	= ARMV6MPCORE_PERFCTR_LSU_FULL_STALL,
 };
@@ -234,114 +145,26 @@ static const unsigned armv6mpcore_perf_map[PERF_COUNT_HW_MAX] = {
 static const unsigned armv6mpcore_perf_cache_map[PERF_COUNT_HW_CACHE_MAX]
 					[PERF_COUNT_HW_CACHE_OP_MAX]
 					[PERF_COUNT_HW_CACHE_RESULT_MAX] = {
-	[C(L1D)] = {
-		[C(OP_READ)] = {
-			[C(RESULT_ACCESS)]  =
-				ARMV6MPCORE_PERFCTR_DCACHE_RDACCESS,
-			[C(RESULT_MISS)]    =
-				ARMV6MPCORE_PERFCTR_DCACHE_RDMISS,
-		},
-		[C(OP_WRITE)] = {
-			[C(RESULT_ACCESS)]  =
-				ARMV6MPCORE_PERFCTR_DCACHE_WRACCESS,
-			[C(RESULT_MISS)]    =
-				ARMV6MPCORE_PERFCTR_DCACHE_WRMISS,
-		},
-		[C(OP_PREFETCH)] = {
-			[C(RESULT_ACCESS)]  = CACHE_OP_UNSUPPORTED,
-			[C(RESULT_MISS)]    = CACHE_OP_UNSUPPORTED,
-		},
-	},
-	[C(L1I)] = {
-		[C(OP_READ)] = {
-			[C(RESULT_ACCESS)]  = CACHE_OP_UNSUPPORTED,
-			[C(RESULT_MISS)]    = ARMV6MPCORE_PERFCTR_ICACHE_MISS,
-		},
-		[C(OP_WRITE)] = {
-			[C(RESULT_ACCESS)]  = CACHE_OP_UNSUPPORTED,
-			[C(RESULT_MISS)]    = CACHE_OP_UNSUPPORTED,
-		},
-		[C(OP_PREFETCH)] = {
-			[C(RESULT_ACCESS)]  = CACHE_OP_UNSUPPORTED,
-			[C(RESULT_MISS)]    = CACHE_OP_UNSUPPORTED,
-		},
-	},
-	[C(LL)] = {
-		[C(OP_READ)] = {
-			[C(RESULT_ACCESS)]  = CACHE_OP_UNSUPPORTED,
-			[C(RESULT_MISS)]    = CACHE_OP_UNSUPPORTED,
-		},
-		[C(OP_WRITE)] = {
-			[C(RESULT_ACCESS)]  = CACHE_OP_UNSUPPORTED,
-			[C(RESULT_MISS)]    = CACHE_OP_UNSUPPORTED,
-		},
-		[C(OP_PREFETCH)] = {
-			[C(RESULT_ACCESS)]  = CACHE_OP_UNSUPPORTED,
-			[C(RESULT_MISS)]    = CACHE_OP_UNSUPPORTED,
-		},
-	},
-	[C(DTLB)] = {
-		/*
-		 * The ARM performance counters can count micro DTLB misses,
-		 * micro ITLB misses and main TLB misses. There isn't an event
-		 * for TLB misses, so use the micro misses here and if users
-		 * want the main TLB misses they can use a raw counter.
-		 */
-		[C(OP_READ)] = {
-			[C(RESULT_ACCESS)]  = CACHE_OP_UNSUPPORTED,
-			[C(RESULT_MISS)]    = ARMV6MPCORE_PERFCTR_DTLB_MISS,
-		},
-		[C(OP_WRITE)] = {
-			[C(RESULT_ACCESS)]  = CACHE_OP_UNSUPPORTED,
-			[C(RESULT_MISS)]    = ARMV6MPCORE_PERFCTR_DTLB_MISS,
-		},
-		[C(OP_PREFETCH)] = {
-			[C(RESULT_ACCESS)]  = CACHE_OP_UNSUPPORTED,
-			[C(RESULT_MISS)]    = CACHE_OP_UNSUPPORTED,
-		},
-	},
-	[C(ITLB)] = {
-		[C(OP_READ)] = {
-			[C(RESULT_ACCESS)]  = CACHE_OP_UNSUPPORTED,
-			[C(RESULT_MISS)]    = ARMV6MPCORE_PERFCTR_ITLB_MISS,
-		},
-		[C(OP_WRITE)] = {
-			[C(RESULT_ACCESS)]  = CACHE_OP_UNSUPPORTED,
-			[C(RESULT_MISS)]    = ARMV6MPCORE_PERFCTR_ITLB_MISS,
-		},
-		[C(OP_PREFETCH)] = {
-			[C(RESULT_ACCESS)]  = CACHE_OP_UNSUPPORTED,
-			[C(RESULT_MISS)]    = CACHE_OP_UNSUPPORTED,
-		},
-	},
-	[C(BPU)] = {
-		[C(OP_READ)] = {
-			[C(RESULT_ACCESS)]  = CACHE_OP_UNSUPPORTED,
-			[C(RESULT_MISS)]    = CACHE_OP_UNSUPPORTED,
-		},
-		[C(OP_WRITE)] = {
-			[C(RESULT_ACCESS)]  = CACHE_OP_UNSUPPORTED,
-			[C(RESULT_MISS)]    = CACHE_OP_UNSUPPORTED,
-		},
-		[C(OP_PREFETCH)] = {
-			[C(RESULT_ACCESS)]  = CACHE_OP_UNSUPPORTED,
-			[C(RESULT_MISS)]    = CACHE_OP_UNSUPPORTED,
-		},
-	},
-	[C(NODE)] = {
-		[C(OP_READ)] = {
-			[C(RESULT_ACCESS)]  = CACHE_OP_UNSUPPORTED,
-			[C(RESULT_MISS)]    = CACHE_OP_UNSUPPORTED,
-		},
-		[C(OP_WRITE)] = {
-			[C(RESULT_ACCESS)]  = CACHE_OP_UNSUPPORTED,
-			[C(RESULT_MISS)]    = CACHE_OP_UNSUPPORTED,
-		},
-		[C(OP_PREFETCH)] = {
-			[C(RESULT_ACCESS)]  = CACHE_OP_UNSUPPORTED,
-			[C(RESULT_MISS)]    = CACHE_OP_UNSUPPORTED,
-		},
-	},
+	PERF_CACHE_MAP_ALL_UNSUPPORTED,
+
+	[C(L1D)][C(OP_READ)][C(RESULT_ACCESS)]	= ARMV6MPCORE_PERFCTR_DCACHE_RDACCESS,
+	[C(L1D)][C(OP_READ)][C(RESULT_MISS)]	= ARMV6MPCORE_PERFCTR_DCACHE_RDMISS,
+	[C(L1D)][C(OP_WRITE)][C(RESULT_ACCESS)]	= ARMV6MPCORE_PERFCTR_DCACHE_WRACCESS,
+	[C(L1D)][C(OP_WRITE)][C(RESULT_MISS)]	= ARMV6MPCORE_PERFCTR_DCACHE_WRMISS,
+
+	[C(L1I)][C(OP_READ)][C(RESULT_MISS)]	= ARMV6MPCORE_PERFCTR_ICACHE_MISS,
+
+	/*
+	 * The ARM performance counters can count micro DTLB misses, micro ITLB
+	 * misses and main TLB misses. There isn't an event for TLB misses, so
+	 * use the micro misses here and if users want the main TLB misses they
+	 * can use a raw counter.
+	 */
+	[C(DTLB)][C(OP_READ)][C(RESULT_MISS)]	= ARMV6MPCORE_PERFCTR_DTLB_MISS,
+	[C(DTLB)][C(OP_WRITE)][C(RESULT_MISS)]	= ARMV6MPCORE_PERFCTR_DTLB_MISS,
+
+	[C(ITLB)][C(OP_READ)][C(RESULT_MISS)]	= ARMV6MPCORE_PERFCTR_ITLB_MISS,
+	[C(ITLB)][C(OP_WRITE)][C(RESULT_MISS)]	= ARMV6MPCORE_PERFCTR_ITLB_MISS,
 };

 static inline unsigned long
@@ -653,9 +476,8 @@ static int armv6_map_event(struct perf_event *event)
 				&armv6_perf_cache_map, 0xFF);
 }

-static int armv6pmu_init(struct arm_pmu *cpu_pmu)
+static void armv6pmu_init(struct arm_pmu *cpu_pmu)
 {
-	cpu_pmu->name		= "v6";
 	cpu_pmu->handle_irq	= armv6pmu_handle_irq;
 	cpu_pmu->enable		= armv6pmu_enable_event;
 	cpu_pmu->disable	= armv6pmu_disable_event;
@@ -667,7 +489,26 @@ static int armv6pmu_init(struct arm_pmu *cpu_pmu)
 	cpu_pmu->map_event	= armv6_map_event;
 	cpu_pmu->num_events	= 3;
 	cpu_pmu->max_period	= (1LLU << 32) - 1;
+}
+
+static int armv6_1136_pmu_init(struct arm_pmu *cpu_pmu)
+{
+	armv6pmu_init(cpu_pmu);
+	cpu_pmu->name		= "armv6_1136";
+	return 0;
+}

+static int armv6_1156_pmu_init(struct arm_pmu *cpu_pmu)
+{
+	armv6pmu_init(cpu_pmu);
+	cpu_pmu->name		= "armv6_1156";
+	return 0;
+}
+
+static int armv6_1176_pmu_init(struct arm_pmu *cpu_pmu)
+{
+	armv6pmu_init(cpu_pmu);
+	cpu_pmu->name		= "armv6_1176";
 	return 0;
 }

@@ -687,7 +528,7 @@ static int armv6mpcore_map_event(struct perf_event *event)

 static int armv6mpcore_pmu_init(struct arm_pmu *cpu_pmu)
 {
-	cpu_pmu->name		= "v6mpcore";
+	cpu_pmu->name		= "armv6_11mpcore";
 	cpu_pmu->handle_irq	= armv6pmu_handle_irq;
 	cpu_pmu->enable		= armv6pmu_enable_event;
 	cpu_pmu->disable	= armv6mpcore_pmu_disable_event;
@@ -703,7 +544,17 @@ static int armv6mpcore_pmu_init(struct arm_pmu *cpu_pmu)
 	return 0;
 }
 #else
-static int armv6pmu_init(struct arm_pmu *cpu_pmu)
+static int armv6_1136_pmu_init(struct arm_pmu *cpu_pmu)
+{
+	return -ENODEV;
+}
+
+static int armv6_1156_pmu_init(struct arm_pmu *cpu_pmu)
+{
+	return -ENODEV;
+}
+
+static int armv6_1176_pmu_init(struct arm_pmu *cpu_pmu)
 {
 	return -ENODEV;
 }

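The event-map rewrites above (and in perf_event_v7.c below) lean on two C initializer rules: a GNU range designator can pre-fill an entire array, and when designated initializers overlap, the later one wins. That lets PERF_MAP_ALL_UNSUPPORTED / PERF_CACHE_MAP_ALL_UNSUPPORTED mark every slot unsupported up front while the entries that follow override only the supported events. A reduced, self-contained sketch (all names and values here are illustrative, not the kernel's definitions):

	#define SKETCH_HW_MAX		8
	#define SKETCH_UNSUPPORTED	0xffff

	/* Assumed shape of such a macro: one range designator that marks
	 * every slot unsupported. */
	#define SKETCH_MAP_ALL_UNSUPPORTED \
		[0 ... SKETCH_HW_MAX - 1] = SKETCH_UNSUPPORTED

	static const unsigned sketch_map[SKETCH_HW_MAX] = {
		SKETCH_MAP_ALL_UNSUPPORTED,
		[0] = 0x11,	/* e.g. cycles: overrides the range entry */
		[1] = 0x08,	/* e.g. instructions */
	};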
+ 216 - 751
arch/arm/kernel/perf_event_v7.c

@@ -148,137 +148,62 @@ enum krait_perf_types {
  * accesses/misses in hardware.
  */
 static const unsigned armv7_a8_perf_map[PERF_COUNT_HW_MAX] = {
+	PERF_MAP_ALL_UNSUPPORTED,
 	[PERF_COUNT_HW_CPU_CYCLES]		= ARMV7_PERFCTR_CPU_CYCLES,
 	[PERF_COUNT_HW_INSTRUCTIONS]		= ARMV7_PERFCTR_INSTR_EXECUTED,
 	[PERF_COUNT_HW_CACHE_REFERENCES]	= ARMV7_PERFCTR_L1_DCACHE_ACCESS,
 	[PERF_COUNT_HW_CACHE_MISSES]		= ARMV7_PERFCTR_L1_DCACHE_REFILL,
 	[PERF_COUNT_HW_BRANCH_INSTRUCTIONS]	= ARMV7_PERFCTR_PC_WRITE,
 	[PERF_COUNT_HW_BRANCH_MISSES]		= ARMV7_PERFCTR_PC_BRANCH_MIS_PRED,
-	[PERF_COUNT_HW_BUS_CYCLES]		= HW_OP_UNSUPPORTED,
 	[PERF_COUNT_HW_STALLED_CYCLES_FRONTEND]	= ARMV7_A8_PERFCTR_STALL_ISIDE,
-	[PERF_COUNT_HW_STALLED_CYCLES_BACKEND]	= HW_OP_UNSUPPORTED,
 };

 static const unsigned armv7_a8_perf_cache_map[PERF_COUNT_HW_CACHE_MAX]
 					  [PERF_COUNT_HW_CACHE_OP_MAX]
 					  [PERF_COUNT_HW_CACHE_RESULT_MAX] = {
-	[C(L1D)] = {
-		/*
-		 * The performance counters don't differentiate between read
-		 * and write accesses/misses so this isn't strictly correct,
-		 * but it's the best we can do. Writes and reads get
-		 * combined.
-		 */
-		[C(OP_READ)] = {
-			[C(RESULT_ACCESS)]	= ARMV7_PERFCTR_L1_DCACHE_ACCESS,
-			[C(RESULT_MISS)]	= ARMV7_PERFCTR_L1_DCACHE_REFILL,
-		},
-		[C(OP_WRITE)] = {
-			[C(RESULT_ACCESS)]	= ARMV7_PERFCTR_L1_DCACHE_ACCESS,
-			[C(RESULT_MISS)]	= ARMV7_PERFCTR_L1_DCACHE_REFILL,
-		},
-		[C(OP_PREFETCH)] = {
-			[C(RESULT_ACCESS)]	= CACHE_OP_UNSUPPORTED,
-			[C(RESULT_MISS)]	= CACHE_OP_UNSUPPORTED,
-		},
-	},
-	[C(L1I)] = {
-		[C(OP_READ)] = {
-			[C(RESULT_ACCESS)]	= ARMV7_A8_PERFCTR_L1_ICACHE_ACCESS,
-			[C(RESULT_MISS)]	= ARMV7_PERFCTR_L1_ICACHE_REFILL,
-		},
-		[C(OP_WRITE)] = {
-			[C(RESULT_ACCESS)]	= CACHE_OP_UNSUPPORTED,
-			[C(RESULT_MISS)]	= CACHE_OP_UNSUPPORTED,
-		},
-		[C(OP_PREFETCH)] = {
-			[C(RESULT_ACCESS)]	= CACHE_OP_UNSUPPORTED,
-			[C(RESULT_MISS)]	= CACHE_OP_UNSUPPORTED,
-		},
-	},
-	[C(LL)] = {
-		[C(OP_READ)] = {
-			[C(RESULT_ACCESS)]	= ARMV7_A8_PERFCTR_L2_CACHE_ACCESS,
-			[C(RESULT_MISS)]	= ARMV7_A8_PERFCTR_L2_CACHE_REFILL,
-		},
-		[C(OP_WRITE)] = {
-			[C(RESULT_ACCESS)]	= ARMV7_A8_PERFCTR_L2_CACHE_ACCESS,
-			[C(RESULT_MISS)]	= ARMV7_A8_PERFCTR_L2_CACHE_REFILL,
-		},
-		[C(OP_PREFETCH)] = {
-			[C(RESULT_ACCESS)]	= CACHE_OP_UNSUPPORTED,
-			[C(RESULT_MISS)]	= CACHE_OP_UNSUPPORTED,
-		},
-	},
-	[C(DTLB)] = {
-		[C(OP_READ)] = {
-			[C(RESULT_ACCESS)]	= CACHE_OP_UNSUPPORTED,
-			[C(RESULT_MISS)]	= ARMV7_PERFCTR_DTLB_REFILL,
-		},
-		[C(OP_WRITE)] = {
-			[C(RESULT_ACCESS)]	= CACHE_OP_UNSUPPORTED,
-			[C(RESULT_MISS)]	= ARMV7_PERFCTR_DTLB_REFILL,
-		},
-		[C(OP_PREFETCH)] = {
-			[C(RESULT_ACCESS)]	= CACHE_OP_UNSUPPORTED,
-			[C(RESULT_MISS)]	= CACHE_OP_UNSUPPORTED,
-		},
-	},
-	[C(ITLB)] = {
-		[C(OP_READ)] = {
-			[C(RESULT_ACCESS)]	= CACHE_OP_UNSUPPORTED,
-			[C(RESULT_MISS)]	= ARMV7_PERFCTR_ITLB_REFILL,
-		},
-		[C(OP_WRITE)] = {
-			[C(RESULT_ACCESS)]	= CACHE_OP_UNSUPPORTED,
-			[C(RESULT_MISS)]	= ARMV7_PERFCTR_ITLB_REFILL,
-		},
-		[C(OP_PREFETCH)] = {
-			[C(RESULT_ACCESS)]	= CACHE_OP_UNSUPPORTED,
-			[C(RESULT_MISS)]	= CACHE_OP_UNSUPPORTED,
-		},
-	},
-	[C(BPU)] = {
-		[C(OP_READ)] = {
-			[C(RESULT_ACCESS)]	= ARMV7_PERFCTR_PC_BRANCH_PRED,
-			[C(RESULT_MISS)]	= ARMV7_PERFCTR_PC_BRANCH_MIS_PRED,
-		},
-		[C(OP_WRITE)] = {
-			[C(RESULT_ACCESS)]	= ARMV7_PERFCTR_PC_BRANCH_PRED,
-			[C(RESULT_MISS)]	= ARMV7_PERFCTR_PC_BRANCH_MIS_PRED,
-		},
-		[C(OP_PREFETCH)] = {
-			[C(RESULT_ACCESS)]	= CACHE_OP_UNSUPPORTED,
-			[C(RESULT_MISS)]	= CACHE_OP_UNSUPPORTED,
-		},
-	},
-	[C(NODE)] = {
-		[C(OP_READ)] = {
-			[C(RESULT_ACCESS)]	= CACHE_OP_UNSUPPORTED,
-			[C(RESULT_MISS)]	= CACHE_OP_UNSUPPORTED,
-		},
-		[C(OP_WRITE)] = {
-			[C(RESULT_ACCESS)]	= CACHE_OP_UNSUPPORTED,
-			[C(RESULT_MISS)]	= CACHE_OP_UNSUPPORTED,
-		},
-		[C(OP_PREFETCH)] = {
-			[C(RESULT_ACCESS)]	= CACHE_OP_UNSUPPORTED,
-			[C(RESULT_MISS)]	= CACHE_OP_UNSUPPORTED,
-		},
-	},
+	PERF_CACHE_MAP_ALL_UNSUPPORTED,
+
+	/*
+	 * The performance counters don't differentiate between read and write
+	 * accesses/misses so this isn't strictly correct, but it's the best we
+	 * can do. Writes and reads get combined.
+	 */
+	[C(L1D)][C(OP_READ)][C(RESULT_ACCESS)]	= ARMV7_PERFCTR_L1_DCACHE_ACCESS,
+	[C(L1D)][C(OP_READ)][C(RESULT_MISS)]	= ARMV7_PERFCTR_L1_DCACHE_REFILL,
+	[C(L1D)][C(OP_WRITE)][C(RESULT_ACCESS)]	= ARMV7_PERFCTR_L1_DCACHE_ACCESS,
+	[C(L1D)][C(OP_WRITE)][C(RESULT_MISS)]	= ARMV7_PERFCTR_L1_DCACHE_REFILL,
+
+	[C(L1I)][C(OP_READ)][C(RESULT_ACCESS)]	= ARMV7_A8_PERFCTR_L1_ICACHE_ACCESS,
+	[C(L1I)][C(OP_READ)][C(RESULT_MISS)]	= ARMV7_PERFCTR_L1_ICACHE_REFILL,
+
+	[C(LL)][C(OP_READ)][C(RESULT_ACCESS)]	= ARMV7_A8_PERFCTR_L2_CACHE_ACCESS,
+	[C(LL)][C(OP_READ)][C(RESULT_MISS)]	= ARMV7_A8_PERFCTR_L2_CACHE_REFILL,
+	[C(LL)][C(OP_WRITE)][C(RESULT_ACCESS)]	= ARMV7_A8_PERFCTR_L2_CACHE_ACCESS,
+	[C(LL)][C(OP_WRITE)][C(RESULT_MISS)]	= ARMV7_A8_PERFCTR_L2_CACHE_REFILL,
+
+	[C(DTLB)][C(OP_READ)][C(RESULT_MISS)]	= ARMV7_PERFCTR_DTLB_REFILL,
+	[C(DTLB)][C(OP_WRITE)][C(RESULT_MISS)]	= ARMV7_PERFCTR_DTLB_REFILL,
+
+	[C(ITLB)][C(OP_READ)][C(RESULT_MISS)]	= ARMV7_PERFCTR_ITLB_REFILL,
+	[C(ITLB)][C(OP_WRITE)][C(RESULT_MISS)]	= ARMV7_PERFCTR_ITLB_REFILL,
+
+	[C(BPU)][C(OP_READ)][C(RESULT_ACCESS)]	= ARMV7_PERFCTR_PC_BRANCH_PRED,
+	[C(BPU)][C(OP_READ)][C(RESULT_MISS)]	= ARMV7_PERFCTR_PC_BRANCH_MIS_PRED,
+	[C(BPU)][C(OP_WRITE)][C(RESULT_ACCESS)]	= ARMV7_PERFCTR_PC_BRANCH_PRED,
+	[C(BPU)][C(OP_WRITE)][C(RESULT_MISS)]	= ARMV7_PERFCTR_PC_BRANCH_MIS_PRED,
 };
 };

 /*
  * Cortex-A9 HW events mapping
  */
 static const unsigned armv7_a9_perf_map[PERF_COUNT_HW_MAX] = {
+	PERF_MAP_ALL_UNSUPPORTED,
 	[PERF_COUNT_HW_CPU_CYCLES]		= ARMV7_PERFCTR_CPU_CYCLES,
 	[PERF_COUNT_HW_INSTRUCTIONS]		= ARMV7_A9_PERFCTR_INSTR_CORE_RENAME,
 	[PERF_COUNT_HW_CACHE_REFERENCES]	= ARMV7_PERFCTR_L1_DCACHE_ACCESS,
 	[PERF_COUNT_HW_CACHE_MISSES]		= ARMV7_PERFCTR_L1_DCACHE_REFILL,
 	[PERF_COUNT_HW_BRANCH_INSTRUCTIONS]	= ARMV7_PERFCTR_PC_WRITE,
 	[PERF_COUNT_HW_BRANCH_MISSES]		= ARMV7_PERFCTR_PC_BRANCH_MIS_PRED,
-	[PERF_COUNT_HW_BUS_CYCLES]		= HW_OP_UNSUPPORTED,
 	[PERF_COUNT_HW_STALLED_CYCLES_FRONTEND]	= ARMV7_A9_PERFCTR_STALL_ICACHE,
 	[PERF_COUNT_HW_STALLED_CYCLES_BACKEND]	= ARMV7_A9_PERFCTR_STALL_DISPATCH,
 };
@@ -286,238 +211,83 @@ static const unsigned armv7_a9_perf_map[PERF_COUNT_HW_MAX] = {
 static const unsigned armv7_a9_perf_cache_map[PERF_COUNT_HW_CACHE_MAX]
 					  [PERF_COUNT_HW_CACHE_OP_MAX]
 					  [PERF_COUNT_HW_CACHE_RESULT_MAX] = {
-		/*
-		 * The performance counters don't differentiate between read
-		 * and write accesses/misses so this isn't strictly correct,
-		 * but it's the best we can do. Writes and reads get
-		 * combined.
-		 */
-		[C(OP_READ)] = {
-			[C(RESULT_ACCESS)]	= ARMV7_PERFCTR_L1_DCACHE_ACCESS,
-			[C(RESULT_MISS)]	= ARMV7_PERFCTR_L1_DCACHE_REFILL,
-		},
-		[C(OP_WRITE)] = {
-			[C(RESULT_ACCESS)]	= ARMV7_PERFCTR_L1_DCACHE_ACCESS,
-			[C(RESULT_MISS)]	= ARMV7_PERFCTR_L1_DCACHE_REFILL,
-		},
-		[C(OP_PREFETCH)] = {
-			[C(RESULT_ACCESS)]	= CACHE_OP_UNSUPPORTED,
-			[C(RESULT_MISS)]	= CACHE_OP_UNSUPPORTED,
-		},
-	},
-	[C(L1I)] = {
-		[C(OP_READ)] = {
-			[C(RESULT_ACCESS)]	= CACHE_OP_UNSUPPORTED,
-			[C(RESULT_MISS)]	= ARMV7_PERFCTR_L1_ICACHE_REFILL,
-		},
-		[C(OP_WRITE)] = {
-			[C(RESULT_ACCESS)]	= CACHE_OP_UNSUPPORTED,
-			[C(RESULT_MISS)]	= CACHE_OP_UNSUPPORTED,
-		},
-		[C(OP_PREFETCH)] = {
-			[C(RESULT_ACCESS)]	= CACHE_OP_UNSUPPORTED,
-			[C(RESULT_MISS)]	= CACHE_OP_UNSUPPORTED,
-		},
-	},
-	[C(LL)] = {
-		[C(OP_READ)] = {
-			[C(RESULT_ACCESS)]	= CACHE_OP_UNSUPPORTED,
-			[C(RESULT_MISS)]	= CACHE_OP_UNSUPPORTED,
-		},
-		[C(OP_WRITE)] = {
-			[C(RESULT_ACCESS)]	= CACHE_OP_UNSUPPORTED,
-			[C(RESULT_MISS)]	= CACHE_OP_UNSUPPORTED,
-		},
-		[C(OP_PREFETCH)] = {
-			[C(RESULT_ACCESS)]	= CACHE_OP_UNSUPPORTED,
-			[C(RESULT_MISS)]	= CACHE_OP_UNSUPPORTED,
-		},
-	},
-	[C(DTLB)] = {
-		[C(OP_READ)] = {
-			[C(RESULT_ACCESS)]	= CACHE_OP_UNSUPPORTED,
-			[C(RESULT_MISS)]	= ARMV7_PERFCTR_DTLB_REFILL,
-		},
-		[C(OP_WRITE)] = {
-			[C(RESULT_ACCESS)]	= CACHE_OP_UNSUPPORTED,
-			[C(RESULT_MISS)]	= ARMV7_PERFCTR_DTLB_REFILL,
-		},
-		[C(OP_PREFETCH)] = {
-			[C(RESULT_ACCESS)]	= CACHE_OP_UNSUPPORTED,
-			[C(RESULT_MISS)]	= CACHE_OP_UNSUPPORTED,
-		},
-	},
-	[C(ITLB)] = {
-		[C(OP_READ)] = {
-			[C(RESULT_ACCESS)]	= CACHE_OP_UNSUPPORTED,
-			[C(RESULT_MISS)]	= ARMV7_PERFCTR_ITLB_REFILL,
-		},
-		[C(OP_WRITE)] = {
-			[C(RESULT_ACCESS)]	= CACHE_OP_UNSUPPORTED,
-			[C(RESULT_MISS)]	= ARMV7_PERFCTR_ITLB_REFILL,
-		},
-		[C(OP_PREFETCH)] = {
-			[C(RESULT_ACCESS)]	= CACHE_OP_UNSUPPORTED,
-			[C(RESULT_MISS)]	= CACHE_OP_UNSUPPORTED,
-		},
-	},
-	[C(BPU)] = {
-		[C(OP_READ)] = {
-			[C(RESULT_ACCESS)]	= ARMV7_PERFCTR_PC_BRANCH_PRED,
-			[C(RESULT_MISS)]	= ARMV7_PERFCTR_PC_BRANCH_MIS_PRED,
-		},
-		[C(OP_WRITE)] = {
-			[C(RESULT_ACCESS)]	= ARMV7_PERFCTR_PC_BRANCH_PRED,
-			[C(RESULT_MISS)]	= ARMV7_PERFCTR_PC_BRANCH_MIS_PRED,
-		},
-		[C(OP_PREFETCH)] = {
-			[C(RESULT_ACCESS)]	= CACHE_OP_UNSUPPORTED,
-			[C(RESULT_MISS)]	= CACHE_OP_UNSUPPORTED,
-		},
-	},
-	[C(NODE)] = {
-		[C(OP_READ)] = {
-			[C(RESULT_ACCESS)]	= CACHE_OP_UNSUPPORTED,
-			[C(RESULT_MISS)]	= CACHE_OP_UNSUPPORTED,
-		},
-		[C(OP_WRITE)] = {
-			[C(RESULT_ACCESS)]	= CACHE_OP_UNSUPPORTED,
-			[C(RESULT_MISS)]	= CACHE_OP_UNSUPPORTED,
-		},
-		[C(OP_PREFETCH)] = {
-			[C(RESULT_ACCESS)]	= CACHE_OP_UNSUPPORTED,
-			[C(RESULT_MISS)]	= CACHE_OP_UNSUPPORTED,
-		},
-	},
+	PERF_CACHE_MAP_ALL_UNSUPPORTED,
+
+	/*
+	 * The performance counters don't differentiate between read and write
+	 * accesses/misses so this isn't strictly correct, but it's the best we
+	 * can do. Writes and reads get combined.
+	 */
+	[C(L1D)][C(OP_READ)][C(RESULT_ACCESS)]	= ARMV7_PERFCTR_L1_DCACHE_ACCESS,
+	[C(L1D)][C(OP_READ)][C(RESULT_MISS)]	= ARMV7_PERFCTR_L1_DCACHE_REFILL,
+	[C(L1D)][C(OP_WRITE)][C(RESULT_ACCESS)]	= ARMV7_PERFCTR_L1_DCACHE_ACCESS,
+	[C(L1D)][C(OP_WRITE)][C(RESULT_MISS)]	= ARMV7_PERFCTR_L1_DCACHE_REFILL,
+
+	[C(L1I)][C(OP_READ)][C(RESULT_MISS)]	= ARMV7_PERFCTR_L1_ICACHE_REFILL,
+
+	[C(DTLB)][C(OP_READ)][C(RESULT_MISS)]	= ARMV7_PERFCTR_DTLB_REFILL,
+	[C(DTLB)][C(OP_WRITE)][C(RESULT_MISS)]	= ARMV7_PERFCTR_DTLB_REFILL,
+
+	[C(ITLB)][C(OP_READ)][C(RESULT_MISS)]	= ARMV7_PERFCTR_ITLB_REFILL,
+	[C(ITLB)][C(OP_WRITE)][C(RESULT_MISS)]	= ARMV7_PERFCTR_ITLB_REFILL,
+
+	[C(BPU)][C(OP_READ)][C(RESULT_ACCESS)]	= ARMV7_PERFCTR_PC_BRANCH_PRED,
+	[C(BPU)][C(OP_READ)][C(RESULT_MISS)]	= ARMV7_PERFCTR_PC_BRANCH_MIS_PRED,
+	[C(BPU)][C(OP_WRITE)][C(RESULT_ACCESS)]	= ARMV7_PERFCTR_PC_BRANCH_PRED,
+	[C(BPU)][C(OP_WRITE)][C(RESULT_MISS)]	= ARMV7_PERFCTR_PC_BRANCH_MIS_PRED,
 };

 /*
  * Cortex-A5 HW events mapping
  */
 static const unsigned armv7_a5_perf_map[PERF_COUNT_HW_MAX] = {
+	PERF_MAP_ALL_UNSUPPORTED,
 	[PERF_COUNT_HW_CPU_CYCLES]		= ARMV7_PERFCTR_CPU_CYCLES,
 	[PERF_COUNT_HW_INSTRUCTIONS]		= ARMV7_PERFCTR_INSTR_EXECUTED,
 	[PERF_COUNT_HW_CACHE_REFERENCES]	= ARMV7_PERFCTR_L1_DCACHE_ACCESS,
 	[PERF_COUNT_HW_CACHE_MISSES]		= ARMV7_PERFCTR_L1_DCACHE_REFILL,
 	[PERF_COUNT_HW_BRANCH_INSTRUCTIONS]	= ARMV7_PERFCTR_PC_WRITE,
 	[PERF_COUNT_HW_BRANCH_MISSES]		= ARMV7_PERFCTR_PC_BRANCH_MIS_PRED,
-	[PERF_COUNT_HW_BUS_CYCLES]		= HW_OP_UNSUPPORTED,
-	[PERF_COUNT_HW_STALLED_CYCLES_FRONTEND]	= HW_OP_UNSUPPORTED,
-	[PERF_COUNT_HW_STALLED_CYCLES_BACKEND]	= HW_OP_UNSUPPORTED,
 };

 static const unsigned armv7_a5_perf_cache_map[PERF_COUNT_HW_CACHE_MAX]
 					[PERF_COUNT_HW_CACHE_OP_MAX]
 					[PERF_COUNT_HW_CACHE_RESULT_MAX] = {
-	[C(L1D)] = {
-		[C(OP_READ)] = {
-			[C(RESULT_ACCESS)]	= ARMV7_PERFCTR_L1_DCACHE_ACCESS,
-			[C(RESULT_MISS)]	= ARMV7_PERFCTR_L1_DCACHE_REFILL,
-		},
-		[C(OP_WRITE)] = {
-			[C(RESULT_ACCESS)]	= ARMV7_PERFCTR_L1_DCACHE_ACCESS,
-			[C(RESULT_MISS)]	= ARMV7_PERFCTR_L1_DCACHE_REFILL,
-		},
-		[C(OP_PREFETCH)] = {
-			[C(RESULT_ACCESS)]	= ARMV7_A5_PERFCTR_PREFETCH_LINEFILL,
-			[C(RESULT_MISS)]	= ARMV7_A5_PERFCTR_PREFETCH_LINEFILL_DROP,
-		},
-	},
-	[C(L1I)] = {
-		[C(OP_READ)] = {
-			[C(RESULT_ACCESS)]	= ARMV7_PERFCTR_L1_ICACHE_ACCESS,
-			[C(RESULT_MISS)]	= ARMV7_PERFCTR_L1_ICACHE_REFILL,
-		},
-		[C(OP_WRITE)] = {
-			[C(RESULT_ACCESS)]	= CACHE_OP_UNSUPPORTED,
-			[C(RESULT_MISS)]	= CACHE_OP_UNSUPPORTED,
-		},
-		/*
-		 * The prefetch counters don't differentiate between the I
-		 * side and the D side.
-		 */
-		[C(OP_PREFETCH)] = {
-			[C(RESULT_ACCESS)]	= ARMV7_A5_PERFCTR_PREFETCH_LINEFILL,
-			[C(RESULT_MISS)]	= ARMV7_A5_PERFCTR_PREFETCH_LINEFILL_DROP,
-		},
-	},
-	[C(LL)] = {
-		[C(OP_READ)] = {
-			[C(RESULT_ACCESS)]	= CACHE_OP_UNSUPPORTED,
-			[C(RESULT_MISS)]	= CACHE_OP_UNSUPPORTED,
-		},
-		[C(OP_WRITE)] = {
-			[C(RESULT_ACCESS)]	= CACHE_OP_UNSUPPORTED,
-			[C(RESULT_MISS)]	= CACHE_OP_UNSUPPORTED,
-		},
-		[C(OP_PREFETCH)] = {
-			[C(RESULT_ACCESS)]	= CACHE_OP_UNSUPPORTED,
-			[C(RESULT_MISS)]	= CACHE_OP_UNSUPPORTED,
-		},
-	},
-	[C(DTLB)] = {
-		[C(OP_READ)] = {
-			[C(RESULT_ACCESS)]	= CACHE_OP_UNSUPPORTED,
-			[C(RESULT_MISS)]	= ARMV7_PERFCTR_DTLB_REFILL,
-		},
-		[C(OP_WRITE)] = {
-			[C(RESULT_ACCESS)]	= CACHE_OP_UNSUPPORTED,
-			[C(RESULT_MISS)]	= ARMV7_PERFCTR_DTLB_REFILL,
-		},
-		[C(OP_PREFETCH)] = {
-			[C(RESULT_ACCESS)]	= CACHE_OP_UNSUPPORTED,
-			[C(RESULT_MISS)]	= CACHE_OP_UNSUPPORTED,
-		},
-	},
-	[C(ITLB)] = {
-		[C(OP_READ)] = {
-			[C(RESULT_ACCESS)]	= CACHE_OP_UNSUPPORTED,
-			[C(RESULT_MISS)]	= ARMV7_PERFCTR_ITLB_REFILL,
-		},
-		[C(OP_WRITE)] = {
-			[C(RESULT_ACCESS)]	= CACHE_OP_UNSUPPORTED,
-			[C(RESULT_MISS)]	= ARMV7_PERFCTR_ITLB_REFILL,
-		},
-		[C(OP_PREFETCH)] = {
-			[C(RESULT_ACCESS)]	= CACHE_OP_UNSUPPORTED,
-			[C(RESULT_MISS)]	= CACHE_OP_UNSUPPORTED,
-		},
-	},
-	[C(BPU)] = {
-		[C(OP_READ)] = {
-			[C(RESULT_ACCESS)]	= ARMV7_PERFCTR_PC_BRANCH_PRED,
-			[C(RESULT_MISS)]	= ARMV7_PERFCTR_PC_BRANCH_MIS_PRED,
-		},
-		[C(OP_WRITE)] = {
-			[C(RESULT_ACCESS)]	= ARMV7_PERFCTR_PC_BRANCH_PRED,
-			[C(RESULT_MISS)]	= ARMV7_PERFCTR_PC_BRANCH_MIS_PRED,
-		},
-		[C(OP_PREFETCH)] = {
-			[C(RESULT_ACCESS)]	= CACHE_OP_UNSUPPORTED,
-			[C(RESULT_MISS)]	= CACHE_OP_UNSUPPORTED,
-		},
-	},
-	[C(NODE)] = {
-		[C(OP_READ)] = {
-			[C(RESULT_ACCESS)]	= CACHE_OP_UNSUPPORTED,
-			[C(RESULT_MISS)]	= CACHE_OP_UNSUPPORTED,
-		},
-		[C(OP_WRITE)] = {
-			[C(RESULT_ACCESS)]	= CACHE_OP_UNSUPPORTED,
-			[C(RESULT_MISS)]	= CACHE_OP_UNSUPPORTED,
-		},
-		[C(OP_PREFETCH)] = {
-			[C(RESULT_ACCESS)]	= CACHE_OP_UNSUPPORTED,
-			[C(RESULT_MISS)]	= CACHE_OP_UNSUPPORTED,
-		},
-	},
+	PERF_CACHE_MAP_ALL_UNSUPPORTED,
+
+	[C(L1D)][C(OP_READ)][C(RESULT_ACCESS)]	= ARMV7_PERFCTR_L1_DCACHE_ACCESS,
+	[C(L1D)][C(OP_READ)][C(RESULT_MISS)]	= ARMV7_PERFCTR_L1_DCACHE_REFILL,
+	[C(L1D)][C(OP_WRITE)][C(RESULT_ACCESS)]	= ARMV7_PERFCTR_L1_DCACHE_ACCESS,
+	[C(L1D)][C(OP_WRITE)][C(RESULT_MISS)]	= ARMV7_PERFCTR_L1_DCACHE_REFILL,
+	[C(L1D)][C(OP_PREFETCH)][C(RESULT_ACCESS)]	= ARMV7_A5_PERFCTR_PREFETCH_LINEFILL,
+	[C(L1D)][C(OP_PREFETCH)][C(RESULT_MISS)]	= ARMV7_A5_PERFCTR_PREFETCH_LINEFILL_DROP,
+
+	[C(L1I)][C(OP_READ)][C(RESULT_ACCESS)]	= ARMV7_PERFCTR_L1_ICACHE_ACCESS,
+	[C(L1I)][C(OP_READ)][C(RESULT_MISS)]	= ARMV7_PERFCTR_L1_ICACHE_REFILL,
+	/*
+	 * The prefetch counters don't differentiate between the I side and the
+	 * D side.
+	 */
+	[C(L1I)][C(OP_PREFETCH)][C(RESULT_ACCESS)]	= ARMV7_A5_PERFCTR_PREFETCH_LINEFILL,
+	[C(L1I)][C(OP_PREFETCH)][C(RESULT_MISS)]	= ARMV7_A5_PERFCTR_PREFETCH_LINEFILL_DROP,
+
+	[C(DTLB)][C(OP_READ)][C(RESULT_MISS)]	= ARMV7_PERFCTR_DTLB_REFILL,
+	[C(DTLB)][C(OP_WRITE)][C(RESULT_MISS)]	= ARMV7_PERFCTR_DTLB_REFILL,
+
+	[C(ITLB)][C(OP_READ)][C(RESULT_MISS)]	= ARMV7_PERFCTR_ITLB_REFILL,
+	[C(ITLB)][C(OP_WRITE)][C(RESULT_MISS)]	= ARMV7_PERFCTR_ITLB_REFILL,
+
+	[C(BPU)][C(OP_READ)][C(RESULT_ACCESS)]	= ARMV7_PERFCTR_PC_BRANCH_PRED,
+	[C(BPU)][C(OP_READ)][C(RESULT_MISS)]	= ARMV7_PERFCTR_PC_BRANCH_MIS_PRED,
+	[C(BPU)][C(OP_WRITE)][C(RESULT_ACCESS)]	= ARMV7_PERFCTR_PC_BRANCH_PRED,
+	[C(BPU)][C(OP_WRITE)][C(RESULT_MISS)]	= ARMV7_PERFCTR_PC_BRANCH_MIS_PRED,
 };

 /*
  * Cortex-A15 HW events mapping
  */
 static const unsigned armv7_a15_perf_map[PERF_COUNT_HW_MAX] = {
+	PERF_MAP_ALL_UNSUPPORTED,
 	[PERF_COUNT_HW_CPU_CYCLES]		= ARMV7_PERFCTR_CPU_CYCLES,
 	[PERF_COUNT_HW_INSTRUCTIONS]		= ARMV7_PERFCTR_INSTR_EXECUTED,
 	[PERF_COUNT_HW_CACHE_REFERENCES]	= ARMV7_PERFCTR_L1_DCACHE_ACCESS,
@@ -525,123 +295,48 @@ static const unsigned armv7_a15_perf_map[PERF_COUNT_HW_MAX] = {
 	[PERF_COUNT_HW_BRANCH_INSTRUCTIONS]	= ARMV7_A15_PERFCTR_PC_WRITE_SPEC,
 	[PERF_COUNT_HW_BRANCH_MISSES]		= ARMV7_PERFCTR_PC_BRANCH_MIS_PRED,
 	[PERF_COUNT_HW_BUS_CYCLES]		= ARMV7_PERFCTR_BUS_CYCLES,
-	[PERF_COUNT_HW_STALLED_CYCLES_FRONTEND]	= HW_OP_UNSUPPORTED,
-	[PERF_COUNT_HW_STALLED_CYCLES_BACKEND]	= HW_OP_UNSUPPORTED,
 };

 static const unsigned armv7_a15_perf_cache_map[PERF_COUNT_HW_CACHE_MAX]
 					[PERF_COUNT_HW_CACHE_OP_MAX]
 					[PERF_COUNT_HW_CACHE_RESULT_MAX] = {
-	[C(L1D)] = {
-		[C(OP_READ)] = {
-			[C(RESULT_ACCESS)]	= ARMV7_A15_PERFCTR_L1_DCACHE_ACCESS_READ,
-			[C(RESULT_MISS)]	= ARMV7_A15_PERFCTR_L1_DCACHE_REFILL_READ,
-		},
-		[C(OP_WRITE)] = {
-			[C(RESULT_ACCESS)]	= ARMV7_A15_PERFCTR_L1_DCACHE_ACCESS_WRITE,
-			[C(RESULT_MISS)]	= ARMV7_A15_PERFCTR_L1_DCACHE_REFILL_WRITE,
-		},
-		[C(OP_PREFETCH)] = {
-			[C(RESULT_ACCESS)]	= CACHE_OP_UNSUPPORTED,
-			[C(RESULT_MISS)]	= CACHE_OP_UNSUPPORTED,
-		},
-	},
-	[C(L1I)] = {
-		/*
-		 * Not all performance counters differentiate between read
-		 * and write accesses/misses so we're not always strictly
-		 * correct, but it's the best we can do. Writes and reads get
-		 * combined in these cases.
-		 */
-		[C(OP_READ)] = {
-			[C(RESULT_ACCESS)]	= ARMV7_PERFCTR_L1_ICACHE_ACCESS,
-			[C(RESULT_MISS)]	= ARMV7_PERFCTR_L1_ICACHE_REFILL,
-		},
-		[C(OP_WRITE)] = {
-			[C(RESULT_ACCESS)]	= CACHE_OP_UNSUPPORTED,
-			[C(RESULT_MISS)]	= CACHE_OP_UNSUPPORTED,
-		},
-		[C(OP_PREFETCH)] = {
-			[C(RESULT_ACCESS)]	= CACHE_OP_UNSUPPORTED,
-			[C(RESULT_MISS)]	= CACHE_OP_UNSUPPORTED,
-		},
-	},
-	[C(LL)] = {
-		[C(OP_READ)] = {
-			[C(RESULT_ACCESS)]	= ARMV7_A15_PERFCTR_L2_CACHE_ACCESS_READ,
-			[C(RESULT_MISS)]	= ARMV7_A15_PERFCTR_L2_CACHE_REFILL_READ,
-		},
-		[C(OP_WRITE)] = {
-			[C(RESULT_ACCESS)]	= ARMV7_A15_PERFCTR_L2_CACHE_ACCESS_WRITE,
-			[C(RESULT_MISS)]	= ARMV7_A15_PERFCTR_L2_CACHE_REFILL_WRITE,
-		},
-		[C(OP_PREFETCH)] = {
-			[C(RESULT_ACCESS)]	= CACHE_OP_UNSUPPORTED,
-			[C(RESULT_MISS)]	= CACHE_OP_UNSUPPORTED,
-		},
-	},
-	[C(DTLB)] = {
-		[C(OP_READ)] = {
-			[C(RESULT_ACCESS)]	= CACHE_OP_UNSUPPORTED,
-			[C(RESULT_MISS)]	= ARMV7_A15_PERFCTR_DTLB_REFILL_L1_READ,
-		},
-		[C(OP_WRITE)] = {
-			[C(RESULT_ACCESS)]	= CACHE_OP_UNSUPPORTED,
-			[C(RESULT_MISS)]	= ARMV7_A15_PERFCTR_DTLB_REFILL_L1_WRITE,
-		},
-		[C(OP_PREFETCH)] = {
-			[C(RESULT_ACCESS)]	= CACHE_OP_UNSUPPORTED,
-			[C(RESULT_MISS)]	= CACHE_OP_UNSUPPORTED,
-		},
-	},
-	[C(ITLB)] = {
-		[C(OP_READ)] = {
-			[C(RESULT_ACCESS)]	= CACHE_OP_UNSUPPORTED,
-			[C(RESULT_MISS)]	= ARMV7_PERFCTR_ITLB_REFILL,
-		},
-		[C(OP_WRITE)] = {
-			[C(RESULT_ACCESS)]	= CACHE_OP_UNSUPPORTED,
-			[C(RESULT_MISS)]	= ARMV7_PERFCTR_ITLB_REFILL,
-		},
-		[C(OP_PREFETCH)] = {
-			[C(RESULT_ACCESS)]	= CACHE_OP_UNSUPPORTED,
-			[C(RESULT_MISS)]	= CACHE_OP_UNSUPPORTED,
-		},
-	},
-	[C(BPU)] = {
-		[C(OP_READ)] = {
-			[C(RESULT_ACCESS)]	= ARMV7_PERFCTR_PC_BRANCH_PRED,
-			[C(RESULT_MISS)]	= ARMV7_PERFCTR_PC_BRANCH_MIS_PRED,
-		},
-		[C(OP_WRITE)] = {
-			[C(RESULT_ACCESS)]	= ARMV7_PERFCTR_PC_BRANCH_PRED,
-			[C(RESULT_MISS)]	= ARMV7_PERFCTR_PC_BRANCH_MIS_PRED,
-		},
-		[C(OP_PREFETCH)] = {
-			[C(RESULT_ACCESS)]	= CACHE_OP_UNSUPPORTED,
-			[C(RESULT_MISS)]	= CACHE_OP_UNSUPPORTED,
-		},
-	},
-	[C(NODE)] = {
-		[C(OP_READ)] = {
-			[C(RESULT_ACCESS)]	= CACHE_OP_UNSUPPORTED,
-			[C(RESULT_MISS)]	= CACHE_OP_UNSUPPORTED,
-		},
-		[C(OP_WRITE)] = {
-			[C(RESULT_ACCESS)]	= CACHE_OP_UNSUPPORTED,
-			[C(RESULT_MISS)]	= CACHE_OP_UNSUPPORTED,
-		},
-		[C(OP_PREFETCH)] = {
-			[C(RESULT_ACCESS)]	= CACHE_OP_UNSUPPORTED,
-			[C(RESULT_MISS)]	= CACHE_OP_UNSUPPORTED,
-		},
-	},
+	PERF_CACHE_MAP_ALL_UNSUPPORTED,
+
+	[C(L1D)][C(OP_READ)][C(RESULT_ACCESS)]	= ARMV7_A15_PERFCTR_L1_DCACHE_ACCESS_READ,
+	[C(L1D)][C(OP_READ)][C(RESULT_MISS)]	= ARMV7_A15_PERFCTR_L1_DCACHE_REFILL_READ,
+	[C(L1D)][C(OP_WRITE)][C(RESULT_ACCESS)]	= ARMV7_A15_PERFCTR_L1_DCACHE_ACCESS_WRITE,
+	[C(L1D)][C(OP_WRITE)][C(RESULT_MISS)]	= ARMV7_A15_PERFCTR_L1_DCACHE_REFILL_WRITE,
+
+	/*
+	 * Not all performance counters differentiate between read and write
+	 * accesses/misses so we're not always strictly correct, but it's the
+	 * best we can do. Writes and reads get combined in these cases.
+	 */
+	[C(L1I)][C(OP_READ)][C(RESULT_ACCESS)]	= ARMV7_PERFCTR_L1_ICACHE_ACCESS,
+	[C(L1I)][C(OP_READ)][C(RESULT_MISS)]	= ARMV7_PERFCTR_L1_ICACHE_REFILL,
+
+	[C(LL)][C(OP_READ)][C(RESULT_ACCESS)]	= ARMV7_A15_PERFCTR_L2_CACHE_ACCESS_READ,
+	[C(LL)][C(OP_READ)][C(RESULT_MISS)]	= ARMV7_A15_PERFCTR_L2_CACHE_REFILL_READ,
+	[C(LL)][C(OP_WRITE)][C(RESULT_ACCESS)]	= ARMV7_A15_PERFCTR_L2_CACHE_ACCESS_WRITE,
+	[C(LL)][C(OP_WRITE)][C(RESULT_MISS)]	= ARMV7_A15_PERFCTR_L2_CACHE_REFILL_WRITE,
+
+	[C(DTLB)][C(OP_READ)][C(RESULT_MISS)]	= ARMV7_A15_PERFCTR_DTLB_REFILL_L1_READ,
+	[C(DTLB)][C(OP_WRITE)][C(RESULT_MISS)]	= ARMV7_A15_PERFCTR_DTLB_REFILL_L1_WRITE,
+
+	[C(ITLB)][C(OP_READ)][C(RESULT_MISS)]	= ARMV7_PERFCTR_ITLB_REFILL,
+	[C(ITLB)][C(OP_WRITE)][C(RESULT_MISS)]	= ARMV7_PERFCTR_ITLB_REFILL,
+
+	[C(BPU)][C(OP_READ)][C(RESULT_ACCESS)]	= ARMV7_PERFCTR_PC_BRANCH_PRED,
+	[C(BPU)][C(OP_READ)][C(RESULT_MISS)]	= ARMV7_PERFCTR_PC_BRANCH_MIS_PRED,
+	[C(BPU)][C(OP_WRITE)][C(RESULT_ACCESS)]	= ARMV7_PERFCTR_PC_BRANCH_PRED,
+	[C(BPU)][C(OP_WRITE)][C(RESULT_MISS)]	= ARMV7_PERFCTR_PC_BRANCH_MIS_PRED,
 };
 };

 /*
  * Cortex-A7 HW events mapping
  */
 static const unsigned armv7_a7_perf_map[PERF_COUNT_HW_MAX] = {
+	PERF_MAP_ALL_UNSUPPORTED,
 	[PERF_COUNT_HW_CPU_CYCLES]		= ARMV7_PERFCTR_CPU_CYCLES,
 	[PERF_COUNT_HW_INSTRUCTIONS]		= ARMV7_PERFCTR_INSTR_EXECUTED,
 	[PERF_COUNT_HW_CACHE_REFERENCES]	= ARMV7_PERFCTR_L1_DCACHE_ACCESS,
@@ -649,123 +344,48 @@ static const unsigned armv7_a7_perf_map[PERF_COUNT_HW_MAX] = {
 	[PERF_COUNT_HW_BRANCH_INSTRUCTIONS]	= ARMV7_PERFCTR_PC_WRITE,
 	[PERF_COUNT_HW_BRANCH_MISSES]		= ARMV7_PERFCTR_PC_BRANCH_MIS_PRED,
 	[PERF_COUNT_HW_BUS_CYCLES]		= ARMV7_PERFCTR_BUS_CYCLES,
-	[PERF_COUNT_HW_STALLED_CYCLES_FRONTEND]	= HW_OP_UNSUPPORTED,
-	[PERF_COUNT_HW_STALLED_CYCLES_BACKEND]	= HW_OP_UNSUPPORTED,
 };

 static const unsigned armv7_a7_perf_cache_map[PERF_COUNT_HW_CACHE_MAX]
 					[PERF_COUNT_HW_CACHE_OP_MAX]
 					[PERF_COUNT_HW_CACHE_RESULT_MAX] = {
-		/*
-		 * The performance counters don't differentiate between read
-		 * and write accesses/misses so this isn't strictly correct,
-		 * but it's the best we can do. Writes and reads get
-		 * combined.
-		 */
-		[C(OP_READ)] = {
-			[C(RESULT_ACCESS)]	= ARMV7_PERFCTR_L1_DCACHE_ACCESS,
-			[C(RESULT_MISS)]	= ARMV7_PERFCTR_L1_DCACHE_REFILL,
-		},
-		[C(OP_WRITE)] = {
-			[C(RESULT_ACCESS)]	= ARMV7_PERFCTR_L1_DCACHE_ACCESS,
-			[C(RESULT_MISS)]	= ARMV7_PERFCTR_L1_DCACHE_REFILL,
-		},
-		[C(OP_PREFETCH)] = {
-			[C(RESULT_ACCESS)]	= CACHE_OP_UNSUPPORTED,
-			[C(RESULT_MISS)]	= CACHE_OP_UNSUPPORTED,
-		},
-	},
-	[C(L1I)] = {
-		[C(OP_READ)] = {
-			[C(RESULT_ACCESS)]	= ARMV7_PERFCTR_L1_ICACHE_ACCESS,
-			[C(RESULT_MISS)]	= ARMV7_PERFCTR_L1_ICACHE_REFILL,
-		},
-		[C(OP_WRITE)] = {
-			[C(RESULT_ACCESS)]	= CACHE_OP_UNSUPPORTED,
-			[C(RESULT_MISS)]	= CACHE_OP_UNSUPPORTED,
-		},
-		[C(OP_PREFETCH)] = {
-			[C(RESULT_ACCESS)]	= CACHE_OP_UNSUPPORTED,
-			[C(RESULT_MISS)]	= CACHE_OP_UNSUPPORTED,
-		},
-	},
-	[C(LL)] = {
-		[C(OP_READ)] = {
-			[C(RESULT_ACCESS)]	= ARMV7_PERFCTR_L2_CACHE_ACCESS,
-			[C(RESULT_MISS)]	= ARMV7_PERFCTR_L2_CACHE_REFILL,
-		},
-		[C(OP_WRITE)] = {
-			[C(RESULT_ACCESS)]	= ARMV7_PERFCTR_L2_CACHE_ACCESS,
-			[C(RESULT_MISS)]	= ARMV7_PERFCTR_L2_CACHE_REFILL,
-		},
-		[C(OP_PREFETCH)] = {
-			[C(RESULT_ACCESS)]	= CACHE_OP_UNSUPPORTED,
-			[C(RESULT_MISS)]	= CACHE_OP_UNSUPPORTED,
-		},
-	},
-	[C(DTLB)] = {
-		[C(OP_READ)] = {
-			[C(RESULT_ACCESS)]	= CACHE_OP_UNSUPPORTED,
-			[C(RESULT_MISS)]	= ARMV7_PERFCTR_DTLB_REFILL,
-		},
-		[C(OP_WRITE)] = {
-			[C(RESULT_ACCESS)]	= CACHE_OP_UNSUPPORTED,
-			[C(RESULT_MISS)]	= ARMV7_PERFCTR_DTLB_REFILL,
-		},
-		[C(OP_PREFETCH)] = {
-			[C(RESULT_ACCESS)]	= CACHE_OP_UNSUPPORTED,
-			[C(RESULT_MISS)]	= CACHE_OP_UNSUPPORTED,
-		},
-	},
-	[C(ITLB)] = {
-		[C(OP_READ)] = {
-			[C(RESULT_ACCESS)]	= CACHE_OP_UNSUPPORTED,
-			[C(RESULT_MISS)]	= ARMV7_PERFCTR_ITLB_REFILL,
-		},
-		[C(OP_WRITE)] = {
-			[C(RESULT_ACCESS)]	= CACHE_OP_UNSUPPORTED,
-			[C(RESULT_MISS)]	= ARMV7_PERFCTR_ITLB_REFILL,
-		},
-		[C(OP_PREFETCH)] = {
-			[C(RESULT_ACCESS)]	= CACHE_OP_UNSUPPORTED,
-			[C(RESULT_MISS)]	= CACHE_OP_UNSUPPORTED,
-		},
-	},
-	[C(BPU)] = {
-		[C(OP_READ)] = {
-			[C(RESULT_ACCESS)]	= ARMV7_PERFCTR_PC_BRANCH_PRED,
-			[C(RESULT_MISS)]	= ARMV7_PERFCTR_PC_BRANCH_MIS_PRED,
-		},
-		[C(OP_WRITE)] = {
-			[C(RESULT_ACCESS)]	= ARMV7_PERFCTR_PC_BRANCH_PRED,
-			[C(RESULT_MISS)]	= ARMV7_PERFCTR_PC_BRANCH_MIS_PRED,
-		},
-		[C(OP_PREFETCH)] = {
-			[C(RESULT_ACCESS)]	= CACHE_OP_UNSUPPORTED,
-			[C(RESULT_MISS)]	= CACHE_OP_UNSUPPORTED,
-		},
-	},
-	[C(NODE)] = {
-		[C(OP_READ)] = {
-			[C(RESULT_ACCESS)]	= CACHE_OP_UNSUPPORTED,
-			[C(RESULT_MISS)]	= CACHE_OP_UNSUPPORTED,
-		},
-		[C(OP_WRITE)] = {
-			[C(RESULT_ACCESS)]	= CACHE_OP_UNSUPPORTED,
-			[C(RESULT_MISS)]	= CACHE_OP_UNSUPPORTED,
-		},
-		[C(OP_PREFETCH)] = {
-			[C(RESULT_ACCESS)]	= CACHE_OP_UNSUPPORTED,
-			[C(RESULT_MISS)]	= CACHE_OP_UNSUPPORTED,
-		},
-	},
+	PERF_CACHE_MAP_ALL_UNSUPPORTED,
+
+	/*
+	 * The performance counters don't differentiate between read and write
+	 * accesses/misses so this isn't strictly correct, but it's the best we
+	 * can do. Writes and reads get combined.
+	 */
+	[C(L1D)][C(OP_READ)][C(RESULT_ACCESS)]	= ARMV7_PERFCTR_L1_DCACHE_ACCESS,
+	[C(L1D)][C(OP_READ)][C(RESULT_MISS)]	= ARMV7_PERFCTR_L1_DCACHE_REFILL,
+	[C(L1D)][C(OP_WRITE)][C(RESULT_ACCESS)]	= ARMV7_PERFCTR_L1_DCACHE_ACCESS,
+	[C(L1D)][C(OP_WRITE)][C(RESULT_MISS)]	= ARMV7_PERFCTR_L1_DCACHE_REFILL,
+
+	[C(L1I)][C(OP_READ)][C(RESULT_ACCESS)]	= ARMV7_PERFCTR_L1_ICACHE_ACCESS,
+	[C(L1I)][C(OP_READ)][C(RESULT_MISS)]	= ARMV7_PERFCTR_L1_ICACHE_REFILL,
+
+	[C(LL)][C(OP_READ)][C(RESULT_ACCESS)]	= ARMV7_PERFCTR_L2_CACHE_ACCESS,
+	[C(LL)][C(OP_READ)][C(RESULT_MISS)]	= ARMV7_PERFCTR_L2_CACHE_REFILL,
+	[C(LL)][C(OP_WRITE)][C(RESULT_ACCESS)]	= ARMV7_PERFCTR_L2_CACHE_ACCESS,
+	[C(LL)][C(OP_WRITE)][C(RESULT_MISS)]	= ARMV7_PERFCTR_L2_CACHE_REFILL,
+
+	[C(DTLB)][C(OP_READ)][C(RESULT_MISS)]	= ARMV7_PERFCTR_DTLB_REFILL,
+	[C(DTLB)][C(OP_WRITE)][C(RESULT_MISS)]	= ARMV7_PERFCTR_DTLB_REFILL,
+
+	[C(ITLB)][C(OP_READ)][C(RESULT_MISS)]	= ARMV7_PERFCTR_ITLB_REFILL,
+	[C(ITLB)][C(OP_WRITE)][C(RESULT_MISS)]	= ARMV7_PERFCTR_ITLB_REFILL,
+
+	[C(BPU)][C(OP_READ)][C(RESULT_ACCESS)]	= ARMV7_PERFCTR_PC_BRANCH_PRED,
+	[C(BPU)][C(OP_READ)][C(RESULT_MISS)]	= ARMV7_PERFCTR_PC_BRANCH_MIS_PRED,
+	[C(BPU)][C(OP_WRITE)][C(RESULT_ACCESS)]	= ARMV7_PERFCTR_PC_BRANCH_PRED,
+	[C(BPU)][C(OP_WRITE)][C(RESULT_MISS)]	= ARMV7_PERFCTR_PC_BRANCH_MIS_PRED,
 };
 };

 /*
  * Cortex-A12 HW events mapping
  */
 static const unsigned armv7_a12_perf_map[PERF_COUNT_HW_MAX] = {
+	PERF_MAP_ALL_UNSUPPORTED,
 	[PERF_COUNT_HW_CPU_CYCLES]		= ARMV7_PERFCTR_CPU_CYCLES,
 	[PERF_COUNT_HW_INSTRUCTIONS]		= ARMV7_PERFCTR_INSTR_EXECUTED,
 	[PERF_COUNT_HW_CACHE_REFERENCES]	= ARMV7_PERFCTR_L1_DCACHE_ACCESS,
@@ -773,138 +393,60 @@ static const unsigned armv7_a12_perf_map[PERF_COUNT_HW_MAX] = {
 	[PERF_COUNT_HW_BRANCH_INSTRUCTIONS]	= ARMV7_A12_PERFCTR_PC_WRITE_SPEC,
 	[PERF_COUNT_HW_BRANCH_MISSES]		= ARMV7_PERFCTR_PC_BRANCH_MIS_PRED,
 	[PERF_COUNT_HW_BUS_CYCLES]		= ARMV7_PERFCTR_BUS_CYCLES,
-	[PERF_COUNT_HW_STALLED_CYCLES_FRONTEND]	= HW_OP_UNSUPPORTED,
-	[PERF_COUNT_HW_STALLED_CYCLES_BACKEND]	= HW_OP_UNSUPPORTED,
 };

 static const unsigned armv7_a12_perf_cache_map[PERF_COUNT_HW_CACHE_MAX]
 					[PERF_COUNT_HW_CACHE_OP_MAX]
 					[PERF_COUNT_HW_CACHE_RESULT_MAX] = {
-	[C(L1D)] = {
-		[C(OP_READ)] = {
-			[C(RESULT_ACCESS)]	= ARMV7_A12_PERFCTR_L1_DCACHE_ACCESS_READ,
-			[C(RESULT_MISS)]	= ARMV7_PERFCTR_L1_DCACHE_REFILL,
-		},
-		[C(OP_WRITE)] = {
-			[C(RESULT_ACCESS)]	= ARMV7_A12_PERFCTR_L1_DCACHE_ACCESS_WRITE,
-			[C(RESULT_MISS)]	= ARMV7_PERFCTR_L1_DCACHE_REFILL,
-		},
-		[C(OP_PREFETCH)] = {
-			[C(RESULT_ACCESS)]	= CACHE_OP_UNSUPPORTED,
-			[C(RESULT_MISS)]	= CACHE_OP_UNSUPPORTED,
-		},
-	},
-	[C(L1I)] = {
-		/*
-		 * Not all performance counters differentiate between read
-		 * and write accesses/misses so we're not always strictly
-		 * correct, but it's the best we can do. Writes and reads get
-		 * combined in these cases.
-		 */
-		[C(OP_READ)] = {
-			[C(RESULT_ACCESS)]	= ARMV7_PERFCTR_L1_ICACHE_ACCESS,
-			[C(RESULT_MISS)]	= ARMV7_PERFCTR_L1_ICACHE_REFILL,
-		},
-		[C(OP_WRITE)] = {
-			[C(RESULT_ACCESS)]	= CACHE_OP_UNSUPPORTED,
-			[C(RESULT_MISS)]	= CACHE_OP_UNSUPPORTED,
-		},
-		[C(OP_PREFETCH)] = {
-			[C(RESULT_ACCESS)]	= CACHE_OP_UNSUPPORTED,
-			[C(RESULT_MISS)]	= CACHE_OP_UNSUPPORTED,
-		},
-	},
-	[C(LL)] = {
-		[C(OP_READ)] = {
-			[C(RESULT_ACCESS)]	= ARMV7_A12_PERFCTR_L2_CACHE_ACCESS_READ,
-			[C(RESULT_MISS)]	= ARMV7_PERFCTR_L2_CACHE_REFILL,
-		},
-		[C(OP_WRITE)] = {
-			[C(RESULT_ACCESS)]	= ARMV7_A12_PERFCTR_L2_CACHE_ACCESS_WRITE,
-			[C(RESULT_MISS)]	= ARMV7_PERFCTR_L2_CACHE_REFILL,
-		},
-		[C(OP_PREFETCH)] = {
-			[C(RESULT_ACCESS)]	= CACHE_OP_UNSUPPORTED,
-			[C(RESULT_MISS)]	= CACHE_OP_UNSUPPORTED,
-		},
-	},
-	[C(DTLB)] = {
-		[C(OP_READ)] = {
-			[C(RESULT_ACCESS)]	= CACHE_OP_UNSUPPORTED,
-			[C(RESULT_MISS)]	= ARMV7_PERFCTR_DTLB_REFILL,
-		},
-		[C(OP_WRITE)] = {
-			[C(RESULT_ACCESS)]	= CACHE_OP_UNSUPPORTED,
-			[C(RESULT_MISS)]	= ARMV7_PERFCTR_DTLB_REFILL,
-		},
-		[C(OP_PREFETCH)] = {
-			[C(RESULT_ACCESS)]	= CACHE_OP_UNSUPPORTED,
-			[C(RESULT_MISS)]	= ARMV7_A12_PERFCTR_PF_TLB_REFILL,
-		},
-	},
-	[C(ITLB)] = {
-		[C(OP_READ)] = {
-			[C(RESULT_ACCESS)]	= CACHE_OP_UNSUPPORTED,
-			[C(RESULT_MISS)]	= ARMV7_PERFCTR_ITLB_REFILL,
-		},
-		[C(OP_WRITE)] = {
-			[C(RESULT_ACCESS)]	= CACHE_OP_UNSUPPORTED,
-			[C(RESULT_MISS)]	= ARMV7_PERFCTR_ITLB_REFILL,
-		},
-		[C(OP_PREFETCH)] = {
-			[C(RESULT_ACCESS)]	= CACHE_OP_UNSUPPORTED,
-			[C(RESULT_MISS)]	= CACHE_OP_UNSUPPORTED,
-		},
-	},
-	[C(BPU)] = {
-		[C(OP_READ)] = {
-			[C(RESULT_ACCESS)]	= ARMV7_PERFCTR_PC_BRANCH_PRED,
-			[C(RESULT_MISS)]	= ARMV7_PERFCTR_PC_BRANCH_MIS_PRED,
-		},
-		[C(OP_WRITE)] = {
-			[C(RESULT_ACCESS)]	= ARMV7_PERFCTR_PC_BRANCH_PRED,
-			[C(RESULT_MISS)]	= ARMV7_PERFCTR_PC_BRANCH_MIS_PRED,
-		},
-		[C(OP_PREFETCH)] = {
-			[C(RESULT_ACCESS)]	= CACHE_OP_UNSUPPORTED,
-			[C(RESULT_MISS)]	= CACHE_OP_UNSUPPORTED,
-		},
-	},
-	[C(NODE)] = {
-		[C(OP_READ)] = {
-			[C(RESULT_ACCESS)]	= CACHE_OP_UNSUPPORTED,
-			[C(RESULT_MISS)]	= CACHE_OP_UNSUPPORTED,
-		},
-		[C(OP_WRITE)] = {
-			[C(RESULT_ACCESS)]	= CACHE_OP_UNSUPPORTED,
-			[C(RESULT_MISS)]	= CACHE_OP_UNSUPPORTED,
-		},
-		[C(OP_PREFETCH)] = {
-			[C(RESULT_ACCESS)]	= CACHE_OP_UNSUPPORTED,
-			[C(RESULT_MISS)]	= CACHE_OP_UNSUPPORTED,
-		},
-	},
+	PERF_CACHE_MAP_ALL_UNSUPPORTED,
+
+	[C(L1D)][C(OP_READ)][C(RESULT_ACCESS)]	= ARMV7_A12_PERFCTR_L1_DCACHE_ACCESS_READ,
+	[C(L1D)][C(OP_READ)][C(RESULT_MISS)]	= ARMV7_PERFCTR_L1_DCACHE_REFILL,
+	[C(L1D)][C(OP_WRITE)][C(RESULT_ACCESS)]	= ARMV7_A12_PERFCTR_L1_DCACHE_ACCESS_WRITE,
+	[C(L1D)][C(OP_WRITE)][C(RESULT_MISS)]	= ARMV7_PERFCTR_L1_DCACHE_REFILL,
+
+	/*
+	 * Not all performance counters differentiate between read and write
+	 * accesses/misses so we're not always strictly correct, but it's the
+	 * best we can do. Writes and reads get combined in these cases.
+	 */
+	[C(L1I)][C(OP_READ)][C(RESULT_ACCESS)]	= ARMV7_PERFCTR_L1_ICACHE_ACCESS,
+	[C(L1I)][C(OP_READ)][C(RESULT_MISS)]	= ARMV7_PERFCTR_L1_ICACHE_REFILL,
+
+	[C(LL)][C(OP_READ)][C(RESULT_ACCESS)]	= ARMV7_A12_PERFCTR_L2_CACHE_ACCESS_READ,
+	[C(LL)][C(OP_READ)][C(RESULT_MISS)]	= ARMV7_PERFCTR_L2_CACHE_REFILL,
+	[C(LL)][C(OP_WRITE)][C(RESULT_ACCESS)]	= ARMV7_A12_PERFCTR_L2_CACHE_ACCESS_WRITE,
+	[C(LL)][C(OP_WRITE)][C(RESULT_MISS)]	= ARMV7_PERFCTR_L2_CACHE_REFILL,
+
+	[C(DTLB)][C(OP_READ)][C(RESULT_MISS)]	= ARMV7_PERFCTR_DTLB_REFILL,
+	[C(DTLB)][C(OP_WRITE)][C(RESULT_MISS)]	= ARMV7_PERFCTR_DTLB_REFILL,
+	[C(DTLB)][C(OP_PREFETCH)][C(RESULT_MISS)]	= ARMV7_A12_PERFCTR_PF_TLB_REFILL,
+
+	[C(ITLB)][C(OP_READ)][C(RESULT_MISS)]	= ARMV7_PERFCTR_ITLB_REFILL,
+	[C(ITLB)][C(OP_WRITE)][C(RESULT_MISS)]	= ARMV7_PERFCTR_ITLB_REFILL,
+
+	[C(BPU)][C(OP_READ)][C(RESULT_ACCESS)]	= ARMV7_PERFCTR_PC_BRANCH_PRED,
+	[C(BPU)][C(OP_READ)][C(RESULT_MISS)]	= ARMV7_PERFCTR_PC_BRANCH_MIS_PRED,
+	[C(BPU)][C(OP_WRITE)][C(RESULT_ACCESS)]	= ARMV7_PERFCTR_PC_BRANCH_PRED,
+	[C(BPU)][C(OP_WRITE)][C(RESULT_MISS)]	= ARMV7_PERFCTR_PC_BRANCH_MIS_PRED,
 };
 
 /*
  * Krait HW events mapping
  */
 static const unsigned krait_perf_map[PERF_COUNT_HW_MAX] = {
+	PERF_MAP_ALL_UNSUPPORTED,
 	[PERF_COUNT_HW_CPU_CYCLES]	    = ARMV7_PERFCTR_CPU_CYCLES,
 	[PERF_COUNT_HW_INSTRUCTIONS]	    = ARMV7_PERFCTR_INSTR_EXECUTED,
-	[PERF_COUNT_HW_CACHE_REFERENCES]    = HW_OP_UNSUPPORTED,
-	[PERF_COUNT_HW_CACHE_MISSES]	    = HW_OP_UNSUPPORTED,
 	[PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = ARMV7_PERFCTR_PC_WRITE,
 	[PERF_COUNT_HW_BRANCH_MISSES]	    = ARMV7_PERFCTR_PC_BRANCH_MIS_PRED,
 	[PERF_COUNT_HW_BUS_CYCLES]	    = ARMV7_PERFCTR_CLOCK_CYCLES,
 };
 
 static const unsigned krait_perf_map_no_branch[PERF_COUNT_HW_MAX] = {
+	PERF_MAP_ALL_UNSUPPORTED,
 	[PERF_COUNT_HW_CPU_CYCLES]	    = ARMV7_PERFCTR_CPU_CYCLES,
 	[PERF_COUNT_HW_INSTRUCTIONS]	    = ARMV7_PERFCTR_INSTR_EXECUTED,
-	[PERF_COUNT_HW_CACHE_REFERENCES]    = HW_OP_UNSUPPORTED,
-	[PERF_COUNT_HW_CACHE_MISSES]	    = HW_OP_UNSUPPORTED,
-	[PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = HW_OP_UNSUPPORTED,
 	[PERF_COUNT_HW_BRANCH_MISSES]	    = ARMV7_PERFCTR_PC_BRANCH_MIS_PRED,
 	[PERF_COUNT_HW_BUS_CYCLES]	    = ARMV7_PERFCTR_CLOCK_CYCLES,
 };
@@ -912,110 +454,31 @@ static const unsigned krait_perf_map_no_branch[PERF_COUNT_HW_MAX] = {
 static const unsigned krait_perf_cache_map[PERF_COUNT_HW_CACHE_MAX]
 					  [PERF_COUNT_HW_CACHE_OP_MAX]
 					  [PERF_COUNT_HW_CACHE_RESULT_MAX] = {
-	[C(L1D)] = {
-		/*
-		 * The performance counters don't differentiate between read
-		 * and write accesses/misses so this isn't strictly correct,
-		 * but it's the best we can do. Writes and reads get
-		 * combined.
-		 */
-		[C(OP_READ)] = {
-			[C(RESULT_ACCESS)]	= ARMV7_PERFCTR_L1_DCACHE_ACCESS,
-			[C(RESULT_MISS)]	= ARMV7_PERFCTR_L1_DCACHE_REFILL,
-		},
-		[C(OP_WRITE)] = {
-			[C(RESULT_ACCESS)]	= ARMV7_PERFCTR_L1_DCACHE_ACCESS,
-			[C(RESULT_MISS)]	= ARMV7_PERFCTR_L1_DCACHE_REFILL,
-		},
-		[C(OP_PREFETCH)] = {
-			[C(RESULT_ACCESS)]	= CACHE_OP_UNSUPPORTED,
-			[C(RESULT_MISS)]	= CACHE_OP_UNSUPPORTED,
-		},
-	},
-	[C(L1I)] = {
-		[C(OP_READ)] = {
-			[C(RESULT_ACCESS)]	= KRAIT_PERFCTR_L1_ICACHE_ACCESS,
-			[C(RESULT_MISS)]	= KRAIT_PERFCTR_L1_ICACHE_MISS,
-		},
-		[C(OP_WRITE)] = {
-			[C(RESULT_ACCESS)]	= CACHE_OP_UNSUPPORTED,
-			[C(RESULT_MISS)]	= CACHE_OP_UNSUPPORTED,
-		},
-		[C(OP_PREFETCH)] = {
-			[C(RESULT_ACCESS)]	= CACHE_OP_UNSUPPORTED,
-			[C(RESULT_MISS)]	= CACHE_OP_UNSUPPORTED,
-		},
-	},
-	[C(LL)] = {
-		[C(OP_READ)] = {
-			[C(RESULT_ACCESS)]	= CACHE_OP_UNSUPPORTED,
-			[C(RESULT_MISS)]	= CACHE_OP_UNSUPPORTED,
-		},
-		[C(OP_WRITE)] = {
-			[C(RESULT_ACCESS)]	= CACHE_OP_UNSUPPORTED,
-			[C(RESULT_MISS)]	= CACHE_OP_UNSUPPORTED,
-		},
-		[C(OP_PREFETCH)] = {
-			[C(RESULT_ACCESS)]	= CACHE_OP_UNSUPPORTED,
-			[C(RESULT_MISS)]	= CACHE_OP_UNSUPPORTED,
-		},
-	},
-	[C(DTLB)] = {
-		[C(OP_READ)] = {
-			[C(RESULT_ACCESS)]	= KRAIT_PERFCTR_L1_DTLB_ACCESS,
-			[C(RESULT_MISS)]	= CACHE_OP_UNSUPPORTED,
-		},
-		[C(OP_WRITE)] = {
-			[C(RESULT_ACCESS)]	= KRAIT_PERFCTR_L1_DTLB_ACCESS,
-			[C(RESULT_MISS)]	= CACHE_OP_UNSUPPORTED,
-		},
-		[C(OP_PREFETCH)] = {
-			[C(RESULT_ACCESS)]	= CACHE_OP_UNSUPPORTED,
-			[C(RESULT_MISS)]	= CACHE_OP_UNSUPPORTED,
-		},
-	},
-	[C(ITLB)] = {
-		[C(OP_READ)] = {
-			[C(RESULT_ACCESS)]	= KRAIT_PERFCTR_L1_ITLB_ACCESS,
-			[C(RESULT_MISS)]	= CACHE_OP_UNSUPPORTED,
-		},
-		[C(OP_WRITE)] = {
-			[C(RESULT_ACCESS)]	= KRAIT_PERFCTR_L1_ITLB_ACCESS,
-			[C(RESULT_MISS)]	= CACHE_OP_UNSUPPORTED,
-		},
-		[C(OP_PREFETCH)] = {
-			[C(RESULT_ACCESS)]	= CACHE_OP_UNSUPPORTED,
-			[C(RESULT_MISS)]	= CACHE_OP_UNSUPPORTED,
-		},
-	},
-	[C(BPU)] = {
-		[C(OP_READ)] = {
-			[C(RESULT_ACCESS)]	= ARMV7_PERFCTR_PC_BRANCH_PRED,
-			[C(RESULT_MISS)]	= ARMV7_PERFCTR_PC_BRANCH_MIS_PRED,
-		},
-		[C(OP_WRITE)] = {
-			[C(RESULT_ACCESS)]	= ARMV7_PERFCTR_PC_BRANCH_PRED,
-			[C(RESULT_MISS)]	= ARMV7_PERFCTR_PC_BRANCH_MIS_PRED,
-		},
-		[C(OP_PREFETCH)] = {
-			[C(RESULT_ACCESS)]	= CACHE_OP_UNSUPPORTED,
-			[C(RESULT_MISS)]	= CACHE_OP_UNSUPPORTED,
-		},
-	},
-	[C(NODE)] = {
-		[C(OP_READ)] = {
-			[C(RESULT_ACCESS)]	= CACHE_OP_UNSUPPORTED,
-			[C(RESULT_MISS)]	= CACHE_OP_UNSUPPORTED,
-		},
-		[C(OP_WRITE)] = {
-			[C(RESULT_ACCESS)]	= CACHE_OP_UNSUPPORTED,
-			[C(RESULT_MISS)]	= CACHE_OP_UNSUPPORTED,
-		},
-		[C(OP_PREFETCH)] = {
-			[C(RESULT_ACCESS)]	= CACHE_OP_UNSUPPORTED,
-			[C(RESULT_MISS)]	= CACHE_OP_UNSUPPORTED,
-		},
-	},
+	PERF_CACHE_MAP_ALL_UNSUPPORTED,
+
+	/*
+	 * The performance counters don't differentiate between read and write
+	 * accesses/misses so this isn't strictly correct, but it's the best we
+	 * can do. Writes and reads get combined.
+	 */
+	[C(L1D)][C(OP_READ)][C(RESULT_ACCESS)]	= ARMV7_PERFCTR_L1_DCACHE_ACCESS,
+	[C(L1D)][C(OP_READ)][C(RESULT_MISS)]	= ARMV7_PERFCTR_L1_DCACHE_REFILL,
+	[C(L1D)][C(OP_WRITE)][C(RESULT_ACCESS)]	= ARMV7_PERFCTR_L1_DCACHE_ACCESS,
+	[C(L1D)][C(OP_WRITE)][C(RESULT_MISS)]	= ARMV7_PERFCTR_L1_DCACHE_REFILL,
+
+	[C(L1I)][C(OP_READ)][C(RESULT_ACCESS)]	= KRAIT_PERFCTR_L1_ICACHE_ACCESS,
+	[C(L1I)][C(OP_READ)][C(RESULT_MISS)]	= KRAIT_PERFCTR_L1_ICACHE_MISS,
+
+	[C(DTLB)][C(OP_READ)][C(RESULT_ACCESS)]	= KRAIT_PERFCTR_L1_DTLB_ACCESS,
+	[C(DTLB)][C(OP_WRITE)][C(RESULT_ACCESS)]	= KRAIT_PERFCTR_L1_DTLB_ACCESS,
+
+	[C(ITLB)][C(OP_READ)][C(RESULT_ACCESS)]	= KRAIT_PERFCTR_L1_ITLB_ACCESS,
+	[C(ITLB)][C(OP_WRITE)][C(RESULT_ACCESS)]	= KRAIT_PERFCTR_L1_ITLB_ACCESS,
+
+	[C(BPU)][C(OP_READ)][C(RESULT_ACCESS)]	= ARMV7_PERFCTR_PC_BRANCH_PRED,
+	[C(BPU)][C(OP_READ)][C(RESULT_MISS)]	= ARMV7_PERFCTR_PC_BRANCH_MIS_PRED,
+	[C(BPU)][C(OP_WRITE)][C(RESULT_ACCESS)]	= ARMV7_PERFCTR_PC_BRANCH_PRED,
+	[C(BPU)][C(OP_WRITE)][C(RESULT_MISS)]	= ARMV7_PERFCTR_PC_BRANCH_MIS_PRED,
 };
 
 /*
@@ -1545,7 +1008,7 @@ static u32 armv7_read_num_pmnc_events(void)
 static int armv7_a8_pmu_init(struct arm_pmu *cpu_pmu)
 {
 	armv7pmu_init(cpu_pmu);
-	cpu_pmu->name		= "ARMv7 Cortex-A8";
+	cpu_pmu->name		= "armv7_cortex_a8";
 	cpu_pmu->map_event	= armv7_a8_map_event;
 	cpu_pmu->num_events	= armv7_read_num_pmnc_events();
 	return 0;
@@ -1554,7 +1017,7 @@ static int armv7_a8_pmu_init(struct arm_pmu *cpu_pmu)
 static int armv7_a9_pmu_init(struct arm_pmu *cpu_pmu)
 {
 	armv7pmu_init(cpu_pmu);
-	cpu_pmu->name		= "ARMv7 Cortex-A9";
+	cpu_pmu->name		= "armv7_cortex_a9";
 	cpu_pmu->map_event	= armv7_a9_map_event;
 	cpu_pmu->num_events	= armv7_read_num_pmnc_events();
 	return 0;
@@ -1563,7 +1026,7 @@ static int armv7_a9_pmu_init(struct arm_pmu *cpu_pmu)
 static int armv7_a5_pmu_init(struct arm_pmu *cpu_pmu)
 {
 	armv7pmu_init(cpu_pmu);
-	cpu_pmu->name		= "ARMv7 Cortex-A5";
+	cpu_pmu->name		= "armv7_cortex_a5";
 	cpu_pmu->map_event	= armv7_a5_map_event;
 	cpu_pmu->num_events	= armv7_read_num_pmnc_events();
 	return 0;
@@ -1572,7 +1035,7 @@ static int armv7_a5_pmu_init(struct arm_pmu *cpu_pmu)
 static int armv7_a15_pmu_init(struct arm_pmu *cpu_pmu)
 {
 	armv7pmu_init(cpu_pmu);
-	cpu_pmu->name		= "ARMv7 Cortex-A15";
+	cpu_pmu->name		= "armv7_cortex_a15";
 	cpu_pmu->map_event	= armv7_a15_map_event;
 	cpu_pmu->num_events	= armv7_read_num_pmnc_events();
 	cpu_pmu->set_event_filter = armv7pmu_set_event_filter;
@@ -1582,7 +1045,7 @@ static int armv7_a15_pmu_init(struct arm_pmu *cpu_pmu)
 static int armv7_a7_pmu_init(struct arm_pmu *cpu_pmu)
 {
 	armv7pmu_init(cpu_pmu);
-	cpu_pmu->name		= "ARMv7 Cortex-A7";
+	cpu_pmu->name		= "armv7_cortex_a7";
 	cpu_pmu->map_event	= armv7_a7_map_event;
 	cpu_pmu->num_events	= armv7_read_num_pmnc_events();
 	cpu_pmu->set_event_filter = armv7pmu_set_event_filter;
@@ -1592,7 +1055,7 @@ static int armv7_a7_pmu_init(struct arm_pmu *cpu_pmu)
 static int armv7_a12_pmu_init(struct arm_pmu *cpu_pmu)
 {
 	armv7pmu_init(cpu_pmu);
-	cpu_pmu->name		= "ARMv7 Cortex-A12";
+	cpu_pmu->name		= "armv7_cortex_a12";
 	cpu_pmu->map_event	= armv7_a12_map_event;
 	cpu_pmu->num_events	= armv7_read_num_pmnc_events();
 	cpu_pmu->set_event_filter = armv7pmu_set_event_filter;
@@ -1602,7 +1065,7 @@ static int armv7_a12_pmu_init(struct arm_pmu *cpu_pmu)
 static int armv7_a17_pmu_init(struct arm_pmu *cpu_pmu)
 {
 	armv7_a12_pmu_init(cpu_pmu);
-	cpu_pmu->name = "ARMv7 Cortex-A17";
+	cpu_pmu->name = "armv7_cortex_a17";
 	return 0;
 }
 
@@ -1823,6 +1286,7 @@ static void krait_pmu_disable_event(struct perf_event *event)
 	unsigned long flags;
 	struct hw_perf_event *hwc = &event->hw;
 	int idx = hwc->idx;
+	struct arm_pmu *cpu_pmu = to_arm_pmu(event->pmu);
 	struct pmu_hw_events *events = cpu_pmu->get_hw_events();
 
 	/* Disable counter and interrupt */
@@ -1848,6 +1312,7 @@ static void krait_pmu_enable_event(struct perf_event *event)
 	unsigned long flags;
 	struct hw_perf_event *hwc = &event->hw;
 	int idx = hwc->idx;
+	struct arm_pmu *cpu_pmu = to_arm_pmu(event->pmu);
 	struct pmu_hw_events *events = cpu_pmu->get_hw_events();
 
 	/*
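
The two Krait hunks above stop relying on a file-scope cpu_pmu variable and instead derive the PMU from the event itself. to_arm_pmu() is presumably the usual container_of() wrapper; a minimal sketch of the assumed shape:

    /* sketch: recover the struct arm_pmu that embeds the generic struct pmu */
    #define to_arm_pmu(p) (container_of(p, struct arm_pmu, pmu))
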
@@ -1981,7 +1446,7 @@ static void krait_pmu_clear_event_idx(struct pmu_hw_events *cpuc,
 static int krait_pmu_init(struct arm_pmu *cpu_pmu)
 {
 	armv7pmu_init(cpu_pmu);
-	cpu_pmu->name		= "ARMv7 Krait";
+	cpu_pmu->name		= "armv7_krait";
 	/* Some early versions of Krait don't support PC write events */
 	if (of_property_read_bool(cpu_pmu->plat_device->dev.of_node,
 				  "qcom,no-pc-write"))

+ 17 - 104
arch/arm/kernel/perf_event_xscale.c

@@ -48,118 +48,31 @@ enum xscale_counters {
 };
 
 static const unsigned xscale_perf_map[PERF_COUNT_HW_MAX] = {
+	PERF_MAP_ALL_UNSUPPORTED,
 	[PERF_COUNT_HW_CPU_CYCLES]		= XSCALE_PERFCTR_CCNT,
 	[PERF_COUNT_HW_INSTRUCTIONS]		= XSCALE_PERFCTR_INSTRUCTION,
-	[PERF_COUNT_HW_CACHE_REFERENCES]	= HW_OP_UNSUPPORTED,
-	[PERF_COUNT_HW_CACHE_MISSES]		= HW_OP_UNSUPPORTED,
 	[PERF_COUNT_HW_BRANCH_INSTRUCTIONS]	= XSCALE_PERFCTR_BRANCH,
 	[PERF_COUNT_HW_BRANCH_MISSES]		= XSCALE_PERFCTR_BRANCH_MISS,
-	[PERF_COUNT_HW_BUS_CYCLES]		= HW_OP_UNSUPPORTED,
 	[PERF_COUNT_HW_STALLED_CYCLES_FRONTEND]	= XSCALE_PERFCTR_ICACHE_NO_DELIVER,
-	[PERF_COUNT_HW_STALLED_CYCLES_BACKEND]	= HW_OP_UNSUPPORTED,
 };
 
 static const unsigned xscale_perf_cache_map[PERF_COUNT_HW_CACHE_MAX]
 					   [PERF_COUNT_HW_CACHE_OP_MAX]
 					   [PERF_COUNT_HW_CACHE_RESULT_MAX] = {
-	[C(L1D)] = {
-		[C(OP_READ)] = {
-			[C(RESULT_ACCESS)]	= XSCALE_PERFCTR_DCACHE_ACCESS,
-			[C(RESULT_MISS)]	= XSCALE_PERFCTR_DCACHE_MISS,
-		},
-		[C(OP_WRITE)] = {
-			[C(RESULT_ACCESS)]	= XSCALE_PERFCTR_DCACHE_ACCESS,
-			[C(RESULT_MISS)]	= XSCALE_PERFCTR_DCACHE_MISS,
-		},
-		[C(OP_PREFETCH)] = {
-			[C(RESULT_ACCESS)]	= CACHE_OP_UNSUPPORTED,
-			[C(RESULT_MISS)]	= CACHE_OP_UNSUPPORTED,
-		},
-	},
-	[C(L1I)] = {
-		[C(OP_READ)] = {
-			[C(RESULT_ACCESS)]	= CACHE_OP_UNSUPPORTED,
-			[C(RESULT_MISS)]	= XSCALE_PERFCTR_ICACHE_MISS,
-		},
-		[C(OP_WRITE)] = {
-			[C(RESULT_ACCESS)]	= CACHE_OP_UNSUPPORTED,
-			[C(RESULT_MISS)]	= CACHE_OP_UNSUPPORTED,
-		},
-		[C(OP_PREFETCH)] = {
-			[C(RESULT_ACCESS)]	= CACHE_OP_UNSUPPORTED,
-			[C(RESULT_MISS)]	= CACHE_OP_UNSUPPORTED,
-		},
-	},
-	[C(LL)] = {
-		[C(OP_READ)] = {
-			[C(RESULT_ACCESS)]	= CACHE_OP_UNSUPPORTED,
-			[C(RESULT_MISS)]	= CACHE_OP_UNSUPPORTED,
-		},
-		[C(OP_WRITE)] = {
-			[C(RESULT_ACCESS)]	= CACHE_OP_UNSUPPORTED,
-			[C(RESULT_MISS)]	= CACHE_OP_UNSUPPORTED,
-		},
-		[C(OP_PREFETCH)] = {
-			[C(RESULT_ACCESS)]	= CACHE_OP_UNSUPPORTED,
-			[C(RESULT_MISS)]	= CACHE_OP_UNSUPPORTED,
-		},
-	},
-	[C(DTLB)] = {
-		[C(OP_READ)] = {
-			[C(RESULT_ACCESS)]	= CACHE_OP_UNSUPPORTED,
-			[C(RESULT_MISS)]	= XSCALE_PERFCTR_DTLB_MISS,
-		},
-		[C(OP_WRITE)] = {
-			[C(RESULT_ACCESS)]	= CACHE_OP_UNSUPPORTED,
-			[C(RESULT_MISS)]	= XSCALE_PERFCTR_DTLB_MISS,
-		},
-		[C(OP_PREFETCH)] = {
-			[C(RESULT_ACCESS)]	= CACHE_OP_UNSUPPORTED,
-			[C(RESULT_MISS)]	= CACHE_OP_UNSUPPORTED,
-		},
-	},
-	[C(ITLB)] = {
-		[C(OP_READ)] = {
-			[C(RESULT_ACCESS)]	= CACHE_OP_UNSUPPORTED,
-			[C(RESULT_MISS)]	= XSCALE_PERFCTR_ITLB_MISS,
-		},
-		[C(OP_WRITE)] = {
-			[C(RESULT_ACCESS)]	= CACHE_OP_UNSUPPORTED,
-			[C(RESULT_MISS)]	= XSCALE_PERFCTR_ITLB_MISS,
-		},
-		[C(OP_PREFETCH)] = {
-			[C(RESULT_ACCESS)]	= CACHE_OP_UNSUPPORTED,
-			[C(RESULT_MISS)]	= CACHE_OP_UNSUPPORTED,
-		},
-	},
-	[C(BPU)] = {
-		[C(OP_READ)] = {
-			[C(RESULT_ACCESS)]	= CACHE_OP_UNSUPPORTED,
-			[C(RESULT_MISS)]	= CACHE_OP_UNSUPPORTED,
-		},
-		[C(OP_WRITE)] = {
-			[C(RESULT_ACCESS)]	= CACHE_OP_UNSUPPORTED,
-			[C(RESULT_MISS)]	= CACHE_OP_UNSUPPORTED,
-		},
-		[C(OP_PREFETCH)] = {
-			[C(RESULT_ACCESS)]	= CACHE_OP_UNSUPPORTED,
-			[C(RESULT_MISS)]	= CACHE_OP_UNSUPPORTED,
-		},
-	},
-	[C(NODE)] = {
-		[C(OP_READ)] = {
-			[C(RESULT_ACCESS)]	= CACHE_OP_UNSUPPORTED,
-			[C(RESULT_MISS)]	= CACHE_OP_UNSUPPORTED,
-		},
-		[C(OP_WRITE)] = {
-			[C(RESULT_ACCESS)]	= CACHE_OP_UNSUPPORTED,
-			[C(RESULT_MISS)]	= CACHE_OP_UNSUPPORTED,
-		},
-		[C(OP_PREFETCH)] = {
-			[C(RESULT_ACCESS)]	= CACHE_OP_UNSUPPORTED,
-			[C(RESULT_MISS)]	= CACHE_OP_UNSUPPORTED,
-		},
-	},
+	PERF_CACHE_MAP_ALL_UNSUPPORTED,
+
+	[C(L1D)][C(OP_READ)][C(RESULT_ACCESS)]	= XSCALE_PERFCTR_DCACHE_ACCESS,
+	[C(L1D)][C(OP_READ)][C(RESULT_MISS)]	= XSCALE_PERFCTR_DCACHE_MISS,
+	[C(L1D)][C(OP_WRITE)][C(RESULT_ACCESS)]	= XSCALE_PERFCTR_DCACHE_ACCESS,
+	[C(L1D)][C(OP_WRITE)][C(RESULT_MISS)]	= XSCALE_PERFCTR_DCACHE_MISS,
+
+	[C(L1I)][C(OP_READ)][C(RESULT_MISS)]	= XSCALE_PERFCTR_ICACHE_MISS,
+
+	[C(DTLB)][C(OP_READ)][C(RESULT_MISS)]	= XSCALE_PERFCTR_DTLB_MISS,
+	[C(DTLB)][C(OP_WRITE)][C(RESULT_MISS)]	= XSCALE_PERFCTR_DTLB_MISS,
+
+	[C(ITLB)][C(OP_READ)][C(RESULT_MISS)]	= XSCALE_PERFCTR_ITLB_MISS,
+	[C(ITLB)][C(OP_WRITE)][C(RESULT_MISS)]	= XSCALE_PERFCTR_ITLB_MISS,
 };
 
 #define	XSCALE_PMU_ENABLE	0x001
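
The ARMv7, Krait and XScale maps above all now open with PERF_MAP_ALL_UNSUPPORTED or PERF_CACHE_MAP_ALL_UNSUPPORTED, so only events the hardware actually counts need an explicit entry. The macros themselves are introduced elsewhere in this series; a minimal sketch of the idea, assuming they are built on GCC's range designators (a later designated initializer overrides the range default, which is what lets the per-event lines that follow win):

    #define PERF_MAP_ALL_UNSUPPORTED \
    	[0 ... PERF_COUNT_HW_MAX - 1] = HW_OP_UNSUPPORTED

    #define PERF_CACHE_MAP_ALL_UNSUPPORTED \
    	[0 ... C(MAX) - 1] = { \
    		[0 ... C(OP_MAX) - 1] = { \
    			[0 ... C(RESULT_MAX) - 1] = CACHE_OP_UNSUPPORTED, \
    		}, \
    	}
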
@@ -442,7 +355,7 @@ static int xscale_map_event(struct perf_event *event)
 
 static int xscale1pmu_init(struct arm_pmu *cpu_pmu)
 {
-	cpu_pmu->name		= "xscale1";
+	cpu_pmu->name		= "armv5_xscale1";
 	cpu_pmu->handle_irq	= xscale1pmu_handle_irq;
 	cpu_pmu->enable		= xscale1pmu_enable_event;
 	cpu_pmu->disable	= xscale1pmu_disable_event;
@@ -812,7 +725,7 @@ static inline void xscale2pmu_write_counter(struct perf_event *event, u32 val)
 
 static int xscale2pmu_init(struct arm_pmu *cpu_pmu)
 {
-	cpu_pmu->name		= "xscale2";
+	cpu_pmu->name		= "armv5_xscale2";
 	cpu_pmu->handle_irq	= xscale2pmu_handle_irq;
 	cpu_pmu->enable		= xscale2pmu_enable_event;
 	cpu_pmu->disable	= xscale2pmu_disable_event;

+ 2 - 1
arch/arm/kernel/relocate_kernel.S

@@ -3,6 +3,7 @@
  */
 
 #include <linux/linkage.h>
+#include <asm/assembler.h>
 #include <asm/kexec.h>
 
 	.align	3	/* not needed for this code, but keeps fncpy() happy */
@@ -59,7 +60,7 @@ ENTRY(relocate_new_kernel)
 	mov r0,#0
 	ldr r1,kexec_mach_type
 	ldr r2,kexec_boot_atags
- ARM(	mov pc, lr	)
+ ARM(	ret lr	)
 THUMB(	bx lr		)
 
 	.align
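
The new ret/reteq/retls/... forms used throughout this merge come from <asm/assembler.h>, which the touched files now include. A sketch of how such a macro family can be generated with gas, assuming it follows the obvious shape: emit bx for register returns on ARMv6 and later (friendlier to the core's return-address predictor) and fall back to the traditional mov pc, reg on older CPUs:

    	.irp	c,,eq,ne,cs,cc,mi,pl,vs,vc,hi,ls,ge,lt,gt,le,hs,lo
    	.macro	ret\c, reg
    #if __LINUX_ARM_ARCH__ < 6
    	mov\c	pc, \reg	@ pre-v6: no interworking-safe bx everywhere
    #else
    	.ifeqs	"\reg", "lr"
    	bx\c	\reg		@ v6+: bx predicts returns correctly
    	.else
    	mov\c	pc, \reg
    	.endif
    #endif
    	.endm
    	.endr
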

+ 22 - 7
arch/arm/kernel/setup.c

@@ -393,19 +393,34 @@ static void __init cpuid_init_hwcaps(void)
 		elf_hwcap |= HWCAP_LPAE;
 }
 
-static void __init feat_v6_fixup(void)
+static void __init elf_hwcap_fixup(void)
 {
-	int id = read_cpuid_id();
-
-	if ((id & 0xff0f0000) != 0x41070000)
-		return;
+	unsigned id = read_cpuid_id();
+	unsigned sync_prim;
 
 	/*
 	 * HWCAP_TLS is available only on 1136 r1p0 and later,
 	 * see also kuser_get_tls_init.
 	 */
-	if ((((id >> 4) & 0xfff) == 0xb36) && (((id >> 20) & 3) == 0))
+	if (read_cpuid_part() == ARM_CPU_PART_ARM1136 &&
+	    ((id >> 20) & 3) == 0) {
 		elf_hwcap &= ~HWCAP_TLS;
+		return;
+	}
+
+	/* Verify if CPUID scheme is implemented */
+	if ((id & 0x000f0000) != 0x000f0000)
+		return;
+
+	/*
+	 * If the CPU supports LDREX/STREX and LDREXB/STREXB,
+	 * avoid advertising SWP; it may not be atomic with
+	 * multiprocessing cores.
+	 */
+	sync_prim = ((read_cpuid_ext(CPUID_EXT_ISAR3) >> 8) & 0xf0) |
+		    ((read_cpuid_ext(CPUID_EXT_ISAR4) >> 20) & 0x0f);
+	if (sync_prim >= 0x13)
+		elf_hwcap &= ~HWCAP_SWP;
 }
 
 /*
@@ -609,7 +624,7 @@ static void __init setup_processor(void)
 #endif
 	erratum_a15_798181_init();
 
-	feat_v6_fixup();
+	elf_hwcap_fixup();
 
 	cacheid_init();
 	cpu_init();
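
Once elf_hwcap_fixup() clears HWCAP_SWP on CPUs with full exclusives, userspace can no longer assume SWP/SWPB are safe to emit and should probe the hwcaps instead. A hypothetical userspace check (getauxval() is glibc's auxv accessor; the program is illustrative only):

    #include <stdio.h>
    #include <sys/auxv.h>	/* getauxval(), AT_HWCAP */
    #include <asm/hwcap.h>	/* HWCAP_SWP */

    int main(void)
    {
    	unsigned long hwcaps = getauxval(AT_HWCAP);

    	/* if the kernel cleared HWCAP_SWP, use LDREX/STREX sequences */
    	printf("SWP is %s\n",
    	       (hwcaps & HWCAP_SWP) ? "advertised" : "not advertised");
    	return 0;
    }
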

+ 1 - 1
arch/arm/kernel/sleep.S

@@ -107,7 +107,7 @@ ENTRY(cpu_resume_mmu)
 	instr_sync
 	mov	r0, r0
 	mov	r0, r0
-	mov	pc, r3			@ jump to virtual address
+	ret	r3			@ jump to virtual address
 ENDPROC(cpu_resume_mmu)
 	.popsection
 cpu_resume_after_mmu:

+ 10 - 2
arch/arm/kernel/smp_scu.c

@@ -17,6 +17,8 @@
 #include <asm/cputype.h>
 
 #define SCU_CTRL		0x00
+#define SCU_ENABLE		(1 << 0)
+#define SCU_STANDBY_ENABLE	(1 << 5)
 #define SCU_CONFIG		0x04
 #define SCU_CPU_STATUS		0x08
 #define SCU_INVALIDATE		0x0c
@@ -50,10 +52,16 @@ void scu_enable(void __iomem *scu_base)
 
 	scu_ctrl = readl_relaxed(scu_base + SCU_CTRL);
 	/* already enabled? */
-	if (scu_ctrl & 1)
+	if (scu_ctrl & SCU_ENABLE)
 		return;
 
-	scu_ctrl |= 1;
+	scu_ctrl |= SCU_ENABLE;
+
+	/* Cortex-A9 earlier than r2p0 has no standby bit in SCU */
+	if ((read_cpuid_id() & 0xff0ffff0) == 0x410fc090 &&
+	    (read_cpuid_id() & 0x00f0000f) >= 0x00200000)
+		scu_ctrl |= SCU_STANDBY_ENABLE;
+
 	writel_relaxed(scu_ctrl, scu_base + SCU_CTRL);
 
 	/*
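
The two masks in the new check decode the MIDR: 0xff0ffff0 keeps the implementer, architecture and part-number fields (0x41/0xf/0xc09 is an ARM Cortex-A9), while 0x00f0000f keeps the variant and revision fields, so >= 0x00200000 means r2p0 or later. Hedged helpers spelling out the fields used here and in the smp_tlb.c hunk below (names are illustrative, not kernel API):

    /* MIDR layout: [31:24] implementer, [23:20] variant (the N in rNpM),
     * [19:16] architecture, [15:4] part number, [3:0] revision (the M in rNpM) */
    static inline unsigned int midr_variant(unsigned int midr)
    {
    	return (midr >> 20) & 0xf;	/* major revision */
    }

    static inline unsigned int midr_revision(unsigned int midr)
    {
    	return midr & 0xf;		/* minor revision */
    }
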

+ 12 - 8
arch/arm/kernel/smp_tlb.c

@@ -92,15 +92,19 @@ void erratum_a15_798181_init(void)
 	unsigned int midr = read_cpuid_id();
 	unsigned int revidr = read_cpuid(CPUID_REVIDR);
 
-	/* Cortex-A15 r0p0..r3p2 w/o ECO fix affected */
-	if ((midr & 0xff0ffff0) != 0x410fc0f0 || midr > 0x413fc0f2 ||
-	    (revidr & 0x210) == 0x210) {
-		return;
-	}
-	if (revidr & 0x10)
-		erratum_a15_798181_handler = erratum_a15_798181_partial;
-	else
+	/* Brahma-B15 r0p0..r0p2 affected
+	 * Cortex-A15 r0p0..r3p2 w/o ECO fix affected */
+	if ((midr & 0xff0ffff0) == 0x420f00f0 && midr <= 0x420f00f2)
 		erratum_a15_798181_handler = erratum_a15_798181_broadcast;
 		erratum_a15_798181_handler = erratum_a15_798181_broadcast;
+	else if ((midr & 0xff0ffff0) == 0x410fc0f0 && midr <= 0x413fc0f2 &&
+		 (revidr & 0x210) != 0x210) {
+		if (revidr & 0x10)
+			erratum_a15_798181_handler =
+				erratum_a15_798181_partial;
+		else
+			erratum_a15_798181_handler =
+				erratum_a15_798181_broadcast;
+	}
 }
 #endif
 

+ 4 - 0
arch/arm/kernel/swp_emulate.c

@@ -27,6 +27,7 @@
 #include <linux/perf_event.h>
 
 #include <asm/opcodes.h>
+#include <asm/system_info.h>
 #include <asm/traps.h>
 #include <asm/uaccess.h>
 
@@ -266,6 +267,9 @@ static struct undef_hook swp_hook = {
  */
 static int __init swp_emulation_init(void)
 {
+	if (cpu_architecture() < CPU_ARCH_ARMv7)
+		return 0;
+
 #ifdef CONFIG_PROC_FS
 	if (!proc_create("cpu/swp_emulation", S_IRUGO, NULL, &proc_status_fops))
 		return -ENOMEM;

+ 1 - 4
arch/arm/kernel/time.c

@@ -50,10 +50,7 @@ unsigned long profile_pc(struct pt_regs *regs)
 	if (!in_lock_functions(regs->ARM_pc))
 		return regs->ARM_pc;
 
-	frame.fp = regs->ARM_fp;
-	frame.sp = regs->ARM_sp;
-	frame.lr = regs->ARM_lr;
-	frame.pc = regs->ARM_pc;
+	arm_get_current_stackframe(regs, &frame);
 	do {
 		int ret = unwind_frame(&frame);
 		if (ret < 0)
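
This hunk, and the traps.c and unwind.c hunks below, switch to frame_pointer() and arm_get_current_stackframe(), which hide the ARM-versus-Thumb-2 choice of frame pointer register (r11 versus r7). A sketch of what the helper presumably does, assuming it simply seeds the unwinder's frame from the saved registers:

    /* sketch: seed a struct stackframe from an exception's pt_regs */
    static inline void arm_get_current_stackframe(struct pt_regs *regs,
    					      struct stackframe *frame)
    {
    	frame->fp = frame_pointer(regs);	/* r11, or r7 on Thumb-2 */
    	frame->sp = regs->ARM_sp;
    	frame->lr = regs->ARM_lr;
    	frame->pc = regs->ARM_pc;
    }
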

+ 4 - 2
arch/arm/kernel/traps.c

@@ -31,11 +31,13 @@
 #include <asm/exception.h>
 #include <asm/unistd.h>
 #include <asm/traps.h>
+#include <asm/ptrace.h>
 #include <asm/unwind.h>
 #include <asm/tls.h>
 #include <asm/system_misc.h>
 #include <asm/opcodes.h>
 
+
 static const char *handler[]= {
 	"prefetch abort",
 	"data abort",
@@ -184,7 +186,7 @@ static void dump_backtrace(struct pt_regs *regs, struct task_struct *tsk)
 		tsk = current;
 
 	if (regs) {
-		fp = regs->ARM_fp;
+		fp = frame_pointer(regs);
 		mode = processor_mode(regs);
 	} else if (tsk != current) {
 		fp = thread_saved_fp(tsk);
@@ -719,7 +721,7 @@ asmlinkage int arm_syscall(int no, struct pt_regs *regs)
 		dump_instr("", regs);
 		if (user_mode(regs)) {
 			__show_regs(regs);
-			c_backtrace(regs->ARM_fp, processor_mode(regs));
+			c_backtrace(frame_pointer(regs), processor_mode(regs));
 		}
 	}
 #endif

+ 3 - 5
arch/arm/kernel/unwind.c

@@ -479,12 +479,10 @@ void unwind_backtrace(struct pt_regs *regs, struct task_struct *tsk)
 		tsk = current;
 
 	if (regs) {
-		frame.fp = regs->ARM_fp;
-		frame.sp = regs->ARM_sp;
-		frame.lr = regs->ARM_lr;
+		arm_get_current_stackframe(regs, &frame);
 		/* PC might be corrupted, use LR in that case. */
-		frame.pc = kernel_text_address(regs->ARM_pc)
-			 ? regs->ARM_pc : regs->ARM_lr;
+		if (!kernel_text_address(regs->ARM_pc))
+			frame.pc = regs->ARM_lr;
 	} else if (tsk == current) {
 		frame.fp = (unsigned long)__builtin_frame_address(0);
 		frame.sp = current_sp;

+ 0 - 1
arch/arm/kernel/vmlinux.lds.S

@@ -318,7 +318,6 @@ SECTIONS
 	_end = .;
 
 	STABS_DEBUG
-	.comment 0 : { *(.comment) }
 }
 
 /*

+ 1 - 7
arch/arm/kvm/guest.c

@@ -274,13 +274,7 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
 
 int __attribute_const__ kvm_target_cpu(void)
 {
-	unsigned long implementor = read_cpuid_implementor();
-	unsigned long part_number = read_cpuid_part_number();
-
-	if (implementor != ARM_CPU_IMP_ARM)
-		return -EINVAL;
-
-	switch (part_number) {
+	switch (read_cpuid_part()) {
 	case ARM_CPU_PART_CORTEX_A7:
 		return KVM_ARM_TARGET_CORTEX_A7;
 	case ARM_CPU_PART_CORTEX_A15:
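
read_cpuid_part() folds the old two-step implementer plus part-number test into one compare, which only works if the ARM_CPU_PART_* constants now carry the vendor bits as well; this is the "part number checks should always include the vendor field" clean-up used across the Exynos hunks below too. A sketch of the likely shape (mask value assumed, not quoted from the tree):

    #define ARM_CPU_PART_MASK	0xff00fff0	/* implementer | part number */

    static inline unsigned int __attribute_const__ read_cpuid_part(void)
    {
    	return read_cpuid_id() & ARM_CPU_PART_MASK;
    }
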

+ 2 - 1
arch/arm/kvm/init.S

@@ -17,6 +17,7 @@
  */
 
 #include <linux/linkage.h>
+#include <asm/assembler.h>
 #include <asm/unified.h>
 #include <asm/asm-offsets.h>
 #include <asm/kvm_asm.h>
@@ -134,7 +135,7 @@ phase2:
 	ldr	r0, =TRAMPOLINE_VA
 	adr	r1, target
 	bfi	r0, r1, #0, #PAGE_SHIFT
-	mov	pc, r0
+	ret	r0
 
 target:	@ We're now in the trampoline code, switch page tables
 	mcrr	p15, 4, r2, r3, c2

+ 2 - 1
arch/arm/lib/ashldi3.S

@@ -27,6 +27,7 @@ Boston, MA 02110-1301, USA.  */
 
 
 #include <linux/linkage.h>
+#include <asm/assembler.h>
 
 #ifdef __ARMEB__
 #define al r1
@@ -47,7 +48,7 @@ ENTRY(__aeabi_llsl)
 THUMB(	lsrmi	r3, al, ip		)
 THUMB(	orrmi	ah, ah, r3		)
 	mov	al, al, lsl r2
-	mov	pc, lr
+	ret	lr
 
 ENDPROC(__ashldi3)
 ENDPROC(__aeabi_llsl)

+ 2 - 1
arch/arm/lib/ashrdi3.S

@@ -27,6 +27,7 @@ Boston, MA 02110-1301, USA.  */
 
 
 #include <linux/linkage.h>
+#include <asm/assembler.h>
 
 #ifdef __ARMEB__
 #define al r1
@@ -47,7 +48,7 @@ ENTRY(__aeabi_lasr)
 THUMB(	lslmi	r3, ah, ip		)
 THUMB(	orrmi	al, al, r3		)
 	mov	ah, ah, asr r2
-	mov	pc, lr
+	ret	lr
 
 ENDPROC(__ashrdi3)
 ENDPROC(__aeabi_lasr)

+ 1 - 1
arch/arm/lib/backtrace.S

@@ -25,7 +25,7 @@
 ENTRY(c_backtrace)
 
 #if !defined(CONFIG_FRAME_POINTER) || !defined(CONFIG_PRINTK)
-		mov	pc, lr
+		ret	lr
 ENDPROC(c_backtrace)
 #else
 		stmfd	sp!, {r4 - r8, lr}	@ Save an extra register so we have a location...

+ 3 - 2
arch/arm/lib/bitops.h

@@ -1,3 +1,4 @@
+#include <asm/assembler.h>
 #include <asm/unwind.h>
 
 #if __LINUX_ARM_ARCH__ >= 6
@@ -70,7 +71,7 @@ UNWIND(	.fnstart	)
 	\instr	r2, r2, r3
 	str	r2, [r1, r0, lsl #2]
 	restore_irqs ip
-	mov	pc, lr
+	ret	lr
 UNWIND(	.fnend		)
 ENDPROC(\name		)
 	.endm
@@ -98,7 +99,7 @@ UNWIND(	.fnstart	)
 	\store	r2, [r1]
 	moveq	r0, #0
 	restore_irqs ip
-	mov	pc, lr
+	ret	lr
 UNWIND(	.fnend		)
 ENDPROC(\name		)
 	.endm

+ 3 - 2
arch/arm/lib/bswapsdi2.S

@@ -1,4 +1,5 @@
 #include <linux/linkage.h>
+#include <asm/assembler.h>
 
 #if __LINUX_ARM_ARCH__ >= 6
 ENTRY(__bswapsi2)
@@ -18,7 +19,7 @@ ENTRY(__bswapsi2)
 	mov r3, r3, lsr #8
 	bic r3, r3, #0xff00
 	eor r0, r3, r0, ror #8
-	mov pc, lr
+	ret lr
 ENDPROC(__bswapsi2)
 
 ENTRY(__bswapdi2)
@@ -31,6 +32,6 @@ ENTRY(__bswapdi2)
 	bic r1, r1, #0xff00
 	eor r1, r1, r0, ror #8
 	eor r0, r3, ip, ror #8
-	mov pc, lr
+	ret lr
 ENDPROC(__bswapdi2)
 #endif

+ 2 - 2
arch/arm/lib/call_with_stack.S

@@ -36,9 +36,9 @@ ENTRY(call_with_stack)
 	mov	r0, r1
 
 	adr	lr, BSYM(1f)
-	mov	pc, r2
+	ret	r2
 
 1:	ldr	lr, [sp]
 	ldr	sp, [sp, #4]
-	mov	pc, lr
+	ret	lr
 ENDPROC(call_with_stack)

+ 1 - 1
arch/arm/lib/csumpartial.S

@@ -97,7 +97,7 @@ td3	.req	lr
 #endif
 #endif
 		adcnes	sum, sum, td0		@ update checksum
-		mov	pc, lr
+		ret	lr
 
 ENTRY(csum_partial)
 		stmfd	sp!, {buf, lr}

+ 3 - 2
arch/arm/lib/csumpartialcopygeneric.S

@@ -7,6 +7,7 @@
  * it under the terms of the GNU General Public License version 2 as
  * published by the Free Software Foundation.
  */
+#include <asm/assembler.h>
 
 /*
  * unsigned int
@@ -40,7 +41,7 @@ sum	.req	r3
 		adcs	sum, sum, ip, put_byte_1	@ update checksum
 		strb	ip, [dst], #1
 		tst	dst, #2
-		moveq	pc, lr			@ dst is now 32bit aligned
+		reteq	lr			@ dst is now 32bit aligned
 
 .Ldst_16bit:	load2b	r8, ip
 		sub	len, len, #2
@@ -48,7 +49,7 @@ sum	.req	r3
 		strb	r8, [dst], #1
 		adcs	sum, sum, ip, put_byte_1
 		strb	ip, [dst], #1
-		mov	pc, lr			@ dst is now 32bit aligned
+		ret	lr			@ dst is now 32bit aligned
 
 		/*
 		 * Handle 0 to 7 bytes, with any alignment of source and

+ 9 - 9
arch/arm/lib/delay-loop.S

@@ -35,7 +35,7 @@ ENTRY(__loop_const_udelay)			@ 0 <= r0 <= 0x7fffff06
 		mul	r0, r2, r0		@ max = 2^32-1
 		add	r0, r0, r1, lsr #32-6
 		movs	r0, r0, lsr #6
-		moveq	pc, lr
+		reteq	lr
 
 /*
  * loops = r0 * HZ * loops_per_jiffy / 1000000
@@ -46,23 +46,23 @@ ENTRY(__loop_const_udelay)			@ 0 <= r0 <= 0x7fffff06
 ENTRY(__loop_delay)
 		subs	r0, r0, #1
 #if 0
-		movls	pc, lr
+		retls	lr
 		subs	r0, r0, #1
-		movls	pc, lr
+		retls	lr
 		subs	r0, r0, #1
-		movls	pc, lr
+		retls	lr
 		subs	r0, r0, #1
-		movls	pc, lr
+		retls	lr
 		subs	r0, r0, #1
-		movls	pc, lr
+		retls	lr
 		subs	r0, r0, #1
-		movls	pc, lr
+		retls	lr
 		subs	r0, r0, #1
-		movls	pc, lr
+		retls	lr
 		subs	r0, r0, #1
 #endif
 		bhi	__loop_delay
-		mov	pc, lr
+		ret	lr
 ENDPROC(__loop_udelay)
 ENDPROC(__loop_const_udelay)
 ENDPROC(__loop_delay)

+ 7 - 6
arch/arm/lib/div64.S

@@ -13,6 +13,7 @@
  */
 
 #include <linux/linkage.h>
+#include <asm/assembler.h>
 #include <asm/unwind.h>
 
 #ifdef __ARMEB__
@@ -97,7 +98,7 @@ UNWIND(.fnstart)
 	mov	yl, #0
 	cmpeq	xl, r4
 	movlo	xh, xl
-	movlo	pc, lr
+	retlo	lr
 
 	@ The division loop for lower bit positions.
 	@ Here we shift remainder bits leftwards rather than moving the
@@ -111,14 +112,14 @@ UNWIND(.fnstart)
 	subcs	xh, xh, r4
 	movs	ip, ip, lsr #1
 	bne	4b
-	mov	pc, lr
+	ret	lr
 
 	@ The top part of remainder became zero.  If carry is set
 	@ (the 33rd bit) this is a false positive so resume the loop.
 	@ Otherwise, if lower part is also null then we are done.
 6:	bcs	5b
 	cmp	xl, #0
-	moveq	pc, lr
+	reteq	lr
 
 	@ We still have remainder bits in the low part.  Bring them up.
 
@@ -144,7 +145,7 @@ UNWIND(.fnstart)
 	movs	ip, ip, lsr #1
 	mov	xh, #1
 	bne	4b
-	mov	pc, lr
+	ret	lr
 
 8:	@ Division by a power of 2: determine what that divisor order is
 	@ then simply shift values around
@@ -184,13 +185,13 @@ UNWIND(.fnstart)
 THUMB(	orr	yl, yl, xh		)
 	mov	xh, xl, lsl ip
 	mov	xh, xh, lsr ip
-	mov	pc, lr
+	ret	lr
 
 	@ eq -> division by 1: obvious enough...
 9:	moveq	yl, xl
 	moveq	yh, xh
 	moveq	xh, #0
-	moveq	pc, lr
+	reteq	lr
 UNWIND(.fnend)
 
 UNWIND(.fnstart)

+ 5 - 5
arch/arm/lib/findbit.S

@@ -35,7 +35,7 @@ ENTRY(_find_first_zero_bit_le)
 2:		cmp	r2, r1			@ any more?
 		blo	1b
 3:		mov	r0, r1			@ no free bits
-		mov	pc, lr
+		ret	lr
 ENDPROC(_find_first_zero_bit_le)
 
 /*
@@ -76,7 +76,7 @@ ENTRY(_find_first_bit_le)
 2:		cmp	r2, r1			@ any more?
 		blo	1b
 3:		mov	r0, r1			@ no free bits
-		mov	pc, lr
+		ret	lr
 ENDPROC(_find_first_bit_le)
 
 /*
@@ -114,7 +114,7 @@ ENTRY(_find_first_zero_bit_be)
 2:		cmp	r2, r1			@ any more?
 		blo	1b
 3:		mov	r0, r1			@ no free bits
-		mov	pc, lr
+		ret	lr
 ENDPROC(_find_first_zero_bit_be)
 
 ENTRY(_find_next_zero_bit_be)
@@ -148,7 +148,7 @@ ENTRY(_find_first_bit_be)
 2:		cmp	r2, r1			@ any more?
 		blo	1b
 3:		mov	r0, r1			@ no free bits
-		mov	pc, lr
+		ret	lr
 ENDPROC(_find_first_bit_be)
 
 ENTRY(_find_next_bit_be)
@@ -192,5 +192,5 @@ ENDPROC(_find_next_bit_be)
 #endif
 		cmp	r1, r0			@ Clamp to maxbit
 		movlo	r0, r1
-		mov	pc, lr
+		ret	lr
 

+ 40 - 5
arch/arm/lib/getuser.S

@@ -18,7 +18,7 @@
  * Inputs:	r0 contains the address
  *		r1 contains the address limit, which must be preserved
  * Outputs:	r0 is the error code
- *		r2 contains the zero-extended value
+ *		r2, r3 contains the zero-extended value
  *		lr corrupted
  *
  * No other registers must be altered.  (see <asm/uaccess.h>
@@ -36,7 +36,7 @@ ENTRY(__get_user_1)
 	check_uaccess r0, 1, r1, r2, __get_user_bad
1: TUSER(ldrb)	r2, [r0]
 	mov	r0, #0
-	mov	pc, lr
+	ret	lr
 ENDPROC(__get_user_1)
 
 ENTRY(__get_user_2)
@@ -56,25 +56,60 @@ rb	.req	r0
 	orr	r2, rb, r2, lsl #8
 #endif
 	mov	r0, #0
-	mov	pc, lr
+	ret	lr
 ENDPROC(__get_user_2)
 
 ENTRY(__get_user_4)
 	check_uaccess r0, 4, r1, r2, __get_user_bad
4: TUSER(ldr)	r2, [r0]
 	mov	r0, #0
-	mov	pc, lr
+	ret	lr
 ENDPROC(__get_user_4)
 
+ENTRY(__get_user_8)
+	check_uaccess r0, 8, r1, r2, __get_user_bad
+#ifdef CONFIG_THUMB2_KERNEL
+5: TUSER(ldr)	r2, [r0]
+6: TUSER(ldr)	r3, [r0, #4]
+#else
+5: TUSER(ldr)	r2, [r0], #4
+6: TUSER(ldr)	r3, [r0]
+#endif
+	mov	r0, #0
+	ret	lr
+ENDPROC(__get_user_8)
+
+#ifdef __ARMEB__
+ENTRY(__get_user_lo8)
+	check_uaccess r0, 8, r1, r2, __get_user_bad
+#ifdef CONFIG_CPU_USE_DOMAINS
+	add	r0, r0, #4
+7:	ldrt	r2, [r0]
+#else
+7:	ldr	r2, [r0, #4]
+#endif
+	mov	r0, #0
+	ret	lr
+ENDPROC(__get_user_lo8)
+#endif
+
+__get_user_bad8:
+	mov	r3, #0
 __get_user_bad:
 	mov	r2, #0
 	mov	r0, #-EFAULT
-	mov	pc, lr
+	ret	lr
 ENDPROC(__get_user_bad)
+ENDPROC(__get_user_bad8)
 
 .pushsection __ex_table, "a"
 	.long	1b, __get_user_bad
 	.long	2b, __get_user_bad
 	.long	3b, __get_user_bad
 	.long	4b, __get_user_bad
+	.long	5b, __get_user_bad8
+	.long	6b, __get_user_bad8
+#ifdef __ARMEB__
+	.long   7b, __get_user_bad
+#endif
 .popsection
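
With __get_user_8 wired into the exception table, 8-byte get_user() now works on ARM, which is what the DRM ioctl paths wanted. A hypothetical caller inside some ioctl handler (uptr is an illustrative name):

    u64 val;

    if (get_user(val, (u64 __user *)uptr))	/* faults map to -EFAULT */
    	return -EFAULT;
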

+ 1 - 1
arch/arm/lib/io-readsb.S

@@ -25,7 +25,7 @@
 
 ENTRY(__raw_readsb)
 		teq	r2, #0		@ do we have to check for the zero len?
-		moveq	pc, lr
+		reteq	lr
 		ands	ip, r1, #3
 		bne	.Linsb_align
 

+ 3 - 3
arch/arm/lib/io-readsl.S

@@ -12,7 +12,7 @@
 
 ENTRY(__raw_readsl)
 		teq	r2, #0		@ do we have to check for the zero len?
-		moveq	pc, lr
+		reteq	lr
 		ands	ip, r1, #3
 		bne	3f
 
@@ -33,7 +33,7 @@ ENTRY(__raw_readsl)
 		stmcsia	r1!, {r3, ip}
 		ldrne	r3, [r0, #0]
 		strne	r3, [r1, #0]
-		mov	pc, lr
+		ret	lr
 
 3:		ldr	r3, [r0]
 		cmp	ip, #2
@@ -75,5 +75,5 @@ ENTRY(__raw_readsl)
 		strb	r3, [r1, #1]
 8:		mov	r3, ip, get_byte_0
 		strb	r3, [r1, #0]
-		mov	pc, lr
+		ret	lr
 ENDPROC(__raw_readsl)

+ 2 - 2
arch/arm/lib/io-readsw-armv3.S

@@ -27,11 +27,11 @@
 		strb	r3, [r1], #1
 
 		subs	r2, r2, #1
-		moveq	pc, lr
+		reteq	lr
 
 ENTRY(__raw_readsw)
 		teq	r2, #0		@ do we have to check for the zero len?
-		moveq	pc, lr
+		reteq	lr
 		tst	r1, #3
 		bne	.Linsw_align
 

+ 1 - 1
arch/arm/lib/io-readsw-armv4.S

@@ -26,7 +26,7 @@
 
 ENTRY(__raw_readsw)
 		teq	r2, #0
-		moveq	pc, lr
+		reteq	lr
 		tst	r1, #3
 		bne	.Linsw_align
 

+ 1 - 1
arch/arm/lib/io-writesb.S

@@ -45,7 +45,7 @@
 
 ENTRY(__raw_writesb)
 		teq	r2, #0		@ do we have to check for the zero len?
-		moveq	pc, lr
+		reteq	lr
 		ands	ip, r1, #3
 		bne	.Loutsb_align
 

+ 5 - 5
arch/arm/lib/io-writesl.S

@@ -12,7 +12,7 @@
 
 ENTRY(__raw_writesl)
 		teq	r2, #0		@ do we have to check for the zero len?
-		moveq	pc, lr
+		reteq	lr
 		ands	ip, r1, #3
 		bne	3f
 
@@ -33,7 +33,7 @@ ENTRY(__raw_writesl)
 		ldrne	r3, [r1, #0]
 		strcs	ip, [r0, #0]
 		strne	r3, [r0, #0]
-		mov	pc, lr
+		ret	lr
 
 3:		bic	r1, r1, #3
 		ldr	r3, [r1], #4
@@ -47,7 +47,7 @@ ENTRY(__raw_writesl)
 		orr	ip, ip, r3, lspush #16
 		str	ip, [r0]
 		bne	4b
-		mov	pc, lr
+		ret	lr
 
 5:		mov	ip, r3, lspull #8
 		ldr	r3, [r1], #4
@@ -55,7 +55,7 @@ ENTRY(__raw_writesl)
 		orr	ip, ip, r3, lspush #24
 		str	ip, [r0]
 		bne	5b
-		mov	pc, lr
+		ret	lr
 
 6:		mov	ip, r3, lspull #24
 		ldr	r3, [r1], #4
@@ -63,5 +63,5 @@ ENTRY(__raw_writesl)
 		orr	ip, ip, r3, lspush #8
 		str	ip, [r0]
 		bne	6b
-		mov	pc, lr
+		ret	lr
 ENDPROC(__raw_writesl)

+ 2 - 2
arch/arm/lib/io-writesw-armv3.S

@@ -28,11 +28,11 @@
 		orr	r3, r3, r3, lsl #16
 		str	r3, [r0]
 		subs	r2, r2, #1
-		moveq	pc, lr
+		reteq	lr
 
 ENTRY(__raw_writesw)
 		teq	r2, #0		@ do we have to check for the zero len?
-		moveq	pc, lr
+		reteq	lr
 		tst	r1, #3
 		bne	.Loutsw_align
 

+ 2 - 2
arch/arm/lib/io-writesw-armv4.S

@@ -31,7 +31,7 @@
 
 ENTRY(__raw_writesw)
 		teq	r2, #0
-		moveq	pc, lr
+		reteq	lr
 		ands	r3, r1, #3
 		bne	.Loutsw_align
 
@@ -96,5 +96,5 @@ ENTRY(__raw_writesw)
 		tst	r2, #1
3:		movne	ip, r3, lsr #8
 		strneh	ip, [r0]
-		mov	pc, lr
+		ret	lr
 ENDPROC(__raw_writesw)

+ 13 - 13
arch/arm/lib/lib1funcs.S

@@ -210,7 +210,7 @@ ENTRY(__aeabi_uidiv)
 UNWIND(.fnstart)
 
 	subs	r2, r1, #1
-	moveq	pc, lr
+	reteq	lr
 	bcc	Ldiv0
 	cmp	r0, r1
 	bls	11f
@@ -220,16 +220,16 @@ UNWIND(.fnstart)
 	ARM_DIV_BODY r0, r1, r2, r3
 
 	mov	r0, r2
-	mov	pc, lr
+	ret	lr
 
 11:	moveq	r0, #1
 	movne	r0, #0
-	mov	pc, lr
+	ret	lr
 
 12:	ARM_DIV2_ORDER r1, r2
 
 	mov	r0, r0, lsr r2
-	mov	pc, lr
+	ret	lr
 
 UNWIND(.fnend)
 ENDPROC(__udivsi3)
@@ -244,11 +244,11 @@ UNWIND(.fnstart)
 	moveq   r0, #0
 	tsthi	r1, r2				@ see if divisor is power of 2
 	andeq	r0, r0, r2
-	movls	pc, lr
+	retls	lr
 
 	ARM_MOD_BODY r0, r1, r2, r3
 
-	mov	pc, lr
+	ret	lr
 
 UNWIND(.fnend)
 ENDPROC(__umodsi3)
@@ -274,23 +274,23 @@ UNWIND(.fnstart)
 
 	cmp	ip, #0
 	rsbmi	r0, r0, #0
-	mov	pc, lr
+	ret	lr
 
 10:	teq	ip, r0				@ same sign ?
 	rsbmi	r0, r0, #0
-	mov	pc, lr
+	ret	lr
 
 11:	movlo	r0, #0
 	moveq	r0, ip, asr #31
 	orreq	r0, r0, #1
-	mov	pc, lr
+	ret	lr
 
 12:	ARM_DIV2_ORDER r1, r2
 
 	cmp	ip, #0
 	mov	r0, r3, lsr r2
 	rsbmi	r0, r0, #0
-	mov	pc, lr
+	ret	lr
 
 UNWIND(.fnend)
 ENDPROC(__divsi3)
@@ -315,7 +315,7 @@ UNWIND(.fnstart)
 
 10:	cmp	ip, #0
 	rsbmi	r0, r0, #0
-	mov	pc, lr
+	ret	lr
 
 UNWIND(.fnend)
 ENDPROC(__modsi3)
@@ -331,7 +331,7 @@ UNWIND(.save {r0, r1, ip, lr}	)
 	ldmfd	sp!, {r1, r2, ip, lr}
 	mul	r3, r0, r2
 	sub	r1, r1, r3
-	mov	pc, lr
+	ret	lr
 
 UNWIND(.fnend)
 ENDPROC(__aeabi_uidivmod)
@@ -344,7 +344,7 @@ UNWIND(.save {r0, r1, ip, lr}	)
 	ldmfd	sp!, {r1, r2, ip, lr}
 	mul	r3, r0, r2
 	sub	r1, r1, r3
-	mov	pc, lr
+	ret	lr
 
 UNWIND(.fnend)
 ENDPROC(__aeabi_idivmod)

+ 2 - 1
arch/arm/lib/lshrdi3.S

@@ -27,6 +27,7 @@ Boston, MA 02110-1301, USA.  */
 
 
 #include <linux/linkage.h>
+#include <asm/assembler.h>
 
 #ifdef __ARMEB__
 #define al r1
@@ -47,7 +48,7 @@ ENTRY(__aeabi_llsr)
 THUMB(	lslmi	r3, ah, ip		)
 THUMB(	orrmi	al, al, r3		)
 	mov	ah, ah, lsr r2
-	mov	pc, lr
+	ret	lr
 
 ENDPROC(__lshrdi3)
 ENDPROC(__aeabi_llsr)

+ 1 - 1
arch/arm/lib/memchr.S

@@ -22,5 +22,5 @@ ENTRY(memchr)
 	bne	1b
 	sub	r0, r0, #1
 2:	movne	r0, #0
-	mov	pc, lr
+	ret	lr
 ENDPROC(memchr)

+ 1 - 1
arch/arm/lib/memset.S

@@ -110,7 +110,7 @@ ENTRY(memset)
 	strneb	r1, [ip], #1
 	tst	r2, #1
 	strneb	r1, [ip], #1
-	mov	pc, lr
+	ret	lr
 
 6:	subs	r2, r2, #4		@ 1 do we have enough
 	blt	5b			@ 1 bytes to align with?

+ 1 - 1
arch/arm/lib/memzero.S

@@ -121,5 +121,5 @@ ENTRY(__memzero)
 	strneb	r2, [r0], #1		@ 1
 	tst	r1, #1			@ 1 a byte left over
 	strneb	r2, [r0], #1		@ 1
-	mov	pc, lr			@ 1
+	ret	lr			@ 1
 ENDPROC(__memzero)

+ 2 - 1
arch/arm/lib/muldi3.S

@@ -11,6 +11,7 @@
  */
 
 #include <linux/linkage.h>
+#include <asm/assembler.h>
 
 #ifdef __ARMEB__
 #define xh r0
@@ -41,7 +42,7 @@ ENTRY(__aeabi_lmul)
 	adc	xh, xh, yh, lsr #16
 	adds	xl, xl, ip, lsl #16
 	adc	xh, xh, ip, lsr #16
-	mov	pc, lr
+	ret	lr
 
 ENDPROC(__muldi3)
 ENDPROC(__aeabi_lmul)

+ 5 - 5
arch/arm/lib/putuser.S

@@ -36,7 +36,7 @@ ENTRY(__put_user_1)
 	check_uaccess r0, 1, r1, ip, __put_user_bad
1: TUSER(strb)	r2, [r0]
 	mov	r0, #0
-	mov	pc, lr
+	ret	lr
 ENDPROC(__put_user_1)
 
 ENTRY(__put_user_2)
@@ -60,14 +60,14 @@ ENTRY(__put_user_2)
 #endif
 #endif	/* CONFIG_THUMB2_KERNEL */
 	mov	r0, #0
-	mov	pc, lr
+	ret	lr
 ENDPROC(__put_user_2)
 
 ENTRY(__put_user_4)
 	check_uaccess r0, 4, r1, ip, __put_user_bad
4: TUSER(str)	r2, [r0]
 	mov	r0, #0
-	mov	pc, lr
+	ret	lr
 ENDPROC(__put_user_4)
 
 ENTRY(__put_user_8)
@@ -80,12 +80,12 @@ ENTRY(__put_user_8)
6: TUSER(str)	r3, [r0]
 #endif
 	mov	r0, #0
-	mov	pc, lr
+	ret	lr
 ENDPROC(__put_user_8)
 
 __put_user_bad:
 	mov	r0, #-EFAULT
-	mov	pc, lr
+	ret	lr
 ENDPROC(__put_user_bad)
 
 .pushsection __ex_table, "a"

+ 1 - 1
arch/arm/lib/strchr.S

@@ -23,5 +23,5 @@ ENTRY(strchr)
 		teq	r2, r1
 		movne	r0, #0
 		subeq	r0, r0, #1
-		mov	pc, lr
+		ret	lr
 ENDPROC(strchr)

+ 1 - 1
arch/arm/lib/strrchr.S

@@ -22,5 +22,5 @@ ENTRY(strrchr)
 		teq	r2, #0
 		bne	1b
 		mov	r0, r3
-		mov	pc, lr
+		ret	lr
 ENDPROC(strrchr)

+ 3 - 2
arch/arm/lib/ucmpdi2.S

@@ -11,6 +11,7 @@
  */
 
 #include <linux/linkage.h>
+#include <asm/assembler.h>
 
 #ifdef __ARMEB__
 #define xh r0
@@ -31,7 +32,7 @@ ENTRY(__ucmpdi2)
 	movlo	r0, #0
 	moveq	r0, #1
 	movhi	r0, #2
-	mov	pc, lr
+	ret	lr
 
 ENDPROC(__ucmpdi2)
 
@@ -44,7 +45,7 @@ ENTRY(__aeabi_ulcmp)
 	movlo	r0, #-1
 	moveq	r0, #0
 	movhi	r0, #1
-	mov	pc, lr
+	ret	lr
 
 ENDPROC(__aeabi_ulcmp)
 

+ 1 - 1
arch/arm/mach-davinci/sleep.S

@@ -213,7 +213,7 @@ ddr2clk_stop_done:
 	cmp	ip, r0
 	bne	ddr2clk_stop_done
 
-	mov	pc, lr
+	ret	lr
 ENDPROC(davinci_ddr_psc_config)
 
 CACHE_FLUSH:

+ 0 - 5
arch/arm/mach-ebsa110/include/mach/memory.h

@@ -16,11 +16,6 @@
 #ifndef __ASM_ARCH_MEMORY_H
 #define __ASM_ARCH_MEMORY_H
 
-/*
- * Physical DRAM offset.
- */
-#define PLAT_PHYS_OFFSET	UL(0x00000000)
-
 /*
  * Cache flushing area - SRAM
  */

+ 3 - 3
arch/arm/mach-ep93xx/crunch-bits.S

@@ -198,7 +198,7 @@ crunch_load:
 	get_thread_info r10
 	get_thread_info r10
 #endif
 2:	dec_preempt_count r10, r3
-	mov	pc, lr
+	ret	lr
 
 /*
  * Back up crunch regs to save area and disable access to them
 	mov	r3, lr				@ preserve return address
 	mov	r3, lr				@ preserve return address
 	bl	crunch_save
 	msr	cpsr_c, ip			@ restore interrupt mode
-	mov	pc, r3
+	ret	r3
 
 /*
  * Restore crunch state from given memory address
 	mov	r3, lr				@ preserve return address
 	mov	r3, lr				@ preserve return address
 	bl	crunch_load
 	msr	cpsr_c, ip			@ restore interrupt mode
+	ret	r3

+ 0 - 22
arch/arm/mach-ep93xx/include/mach/memory.h

@@ -1,22 +0,0 @@
-/*
- * arch/arm/mach-ep93xx/include/mach/memory.h
- */
-
-#ifndef __ASM_ARCH_MEMORY_H
-#define __ASM_ARCH_MEMORY_H
-
-#if defined(CONFIG_EP93XX_SDCE3_SYNC_PHYS_OFFSET)
-#define PLAT_PHYS_OFFSET		UL(0x00000000)
-#elif defined(CONFIG_EP93XX_SDCE0_PHYS_OFFSET)
-#define PLAT_PHYS_OFFSET		UL(0xc0000000)
-#elif defined(CONFIG_EP93XX_SDCE1_PHYS_OFFSET)
-#define PLAT_PHYS_OFFSET		UL(0xd0000000)
-#elif defined(CONFIG_EP93XX_SDCE2_PHYS_OFFSET)
-#define PLAT_PHYS_OFFSET		UL(0xe0000000)
-#elif defined(CONFIG_EP93XX_SDCE3_ASYNC_PHYS_OFFSET)
-#define PLAT_PHYS_OFFSET		UL(0xf0000000)
-#else
-#error "Kconfig bug: No EP93xx PHYS_OFFSET set"
-#endif
-
-#endif

+ 1 - 0
arch/arm/mach-exynos/Kconfig

@@ -119,6 +119,7 @@ config EXYNOS5420_MCPM
 	bool "Exynos5420 Multi-Cluster PM support"
 	depends on MCPM && SOC_EXYNOS5420
 	select ARM_CCI
+	select ARM_CPU_SUSPEND
 	help
 	  This is needed to provide CPU and cluster power management
 	  on Exynos5420 implementing big.LITTLE.

+ 16 - 1
arch/arm/mach-exynos/mcpm-exynos.c

@@ -196,7 +196,7 @@ static void exynos_power_down(void)
 	if (last_man && __mcpm_outbound_enter_critical(cpu, cluster)) {
 		arch_spin_unlock(&exynos_mcpm_lock);
 
-		if (read_cpuid_part_number() == ARM_CPU_PART_CORTEX_A15) {
+		if (read_cpuid_part() == ARM_CPU_PART_CORTEX_A15) {
 			/*
 			 * On the Cortex-A15 we need to disable
 			 * L2 prefetching before flushing the cache.
@@ -289,6 +289,19 @@ static void __naked exynos_pm_power_up_setup(unsigned int affinity_level)
 	"b	cci_enable_port_for_self");
 }
 
+static void __init exynos_cache_off(void)
+{
+	if (read_cpuid_part() == ARM_CPU_PART_CORTEX_A15) {
+		/* disable L2 prefetching on the Cortex-A15 */
+		asm volatile(
+		"mcr	p15, 1, %0, c15, c0, 3\n\t"
+		"isb\n\t"
+		"dsb"
+		: : "r" (0x400));
+	}
+	exynos_v7_exit_coherency_flush(all);
+}
+
 static const struct of_device_id exynos_dt_mcpm_match[] = {
 	{ .compatible = "samsung,exynos5420" },
 	{ .compatible = "samsung,exynos5800" },
@@ -332,6 +345,8 @@ static int __init exynos_mcpm_init(void)
 	ret = mcpm_platform_register(&exynos_power_ops);
 	if (!ret)
 		ret = mcpm_sync_init(exynos_pm_power_up_setup);
+	if (!ret)
+		ret = mcpm_loopback(exynos_cache_off); /* turn on the CCI */
 	if (ret) {
 		iounmap(ns_sram_base_addr);
 		return ret;

+ 2 - 2
arch/arm/mach-exynos/platsmp.c

@@ -190,7 +190,7 @@ static void __init exynos_smp_init_cpus(void)
 	void __iomem *scu_base = scu_base_addr();
 	unsigned int i, ncores;
 
-	if (read_cpuid_part_number() == ARM_CPU_PART_CORTEX_A9)
+	if (read_cpuid_part() == ARM_CPU_PART_CORTEX_A9)
 		ncores = scu_base ? scu_get_core_count(scu_base) : 1;
 	else
 		/*
@@ -216,7 +216,7 @@ static void __init exynos_smp_prepare_cpus(unsigned int max_cpus)
 
 	exynos_sysram_init();
 
-	if (read_cpuid_part_number() == ARM_CPU_PART_CORTEX_A9)
+	if (read_cpuid_part() == ARM_CPU_PART_CORTEX_A9)
 		scu_enable(scu_base_addr());
 
 	/*
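
Every read_cpuid_part_number() → read_cpuid_part() conversion in this diff is the same mechanical change: the reworked helper masks the MIDR so the implementer field is compared along with the part number. A minimal sketch, assuming the <asm/cputype.h> definitions from this series:

#define ARM_CPU_PART_MASK	0xff00fff0	/* MIDR implementer + part number */
#define ARM_CPU_PART_CORTEX_A9	0x4100c090	/* implementer 0x41 = ARM Ltd. */

/* One mask-and-compare now checks vendor and part together, so a non-ARM
 * core that reuses part number 0xc09 can no longer match by accident.
 */
static inline unsigned int read_cpuid_part(void)
{
	return read_cpuid_id() & ARM_CPU_PART_MASK;
}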

+ 5 - 6
arch/arm/mach-exynos/pm.c

@@ -300,7 +300,7 @@ static int exynos_pm_suspend(void)
 	tmp = (S5P_USE_STANDBY_WFI0 | S5P_USE_STANDBY_WFE0);
 	__raw_writel(tmp, S5P_CENTRAL_SEQ_OPTION);
 
-	if (read_cpuid_part_number() == ARM_CPU_PART_CORTEX_A9)
+	if (read_cpuid_part() == ARM_CPU_PART_CORTEX_A9)
 		exynos_cpu_save_register();
 
 	return 0;
@@ -334,7 +334,7 @@ static void exynos_pm_resume(void)
 	if (exynos_pm_central_resume())
 		goto early_wakeup;
 
-	if (read_cpuid_part_number() == ARM_CPU_PART_CORTEX_A9)
+	if (read_cpuid_part() == ARM_CPU_PART_CORTEX_A9)
 		exynos_cpu_restore_register();
 
 	/* For release retention */
@@ -353,7 +353,7 @@ static void exynos_pm_resume(void)
 
 	s3c_pm_do_restore_core(exynos_core_save, ARRAY_SIZE(exynos_core_save));
 
-	if (read_cpuid_part_number() == ARM_CPU_PART_CORTEX_A9)
+	if (read_cpuid_part() == ARM_CPU_PART_CORTEX_A9)
 		scu_enable(S5P_VA_SCU);
 
 early_wakeup:
@@ -440,15 +440,14 @@ static int exynos_cpu_pm_notifier(struct notifier_block *self,
 	case CPU_PM_ENTER:
 		if (cpu == 0) {
 			exynos_pm_central_suspend();
-			if (read_cpuid_part_number() == ARM_CPU_PART_CORTEX_A9)
+			if (read_cpuid_part() == ARM_CPU_PART_CORTEX_A9)
 				exynos_cpu_save_register();
 		}
 		break;
 
 	case CPU_PM_EXIT:
 		if (cpu == 0) {
-			if (read_cpuid_part_number() ==
-					ARM_CPU_PART_CORTEX_A9) {
+			if (read_cpuid_part() == ARM_CPU_PART_CORTEX_A9) {
 				scu_enable(S5P_VA_SCU);
 				exynos_cpu_restore_register();
 			}

+ 0 - 5
arch/arm/mach-footbridge/include/mach/memory.h

@@ -59,11 +59,6 @@ extern unsigned long __bus_to_pfn(unsigned long);
  */
 #define FLUSH_BASE		0xf9000000
 
-/*
- * Physical DRAM offset.
- */
-#define PLAT_PHYS_OFFSET		UL(0x00000000)
-
 #define FLUSH_BASE_PHYS		0x50000000
 
 #endif

+ 3 - 2
arch/arm/mach-imx/suspend-imx6.S

@@ -10,6 +10,7 @@
  */
 
 #include <linux/linkage.h>
+#include <asm/assembler.h>
 #include <asm/asm-offsets.h>
 #include <asm/hardware/cache-l2x0.h>
 #include "hardware.h"
@@ -301,7 +302,7 @@ rbc_loop:
 	resume_mmdc
 
 	/* return to suspend finish */
-	mov	pc, lr
+	ret	lr
 
 resume:
 	/* invalidate L1 I-cache first */
@@ -325,7 +326,7 @@ resume:
 	mov	r5, #0x1
 	resume_mmdc
 
-	mov	pc, lr
+	ret	lr
 ENDPROC(imx6_suspend)
 
 /*
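
The mov pc, lr → ret lr conversions here (and ret r3 in crunch-bits.S above) use the new ret macro from <asm/assembler.h>, hence the added include: it emits "bx reg" on ARMv6+ and keeps "mov pc, reg" on older cores, since bx is the interworking, predictor-friendly way to return. Compilers already make the same choice for C code; a trivial illustration:

/* Illustration only: built for ARMv6+ this function returns with "bx lr";
 * on a core without bx it falls back to "mov pc, lr", which is exactly the
 * ISA-dependent choice the ret assembler macro now makes for hand-written
 * assembly.
 */
int identity(int x)
{
	return x;
}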

Some files were not shown because too many files changed in this diff