
Merge tag 'openrisc-for-linus' of git://github.com/openrisc/linux

Pull OpenRISC updates from Stafford Horne:
 "Highlights include:

   - optimized memset and memcpy routines, ~20% boot time saving

   - support for cpu idling

   - adding support for l.swa and l.lwa atomic operations (in spec from
     2014)

   - use atomics to implement: bitops, cmpxchg, futex

   - the atomics are in preparation for SMP support"
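
The l.lwa/l.swa pair mentioned above is a load-linked/store-conditional primitive: l.lwa loads a word and takes a reservation, and l.swa completes its store (setting the flag that l.bnf tests) only while that reservation is intact. As a rough single-threaded C model of the retry loop used throughout the patches below; the model_lwa()/model_swa() helpers are hypothetical stand-ins for the instructions, not real intrinsics:

#include <stdio.h>

static int *reservation;	/* models the CPU's reservation register */

static int model_lwa(int *p)
{
	reservation = p;	/* load word atomic: take the reservation */
	return *p;
}

static int model_swa(int *p, int v)
{
	if (reservation != p)	/* reservation lost: store fails */
		return 0;
	*p = v;
	reservation = NULL;
	return 1;		/* flag set: the l.bnf retry is skipped */
}

static int atomic_add_return_model(int i, int *counter)
{
	int tmp;

	do {
		tmp = model_lwa(counter) + i;
	} while (!model_swa(counter, tmp));	/* "l.bnf 1b" */

	return tmp;
}

int main(void)
{
	int v = 40;

	printf("%d\n", atomic_add_return_model(2, &v));	/* prints 42 */
	return 0;
}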

* tag 'openrisc-for-linus' of git://github.com/openrisc/linux: (25 commits)
  openrisc: head: Init r0 to 0 on start
  openrisc: Export ioremap symbols used by modules
  arch/openrisc/lib/memcpy.c: use correct OR1200 option
  openrisc: head: Remove unused strings
  openrisc: head: Move init strings to rodata section
  openrisc: entry: Fix delay slot detection
  openrisc: entry: Whitespace and comment cleanups
  scripts/checkstack.pl: Add openrisc support
  MAINTAINERS: Add the openrisc official repository
  openrisc: Add .gitignore
  openrisc: Add optimized memcpy routine
  openrisc: Add optimized memset
  openrisc: Initial support for the idle state
  openrisc: Fix the bitmask for the unit present register
  openrisc: remove unnecessary stddef.h include
  openrisc: add futex_atomic_* implementations
  openrisc: add optimized atomic operations
  openrisc: add cmpxchg and xchg implementations
  openrisc: add atomic bitops
  openrisc: add l.lwa/l.swa emulation
  ...
Linus Torvalds 8 years ago
parent
commit
9e31489029

+ 1 - 0
MAINTAINERS

@@ -9315,6 +9315,7 @@ OPENRISC ARCHITECTURE
 M:	Jonas Bonn <jonas@southpole.se>
 M:	Stefan Kristiansson <stefan.kristiansson@saunalahti.fi>
 M:	Stafford Horne <shorne@gmail.com>
+T:	git git://github.com/openrisc/linux.git
 L:	openrisc@lists.librecores.org
 W:	http://openrisc.io
 S:	Maintained

+ 1 - 0
arch/openrisc/Kconfig

@@ -12,6 +12,7 @@ config OPENRISC
 	select HAVE_MEMBLOCK
 	select GPIOLIB
         select HAVE_ARCH_TRACEHOOK
+	select SPARSE_IRQ
 	select GENERIC_IRQ_CHIP
 	select GENERIC_IRQ_PROBE
 	select GENERIC_IRQ_SHOW

+ 0 - 1
arch/openrisc/TODO.openrisc

@@ -10,4 +10,3 @@ that are due for investigation shortly, i.e. our TODO list:
    or1k and this change is slowly trickling through the stack.  For the time
    being, or32 is equivalent to or1k.
 
--- Implement optimized version of memcpy and memset

+ 1 - 4
arch/openrisc/include/asm/Kbuild

@@ -1,7 +1,6 @@
 
 header-y += ucontext.h
 
-generic-y += atomic.h
 generic-y += auxvec.h
 generic-y += barrier.h
 generic-y += bitsperlong.h
@@ -10,8 +9,6 @@ generic-y += bugs.h
 generic-y += cacheflush.h
 generic-y += checksum.h
 generic-y += clkdev.h
-generic-y += cmpxchg-local.h
-generic-y += cmpxchg.h
 generic-y += current.h
 generic-y += device.h
 generic-y += div64.h
@@ -22,12 +19,12 @@ generic-y += exec.h
 generic-y += fb.h
 generic-y += fcntl.h
 generic-y += ftrace.h
-generic-y += futex.h
 generic-y += hardirq.h
 generic-y += hw_irq.h
 generic-y += ioctl.h
 generic-y += ioctls.h
 generic-y += ipcbuf.h
+generic-y += irq.h
 generic-y += irq_regs.h
 generic-y += irq_work.h
 generic-y += kdebug.h

+ 126 - 0
arch/openrisc/include/asm/atomic.h

@@ -0,0 +1,126 @@
+/*
+ * Copyright (C) 2014 Stefan Kristiansson <stefan.kristiansson@saunalahti.fi>
+ *
+ * This file is licensed under the terms of the GNU General Public License
+ * version 2.  This program is licensed "as is" without any warranty of any
+ * kind, whether express or implied.
+ */
+
+#ifndef __ASM_OPENRISC_ATOMIC_H
+#define __ASM_OPENRISC_ATOMIC_H
+
+#include <linux/types.h>
+
+/* Atomically perform op with v->counter and i */
+#define ATOMIC_OP(op)							\
+static inline void atomic_##op(int i, atomic_t *v)			\
+{									\
+	int tmp;							\
+									\
+	__asm__ __volatile__(						\
+		"1:	l.lwa	%0,0(%1)	\n"			\
+		"	l." #op " %0,%0,%2	\n"			\
+		"	l.swa	0(%1),%0	\n"			\
+		"	l.bnf	1b		\n"			\
+		"	 l.nop			\n"			\
+		: "=&r"(tmp)						\
+		: "r"(&v->counter), "r"(i)				\
+		: "cc", "memory");					\
+}
+
+/* Atomically perform op with v->counter and i, return the result */
+#define ATOMIC_OP_RETURN(op)						\
+static inline int atomic_##op##_return(int i, atomic_t *v)		\
+{									\
+	int tmp;							\
+									\
+	__asm__ __volatile__(						\
+		"1:	l.lwa	%0,0(%1)	\n"			\
+		"	l." #op " %0,%0,%2	\n"			\
+		"	l.swa	0(%1),%0	\n"			\
+		"	l.bnf	1b		\n"			\
+		"	 l.nop			\n"			\
+		: "=&r"(tmp)						\
+		: "r"(&v->counter), "r"(i)				\
+		: "cc", "memory");					\
+									\
+	return tmp;							\
+}
+
+/* Atomically perform op with v->counter and i, return orig v->counter */
+#define ATOMIC_FETCH_OP(op)						\
+static inline int atomic_fetch_##op(int i, atomic_t *v)			\
+{									\
+	int tmp, old;							\
+									\
+	__asm__ __volatile__(						\
+		"1:	l.lwa	%0,0(%2)	\n"			\
+		"	l." #op " %1,%0,%3	\n"			\
+		"	l.swa	0(%2),%1	\n"			\
+		"	l.bnf	1b		\n"			\
+		"	 l.nop			\n"			\
+		: "=&r"(old), "=&r"(tmp)				\
+		: "r"(&v->counter), "r"(i)				\
+		: "cc", "memory");					\
+									\
+	return old;							\
+}
+
+ATOMIC_OP_RETURN(add)
+ATOMIC_OP_RETURN(sub)
+
+ATOMIC_FETCH_OP(add)
+ATOMIC_FETCH_OP(sub)
+ATOMIC_FETCH_OP(and)
+ATOMIC_FETCH_OP(or)
+ATOMIC_FETCH_OP(xor)
+
+ATOMIC_OP(and)
+ATOMIC_OP(or)
+ATOMIC_OP(xor)
+
+#undef ATOMIC_FETCH_OP
+#undef ATOMIC_OP_RETURN
+#undef ATOMIC_OP
+
+#define atomic_add_return	atomic_add_return
+#define atomic_sub_return	atomic_sub_return
+#define atomic_fetch_add	atomic_fetch_add
+#define atomic_fetch_sub	atomic_fetch_sub
+#define atomic_fetch_and	atomic_fetch_and
+#define atomic_fetch_or		atomic_fetch_or
+#define atomic_fetch_xor	atomic_fetch_xor
+#define atomic_and	atomic_and
+#define atomic_or	atomic_or
+#define atomic_xor	atomic_xor
+
+/*
+ * Atomically add a to v->counter as long as v is not already u.
+ * Returns the original value at v->counter.
+ *
+ * This is often used through atomic_inc_not_zero()
+ */
+static inline int __atomic_add_unless(atomic_t *v, int a, int u)
+{
+	int old, tmp;
+
+	__asm__ __volatile__(
+		"1:	l.lwa %0, 0(%2)		\n"
+		"	l.sfeq %0, %4		\n"
+		"	l.bf 2f			\n"
+		"	 l.add %1, %0, %3	\n"
+		"	l.swa 0(%2), %1		\n"
+		"	l.bnf 1b		\n"
+		"	 l.nop			\n"
+		"2:				\n"
+		: "=&r"(old), "=&r" (tmp)
+		: "r"(&v->counter), "r"(a), "r"(u)
+		: "cc", "memory");
+
+	return old;
+}
+#define __atomic_add_unless	__atomic_add_unless
+
+#include <asm-generic/atomic.h>
+
+#endif /* __ASM_OPENRISC_ATOMIC_H */
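
__atomic_add_unless() returns the counter's old value, which is how the generic headers build atomic_inc_not_zero(): it expands to __atomic_add_unless(v, 1, 0) != 0. A sketch of the refcount pattern this serves; the struct and function names here are hypothetical:

#include <linux/atomic.h>
#include <linux/slab.h>

struct obj {
	atomic_t refcnt;
};

/* Take a reference only if the object is not already being torn
 * down (refcnt == 0); returns NULL when that race is lost. */
static struct obj *obj_get(struct obj *o)
{
	if (!atomic_inc_not_zero(&o->refcnt))
		return NULL;
	return o;
}

static void obj_put(struct obj *o)
{
	if (atomic_dec_and_test(&o->refcnt))
		kfree(o);
}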

+ 1 - 1
arch/openrisc/include/asm/bitops.h

@@ -45,7 +45,7 @@
 #include <asm-generic/bitops/hweight.h>
 #include <asm-generic/bitops/lock.h>
 
-#include <asm-generic/bitops/atomic.h>
+#include <asm/bitops/atomic.h>
 #include <asm-generic/bitops/non-atomic.h>
 #include <asm-generic/bitops/le.h>
 #include <asm-generic/bitops/ext2-atomic.h>

+ 123 - 0
arch/openrisc/include/asm/bitops/atomic.h

@@ -0,0 +1,123 @@
+/*
+ * Copyright (C) 2014 Stefan Kristiansson <stefan.kristiansson@saunalahti.fi>
+ *
+ * This file is licensed under the terms of the GNU General Public License
+ * version 2.  This program is licensed "as is" without any warranty of any
+ * kind, whether express or implied.
+ */
+
+#ifndef __ASM_OPENRISC_BITOPS_ATOMIC_H
+#define __ASM_OPENRISC_BITOPS_ATOMIC_H
+
+static inline void set_bit(int nr, volatile unsigned long *addr)
+{
+	unsigned long mask = BIT_MASK(nr);
+	unsigned long *p = ((unsigned long *)addr) + BIT_WORD(nr);
+	unsigned long tmp;
+
+	__asm__ __volatile__(
+		"1:	l.lwa	%0,0(%1)	\n"
+		"	l.or	%0,%0,%2	\n"
+		"	l.swa	0(%1),%0	\n"
+		"	l.bnf	1b		\n"
+		"	 l.nop			\n"
+		: "=&r"(tmp)
+		: "r"(p), "r"(mask)
+		: "cc", "memory");
+}
+
+static inline void clear_bit(int nr, volatile unsigned long *addr)
+{
+	unsigned long mask = BIT_MASK(nr);
+	unsigned long *p = ((unsigned long *)addr) + BIT_WORD(nr);
+	unsigned long tmp;
+
+	__asm__ __volatile__(
+		"1:	l.lwa	%0,0(%1)	\n"
+		"	l.and	%0,%0,%2	\n"
+		"	l.swa	0(%1),%0	\n"
+		"	l.bnf	1b		\n"
+		"	 l.nop			\n"
+		: "=&r"(tmp)
+		: "r"(p), "r"(~mask)
+		: "cc", "memory");
+}
+
+static inline void change_bit(int nr, volatile unsigned long *addr)
+{
+	unsigned long mask = BIT_MASK(nr);
+	unsigned long *p = ((unsigned long *)addr) + BIT_WORD(nr);
+	unsigned long tmp;
+
+	__asm__ __volatile__(
+		"1:	l.lwa	%0,0(%1)	\n"
+		"	l.xor	%0,%0,%2	\n"
+		"	l.swa	0(%1),%0	\n"
+		"	l.bnf	1b		\n"
+		"	 l.nop			\n"
+		: "=&r"(tmp)
+		: "r"(p), "r"(mask)
+		: "cc", "memory");
+}
+
+static inline int test_and_set_bit(int nr, volatile unsigned long *addr)
+{
+	unsigned long mask = BIT_MASK(nr);
+	unsigned long *p = ((unsigned long *)addr) + BIT_WORD(nr);
+	unsigned long old;
+	unsigned long tmp;
+
+	__asm__ __volatile__(
+		"1:	l.lwa	%0,0(%2)	\n"
+		"	l.or	%1,%0,%3	\n"
+		"	l.swa	0(%2),%1	\n"
+		"	l.bnf	1b		\n"
+		"	 l.nop			\n"
+		: "=&r"(old), "=&r"(tmp)
+		: "r"(p), "r"(mask)
+		: "cc", "memory");
+
+	return (old & mask) != 0;
+}
+
+static inline int test_and_clear_bit(int nr, volatile unsigned long *addr)
+{
+	unsigned long mask = BIT_MASK(nr);
+	unsigned long *p = ((unsigned long *)addr) + BIT_WORD(nr);
+	unsigned long old;
+	unsigned long tmp;
+
+	__asm__ __volatile__(
+		"1:	l.lwa	%0,0(%2)	\n"
+		"	l.and	%1,%0,%3	\n"
+		"	l.swa	0(%2),%1	\n"
+		"	l.bnf	1b		\n"
+		"	 l.nop			\n"
+		: "=&r"(old), "=&r"(tmp)
+		: "r"(p), "r"(~mask)
+		: "cc", "memory");
+
+	return (old & mask) != 0;
+}
+
+static inline int test_and_change_bit(int nr, volatile unsigned long *addr)
+{
+	unsigned long mask = BIT_MASK(nr);
+	unsigned long *p = ((unsigned long *)addr) + BIT_WORD(nr);
+	unsigned long old;
+	unsigned long tmp;
+
+	__asm__ __volatile__(
+		"1:	l.lwa	%0,0(%2)	\n"
+		"	l.xor	%1,%0,%3	\n"
+		"	l.swa	0(%2),%1	\n"
+		"	l.bnf	1b		\n"
+		"	 l.nop			\n"
+		: "=&r"(old), "=&r"(tmp)
+		: "r"(p), "r"(mask)
+		: "cc", "memory");
+
+	return (old & mask) != 0;
+}
+
+#endif /* __ASM_OPENRISC_BITOPS_ATOMIC_H */
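
These have the standard kernel bitop semantics (the test_and_* variants return the previous bit value), now built on l.lwa/l.swa rather than the asm-generic fallback. A small usage sketch with a hypothetical driver flag:

#include <linux/bitops.h>

#define DEV_FLAG_BUSY	0	/* hypothetical flag bit */

static unsigned long dev_flags;

/* test_and_set_bit() atomically sets the bit and reports its old
 * value, so exactly one caller can win the claim. */
static int dev_try_claim(void)
{
	return !test_and_set_bit(DEV_FLAG_BUSY, &dev_flags);
}

static void dev_release(void)
{
	clear_bit(DEV_FLAG_BUSY, &dev_flags);
}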

+ 83 - 0
arch/openrisc/include/asm/cmpxchg.h

@@ -0,0 +1,83 @@
+/*
+ * Copyright (C) 2014 Stefan Kristiansson <stefan.kristiansson@saunalahti.fi>
+ *
+ * This file is licensed under the terms of the GNU General Public License
+ * version 2.  This program is licensed "as is" without any warranty of any
+ * kind, whether express or implied.
+ */
+
+#ifndef __ASM_OPENRISC_CMPXCHG_H
+#define __ASM_OPENRISC_CMPXCHG_H
+
+#include  <linux/types.h>
+
+/*
+ * This function doesn't exist, so you'll get a linker error
+ * if something tries to do an invalid cmpxchg().
+ */
+extern void __cmpxchg_called_with_bad_pointer(void);
+
+#define __HAVE_ARCH_CMPXCHG 1
+
+static inline unsigned long
+__cmpxchg(volatile void *ptr, unsigned long old, unsigned long new, int size)
+{
+	if (size != 4) {
+		__cmpxchg_called_with_bad_pointer();
+		return old;
+	}
+
+	__asm__ __volatile__(
+		"1:	l.lwa %0, 0(%1)		\n"
+		"	l.sfeq %0, %2		\n"
+		"	l.bnf 2f		\n"
+		"	 l.nop			\n"
+		"	l.swa 0(%1), %3		\n"
+		"	l.bnf 1b		\n"
+		"	 l.nop			\n"
+		"2:				\n"
+		: "=&r"(old)
+		: "r"(ptr), "r"(old), "r"(new)
+		: "cc", "memory");
+
+	return old;
+}
+
+#define cmpxchg(ptr, o, n)						\
+	({								\
+		(__typeof__(*(ptr))) __cmpxchg((ptr),			\
+					       (unsigned long)(o),	\
+					       (unsigned long)(n),	\
+					       sizeof(*(ptr)));		\
+	})
+
+/*
+ * This function doesn't exist, so you'll get a linker error if
+ * something tries to do an invalidly-sized xchg().
+ */
+extern void __xchg_called_with_bad_pointer(void);
+
+static inline unsigned long __xchg(unsigned long val, volatile void *ptr,
+				   int size)
+{
+	if (size != 4) {
+		__xchg_called_with_bad_pointer();
+		return val;
+	}
+
+	__asm__ __volatile__(
+		"1:	l.lwa %0, 0(%1)		\n"
+		"	l.swa 0(%1), %2		\n"
+		"	l.bnf 1b		\n"
+		"	 l.nop			\n"
+		: "=&r"(val)
+		: "r"(ptr), "r"(val)
+		: "cc", "memory");
+
+	return val;
+}
+
+#define xchg(ptr, with) \
+	((typeof(*(ptr)))__xchg((unsigned long)(with), (ptr), sizeof(*(ptr))))
+
+#endif /* __ASM_OPENRISC_CMPXCHG_H */
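
Only 4-byte operands are supported; any other size trips the deliberate link-time error above. A sketch of the classic cmpxchg() retry idiom (store_max() is a hypothetical helper): because cmpxchg() returns the value it actually found, a failed attempt tells the caller what to compare against next.

#include <linux/atomic.h>
#include <linux/compiler.h>

/* Lock-free "store the maximum": loop until we install val or see
 * that an equal-or-larger value is already present. */
static void store_max(int *slot, int val)
{
	int old = READ_ONCE(*slot);

	while (val > old) {
		int seen = cmpxchg(slot, old, val);

		if (seen == old)
			break;		/* our store won */
		old = seen;		/* lost the race; re-evaluate */
	}
}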

+ 2 - 0
arch/openrisc/include/asm/cpuinfo.h

@@ -24,9 +24,11 @@ struct cpuinfo {
 
 	u32 icache_size;
 	u32 icache_block_size;
+	u32 icache_ways;
 
 	u32 dcache_size;
 	u32 dcache_block_size;
+	u32 dcache_ways;
 };
 
 extern struct cpuinfo cpuinfo;

+ 135 - 0
arch/openrisc/include/asm/futex.h

@@ -0,0 +1,135 @@
+#ifndef __ASM_OPENRISC_FUTEX_H
+#define __ASM_OPENRISC_FUTEX_H
+
+#ifdef __KERNEL__
+
+#include <linux/futex.h>
+#include <linux/uaccess.h>
+#include <asm/errno.h>
+
+#define __futex_atomic_op(insn, ret, oldval, uaddr, oparg) \
+({								\
+	__asm__ __volatile__ (					\
+		"1:	l.lwa	%0, %2			\n"	\
+			insn				"\n"	\
+		"2:	l.swa	%2, %1			\n"	\
+		"	l.bnf	1b			\n"	\
+		"	 l.ori	%1, r0, 0		\n"	\
+		"3:					\n"	\
+		".section .fixup,\"ax\"			\n"	\
+		"4:	l.j	3b			\n"	\
+		"	 l.addi	%1, r0, %3		\n"	\
+		".previous				\n"	\
+		".section __ex_table,\"a\"		\n"	\
+		".word	1b,4b,2b,4b			\n"	\
+		".previous				\n"	\
+		: "=&r" (oldval), "=&r" (ret), "+m" (*uaddr)	\
+		: "i" (-EFAULT), "r" (oparg)			\
+		: "cc", "memory"				\
+		);						\
+})
+
+static inline int
+futex_atomic_op_inuser(int encoded_op, u32 __user *uaddr)
+{
+	int op = (encoded_op >> 28) & 7;
+	int cmp = (encoded_op >> 24) & 15;
+	int oparg = (encoded_op << 8) >> 20;
+	int cmparg = (encoded_op << 20) >> 20;
+	int oldval = 0, ret;
+
+	if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28))
+		oparg = 1 << oparg;
+
+	if (!access_ok(VERIFY_WRITE, uaddr, sizeof(u32)))
+		return -EFAULT;
+
+	pagefault_disable();
+
+	switch (op) {
+	case FUTEX_OP_SET:
+		__futex_atomic_op("l.or %1,%4,%4", ret, oldval, uaddr, oparg);
+		break;
+	case FUTEX_OP_ADD:
+		__futex_atomic_op("l.add %1,%0,%4", ret, oldval, uaddr, oparg);
+		break;
+	case FUTEX_OP_OR:
+		__futex_atomic_op("l.or %1,%0,%4", ret, oldval, uaddr, oparg);
+		break;
+	case FUTEX_OP_ANDN:
+		__futex_atomic_op("l.and %1,%0,%4", ret, oldval, uaddr, ~oparg);
+		break;
+	case FUTEX_OP_XOR:
+		__futex_atomic_op("l.xor %1,%0,%4", ret, oldval, uaddr, oparg);
+		break;
+	default:
+		ret = -ENOSYS;
+	}
+
+	pagefault_enable();
+
+	if (!ret) {
+		switch (cmp) {
+		case FUTEX_OP_CMP_EQ:
+			ret = (oldval == cmparg);
+			break;
+		case FUTEX_OP_CMP_NE:
+			ret = (oldval != cmparg);
+			break;
+		case FUTEX_OP_CMP_LT:
+			ret = (oldval < cmparg);
+			break;
+		case FUTEX_OP_CMP_GE:
+			ret = (oldval >= cmparg);
+			break;
+		case FUTEX_OP_CMP_LE:
+			ret = (oldval <= cmparg);
+			break;
+		case FUTEX_OP_CMP_GT:
+			ret = (oldval > cmparg);
+			break;
+		default:
+			ret = -ENOSYS;
+		}
+	}
+	return ret;
+}
+
+static inline int
+futex_atomic_cmpxchg_inatomic(u32 *uval, u32 __user *uaddr,
+			      u32 oldval, u32 newval)
+{
+	int ret = 0;
+	u32 prev;
+
+	if (!access_ok(VERIFY_WRITE, uaddr, sizeof(u32)))
+		return -EFAULT;
+
+	__asm__ __volatile__ (				\
+		"1:	l.lwa	%1, %2		\n"	\
+		"	l.sfeq	%1, %3		\n"	\
+		"	l.bnf	3f		\n"	\
+		"	 l.nop			\n"	\
+		"2:	l.swa	%2, %4		\n"	\
+		"	l.bnf	1b		\n"	\
+		"	 l.nop			\n"	\
+		"3:				\n"	\
+		".section .fixup,\"ax\"		\n"	\
+		"4:	l.j	3b		\n"	\
+		"	 l.addi	%0, r0, %5	\n"	\
+		".previous			\n"	\
+		".section __ex_table,\"a\"	\n"	\
+		".word	1b,4b,2b,4b		\n"	\
+		".previous			\n"	\
+		: "+r" (ret), "=&r" (prev), "+m" (*uaddr) \
+		: "r" (oldval), "r" (newval), "i" (-EFAULT) \
+		: "cc",	"memory"			\
+		);
+
+	*uval = prev;
+	return ret;
+}
+
+#endif /* __KERNEL__ */
+
+#endif /* __ASM_OPENRISC_FUTEX_H */
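
The encoded_op unpacking above follows the historic futex ABI: a 4-bit op, a 4-bit comparison, and two 12-bit sign-extended arguments. A worked userspace decode with a hand-packed sample value (the value itself is hypothetical):

#include <stdio.h>

int main(void)
{
	/* op = FUTEX_OP_ADD (1), cmp = FUTEX_OP_CMP_GE (3),
	 * oparg = 4, cmparg = 7 */
	int encoded_op = (1 << 28) | (3 << 24) | (4 << 12) | 7;

	int op     = (encoded_op >> 28) & 7;
	int cmp    = (encoded_op >> 24) & 15;
	int oparg  = (encoded_op << 8) >> 20;	/* 12-bit, sign-extended */
	int cmparg = (encoded_op << 20) >> 20;	/* 12-bit, sign-extended */

	printf("op=%d cmp=%d oparg=%d cmparg=%d\n", op, cmp, oparg, cmparg);
	return 0;	/* prints: op=1 cmp=3 oparg=4 cmparg=7 */
}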

+ 2 - 2
arch/openrisc/include/asm/spr_defs.h

@@ -152,8 +152,8 @@
 #define SPR_UPR_MP	   0x00000020  /* MAC present */
 #define SPR_UPR_DUP	   0x00000040  /* Debug unit present */
 #define SPR_UPR_PCUP	   0x00000080  /* Performance counters unit present */
-#define SPR_UPR_PMP	   0x00000100  /* Power management present */
-#define SPR_UPR_PICP	   0x00000200  /* PIC present */
+#define SPR_UPR_PICP	   0x00000100  /* PIC present */
+#define SPR_UPR_PMP	   0x00000200  /* Power management present */
 #define SPR_UPR_TTP	   0x00000400  /* Tick timer present */
 #define SPR_UPR_RES	   0x00fe0000  /* Reserved */
 #define SPR_UPR_CUP	   0xff000000  /* Context units present */

+ 10 - 0
arch/openrisc/include/asm/string.h

@@ -0,0 +1,10 @@
+#ifndef __ASM_OPENRISC_STRING_H
+#define __ASM_OPENRISC_STRING_H
+
+#define __HAVE_ARCH_MEMSET
+extern void *memset(void *s, int c, __kernel_size_t n);
+
+#define __HAVE_ARCH_MEMCPY
+extern void *memcpy(void *dest, __const void *src, __kernel_size_t n);
+
+#endif /* __ASM_OPENRISC_STRING_H */

+ 1 - 0
arch/openrisc/kernel/.gitignore

@@ -0,0 +1 @@
+vmlinux.lds

+ 38 - 22
arch/openrisc/kernel/entry.S

@@ -173,6 +173,11 @@ handler:							;\
 	l.j	_ret_from_exception				;\
 	 l.nop
 
+/* clobbers 'reg' */
+#define CLEAR_LWA_FLAG(reg)		\
+	l.movhi	reg,hi(lwa_flag)	;\
+	l.ori	reg,reg,lo(lwa_flag)	;\
+	l.sw	0(reg),r0
 /*
  * NOTE: one should never assume that SPR_EPC, SPR_ESR, SPR_EEAR
  *       contain the same values as when exception we're handling
@@ -193,6 +198,7 @@ EXCEPTION_ENTRY(_tng_kernel_start)
 /* ---[ 0x200: BUS exception ]------------------------------------------- */
 
 EXCEPTION_ENTRY(_bus_fault_handler)
+	CLEAR_LWA_FLAG(r3)
 	/* r4: EA of fault (set by EXCEPTION_HANDLE) */
 	l.jal   do_bus_fault
 	 l.addi  r3,r1,0 /* pt_regs */
@@ -202,11 +208,13 @@ EXCEPTION_ENTRY(_bus_fault_handler)
 
 /* ---[ 0x300: Data Page Fault exception ]------------------------------- */
 EXCEPTION_ENTRY(_dtlb_miss_page_fault_handler)
+	CLEAR_LWA_FLAG(r3)
 	l.and	r5,r5,r0
 	l.j	1f
 	 l.nop
 
 EXCEPTION_ENTRY(_data_page_fault_handler)
+	CLEAR_LWA_FLAG(r3)
 	/* set up parameters for do_page_fault */
 	l.ori	r5,r0,0x300		   // exception vector
 1:
@@ -220,7 +228,7 @@ EXCEPTION_ENTRY(_data_page_fault_handler)
 	 * DTLB miss handler in the CONFIG_GUARD_PROTECTED_CORE part
 	 */
 #ifdef CONFIG_OPENRISC_NO_SPR_SR_DSX
-	l.lwz   r6,PT_PC(r3)                  // address of an offending insn
+	l.lwz   r6,PT_PC(r3)               // address of an offending insn
 	l.lwz   r6,0(r6)                   // instruction that caused pf
 
 	l.srli  r6,r6,26                   // check opcode for jump insn
@@ -236,57 +244,57 @@ EXCEPTION_ENTRY(_data_page_fault_handler)
 	l.bf    8f
 	l.sfeqi r6,0x12                    // l.jalr
 	l.bf    8f
-
-	l.nop
+	 l.nop
 
 	l.j     9f
-	l.nop
-8:
+	 l.nop
 
-	l.lwz   r6,PT_PC(r3)                  // address of an offending insn
+8: // offending insn is in delay slot
+	l.lwz   r6,PT_PC(r3)               // address of an offending insn
 	l.addi  r6,r6,4
 	l.lwz   r6,0(r6)                   // instruction that caused pf
 	l.srli  r6,r6,26                   // get opcode
-9:
+9: // offending instruction opcode loaded in r6
 
 #else
 
-	l.mfspr r6,r0,SPR_SR		   // SR
-//	l.lwz	r6,PT_SR(r3)		   // ESR
-	l.andi	r6,r6,SPR_SR_DSX	   // check for delay slot exception
-	l.sfeqi	r6,0x1			   // exception happened in delay slot
-	l.bnf	7f
-	l.lwz	r6,PT_PC(r3)		   // address of an offending insn
+	l.lwz   r6,PT_SR(r3)               // SR
+	l.andi  r6,r6,SPR_SR_DSX           // check for delay slot exception
+	l.sfne  r6,r0                      // exception happened in delay slot
+	l.bnf   7f
+	 l.lwz  r6,PT_PC(r3)               // address of an offending insn
 
-	l.addi	r6,r6,4			   // offending insn is in delay slot
+	l.addi	r6,r6,4                    // offending insn is in delay slot
 7:
 	l.lwz   r6,0(r6)                   // instruction that caused pf
 	l.srli  r6,r6,26                   // check opcode for write access
 #endif
 
-	l.sfgeui r6,0x33		   // check opcode for write access
+	l.sfgeui r6,0x33                   // check opcode for write access
 	l.bnf   1f
 	l.sfleui r6,0x37
 	l.bnf   1f
 	l.ori   r6,r0,0x1                  // write access
 	l.j     2f
-	l.nop
+	 l.nop
 1:	l.ori   r6,r0,0x0                  // !write access
 2:
 
 	/* call fault.c handler in or32/mm/fault.c */
 	l.jal   do_page_fault
-	l.nop
+	 l.nop
 	l.j     _ret_from_exception
-	l.nop
+	 l.nop
 
 /* ---[ 0x400: Insn Page Fault exception ]------------------------------- */
 EXCEPTION_ENTRY(_itlb_miss_page_fault_handler)
+	CLEAR_LWA_FLAG(r3)
 	l.and	r5,r5,r0
 	l.j	1f
 	 l.nop
 
 EXCEPTION_ENTRY(_insn_page_fault_handler)
+	CLEAR_LWA_FLAG(r3)
 	/* set up parameters for do_page_fault */
 	l.ori	r5,r0,0x400		   // exception vector
 1:
@@ -296,14 +304,15 @@ EXCEPTION_ENTRY(_insn_page_fault_handler)
 
 	/* call fault.c handler in or32/mm/fault.c */
 	l.jal   do_page_fault
-	l.nop
+	 l.nop
 	l.j     _ret_from_exception
-	l.nop
+	 l.nop
 
 
 /* ---[ 0x500: Timer exception ]----------------------------------------- */
 
 EXCEPTION_ENTRY(_timer_handler)
+	CLEAR_LWA_FLAG(r3)
 	l.jal	timer_interrupt
 	 l.addi r3,r1,0 /* pt_regs */
 
@@ -313,6 +322,7 @@ EXCEPTION_ENTRY(_timer_handler)
 /* ---[ 0x600: Aligment exception ]-------------------------------------- */
 
 EXCEPTION_ENTRY(_alignment_handler)
+	CLEAR_LWA_FLAG(r3)
 	/* r4: EA of fault (set by EXCEPTION_HANDLE) */
 	l.jal   do_unaligned_access
 	 l.addi  r3,r1,0 /* pt_regs */
@@ -509,6 +519,7 @@ EXCEPTION_ENTRY(_external_irq_handler)
 //	l.sw	PT_SR(r1),r4
 1:
 #endif
+	CLEAR_LWA_FLAG(r3)
 	l.addi	r3,r1,0
 	l.movhi	r8,hi(do_IRQ)
 	l.ori	r8,r8,lo(do_IRQ)
@@ -556,8 +567,12 @@ ENTRY(_sys_call_handler)
 	 * they should be clobbered, otherwise
 	 */
 	l.sw    PT_GPR3(r1),r3
-	/* r4 already saved */
-	/* r4 holds the EEAR address of the fault, load the original r4 */
+	/*
+	 * r4 already saved
+	 * r4 holds the EEAR address of the fault, use it as a scratch reg and
+	 * then load the original r4
+	 */
+	CLEAR_LWA_FLAG(r4)
 	l.lwz	r4,PT_GPR4(r1)
 	l.sw    PT_GPR5(r1),r5
 	l.sw    PT_GPR6(r1),r6
@@ -776,6 +791,7 @@ UNHANDLED_EXCEPTION(_vector_0xd00,0xd00)
 /* ---[ 0xe00: Trap exception ]------------------------------------------ */
 
 EXCEPTION_ENTRY(_trap_handler)
+	CLEAR_LWA_FLAG(r3)
 	/* r4: EA of fault (set by EXCEPTION_HANDLE) */
 	l.jal   do_trap
 	 l.addi  r3,r1,0 /* pt_regs */

+ 76 - 124
arch/openrisc/kernel/head.S

@@ -24,6 +24,7 @@
 #include <asm/page.h>
 #include <asm/mmu.h>
 #include <asm/pgtable.h>
+#include <asm/thread_info.h>
 #include <asm/cache.h>
 #include <asm/spr_defs.h>
 #include <asm/asm-offsets.h>
@@ -34,7 +35,7 @@
 	l.add	rd,rd,rs
 
 #define CLEAR_GPR(gpr)				\
-	l.or    gpr,r0,r0
+	l.movhi	gpr,0x0
 
 #define LOAD_SYMBOL_2_GPR(gpr,symbol)		\
 	l.movhi gpr,hi(symbol)			;\
@@ -442,6 +443,9 @@ _dispatch_do_ipage_fault:
 	__HEAD
 	.global _start
 _start:
+	/* Init r0 to zero as per spec */
+	CLEAR_GPR(r0)
+
 	/* save kernel parameters */
 	l.or	r25,r0,r3	/* pointer to fdt */
 
@@ -486,7 +490,8 @@ _start:
 	/*
 	 * set up initial ksp and current
 	 */
-	LOAD_SYMBOL_2_GPR(r1,init_thread_union+0x2000)	// setup kernel stack
+	/* setup kernel stack */
+	LOAD_SYMBOL_2_GPR(r1,init_thread_union + THREAD_SIZE)
 	LOAD_SYMBOL_2_GPR(r10,init_thread_union)	// setup current
 	tophys	(r31,r10)
 	l.sw	TI_KSP(r31), r1
@@ -520,22 +525,8 @@ enable_dc:
 	 l.nop
 
 flush_tlb:
-	/*
-	 *  I N V A L I D A T E   T L B   e n t r i e s
-	 */
-	LOAD_SYMBOL_2_GPR(r5,SPR_DTLBMR_BASE(0))
-	LOAD_SYMBOL_2_GPR(r6,SPR_ITLBMR_BASE(0))
-	l.addi	r7,r0,128 /* Maximum number of sets */
-1:
-	l.mtspr	r5,r0,0x0
-	l.mtspr	r6,r0,0x0
-
-	l.addi	r5,r5,1
-	l.addi	r6,r6,1
-	l.sfeq	r7,r0
-	l.bnf	1b
-	 l.addi	r7,r7,-1
-
+	l.jal	_flush_tlb
+	 l.nop
 
 /* The MMU needs to be enabled before or32_early_setup is called */
 
@@ -627,6 +618,26 @@ jump_start_kernel:
 	l.jr    r30
 	 l.nop
 
+_flush_tlb:
+	/*
+	 *  I N V A L I D A T E   T L B   e n t r i e s
+	 */
+	LOAD_SYMBOL_2_GPR(r5,SPR_DTLBMR_BASE(0))
+	LOAD_SYMBOL_2_GPR(r6,SPR_ITLBMR_BASE(0))
+	l.addi	r7,r0,128 /* Maximum number of sets */
+1:
+	l.mtspr	r5,r0,0x0
+	l.mtspr	r6,r0,0x0
+
+	l.addi	r5,r5,1
+	l.addi	r6,r6,1
+	l.sfeq	r7,r0
+	l.bnf	1b
+	 l.addi	r7,r7,-1
+
+	l.jr	r9
+	 l.nop
+
 /* ========================================[ cache ]=== */
 
 	/* aligment here so we don't change memory offsets with
@@ -971,8 +982,6 @@ ENTRY(dtlb_miss_handler)
 	EXCEPTION_STORE_GPR2
 	EXCEPTION_STORE_GPR3
 	EXCEPTION_STORE_GPR4
-	EXCEPTION_STORE_GPR5
-	EXCEPTION_STORE_GPR6
 	/*
 	 * get EA of the miss
 	 */
@@ -980,91 +989,70 @@ ENTRY(dtlb_miss_handler)
 	/*
 	 * pmd = (pmd_t *)(current_pgd + pgd_index(daddr));
 	 */
-	GET_CURRENT_PGD(r3,r5)		// r3 is current_pgd, r5 is temp
+	GET_CURRENT_PGD(r3,r4)		// r3 is current_pgd, r4 is temp
 	l.srli	r4,r2,0x18		// >> PAGE_SHIFT + (PAGE_SHIFT - 2)
 	l.slli	r4,r4,0x2		// to get address << 2
-	l.add	r5,r4,r3		// r4 is pgd_index(daddr)
+	l.add	r3,r4,r3		// r4 is pgd_index(daddr)
 	/*
 	 * if (pmd_none(*pmd))
 	 *   goto pmd_none:
 	 */
-	tophys	(r4,r5)
+	tophys	(r4,r3)
 	l.lwz	r3,0x0(r4)		// get *pmd value
 	l.sfne	r3,r0
 	l.bnf	d_pmd_none
-	 l.andi	r3,r3,~PAGE_MASK //0x1fff		// ~PAGE_MASK
-	/*
-	 * if (pmd_bad(*pmd))
-	 *   pmd_clear(pmd)
-	 *   goto pmd_bad:
-	 */
-//	l.sfeq	r3,r0			// check *pmd value
-//	l.bf	d_pmd_good
-	l.addi	r3,r0,0xffffe000	// PAGE_MASK
-//	l.j	d_pmd_bad
-//	l.sw	0x0(r4),r0		// clear pmd
+	 l.addi	r3,r0,0xffffe000	// PAGE_MASK
+
 d_pmd_good:
 	/*
 	 * pte = *pte_offset(pmd, daddr);
 	 */
 	l.lwz	r4,0x0(r4)		// get **pmd value
 	l.and	r4,r4,r3		// & PAGE_MASK
-	l.srli	r5,r2,0xd		// >> PAGE_SHIFT, r2 == EEAR
-	l.andi	r3,r5,0x7ff		// (1UL << PAGE_SHIFT - 2) - 1
+	l.srli	r2,r2,0xd		// >> PAGE_SHIFT, r2 == EEAR
+	l.andi	r3,r2,0x7ff		// (1UL << PAGE_SHIFT - 2) - 1
 	l.slli	r3,r3,0x2		// to get address << 2
 	l.add	r3,r3,r4
-	l.lwz	r2,0x0(r3)		// this is pte at last
+	l.lwz	r3,0x0(r3)		// this is pte at last
 	/*
 	 * if (!pte_present(pte))
 	 */
-	l.andi	r4,r2,0x1
+	l.andi	r4,r3,0x1
 	l.sfne	r4,r0			// is pte present
 	l.bnf	d_pte_not_present
-	l.addi	r3,r0,0xffffe3fa	// PAGE_MASK | DTLB_UP_CONVERT_MASK
+	l.addi	r4,r0,0xffffe3fa	// PAGE_MASK | DTLB_UP_CONVERT_MASK
 	/*
 	 * fill DTLB TR register
 	 */
-	l.and	r4,r2,r3		// apply the mask
+	l.and	r4,r3,r4		// apply the mask
 	// Determine number of DMMU sets
-	l.mfspr r6, r0, SPR_DMMUCFGR
-	l.andi	r6, r6, SPR_DMMUCFGR_NTS
-	l.srli	r6, r6, SPR_DMMUCFGR_NTS_OFF
+	l.mfspr r2, r0, SPR_DMMUCFGR
+	l.andi	r2, r2, SPR_DMMUCFGR_NTS
+	l.srli	r2, r2, SPR_DMMUCFGR_NTS_OFF
 	l.ori	r3, r0, 0x1
-	l.sll	r3, r3, r6 	// r3 = number DMMU sets DMMUCFGR
-	l.addi	r6, r3, -1  	// r6 = nsets mask
-	l.and	r5, r5, r6	// calc offset:	 & (NUM_TLB_ENTRIES-1)
+	l.sll	r3, r3, r2 	// r3 = number DMMU sets DMMUCFGR
+	l.addi	r2, r3, -1  	// r2 = nsets mask
+	l.mfspr	r3, r0, SPR_EEAR_BASE
+	l.srli	r3, r3, 0xd	// >> PAGE_SHIFT
+	l.and	r2, r3, r2	// calc offset:	 & (NUM_TLB_ENTRIES-1)
 	                                                   //NUM_TLB_ENTRIES
-	l.mtspr	r5,r4,SPR_DTLBTR_BASE(0)
+	l.mtspr	r2,r4,SPR_DTLBTR_BASE(0)
 	/*
 	 * fill DTLB MR register
 	 */
-	l.mfspr	r2,r0,SPR_EEAR_BASE
-	l.addi	r3,r0,0xffffe000	// PAGE_MASK
-	l.and	r4,r2,r3		// apply PAGE_MASK to EA (__PHX__ do we really need this?)
-	l.ori	r4,r4,0x1		// set hardware valid bit: DTBL_MR entry
-	l.mtspr	r5,r4,SPR_DTLBMR_BASE(0)
+	l.slli	r3, r3, 0xd		/* << PAGE_SHIFT => EA & PAGE_MASK */
+	l.ori	r4,r3,0x1		// set hardware valid bit: DTLB_MR entry
+	l.mtspr	r2,r4,SPR_DTLBMR_BASE(0)
 
 	EXCEPTION_LOAD_GPR2
 	EXCEPTION_LOAD_GPR3
 	EXCEPTION_LOAD_GPR4
-	EXCEPTION_LOAD_GPR5
-	EXCEPTION_LOAD_GPR6
-	l.rfe
-d_pmd_bad:
-	l.nop	1
-	EXCEPTION_LOAD_GPR2
-	EXCEPTION_LOAD_GPR3
-	EXCEPTION_LOAD_GPR4
-	EXCEPTION_LOAD_GPR5
-	EXCEPTION_LOAD_GPR6
 	l.rfe
 d_pmd_none:
 d_pte_not_present:
 	EXCEPTION_LOAD_GPR2
 	EXCEPTION_LOAD_GPR3
 	EXCEPTION_LOAD_GPR4
-	EXCEPTION_LOAD_GPR5
-	EXCEPTION_LOAD_GPR6
 	EXCEPTION_HANDLE(_dtlb_miss_page_fault_handler)
 
 /* ==============================================[ ITLB miss handler ]=== */
@@ -1072,8 +1060,6 @@ ENTRY(itlb_miss_handler)
 	EXCEPTION_STORE_GPR2
 	EXCEPTION_STORE_GPR3
 	EXCEPTION_STORE_GPR4
-	EXCEPTION_STORE_GPR5
-	EXCEPTION_STORE_GPR6
 	/*
 	 * get EA of the miss
 	 */
@@ -1083,30 +1069,19 @@ ENTRY(itlb_miss_handler)
 	 * pmd = (pmd_t *)(current_pgd + pgd_index(daddr));
 	 *
 	 */
-	GET_CURRENT_PGD(r3,r5)		// r3 is current_pgd, r5 is temp
+	GET_CURRENT_PGD(r3,r4)		// r3 is current_pgd, r4 is temp
 	l.srli	r4,r2,0x18		// >> PAGE_SHIFT + (PAGE_SHIFT - 2)
 	l.slli	r4,r4,0x2		// to get address << 2
-	l.add	r5,r4,r3		// r4 is pgd_index(daddr)
+	l.add	r3,r4,r3		// r4 is pgd_index(daddr)
 	/*
 	 * if (pmd_none(*pmd))
 	 *   goto pmd_none:
 	 */
-	tophys	(r4,r5)
+	tophys	(r4,r3)
 	l.lwz	r3,0x0(r4)		// get *pmd value
 	l.sfne	r3,r0
 	l.bnf	i_pmd_none
-	l.andi	r3,r3,0x1fff		// ~PAGE_MASK
-	/*
-	 * if (pmd_bad(*pmd))
-	 *   pmd_clear(pmd)
-	 *   goto pmd_bad:
-	 */
-
-//	l.sfeq	r3,r0			// check *pmd value
-//	l.bf	i_pmd_good
-	l.addi	r3,r0,0xffffe000	// PAGE_MASK
-//	l.j	i_pmd_bad
-//	l.sw	0x0(r4),r0		// clear pmd
+	 l.addi	r3,r0,0xffffe000	// PAGE_MASK
 
 i_pmd_good:
 	/*
@@ -1115,35 +1090,36 @@ i_pmd_good:
 	 */
 	l.lwz	r4,0x0(r4)		// get **pmd value
 	l.and	r4,r4,r3		// & PAGE_MASK
-	l.srli	r5,r2,0xd		// >> PAGE_SHIFT, r2 == EEAR
-	l.andi	r3,r5,0x7ff		// (1UL << PAGE_SHIFT - 2) - 1
+	l.srli	r2,r2,0xd		// >> PAGE_SHIFT, r2 == EEAR
+	l.andi	r3,r2,0x7ff		// (1UL << PAGE_SHIFT - 2) - 1
 	l.slli	r3,r3,0x2		// to get address << 2
 	l.add	r3,r3,r4
-	l.lwz	r2,0x0(r3)		// this is pte at last
+	l.lwz	r3,0x0(r3)		// this is pte at last
 	/*
 	 * if (!pte_present(pte))
 	 *
 	 */
-	l.andi	r4,r2,0x1
+	l.andi	r4,r3,0x1
 	l.sfne	r4,r0			// is pte present
 	l.bnf	i_pte_not_present
-	l.addi	r3,r0,0xffffe03a	// PAGE_MASK | ITLB_UP_CONVERT_MASK
+	 l.addi	r4,r0,0xffffe03a	// PAGE_MASK | ITLB_UP_CONVERT_MASK
 	/*
 	 * fill ITLB TR register
 	 */
-	l.and	r4,r2,r3		// apply the mask
-	l.andi	r3,r2,0x7c0		// _PAGE_EXEC | _PAGE_SRE | _PAGE_SWE |  _PAGE_URE | _PAGE_UWE
-//	l.andi	r3,r2,0x400		// _PAGE_EXEC
+	l.and	r4,r3,r4		// apply the mask
+	l.andi	r3,r3,0x7c0		// _PAGE_EXEC | _PAGE_SRE | _PAGE_SWE |  _PAGE_URE | _PAGE_UWE
 	l.sfeq	r3,r0
 	l.bf	itlb_tr_fill //_workaround
 	// Determine number of IMMU sets
-	l.mfspr r6, r0, SPR_IMMUCFGR
-	l.andi	r6, r6, SPR_IMMUCFGR_NTS
-	l.srli	r6, r6, SPR_IMMUCFGR_NTS_OFF
+	l.mfspr r2, r0, SPR_IMMUCFGR
+	l.andi	r2, r2, SPR_IMMUCFGR_NTS
+	l.srli	r2, r2, SPR_IMMUCFGR_NTS_OFF
 	l.ori	r3, r0, 0x1
-	l.sll	r3, r3, r6 	// r3 = number IMMU sets IMMUCFGR
-	l.addi	r6, r3, -1  	// r6 = nsets mask
-	l.and	r5, r5, r6	// calc offset:	 & (NUM_TLB_ENTRIES-1)
+	l.sll	r3, r3, r2 	// r3 = number IMMU sets IMMUCFGR
+	l.addi	r2, r3, -1  	// r2 = nsets mask
+	l.mfspr	r3, r0, SPR_EEAR_BASE
+	l.srli	r3, r3, 0xd	// >> PAGE_SHIFT
+	l.and	r2, r3, r2	// calc offset:	 & (NUM_TLB_ENTRIES-1)
 
 /*
  * __PHX__ :: fixme
@@ -1155,38 +1131,24 @@ i_pmd_good:
 itlb_tr_fill_workaround:
 	l.ori	r4,r4,0xc0		// | (SPR_ITLBTR_UXE | ITLBTR_SXE)
 itlb_tr_fill:
-	l.mtspr	r5,r4,SPR_ITLBTR_BASE(0)
+	l.mtspr	r2,r4,SPR_ITLBTR_BASE(0)
 	/*
 	 * fill DTLB MR register
 	 */
-	l.mfspr	r2,r0,SPR_EEAR_BASE
-	l.addi	r3,r0,0xffffe000	// PAGE_MASK
-	l.and	r4,r2,r3		// apply PAGE_MASK to EA (__PHX__ do we really need this?)
-	l.ori	r4,r4,0x1		// set hardware valid bit: DTBL_MR entry
-	l.mtspr	r5,r4,SPR_ITLBMR_BASE(0)
+	l.slli	r3, r3, 0xd		/* << PAGE_SHIFT => EA & PAGE_MASK */
+	l.ori	r4,r3,0x1		// set hardware valid bit: ITLB_MR entry
+	l.mtspr	r2,r4,SPR_ITLBMR_BASE(0)
 
 	EXCEPTION_LOAD_GPR2
 	EXCEPTION_LOAD_GPR3
 	EXCEPTION_LOAD_GPR4
-	EXCEPTION_LOAD_GPR5
-	EXCEPTION_LOAD_GPR6
 	l.rfe
 
-i_pmd_bad:
-	l.nop	1
-	EXCEPTION_LOAD_GPR2
-	EXCEPTION_LOAD_GPR3
-	EXCEPTION_LOAD_GPR4
-	EXCEPTION_LOAD_GPR5
-	EXCEPTION_LOAD_GPR6
-	l.rfe
 i_pmd_none:
 i_pte_not_present:
 	EXCEPTION_LOAD_GPR2
 	EXCEPTION_LOAD_GPR3
 	EXCEPTION_LOAD_GPR4
-	EXCEPTION_LOAD_GPR5
-	EXCEPTION_LOAD_GPR6
 	EXCEPTION_HANDLE(_itlb_miss_page_fault_handler)
 
 /* ==============================================[ boot tlb handlers ]=== */
@@ -1571,12 +1533,7 @@ ENTRY(_early_uart_init)
 	l.jr	r9
 	l.nop
 
-_string_copying_linux:
-	.string "\n\n\n\n\n\rCopying Linux... \0"
-
-_string_ok_booting:
-	.string "Ok, booting the kernel.\n\r\0"
-
+	.section .rodata
 _string_unhandled_exception:
 	.string "\n\rRunarunaround: Unhandled exception 0x\0"
 
@@ -1586,11 +1543,6 @@ _string_epc_prefix:
 _string_nl:
 	.string "\n\r\0"
 
-	.global	_string_esr_irq_bug
-_string_esr_irq_bug:
-	.string "\n\rESR external interrupt bug, for details look into entry.S\n\r\0"
-
-
 
 /* ========================================[ page aligned structures ]=== */
 

+ 1 - 0
arch/openrisc/kernel/or32_ksyms.c

@@ -44,3 +44,4 @@ DECLARE_EXPORT(__ashldi3);
 DECLARE_EXPORT(__lshrdi3);
 
 EXPORT_SYMBOL(__copy_tofrom_user);
+EXPORT_SYMBOL(memset);

+ 14 - 0
arch/openrisc/kernel/process.c

@@ -75,6 +75,17 @@ void machine_power_off(void)
 	__asm__("l.nop 1");
 }
 
+/*
+ * Send the doze signal to the cpu if available.
+ * Make sure that all interrupts are enabled.
+ */
+void arch_cpu_idle(void)
+{
+	local_irq_enable();
+	if (mfspr(SPR_UPR) & SPR_UPR_PMP)
+		mtspr(SPR_PMR, mfspr(SPR_PMR) | SPR_PMR_DME);
+}
+
 void (*pm_power_off) (void) = machine_power_off;
 
 /*
@@ -226,6 +237,7 @@ int dump_fpu(struct pt_regs *regs, elf_fpregset_t * fpu)
 
 extern struct thread_info *_switch(struct thread_info *old_ti,
 				   struct thread_info *new_ti);
+extern int lwa_flag;
 
 struct task_struct *__switch_to(struct task_struct *old,
 				struct task_struct *new)
@@ -243,6 +255,8 @@ struct task_struct *__switch_to(struct task_struct *old,
 	new_ti = new->stack;
 	old_ti = old->stack;
 
+	lwa_flag = 0;
+
 	current_thread_info_set[smp_processor_id()] = new_ti;
 	last = (_switch(old_ti, new_ti))->task;
 

+ 0 - 1
arch/openrisc/kernel/ptrace.c

@@ -16,7 +16,6 @@
  *      2 of the License, or (at your option) any later version.
  */
 
-#include <stddef.h>
 #include <linux/kernel.h>
 #include <linux/sched.h>
 #include <linux/string.h>

+ 36 - 31
arch/openrisc/kernel/setup.c

@@ -117,13 +117,15 @@ static void print_cpuinfo(void)
 	if (upr & SPR_UPR_DCP)
 		printk(KERN_INFO
 		       "-- dcache: %4d bytes total, %2d bytes/line, %d way(s)\n",
-		       cpuinfo.dcache_size, cpuinfo.dcache_block_size, 1);
+		       cpuinfo.dcache_size, cpuinfo.dcache_block_size,
+		       cpuinfo.dcache_ways);
 	else
 		printk(KERN_INFO "-- dcache disabled\n");
 	if (upr & SPR_UPR_ICP)
 		printk(KERN_INFO
 		       "-- icache: %4d bytes total, %2d bytes/line, %d way(s)\n",
-		       cpuinfo.icache_size, cpuinfo.icache_block_size, 1);
+		       cpuinfo.icache_size, cpuinfo.icache_block_size,
+		       cpuinfo.icache_ways);
 	else
 		printk(KERN_INFO "-- icache disabled\n");
 
@@ -155,25 +157,25 @@ void __init setup_cpuinfo(void)
 {
 	struct device_node *cpu;
 	unsigned long iccfgr, dccfgr;
-	unsigned long cache_set_size, cache_ways;
+	unsigned long cache_set_size;
 
 	cpu = of_find_compatible_node(NULL, NULL, "opencores,or1200-rtlsvn481");
 	if (!cpu)
 		panic("No compatible CPU found in device tree...\n");
 
 	iccfgr = mfspr(SPR_ICCFGR);
-	cache_ways = 1 << (iccfgr & SPR_ICCFGR_NCW);
+	cpuinfo.icache_ways = 1 << (iccfgr & SPR_ICCFGR_NCW);
 	cache_set_size = 1 << ((iccfgr & SPR_ICCFGR_NCS) >> 3);
 	cpuinfo.icache_block_size = 16 << ((iccfgr & SPR_ICCFGR_CBS) >> 7);
 	cpuinfo.icache_size =
-	    cache_set_size * cache_ways * cpuinfo.icache_block_size;
+	    cache_set_size * cpuinfo.icache_ways * cpuinfo.icache_block_size;
 
 	dccfgr = mfspr(SPR_DCCFGR);
-	cache_ways = 1 << (dccfgr & SPR_DCCFGR_NCW);
+	cpuinfo.dcache_ways = 1 << (dccfgr & SPR_DCCFGR_NCW);
 	cache_set_size = 1 << ((dccfgr & SPR_DCCFGR_NCS) >> 3);
 	cpuinfo.dcache_block_size = 16 << ((dccfgr & SPR_DCCFGR_CBS) >> 7);
 	cpuinfo.dcache_size =
-	    cache_set_size * cache_ways * cpuinfo.dcache_block_size;
+	    cache_set_size * cpuinfo.dcache_ways * cpuinfo.dcache_block_size;
 
 	if (of_property_read_u32(cpu, "clock-frequency",
 				 &cpuinfo.clock_frequency)) {
@@ -308,30 +310,33 @@ static int show_cpuinfo(struct seq_file *m, void *v)
 	revision = vr & SPR_VR_REV;
 
 	seq_printf(m,
-		   "cpu\t\t: OpenRISC-%x\n"
-		   "revision\t: %d\n"
-		   "frequency\t: %ld\n"
-		   "dcache size\t: %d bytes\n"
-		   "dcache block size\t: %d bytes\n"
-		   "icache size\t: %d bytes\n"
-		   "icache block size\t: %d bytes\n"
-		   "immu\t\t: %d entries, %lu ways\n"
-		   "dmmu\t\t: %d entries, %lu ways\n"
-		   "bogomips\t: %lu.%02lu\n",
-		   version,
-		   revision,
-		   loops_per_jiffy * HZ,
-		   cpuinfo.dcache_size,
-		   cpuinfo.dcache_block_size,
-		   cpuinfo.icache_size,
-		   cpuinfo.icache_block_size,
-		   1 << ((mfspr(SPR_DMMUCFGR) & SPR_DMMUCFGR_NTS) >> 2),
-		   1 + (mfspr(SPR_DMMUCFGR) & SPR_DMMUCFGR_NTW),
-		   1 << ((mfspr(SPR_IMMUCFGR) & SPR_IMMUCFGR_NTS) >> 2),
-		   1 + (mfspr(SPR_IMMUCFGR) & SPR_IMMUCFGR_NTW),
-		   (loops_per_jiffy * HZ) / 500000,
-		   ((loops_per_jiffy * HZ) / 5000) % 100);
-
+		  "cpu\t\t: OpenRISC-%x\n"
+		  "revision\t: %d\n"
+		  "frequency\t: %ld\n"
+		  "dcache size\t: %d bytes\n"
+		  "dcache block size\t: %d bytes\n"
+		  "dcache ways\t: %d\n"
+		  "icache size\t: %d bytes\n"
+		  "icache block size\t: %d bytes\n"
+		  "icache ways\t: %d\n"
+		  "immu\t\t: %d entries, %lu ways\n"
+		  "dmmu\t\t: %d entries, %lu ways\n"
+		  "bogomips\t: %lu.%02lu\n",
+		  version,
+		  revision,
+		  loops_per_jiffy * HZ,
+		  cpuinfo.dcache_size,
+		  cpuinfo.dcache_block_size,
+		  cpuinfo.dcache_ways,
+		  cpuinfo.icache_size,
+		  cpuinfo.icache_block_size,
+		  cpuinfo.icache_ways,
+		  1 << ((mfspr(SPR_DMMUCFGR) & SPR_DMMUCFGR_NTS) >> 2),
+		  1 + (mfspr(SPR_DMMUCFGR) & SPR_DMMUCFGR_NTW),
+		  1 << ((mfspr(SPR_IMMUCFGR) & SPR_IMMUCFGR_NTS) >> 2),
+		  1 + (mfspr(SPR_IMMUCFGR) & SPR_IMMUCFGR_NTW),
+		  (loops_per_jiffy * HZ) / 500000,
+		  ((loops_per_jiffy * HZ) / 5000) % 100);
 	return 0;
 }
 

+ 183 - 0
arch/openrisc/kernel/traps.c

@@ -40,6 +40,8 @@
 extern char _etext, _stext;
 
 int kstack_depth_to_print = 0x180;
+int lwa_flag;
+unsigned long __user *lwa_addr;
 
 static inline int valid_stack_ptr(struct thread_info *tinfo, void *p)
 {
@@ -334,10 +336,191 @@ asmlinkage void do_bus_fault(struct pt_regs *regs, unsigned long address)
 	}
 }
 
+static inline int in_delay_slot(struct pt_regs *regs)
+{
+#ifdef CONFIG_OPENRISC_NO_SPR_SR_DSX
+	/* No delay slot flag, do the old way */
+	unsigned int op, insn;
+
+	insn = *((unsigned int *)regs->pc);
+	op = insn >> 26;
+	switch (op) {
+	case 0x00: /* l.j */
+	case 0x01: /* l.jal */
+	case 0x03: /* l.bnf */
+	case 0x04: /* l.bf */
+	case 0x11: /* l.jr */
+	case 0x12: /* l.jalr */
+		return 1;
+	default:
+		return 0;
+	}
+#else
+	return regs->sr & SPR_SR_DSX;
+#endif
+}
+
+static inline void adjust_pc(struct pt_regs *regs, unsigned long address)
+{
+	int displacement;
+	unsigned int rb, op, jmp;
+
+	if (unlikely(in_delay_slot(regs))) {
+		/* In delay slot, instruction at pc is a branch, simulate it */
+		jmp = *((unsigned int *)regs->pc);
+
+		displacement = sign_extend32(((jmp) & 0x3ffffff) << 2, 27);
+		rb = (jmp & 0x0000ffff) >> 11;
+		op = jmp >> 26;
+
+		switch (op) {
+		case 0x00: /* l.j */
+			regs->pc += displacement;
+			return;
+		case 0x01: /* l.jal */
+			regs->pc += displacement;
+			regs->gpr[9] = regs->pc + 8;
+			return;
+		case 0x03: /* l.bnf */
+			if (regs->sr & SPR_SR_F)
+				regs->pc += 8;
+			else
+				regs->pc += displacement;
+			return;
+		case 0x04: /* l.bf */
+			if (regs->sr & SPR_SR_F)
+				regs->pc += displacement;
+			else
+				regs->pc += 8;
+			return;
+		case 0x11: /* l.jr */
+			regs->pc = regs->gpr[rb];
+			return;
+		case 0x12: /* l.jalr */
+			regs->pc = regs->gpr[rb];
+			regs->gpr[9] = regs->pc + 8;
+			return;
+		default:
+			break;
+		}
+	} else {
+		regs->pc += 4;
+	}
+}
+
+static inline void simulate_lwa(struct pt_regs *regs, unsigned long address,
+				unsigned int insn)
+{
+	unsigned int ra, rd;
+	unsigned long value;
+	unsigned long orig_pc;
+	long imm;
+
+	const struct exception_table_entry *entry;
+
+	orig_pc = regs->pc;
+	adjust_pc(regs, address);
+
+	ra = (insn >> 16) & 0x1f;
+	rd = (insn >> 21) & 0x1f;
+	imm = (short)insn;
+	lwa_addr = (unsigned long __user *)(regs->gpr[ra] + imm);
+
+	if ((unsigned long)lwa_addr & 0x3) {
+		do_unaligned_access(regs, address);
+		return;
+	}
+
+	if (get_user(value, lwa_addr)) {
+		if (user_mode(regs)) {
+			force_sig(SIGSEGV, current);
+			return;
+		}
+
+		if ((entry = search_exception_tables(orig_pc))) {
+			regs->pc = entry->fixup;
+			return;
+		}
+
+		/* kernel access in kernel space, load it directly */
+		value = *((unsigned long *)lwa_addr);
+	}
+
+	lwa_flag = 1;
+	regs->gpr[rd] = value;
+}
+
+static inline void simulate_swa(struct pt_regs *regs, unsigned long address,
+				unsigned int insn)
+{
+	unsigned long __user *vaddr;
+	unsigned long orig_pc;
+	unsigned int ra, rb;
+	long imm;
+
+	const struct exception_table_entry *entry;
+
+	orig_pc = regs->pc;
+	adjust_pc(regs, address);
+
+	ra = (insn >> 16) & 0x1f;
+	rb = (insn >> 11) & 0x1f;
+	imm = (short)(((insn & 0x2200000) >> 10) | (insn & 0x7ff));
+	vaddr = (unsigned long __user *)(regs->gpr[ra] + imm);
+
+	if (!lwa_flag || vaddr != lwa_addr) {
+		regs->sr &= ~SPR_SR_F;
+		return;
+	}
+
+	if ((unsigned long)vaddr & 0x3) {
+		do_unaligned_access(regs, address);
+		return;
+	}
+
+	if (put_user(regs->gpr[rb], vaddr)) {
+		if (user_mode(regs)) {
+			force_sig(SIGSEGV, current);
+			return;
+		}
+
+		if ((entry = search_exception_tables(orig_pc))) {
+			regs->pc = entry->fixup;
+			return;
+		}
+
+		/* kernel access in kernel space, store it directly */
+		*((unsigned long *)vaddr) = regs->gpr[rb];
+	}
+
+	lwa_flag = 0;
+	regs->sr |= SPR_SR_F;
+}
+
+#define INSN_LWA	0x1b
+#define INSN_SWA	0x33
+
 asmlinkage void do_illegal_instruction(struct pt_regs *regs,
 				       unsigned long address)
 {
 	siginfo_t info;
+	unsigned int op;
+	unsigned int insn = *((unsigned int *)address);
+
+	op = insn >> 26;
+
+	switch (op) {
+	case INSN_LWA:
+		simulate_lwa(regs, address, insn);
+		return;
+
+	case INSN_SWA:
+		simulate_swa(regs, address, insn);
+		return;
+
+	default:
+		break;
+	}
 
 	if (user_mode(regs)) {
 		/* Send a SIGILL */
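
In adjust_pc() above, a branch displacement is the instruction's low 26 bits scaled to a byte offset and sign-extended from bit 27. A worked decode of a hypothetical backward l.j, with the kernel's sign_extend32() reimplemented so the snippet runs in userspace:

#include <stdio.h>
#include <stdint.h>

/* Same contract as the kernel helper: 'index' is the sign bit. */
static int32_t sign_extend32(uint32_t value, int index)
{
	int shift = 31 - index;

	return (int32_t)(value << shift) >> shift;
}

int main(void)
{
	uint32_t jmp = 0x03fffffe;	/* hypothetical l.j, offset -2 words */
	int32_t displacement = sign_extend32((jmp & 0x3ffffff) << 2, 27);

	/* prints: opcode=0x00 displacement=-8 */
	printf("opcode=0x%02x displacement=%d\n",
	       (unsigned int)(jmp >> 26), displacement);
	return 0;
}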

+ 1 - 1
arch/openrisc/lib/Makefile

@@ -2,4 +2,4 @@
 # Makefile for or32 specific library files..
 #
 
-obj-y  = string.o delay.o
+obj-y	:= delay.o string.o memset.o memcpy.o

+ 124 - 0
arch/openrisc/lib/memcpy.c

@@ -0,0 +1,124 @@
+/*
+ * arch/openrisc/lib/memcpy.c
+ *
+ * Optimized memory copy routines for openrisc.  These are mostly copied
+ * from other sources but slightly extended based on ideas discussed in
+ * #openrisc.
+ *
+ * The word unroll implementation is an extension to the arm byte
+ * unrolled implementation, but using word copies (if things are
+ * properly aligned)
+ *
+ * The great arm loop unroll algorithm can be found at:
+ *  arch/arm/boot/compressed/string.c
+ */
+
+#include <linux/export.h>
+
+#include <linux/string.h>
+
+#ifdef CONFIG_OR1K_1200
+/*
+ * Do memcpy with word copies and loop unrolling. This gives the
+ * best performance on the OR1200 and MOR1KX architectures
+ */
+void *memcpy(void *dest, __const void *src, __kernel_size_t n)
+{
+	int i = 0;
+	unsigned char *d, *s;
+	uint32_t *dest_w = (uint32_t *)dest, *src_w = (uint32_t *)src;
+
+	/* If both source and dest are word aligned copy words */
+	if (!((unsigned int)dest_w & 3) && !((unsigned int)src_w & 3)) {
+		/* Copy 32 bytes per loop */
+		for (i = n >> 5; i > 0; i--) {
+			*dest_w++ = *src_w++;
+			*dest_w++ = *src_w++;
+			*dest_w++ = *src_w++;
+			*dest_w++ = *src_w++;
+			*dest_w++ = *src_w++;
+			*dest_w++ = *src_w++;
+			*dest_w++ = *src_w++;
+			*dest_w++ = *src_w++;
+		}
+
+		if (n & 1 << 4) {
+			*dest_w++ = *src_w++;
+			*dest_w++ = *src_w++;
+			*dest_w++ = *src_w++;
+			*dest_w++ = *src_w++;
+		}
+
+		if (n & 1 << 3) {
+			*dest_w++ = *src_w++;
+			*dest_w++ = *src_w++;
+		}
+
+		if (n & 1 << 2)
+			*dest_w++ = *src_w++;
+
+		d = (unsigned char *)dest_w;
+		s = (unsigned char *)src_w;
+
+	} else {
+		d = (unsigned char *)dest_w;
+		s = (unsigned char *)src_w;
+
+		for (i = n >> 3; i > 0; i--) {
+			*d++ = *s++;
+			*d++ = *s++;
+			*d++ = *s++;
+			*d++ = *s++;
+			*d++ = *s++;
+			*d++ = *s++;
+			*d++ = *s++;
+			*d++ = *s++;
+		}
+
+		if (n & 1 << 2) {
+			*d++ = *s++;
+			*d++ = *s++;
+			*d++ = *s++;
+			*d++ = *s++;
+		}
+	}
+
+	if (n & 1 << 1) {
+		*d++ = *s++;
+		*d++ = *s++;
+	}
+
+	if (n & 1)
+		*d++ = *s++;
+
+	return dest;
+}
+#else
+/*
+ * Use word copies but no loop unrolling as we cannot assume there
+ * will be benefits on the architecture
+ */
+void *memcpy(void *dest, __const void *src, __kernel_size_t n)
+{
+	unsigned char *d = (unsigned char *)dest, *s = (unsigned char *)src;
+	uint32_t *dest_w = (uint32_t *)dest, *src_w = (uint32_t *)src;
+
+	/* If both source and dest are word aligned copy words */
+	if (!((unsigned int)dest_w & 3) && !((unsigned int)src_w & 3)) {
+		for (; n >= 4; n -= 4)
+			*dest_w++ = *src_w++;
+	}
+
+	d = (unsigned char *)dest_w;
+	s = (unsigned char *)src_w;
+
+	/* For remaining or if not aligned, copy bytes */
+	for (; n >= 1; n -= 1)
+		*d++ = *s++;
+
+	return dest;
+
+}
+#endif
+
+EXPORT_SYMBOL(memcpy);
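
The unrolled path dispatches on the binary decomposition of n: (n >> 5) iterations of the 32-byte loop, then conditional 16-, 8- and 4-byte word blocks, then the shared 2- and 1-byte tail. A quick sanity check that the accounting always adds back up to n:

#include <stdio.h>

int main(void)
{
	unsigned int n = 55;			/* 32 + 16 + 4 + 2 + 1 */
	unsigned int copied = (n >> 5) * 32;	/* full 32-byte loops */

	if (n & 1 << 4)
		copied += 16;
	if (n & 1 << 3)
		copied += 8;
	if (n & 1 << 2)
		copied += 4;
	if (n & 1 << 1)
		copied += 2;
	if (n & 1)
		copied += 1;

	printf("%u == %u\n", copied, n);	/* holds for any n */
	return 0;
}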

+ 98 - 0
arch/openrisc/lib/memset.S

@@ -0,0 +1,98 @@
+/*
+ * OpenRISC memset.S
+ *
+ * Hand-optimized assembler version of memset for OpenRISC.
+ * Algorithm inspired by several other arch-specific memset routines
+ * in the kernel tree
+ *
+ * Copyright (C) 2015 Olof Kindgren <olof.kindgren@gmail.com>
+ *
+ *      This program is free software; you can redistribute it and/or
+ *      modify it under the terms of the GNU General Public License
+ *      as published by the Free Software Foundation; either version
+ *      2 of the License, or (at your option) any later version.
+ */
+
+	.global memset
+	.type	memset, @function
+memset:
+	/* arguments:
+	 * r3 = *s
+	 * r4 = c
+	 * r5 = n
+	 * r13, r15, r17, r19 used as temp regs
+	*/
+
+	/* Exit if n == 0 */
+	l.sfeqi		r5, 0
+	l.bf		4f
+
+	/* Truncate c to char */
+	l.andi  	r13, r4, 0xff
+
+	/* Skip word extension if c is 0 */
+	l.sfeqi		r13, 0
+	l.bf		1f
+	/* Check for at least two whole words (8 bytes) */
+	 l.sfleui	r5, 7
+
+	/* Extend char c to 32-bit word cccc in r13 */
+	l.slli		r15, r13, 16  // r13 = 000c, r15 = 0c00
+	l.or		r13, r13, r15 // r13 = 0c0c, r15 = 0c00
+	l.slli		r15, r13, 8   // r13 = 0c0c, r15 = c0c0
+	l.or		r13, r13, r15 // r13 = cccc, r15 = c0c0
+
+1:	l.addi		r19, r3, 0 // Set r19 = src
+	/* Jump to byte copy loop if less than two words */
+	l.bf		3f
+	 l.or		r17, r5, r0 // Set r17 = n
+
+	/* Mask out two LSBs to check alignment */
+	l.andi		r15, r3, 0x3
+
+	/* lsb == 00, jump to word copy loop */
+	l.sfeqi		r15, 0
+	l.bf		2f
+	 l.addi		r19, r3, 0 // Set r19 = src
+
+	/* lsb == 01,10 or 11 */
+	l.sb		0(r3), r13   // *src = c
+	l.addi		r17, r17, -1 // Decrease n
+
+	l.sfeqi		r15, 3
+	l.bf		2f
+	 l.addi		r19, r3, 1  // src += 1
+
+	/* lsb == 01 or 10 */
+	l.sb		1(r3), r13   // *(src+1) = c
+	l.addi		r17, r17, -1 // Decrease n
+
+	l.sfeqi		r15, 2
+	l.bf		2f
+	 l.addi		r19, r3, 2  // src += 2
+
+	/* lsb == 01 */
+	l.sb		2(r3), r13   // *(src+2) = c
+	l.addi		r17, r17, -1 // Decrease n
+	l.addi		r19, r3, 3   // src += 3
+
+	/* Word copy loop */
+2:	l.sw		0(r19), r13  // *src = cccc
+	l.addi		r17, r17, -4 // Decrease n
+	l.sfgeui	r17, 4
+	l.bf		2b
+	 l.addi		r19, r19, 4  // Increase src
+
+	/* When n > 0, copy the remaining bytes, otherwise jump to exit */
+	l.sfeqi		r17, 0
+	l.bf		4f
+
+	/* Byte copy loop */
+3:	l.addi		r17, r17, -1 // Decrease n
+	l.sb		0(r19), r13  // *src = c
+	l.sfnei		r17, 0
+	l.bf		3b
+	 l.addi		r19, r19, 1  // Increase src
+
+4:	l.jr		r9
+	 l.ori		r11, r3, 0
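
For reference, a simplified C model of what memset.S does: byte-fill the unaligned head, word-fill the body with the byte replicated across a word, then byte-fill the tail. It omits the asm's early-out for buffers shorter than two words:

#include <stdio.h>
#include <string.h>

static void *memset_model(void *s, int c, size_t n)
{
	unsigned char *p = s;
	unsigned int w = (unsigned char)c;

	w |= w << 8;
	w |= w << 16;			/* 000c -> cccc, as in the asm */

	while (n && ((unsigned long)p & 3)) {	/* align the head */
		*p++ = (unsigned char)c;
		n--;
	}
	for (; n >= 4; n -= 4, p += 4)		/* word stores (l.sw) */
		memcpy(p, &w, 4);
	while (n--)				/* byte tail */
		*p++ = (unsigned char)c;
	return s;
}

int main(void)
{
	char buf[14];

	memset_model(buf, 'x', 13);
	buf[13] = '\0';
	printf("%s\n", buf);	/* prints 13 x's */
	return 0;
}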

+ 2 - 0
arch/openrisc/mm/ioremap.c

@@ -80,6 +80,7 @@ __ioremap(phys_addr_t addr, unsigned long size, pgprot_t prot)
 
 	return (void __iomem *)(offset + (char *)v);
 }
+EXPORT_SYMBOL(__ioremap);
 
 void iounmap(void *addr)
 {
@@ -106,6 +107,7 @@ void iounmap(void *addr)
 
 	return vfree((void *)(PAGE_MASK & (unsigned long)addr));
 }
+EXPORT_SYMBOL(iounmap);
 
 /**
  * OK, this one's a bit tricky... ioremap can get called before memory is

+ 2 - 0
include/asm-generic/atomic.h

@@ -223,6 +223,7 @@ static inline void atomic_dec(atomic_t *v)
 #define atomic_xchg(ptr, v)		(xchg(&(ptr)->counter, (v)))
 #define atomic_cmpxchg(v, old, new)	(cmpxchg(&((v)->counter), (old), (new)))
 
+#ifndef __atomic_add_unless
 static inline int __atomic_add_unless(atomic_t *v, int a, int u)
 {
 	int c, old;
@@ -231,5 +232,6 @@ static inline int __atomic_add_unless(atomic_t *v, int a, int u)
 		c = old;
 	return c;
 }
+#endif
 
 #endif /* __ASM_GENERIC_ATOMIC_H */

+ 3 - 0
scripts/checkstack.pl

@@ -81,6 +81,9 @@ my (@stack, $re, $dre, $x, $xs, $funcre);
 	} elsif ($arch eq 'nios2') {
 		#25a8:	defffb04 	addi	sp,sp,-20
 		$re = qr/.*addi.*sp,sp,-(([0-9]{2}|[3-9])[0-9]{2})/o;
+	} elsif ($arch eq 'openrisc') {
+		# c000043c:       9c 21 fe f0     l.addi r1,r1,-272
+		$re = qr/.*l\.addi.*r1,r1,-(([0-9]{2}|[3-9])[0-9]{2})/o;
 	} elsif ($arch eq 'parisc' || $arch eq 'parisc64') {
 		$re = qr/.*ldo ($x{1,8})\(sp\),sp/o;
 	} elsif ($arch eq 'ppc') {
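
As with the other entries, the script is driven as "objdump -d vmlinux | perl scripts/checkstack.pl openrisc". Note that the size group in the regex, (([0-9]{2}|[3-9])[0-9]{2}), only matches frames of roughly 300 bytes or more, so small stack adjustments are deliberately ignored.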