|
@@ -42,20 +42,40 @@
|
|
|
|
|
|
typedef unsigned long pte_basic_t;
|
|
|
|
|
|
-static __inline__ void clear_page(void *addr)
|
|
|
+static inline void clear_page(void *addr)
|
|
|
{
|
|
|
- unsigned long lines, line_size;
|
|
|
-
|
|
|
- line_size = ppc64_caches.dline_size;
|
|
|
- lines = ppc64_caches.dlines_per_page;
|
|
|
-
|
|
|
- __asm__ __volatile__(
|
|
|
+ unsigned long iterations;
|
|
|
+ unsigned long onex, twox, fourx, eightx;
|
|
|
+
|
|
|
+ iterations = ppc64_caches.dlines_per_page / 8;
|
|
|
+
|
|
|
+ /*
|
|
|
+ * Some versions of gcc use multiply instructions to
|
|
|
+ * calculate the offsets, so let's give it a hand to
|
|
|
+ * do better.
|
|
|
+ */
|
|
|
+ onex = ppc64_caches.dline_size;
|
|
|
+ twox = onex << 1;
|
|
|
+ fourx = onex << 2;
|
|
|
+ eightx = onex << 3;
|
|
|
+
|
|
|
+ asm volatile(
|
|
|
"mtctr %1 # clear_page\n\
|
|
|
-1: dcbz 0,%0\n\
|
|
|
- add %0,%0,%3\n\
|
|
|
+ .balign 16\n\
|
|
|
+1: dcbz 0,%0\n\
|
|
|
+ dcbz %3,%0\n\
|
|
|
+ dcbz %4,%0\n\
|
|
|
+ dcbz %5,%0\n\
|
|
|
+ dcbz %6,%0\n\
|
|
|
+ dcbz %7,%0\n\
|
|
|
+ dcbz %8,%0\n\
|
|
|
+ dcbz %9,%0\n\
|
|
|
+ add %0,%0,%10\n\
|
|
|
bdnz+ 1b"
|
|
|
- : "=r" (addr)
|
|
|
- : "r" (lines), "0" (addr), "r" (line_size)
|
|
|
+ : "=&r" (addr)
|
|
|
+ : "r" (iterations), "0" (addr), "b" (onex), "b" (twox),
|
|
|
+ "b" (twox+onex), "b" (fourx), "b" (fourx+onex),
|
|
|
+ "b" (twox+fourx), "b" (eightx-onex), "r" (eightx)
|
|
|
: "ctr", "memory");
|
|
|
}
|
|
|
|