Merge branch 'sparc64-queued-locks'

Babu Moger says:

====================
Enable queued rwlock and queued spinlock for SPARC

This series of patches enables queued rwlock and queued spinlock support
for SPARC. These features were introduced upstream some time ago. Here are
some of the earlier discussions:
https://lwn.net/Articles/572765/
https://lwn.net/Articles/582200/
https://lwn.net/Articles/561775/
https://lwn.net/Articles/590243/

Tests: Ran the AIM7 benchmark (https://github.com/davidlohr/areaim) to verify
performance on various workloads. The same benchmark was used when this
feature was introduced and enabled on x86. Here are the test results.

Kernel				4.11.0-rc6     4.11.0-rc6 + 	Change (%)
				baseline	queued locks
			      (Avg no. of jobs) (Avg no. of jobs)
Workload
High systime 10-100 user	 17290.48	 17295.18	+0.02
High systime 200-1000 users	109814.95	110248.87	+0.39
High systime 1200-2000 users	107912.40	127923.16	+18.54

Disk IO 10-100 users		168910.16	158834.17	-5.96
Disk IO 200-1000 users		242781.74	281285.80	+15.85
Disk IO 1200-2000 users		228518.23	218421.23	-4.41

Disk IO 10-100 users		183933.77	207928.67	+13.04
Disk IO 200-1000 users		491981.56	500162.33	+1.66
Disk IO 1200-2000 users		463395.66	467312.70	+0.84

fserver 10-100 users		254177.53	270283.08	+6.33
fserver IO 200-1000 users	269017.35	324812.2	+20.74
fserver IO 1200-2000 users	229538.87	284713.77	+24.03

The disk I/O results dip slightly into negative territory in a couple of
runs, but the majority of the changes are positive, and in some cases the
improvement is significant (e.g. +18.54% for the 1200-2000 user high-systime
workload).

Changes:
v3 -> v4:
 1. Took care of Geert Uytterhoeven's comment about patch #3 (def_bool y).
 2. A separate patch set is in the works to define CPU_BIG_ENDIAN for all
    architectures that are big-endian by default, based on feedback from
    Geert and Arnd.

v2 -> v3:
 1. Rebased the patches on top of 4.12-rc2.
 2. Re-ordered patch #1 and patch #2 so they are applied in the same order
    in which the issues were seen. Patch #1 removes the
    __LINUX_SPINLOCK_TYPES_H check; patch #2 addresses the compile error in
    qrwlock.c. This addresses Dave Miller's comments on v2.

v1 -> v2:
Addressed the comments from David Miller:
1. Added CPU_BIG_ENDIAN for all of SPARC.
2. Removed the #ifndef __LINUX_SPINLOCK_TYPES_H guard from spinlock_types.h.
3. Removed the check for CONFIG_QUEUED_RWLOCKS in SPARC64 as it is now the
   default there. Cleaned up the previous arch_read_xxx and arch_write_xxx
   definitions, which are now provided by qrwlock.h.
4. Removed the check for CONFIG_QUEUED_SPINLOCKS in SPARC64 as it is now the
   default there. Cleaned up the previous arch_spin_xxx definitions, which
   are now provided by qspinlock.h.

v1: Initial version
====================

Signed-off-by: David S. Miller <davem@davemloft.net>
Committed by David S. Miller · 8 years ago · commit 60925ee97e

+ 5 - 0
arch/sparc/Kconfig

@@ -83,6 +83,8 @@ config SPARC64
 	select ARCH_SUPPORTS_ATOMIC_RMW
 	select HAVE_NMI
 	select HAVE_REGS_AND_STACK_ACCESS_API
+	select ARCH_USE_QUEUED_RWLOCKS
+	select ARCH_USE_QUEUED_SPINLOCKS
 
 config ARCH_DEFCONFIG
 	string
@@ -92,6 +94,9 @@ config ARCH_DEFCONFIG
 config ARCH_PROC_KCORE_TEXT
 	def_bool y
 
+config CPU_BIG_ENDIAN
+	def_bool y
+
 config ARCH_ATU
 	bool
 	default y if SPARC64

+ 67 - 9
arch/sparc/include/asm/cmpxchg_64.h

@@ -6,6 +6,17 @@
 #ifndef __ARCH_SPARC64_CMPXCHG__
 #define __ARCH_SPARC64_CMPXCHG__
 
+static inline unsigned long
+__cmpxchg_u32(volatile int *m, int old, int new)
+{
+	__asm__ __volatile__("cas [%2], %3, %0"
+			     : "=&r" (new)
+			     : "0" (new), "r" (m), "r" (old)
+			     : "memory");
+
+	return new;
+}
+
 static inline unsigned long xchg32(__volatile__ unsigned int *m, unsigned int val)
 {
 	unsigned long tmp1, tmp2;
@@ -44,10 +55,38 @@ static inline unsigned long xchg64(__volatile__ unsigned long *m, unsigned long
 
 void __xchg_called_with_bad_pointer(void);
 
+/*
+ * Use 4 byte cas instruction to achieve 2 byte xchg. Main logic
+ * here is to get the bit shift of the byte we are interested in.
+ * The XOR is handy for reversing the bits for big-endian byte order.
+ */
+static inline unsigned long
+xchg16(__volatile__ unsigned short *m, unsigned short val)
+{
+	unsigned long maddr = (unsigned long)m;
+	int bit_shift = (((unsigned long)m & 2) ^ 2) << 3;
+	unsigned int mask = 0xffff << bit_shift;
+	unsigned int *ptr = (unsigned int  *) (maddr & ~2);
+	unsigned int old32, new32, load32;
+
+	/* Read the old value */
+	load32 = *ptr;
+
+	do {
+		old32 = load32;
+		new32 = (load32 & (~mask)) | val << bit_shift;
+		load32 = __cmpxchg_u32(ptr, old32, new32);
+	} while (load32 != old32);
+
+	return (load32 & mask) >> bit_shift;
+}
+
 static inline unsigned long __xchg(unsigned long x, __volatile__ void * ptr,
 				       int size)
 {
 	switch (size) {
+	case 2:
+		return xchg16(ptr, x);
 	case 4:
 		return xchg32(ptr, x);
 	case 8:
@@ -65,10 +104,11 @@ static inline unsigned long __xchg(unsigned long x, __volatile__ void * ptr,
 
 #include <asm-generic/cmpxchg-local.h>
 
+
 static inline unsigned long
-__cmpxchg_u32(volatile int *m, int old, int new)
+__cmpxchg_u64(volatile long *m, unsigned long old, unsigned long new)
 {
-	__asm__ __volatile__("cas [%2], %3, %0"
+	__asm__ __volatile__("casx [%2], %3, %0"
 			     : "=&r" (new)
 			     : "0" (new), "r" (m), "r" (old)
 			     : "memory");
@@ -76,15 +116,31 @@ __cmpxchg_u32(volatile int *m, int old, int new)
 	return new;
 }
 
+/*
+ * Use 4 byte cas instruction to achieve 1 byte cmpxchg. Main logic
+ * here is to get the bit shift of the byte we are interested in.
+ * The XOR is handy for reversing the bits for big-endian byte order
+ */
 static inline unsigned long
-__cmpxchg_u64(volatile long *m, unsigned long old, unsigned long new)
+__cmpxchg_u8(volatile unsigned char *m, unsigned char old, unsigned char new)
 {
-	__asm__ __volatile__("casx [%2], %3, %0"
-			     : "=&r" (new)
-			     : "0" (new), "r" (m), "r" (old)
-			     : "memory");
-
-	return new;
+	unsigned long maddr = (unsigned long)m;
+	int bit_shift = (((unsigned long)m & 3) ^ 3) << 3;
+	unsigned int mask = 0xff << bit_shift;
+	unsigned int *ptr = (unsigned int *) (maddr & ~3);
+	unsigned int old32, new32, load;
+	unsigned int load32 = *ptr;
+
+	do {
+		new32 = (load32 & ~mask) | (new << bit_shift);
+		old32 = (load32 & ~mask) | (old << bit_shift);
+		load32 = __cmpxchg_u32(ptr, old32, new32);
+		if (load32 == old32)
+			return old;
+		load = (load32 & mask) >> bit_shift;
+	} while (load == old);
+
+	return load;
 }
 
 /* This function doesn't exist, so you'll get a linker error
@@ -95,6 +151,8 @@ static inline unsigned long
 __cmpxchg(volatile void *ptr, unsigned long old, unsigned long new, int size)
 {
 	switch (size) {
+		case 1:
+			return __cmpxchg_u8(ptr, old, new);
 		case 4:
 			return __cmpxchg_u32(ptr, old, new);
 		case 8:

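The new xchg16() and __cmpxchg_u8() helpers above share one technique: locate
the target halfword or byte inside its naturally aligned 32-bit word, build a
shift and mask (the XOR accounts for big-endian byte order), and loop on the
32-bit cas until it takes effect. Below is a minimal user-space sketch of the
same idea, for illustration only: it substitutes the GCC/Clang __atomic
builtins for the SPARC cas instruction, also handles little-endian hosts so
it can be compiled anywhere, and the xchg16_emulated()/pair names are
invented for the example.

/*
 * Illustrative user-space sketch only (not part of the patch): the same
 * "wide CAS emulates a narrow xchg" approach as xchg16() above, using the
 * compiler's __atomic builtins in place of the SPARC cas instruction.
 */
#include <stdint.h>
#include <stdio.h>

static uint16_t xchg16_emulated(uint16_t *m, uint16_t val)
{
	uintptr_t addr = (uintptr_t)m;
#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
	/* Kernel formula: on big-endian the halfword at the lower address
	 * occupies the high bits of the enclosing word, hence the XOR. */
	int bit_shift = (int)(((addr & 2) ^ 2) << 3);
#else
	/* Little-endian hosts, so the sketch can be tried on x86 etc. */
	int bit_shift = (int)((addr & 2) << 3);
#endif
	uint32_t mask = (uint32_t)0xffff << bit_shift;
	uint32_t *ptr = (uint32_t *)(addr & ~(uintptr_t)2);
	uint32_t old32, new32;

	old32 = __atomic_load_n(ptr, __ATOMIC_RELAXED);
	do {
		/* Splice the new halfword into the containing 32-bit word
		 * and retry until the CAS succeeds; on failure old32 is
		 * refreshed with the current value of *ptr. */
		new32 = (old32 & ~mask) | ((uint32_t)val << bit_shift);
	} while (!__atomic_compare_exchange_n(ptr, &old32, new32, 0,
					      __ATOMIC_SEQ_CST,
					      __ATOMIC_SEQ_CST));

	return (uint16_t)((old32 & mask) >> bit_shift);
}

int main(void)
{
	/* Two halfwords packed into one naturally aligned 32-bit word. */
	static uint16_t pair[2] __attribute__((aligned(4))) = { 0x1111, 0x2222 };
	uint16_t prev = xchg16_emulated(&pair[0], 0xabcd);

	printf("previous = 0x%04x, now = 0x%04x 0x%04x\n",
	       prev, pair[0], pair[1]);
	return 0;
}

__cmpxchg_u8() in the hunk above follows the same pattern, with a 0xff mask
and the address rounded down to a 4-byte boundary instead of 2.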
+ 7 - 0
arch/sparc/include/asm/qrwlock.h

@@ -0,0 +1,7 @@
+#ifndef _ASM_SPARC_QRWLOCK_H
+#define _ASM_SPARC_QRWLOCK_H
+
+#include <asm-generic/qrwlock_types.h>
+#include <asm-generic/qrwlock.h>
+
+#endif /* _ASM_SPARC_QRWLOCK_H */

+ 7 - 0
arch/sparc/include/asm/qspinlock.h

@@ -0,0 +1,7 @@
+#ifndef _ASM_SPARC_QSPINLOCK_H
+#define _ASM_SPARC_QSPINLOCK_H
+
+#include <asm-generic/qspinlock_types.h>
+#include <asm-generic/qspinlock.h>
+
+#endif /* _ASM_SPARC_QSPINLOCK_H */

+ 2 - 206
arch/sparc/include/asm/spinlock_64.h

@@ -10,216 +10,12 @@
 
 #include <asm/processor.h>
 #include <asm/barrier.h>
-
-/* To get debugging spinlocks which detect and catch
- * deadlock situations, set CONFIG_DEBUG_SPINLOCK
- * and rebuild your kernel.
- */
-
-/* Because we play games to save cycles in the non-contention case, we
- * need to be extra careful about branch targets into the "spinning"
- * code.  They live in their own section, but the newer V9 branches
- * have a shorter range than the traditional 32-bit sparc branch
- * variants.  The rule is that the branches that go into and out of
- * the spinner sections must be pre-V9 branches.
- */
-
-#define arch_spin_is_locked(lp)	((lp)->lock != 0)
-
-static inline void arch_spin_unlock_wait(arch_spinlock_t *lock)
-{
-	smp_cond_load_acquire(&lock->lock, !VAL);
-}
-
-static inline void arch_spin_lock(arch_spinlock_t *lock)
-{
-	unsigned long tmp;
-
-	__asm__ __volatile__(
-"1:	ldstub		[%1], %0\n"
-"	brnz,pn		%0, 2f\n"
-"	 nop\n"
-"	.subsection	2\n"
-"2:	ldub		[%1], %0\n"
-"	brnz,pt		%0, 2b\n"
-"	 nop\n"
-"	ba,a,pt		%%xcc, 1b\n"
-"	.previous"
-	: "=&r" (tmp)
-	: "r" (lock)
-	: "memory");
-}
-
-static inline int arch_spin_trylock(arch_spinlock_t *lock)
-{
-	unsigned long result;
-
-	__asm__ __volatile__(
-"	ldstub		[%1], %0\n"
-	: "=r" (result)
-	: "r" (lock)
-	: "memory");
-
-	return (result == 0UL);
-}
-
-static inline void arch_spin_unlock(arch_spinlock_t *lock)
-{
-	__asm__ __volatile__(
-"	stb		%%g0, [%0]"
-	: /* No outputs */
-	: "r" (lock)
-	: "memory");
-}
-
-static inline void arch_spin_lock_flags(arch_spinlock_t *lock, unsigned long flags)
-{
-	unsigned long tmp1, tmp2;
-
-	__asm__ __volatile__(
-"1:	ldstub		[%2], %0\n"
-"	brnz,pn		%0, 2f\n"
-"	 nop\n"
-"	.subsection	2\n"
-"2:	rdpr		%%pil, %1\n"
-"	wrpr		%3, %%pil\n"
-"3:	ldub		[%2], %0\n"
-"	brnz,pt		%0, 3b\n"
-"	 nop\n"
-"	ba,pt		%%xcc, 1b\n"
-"	 wrpr		%1, %%pil\n"
-"	.previous"
-	: "=&r" (tmp1), "=&r" (tmp2)
-	: "r"(lock), "r"(flags)
-	: "memory");
-}
-
-/* Multi-reader locks, these are much saner than the 32-bit Sparc ones... */
-
-static inline void arch_read_lock(arch_rwlock_t *lock)
-{
-	unsigned long tmp1, tmp2;
-
-	__asm__ __volatile__ (
-"1:	ldsw		[%2], %0\n"
-"	brlz,pn		%0, 2f\n"
-"4:	 add		%0, 1, %1\n"
-"	cas		[%2], %0, %1\n"
-"	cmp		%0, %1\n"
-"	bne,pn		%%icc, 1b\n"
-"	 nop\n"
-"	.subsection	2\n"
-"2:	ldsw		[%2], %0\n"
-"	brlz,pt		%0, 2b\n"
-"	 nop\n"
-"	ba,a,pt		%%xcc, 4b\n"
-"	.previous"
-	: "=&r" (tmp1), "=&r" (tmp2)
-	: "r" (lock)
-	: "memory");
-}
-
-static inline int arch_read_trylock(arch_rwlock_t *lock)
-{
-	int tmp1, tmp2;
-
-	__asm__ __volatile__ (
-"1:	ldsw		[%2], %0\n"
-"	brlz,a,pn	%0, 2f\n"
-"	 mov		0, %0\n"
-"	add		%0, 1, %1\n"
-"	cas		[%2], %0, %1\n"
-"	cmp		%0, %1\n"
-"	bne,pn		%%icc, 1b\n"
-"	 mov		1, %0\n"
-"2:"
-	: "=&r" (tmp1), "=&r" (tmp2)
-	: "r" (lock)
-	: "memory");
-
-	return tmp1;
-}
-
-static inline void arch_read_unlock(arch_rwlock_t *lock)
-{
-	unsigned long tmp1, tmp2;
-
-	__asm__ __volatile__(
-"1:	lduw	[%2], %0\n"
-"	sub	%0, 1, %1\n"
-"	cas	[%2], %0, %1\n"
-"	cmp	%0, %1\n"
-"	bne,pn	%%xcc, 1b\n"
-"	 nop"
-	: "=&r" (tmp1), "=&r" (tmp2)
-	: "r" (lock)
-	: "memory");
-}
-
-static inline void arch_write_lock(arch_rwlock_t *lock)
-{
-	unsigned long mask, tmp1, tmp2;
-
-	mask = 0x80000000UL;
-
-	__asm__ __volatile__(
-"1:	lduw		[%2], %0\n"
-"	brnz,pn		%0, 2f\n"
-"4:	 or		%0, %3, %1\n"
-"	cas		[%2], %0, %1\n"
-"	cmp		%0, %1\n"
-"	bne,pn		%%icc, 1b\n"
-"	 nop\n"
-"	.subsection	2\n"
-"2:	lduw		[%2], %0\n"
-"	brnz,pt		%0, 2b\n"
-"	 nop\n"
-"	ba,a,pt		%%xcc, 4b\n"
-"	.previous"
-	: "=&r" (tmp1), "=&r" (tmp2)
-	: "r" (lock), "r" (mask)
-	: "memory");
-}
-
-static inline void arch_write_unlock(arch_rwlock_t *lock)
-{
-	__asm__ __volatile__(
-"	stw		%%g0, [%0]"
-	: /* no outputs */
-	: "r" (lock)
-	: "memory");
-}
-
-static inline int arch_write_trylock(arch_rwlock_t *lock)
-{
-	unsigned long mask, tmp1, tmp2, result;
-
-	mask = 0x80000000UL;
-
-	__asm__ __volatile__(
-"	mov		0, %2\n"
-"1:	lduw		[%3], %0\n"
-"	brnz,pn		%0, 2f\n"
-"	 or		%0, %4, %1\n"
-"	cas		[%3], %0, %1\n"
-"	cmp		%0, %1\n"
-"	bne,pn		%%icc, 1b\n"
-"	 nop\n"
-"	mov		1, %2\n"
-"2:"
-	: "=&r" (tmp1), "=&r" (tmp2), "=&r" (result)
-	: "r" (lock), "r" (mask)
-	: "memory");
-
-	return result;
-}
+#include <asm/qrwlock.h>
+#include <asm/qspinlock.h>
 
 #define arch_read_lock_flags(p, f) arch_read_lock(p)
 #define arch_write_lock_flags(p, f) arch_write_lock(p)
 
-#define arch_read_can_lock(rw)		(!((rw)->lock & 0x80000000UL))
-#define arch_write_can_lock(rw)	(!(rw)->lock)
-
 #define arch_spin_relax(lock)	cpu_relax()
 #define arch_read_relax(lock)	cpu_relax()
 #define arch_write_relax(lock)	cpu_relax()

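For context, the arch_spin_lock() deleted above was a plain test-and-set
lock: ldstub atomically loads the lock byte and sets it to 0xff, and the
caller spins while the old value was non-zero. A rough portable C11
equivalent, purely as an illustration (the tas_* names are invented for this
sketch), looks like:

/*
 * Illustrative sketch only (not part of the patch): a portable
 * test-and-set spinlock in C11, mirroring what the removed
 * ldstub-based arch_spin_lock()/arch_spin_unlock() did.
 */
#include <stdatomic.h>

typedef struct {
	atomic_flag lock;		/* plays the role of the lock byte */
} tas_spinlock_t;

#define TAS_SPINLOCK_INIT	{ ATOMIC_FLAG_INIT }

static inline void tas_spin_lock(tas_spinlock_t *lp)
{
	/* test-and-set ~ ldstub: set the flag, observe its old value */
	while (atomic_flag_test_and_set_explicit(&lp->lock,
						 memory_order_acquire)) {
		/* spin until the holder clears it */
	}
}

static inline void tas_spin_unlock(tas_spinlock_t *lp)
{
	/* mirrors the removed arch_spin_unlock(): clear the lock byte */
	atomic_flag_clear_explicit(&lp->lock, memory_order_release);
}

A lock like this hands off to whichever waiter happens to win the next
test-and-set, with no ordering among waiters; the queued implementations now
pulled in through <asm/qspinlock.h> and <asm/qrwlock.h> queue contenders in
roughly FIFO order instead, which is the behaviour the heavily contended
AIM7 runs in the cover letter exercise.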
+ 8 - 4
arch/sparc/include/asm/spinlock_types.h

@@ -1,20 +1,24 @@
 #ifndef __SPARC_SPINLOCK_TYPES_H
 #define __SPARC_SPINLOCK_TYPES_H
 
-#ifndef __LINUX_SPINLOCK_TYPES_H
-# error "please don't include this file directly"
-#endif
+#ifdef CONFIG_QUEUED_SPINLOCKS
+#include <asm-generic/qspinlock_types.h>
+#else
 
 typedef struct {
 	volatile unsigned char lock;
 } arch_spinlock_t;
 
 #define __ARCH_SPIN_LOCK_UNLOCKED	{ 0 }
+#endif /* CONFIG_QUEUED_SPINLOCKS */
 
+#ifdef CONFIG_QUEUED_RWLOCKS
+#include <asm-generic/qrwlock_types.h>
+#else
 typedef struct {
 	volatile unsigned int lock;
 } arch_rwlock_t;
 
 #define __ARCH_RW_LOCK_UNLOCKED		{ 0 }
-
+#endif /* CONFIG_QUEUED_RWLOCKS */
 #endif

+ 1 - 0
kernel/locking/qrwlock.c

@@ -20,6 +20,7 @@
 #include <linux/cpumask.h>
 #include <linux/percpu.h>
 #include <linux/hardirq.h>
+#include <linux/spinlock.h>
 #include <asm/qrwlock.h>
 
 /*