浏览代码

lib/GCD.c: use binary GCD algorithm instead of Euclidean

The binary GCD algorithm is based on the following facts:
	1. If a and b are all evens, then gcd(a,b) = 2 * gcd(a/2, b/2)
	2. If a is even and b is odd, then gcd(a,b) = gcd(a/2, b)
	3. If a and b are all odds, then gcd(a,b) = gcd((a-b)/2, b) = gcd((a+b)/2, b)

Even on x86 machines with reasonable division hardware, the binary
algorithm runs about 25% faster (80% the execution time) than the
division-based Euclidian algorithm.

On platforms like Alpha and ARMv6 where division is a function call to
emulation code, it's even more significant.

There are two variants of the code here, depending on whether a fast
__ffs (find least significant set bit) instruction is available.  This
allows the unpredictable branches in the bit-at-a-time shifting loop to
be eliminated.

If fast __ffs is not available, the "even/odd" GCD variant is used.

I use the following code to benchmark:

	#include <stdio.h>
	#include <stdlib.h>
	#include <stdint.h>
	#include <string.h>
	#include <time.h>
	#include <unistd.h>

	#define swap(a, b) \
		do { \
			a ^= b; \
			b ^= a; \
			a ^= b; \
		} while (0)

	unsigned long gcd0(unsigned long a, unsigned long b)
	{
		unsigned long r;

		if (a < b) {
			swap(a, b);
		}

		if (b == 0)
			return a;

		while ((r = a % b) != 0) {
			a = b;
			b = r;
		}

		return b;
	}

	unsigned long gcd1(unsigned long a, unsigned long b)
	{
		unsigned long r = a | b;

		if (!a || !b)
			return r;

		b >>= __builtin_ctzl(b);

		for (;;) {
			a >>= __builtin_ctzl(a);
			if (a == b)
				return a << __builtin_ctzl(r);

			if (a < b)
				swap(a, b);
			a -= b;
		}
	}

	unsigned long gcd2(unsigned long a, unsigned long b)
	{
		unsigned long r = a | b;

		if (!a || !b)
			return r;

		r &= -r;

		while (!(b & r))
			b >>= 1;

		for (;;) {
			while (!(a & r))
				a >>= 1;
			if (a == b)
				return a;

			if (a < b)
				swap(a, b);
			a -= b;
			a >>= 1;
			if (a & r)
				a += b;
			a >>= 1;
		}
	}

	unsigned long gcd3(unsigned long a, unsigned long b)
	{
		unsigned long r = a | b;

		if (!a || !b)
			return r;

		b >>= __builtin_ctzl(b);
		if (b == 1)
			return r & -r;

		for (;;) {
			a >>= __builtin_ctzl(a);
			if (a == 1)
				return r & -r;
			if (a == b)
				return a << __builtin_ctzl(r);

			if (a < b)
				swap(a, b);
			a -= b;
		}
	}

	unsigned long gcd4(unsigned long a, unsigned long b)
	{
		unsigned long r = a | b;

		if (!a || !b)
			return r;

		r &= -r;

		while (!(b & r))
			b >>= 1;
		if (b == r)
			return r;

		for (;;) {
			while (!(a & r))
				a >>= 1;
			if (a == r)
				return r;
			if (a == b)
				return a;

			if (a < b)
				swap(a, b);
			a -= b;
			a >>= 1;
			if (a & r)
				a += b;
			a >>= 1;
		}
	}

	static unsigned long (*gcd_func[])(unsigned long a, unsigned long b) = {
		gcd0, gcd1, gcd2, gcd3, gcd4,
	};

	#define TEST_ENTRIES (sizeof(gcd_func) / sizeof(gcd_func[0]))

	#if defined(__x86_64__)

	#define rdtscll(val) do { \
		unsigned long __a,__d; \
		__asm__ __volatile__("rdtsc" : "=a" (__a), "=d" (__d)); \
		(val) = ((unsigned long long)__a) | (((unsigned long long)__d)<<32); \
	} while(0)

	static unsigned long long benchmark_gcd_func(unsigned long (*gcd)(unsigned long, unsigned long),
								unsigned long a, unsigned long b, unsigned long *res)
	{
		unsigned long long start, end;
		unsigned long long ret;
		unsigned long gcd_res;

		rdtscll(start);
		gcd_res = gcd(a, b);
		rdtscll(end);

		if (end >= start)
			ret = end - start;
		else
			ret = ~0ULL - start + 1 + end;

		*res = gcd_res;
		return ret;
	}

	#else

	static inline struct timespec read_time(void)
	{
		struct timespec time;
		clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &time);
		return time;
	}

	static inline unsigned long long diff_time(struct timespec start, struct timespec end)
	{
		struct timespec temp;

		if ((end.tv_nsec - start.tv_nsec) < 0) {
			temp.tv_sec = end.tv_sec - start.tv_sec - 1;
			temp.tv_nsec = 1000000000ULL + end.tv_nsec - start.tv_nsec;
		} else {
			temp.tv_sec = end.tv_sec - start.tv_sec;
			temp.tv_nsec = end.tv_nsec - start.tv_nsec;
		}

		return temp.tv_sec * 1000000000ULL + temp.tv_nsec;
	}

	static unsigned long long benchmark_gcd_func(unsigned long (*gcd)(unsigned long, unsigned long),
								unsigned long a, unsigned long b, unsigned long *res)
	{
		struct timespec start, end;
		unsigned long gcd_res;

		start = read_time();
		gcd_res = gcd(a, b);
		end = read_time();

		*res = gcd_res;
		return diff_time(start, end);
	}

	#endif

	static inline unsigned long get_rand()
	{
		if (sizeof(long) == 8)
			return (unsigned long)rand() << 32 | rand();
		else
			return rand();
	}

	int main(int argc, char **argv)
	{
		unsigned int seed = time(0);
		int loops = 100;
		int repeats = 1000;
		unsigned long (*res)[TEST_ENTRIES];
		unsigned long long elapsed[TEST_ENTRIES];
		int i, j, k;

		for (;;) {
			int opt = getopt(argc, argv, "n:r:s:");
			/* End condition always first */
			if (opt == -1)
				break;

			switch (opt) {
			case 'n':
				loops = atoi(optarg);
				break;
			case 'r':
				repeats = atoi(optarg);
				break;
			case 's':
				seed = strtoul(optarg, NULL, 10);
				break;
			default:
				/* You won't actually get here. */
				break;
			}
		}

		res = malloc(sizeof(unsigned long) * TEST_ENTRIES * loops);
		memset(elapsed, 0, sizeof(elapsed));

		srand(seed);
		for (j = 0; j < loops; j++) {
			unsigned long a = get_rand();
			/* Do we have args? */
			unsigned long b = argc > optind ? strtoul(argv[optind], NULL, 10) : get_rand();
			unsigned long long min_elapsed[TEST_ENTRIES];
			for (k = 0; k < repeats; k++) {
				for (i = 0; i < TEST_ENTRIES; i++) {
					unsigned long long tmp = benchmark_gcd_func(gcd_func[i], a, b, &res[j][i]);
					if (k == 0 || min_elapsed[i] > tmp)
						min_elapsed[i] = tmp;
				}
			}
			for (i = 0; i < TEST_ENTRIES; i++)
				elapsed[i] += min_elapsed[i];
		}

		for (i = 0; i < TEST_ENTRIES; i++)
			printf("gcd%d: elapsed %llu\n", i, elapsed[i]);

		k = 0;
		srand(seed);
		for (j = 0; j < loops; j++) {
			unsigned long a = get_rand();
			unsigned long b = argc > optind ? strtoul(argv[optind], NULL, 10) : get_rand();
			for (i = 1; i < TEST_ENTRIES; i++) {
				if (res[j][i] != res[j][0])
					break;
			}
			if (i < TEST_ENTRIES) {
				if (k == 0) {
					k = 1;
					fprintf(stderr, "Error:\n");
				}
				fprintf(stderr, "gcd(%lu, %lu): ", a, b);
				for (i = 0; i < TEST_ENTRIES; i++)
					fprintf(stderr, "%ld%s", res[j][i], i < TEST_ENTRIES - 1 ? ", " : "\n");
			}
		}

		if (k == 0)
			fprintf(stderr, "PASS\n");

		free(res);

		return 0;
	}

Compiled with "-O2", on "VirtualBox 4.4.0-22-generic #38-Ubuntu x86_64" got:

  zhaoxiuzeng@zhaoxiuzeng-VirtualBox:~/develop$ ./gcd -r 500000 -n 10
  gcd0: elapsed 10174
  gcd1: elapsed 2120
  gcd2: elapsed 2902
  gcd3: elapsed 2039
  gcd4: elapsed 2812
  PASS
  zhaoxiuzeng@zhaoxiuzeng-VirtualBox:~/develop$ ./gcd -r 500000 -n 10
  gcd0: elapsed 9309
  gcd1: elapsed 2280
  gcd2: elapsed 2822
  gcd3: elapsed 2217
  gcd4: elapsed 2710
  PASS
  zhaoxiuzeng@zhaoxiuzeng-VirtualBox:~/develop$ ./gcd -r 500000 -n 10
  gcd0: elapsed 9589
  gcd1: elapsed 2098
  gcd2: elapsed 2815
  gcd3: elapsed 2030
  gcd4: elapsed 2718
  PASS
  zhaoxiuzeng@zhaoxiuzeng-VirtualBox:~/develop$ ./gcd -r 500000 -n 10
  gcd0: elapsed 9914
  gcd1: elapsed 2309
  gcd2: elapsed 2779
  gcd3: elapsed 2228
  gcd4: elapsed 2709
  PASS

[akpm@linux-foundation.org: avoid #defining a CONFIG_ variable]
Signed-off-by: Zhaoxiu Zeng <zhaoxiu.zeng@gmail.com>
Signed-off-by: George Spelvin <linux@horizon.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Zhaoxiu Zeng 9 年之前
父节点
当前提交
fff7fb0b2d

+ 3 - 0
arch/Kconfig

@@ -647,4 +647,7 @@ config COMPAT_OLD_SIGACTION
 config ARCH_NO_COHERENT_DMA_MMAP
 config ARCH_NO_COHERENT_DMA_MMAP
 	bool
 	bool
 
 
+config CPU_NO_EFFICIENT_FFS
+	def_bool n
+
 source "kernel/gcov/Kconfig"
 source "kernel/gcov/Kconfig"

+ 1 - 0
arch/alpha/Kconfig

@@ -26,6 +26,7 @@ config ALPHA
 	select MODULES_USE_ELF_RELA
 	select MODULES_USE_ELF_RELA
 	select ODD_RT_SIGACTION
 	select ODD_RT_SIGACTION
 	select OLD_SIGSUSPEND
 	select OLD_SIGSUSPEND
+	select CPU_NO_EFFICIENT_FFS if !ALPHA_EV67
 	help
 	help
 	  The Alpha is a 64-bit general-purpose processor designed and
 	  The Alpha is a 64-bit general-purpose processor designed and
 	  marketed by the Digital Equipment Corporation of blessed memory,
 	  marketed by the Digital Equipment Corporation of blessed memory,

+ 1 - 0
arch/arc/Kconfig

@@ -107,6 +107,7 @@ choice
 
 
 config ISA_ARCOMPACT
 config ISA_ARCOMPACT
 	bool "ARCompact ISA"
 	bool "ARCompact ISA"
+	select CPU_NO_EFFICIENT_FFS
 	help
 	help
 	  The original ARC ISA of ARC600/700 cores
 	  The original ARC ISA of ARC600/700 cores
 
 

+ 3 - 0
arch/arm/mm/Kconfig

@@ -421,18 +421,21 @@ config CPU_32v3
 	select CPU_USE_DOMAINS if MMU
 	select CPU_USE_DOMAINS if MMU
 	select NEED_KUSER_HELPERS
 	select NEED_KUSER_HELPERS
 	select TLS_REG_EMUL if SMP || !MMU
 	select TLS_REG_EMUL if SMP || !MMU
+	select CPU_NO_EFFICIENT_FFS
 
 
 config CPU_32v4
 config CPU_32v4
 	bool
 	bool
 	select CPU_USE_DOMAINS if MMU
 	select CPU_USE_DOMAINS if MMU
 	select NEED_KUSER_HELPERS
 	select NEED_KUSER_HELPERS
 	select TLS_REG_EMUL if SMP || !MMU
 	select TLS_REG_EMUL if SMP || !MMU
+	select CPU_NO_EFFICIENT_FFS
 
 
 config CPU_32v4T
 config CPU_32v4T
 	bool
 	bool
 	select CPU_USE_DOMAINS if MMU
 	select CPU_USE_DOMAINS if MMU
 	select NEED_KUSER_HELPERS
 	select NEED_KUSER_HELPERS
 	select TLS_REG_EMUL if SMP || !MMU
 	select TLS_REG_EMUL if SMP || !MMU
+	select CPU_NO_EFFICIENT_FFS
 
 
 config CPU_32v5
 config CPU_32v5
 	bool
 	bool

+ 1 - 0
arch/h8300/Kconfig

@@ -20,6 +20,7 @@ config H8300
 	select HAVE_KERNEL_GZIP
 	select HAVE_KERNEL_GZIP
 	select HAVE_KERNEL_LZO
 	select HAVE_KERNEL_LZO
 	select HAVE_ARCH_KGDB
 	select HAVE_ARCH_KGDB
+	select CPU_NO_EFFICIENT_FFS
 
 
 config RWSEM_GENERIC_SPINLOCK
 config RWSEM_GENERIC_SPINLOCK
 	def_bool y
 	def_bool y

+ 1 - 0
arch/m32r/Kconfig

@@ -17,6 +17,7 @@ config M32R
 	select ARCH_USES_GETTIMEOFFSET
 	select ARCH_USES_GETTIMEOFFSET
 	select MODULES_USE_ELF_RELA
 	select MODULES_USE_ELF_RELA
 	select HAVE_DEBUG_STACKOVERFLOW
 	select HAVE_DEBUG_STACKOVERFLOW
+	select CPU_NO_EFFICIENT_FFS
 
 
 config SBUS
 config SBUS
 	bool
 	bool

+ 11 - 0
arch/m68k/Kconfig.cpu

@@ -40,6 +40,7 @@ config M68000
 	select CPU_HAS_NO_MULDIV64
 	select CPU_HAS_NO_MULDIV64
 	select CPU_HAS_NO_UNALIGNED
 	select CPU_HAS_NO_UNALIGNED
 	select GENERIC_CSUM
 	select GENERIC_CSUM
+	select CPU_NO_EFFICIENT_FFS
 	help
 	help
 	  The Freescale (was Motorola) 68000 CPU is the first generation of
 	  The Freescale (was Motorola) 68000 CPU is the first generation of
 	  the well known M68K family of processors. The CPU core as well as
 	  the well known M68K family of processors. The CPU core as well as
@@ -51,6 +52,7 @@ config MCPU32
 	bool
 	bool
 	select CPU_HAS_NO_BITFIELDS
 	select CPU_HAS_NO_BITFIELDS
 	select CPU_HAS_NO_UNALIGNED
 	select CPU_HAS_NO_UNALIGNED
+	select CPU_NO_EFFICIENT_FFS
 	help
 	help
 	  The Freescale (was then Motorola) CPU32 is a CPU core that is
 	  The Freescale (was then Motorola) CPU32 is a CPU core that is
 	  based on the 68020 processor. For the most part it is used in
 	  based on the 68020 processor. For the most part it is used in
@@ -130,6 +132,7 @@ config M5206
 	depends on !MMU
 	depends on !MMU
 	select COLDFIRE_SW_A7
 	select COLDFIRE_SW_A7
 	select HAVE_MBAR
 	select HAVE_MBAR
+	select CPU_NO_EFFICIENT_FFS
 	help
 	help
 	  Motorola ColdFire 5206 processor support.
 	  Motorola ColdFire 5206 processor support.
 
 
@@ -138,6 +141,7 @@ config M5206e
 	depends on !MMU
 	depends on !MMU
 	select COLDFIRE_SW_A7
 	select COLDFIRE_SW_A7
 	select HAVE_MBAR
 	select HAVE_MBAR
+	select CPU_NO_EFFICIENT_FFS
 	help
 	help
 	  Motorola ColdFire 5206e processor support.
 	  Motorola ColdFire 5206e processor support.
 
 
@@ -163,6 +167,7 @@ config M5249
 	depends on !MMU
 	depends on !MMU
 	select COLDFIRE_SW_A7
 	select COLDFIRE_SW_A7
 	select HAVE_MBAR
 	select HAVE_MBAR
+	select CPU_NO_EFFICIENT_FFS
 	help
 	help
 	  Motorola ColdFire 5249 processor support.
 	  Motorola ColdFire 5249 processor support.
 
 
@@ -171,6 +176,7 @@ config M525x
 	depends on !MMU
 	depends on !MMU
 	select COLDFIRE_SW_A7
 	select COLDFIRE_SW_A7
 	select HAVE_MBAR
 	select HAVE_MBAR
+	select CPU_NO_EFFICIENT_FFS
 	help
 	help
 	  Freescale (Motorola) Coldfire 5251/5253 processor support.
 	  Freescale (Motorola) Coldfire 5251/5253 processor support.
 
 
@@ -189,6 +195,7 @@ config M5272
 	depends on !MMU
 	depends on !MMU
 	select COLDFIRE_SW_A7
 	select COLDFIRE_SW_A7
 	select HAVE_MBAR
 	select HAVE_MBAR
+	select CPU_NO_EFFICIENT_FFS
 	help
 	help
 	  Motorola ColdFire 5272 processor support.
 	  Motorola ColdFire 5272 processor support.
 
 
@@ -217,6 +224,7 @@ config M5307
 	select COLDFIRE_SW_A7
 	select COLDFIRE_SW_A7
 	select HAVE_CACHE_CB
 	select HAVE_CACHE_CB
 	select HAVE_MBAR
 	select HAVE_MBAR
+	select CPU_NO_EFFICIENT_FFS
 	help
 	help
 	  Motorola ColdFire 5307 processor support.
 	  Motorola ColdFire 5307 processor support.
 
 
@@ -242,6 +250,7 @@ config M5407
 	select COLDFIRE_SW_A7
 	select COLDFIRE_SW_A7
 	select HAVE_CACHE_CB
 	select HAVE_CACHE_CB
 	select HAVE_MBAR
 	select HAVE_MBAR
+	select CPU_NO_EFFICIENT_FFS
 	help
 	help
 	  Motorola ColdFire 5407 processor support.
 	  Motorola ColdFire 5407 processor support.
 
 
@@ -251,6 +260,7 @@ config M547x
 	select MMU_COLDFIRE if MMU
 	select MMU_COLDFIRE if MMU
 	select HAVE_CACHE_CB
 	select HAVE_CACHE_CB
 	select HAVE_MBAR
 	select HAVE_MBAR
+	select CPU_NO_EFFICIENT_FFS
 	help
 	help
 	  Freescale ColdFire 5470/5471/5472/5473/5474/5475 processor support.
 	  Freescale ColdFire 5470/5471/5472/5473/5474/5475 processor support.
 
 
@@ -260,6 +270,7 @@ config M548x
 	select M54xx
 	select M54xx
 	select HAVE_CACHE_CB
 	select HAVE_CACHE_CB
 	select HAVE_MBAR
 	select HAVE_MBAR
+	select CPU_NO_EFFICIENT_FFS
 	help
 	help
 	  Freescale ColdFire 5480/5481/5482/5483/5484/5485 processor support.
 	  Freescale ColdFire 5480/5481/5482/5483/5484/5485 processor support.
 
 

+ 1 - 0
arch/metag/Kconfig

@@ -30,6 +30,7 @@ config METAG
 	select OF
 	select OF
 	select OF_EARLY_FLATTREE
 	select OF_EARLY_FLATTREE
 	select SPARSE_IRQ
 	select SPARSE_IRQ
+	select CPU_NO_EFFICIENT_FFS
 
 
 config STACKTRACE_SUPPORT
 config STACKTRACE_SUPPORT
 	def_bool y
 	def_bool y

+ 1 - 0
arch/microblaze/Kconfig

@@ -32,6 +32,7 @@ config MICROBLAZE
 	select OF_EARLY_FLATTREE
 	select OF_EARLY_FLATTREE
 	select TRACING_SUPPORT
 	select TRACING_SUPPORT
 	select VIRT_TO_BUS
 	select VIRT_TO_BUS
+	select CPU_NO_EFFICIENT_FFS
 
 
 config SWAP
 config SWAP
 	def_bool n
 	def_bool n

+ 10 - 0
arch/mips/include/asm/cpu-features.h

@@ -204,6 +204,16 @@
 #endif
 #endif
 #endif
 #endif
 
 
+/* __builtin_constant_p(cpu_has_mips_r) && cpu_has_mips_r */
+#if !((defined(cpu_has_mips32r1) && cpu_has_mips32r1) || \
+	  (defined(cpu_has_mips32r2) && cpu_has_mips32r2) || \
+	  (defined(cpu_has_mips32r6) && cpu_has_mips32r6) || \
+	  (defined(cpu_has_mips64r1) && cpu_has_mips64r1) || \
+	  (defined(cpu_has_mips64r2) && cpu_has_mips64r2) || \
+	  (defined(cpu_has_mips64r6) && cpu_has_mips64r6))
+#define CPU_NO_EFFICIENT_FFS 1
+#endif
+
 #ifndef cpu_has_mips_1
 #ifndef cpu_has_mips_1
 # define cpu_has_mips_1		(!cpu_has_mips_r6)
 # define cpu_has_mips_1		(!cpu_has_mips_r6)
 #endif
 #endif

+ 1 - 0
arch/nios2/Kconfig

@@ -15,6 +15,7 @@ config NIOS2
 	select SOC_BUS
 	select SOC_BUS
 	select SPARSE_IRQ
 	select SPARSE_IRQ
 	select USB_ARCH_HAS_HCD if USB_SUPPORT
 	select USB_ARCH_HAS_HCD if USB_SUPPORT
+	select CPU_NO_EFFICIENT_FFS
 
 
 config GENERIC_CSUM
 config GENERIC_CSUM
 	def_bool y
 	def_bool y

+ 1 - 0
arch/openrisc/Kconfig

@@ -25,6 +25,7 @@ config OPENRISC
 	select MODULES_USE_ELF_RELA
 	select MODULES_USE_ELF_RELA
 	select HAVE_DEBUG_STACKOVERFLOW
 	select HAVE_DEBUG_STACKOVERFLOW
 	select OR1K_PIC
 	select OR1K_PIC
+	select CPU_NO_EFFICIENT_FFS if !OPENRISC_HAVE_INST_FF1
 
 
 config MMU
 config MMU
 	def_bool y
 	def_bool y

+ 1 - 0
arch/parisc/Kconfig

@@ -32,6 +32,7 @@ config PARISC
 	select HAVE_ARCH_AUDITSYSCALL
 	select HAVE_ARCH_AUDITSYSCALL
 	select HAVE_ARCH_SECCOMP_FILTER
 	select HAVE_ARCH_SECCOMP_FILTER
 	select ARCH_NO_COHERENT_DMA_MMAP
 	select ARCH_NO_COHERENT_DMA_MMAP
+	select CPU_NO_EFFICIENT_FFS
 
 
 	help
 	help
 	  The PA-RISC microprocessor is designed by Hewlett-Packard and used
 	  The PA-RISC microprocessor is designed by Hewlett-Packard and used

+ 1 - 0
arch/s390/Kconfig

@@ -123,6 +123,7 @@ config S390
 	select HAVE_ARCH_AUDITSYSCALL
 	select HAVE_ARCH_AUDITSYSCALL
 	select HAVE_ARCH_EARLY_PFN_TO_NID
 	select HAVE_ARCH_EARLY_PFN_TO_NID
 	select HAVE_ARCH_JUMP_LABEL
 	select HAVE_ARCH_JUMP_LABEL
+	select CPU_NO_EFFICIENT_FFS if !HAVE_MARCH_Z9_109_FEATURES
 	select HAVE_ARCH_SECCOMP_FILTER
 	select HAVE_ARCH_SECCOMP_FILTER
 	select HAVE_ARCH_SOFT_DIRTY
 	select HAVE_ARCH_SOFT_DIRTY
 	select HAVE_ARCH_TRACEHOOK
 	select HAVE_ARCH_TRACEHOOK

+ 1 - 0
arch/score/Kconfig

@@ -14,6 +14,7 @@ config SCORE
 	select VIRT_TO_BUS
 	select VIRT_TO_BUS
 	select MODULES_USE_ELF_REL
 	select MODULES_USE_ELF_REL
 	select CLONE_BACKWARDS
 	select CLONE_BACKWARDS
+	select CPU_NO_EFFICIENT_FFS
 
 
 choice
 choice
 	prompt "System type"
 	prompt "System type"

+ 1 - 0
arch/sh/Kconfig

@@ -20,6 +20,7 @@ config SUPERH
 	select PERF_USE_VMALLOC
 	select PERF_USE_VMALLOC
 	select HAVE_DEBUG_KMEMLEAK
 	select HAVE_DEBUG_KMEMLEAK
 	select HAVE_KERNEL_GZIP
 	select HAVE_KERNEL_GZIP
+	select CPU_NO_EFFICIENT_FFS
 	select HAVE_KERNEL_BZIP2
 	select HAVE_KERNEL_BZIP2
 	select HAVE_KERNEL_LZMA
 	select HAVE_KERNEL_LZMA
 	select HAVE_KERNEL_XZ
 	select HAVE_KERNEL_XZ

+ 1 - 0
arch/sparc/Kconfig

@@ -42,6 +42,7 @@ config SPARC
 	select ODD_RT_SIGACTION
 	select ODD_RT_SIGACTION
 	select OLD_SIGSUSPEND
 	select OLD_SIGSUSPEND
 	select ARCH_HAS_SG_CHAIN
 	select ARCH_HAS_SG_CHAIN
+	select CPU_NO_EFFICIENT_FFS
 
 
 config SPARC32
 config SPARC32
 	def_bool !64BIT
 	def_bool !64BIT

+ 67 - 10
lib/gcd.c

@@ -2,20 +2,77 @@
 #include <linux/gcd.h>
 #include <linux/gcd.h>
 #include <linux/export.h>
 #include <linux/export.h>
 
 
-/* Greatest common divisor */
+/*
+ * This implements the binary GCD algorithm. (Often attributed to Stein,
+ * but as Knuth has noted, appears in a first-century Chinese math text.)
+ *
+ * This is faster than the division-based algorithm even on x86, which
+ * has decent hardware division.
+ */
+
+#if !defined(CONFIG_CPU_NO_EFFICIENT_FFS) && !defined(CPU_NO_EFFICIENT_FFS)
+
+/* If __ffs is available, the even/odd algorithm benchmarks slower. */
 unsigned long gcd(unsigned long a, unsigned long b)
 unsigned long gcd(unsigned long a, unsigned long b)
 {
 {
-	unsigned long r;
+	unsigned long r = a | b;
+
+	if (!a || !b)
+		return r;
 
 
-	if (a < b)
-		swap(a, b);
+	b >>= __ffs(b);
+	if (b == 1)
+		return r & -r;
 
 
-	if (!b)
-		return a;
-	while ((r = a % b) != 0) {
-		a = b;
-		b = r;
+	for (;;) {
+		a >>= __ffs(a);
+		if (a == 1)
+			return r & -r;
+		if (a == b)
+			return a << __ffs(r);
+
+		if (a < b)
+			swap(a, b);
+		a -= b;
 	}
 	}
-	return b;
 }
 }
+
+#else
+
+/* If normalization is done by loops, the even/odd algorithm is a win. */
+unsigned long gcd(unsigned long a, unsigned long b)
+{
+	unsigned long r = a | b;
+
+	if (!a || !b)
+		return r;
+
+	/* Isolate lsbit of r */
+	r &= -r;
+
+	while (!(b & r))
+		b >>= 1;
+	if (b == r)
+		return r;
+
+	for (;;) {
+		while (!(a & r))
+			a >>= 1;
+		if (a == r)
+			return r;
+		if (a == b)
+			return a;
+
+		if (a < b)
+			swap(a, b);
+		a -= b;
+		a >>= 1;
+		if (a & r)
+			a += b;
+		a >>= 1;
+	}
+}
+
+#endif
+
 EXPORT_SYMBOL_GPL(gcd);
 EXPORT_SYMBOL_GPL(gcd);